// Vectors and matrices are required to be aligned on a 16-byte boundary.
// 4x4 single-precision matrix multiply implemented with AltiVec (VMX) inline assembly.
//
// Each matrix is accessed as four consecutive 16-byte quadwords (offsets 0x00, 0x10,
// 0x20, 0x30), each holding four floats. For i = 0..3 the routine stores
//     mout.quad[i] = sum over k = 0..3 of ( m1.quad[i][k] * m2.quad[k] )
// i.e. mout = m1 * m2 if the quadwords are the matrix rows (presumably; the layout of
// Matrix4f is not visible here -- TODO confirm).
//
// All eight quadwords of m1 and m2 are loaded before the first store to mout, so mout
// may refer to the same object as m1 or m2.
//
// mout: destination matrix, addressed through r3.
// m1:   left-hand operand, addressed through r4.
// m2:   right-hand operand, addressed through r5.
//
// NOTE(review): the asm block has no operand or clobber lists. It reads r3/r4/r5
// directly, which presumably relies on the PowerPC calling convention passing the three
// reference arguments in r3-r5 and on the compiler leaving them untouched (MC_ASSERT,
// inlining) until the asm executes -- confirm for the toolchain in use. It also
// overwrites r0, r10-r12 and v0-v13 without declaring them to the compiler.
void M4MulM4(Matrix4f& mout, const Matrix4f& m1, const Matrix4f& m2)
{
// Check that the low four address bits of all three objects are zero (16-byte alignment).
MC_ASSERT( !(((int)(&mout))&0xF) && !(((int)(&m1))&0xF) && !(((int)(&m2))&0xF) );
asm(
// r0, r10, r11, r12 = byte offsets of the four quadwords (0x00, 0x10, 0x20, 0x30).
"li r0,0;"
"li r10,0x10;"
"li r11,0x20;"
"li r12,0x30;"
// v0 = 0.0 in every lane; used as the addend of the first multiply-add of each group.
"vxor v0,v0,v0;"
// Load m1 quadwords into v1..v4 and m2 quadwords into v5..v8.
"lvx v1,r4,r0;"
"lvx v5,r5,r0;"
"lvx v2,r4,r10;"
"lvx v6,r5,r10;"
"lvx v3,r4,r11;"
"lvx v7,r5,r11;"
"lvx v4,r4,r12;"
"lvx v8,r5,r12;"
// Output quadword 0: splat each float of v1 across a register and accumulate
// v13 = v1[0]*v5 + v1[1]*v6 + v1[2]*v7 + v1[3]*v8.
"vspltw v9,v1,0;"
"vmaddfp v13,v9,v5,v0;"
"vspltw v10,v1,1;"
"vmaddfp v13,v10,v6,v13;"
"vspltw v11,v1,2;"
"vmaddfp v13,v11,v7,v13;"
"vspltw v12,v1,3;"
"vmaddfp v13,v12,v8,v13;"
"stvx v13,r3,r0;"
// Output quadword 1: same accumulation using the floats of v2.
"vspltw v9,v2,0;"
"vmaddfp v13,v9,v5,v0;"
"vspltw v10,v2,1;"
"vmaddfp v13,v10,v6,v13;"
"vspltw v11,v2,2;"
"vmaddfp v13,v11,v7,v13;"
"vspltw v12,v2,3;"
"vmaddfp v13,v12,v8,v13;"
"stvx v13,r3,r10;"
// Output quadword 2: same accumulation using the floats of v3.
"vspltw v9,v3,0;"
"vmaddfp v13,v9,v5,v0;"
"vspltw v10,v3,1;"
"vmaddfp v13,v10,v6,v13;"
"vspltw v11,v3,2;"
"vmaddfp v13,v11,v7,v13;"
"vspltw v12,v3,3;"
"vmaddfp v13,v12,v8,v13;"
"stvx v13,r3,r11;"
// Output quadword 3: same accumulation using the floats of v4.
"vspltw v9,v4,0;"
"vmaddfp v13,v9,v5,v0;"
"vspltw v10,v4,1;"
"vmaddfp v13,v10,v6,v13;"
"vspltw v11,v4,2;"
"vmaddfp v13,v11,v7,v13;"
"vspltw v12,v4,3;"
"vmaddfp v13,v12,v8,v13;"
"stvx v13,r3,r12;"
);
}
// 4-component vector times 4x4 matrix, implemented with AltiVec (VMX) inline assembly.
//
// v is read as one 16-byte quadword of four floats and m as four consecutive quadwords
// (offsets 0x00, 0x10, 0x20, 0x30). The routine stores
//     vout = v[0]*m.quad[0] + v[1]*m.quad[1] + v[2]*m.quad[2] + v[3]*m.quad[3]
// i.e. the row vector v multiplied by m if the quadwords are the matrix rows
// (presumably; the layout of Matrix4f is not visible here -- TODO confirm).
//
// v and m are fully loaded before vout is stored, so vout may refer to the same
// object as v.
//
// vout: destination vector, addressed through r3.
// v:    source vector, addressed through r4.
// m:    matrix operand, addressed through r5.
//
// NOTE(review): the asm block has no operand or clobber lists. It reads r3/r4/r5
// directly, which presumably relies on the PowerPC calling convention passing the three
// reference arguments in r3-r5 and on the compiler leaving them untouched (MC_ASSERT,
// inlining) until the asm executes -- confirm for the toolchain in use. It also
// overwrites r0, r10-r12, v0, v1 and v5-v13 without declaring them to the compiler.
void V4MulM4(Vector4f& vout, const Vector4f& v, const Matrix4f& m)
{
// Check that the low four address bits of all three objects are zero (16-byte alignment).
MC_ASSERT( !(((int)(&vout))&0xF) && !(((int)(&v))&0xF) && !(((int)(&m))&0xF) );
asm(
// r0, r10, r11, r12 = byte offsets of the four matrix quadwords (0x00, 0x10, 0x20, 0x30).
"li r0,0;"
"li r10,0x10;"
"li r11,0x20;"
"li r12,0x30;"
// v0 = 0.0 in every lane; used as the addend of the first multiply-add.
"vxor v0,v0,v0;"
// Load the vector into v1 and the four matrix quadwords into v5..v8.
"lvx v1,r4,r0;"
"lvx v5,r5,r0;"
"lvx v6,r5,r10;"
"lvx v7,r5,r11;"
"lvx v8,r5,r12;"
// Splat each float of v1 across a register and accumulate
// v13 = v1[0]*v5 + v1[1]*v6 + v1[2]*v7 + v1[3]*v8.
"vspltw v9,v1,0;"
"vmaddfp v13,v9,v5,v0;"
"vspltw v10,v1,1;"
"vmaddfp v13,v10,v6,v13;"
"vspltw v11,v1,2;"
"vmaddfp v13,v11,v7,v13;"
"vspltw v12,v1,3;"
"vmaddfp v13,v12,v8,v13;"
// Store the result quadword to vout.
"stvx v13,r3,r0;"
);
}