This is my code for performing quaternion multiplication. I know, it's an old hat, but my question is from POV of compiler optimization. The results match nicely from a known library.
I my convetion. the first three are the vector parts and the last is the scalar part.
compiled with gcc 4.3 using
g++ quat_mul.cpp -msse3 -O3 -I ./eigen-2.0.0 -march=native
on my athlon 1.6 GHz (K8) cpu. with sse3.
The assembly for the core routine is
My questions are
1) I used a lot of variables in the mul routine and yet the compiler used only 8 of them. Are they that good with register allocation and lifetime analysis?
2) If yes to 1) then should one use lots of intermediate if fleeting variables in the core routines to help in compiler optimization.
3) The instructions seem to be nicely scheduled. Is it because I ordered my C code in that way or what?
4) I noticed that sse assembly goes the other way. IE destination register appears on the rhs of an instruction. Right or wrong?
5) any other tips while writing optimized sse code in general or improving it in particular.
Comments/criticisms/suggestions welcome.
#include<iostream>
#include<cmath>
#include<pmmintrin.h>
#include<Eigen/Geometry>
using namespace std;
using namespace Eigen;
float __attribute__((aligned(16))) printBuf[4];
#define vec4f_swizzle(v,p,q,r,s) (_mm_shuffle_ps( (v),(v), ((s)<<6|(r)<<4|(q)<<2|(p))))
const int zero=0;
const int flipSign=0x80000000;
const __m128 quat_mask=_mm_setr_ps( *(float*)&zero,
*(float*)&zero,
*(float*)&zero,
*(float*)&flipSign);
void print( __m128 a)
{
_mm_store_ps(printBuf,a);
cout<<printBuf[0]<<' '<<
printBuf[1]<<' '<<
printBuf[2]<<' '<<
printBuf[3]<<endl;
}
__m128 quat_mul(__m128 a, __m128 b)
{
__m128 swiz1=vec4f_swizzle(b,3,3,3,3);
__m128 swiz2=vec4f_swizzle(a,2,0,1,0);
__m128 swiz3=vec4f_swizzle(b,1,2,0,0);
__m128 swiz4=vec4f_swizzle(a,3,3,3,1);
__m128 swiz5=vec4f_swizzle(b,0,1,2,1);
__m128 swiz6=vec4f_swizzle(a,1,2,0,2);
__m128 swiz7=vec4f_swizzle(b,2,0,1,2);
__m128 mul4=_mm_mul_ps(swiz6,swiz7);
__m128 mul3=_mm_mul_ps(swiz4,swiz5);
__m128 mul2=_mm_mul_ps(swiz2,swiz3);
__m128 mul1=_mm_mul_ps(a,swiz1);
__m128 flip1=_mm_xor_ps(mul4,quat_mask);
__m128 flip2=_mm_xor_ps(mul3,quat_mask);
__m128 retVal=_mm_sub_ps(mul1,mul2);
__m128 retVal2=_mm_add_ps(flip1,flip2);
return _mm_add_ps(retVal,retVal2);
}
int main()
{
Quaternionf a(1,2,3,4);
Quaternionf b(11,12,13,14);
__m128 a1,b1;
a1=_mm_setr_ps(2.0,3.0,4.0,1);
b1=_mm_setr_ps(12,13,14,11);
print(a1);
print(b1);
print(quat_mul(a1,b1));
cout<<"quat\n";
asm("#start");
Quaternionf c=a*b;
asm("#end");
cout<<c.x()<<' '<<
c.y()<<' '<<
c.z()<<' '<<
c.w()<<"\n";
cout<<"test\n";
cout<<a.x()<<' '<<
a.y()<<' '<<
a.z()<<' '<<
a.w()<<"\n";
return 0;
}
compiled with gcc 4.3 using
g++ quat_mul.cpp -msse3 -O3 -I ./eigen-2.0.0 -march=native
on my athlon 1.6 GHz (K8) cpu. with sse3.
The assembly for the core routine is
00000000004009e0 <_Z8quat_mulU8__vectorfS_>:
4009e0: 0f 28 f9 movaps %xmm1,%xmm7
4009e3: 0f 28 f1 movaps %xmm1,%xmm6
4009e6: 0f 28 d8 movaps %xmm0,%xmm3
4009e9: 0f 28 e9 movaps %xmm1,%xmm5
4009ec: 0f 28 d0 movaps %xmm0,%xmm2
4009ef: 0f c6 f9 ff shufps $0xff,%xmm1,%xmm7
4009f3: 0f c6 f1 09 shufps $0x9,%xmm1,%xmm6
4009f7: 0f c6 e9 64 shufps $0x64,%xmm1,%xmm5
4009fb: 0f 28 e0 movaps %xmm0,%xmm4
4009fe: 0f c6 d8 7f shufps $0x7f,%xmm0,%xmm3
400a02: 0f c6 d0 89 shufps $0x89,%xmm0,%xmm2
400a06: 0f c6 c9 92 shufps $0x92,%xmm1,%xmm1
400a0a: 0f c6 e0 12 shufps $0x12,%xmm0,%xmm4
400a0e: 0f 59 dd mulps %xmm5,%xmm3
400a11: 0f 59 d1 mulps %xmm1,%xmm2
400a14: 0f 28 0d 45 0f 20 00 movaps 0x200f45(%rip),%xmm1 # 601960 <_ZL9quat_mask>
400a1b: 0f 59 e6 mulps %xmm6,%xmm4
400a1e: 0f 59 c7 mulps %xmm7,%xmm0
400a21: 0f 57 d1 xorps %xmm1,%xmm2
400a24: 0f 57 d9 xorps %xmm1,%xmm3
400a27: 0f 5c c4 subps %xmm4,%xmm0
400a2a: 0f 58 d3 addps %xmm3,%xmm2
400a2d: 0f 58 c2 addps %xmm2,%xmm0
400a30: c3 retq
400a31: 66 66 66 66 66 66 2e nopw %cs:0x0(%rax,%rax,1)
400a38: 0f 1f 84 00 00 00 00
400a3f: 00
My questions are
1) I used a lot of variables in the mul routine and yet the compiler used only 8 of them. Are they that good with register allocation and lifetime analysis?
2) If yes to 1) then should one use lots of intermediate if fleeting variables in the core routines to help in compiler optimization.
3) The instructions seem to be nicely scheduled. Is it because I ordered my C code in that way or what?
4) I noticed that sse assembly goes the other way. IE destination register appears on the rhs of an instruction. Right or wrong?
5) any other tips while writing optimized sse code in general or improving it in particular.
Comments/criticisms/suggestions welcome.