SSE2: Vectorize CMatrix3x4

--HG--
branch : sse2
hg/feature/sse2
kaetemi 11 years ago
parent a73faa4115
commit 267e544bc4

@ -57,57 +57,127 @@ public:
void set(const CMatrix &mat) void set(const CMatrix &mat)
{ {
const float *m = mat.get(); const float *m = mat.get();
#ifdef NL_HAS_SSE2
mm1 = _mm_load_ps(&m[0]);
mm2 = _mm_load_ps(&m[4]);
mm3 = _mm_load_ps(&m[8]);
mm4 = _mm_load_ps(&m[12]);
#else
a11= m[0]; a12= m[4]; a13= m[8] ; a14= m[12]; a11= m[0]; a12= m[4]; a13= m[8] ; a14= m[12];
a21= m[1]; a22= m[5]; a23= m[9] ; a24= m[13]; a21= m[1]; a22= m[5]; a23= m[9] ; a24= m[13];
a31= m[2]; a32= m[6]; a33= m[10]; a34= m[14]; a31= m[2]; a32= m[6]; a33= m[10]; a34= m[14];
#endif
} }
// mulSetvector. NB: in should be different as v!! (else don't work). // mulSetvector. NB: in should be different as v!! (else don't work).
void mulSetVector(const CVector &in, CVector &out) void mulSetVector(const CVector &in, CVector &out)
{ {
#ifdef NL_HAS_SSE2
__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0));
__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1));
__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2));
out.mm = _mm_mul_ps(mm1, xxx);
out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm2, yyy));
out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm3, zzz));
#else
out.x= (a11*in.x + a12*in.y + a13*in.z); out.x= (a11*in.x + a12*in.y + a13*in.z);
out.y= (a21*in.x + a22*in.y + a23*in.z); out.y= (a21*in.x + a22*in.y + a23*in.z);
out.z= (a31*in.x + a32*in.y + a33*in.z); out.z= (a31*in.x + a32*in.y + a33*in.z);
#endif
} }
// mulSetpoint. NB: in should be different as v!! (else don't work). // mulSetpoint. NB: in should be different as v!! (else don't work).
void mulSetPoint(const CVector &in, CVector &out) void mulSetPoint(const CVector &in, CVector &out)
{ {
#ifdef NL_HAS_SSE2
__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0));
__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1));
__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2));
out.mm = _mm_mul_ps(mm1, xxx);
out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm2, yyy));
out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm3, zzz));
out.mm = _mm_add_ps(out.mm, mm4);
#else
out.x= (a11*in.x + a12*in.y + a13*in.z + a14); out.x= (a11*in.x + a12*in.y + a13*in.z + a14);
out.y= (a21*in.x + a22*in.y + a23*in.z + a24); out.y= (a21*in.x + a22*in.y + a23*in.z + a24);
out.z= (a31*in.x + a32*in.y + a33*in.z + a34); out.z= (a31*in.x + a32*in.y + a33*in.z + a34);
#endif
} }
// mulSetvector. NB: in should be different as v!! (else don't work). // mulSetvector. NB: in should be different as v!! (else don't work).
void mulSetVector(const CVector &in, float scale, CVector &out) void mulSetVector(const CVector &in, float scale, CVector &out)
{ {
#ifdef NL_HAS_SSE2
__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0));
__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1));
__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2));
out.mm = _mm_mul_ps(mm1, xxx);
out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm2, yyy));
out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm3, zzz));
out.mm = _mm_mul_ps(out.mm, _mm_set1_ps(scale));
#else
out.x= (a11*in.x + a12*in.y + a13*in.z) * scale; out.x= (a11*in.x + a12*in.y + a13*in.z) * scale;
out.y= (a21*in.x + a22*in.y + a23*in.z) * scale; out.y= (a21*in.x + a22*in.y + a23*in.z) * scale;
out.z= (a31*in.x + a32*in.y + a33*in.z) * scale; out.z= (a31*in.x + a32*in.y + a33*in.z) * scale;
#endif
} }
// mulSetpoint. NB: in should be different as v!! (else don't work). // mulSetpoint. NB: in should be different as v!! (else don't work).
void mulSetPoint(const CVector &in, float scale, CVector &out) void mulSetPoint(const CVector &in, float scale, CVector &out)
{ {
#ifdef NL_HAS_SSE2
__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0));
__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1));
__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2));
out.mm = _mm_mul_ps(mm1, xxx);
out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm2, yyy));
out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm3, zzz));
out.mm = _mm_add_ps(out.mm, mm4);
out.mm = _mm_mul_ps(out.mm, _mm_set1_ps(scale));
#else
out.x= (a11*in.x + a12*in.y + a13*in.z + a14) * scale; out.x= (a11*in.x + a12*in.y + a13*in.z + a14) * scale;
out.y= (a21*in.x + a22*in.y + a23*in.z + a24) * scale; out.y= (a21*in.x + a22*in.y + a23*in.z + a24) * scale;
out.z= (a31*in.x + a32*in.y + a33*in.z + a34) * scale; out.z= (a31*in.x + a32*in.y + a33*in.z + a34) * scale;
#endif
} }
// mulAddvector. NB: in should be different as v!! (else don't work). // mulAddvector. NB: in should be different as v!! (else don't work).
void mulAddVector(const CVector &in, float scale, CVector &out) void mulAddVector(const CVector &in, float scale, CVector &out)
{ {
#ifdef NL_HAS_SSE2
__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0));
__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1));
__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2));
__m128 temp = _mm_mul_ps(mm1, xxx);
temp = _mm_add_ps(temp, _mm_mul_ps(mm2, yyy));
temp = _mm_add_ps(temp, _mm_mul_ps(mm3, zzz));
temp = _mm_mul_ps(temp, _mm_set1_ps(scale));
out.mm = _mm_add_ps(out.mm, temp);
#else
out.x+= (a11*in.x + a12*in.y + a13*in.z) * scale; out.x+= (a11*in.x + a12*in.y + a13*in.z) * scale;
out.y+= (a21*in.x + a22*in.y + a23*in.z) * scale; out.y+= (a21*in.x + a22*in.y + a23*in.z) * scale;
out.z+= (a31*in.x + a32*in.y + a33*in.z) * scale; out.z+= (a31*in.x + a32*in.y + a33*in.z) * scale;
#endif
} }
// mulAddpoint. NB: in should be different as v!! (else don't work). // mulAddpoint. NB: in should be different as v!! (else don't work).
void mulAddPoint(const CVector &in, float scale, CVector &out) void mulAddPoint(const CVector &in, float scale, CVector &out)
{ {
#ifdef NL_HAS_SSE2
__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0));
__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1));
__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2));
__m128 temp = _mm_mul_ps(mm1, xxx);
temp = _mm_add_ps(temp, _mm_mul_ps(mm2, yyy));
temp = _mm_add_ps(temp, _mm_mul_ps(mm3, zzz));
temp = _mm_add_ps(temp, mm4);
temp = _mm_mul_ps(temp, _mm_set1_ps(scale));
out.mm = _mm_add_ps(out.mm, temp);
#else
out.x+= (a11*in.x + a12*in.y + a13*in.z + a14) * scale; out.x+= (a11*in.x + a12*in.y + a13*in.z + a14) * scale;
out.y+= (a21*in.x + a22*in.y + a23*in.z + a24) * scale; out.y+= (a21*in.x + a22*in.y + a23*in.z + a24) * scale;
out.z+= (a31*in.x + a32*in.y + a33*in.z + a34) * scale; out.z+= (a31*in.x + a32*in.y + a33*in.z + a34) * scale;
#endif
} }

Loading…
Cancel
Save