From 267e544bc4850fd6344518bc6d79e36e1227f605 Mon Sep 17 00:00:00 2001
From: kaetemi
Date: Fri, 20 Jun 2014 06:10:12 +0200
Subject: [PATCH] SSE2: Vectorize CMatrix3x4

--HG--
branch : sse2
---
 code/nel/include/nel/3d/matrix_3x4.h | 72 +++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/code/nel/include/nel/3d/matrix_3x4.h b/code/nel/include/nel/3d/matrix_3x4.h
index 994ea555e..929aca0d8 100644
--- a/code/nel/include/nel/3d/matrix_3x4.h
+++ b/code/nel/include/nel/3d/matrix_3x4.h
@@ -56,58 +56,128 @@ public:
 	// Copy from a matrix.
 	void set(const CMatrix &mat)
 	{
-		const float *m =mat.get();
+		const float *m = mat.get();
+#ifdef NL_HAS_SSE2 // NOTE(review): _mm_load_ps requires 16-byte aligned addresses - confirm mat.get() guarantees that
+		mm1 = _mm_load_ps(&m[0]); // m[0..3]: lanes 0-2 are the a11, a21, a31 of the scalar path
+		mm2 = _mm_load_ps(&m[4]); // m[4..7]: lanes 0-2 are a12, a22, a32
+		mm3 = _mm_load_ps(&m[8]); // m[8..11]: lanes 0-2 are a13, a23, a33
+		mm4 = _mm_load_ps(&m[12]); // m[12..15]: lanes 0-2 are a14, a24, a34
+#else // scalar path: copies only the 12 coefficients, m[3], m[7], m[11], m[15] are not read
 		a11= m[0]; a12= m[4]; a13= m[8] ; a14= m[12];
 		a21= m[1]; a22= m[5]; a23= m[9] ; a24= m[13];
 		a31= m[2]; a32= m[6]; a33= m[10]; a34= m[14];
+#endif // NL_HAS_SSE2
 	}
 
 	// mulSetvector. NB: in should be different as v!! (else don't work).
 	void mulSetVector(const CVector &in, CVector &out)
 	{
+#ifdef NL_HAS_SSE2 // SSE2 path: out.mm = mm1*X + mm2*Y + mm3*Z, where X/Y/Z are lanes 0/1/2 of in.mm
+		__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0)); // lane 0 of in.mm in all four lanes (presumably in.x - confirm against CVector)
+		__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1)); // lane 1 of in.mm in all four lanes
+		__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2)); // lane 2 of in.mm in all four lanes
+		out.mm = _mm_mul_ps(mm1, xxx); // first write to out.mm; in.mm is not read again after this point
+		out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm2, yyy));
+		out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm3, zzz)); // NOTE(review): all four lanes of out.mm are written; the scalar path assigns only x/y/z
+#else // scalar path
 		out.x= (a11*in.x + a12*in.y + a13*in.z);
 		out.y= (a21*in.x + a22*in.y + a23*in.z);
 		out.z= (a31*in.x + a32*in.y + a33*in.z);
+#endif // NL_HAS_SSE2
 	}
 
 	// mulSetpoint. NB: in should be different as v!! (else don't work).
void mulSetPoint(const CVector &in, CVector &out)
 	{
+#ifdef NL_HAS_SSE2 // SSE2 path: out.mm = mm1*X + mm2*Y + mm3*Z + mm4, where X/Y/Z are lanes 0/1/2 of in.mm
+		__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0)); // lane 0 of in.mm in all four lanes (presumably in.x - confirm against CVector)
+		__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1)); // lane 1 of in.mm in all four lanes
+		__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2)); // lane 2 of in.mm in all four lanes
+		out.mm = _mm_mul_ps(mm1, xxx); // first write to out.mm; in.mm is not read again after this point
+		out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm2, yyy));
+		out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm3, zzz));
+		out.mm = _mm_add_ps(out.mm, mm4); // mm4 added last, like a14/a24/a34 in the scalar sums; NOTE(review): lane 3 of out.mm is written too
+#else // scalar path
 		out.x= (a11*in.x + a12*in.y + a13*in.z + a14);
 		out.y= (a21*in.x + a22*in.y + a23*in.z + a24);
 		out.z= (a31*in.x + a32*in.y + a33*in.z + a34);
+#endif // NL_HAS_SSE2
 	}
 
 	// mulSetvector. NB: in should be different as v!! (else don't work).
 	void mulSetVector(const CVector &in, float scale, CVector &out)
 	{
+#ifdef NL_HAS_SSE2 // SSE2 path: out.mm = (mm1*X + mm2*Y + mm3*Z) * scale, where X/Y/Z are lanes 0/1/2 of in.mm
+		__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0)); // lane 0 of in.mm in all four lanes (presumably in.x - confirm against CVector)
+		__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1)); // lane 1 of in.mm in all four lanes
+		__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2)); // lane 2 of in.mm in all four lanes
+		out.mm = _mm_mul_ps(mm1, xxx); // first write to out.mm; in.mm is not read again after this point
+		out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm2, yyy));
+		out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm3, zzz));
+		out.mm = _mm_mul_ps(out.mm, _mm_set1_ps(scale)); // scale replicated to all lanes and applied to the finished sum, as in the scalar path
+#else // scalar path
 		out.x= (a11*in.x + a12*in.y + a13*in.z) * scale;
 		out.y= (a21*in.x + a22*in.y + a23*in.z) * scale;
 		out.z= (a31*in.x + a32*in.y + a33*in.z) * scale;
+#endif // NL_HAS_SSE2
 	}
 
 	// mulSetpoint. NB: in should be different as v!! (else don't work).
void mulSetPoint(const CVector &in, float scale, CVector &out)
 	{
+#ifdef NL_HAS_SSE2 // SSE2 path: out.mm = (mm1*X + mm2*Y + mm3*Z + mm4) * scale, where X/Y/Z are lanes 0/1/2 of in.mm
+		__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0)); // lane 0 of in.mm in all four lanes (presumably in.x - confirm against CVector)
+		__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1)); // lane 1 of in.mm in all four lanes
+		__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2)); // lane 2 of in.mm in all four lanes
+		out.mm = _mm_mul_ps(mm1, xxx); // first write to out.mm; in.mm is not read again after this point
+		out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm2, yyy));
+		out.mm = _mm_add_ps(out.mm, _mm_mul_ps(mm3, zzz));
+		out.mm = _mm_add_ps(out.mm, mm4); // mm4 is added before scaling, so it is scaled as well (same as the scalar path)
+		out.mm = _mm_mul_ps(out.mm, _mm_set1_ps(scale)); // scale replicated to all four lanes
+#else // scalar path
 		out.x= (a11*in.x + a12*in.y + a13*in.z + a14) * scale;
 		out.y= (a21*in.x + a22*in.y + a23*in.z + a24) * scale;
 		out.z= (a31*in.x + a32*in.y + a33*in.z + a34) * scale;
+#endif // NL_HAS_SSE2
 	}
 
 	// mulAddvector. NB: in should be different as v!! (else don't work).
 	void mulAddVector(const CVector &in, float scale, CVector &out)
 	{
+#ifdef NL_HAS_SSE2 // SSE2 path: out.mm += (mm1*X + mm2*Y + mm3*Z) * scale, where X/Y/Z are lanes 0/1/2 of in.mm
+		__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0)); // lane 0 of in.mm in all four lanes (presumably in.x - confirm against CVector)
+		__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1)); // lane 1 of in.mm in all four lanes
+		__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2)); // lane 2 of in.mm in all four lanes
+		__m128 temp = _mm_mul_ps(mm1, xxx); // build the product in a local so the previous out.mm stays intact
+		temp = _mm_add_ps(temp, _mm_mul_ps(mm2, yyy));
+		temp = _mm_add_ps(temp, _mm_mul_ps(mm3, zzz));
+		temp = _mm_mul_ps(temp, _mm_set1_ps(scale)); // scale replicated to all four lanes
+		out.mm = _mm_add_ps(out.mm, temp); // only write to out.mm; NOTE(review): lane 3 is accumulated too, the scalar path touches only x/y/z
+#else // scalar path
 		out.x+= (a11*in.x + a12*in.y + a13*in.z) * scale;
 		out.y+= (a21*in.x + a22*in.y + a23*in.z) * scale;
 		out.z+= (a31*in.x + a32*in.y + a33*in.z) * scale;
+#endif // NL_HAS_SSE2
 	}
 
 	// mulAddpoint. NB: in should be different as v!! (else don't work).
void mulAddPoint(const CVector &in, float scale, CVector &out)
 	{
+#ifdef NL_HAS_SSE2 // SSE2 path: out.mm += (mm1*X + mm2*Y + mm3*Z + mm4) * scale, where X/Y/Z are lanes 0/1/2 of in.mm
+		__m128 xxx = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(0, 0, 0, 0)); // lane 0 of in.mm in all four lanes (presumably in.x - confirm against CVector)
+		__m128 yyy = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(1, 1, 1, 1)); // lane 1 of in.mm in all four lanes
+		__m128 zzz = _mm_shuffle_ps(in.mm, in.mm, _MM_SHUFFLE(2, 2, 2, 2)); // lane 2 of in.mm in all four lanes
+		__m128 temp = _mm_mul_ps(mm1, xxx); // build the product in a local so the previous out.mm stays intact
+		temp = _mm_add_ps(temp, _mm_mul_ps(mm2, yyy));
+		temp = _mm_add_ps(temp, _mm_mul_ps(mm3, zzz));
+		temp = _mm_add_ps(temp, mm4); // mm4 is added before scaling, so it is scaled as well (same as the scalar path)
+		temp = _mm_mul_ps(temp, _mm_set1_ps(scale)); // scale replicated to all four lanes
+		out.mm = _mm_add_ps(out.mm, temp); // only write to out.mm; NOTE(review): lane 3 is accumulated too, the scalar path touches only x/y/z
+#else // scalar path
 		out.x+= (a11*in.x + a12*in.y + a13*in.z + a14) * scale;
 		out.y+= (a21*in.x + a22*in.y + a23*in.z + a24) * scale;
 		out.z+= (a31*in.x + a32*in.y + a33*in.z + a34) * scale;
+#endif // NL_HAS_SSE2
 	}