From 923e5a34738a975a3c3833057feaaf302271f50f Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 20 Jun 2014 06:37:47 +0200
Subject: [PATCH] SSE2: Vectorize CVector

--HG--
branch : sse2
---
 code/nel/include/nel/misc/vector.h        |   4 +
 code/nel/include/nel/misc/vector_inline.h | 129 +++++++++++++++++++++-
 2 files changed, 131 insertions(+), 2 deletions(-)

diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index ebf56c048..6c1e27479 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -64,7 +64,11 @@ public:		// Methods.
 	/// Constructor .
 	CVector(float	_x, float _y, float _z) : x(_x), y(_y), z(_z) { /* if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); */ }
 	/// Copy Constructor.
+#ifdef NL_HAS_SSE2 // SSE2 build: copy the packed register member instead of three floats
+	CVector(const CVector &v) : mm(v.mm) { /* if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); */ }
+#else
 	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) { /* if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); */ }
+#endif // NL_HAS_SSE2
 	//@}
 
 	/// @name Base Maths.
diff --git a/code/nel/include/nel/misc/vector_inline.h b/code/nel/include/nel/misc/vector_inline.h
index 9f890f637..299e4c148 100644
--- a/code/nel/include/nel/misc/vector_inline.h
+++ b/code/nel/include/nel/misc/vector_inline.h
@@ -31,95 +31,196 @@ namespace	NLMISC
 // Base Maths.
 inline	CVector	&CVector::operator+=(const CVector &v)
 {
+#ifdef NL_HAS_SSE2
+	mm = _mm_add_ps(mm, v.mm); // packed add over all four lanes; the scalar path below only touches x, y, z
+#else
 	x+=v.x;
 	y+=v.y;
 	z+=v.z;
+#endif // NL_HAS_SSE2
 	return *this;
 }
 inline	CVector	&CVector::operator-=(const CVector &v)
 {
+#ifdef NL_HAS_SSE2
+	mm = _mm_sub_ps(mm, v.mm); // packed subtract over all four lanes; the scalar path below only touches x, y, z
+#else
 	x-=v.x;
 	y-=v.y;
 	z-=v.z;
+#endif // NL_HAS_SSE2
 	return *this;
 }
 inline	CVector	&CVector::operator*=(float f)
 {
+#ifdef NL_HAS_SSE2
+	mm = _mm_mul_ps(mm, _mm_set1_ps(f)); // broadcast f to every lane, then packed multiply
+#else
 	x*=f;
 	y*=f;
 	z*=f;
+#endif // NL_HAS_SSE2
 	return *this;
 }
 inline	CVector	&CVector::operator/=(float f)
 {
+#ifdef NL_HAS_SSE2
+	mm = _mm_mul_ps(mm, _mm_set1_ps(1.0f / f)); // multiply by the reciprocal, like the scalar path, rather than dividing per lane
+	return *this;
+#else
 	return *this*= (1.0f/f);
+#endif // NL_HAS_SSE2
 }
 inline	CVector	CVector::operator+(const CVector &v) const
 {
+#ifdef NL_HAS_SSE2
+	CVector ret; // default-constructed; mm is fully overwritten on the next line
+	ret.mm = _mm_add_ps(mm, v.mm); // packed add over all four lanes
+	return ret;
+#else
 	CVector	ret(x+v.x, y+v.y, z+v.z);
 	return ret;
+#endif // NL_HAS_SSE2
 }
 inline	CVector	CVector::operator-(const CVector &v) const
 {
+#ifdef NL_HAS_SSE2
+	CVector ret; // default-constructed; mm is fully overwritten on the next line
+	ret.mm = _mm_sub_ps(mm, v.mm); // packed subtract over all four lanes
+	return ret;
+#else
 	CVector	ret(x-v.x, y-v.y, z-v.z);
 	return ret;
+#endif // NL_HAS_SSE2
 }
 inline	CVector	CVector::operator*(float f) const
 {
+#ifdef NL_HAS_SSE2
+	CVector ret; // default-constructed; mm is fully overwritten on the next line
+	ret.mm = _mm_mul_ps(mm, _mm_set1_ps(f)); // broadcast f to every lane, then packed multiply
+	return ret;
+#else
 	CVector	ret(x*f, y*f, z*f);
 	return ret;
+#endif // NL_HAS_SSE2
 }
 inline	CVector	CVector::operator/(float f) const
 {
+#ifdef NL_HAS_SSE2
+	CVector ret; // default-constructed; mm is fully overwritten on the next line
+	ret.mm = _mm_mul_ps(mm, _mm_set1_ps(1.0f / f)); // multiply by the reciprocal, like the scalar path
+	return ret;
+#else
 	return *this*(1.0f/f);
+#endif // NL_HAS_SSE2
 }
 inline	CVector	CVector::operator-() const
 {
+#ifdef NL_HAS_SSE2
+	CVector ret; // default-constructed; mm is fully overwritten on the next line
+	ret.mm = _mm_mul_ps(mm, _mm_set1_ps(-1.0f)); // negate by multiplying every lane by -1.0f
+	return ret;
+#else
 	return CVector(-x,-y,-z);
+#endif // NL_HAS_SSE2
 }
 inline CVector	operator*(float f, const CVector &v)
 {
+#ifdef NL_HAS_SSE2
+	CVector ret; // default-constructed; mm is fully overwritten on the next line
+	ret.mm = _mm_mul_ps(_mm_set1_ps(f), v.mm); // broadcast f to every lane, then packed multiply
+	return ret;
+#else
 	CVector	ret(v.x*f, v.y*f, v.z*f);
 	return ret;
+#endif // NL_HAS_SSE2
 }
 
+#ifdef NL_HAS_SSE2
+inline __m128 dotsplat(const __m128 &l, const __m128 &r)
+{
+	// Dot product of lanes 0..2 of l and r, replicated into all four result lanes (lane 3 is ignored). TODO: _mm_hadd_ps SSE3
+	__m128 mult = _mm_mul_ps(l, r); // per-lane products
+	__m128 vx = _mm_shuffle_ps(mult, mult, _MM_SHUFFLE(0, 0, 0, 0)); // lane 0 product in every lane
+	__m128 vy = _mm_shuffle_ps(mult, mult, _MM_SHUFFLE(1, 1, 1, 1)); // lane 1 product in every lane
+	__m128 vz = _mm_shuffle_ps(mult, mult, _MM_SHUFFLE(2, 2, 2, 2)); // lane 2 product in every lane
+	__m128 result = _mm_add_ps(_mm_add_ps(vx, vy), vz); // (x + y) + z, identical in each lane
+	return result;
+}
+#endif // NL_HAS_SSE2
 
 // ============================================================================================
 // Advanced Maths.
 inline	float	CVector::operator*(const CVector &v) const
 {
+#ifdef NL_HAS_SSE2
+	return _mm_cvtss_f32(dotsplat(mm, v.mm)); // dotsplat puts the dot product in every lane; take lane 0
+#else
 	return x*v.x + y*v.y + z*v.z;
+#endif // NL_HAS_SSE2
 }
 inline	CVector	CVector::operator^(const CVector &v) const
 {
+#ifdef NL_HAS_SSE2
+	CVector ret; // default-constructed; mm is fully overwritten below
+	__m128 l = _mm_shuffle_ps(mm, mm, _MM_SHUFFLE(3, 0, 2, 1)); // (y, z, x, lane3)
+	__m128 r = _mm_shuffle_ps(v.mm, v.mm, _MM_SHUFFLE(3, 1, 0, 2)); // (v.z, v.x, v.y, v lane3)
+	__m128 mul1 = _mm_mul_ps(l, r); // (y*v.z, z*v.x, x*v.y, ...)
+	l = _mm_shuffle_ps(mm, mm, _MM_SHUFFLE(3, 1, 0, 2)); // (z, x, y, lane3)
+	r = _mm_shuffle_ps(v.mm, v.mm, _MM_SHUFFLE(3, 0, 2, 1)); // (v.y, v.z, v.x, v lane3)
+	__m128 mul2 = _mm_mul_ps(l, r); // (z*v.y, x*v.z, y*v.x, ...)
+	ret.mm = _mm_sub_ps(mul1, mul2); // lanes 0..2 match the scalar formulas below; lane 3 is lane3*v.lane3 minus itself
+	return ret;
+#else
 	CVector	ret;
-
 	ret.x= y*v.z - z*v.y;
 	ret.y= z*v.x - x*v.z;
 	ret.z= x*v.y - y*v.x;
-
 	return ret;
+#endif // NL_HAS_SSE2
 }
 inline	float	CVector::sqrnorm() const
 {
+#ifdef NL_HAS_SSE2
+	return _mm_cvtss_f32(dotsplat(mm, mm)); // dot product of the vector with itself, read from lane 0
+#else
 	return (float)(x*x + y*y + z*z);
+#endif // NL_HAS_SSE2
 }
 inline	float	CVector::norm() const
 {
+#ifdef NL_HAS_SSE2
+	return sqrt(_mm_cvtss_f32(dotsplat(mm, mm))); // scalar sqrt of the squared norm taken from lane 0
+#else
 	return (float)sqrt(x*x + y*y + z*z);
+#endif // NL_HAS_SSE2
 }
 inline	void	CVector::normalize()
 {
+#ifdef NL_HAS_SSE2
+	__m128 normsplat = _mm_sqrt_ps(dotsplat(mm, mm)); // norm replicated in every lane
+	if (_mm_cvtss_f32(normsplat)) // non-zero norm only: a zero-length vector is left untouched
+		mm = _mm_div_ps(mm, normsplat); // packed divide of every lane by the norm
+#else
 	float	n=norm();
 	if(n)
 		*this/=n;
+#endif // NL_HAS_SSE2
 }
 inline	CVector	CVector::normed() const
 {
+#ifdef NL_HAS_SSE2
+	CVector res(*this); // start from a copy so a zero-length vector is returned unchanged, exactly like the scalar path
+	__m128 normsplat = _mm_sqrt_ps(dotsplat(mm, mm)); // norm replicated in every lane
+	if (_mm_cvtss_f32(normsplat)) // non-zero norm only: otherwise keep the copy made above
+		res.mm = _mm_div_ps(mm, normsplat); // packed divide of every lane by the norm
+	return res;
+#else
 	CVector	ret;
 	ret= *this;
 	ret.normalize();
 	return ret;
+#endif // NL_HAS_SSE2
 }
 
 
@@ -127,19 +228,35 @@ inline	CVector	CVector::normed() const
 // Misc.
 inline	void	CVector::set(float _x, float _y, float _z)
 {
+#ifdef NL_HAS_SSE2
+	mm = _mm_setr_ps(_x, _y, _z, 0.0f); // setr takes arguments in lane order 0..3; lane 3 is written as 0
+#else
 	x=_x; y=_y; z=_z;
+#endif // NL_HAS_SSE2
 }
 inline	bool	CVector::operator==(const CVector &v) const
 {
+#ifdef NL_HAS_SSE2
+	return (_mm_movemask_ps(_mm_cmpeq_ps(mm, v.mm)) & 0x07) == 0x07; // mask 0x07 keeps lanes 0..2 only; all three must compare equal
+#else
 	return x==v.x && y==v.y && z==v.z;
+#endif // NL_HAS_SSE2
 }
 inline	bool	CVector::operator!=(const CVector &v) const
 {
+#ifdef NL_HAS_SSE2
+	return (_mm_movemask_ps(_mm_cmpneq_ps(mm, v.mm)) & 0x07) != 0; // true if any of lanes 0..2 differs; lane 3 is masked out
+#else
 	return !(*this==v);
+#endif // NL_HAS_SSE2
 }
 inline	bool	CVector::isNull() const
 {
+#ifdef NL_HAS_SSE2
+	return (_mm_movemask_ps(_mm_cmpeq_ps(mm, _mm_setzero_ps())) & 0x07) == 0x07; // lanes 0..2 all equal 0; presumably CVector::Null is the zero vector -- confirm
+#else
 	return *this==CVector::Null;
+#endif // NL_HAS_SSE2
 }
 inline	bool	CVector::operator<(const CVector &v) const
 {
@@ -177,15 +294,23 @@ inline	void	CVector::sphericToCartesian(float r, float theta,float phi)
 }
 inline	void	CVector::minof(const CVector &a, const CVector &b)
 {
+#ifdef NL_HAS_SSE2
+	mm = _mm_min_ps(b.mm, a.mm); // operands swapped on purpose: MINPS yields its 2nd operand on NaN or equal lanes, matching std::min(a, b) returning a
+#else
 	x= std::min(a.x, b.x);
 	y= std::min(a.y, b.y);
 	z= std::min(a.z, b.z);
+#endif // NL_HAS_SSE2
 }
 inline	void	CVector::maxof(const CVector &a, const CVector &b)
 {
+#ifdef NL_HAS_SSE2
+	mm = _mm_max_ps(b.mm, a.mm); // operands swapped on purpose: MAXPS yields its 2nd operand on NaN or equal lanes, matching std::max(a, b) returning a
+#else
 	x= std::max(a.x, b.x);
 	y= std::max(a.y, b.y);
 	z= std::max(a.z, b.z);
+#endif // NL_HAS_SSE2
 }
 inline	void	CVector::serial(IStream &f)
 {