SSE2: Initial testing implementation

--HG-- branch : sse2
11 years ago · 5f54f75802
parent 0b4e64f0c9
commit 5f54f75802
35 changed files with 307 additions and 57 deletions
--- a/code/CMakeLists.txt
+++ b/code/CMakeLists.txt
@ -131,6 +131,10 @@ IF(FINAL_VERSION)
  ADD_DEFINITIONS(-DFINAL_VERSION=1)
 ENDIF(FINAL_VERSION)

+IF(WITH_SSE2)
+  ADD_DEFINITIONS(-DUSE_SSE2)
+ENDIF(WITH_SSE2)
+
 IF(WITH_QT)
  FIND_PACKAGE(Qt4 COMPONENTS QtCore QtGui QtXml QtOpenGL REQUIRED)
 ENDIF(WITH_QT)
--- a/code/CMakeModules/nel.cmake
+++ b/code/CMakeModules/nel.cmake
@ -324,6 +324,8 @@ MACRO(NL_SETUP_NEL_DEFAULT_OPTIONS)
  OPTION(WITH_LIBOVR              "With LibOVR support"                           OFF)
  OPTION(WITH_LIBVR               "With LibVR support"                            OFF)
  OPTION(WITH_PERFHUD             "With NVIDIA PerfHUD support"                   OFF)
+
+  OPTION(WITH_SSE2                "With SSE2"                                     ON )
 ENDMACRO(NL_SETUP_NEL_DEFAULT_OPTIONS)

 MACRO(NL_SETUP_NELNS_DEFAULT_OPTIONS)
--- a/code/nel/include/nel/3d/computed_string.h
+++ b/code/nel/include/nel/3d/computed_string.h
@ -290,7 +290,7 @@ public:
 	 * \param matrix transformation matrix
 	 * \param hotspot position of string origine
 	 */
-	void render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot = MiddleMiddle);
+	void render3D (IDriver& driver, const CMatrix &matrix, THotSpot hotspot = MiddleMiddle);

 };

--- a/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h
@ -33,14 +33,14 @@ namespace NL3D {
  */

 template <class T>
-inline T PSBinOpModulate(T arg1, T arg2) { return arg1 * arg2; }
+inline T PSBinOpModulate(const T &arg1, const T &arg2) { return arg1 * arg2; }
 template <class T>
-inline T PSBinOpAdd(T arg1, T arg2) { return arg1 + arg2; }
+inline T PSBinOpAdd(const T &arg1, const T &arg2) { return arg1 + arg2; }
 template <class T>
-inline T PSBinOpSubtract(T arg1, T arg2) { return arg1 - arg2; }
+inline T PSBinOpSubtract(const T &arg1, const T &arg2) { return arg1 - arg2; }

 template <>
-inline CPlaneBasis PSBinOpModulate(CPlaneBasis p1, CPlaneBasis p2)
+inline CPlaneBasis PSBinOpModulate(const CPlaneBasis &p1, const CPlaneBasis &p2)
 {
 	// we compute p1 * p2
 	NLMISC::CVector z = p1.X ^ p1.Y;
@ -57,13 +57,13 @@ inline CPlaneBasis PSBinOpModulate(CPlaneBasis p1, CPlaneBasis p2)

 }
 template <>
-inline CPlaneBasis PSBinOpAdd(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)
+inline CPlaneBasis PSBinOpAdd(const CPlaneBasis &/* p1 */, const CPlaneBasis &/* p2 */)
 {
 	nlassert(0); // not allowed for now
 	return CPlaneBasis(NLMISC::CVector::Null);
 }
 template <>
-inline CPlaneBasis PSBinOpSubtract(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)
+inline CPlaneBasis PSBinOpSubtract(const CPlaneBasis &/* p1 */, const CPlaneBasis &/* p2 */)
 {
 	nlassert(0); // not allowed for now
 	return CPlaneBasis(NLMISC::CVector::Null);
@ -71,21 +71,21 @@ inline CPlaneBasis PSBinOpSubtract(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)


 template <>
-inline uint32 PSBinOpSubtract(uint32 lhs, uint32 rhs)
+inline uint32 PSBinOpSubtract(const uint32 &lhs, const uint32 &rhs)
 {
 	return rhs > lhs ? 0 : lhs - rhs; // avoid overflow
 }


 template <>
-inline NLMISC::CRGBA PSBinOpModulate(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
+inline NLMISC::CRGBA PSBinOpModulate(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
 {
 	NLMISC::CRGBA result;
 	result.modulateFromColor(t1, t2);
 	return result;
 }
 template <>
-inline NLMISC::CRGBA PSBinOpAdd(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
+inline NLMISC::CRGBA PSBinOpAdd(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
 {
 	NLMISC::CRGBA r;
 	uint S = t1.R + t2.R; if (S > 255) S = 255; r.R = (uint8) S;
@ -94,7 +94,7 @@ inline NLMISC::CRGBA PSBinOpAdd(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
 	return r;
 }
 template <>
-inline NLMISC::CRGBA PSBinOpSubtract(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
+inline NLMISC::CRGBA PSBinOpSubtract(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
 {
 	NLMISC::CRGBA r;
 	sint S = t1.R - t2.R; if (S < 0) S = 0; r.R = (uint8) S;
--- a/code/nel/include/nel/3d/ps_attrib_maker_helper.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_helper.h
@ -1432,7 +1432,7 @@ public:
 	  *
 	  */

-    virtual void setDefaultValue(T defaultValue) { _DefaultValue = defaultValue;}
+    virtual void setDefaultValue(const T &defaultValue) { _DefaultValue = defaultValue;}

 	/// get the default value :
 	virtual T getDefaultValue(void) const { return _DefaultValue; }
--- a/code/nel/include/nel/3d/ps_attrib_maker_template.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_template.h
@ -68,7 +68,7 @@ template <typename T> struct CPSValueBlendFuncBase
 {
 	virtual ~CPSValueBlendFuncBase() {}
 	virtual void getValues(T &startValue, T &endValue) const = 0;
-	virtual void setValues(T startValue, T endValue) = 0;
+	virtual void setValues(const T &startValue, const T &endValue) = 0;
 };


@ -122,7 +122,7 @@ public:
 		}

 		/// Set the Values between which to blend.
-		virtual void setValues(T startValue, T endValue)
+		virtual void setValues(const T &startValue, const T &endValue)
 		{
 			_StartValue = startValue;
 			_EndValue = endValue;
@ -210,7 +210,7 @@ public:

 	/// set the Values

-	virtual void setValues(T startValue, T endValue)
+	virtual void setValues(const T &startValue, const T &endValue)
 	{
 		float step = 1.f / n;
 		float alpha = 0.0f;
--- a/code/nel/include/nel/3d/ps_color.h
+++ b/code/nel/include/nel/3d/ps_color.h
@ -62,7 +62,7 @@ public:
 		endValue = convertVBColor(endValue, _ColorType);

 	}
-	virtual void setValues(NLMISC::CRGBA startValue, NLMISC::CRGBA endValue)
+	virtual void setValues(const NLMISC::CRGBA &startValue, const NLMISC::CRGBA &endValue)
 	{
 		CPSValueBlendFunc<NLMISC::CRGBA>::setValues(convertVBColor(startValue, _ColorType), convertVBColor(endValue, _ColorType));
 	}
@ -96,7 +96,7 @@ public:
 		endValue = convertVBColor(endValue, _ColorType);

 	}
-	virtual void setValues(NLMISC::CRGBA startValue, NLMISC::CRGBA endValue)
+	virtual void setValues(const NLMISC::CRGBA &startValue, const NLMISC::CRGBA &endValue)
 	{
 		CPSValueBlendSampleFunc<NLMISC::CRGBA, RGBA_BLENDER_NUM_VALUES>::setValues(convertVBColor(startValue, _ColorType), convertVBColor(endValue, _ColorType));
 	}
--- a/code/nel/include/nel/3d/ps_edit.h
+++ b/code/nel/include/nel/3d/ps_edit.h
@ -82,7 +82,7 @@ struct IPSMover
 	virtual NLMISC::CVector			getNormal(uint32 /* index */) { NL_PS_FUNC(getNormal); return NLMISC::CVector::Null ; }

 	/// if the object only stores a normal, this set the normal of the object. Otherwise it has no effect
-	virtual void					setNormal(uint32 /* index */, NLMISC::CVector /* n */) { NL_PS_FUNC(setNormal); }
+	virtual void					setNormal(uint32 /* index */, const NLMISC::CVector &/* n */) { NL_PS_FUNC(setNormal); }

 	// set a new orthogonal matrix for the object
 	virtual void					setMatrix(uint32 index, const NLMISC::CMatrix &m) = 0 ;
--- a/code/nel/include/nel/3d/ps_force.h
+++ b/code/nel/include/nel/3d/ps_force.h
@ -741,7 +741,7 @@ public:
 	virtual NLMISC::CVector getScale(uint32 k) const { return NLMISC::CVector(_Radius[k], _Radius[k], _Radius[k]); }
 	virtual bool onlyStoreNormal(void) const { return true; }
 	virtual NLMISC::CVector getNormal(uint32 index) { return _Normal[index]; }
-	virtual void setNormal(uint32 index, NLMISC::CVector n) { _Normal[index] = n; }
+	virtual void setNormal(uint32 index, const NLMISC::CVector &n) { _Normal[index] = n; }

 	virtual void setMatrix(uint32 index, const NLMISC::CMatrix &m);
 	virtual NLMISC::CMatrix getMatrix(uint32 index) const;
--- a/code/nel/include/nel/3d/ps_located.h
+++ b/code/nel/include/nel/3d/ps_located.h
@ -613,7 +613,7 @@ public:
 	struct CParametricInfo
 	{
 		CParametricInfo() {}
-		CParametricInfo(NLMISC::CVector pos, NLMISC::CVector speed, float date)
+		CParametricInfo(const NLMISC::CVector &pos, const NLMISC::CVector &speed, float date)
 			: Pos(pos), Speed(speed), Date(date)
 		{
 		}
--- a/code/nel/include/nel/3d/ps_zone.h
+++ b/code/nel/include/nel/3d/ps_zone.h
@ -153,7 +153,7 @@ class CPSZonePlane : public CPSZone, public IPSMover
 		virtual NLMISC::CMatrix getMatrix(uint32 index) const;
 		virtual bool onlyStoreNormal(void) const { return true; }
 		virtual NLMISC::CVector getNormal(uint32 index);
-		virtual void setNormal(uint32 index, NLMISC::CVector n);
+		virtual void setNormal(uint32 index, const NLMISC::CVector &n);

 		virtual void serial(NLMISC::IStream &f) throw(NLMISC::EStream);

@ -255,7 +255,7 @@ class CPSZoneDisc : public CPSZone, public IPSMover
 		virtual NLMISC::CVector getScale(uint32 k) const;
 		virtual bool onlyStoreNormal(void) const { return true; }
 		virtual NLMISC::CVector getNormal(uint32 index);
-		virtual void setNormal(uint32 index, NLMISC::CVector n);
+		virtual void setNormal(uint32 index, const NLMISC::CVector &n);

 		virtual void serial(NLMISC::IStream &f) throw(NLMISC::EStream);

--- a/code/nel/include/nel/3d/quad_grid.h
+++ b/code/nel/include/nel/3d/quad_grid.h
@ -314,11 +314,11 @@ private:// Methods.
 	}

 	// return the coordinates on the grid of what include the bbox.
-	void		selectQuads(CVector bmin, CVector bmax, sint &x0, sint &x1, sint &y0, sint &y1)
+	void		selectQuads(const CVector &bminp, const CVector &bmaxp, sint &x0, sint &x1, sint &y0, sint &y1)
 	{
-		CVector		bminp, bmaxp;
-		bminp= bmin;
-		bmaxp= bmax;
+		CVector		bmin, bmax;
+		bmin= bminp;
+		bmax= bmaxp;
 		bmin.minof(bminp, bmaxp);
 		bmax.maxof(bminp, bmaxp);
 		bmin/= _EltSize;
--- a/code/nel/include/nel/3d/shadow_poly_receiver.h
+++ b/code/nel/include/nel/3d/shadow_poly_receiver.h
@ -85,10 +85,27 @@ public:
 	// a vertex
 	struct CRGBAVertex
 	{
+#if USE_SSE2
+		float X, Y, Z;
+#else
 		CVector V;
+#endif
 		CRGBA Color;
 		CRGBAVertex() {}
-		CRGBAVertex(const CVector &v, CRGBA c) : V(v), Color(c) {}
+#if USE_SSE2
+		CRGBAVertex(const CVector &v, CRGBA c) : X(v.x), Y(v.y), Z(v.z), Color(c) {}
+		const CVector &asVector() const // view the leading X/Y/Z floats in place as a CVector (no copy)
+		{
+			//nlctassert(sizeof(CVector) == sizeof(CRGBAVertex));
+			nlctassert(sizeof(CVector) + 4 == sizeof(CRGBAVertex)); // size check only; presumably the 4 extra bytes are Color -- confirm
+			return *reinterpret_cast<const CVector *>(this); // the original evaluated this expression but never returned it
+		}
+#else
+		const CVector &asVector() const
+		{
+			return V;
+		}
+#endif
 	};

 	/** Compute list of clipped tri under the shadow mat
--- a/code/nel/include/nel/3d/static_quad_grid.h
+++ b/code/nel/include/nel/3d/static_quad_grid.h
@ -102,8 +102,10 @@ private:// Atttributes.


 	// return the coordinates on the grid of what include the bbox.
-	void		selectPoint(CVector point, sint &x0, sint &y0)
+	void		selectPoint(const CVector &pointp, sint &x0, sint &y0)
 	{
+		CVector point = pointp;
+
 		point/= _EltSize;
 		x0= (sint)(floor(point.x));
 		y0= (sint)(floor(point.y));
--- a/code/nel/include/nel/3d/zone_lighter.h
+++ b/code/nel/include/nel/3d/zone_lighter.h
@ -421,7 +421,7 @@ private:
 	  * The vector of water shapes is released then
 	  * \param bbox the bbox of the zone containing the water shapes
 	  */
-	void makeQuadGridFromWaterShapes(NLMISC::CAABBox zoneBBox);
+	void makeQuadGridFromWaterShapes(const NLMISC::CAABBox &zoneBBox);


 	/** For each tile of the current zone, check whether it below or above water.
--- a/code/nel/include/nel/ligo/primitive.h
+++ b/code/nel/include/nel/ligo/primitive.h
@ -523,7 +523,7 @@ public:

 	std::vector<CPrimVector>	VPoints;

-	static float getSegmentDist(const NLMISC::CVector v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos);
+	static float getSegmentDist(const NLMISC::CVector &v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos);

 public:

--- a/code/nel/include/nel/misc/types_nl.h
+++ b/code/nel/include/nel/misc/types_nl.h
@ -328,6 +328,20 @@ typedef	unsigned	int			uint;			// at least 32bits (depend of processor)

 #endif // NL_OS_UNIX

+#define NL_DEFAULT_MEMORY_ALIGNMENT 16
+#ifdef NL_COMP_VC
+#define NL_ALIGN(nb) __declspec(align(nb))
+#else
+#define NL_ALIGN(nb) __attribute__((aligned(nb)))
+#endif
+
+#ifdef USE_SSE2
+extern void *operator new(size_t size) throw(std::bad_alloc);
+extern void *operator new[](size_t size) throw(std::bad_alloc);
+extern void operator delete(void *p) throw();
+extern void operator delete[](void *p) throw();
+#endif
+
 // CHashMap, CHashSet and CHashMultiMap definitions
 #if defined(_STLPORT_VERSION) // STLport detected
 #	include <hash_map>
--- a/code/nel/include/nel/pacs/chain_quad.h
+++ b/code/nel/include/nel/pacs/chain_quad.h
@ -81,7 +81,7 @@ public:
 	 * \param cst the array of CEdgeChainEntry to fill. contain also OChainLUT, an array for internal use. In: must be filled with 0xFFFF. Out: still filled with 0xFFFF.
 	 * \return number of edgechain found. stored in cst.EdgeChainEntries (array cleared first).
 	 */
-	sint			selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const;
+	sint			selectEdges(const CVector &start, const CVector &end, CCollisionSurfaceTemp &cst) const;


 	/// serial.
--- a/code/nel/include/nel/pacs/edge_quad.h
+++ b/code/nel/include/nel/pacs/edge_quad.h
@ -92,7 +92,7 @@ public:
 	 * \param cst the array of CExteriorEdgeEntry to fill. contain also OChainLUT, an array for internal use. In: must be filled with 0xFFFF. Out: still filled with 0xFFFF.
 	 * \return number of exterioredge found. stored in cst.ExteriorEdgeEntries (array cleared first).
 	 */
-	sint			selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const;
+	sint			selectEdges(const CVector &start, const CVector &end, CCollisionSurfaceTemp &cst) const;


 	/// Get the whole set of edge entries
--- a/code/nel/include/nel/pacs/local_retriever.h
+++ b/code/nel/include/nel/pacs/local_retriever.h
@ -548,12 +548,12 @@ public:
 	/**
 	 * Check all surfaces integrity
 	 */
-	bool								checkSurfacesIntegrity(NLMISC::CVector translation = NLMISC::CVector::Null, bool verbose = false) const;
+	bool								checkSurfacesIntegrity(const NLMISC::CVector &translation = NLMISC::CVector::Null, bool verbose = false) const;

 	/**
 	 * Check surface integrity
 	 */
-	bool								checkSurfaceIntegrity(uint surf, NLMISC::CVector translation = NLMISC::CVector::Null, bool verbose = false) const;
+	bool								checkSurfaceIntegrity(uint surf, const NLMISC::CVector &translation = NLMISC::CVector::Null, bool verbose = false) const;

 	// @}

@ -565,7 +565,7 @@ protected:
 	bool								insurePosition(ULocalPosition &local) const;

 	/// Retrieves a position inside the retriever (from the local position), returns true if the position is close to a border
-	void								retrievePosition(NLMISC::CVector estimated, CCollisionSurfaceTemp &cst) const;
+	void								retrievePosition(const NLMISC::CVector &estimated, CCollisionSurfaceTemp &cst) const;

 	/// Retrieves a position inside the retriever (from the local position), returns true if the position is close to a border
 	void								retrieveAccuratePosition(CVector2s estimated, CCollisionSurfaceTemp &cst, bool &onBorder) const;
--- a/code/nel/include/nel/pacs/quad_grid.h
+++ b/code/nel/include/nel/pacs/quad_grid.h
@ -187,11 +187,11 @@ private:// Atttributes.
 private:// Methods.

 	// return the coordinates on the grid of what include the bbox.
-	void		selectQuads(CVector bmin, CVector bmax, sint &x0, sint &x1, sint &y0, sint &y1)
+	void		selectQuads(const CVector &bminp, const CVector &bmaxp, sint &x0, sint &x1, sint &y0, sint &y1)
 	{
-		CVector		bminp, bmaxp;
-		bminp= bmin;
-		bmaxp= bmax;
+		CVector		bmin, bmax;
+		bmin= bminp;
+		bmax= bmaxp;
 		bmin.minof(bminp, bmaxp);
 		bmax.maxof(bminp, bmaxp);
 		bmin/= _EltSize;
--- a/code/nel/include/nel/sound/background_sound_manager.h
+++ b/code/nel/include/nel/sound/background_sound_manager.h
@ -270,7 +270,7 @@ private:
 		/// flag if inside a sound zone
 		bool				Inside;
 		/// Constructor.
-		TSoundStatus(TSoundData &sd, NLMISC::CVector position, float gain, float distance, bool inside)
+		TSoundStatus(TSoundData &sd, const NLMISC::CVector &position, float gain, float distance, bool inside)
 			: SoundData(sd), Position(position), Gain(gain), Distance(distance), Inside(inside)
 		{}
 	};
--- a/code/nel/src/3d/computed_string.cpp
+++ b/code/nel/src/3d/computed_string.cpp
@ -143,11 +143,13 @@ void CComputedString::render2D (IDriver& driver,
 /*------------------------------------------------------------------*\
 							render3D()
 \*------------------------------------------------------------------*/
-void CComputedString::render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot)
+void CComputedString::render3D (IDriver& driver,const CMatrix &matrixp,THotSpot hotspot)
 {
 	if (Vertices.getNumVertices() == 0)
 		return;

+	CMatrix matrix = matrixp;
+
 	// get window size
 	uint32	wndWidth, wndHeight;
 	driver.getWindowSize(wndWidth, wndHeight);
--- a/code/nel/src/3d/mesh_mrm_skin_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skin_template.cpp
@ -494,7 +494,7 @@ void	CMeshMRMGeom::applySkinWithTangentSpace(CLod &lod, const CSkeletonModel *sk
 	On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm
 	saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms)
 */
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2)
 //#define	NL3D_RAWSKIN_PRECACHE
 #define	NL3D_RAWSKIN_ASM
 #endif
--- a/code/nel/src/3d/mesh_mrm_skinned_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skinned_template.cpp
@ -43,7 +43,7 @@
 	On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm
 	saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms)
 */
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2)
 //#define	NL3D_RAWSKIN_PRECACHE
 #define	NL3D_RAWSKIN_ASM
 #endif
--- a/code/nel/src/3d/ps_zone.cpp
+++ b/code/nel/src/3d/ps_zone.cpp
@ -267,7 +267,7 @@ CVector CPSZonePlane::getNormal(uint32 index)
 	NL_PS_FUNC(CPSZonePlane_getNormal)
 	return _Normal[index];
 }
-void CPSZonePlane::setNormal(uint32 index, CVector n)
+void CPSZonePlane::setNormal(uint32 index, const CVector &n)
 {
 	NL_PS_FUNC(CPSZonePlane_setNormal)
 	_Normal[index] = n;
@ -576,7 +576,7 @@ CVector CPSZoneDisc::getNormal(uint32 index)
 	NL_PS_FUNC(CPSZoneDisc_getNormal)
 	return _Normal[index];
 }
-void CPSZoneDisc::setNormal(uint32 index, CVector n)
+void CPSZoneDisc::setNormal(uint32 index, const CVector &n)
 {
 	NL_PS_FUNC(CPSZoneDisc_setNormal)
 	_Normal[index] = n;
--- a/code/nel/src/3d/zone_lighter.cpp
+++ b/code/nel/src/3d/zone_lighter.cpp
@ -3109,7 +3109,7 @@ void CZoneLighter::addWaterShape(CWaterShape *shape, const NLMISC::CMatrix &MT)
 }

 // ***********************************************************
-void CZoneLighter::makeQuadGridFromWaterShapes(NLMISC::CAABBox zoneBBox)
+void CZoneLighter::makeQuadGridFromWaterShapes(const NLMISC::CAABBox &zoneBBox)
 {
 	if (!_WaterShapes.size()) return;

--- a/code/nel/src/ligo/primitive.cpp
+++ b/code/nel/src/ligo/primitive.cpp
@ -875,7 +875,7 @@ bool CPrimZone::contains (const NLMISC::CVector &v, const std::vector<CVector> &

 // ***************************************************************************

-float CPrimZone::getSegmentDist(const NLMISC::CVector v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos)
+float CPrimZone::getSegmentDist(const NLMISC::CVector &v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos)
 {
 	// two points, compute distance to the segment.
 	CVector V = (p2-p1).normed();
--- a/code/nel/src/misc/common.cpp
+++ b/code/nel/src/misc/common.cpp
@ -71,6 +71,61 @@ extern "C" long _ftol2( double dblSource ) { return _ftol( dblSource ); }
 #endif // NL_OS_WINDOWS


+#ifdef USE_SSE2
+// Replace the global allocation operators so every new/new[] block is NL_DEFAULT_MEMORY_ALIGNMENT-aligned.
+#	ifdef NL_COMP_VC
+// MSVC CRT: blocks obtained from _aligned_malloc must be released with _aligned_free.
+inline void *aligned_malloc(size_t size, size_t alignment)
+{
+	return _aligned_malloc(size, alignment);
+}
+
+inline void aligned_free(void *p)
+{
+	_aligned_free(p);
+}
+
+#	else
+// Other compilers: memalign() takes (alignment, size) and its blocks are released with plain free().
+inline void *aligned_malloc(size_t size, size_t alignment)
+{
+	return memalign(alignment, size);
+}
+
+inline void aligned_free(void *ptr)
+{
+	free(ptr);
+}
+
+#	endif /* NL_COMP_VC */
+
+void *operator new(size_t size) throw(std::bad_alloc)
+{
+	void *p = aligned_malloc(size ? size : 1, NL_DEFAULT_MEMORY_ALIGNMENT); // never request 0 bytes: new must not fail for size 0
+	if (p == NULL) throw std::bad_alloc();
+	return p;
+}
+
+void *operator new[](size_t size) throw(std::bad_alloc)
+{
+	void *p = aligned_malloc(size ? size : 1, NL_DEFAULT_MEMORY_ALIGNMENT); // never request 0 bytes: new[] must not fail for size 0
+	if (p == NULL) throw std::bad_alloc();
+	return p;
+}
+
+void operator delete(void *p) throw()
+{
+	aligned_free(p);
+}
+
+void operator delete[](void *p) throw()
+{
+	aligned_free(p);
+}
+
+#endif /* USE_SSE2 */
+
+
 #ifdef DEBUG_NEW
 	#define new DEBUG_NEW
 #endif
--- a/code/nel/src/misc/matrix.cpp
+++ b/code/nel/src/misc/matrix.cpp
@ -16,6 +16,11 @@

 #include "stdmisc.h"

+#if (USE_SSE2)
+#	include <xmmintrin.h>
+#	include <emmintrin.h>
+#endif
+
 #include "nel/misc/matrix.h"
 #include "nel/misc/plane.h"
 #include "nel/misc/debug.h"
@ -690,10 +695,86 @@ void		CMatrix::scale(const CVector &v)
 // ======================================================================================================
 // ======================================================================================================

+void		CMatrix::setMulMatrixSSE2(const CMatrix &m1, const CMatrix &m2) // fills all 16 floats of this->M from m1 and m2 using SSE
+{ // NOTE(review): defined outside any USE_SSE2 guard although the intrinsic headers are only included under it -- confirm non-SSE2 builds
+	m1.testExpandRot();
+	m1.testExpandProj();
+	m2.testExpandRot();
+	m2.testExpandProj();
+	// For i in 0..3: M[4i..4i+3] = sum over k of m1.M[4k..4k+3] * m2.M[4i+k]. m1 is fully loaded before the first store.
+	// Use exactly 8 __m128 values, matching the 8 XMM registers of 32-bit x86 (these are SSE registers, not MMX)
+	register __m128 in1a = _mm_loadu_ps(&m1.M[0]);
+	register __m128 in1b = _mm_loadu_ps(&m1.M[4]);
+	register __m128 in1c = _mm_loadu_ps(&m1.M[8]);
+	register __m128 in1d = _mm_loadu_ps(&m1.M[12]);
+	register __m128 in2;
+	register __m128 outrow;
+	register __m128 tempsplat;
+	register __m128 tempmul;
+	// Group 0: M[0..3], weighted by the four floats m2.M[0..3]
+	in2 = _mm_loadu_ps(&m2.M[0]);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+	outrow = _mm_mul_ps(in1a, tempsplat);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+	tempmul = _mm_mul_ps(in1b, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+	tempmul = _mm_mul_ps(in1c, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));
+	tempmul = _mm_mul_ps(in1d, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	_mm_storeu_ps(&M[0], outrow);
+	// Group 1: M[4..7], weighted by the four floats m2.M[4..7]
+	in2 = _mm_loadu_ps(&m2.M[4]);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+	outrow = _mm_mul_ps(in1a, tempsplat);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+	tempmul = _mm_mul_ps(in1b, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+	tempmul = _mm_mul_ps(in1c, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));
+	tempmul = _mm_mul_ps(in1d, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	_mm_storeu_ps(&M[4], outrow);
+	// Group 2: M[8..11], weighted by the four floats m2.M[8..11]
+	in2 = _mm_loadu_ps(&m2.M[8]);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+	outrow = _mm_mul_ps(in1a, tempsplat);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+	tempmul = _mm_mul_ps(in1b, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+	tempmul = _mm_mul_ps(in1c, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));
+	tempmul = _mm_mul_ps(in1d, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	_mm_storeu_ps(&M[8], outrow);
+	// Group 3: M[12..15], weighted by the four floats m2.M[12..15]
+	in2 = _mm_loadu_ps(&m2.M[12]);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+	outrow = _mm_mul_ps(in1a, tempsplat);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+	tempmul = _mm_mul_ps(in1b, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+	tempmul = _mm_mul_ps(in1c, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));
+	tempmul = _mm_mul_ps(in1d, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	_mm_storeu_ps(&M[12], outrow);
+}

 // ***************************************************************************
 void		CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
 {
+#if USE_SSE2
+	setMulMatrixSSE2(m1, m2);
+#else
 	/*
 	For a fast MulMatrix, it appears to be better to not take State bits into account (no test/if() overhead)
 	Just do heavy mul all the time (common case, and not so slow)
@ -720,6 +801,7 @@ void		CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
 	a14= m1.a11*m2.a14 + m1.a12*m2.a24 + m1.a13*m2.a34 + m1.a14;
 	a24= m1.a21*m2.a14 + m1.a22*m2.a24 + m1.a23*m2.a34 + m1.a24;
 	a34= m1.a31*m2.a14 + m1.a32*m2.a24 + m1.a33*m2.a34 + m1.a34;
+#endif

 	// Setup no proj at all, and force valid rot (still may be identity, but 0/1 are filled)
 	StateBit= (m1.StateBit | m2.StateBit | MAT_VALIDROT) & ~(MAT_PROJ|MAT_VALIDPROJ);
@ -737,6 +819,13 @@ void		CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
 void		CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
 {
 	// Do *this= m1*m2
+#ifdef USE_SSE2
+	setMulMatrixSSE2(m1, m2);
+	StateBit = m1.StateBit | m2.StateBit;
+	StateBit |= MAT_VALIDALL;
+	if (m1.hasTrans() && m2.hasProj())
+		StateBit |= MAT_ROT | MAT_SCALEANY;
+#else
 	identity();
 	StateBit= m1.StateBit | m2.StateBit;
 	StateBit&= ~MAT_VALIDALL;
@ -824,18 +913,22 @@ void		CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
 		a32+= m1.a34*m2.a42;
 		a33+= m1.a34*m2.a43;
 	}
+#endif

 	// Modify Scale.
 	if( (StateBit & MAT_SCALEUNI) && !(StateBit & MAT_SCALEANY) )
 	{
 		// Must have correct Scale33
+#ifndef USE_SSE2
 		m1.testExpandRot();
 		m2.testExpandRot();
+#endif
 		Scale33= m1.Scale33*m2.Scale33;
 	}
 	else
 		Scale33=1;

+#ifndef USE_SSE2
 	// In every case, I am valid now!
 	StateBit|=MAT_VALIDROT;

@ -902,6 +995,7 @@ void		CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
 	{
 		// Don't copy proj part, and leave MAT_VALIDPROJ not set
 	}
+#endif
 }
 // ======================================================================================================
 void		CMatrix::invert()
@ -1237,11 +1331,36 @@ bool		CMatrix::normalize(TRotOrder ro)
 // ======================================================================================================
 // ======================================================================================================

-
 // ======================================================================================================
 CVector		CMatrix::mulVector(const CVector &v) const
 {
-
+#ifdef USE_SSE2
+	if (hasRot())
+	{
+		float res[4]; // 16-byte landing area: a 4-float store must not target a CVector, which only holds x, y, z
+		register __m128 in1a = _mm_loadu_ps(&M[0]);
+		register __m128 in1b = _mm_loadu_ps(&M[4]);
+		register __m128 in1c = _mm_loadu_ps(&M[8]);
+		register __m128 in2 = _mm_set_ps(0.0f, v.z, v.y, v.x); // lanes 0..2 = x, y, z; built from members so nothing is read past v
+		register __m128 tempsplat;
+		register __m128 tempmul;
+		register __m128 out;
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+		out = _mm_mul_ps(in1a, tempsplat);
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+		tempmul = _mm_mul_ps(in1b, tempsplat);
+		out = _mm_add_ps(out, tempmul);
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+		tempmul = _mm_mul_ps(in1c, tempsplat);
+		out = _mm_add_ps(out, tempmul);
+		_mm_storeu_ps(res, out);
+		return CVector(res[0], res[1], res[2]); // lane 3 is discarded
+	}
+	else
+	{
+		return v;
+	}
+#else
 	CVector	ret;

 	if( hasRot() )
@ -1253,6 +1372,7 @@ CVector		CMatrix::mulVector(const CVector &v) const
 	}
 	else
 		return v;
+#endif
 }

 // ======================================================================================================
@ -1263,9 +1383,31 @@ CVector		CMatrix::mulPoint(const CVector &v) const

 	if( hasRot() )
 	{
+#ifdef USE_SSE2
+		register __m128 in1a = _mm_loadu_ps(&M[0]);
+		register __m128 in1b = _mm_loadu_ps(&M[4]);
+		register __m128 in1c = _mm_loadu_ps(&M[8]);
+		register __m128 in1d = _mm_loadu_ps(&M[12]);
+		register __m128 in2 = _mm_set_ps(0.0f, v.z, v.y, v.x); // lanes 0..2 = x, y, z; built from members so nothing is read past v
+		register __m128 tempsplat;
+		register __m128 tempmul;
+		register __m128 out;
+		float res[4]; // 16-byte landing area: a 4-float store must not target a CVector, which only holds x, y, z
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+		out = _mm_mul_ps(in1a, tempsplat);
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+		tempmul = _mm_mul_ps(in1b, tempsplat);
+		out = _mm_add_ps(out, tempmul);
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+		tempmul = _mm_mul_ps(in1c, tempsplat);
+		out = _mm_add_ps(_mm_add_ps(out, tempmul), in1d); // same summation order as before: add the third product, then M[12..15]
+		_mm_storeu_ps(res, out);
+		return CVector(res[0], res[1], res[2]); // lane 3 is discarded
+#else
 		ret.x= a11*v.x + a12*v.y + a13*v.z;
 		ret.y= a21*v.x + a22*v.y + a23*v.z;
 		ret.z= a31*v.x + a32*v.y + a33*v.z;
+#endif
 	}
 	else
 	{
--- a/code/nel/src/misc/polygon.cpp
+++ b/code/nel/src/misc/polygon.cpp
@ -249,7 +249,7 @@ public:
 		Back = NULL;
 		Front = NULL;
 	}
-	CBSPNode2v ( const CPlane &plane, CVector p0, CVector p1, uint v0, uint v1 ) : Plane (plane), P0 (p0), P1 (p1)
+	CBSPNode2v ( const CPlane &plane, const CVector &p0, const CVector &p1, uint v0, uint v1 ) : Plane (plane), P0 (p0), P1 (p1)
 	{
 		Back = NULL;
 		Front = NULL;
--- a/code/nel/src/pacs/chain_quad.cpp
+++ b/code/nel/src/pacs/chain_quad.cpp
@ -344,8 +344,11 @@ sint			CChainQuad::selectEdges(const NLMISC::CAABBox &bbox, CCollisionSurfaceTem
 	return nRes;
 }

-sint		CChainQuad::selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const
+sint		CChainQuad::selectEdges(const CVector &startp, const CVector &endp, CCollisionSurfaceTemp &cst) const
 {
+	CVector start = startp;
+	CVector end = endp;
+
 	sint	nRes=0;
 	sint	i;
 	uint16	*ochainLUT= cst.OChainLUT;
--- a/code/nel/src/pacs/edge_quad.cpp
+++ b/code/nel/src/pacs/edge_quad.cpp
@ -453,8 +453,11 @@ sint			CEdgeQuad::selectEdges(const NLMISC::CAABBox &bbox, CCollisionSurfaceTemp
 	return nRes;
 }

-sint		CEdgeQuad::selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const
+sint		CEdgeQuad::selectEdges(const CVector &startp, const CVector &endp, CCollisionSurfaceTemp &cst) const
 {
+	CVector start = startp;
+	CVector end = endp;
+
 	sint	nRes=0;
 	sint	i;
 	uint16	*indexLUT= cst.OChainLUT;
--- a/code/nel/src/pacs/local_retriever.cpp
+++ b/code/nel/src/pacs/local_retriever.cpp
@ -1052,7 +1052,7 @@ bool	NLPACS::CLocalRetriever::testPosition(NLPACS::ULocalPosition &local, CColli
 }


-void	NLPACS::CLocalRetriever::retrievePosition(CVector estimated, CCollisionSurfaceTemp &cst) const
+void	NLPACS::CLocalRetriever::retrievePosition(const CVector &estimated, CCollisionSurfaceTemp &cst) const
 {
 	if (!_Loaded)
 		return;
@ -2200,7 +2200,7 @@ void	NLPACS::CLocalRetriever::replaceChain(uint32 chainId, const std::vector<NLP
 /*
 * Check surface integrity
 */
-bool	NLPACS::CLocalRetriever::checkSurfacesIntegrity(NLMISC::CVector translation, bool verbose) const
+bool	NLPACS::CLocalRetriever::checkSurfacesIntegrity(const NLMISC::CVector &translation, bool verbose) const
 {
 	bool	success = true;
 	uint	surf;
@ -2225,7 +2225,7 @@ bool	NLPACS::CLocalRetriever::checkSurfacesIntegrity(NLMISC::CVector translation
 /**
 * Check surface integrity
 */
-bool	NLPACS::CLocalRetriever::checkSurfaceIntegrity(uint surf, NLMISC::CVector translation, bool verbose) const
+bool	NLPACS::CLocalRetriever::checkSurfaceIntegrity(uint surf, const NLMISC::CVector &translation, bool verbose) const
 {
 	if (surf >= _Surfaces.size())
 		return false;
--- a/code/ryzom/client/src/decal.cpp
+++ b/code/ryzom/client/src/decal.cpp
@ -433,10 +433,16 @@ void CDecal::renderTriCache(NL3D::IDriver &drv,   NL3D::CShadowPolyReceiver &/*
 			float bottomBlendBias = bottomBlendScale * (_RefPosition.z - _BottomBlendZMin);
 			do
 			{
+#if USE_SSE2
+				dest->X = srcVert->X;
+				dest->Y = srcVert->Y;
+				dest->Z = srcVert->Z;
+#else
 				dest->V = srcVert->V;
-				float dist = (camPos - srcVert->V).norm();
+#endif
+				float dist = (camPos - srcVert->asVector()).norm();
 				float intensity = scale * dist + bias;
-				float bottomBlend = srcVert->V.z * bottomBlendScale + bottomBlendBias;
+				float bottomBlend = srcVert->asVector().z * bottomBlendScale + bottomBlendBias;
 				clamp(bottomBlend, 0.f, 1.f);
 				clamp(intensity, 0.f, 255.f);
 				intensity *= bottomBlend;