From 5f54f75802910517b4248e632d3d8431092707d3 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Thu, 12 Jun 2014 21:57:34 +0200
Subject: [PATCH 01/21] SSE2: Initial testing implementation

--HG--
branch : sse2
---
 code/CMakeLists.txt                           |   4 +
 code/CMakeModules/nel.cmake                   |   2 +
 code/nel/include/nel/3d/computed_string.h     |   2 +-
 .../nel/3d/ps_attrib_maker_bin_op_inline.h    |  20 +--
 .../include/nel/3d/ps_attrib_maker_helper.h   |   2 +-
 .../include/nel/3d/ps_attrib_maker_template.h |   6 +-
 code/nel/include/nel/3d/ps_color.h            |   4 +-
 code/nel/include/nel/3d/ps_edit.h             |   2 +-
 code/nel/include/nel/3d/ps_force.h            |   2 +-
 code/nel/include/nel/3d/ps_located.h          |   2 +-
 code/nel/include/nel/3d/ps_zone.h             |   4 +-
 code/nel/include/nel/3d/quad_grid.h           |   8 +-
 .../nel/include/nel/3d/shadow_poly_receiver.h |  19 ++-
 code/nel/include/nel/3d/static_quad_grid.h    |   4 +-
 code/nel/include/nel/3d/zone_lighter.h        |   2 +-
 code/nel/include/nel/ligo/primitive.h         |   2 +-
 code/nel/include/nel/misc/types_nl.h          |  14 ++
 code/nel/include/nel/pacs/chain_quad.h        |   2 +-
 code/nel/include/nel/pacs/edge_quad.h         |   2 +-
 code/nel/include/nel/pacs/local_retriever.h   |   6 +-
 code/nel/include/nel/pacs/quad_grid.h         |   8 +-
 .../nel/sound/background_sound_manager.h      |   2 +-
 code/nel/src/3d/computed_string.cpp           |   4 +-
 code/nel/src/3d/mesh_mrm_skin_template.cpp    |   2 +-
 code/nel/src/3d/mesh_mrm_skinned_template.cpp |   2 +-
 code/nel/src/3d/ps_zone.cpp                   |   4 +-
 code/nel/src/3d/zone_lighter.cpp              |   2 +-
 code/nel/src/ligo/primitive.cpp               |   2 +-
 code/nel/src/misc/common.cpp                  |  55 +++++++
 code/nel/src/misc/matrix.cpp                  | 146 +++++++++++++++++-
 code/nel/src/misc/polygon.cpp                 |   2 +-
 code/nel/src/pacs/chain_quad.cpp              |   5 +-
 code/nel/src/pacs/edge_quad.cpp               |   5 +-
 code/nel/src/pacs/local_retriever.cpp         |   6 +-
 code/ryzom/client/src/decal.cpp               |  10 +-
 35 files changed, 307 insertions(+), 57 deletions(-)

diff --git a/code/CMakeLists.txt b/code/CMakeLists.txt
index 4f0439dfd..f2fb9ac81 100644
--- a/code/CMakeLists.txt
+++ b/code/CMakeLists.txt
@@ -131,6 +131,10 @@ IF(FINAL_VERSION)
   ADD_DEFINITIONS(-DFINAL_VERSION=1)
 ENDIF(FINAL_VERSION)
 
+IF(WITH_SSE2)
+  ADD_DEFINITIONS(-DUSE_SSE2)
+ENDIF(WITH_SSE2)
+
 IF(WITH_QT)
   FIND_PACKAGE(Qt4 COMPONENTS QtCore QtGui QtXml QtOpenGL REQUIRED)
 ENDIF(WITH_QT)
diff --git a/code/CMakeModules/nel.cmake b/code/CMakeModules/nel.cmake
index b194b5ff9..5a4002ed4 100644
--- a/code/CMakeModules/nel.cmake
+++ b/code/CMakeModules/nel.cmake
@@ -324,6 +324,8 @@ MACRO(NL_SETUP_NEL_DEFAULT_OPTIONS)
   OPTION(WITH_LIBOVR              "With LibOVR support"                           OFF)
   OPTION(WITH_LIBVR               "With LibVR support"                            OFF)
   OPTION(WITH_PERFHUD             "With NVIDIA PerfHUD support"                   OFF)
+
+  OPTION(WITH_SSE2                "With SSE2"                                     ON )
 ENDMACRO(NL_SETUP_NEL_DEFAULT_OPTIONS)
 
 MACRO(NL_SETUP_NELNS_DEFAULT_OPTIONS)
diff --git a/code/nel/include/nel/3d/computed_string.h b/code/nel/include/nel/3d/computed_string.h
index fcb758da4..517200383 100644
--- a/code/nel/include/nel/3d/computed_string.h
+++ b/code/nel/include/nel/3d/computed_string.h
@@ -290,7 +290,7 @@ public:
 	 * \param matrix transformation matrix
 	 * \param hotspot position of string origine
 	 */
-	void render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot = MiddleMiddle);
+	void render3D (IDriver& driver, const CMatrix &matrix, THotSpot hotspot = MiddleMiddle);
 
 };
 
diff --git a/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h b/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h
index 2a9cbff45..0070ffb38 100644
--- a/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h
@@ -33,14 +33,14 @@ namespace NL3D {
   */
 
 template <class T>
-inline T PSBinOpModulate(T arg1, T arg2) { return arg1 * arg2; }
+inline T PSBinOpModulate(const T &arg1, const T &arg2) { return arg1 * arg2; }
 template <class T>
-inline T PSBinOpAdd(T arg1, T arg2) { return arg1 + arg2; }
+inline T PSBinOpAdd(const T &arg1, const T &arg2) { return arg1 + arg2; }
 template <class T>
-inline T PSBinOpSubtract(T arg1, T arg2) { return arg1 - arg2; }
+inline T PSBinOpSubtract(const T &arg1, const T &arg2) { return arg1 - arg2; }
 
 template <>
-inline CPlaneBasis PSBinOpModulate(CPlaneBasis p1, CPlaneBasis p2)
+inline CPlaneBasis PSBinOpModulate(const CPlaneBasis &p1, const CPlaneBasis &p2)
 {
 	// we compute p1 * p2
 	NLMISC::CVector z = p1.X ^ p1.Y;
@@ -57,13 +57,13 @@ inline CPlaneBasis PSBinOpModulate(CPlaneBasis p1, CPlaneBasis p2)
 
 }
 template <>
-inline CPlaneBasis PSBinOpAdd(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)
+inline CPlaneBasis PSBinOpAdd(const CPlaneBasis &/* p1 */, const CPlaneBasis &/* p2 */)
 {
 	nlassert(0); // not allowed for now
 	return CPlaneBasis(NLMISC::CVector::Null);
 }
 template <>
-inline CPlaneBasis PSBinOpSubtract(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)
+inline CPlaneBasis PSBinOpSubtract(const CPlaneBasis &/* p1 */, const CPlaneBasis &/* p2 */)
 {
 	nlassert(0); // not allowed for now
 	return CPlaneBasis(NLMISC::CVector::Null);
@@ -71,21 +71,21 @@ inline CPlaneBasis PSBinOpSubtract(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)
 
 
 template <>
-inline uint32 PSBinOpSubtract(uint32 lhs, uint32 rhs)
+inline uint32 PSBinOpSubtract(const uint32 &lhs, const uint32 &rhs)
 {
 	return rhs > lhs ? 0 : lhs - rhs; // avoid overflow
 }
 
 
 template <>
-inline NLMISC::CRGBA PSBinOpModulate(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
+inline NLMISC::CRGBA PSBinOpModulate(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
 {
 	NLMISC::CRGBA result;
 	result.modulateFromColor(t1, t2);
 	return result;
 }
 template <>
-inline NLMISC::CRGBA PSBinOpAdd(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
+inline NLMISC::CRGBA PSBinOpAdd(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
 {
 	NLMISC::CRGBA r;
 	uint S = t1.R + t2.R; if (S > 255) S = 255; r.R = (uint8) S;
@@ -94,7 +94,7 @@ inline NLMISC::CRGBA PSBinOpAdd(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
 	return r;
 }
 template <>
-inline NLMISC::CRGBA PSBinOpSubtract(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
+inline NLMISC::CRGBA PSBinOpSubtract(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
 {
 	NLMISC::CRGBA r;
 	sint S = t1.R - t2.R; if (S < 0) S = 0; r.R = (uint8) S;
diff --git a/code/nel/include/nel/3d/ps_attrib_maker_helper.h b/code/nel/include/nel/3d/ps_attrib_maker_helper.h
index 10d29fe52..147d1ae5d 100644
--- a/code/nel/include/nel/3d/ps_attrib_maker_helper.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_helper.h
@@ -1432,7 +1432,7 @@ public:
 	  *
 	  */
 
-    virtual void setDefaultValue(T defaultValue) { _DefaultValue = defaultValue;}
+    virtual void setDefaultValue(const T &defaultValue) { _DefaultValue = defaultValue;}
 
 	/// get the default value :
 	virtual T getDefaultValue(void) const { return _DefaultValue; }
diff --git a/code/nel/include/nel/3d/ps_attrib_maker_template.h b/code/nel/include/nel/3d/ps_attrib_maker_template.h
index 92953b86f..72bc62df9 100644
--- a/code/nel/include/nel/3d/ps_attrib_maker_template.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_template.h
@@ -68,7 +68,7 @@ template <typename T> struct CPSValueBlendFuncBase
 {
 	virtual ~CPSValueBlendFuncBase() {}
 	virtual void getValues(T &startValue, T &endValue) const = 0;
-	virtual void setValues(T startValue, T endValue) = 0;
+	virtual void setValues(const T &startValue, const T &endValue) = 0;
 };
 
 
@@ -122,7 +122,7 @@ public:
 		}
 
 		/// Set the Values between which to blend.
-		virtual void setValues(T startValue, T endValue)
+		virtual void setValues(const T &startValue, const T &endValue)
 		{
 			_StartValue = startValue;
 			_EndValue = endValue;
@@ -210,7 +210,7 @@ public:
 
 	/// set the Values
 
-	virtual void setValues(T startValue, T endValue)
+	virtual void setValues(const T &startValue, const T &endValue)
 	{
 		float step = 1.f / n;
 		float alpha = 0.0f;
diff --git a/code/nel/include/nel/3d/ps_color.h b/code/nel/include/nel/3d/ps_color.h
index d05d9cf11..bd92bcbe6 100644
--- a/code/nel/include/nel/3d/ps_color.h
+++ b/code/nel/include/nel/3d/ps_color.h
@@ -62,7 +62,7 @@ public:
 		endValue = convertVBColor(endValue, _ColorType);
 
 	}
-	virtual void setValues(NLMISC::CRGBA startValue, NLMISC::CRGBA endValue)
+	virtual void setValues(const NLMISC::CRGBA &startValue, const NLMISC::CRGBA &endValue)
 	{
 		CPSValueBlendFunc<NLMISC::CRGBA>::setValues(convertVBColor(startValue, _ColorType), convertVBColor(endValue, _ColorType));
 	}
@@ -96,7 +96,7 @@ public:
 		endValue = convertVBColor(endValue, _ColorType);
 
 	}
-	virtual void setValues(NLMISC::CRGBA startValue, NLMISC::CRGBA endValue)
+	virtual void setValues(const NLMISC::CRGBA &startValue, const NLMISC::CRGBA &endValue)
 	{
 		CPSValueBlendSampleFunc<NLMISC::CRGBA, RGBA_BLENDER_NUM_VALUES>::setValues(convertVBColor(startValue, _ColorType), convertVBColor(endValue, _ColorType));
 	}
diff --git a/code/nel/include/nel/3d/ps_edit.h b/code/nel/include/nel/3d/ps_edit.h
index de7957f54..0c2da9e71 100644
--- a/code/nel/include/nel/3d/ps_edit.h
+++ b/code/nel/include/nel/3d/ps_edit.h
@@ -82,7 +82,7 @@ struct IPSMover
 	virtual NLMISC::CVector			getNormal(uint32 /* index */) { NL_PS_FUNC(getNormal); return NLMISC::CVector::Null ; }
 
 	/// if the object only stores a normal, this set the normal of the object. Otherwise it has no effect
-	virtual void					setNormal(uint32 /* index */, NLMISC::CVector /* n */) { NL_PS_FUNC(setNormal); }
+	virtual void					setNormal(uint32 /* index */, const NLMISC::CVector &/* n */) { NL_PS_FUNC(setNormal); }
 
 	// set a new orthogonal matrix for the object
 	virtual void					setMatrix(uint32 index, const NLMISC::CMatrix &m) = 0 ;
diff --git a/code/nel/include/nel/3d/ps_force.h b/code/nel/include/nel/3d/ps_force.h
index e93c21361..76f22f40b 100644
--- a/code/nel/include/nel/3d/ps_force.h
+++ b/code/nel/include/nel/3d/ps_force.h
@@ -741,7 +741,7 @@ public:
 	virtual NLMISC::CVector getScale(uint32 k) const { return NLMISC::CVector(_Radius[k], _Radius[k], _Radius[k]); }
 	virtual bool onlyStoreNormal(void) const { return true; }
 	virtual NLMISC::CVector getNormal(uint32 index) { return _Normal[index]; }
-	virtual void setNormal(uint32 index, NLMISC::CVector n) { _Normal[index] = n; }
+	virtual void setNormal(uint32 index, const NLMISC::CVector &n) { _Normal[index] = n; }
 
 	virtual void setMatrix(uint32 index, const NLMISC::CMatrix &m);
 	virtual NLMISC::CMatrix getMatrix(uint32 index) const;
diff --git a/code/nel/include/nel/3d/ps_located.h b/code/nel/include/nel/3d/ps_located.h
index 30fa7defa..2c4862b63 100644
--- a/code/nel/include/nel/3d/ps_located.h
+++ b/code/nel/include/nel/3d/ps_located.h
@@ -613,7 +613,7 @@ public:
 	struct CParametricInfo
 	{
 		CParametricInfo() {}
-		CParametricInfo(NLMISC::CVector pos, NLMISC::CVector speed, float date)
+		CParametricInfo(const NLMISC::CVector &pos, const NLMISC::CVector &speed, float date)
 			: Pos(pos), Speed(speed), Date(date)
 		{
 		}
diff --git a/code/nel/include/nel/3d/ps_zone.h b/code/nel/include/nel/3d/ps_zone.h
index 7289e64e0..cf29bc258 100644
--- a/code/nel/include/nel/3d/ps_zone.h
+++ b/code/nel/include/nel/3d/ps_zone.h
@@ -153,7 +153,7 @@ class CPSZonePlane : public CPSZone, public IPSMover
 		virtual NLMISC::CMatrix getMatrix(uint32 index) const;
 		virtual bool onlyStoreNormal(void) const { return true; }
 		virtual NLMISC::CVector getNormal(uint32 index);
-		virtual void setNormal(uint32 index, NLMISC::CVector n);
+		virtual void setNormal(uint32 index, const NLMISC::CVector &n);
 
 		virtual void serial(NLMISC::IStream &f) throw(NLMISC::EStream);
 
@@ -255,7 +255,7 @@ class CPSZoneDisc : public CPSZone, public IPSMover
 		virtual NLMISC::CVector getScale(uint32 k) const;
 		virtual bool onlyStoreNormal(void) const { return true; }
 		virtual NLMISC::CVector getNormal(uint32 index);
-		virtual void setNormal(uint32 index, NLMISC::CVector n);
+		virtual void setNormal(uint32 index, const NLMISC::CVector &n);
 
 		virtual void serial(NLMISC::IStream &f) throw(NLMISC::EStream);
 
diff --git a/code/nel/include/nel/3d/quad_grid.h b/code/nel/include/nel/3d/quad_grid.h
index 12160b540..e97543896 100644
--- a/code/nel/include/nel/3d/quad_grid.h
+++ b/code/nel/include/nel/3d/quad_grid.h
@@ -314,11 +314,11 @@ private:// Methods.
 	}
 
 	// return the coordinates on the grid of what include the bbox.
-	void		selectQuads(CVector bmin, CVector bmax, sint &x0, sint &x1, sint &y0, sint &y1)
+	void		selectQuads(const CVector &bminp, const CVector &bmaxp, sint &x0, sint &x1, sint &y0, sint &y1)
 	{
-		CVector		bminp, bmaxp;
-		bminp= bmin;
-		bmaxp= bmax;
+		CVector		bmin, bmax;
+		bmin= bminp;
+		bmax= bmaxp;
 		bmin.minof(bminp, bmaxp);
 		bmax.maxof(bminp, bmaxp);
 		bmin/= _EltSize;
diff --git a/code/nel/include/nel/3d/shadow_poly_receiver.h b/code/nel/include/nel/3d/shadow_poly_receiver.h
index 5c9476849..c781578ea 100644
--- a/code/nel/include/nel/3d/shadow_poly_receiver.h
+++ b/code/nel/include/nel/3d/shadow_poly_receiver.h
@@ -85,10 +85,27 @@ public:
 	// a vertex
 	struct CRGBAVertex
 	{
+#if USE_SSE2
+		float X, Y, Z;
+#else
 		CVector V;
+#endif
 		CRGBA Color;
 		CRGBAVertex() {}
-		CRGBAVertex(const CVector &v, CRGBA c) : V(v), Color(c) {}
+#if USE_SSE2
+		CRGBAVertex(const CVector &v, CRGBA c) : X(v.x), Y(v.y), Z(v.z), Color(c) {}
+		// Position viewed in place as a CVector: X, Y, Z are the leading members and
+		// the assert checks that the struct is exactly a CVector plus 4 more bytes.
+		const CVector &asVector() const
+		{
+			nlctassert(sizeof(CVector) + 4 == sizeof(CRGBAVertex));
+			return *reinterpret_cast<const CVector *>(this);
+		}
+#else
+		// Non-SSE2 layout keeps the CVector member, with the same constructor and accessor.
+		CRGBAVertex(const CVector &v, CRGBA c) : V(v), Color(c) {}
+		const CVector &asVector() const { return V; }
+#endif
 	};
 
 	/** Compute list of clipped tri under the shadow mat
diff --git a/code/nel/include/nel/3d/static_quad_grid.h b/code/nel/include/nel/3d/static_quad_grid.h
index 568ae3c0e..0bc171a74 100644
--- a/code/nel/include/nel/3d/static_quad_grid.h
+++ b/code/nel/include/nel/3d/static_quad_grid.h
@@ -102,8 +102,10 @@ private:// Atttributes.
 
 
 	// return the coordinates on the grid of what include the bbox.
-	void		selectPoint(CVector point, sint &x0, sint &y0)
+	void		selectPoint(const CVector &pointp, sint &x0, sint &y0)
 	{
+		CVector point = pointp;
+
 		point/= _EltSize;
 		x0= (sint)(floor(point.x));
 		y0= (sint)(floor(point.y));
diff --git a/code/nel/include/nel/3d/zone_lighter.h b/code/nel/include/nel/3d/zone_lighter.h
index 4f2910c52..52ef66199 100644
--- a/code/nel/include/nel/3d/zone_lighter.h
+++ b/code/nel/include/nel/3d/zone_lighter.h
@@ -421,7 +421,7 @@ private:
 	  * The vector of water shapes is released then
 	  * \param bbox the bbox of the zone containing the water shapes
 	  */
-	void makeQuadGridFromWaterShapes(NLMISC::CAABBox zoneBBox);
+	void makeQuadGridFromWaterShapes(const NLMISC::CAABBox &zoneBBox);
 
 
 	/** For each tile of the current zone, check whether it below or above water.
diff --git a/code/nel/include/nel/ligo/primitive.h b/code/nel/include/nel/ligo/primitive.h
index c050f14b6..b2f703015 100644
--- a/code/nel/include/nel/ligo/primitive.h
+++ b/code/nel/include/nel/ligo/primitive.h
@@ -523,7 +523,7 @@ public:
 
 	std::vector<CPrimVector>	VPoints;
 
-	static float getSegmentDist(const NLMISC::CVector v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos);
+	static float getSegmentDist(const NLMISC::CVector &v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos);
 
 public:
 
diff --git a/code/nel/include/nel/misc/types_nl.h b/code/nel/include/nel/misc/types_nl.h
index 5c3b80475..b5aa77e68 100644
--- a/code/nel/include/nel/misc/types_nl.h
+++ b/code/nel/include/nel/misc/types_nl.h
@@ -328,6 +328,20 @@ typedef	unsigned	int			uint;			// at least 32bits (depend of processor)
 
 #endif // NL_OS_UNIX
 
+#define NL_DEFAULT_MEMORY_ALIGNMENT 16 // alignment the operator new code in src/misc/common.cpp passes to its aligned allocator
+#ifdef NL_COMP_VC
+#define NL_ALIGN(nb) __declspec(align(nb)) // MSVC spelling of an alignment specifier
+#else
+#define NL_ALIGN(nb) __attribute__((aligned(nb))) // GCC-style attribute, used for every other compiler
+#endif
+
+#ifdef USE_SSE2 // declarations matching the operator new/delete definitions in src/misc/common.cpp
+extern void *operator new(size_t size) throw(std::bad_alloc);
+extern void *operator new[](size_t size) throw(std::bad_alloc);
+extern void operator delete(void *p) throw();
+extern void operator delete[](void *p) throw();
+#endif
+
 // CHashMap, CHashSet and CHashMultiMap definitions
 #if defined(_STLPORT_VERSION) // STLport detected
 #	include <hash_map>
diff --git a/code/nel/include/nel/pacs/chain_quad.h b/code/nel/include/nel/pacs/chain_quad.h
index 0a9df779e..fb332f1f9 100644
--- a/code/nel/include/nel/pacs/chain_quad.h
+++ b/code/nel/include/nel/pacs/chain_quad.h
@@ -81,7 +81,7 @@ public:
 	 * \param cst the array of CEdgeChainEntry to fill. contain also OChainLUT, an array for internal use. In: must be filled with 0xFFFF. Out: still filled with 0xFFFF.
 	 * \return number of edgechain found. stored in cst.EdgeChainEntries (array cleared first).
 	 */
-	sint			selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const;
+	sint			selectEdges(const CVector &start, const CVector &end, CCollisionSurfaceTemp &cst) const;
 
 
 	/// serial.
diff --git a/code/nel/include/nel/pacs/edge_quad.h b/code/nel/include/nel/pacs/edge_quad.h
index 71785cb13..95e57042c 100644
--- a/code/nel/include/nel/pacs/edge_quad.h
+++ b/code/nel/include/nel/pacs/edge_quad.h
@@ -92,7 +92,7 @@ public:
 	 * \param cst the array of CExteriorEdgeEntry to fill. contain also OChainLUT, an array for internal use. In: must be filled with 0xFFFF. Out: still filled with 0xFFFF.
 	 * \return number of exterioredge found. stored in cst.ExteriorEdgeEntries (array cleared first).
 	 */
-	sint			selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const;
+	sint			selectEdges(const CVector &start, const CVector &end, CCollisionSurfaceTemp &cst) const;
 
 
 	/// Get the whole set of edge entries
diff --git a/code/nel/include/nel/pacs/local_retriever.h b/code/nel/include/nel/pacs/local_retriever.h
index d76aa0891..d90ed6dc8 100644
--- a/code/nel/include/nel/pacs/local_retriever.h
+++ b/code/nel/include/nel/pacs/local_retriever.h
@@ -548,12 +548,12 @@ public:
 	/**
 	 * Check all surfaces integrity
 	 */
-	bool								checkSurfacesIntegrity(NLMISC::CVector translation = NLMISC::CVector::Null, bool verbose = false) const;
+	bool								checkSurfacesIntegrity(const NLMISC::CVector &translation = NLMISC::CVector::Null, bool verbose = false) const;
 
 	/**
 	 * Check surface integrity
 	 */
-	bool								checkSurfaceIntegrity(uint surf, NLMISC::CVector translation = NLMISC::CVector::Null, bool verbose = false) const;
+	bool								checkSurfaceIntegrity(uint surf, const NLMISC::CVector &translation = NLMISC::CVector::Null, bool verbose = false) const;
 
 	// @}
 
@@ -565,7 +565,7 @@ protected:
 	bool								insurePosition(ULocalPosition &local) const;
 
 	/// Retrieves a position inside the retriever (from the local position), returns true if the position is close to a border
-	void								retrievePosition(NLMISC::CVector estimated, CCollisionSurfaceTemp &cst) const;
+	void								retrievePosition(const NLMISC::CVector &estimated, CCollisionSurfaceTemp &cst) const;
 
 	/// Retrieves a position inside the retriever (from the local position), returns true if the position is close to a border
 	void								retrieveAccuratePosition(CVector2s estimated, CCollisionSurfaceTemp &cst, bool &onBorder) const;
diff --git a/code/nel/include/nel/pacs/quad_grid.h b/code/nel/include/nel/pacs/quad_grid.h
index aa2b383b0..61cf76c6f 100644
--- a/code/nel/include/nel/pacs/quad_grid.h
+++ b/code/nel/include/nel/pacs/quad_grid.h
@@ -187,11 +187,11 @@ private:// Atttributes.
 private:// Methods.
 
 	// return the coordinates on the grid of what include the bbox.
-	void		selectQuads(CVector bmin, CVector bmax, sint &x0, sint &x1, sint &y0, sint &y1)
+	void		selectQuads(const CVector &bminp, const CVector &bmaxp, sint &x0, sint &x1, sint &y0, sint &y1)
 	{
-		CVector		bminp, bmaxp;
-		bminp= bmin;
-		bmaxp= bmax;
+		CVector		bmin, bmax;
+		bmin= bminp;
+		bmax= bmaxp;
 		bmin.minof(bminp, bmaxp);
 		bmax.maxof(bminp, bmaxp);
 		bmin/= _EltSize;
diff --git a/code/nel/include/nel/sound/background_sound_manager.h b/code/nel/include/nel/sound/background_sound_manager.h
index 11f33d2be..326ece0f3 100644
--- a/code/nel/include/nel/sound/background_sound_manager.h
+++ b/code/nel/include/nel/sound/background_sound_manager.h
@@ -270,7 +270,7 @@ private:
 		/// flag if inside a sound zone
 		bool				Inside;
 		/// Constructor.
-		TSoundStatus(TSoundData &sd, NLMISC::CVector position, float gain, float distance, bool inside)
+		TSoundStatus(TSoundData &sd, const NLMISC::CVector &position, float gain, float distance, bool inside)
 			: SoundData(sd), Position(position), Gain(gain), Distance(distance), Inside(inside)
 		{}
 	};
diff --git a/code/nel/src/3d/computed_string.cpp b/code/nel/src/3d/computed_string.cpp
index a57191cc0..1c8962f5e 100644
--- a/code/nel/src/3d/computed_string.cpp
+++ b/code/nel/src/3d/computed_string.cpp
@@ -143,11 +143,13 @@ void CComputedString::render2D (IDriver& driver,
 /*------------------------------------------------------------------*\
 							render3D()
 \*------------------------------------------------------------------*/
-void CComputedString::render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot)
+void CComputedString::render3D (IDriver& driver,const CMatrix &matrixp,THotSpot hotspot)
 {
 	if (Vertices.getNumVertices() == 0)
 		return;
 
+	CMatrix matrix = matrixp;
+
 	// get window size
 	uint32	wndWidth, wndHeight;
 	driver.getWindowSize(wndWidth, wndHeight);
diff --git a/code/nel/src/3d/mesh_mrm_skin_template.cpp b/code/nel/src/3d/mesh_mrm_skin_template.cpp
index 1958cae90..808dce31a 100644
--- a/code/nel/src/3d/mesh_mrm_skin_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skin_template.cpp
@@ -494,7 +494,7 @@ void	CMeshMRMGeom::applySkinWithTangentSpace(CLod &lod, const CSkeletonModel *sk
 	On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm
 	saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms)
 */
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2)
 //#define	NL3D_RAWSKIN_PRECACHE
 #define	NL3D_RAWSKIN_ASM
 #endif
diff --git a/code/nel/src/3d/mesh_mrm_skinned_template.cpp b/code/nel/src/3d/mesh_mrm_skinned_template.cpp
index 5d1b2f582..be072713f 100644
--- a/code/nel/src/3d/mesh_mrm_skinned_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skinned_template.cpp
@@ -43,7 +43,7 @@
 	On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm
 	saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms)
 */
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2)
 //#define	NL3D_RAWSKIN_PRECACHE
 #define	NL3D_RAWSKIN_ASM
 #endif
diff --git a/code/nel/src/3d/ps_zone.cpp b/code/nel/src/3d/ps_zone.cpp
index 30349906f..813103896 100644
--- a/code/nel/src/3d/ps_zone.cpp
+++ b/code/nel/src/3d/ps_zone.cpp
@@ -267,7 +267,7 @@ CVector CPSZonePlane::getNormal(uint32 index)
 	NL_PS_FUNC(CPSZonePlane_getNormal)
 	return _Normal[index];
 }
-void CPSZonePlane::setNormal(uint32 index, CVector n)
+void CPSZonePlane::setNormal(uint32 index, const CVector &n)
 {
 	NL_PS_FUNC(CPSZonePlane_setNormal)
 	_Normal[index] = n;
@@ -576,7 +576,7 @@ CVector CPSZoneDisc::getNormal(uint32 index)
 	NL_PS_FUNC(CPSZoneDisc_getNormal)
 	return _Normal[index];
 }
-void CPSZoneDisc::setNormal(uint32 index, CVector n)
+void CPSZoneDisc::setNormal(uint32 index, const CVector &n)
 {
 	NL_PS_FUNC(CPSZoneDisc_setNormal)
 	_Normal[index] = n;
diff --git a/code/nel/src/3d/zone_lighter.cpp b/code/nel/src/3d/zone_lighter.cpp
index 1d7ec5a66..b78fa5635 100644
--- a/code/nel/src/3d/zone_lighter.cpp
+++ b/code/nel/src/3d/zone_lighter.cpp
@@ -3109,7 +3109,7 @@ void CZoneLighter::addWaterShape(CWaterShape *shape, const NLMISC::CMatrix &MT)
 }
 
 // ***********************************************************
-void CZoneLighter::makeQuadGridFromWaterShapes(NLMISC::CAABBox zoneBBox)
+void CZoneLighter::makeQuadGridFromWaterShapes(const NLMISC::CAABBox &zoneBBox)
 {
 	if (!_WaterShapes.size()) return;
 
diff --git a/code/nel/src/ligo/primitive.cpp b/code/nel/src/ligo/primitive.cpp
index 9cf7df13f..34b650f2a 100644
--- a/code/nel/src/ligo/primitive.cpp
+++ b/code/nel/src/ligo/primitive.cpp
@@ -875,7 +875,7 @@ bool CPrimZone::contains (const NLMISC::CVector &v, const std::vector<CVector> &
 
 // ***************************************************************************
 
-float CPrimZone::getSegmentDist(const NLMISC::CVector v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos)
+float CPrimZone::getSegmentDist(const NLMISC::CVector &v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos)
 {
 	// two points, compute distance to the segment.
 	CVector V = (p2-p1).normed();
diff --git a/code/nel/src/misc/common.cpp b/code/nel/src/misc/common.cpp
index 36e167260..b58792a65 100644
--- a/code/nel/src/misc/common.cpp
+++ b/code/nel/src/misc/common.cpp
@@ -71,6 +71,61 @@ extern "C" long _ftol2( double dblSource ) { return _ftol( dblSource ); }
 #endif // NL_OS_WINDOWS
 
 
+#ifdef USE_SSE2
+
+// Aligned heap helpers used by the global operator new/delete definitions below.
+#	ifdef NL_COMP_VC
+inline void *aligned_malloc(size_t size, size_t alignment)
+{
+	return _aligned_malloc(size, alignment);
+}
+
+inline void aligned_free(void *ptr)
+{
+	_aligned_free(ptr);
+}
+#	else
+inline void *aligned_malloc(size_t size, size_t alignment)
+{
+	void *ptr = NULL;
+	return posix_memalign(&ptr, alignment, size) == 0 ? ptr : NULL;
+}
+
+inline void aligned_free(void *ptr)
+{
+	free(ptr);
+}
+#	endif /* NL_COMP_VC */
+
+// Allocate with NL_DEFAULT_MEMORY_ALIGNMENT; a NULL result is reported as std::bad_alloc.
+void *operator new(size_t size) throw(std::bad_alloc)
+{
+	void *p = aligned_malloc(size, NL_DEFAULT_MEMORY_ALIGNMENT);
+	if (p == NULL) throw std::bad_alloc();
+	return p;
+}
+
+void *operator new[](size_t size) throw(std::bad_alloc)
+{
+	void *p = aligned_malloc(size, NL_DEFAULT_MEMORY_ALIGNMENT);
+	if (p == NULL) throw std::bad_alloc();
+	return p;
+}
+
+// Release through aligned_free so the MSVC path pairs _aligned_malloc with _aligned_free.
+void operator delete(void *p) throw()
+{
+	aligned_free(p);
+}
+
+void operator delete[](void *p) throw()
+{
+	aligned_free(p);
+}
+
+#endif /* USE_SSE2 */
+
+
 #ifdef DEBUG_NEW
 	#define new DEBUG_NEW
 #endif
diff --git a/code/nel/src/misc/matrix.cpp b/code/nel/src/misc/matrix.cpp
index dd884f4d5..e99e04304 100644
--- a/code/nel/src/misc/matrix.cpp
+++ b/code/nel/src/misc/matrix.cpp
@@ -16,6 +16,11 @@
 
 #include "stdmisc.h"
 
+#if (USE_SSE2)
+#	include <xmmintrin.h>
+#	include <emmintrin.h>
+#endif
+
 #include "nel/misc/matrix.h"
 #include "nel/misc/plane.h"
 #include "nel/misc/debug.h"
@@ -690,10 +695,86 @@ void		CMatrix::scale(const CVector &v)
 // ======================================================================================================
 // ======================================================================================================
 
+#if (USE_SSE2)
+// *this = m1 * m2, full 4x4: M[4j..4j+3] = sum over k of m1.M[4k..4k+3] * m2.M[4j+k].
+// m2 is re-read after each store to M, so *this must not alias m2 (m1 is loaded up front).
+void		CMatrix::setMulMatrixSSE2(const CMatrix &m1, const CMatrix &m2)
+{
+	m1.testExpandRot();
+	m1.testExpandProj();
+	m2.testExpandRot();
+	m2.testExpandProj();
+
+	// The four 4-float groups of m1 stay loaded; with the four temporaries that is 8 XMM values
+	__m128 in1a = _mm_loadu_ps(&m1.M[0]);
+	__m128 in1b = _mm_loadu_ps(&m1.M[4]);
+	__m128 in1c = _mm_loadu_ps(&m1.M[8]);
+	__m128 in1d = _mm_loadu_ps(&m1.M[12]);
+
+	__m128 in2 = _mm_loadu_ps(&m2.M[0]);
+	__m128 tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 outrow = _mm_mul_ps(in1a, tempsplat);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 tempmul = _mm_mul_ps(in1b, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+	tempmul = _mm_mul_ps(in1c, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));
+	tempmul = _mm_mul_ps(in1d, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	_mm_storeu_ps(&M[0], outrow);
+
+	in2 = _mm_loadu_ps(&m2.M[4]);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+	outrow = _mm_mul_ps(in1a, tempsplat);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+	tempmul = _mm_mul_ps(in1b, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+	tempmul = _mm_mul_ps(in1c, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));
+	tempmul = _mm_mul_ps(in1d, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	_mm_storeu_ps(&M[4], outrow);
+
+	in2 = _mm_loadu_ps(&m2.M[8]);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+	outrow = _mm_mul_ps(in1a, tempsplat);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+	tempmul = _mm_mul_ps(in1b, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+	tempmul = _mm_mul_ps(in1c, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));
+	tempmul = _mm_mul_ps(in1d, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	_mm_storeu_ps(&M[8], outrow);
+
+	in2 = _mm_loadu_ps(&m2.M[12]);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+	outrow = _mm_mul_ps(in1a, tempsplat);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+	tempmul = _mm_mul_ps(in1b, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+	tempmul = _mm_mul_ps(in1c, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3));
+	tempmul = _mm_mul_ps(in1d, tempsplat);
+	outrow = _mm_add_ps(outrow, tempmul);
+	_mm_storeu_ps(&M[12], outrow);
+}
+#endif // USE_SSE2
 
 // ***************************************************************************
 void		CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
 {
+#if USE_SSE2
+	setMulMatrixSSE2(m1, m2);
+#else
 	/*
 	For a fast MulMatrix, it appears to be better to not take State bits into account (no test/if() overhead)
 	Just do heavy mul all the time (common case, and not so slow)
@@ -720,6 +801,7 @@ void		CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
 	a14= m1.a11*m2.a14 + m1.a12*m2.a24 + m1.a13*m2.a34 + m1.a14;
 	a24= m1.a21*m2.a14 + m1.a22*m2.a24 + m1.a23*m2.a34 + m1.a24;
 	a34= m1.a31*m2.a14 + m1.a32*m2.a24 + m1.a33*m2.a34 + m1.a34;
+#endif
 
 	// Setup no proj at all, and force valid rot (still may be identity, but 0/1 are filled)
 	StateBit= (m1.StateBit | m2.StateBit | MAT_VALIDROT) & ~(MAT_PROJ|MAT_VALIDPROJ);
@@ -737,6 +819,13 @@ void		CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
 void		CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
 {
 	// Do *this= m1*m2
+#ifdef USE_SSE2
+	setMulMatrixSSE2(m1, m2);
+	StateBit = m1.StateBit | m2.StateBit;
+	StateBit |= MAT_VALIDALL;
+	if (m1.hasTrans() && m2.hasProj())
+		StateBit |= MAT_ROT | MAT_SCALEANY;
+#else
 	identity();
 	StateBit= m1.StateBit | m2.StateBit;
 	StateBit&= ~MAT_VALIDALL;
@@ -824,18 +913,22 @@ void		CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
 		a32+= m1.a34*m2.a42;
 		a33+= m1.a34*m2.a43;
 	}
+#endif
 
 	// Modify Scale.
 	if( (StateBit & MAT_SCALEUNI) && !(StateBit & MAT_SCALEANY) )
 	{
 		// Must have correct Scale33
+#ifndef USE_SSE2
 		m1.testExpandRot();
 		m2.testExpandRot();
+#endif
 		Scale33= m1.Scale33*m2.Scale33;
 	}
 	else
 		Scale33=1;
 
+#ifndef USE_SSE2
 	// In every case, I am valid now!
 	StateBit|=MAT_VALIDROT;
 
@@ -902,6 +995,7 @@ void		CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
 	{
 		// Don't copy proj part, and leave MAT_VALIDPROJ not set
 	}
+#endif
 }
 // ======================================================================================================
 void		CMatrix::invert()
@@ -1237,11 +1331,36 @@ bool		CMatrix::normalize(TRotOrder ro)
 // ======================================================================================================
 // ======================================================================================================
 
-
 // ======================================================================================================
 CVector		CMatrix::mulVector(const CVector &v) const
 {
-
+#ifdef USE_SSE2 // SIMD path: result = v.x*M[0..3] + v.y*M[4..7] + v.z*M[8..11], only lanes 0..2 are kept
+	if (hasRot())
+	{
+		float lanes[4]; // 4-float scratch: a CVector holds only x,y,z and must never receive a 16-byte store
+		register __m128 in1a = _mm_loadu_ps(&M[0]);
+		register __m128 in1b = _mm_loadu_ps(&M[4]);
+		register __m128 in1c = _mm_loadu_ps(&M[8]);
+		register __m128 in2 = _mm_set_ps(0.f, v.z, v.y, v.x); // lane0=x, lane1=y, lane2=z; never reads past the end of v
+		register __m128 tempsplat;
+		register __m128 tempmul;
+		register __m128 out;
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+		out = _mm_mul_ps(in1a, tempsplat);
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+		tempmul = _mm_mul_ps(in1b, tempsplat);
+		out = _mm_add_ps(out, tempmul);
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+		tempmul = _mm_mul_ps(in1c, tempsplat);
+		out = _mm_add_ps(out, tempmul);
+		_mm_storeu_ps(lanes, out); // lane 3 is a by-product of the 4-wide math and is dropped
+		return CVector(lanes[0], lanes[1], lanes[2]);
+	}
+	else
+	{
+		return v;
+	}
+#else
 	CVector	ret;
 
 	if( hasRot() )
@@ -1253,6 +1372,7 @@ CVector		CMatrix::mulVector(const CVector &v) const
 	}
 	else
 		return v;
+#endif
 }
 
 // ======================================================================================================
@@ -1263,9 +1383,31 @@ CVector		CMatrix::mulPoint(const CVector &v) const
 
 	if( hasRot() )
 	{
+#ifdef USE_SSE2 // SIMD path: v.x*M[0..3] + v.y*M[4..7] + v.z*M[8..11] + M[12..15], only lanes 0..2 are kept
+		register __m128 in1a = _mm_loadu_ps(&M[0]);
+		register __m128 in1b = _mm_loadu_ps(&M[4]);
+		register __m128 in1c = _mm_loadu_ps(&M[8]);
+		register __m128 in1d = _mm_loadu_ps(&M[12]);
+		register __m128 in2 = _mm_set_ps(0.f, v.z, v.y, v.x); // lane0=x, lane1=y, lane2=z; never reads past the end of v
+		register __m128 tempsplat, tempmul;
+		register __m128 out;
+		float lanes[4]; // 4-float scratch: a CVector holds only x,y,z and must never receive a 16-byte store
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
+		out = _mm_mul_ps(in1a, tempsplat);
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
+		tempmul = _mm_mul_ps(in1b, tempsplat);
+		out = _mm_add_ps(out, tempmul);
+		tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
+		tempmul = _mm_mul_ps(in1c, tempsplat);
+		out = _mm_add_ps(out, tempmul);
+		out = _mm_add_ps(out, in1d); // NOTE(review): M[12..14] is added without a hasTrans() check - confirm it is zero when there is no translation
+		_mm_storeu_ps(lanes, out); // lane 3 is a by-product of the 4-wide math and is dropped
+		return CVector(lanes[0], lanes[1], lanes[2]);
+#else
 		ret.x= a11*v.x + a12*v.y + a13*v.z;
 		ret.y= a21*v.x + a22*v.y + a23*v.z;
 		ret.z= a31*v.x + a32*v.y + a33*v.z;
+#endif
 	}
 	else
 	{
diff --git a/code/nel/src/misc/polygon.cpp b/code/nel/src/misc/polygon.cpp
index b541d2eba..2cd60058d 100644
--- a/code/nel/src/misc/polygon.cpp
+++ b/code/nel/src/misc/polygon.cpp
@@ -249,7 +249,7 @@ public:
 		Back = NULL;
 		Front = NULL;
 	}
-	CBSPNode2v ( const CPlane &plane, CVector p0, CVector p1, uint v0, uint v1 ) : Plane (plane), P0 (p0), P1 (p1)
+	CBSPNode2v ( const CPlane &plane, const CVector &p0, const CVector &p1, uint v0, uint v1 ) : Plane (plane), P0 (p0), P1 (p1)
 	{
 		Back = NULL;
 		Front = NULL;
diff --git a/code/nel/src/pacs/chain_quad.cpp b/code/nel/src/pacs/chain_quad.cpp
index 321761953..c7af9785b 100644
--- a/code/nel/src/pacs/chain_quad.cpp
+++ b/code/nel/src/pacs/chain_quad.cpp
@@ -344,8 +344,11 @@ sint			CChainQuad::selectEdges(const NLMISC::CAABBox &bbox, CCollisionSurfaceTem
 	return nRes;
 }
 
-sint		CChainQuad::selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const
+sint		CChainQuad::selectEdges(const CVector &startp, const CVector &endp, CCollisionSurfaceTemp &cst) const
 {
+	CVector start = startp;
+	CVector end = endp;
+
 	sint	nRes=0;
 	sint	i;
 	uint16	*ochainLUT= cst.OChainLUT;
diff --git a/code/nel/src/pacs/edge_quad.cpp b/code/nel/src/pacs/edge_quad.cpp
index 1515af075..14082a3b2 100644
--- a/code/nel/src/pacs/edge_quad.cpp
+++ b/code/nel/src/pacs/edge_quad.cpp
@@ -453,8 +453,11 @@ sint			CEdgeQuad::selectEdges(const NLMISC::CAABBox &bbox, CCollisionSurfaceTemp
 	return nRes;
 }
 
-sint		CEdgeQuad::selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const
+sint		CEdgeQuad::selectEdges(const CVector &startp, const CVector &endp, CCollisionSurfaceTemp &cst) const
 {
+	CVector start = startp;
+	CVector end = endp;
+
 	sint	nRes=0;
 	sint	i;
 	uint16	*indexLUT= cst.OChainLUT;
diff --git a/code/nel/src/pacs/local_retriever.cpp b/code/nel/src/pacs/local_retriever.cpp
index 7158cee0a..1b18a6052 100644
--- a/code/nel/src/pacs/local_retriever.cpp
+++ b/code/nel/src/pacs/local_retriever.cpp
@@ -1052,7 +1052,7 @@ bool	NLPACS::CLocalRetriever::testPosition(NLPACS::ULocalPosition &local, CColli
 }
 
 
-void	NLPACS::CLocalRetriever::retrievePosition(CVector estimated, CCollisionSurfaceTemp &cst) const
+void	NLPACS::CLocalRetriever::retrievePosition(const CVector &estimated, CCollisionSurfaceTemp &cst) const
 {
 	if (!_Loaded)
 		return;
@@ -2200,7 +2200,7 @@ void	NLPACS::CLocalRetriever::replaceChain(uint32 chainId, const std::vector<NLP
 /*
  * Check surface integrity
  */
-bool	NLPACS::CLocalRetriever::checkSurfacesIntegrity(NLMISC::CVector translation, bool verbose) const
+bool	NLPACS::CLocalRetriever::checkSurfacesIntegrity(const NLMISC::CVector &translation, bool verbose) const
 {
 	bool	success = true;
 	uint	surf;
@@ -2225,7 +2225,7 @@ bool	NLPACS::CLocalRetriever::checkSurfacesIntegrity(NLMISC::CVector translation
 /**
  * Check surface integrity
  */
-bool	NLPACS::CLocalRetriever::checkSurfaceIntegrity(uint surf, NLMISC::CVector translation, bool verbose) const
+bool	NLPACS::CLocalRetriever::checkSurfaceIntegrity(uint surf, const NLMISC::CVector &translation, bool verbose) const
 {
 	if (surf >= _Surfaces.size())
 		return false;
diff --git a/code/ryzom/client/src/decal.cpp b/code/ryzom/client/src/decal.cpp
index 1454d9f59..bfcf4dc4b 100644
--- a/code/ryzom/client/src/decal.cpp
+++ b/code/ryzom/client/src/decal.cpp
@@ -433,10 +433,16 @@ void CDecal::renderTriCache(NL3D::IDriver &drv,   NL3D::CShadowPolyReceiver &/*
 			float bottomBlendBias = bottomBlendScale * (_RefPosition.z - _BottomBlendZMin);
 			do
 			{
+#if USE_SSE2
+				dest->X = srcVert->X;
+				dest->Y = srcVert->Y;
+				dest->Z = srcVert->Z;
+#else
 				dest->V = srcVert->V;
-				float dist = (camPos - srcVert->V).norm();
+#endif
+				float dist = (camPos - srcVert->asVector()).norm();
 				float intensity = scale * dist + bias;
-				float bottomBlend = srcVert->V.z * bottomBlendScale + bottomBlendBias;
+				float bottomBlend = srcVert->asVector().z * bottomBlendScale + bottomBlendBias;
 				clamp(bottomBlend, 0.f, 1.f);
 				clamp(intensity, 0.f, 255.f);
 				intensity *= bottomBlend;

From e8852d630ed353eb822d2b140f7eed096f01bd07 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Thu, 12 Jun 2014 22:02:15 +0200
Subject: [PATCH 02/21] SSE2: Add FIXME_SSE2 notes

--HG--
branch : sse2
---
 code/nel/include/nel/3d/shadow_poly_receiver.h | 2 +-
 code/nel/include/nel/misc/matrix.h             | 5 +++++
 code/nel/include/nel/misc/vector.h             | 5 +++++
 code/nel/src/3d/mesh_mrm_skin_template.cpp     | 2 +-
 code/nel/src/3d/mesh_mrm_skinned_template.cpp  | 2 +-
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/code/nel/include/nel/3d/shadow_poly_receiver.h b/code/nel/include/nel/3d/shadow_poly_receiver.h
index c781578ea..0d97a00ad 100644
--- a/code/nel/include/nel/3d/shadow_poly_receiver.h
+++ b/code/nel/include/nel/3d/shadow_poly_receiver.h
@@ -83,7 +83,7 @@ public:
 									  );
 
 	// a vertex
-	struct CRGBAVertex
+	struct CRGBAVertex // FIXME_SSE2
 	{
 #if USE_SSE2
 		float X, Y, Z;
diff --git a/code/nel/include/nel/misc/matrix.h b/code/nel/include/nel/misc/matrix.h
index 700eb4a14..7c7d7d666 100644
--- a/code/nel/include/nel/misc/matrix.h
+++ b/code/nel/include/nel/misc/matrix.h
@@ -53,6 +53,7 @@ class	CPlane;
  * \author Nevrax France
  * \date 2000
  */
+NL_ALIGN(16)
 class CMatrix
 {
 public:
@@ -363,6 +364,10 @@ private:
 	float	Scale33;
 	uint32	StateBit;	// BitVector. 0<=>identity.
 
+#if USE_SSE2
+	void setMulMatrixSSE2(const CMatrix &m1, const CMatrix &m2);
+#endif
+
 	// Methods For inversion.
 	bool	fastInvert33(CMatrix &ret) const;
 	bool	slowInvert33(CMatrix &ret) const;
diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index bbf7001b7..46df7edce 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -36,11 +36,16 @@ class IStream;
  * \author Nevrax France
  * \date 2000
  */
+// NL_ALIGN(16) // FIXME_SSE2
 class CVector
 {
 public:		// Attributes.
 	float	x,y,z;
 
+/*#ifdef USE_SSE2 // FIXME_SSE2
+	float	w; // Padding
+#endif*/
+
 public:		// const.
 	/// Null vector (0,0,0).
 	static const	CVector		Null;
diff --git a/code/nel/src/3d/mesh_mrm_skin_template.cpp b/code/nel/src/3d/mesh_mrm_skin_template.cpp
index 808dce31a..6e6c160ae 100644
--- a/code/nel/src/3d/mesh_mrm_skin_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skin_template.cpp
@@ -494,7 +494,7 @@ void	CMeshMRMGeom::applySkinWithTangentSpace(CLod &lod, const CSkeletonModel *sk
 	On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm
 	saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms)
 */
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2)
+#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2) // FIXME_SSE2
 //#define	NL3D_RAWSKIN_PRECACHE
 #define	NL3D_RAWSKIN_ASM
 #endif
diff --git a/code/nel/src/3d/mesh_mrm_skinned_template.cpp b/code/nel/src/3d/mesh_mrm_skinned_template.cpp
index be072713f..e60a5632b 100644
--- a/code/nel/src/3d/mesh_mrm_skinned_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skinned_template.cpp
@@ -43,7 +43,7 @@
 	On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm
 	saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms)
 */
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2)
+#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2) // FIXME_SSE2
 //#define	NL3D_RAWSKIN_PRECACHE
 #define	NL3D_RAWSKIN_ASM
 #endif

From 4c86f536ae571c07e2096640cdf5857a42833254 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Thu, 12 Jun 2014 22:54:13 +0200
Subject: [PATCH 03/21] Disable outdated assembly

--HG--
branch : sse2
---
 code/nel/src/3d/mesh_mrm_skin_template.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/nel/src/3d/mesh_mrm_skin_template.cpp b/code/nel/src/3d/mesh_mrm_skin_template.cpp
index 6e6c160ae..6b85326ee 100644
--- a/code/nel/src/3d/mesh_mrm_skin_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skin_template.cpp
@@ -39,7 +39,7 @@ static void	applyArraySkinNormalT(uint numMatrixes, uint32 *infPtr, CMesh::CSkin
 {
 	/* Prefetch all vertex/normal before, it is to be faster.
 	*/
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#if 0// defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
 	{
 		uint	nInfTmp= nInf;
 		uint32	*infTmpPtr= infPtr;
@@ -176,7 +176,7 @@ static void	applyArraySkinTangentSpaceT(uint numMatrixes, uint32 *infPtr, CMesh:
 {
 	/* Prefetch all vertex/normal/tgSpace before, it is faster.
 	*/
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#if 0 // defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
 	{
 		uint	nInfTmp= nInf;
 		uint32	*infTmpPtr= infPtr;

From dbb966c8a5e81ce8f6367cf436e50c0453ad6da3 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Thu, 12 Jun 2014 22:54:36 +0200
Subject: [PATCH 04/21] SSE2: Some reference

--HG--
branch : sse2
---
 code/nel/include/nel/3d/matrix_3x4.h | 201 ++++++++++++++++++++++++++-
 1 file changed, 200 insertions(+), 1 deletion(-)

diff --git a/code/nel/include/nel/3d/matrix_3x4.h b/code/nel/include/nel/3d/matrix_3x4.h
index d7ed660fc..94aee3a25 100644
--- a/code/nel/include/nel/3d/matrix_3x4.h
+++ b/code/nel/include/nel/3d/matrix_3x4.h
@@ -116,7 +116,7 @@ public:
 
 
 // ***************************************************************************
-#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
+#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) && !defined(USE_SSE2)
 
 
 /** For fast vector/point multiplication. Special usage for Skinning.
@@ -376,6 +376,205 @@ public:
 
 };
 
+#elif 0
+
+NL_ALIGN(16)
+struct CVectorSSEAligned // struct, so that f is public: CMatrix3x4SSE reads the lanes directly
+{
+	float f[4]; // 4-float scratch, target of the _mm_store_ps calls in CMatrix3x4SSE
+};
+
+/** For fast vector/point multiplication. Special usage for Skinning.
+ */
+NL_ALIGN(16)
+class	CMatrix3x4SSE
+{
+public:
+	__m128 c1, c2, c3, c4; // the four consecutive 4-float groups of the source matrix, in memory order
+
+	// Copy from a matrix: c1..c4 receive floats [0..3], [4..7], [8..11] and [12..15] of mat.get().
+	inline void set(const CMatrix &mat)
+	{
+		const float	*m = mat.get();
+		register __m128 xmm0 = _mm_loadu_ps(&m[0]);
+		register __m128 xmm1 = _mm_loadu_ps(&m[4]);
+		register __m128 xmm2 = _mm_loadu_ps(&m[8]);
+		register __m128 xmm3 = _mm_loadu_ps(&m[12]);
+		c1 = xmm0;
+		c2 = xmm1;
+		c3 = xmm2;
+		c4 = xmm3;
+	}
+
+	// mulSetVector: vout = c1*vin.x + c2*vin.y + c3*vin.z (lanes 0..2). vin is fully read before vout is written.
+	inline void mulSetVector(const CVector &vin, CVector &vout)
+	{
+		CVectorSSEAligned outf; // FIXME_SSE2: scratch needed while CVector is neither padded nor aligned
+
+		register __m128 xmm0 = _mm_set_ps(0.f, vin.z, vin.y, vin.x); // input is vin (not vout); no read past the CVector
+
+		register __m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		register __m128 xmm2 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+		xmm0 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+		xmm0 = _mm_mul_ps(xmm0, c1);
+		xmm1 = _mm_mul_ps(xmm1, c2);
+		xmm2 = _mm_mul_ps(xmm2, c3);
+
+		xmm0 = _mm_add_ps(xmm0, xmm1);
+		xmm0 = _mm_add_ps(xmm0, xmm2);
+
+		_mm_store_ps(&outf.f[0], xmm0);
+		vout.x = outf.f[0]; // FIXME_SSE2: lane 3 is dropped
+		vout.y = outf.f[1];
+		vout.z = outf.f[2];
+	}
+
+	// mulSetPoint: vout = c1*vin.x + c2*vin.y + c3*vin.z + c4 (lanes 0..2). vin is fully read before vout is written.
+	inline void	mulSetPoint(const CVector &vin, CVector &vout)
+	{
+		CVectorSSEAligned outf; // FIXME_SSE2: scratch needed while CVector is neither padded nor aligned
+
+		register __m128 xmm0 = _mm_set_ps(0.f, vin.z, vin.y, vin.x); // input is vin (not vout); no read past the CVector
+
+		register __m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		register __m128 xmm2 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+		xmm0 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+		xmm0 = _mm_mul_ps(xmm0, c1);
+		xmm1 = _mm_mul_ps(xmm1, c2);
+		xmm2 = _mm_mul_ps(xmm2, c3);
+
+		xmm0 = _mm_add_ps(xmm0, xmm1);
+		xmm0 = _mm_add_ps(xmm0, xmm2);
+
+		xmm0 = _mm_add_ps(xmm0, c4);
+
+		_mm_store_ps(&outf.f[0], xmm0);
+		vout.x = outf.f[0]; // FIXME_SSE2: lane 3 is dropped
+		vout.y = outf.f[1];
+		vout.z = outf.f[2];
+	}
+
+
+	// mulSetVector (scaled): vout = (c1*vin.x + c2*vin.y + c3*vin.z) * scale (lanes 0..2).
+	inline void	mulSetVector(const CVector &vin, float scale, CVector &vout)
+	{
+		CVectorSSEAligned outf; // FIXME_SSE2: scratch needed while CVector is neither padded nor aligned
+
+		register __m128 xmm0 = _mm_set_ps(0.f, vin.z, vin.y, vin.x); // input is vin (not vout); no read past the CVector
+
+		register __m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		register __m128 xmm2 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+		xmm0 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+		register __m128 xmm3 = _mm_set1_ps(scale);
+
+		xmm0 = _mm_mul_ps(xmm0, c1);
+		xmm1 = _mm_mul_ps(xmm1, c2);
+		xmm2 = _mm_mul_ps(xmm2, c3);
+
+		xmm0 = _mm_add_ps(xmm0, xmm1);
+		xmm0 = _mm_add_ps(xmm0, xmm2);
+
+		xmm0 = _mm_mul_ps(xmm0, xmm3);
+
+		_mm_store_ps(&outf.f[0], xmm0);
+		vout.x = outf.f[0]; // FIXME_SSE2: lane 3 is dropped
+		vout.y = outf.f[1];
+		vout.z = outf.f[2];
+	}
+	// mulSetPoint (scaled): vout = (c1*vin.x + c2*vin.y + c3*vin.z + c4) * scale (lanes 0..2).
+	inline void	mulSetPoint(const CVector &vin, float scale, CVector &vout)
+	{
+		CVectorSSEAligned outf; // FIXME_SSE2: scratch needed while CVector is neither padded nor aligned
+
+		register __m128 xmm0 = _mm_set_ps(0.f, vin.z, vin.y, vin.x); // input is vin (not vout); no read past the CVector
+
+		register __m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		register __m128 xmm2 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+		xmm0 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+		register __m128 xmm3 = _mm_set1_ps(scale);
+
+		xmm0 = _mm_mul_ps(xmm0, c1);
+		xmm1 = _mm_mul_ps(xmm1, c2);
+		xmm2 = _mm_mul_ps(xmm2, c3);
+
+		xmm0 = _mm_add_ps(xmm0, xmm1);
+		xmm0 = _mm_add_ps(xmm0, xmm2);
+
+		xmm0 = _mm_add_ps(xmm0, c4);
+
+		xmm0 = _mm_mul_ps(xmm0, xmm3);
+
+		_mm_store_ps(&outf.f[0], xmm0);
+		vout.x = outf.f[0]; // FIXME_SSE2: lane 3 is dropped
+		vout.y = outf.f[1];
+		vout.z = outf.f[2];
+	}
+
+
+	// mulAddVector: vout += (c1*vin.x + c2*vin.y + c3*vin.z) * scale (lanes 0..2).
+	inline void	mulAddVector(const CVector &vin, float scale, CVector &vout)
+	{
+		CVectorSSEAligned outf; // FIXME_SSE2: scratch needed while CVector is neither padded nor aligned
+
+		register __m128 xmm0 = _mm_set_ps(0.f, vin.z, vin.y, vin.x); // input is vin (not vout); no read past the CVector
+
+		register __m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		register __m128 xmm2 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+		xmm0 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+		register __m128 xmm3 = _mm_set1_ps(scale);
+
+		xmm0 = _mm_mul_ps(xmm0, c1);
+		xmm1 = _mm_mul_ps(xmm1, c2);
+		xmm2 = _mm_mul_ps(xmm2, c3);
+
+		xmm0 = _mm_add_ps(xmm0, xmm1);
+		xmm0 = _mm_add_ps(xmm0, xmm2);
+
+		xmm0 = _mm_mul_ps(xmm0, xmm3);
+
+		_mm_store_ps(&outf.f[0], xmm0);
+		vout.x += outf.f[0]; // FIXME_SSE2: lane 3 is dropped
+		vout.y += outf.f[1];
+		vout.z += outf.f[2];
+	}
+
+	// mulAddPoint: vout += (c1*vin.x + c2*vin.y + c3*vin.z + c4) * scale (lanes 0..2).
+	inline void	mulAddPoint(const CVector &vin, float scale, CVector &vout)
+	{
+		CVectorSSEAligned outf; // FIXME_SSE2: scratch needed while CVector is neither padded nor aligned
+
+		register __m128 xmm0 = _mm_set_ps(0.f, vin.z, vin.y, vin.x); // input is vin (not vout); no read past the CVector
+
+		register __m128 xmm1 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(1, 1, 1, 1));
+		register __m128 xmm2 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(2, 2, 2, 2));
+		xmm0 = _mm_shuffle_ps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0));
+
+		register __m128 xmm3 = _mm_set1_ps(scale);
+
+		xmm0 = _mm_mul_ps(xmm0, c1);
+		xmm1 = _mm_mul_ps(xmm1, c2);
+		xmm2 = _mm_mul_ps(xmm2, c3);
+
+		xmm0 = _mm_add_ps(xmm0, xmm1);
+		xmm0 = _mm_add_ps(xmm0, xmm2);
+
+		xmm0 = _mm_add_ps(xmm0, c4);
+
+		xmm0 = _mm_mul_ps(xmm0, xmm3);
+
+		_mm_store_ps(&outf.f[0], xmm0);
+		vout.x += outf.f[0]; // FIXME_SSE2: lane 3 is dropped
+		vout.y += outf.f[1];
+		vout.z += outf.f[2];
+	}
+
+};
+
 #else // NL_OS_WINDOWS
 /// dummy CMatrix3x4SSE for non windows platform
 class CMatrix3x4SSE : public  CMatrix3x4 { };

From 95fb48fbfc6bccf315955397144747ab64e67500 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Thu, 12 Jun 2014 23:40:12 +0200
Subject: [PATCH 05/21] SSE2: Prepare for CVector alignment

--HG--
branch : sse2
---
 code/nel/include/nel/3d/mesh_mrm_skinned.h    |  17 ++
 code/nel/include/nel/3d/packed_zone.h         |   2 +
 code/nel/include/nel/3d/raw_skin.h            |  12 +-
 code/nel/include/nel/3d/vertex_buffer.h       |   8 +-
 code/nel/include/nel/misc/vector.h            |  37 ++++
 code/nel/src/3d/cloud.cpp                     |  56 ++---
 code/nel/src/3d/mesh_morpher.cpp              |  18 +-
 code/nel/src/3d/mesh_mrm_skin_template.cpp    | 205 +++++++++++-------
 code/nel/src/3d/mesh_mrm_skinned_template.cpp |  23 +-
 code/nel/src/3d/noise_3d.cpp                  |  52 ++---
 code/nel/src/3d/packed_zone.cpp               |  41 +++-
 code/nel/src/3d/vegetable_shape.cpp           |   8 +-
 code/nel/src/3d/vertex_buffer.cpp             |  16 +-
 .../client/src/landscape_poly_drawer.cpp      |   2 +-
 14 files changed, 320 insertions(+), 177 deletions(-)

diff --git a/code/nel/include/nel/3d/mesh_mrm_skinned.h b/code/nel/include/nel/3d/mesh_mrm_skinned.h
index 9e43f8fb9..e3cec562b 100644
--- a/code/nel/include/nel/3d/mesh_mrm_skinned.h
+++ b/code/nel/include/nel/3d/mesh_mrm_skinned.h
@@ -43,6 +43,7 @@ namespace NL3D
 
 
 using	NLMISC::CVector;
+using	NLMISC::CVectorPacked;
 using	NLMISC::CPlane;
 using	NLMISC::CMatrix;
 class	CMRMBuilder;
@@ -405,12 +406,24 @@ public:
 			uint8	Weights[NL3D_MESH_MRM_SKINNED_MAX_MATRIX];
 
 			// Decompact it
+			inline void getPos (CVectorPacked &dest, float factor) const
+			{
+				dest.set(static_cast<float>(X) * factor,	// x: stored X scaled by factor
+						 static_cast<float>(Y) * factor,	// y: stored Y scaled by factor
+						 static_cast<float>(Z) * factor);	// z: stored Z scaled by factor
+			}
 			inline void getPos (CVector &dest, float factor) const
 			{
 				dest.x = (float)X * factor;
 				dest.y = (float)Y * factor;
 				dest.z = (float)Z * factor;
 			}
+			inline void getNormal (CVectorPacked &dest) const
+			{
+				dest.set(static_cast<float>(Nx) * (1.f/NL3D_MESH_MRM_SKINNED_NORMAL_FACTOR),	// x from stored Nx
+						 static_cast<float>(Ny) * (1.f/NL3D_MESH_MRM_SKINNED_NORMAL_FACTOR),	// y from stored Ny
+						 static_cast<float>(Nz) * (1.f/NL3D_MESH_MRM_SKINNED_NORMAL_FACTOR));	// z from stored Nz
+			}
 			inline void getNormal (CVector &dest) const
 			{
 				dest.x = (float)Nx * (1.f/NL3D_MESH_MRM_SKINNED_NORMAL_FACTOR);
@@ -480,6 +493,10 @@ public:
 		}
 
 		// Decompact position
+		inline void getPos (CVectorPacked &dest, const CPackedVertex &src) const
+		{
+			src.getPos (dest, _DecompactScale); // CVectorPacked overload: forwards to the vertex with _DecompactScale as the factor
+		}
 		inline void getPos (CVector &dest, const CPackedVertex &src) const
 		{
 			src.getPos (dest, _DecompactScale);
diff --git a/code/nel/include/nel/3d/packed_zone.h b/code/nel/include/nel/3d/packed_zone.h
index 2beb5f5ed..191eea0df 100644
--- a/code/nel/include/nel/3d/packed_zone.h
+++ b/code/nel/include/nel/3d/packed_zone.h
@@ -164,6 +164,7 @@ private:
 	void    addInstance(const CShapeInfo &si, const NLMISC::CMatrix &matrix, TVertexGrid &vertexGrid, TTriListGrid &triListGrid);
 public:
 	// PRIVATE : unpack a packed tri
+	void	unpackTri(const CPackedTri &src, NLMISC::CVectorPacked dest[3]) const;
 	void	unpackTri(const CPackedTri &src, NLMISC::CVector dest[3]) const;
 };
 
@@ -197,6 +198,7 @@ private:
 	NLMISC::CVector			_PackedLocalToWorld;
 public:
 	// PRIVATE : unpack a packed tri
+	void	unpackTri(const CPackedTri16 &src, NLMISC::CVectorPacked dest[3]) const;
 	void	unpackTri(const CPackedTri16 &src, NLMISC::CVector dest[3]) const;
 };
 
diff --git a/code/nel/include/nel/3d/raw_skin.h b/code/nel/include/nel/3d/raw_skin.h
index dbf263326..59c3e2c16 100644
--- a/code/nel/include/nel/3d/raw_skin.h
+++ b/code/nel/include/nel/3d/raw_skin.h
@@ -30,15 +30,21 @@ namespace NL3D
 
 
 using	NLMISC::CVector;
+using	NLMISC::CVectorPacked;
 using	NLMISC::CUV;
 
 /// A simple Vertex Pos/Normal/Uv
 class	CRawSkinVertex
 {
 public:
-	CVector		Pos;
-	CVector		Normal;
-	CUV			UV;
+#if USE_SSE2
+	CVectorPacked	Pos;
+	CVectorPacked	Normal;
+#else
+	CVector			Pos;
+	CVector			Normal;
+#endif
+	CUV				UV;
 };
 
 /// Vertices influenced by 1 matrix only.
diff --git a/code/nel/include/nel/3d/vertex_buffer.h b/code/nel/include/nel/3d/vertex_buffer.h
index 6c269ec6c..fbce363d9 100644
--- a/code/nel/include/nel/3d/vertex_buffer.h
+++ b/code/nel/include/nel/3d/vertex_buffer.h
@@ -790,8 +790,8 @@ public:
 	 *	A call to IDriver::activeVertexBuffer() will change this format to the format returned by IDriver::getVertexColorFormat().
 	 *	So, before each write of vertex color in the vertex buffer, the vertex color format must be checked with CVertexBuffer::getVertexColorFormat().
 	 */
-	NLMISC::CVector*		getVertexCoordPointer(uint idx=0);
-	NLMISC::CVector*		getNormalCoordPointer(uint idx=0);
+	NLMISC::CVectorPacked*		getVertexCoordPointer(uint idx=0);
+	NLMISC::CVectorPacked*		getNormalCoordPointer(uint idx=0);
 	NLMISC::CUV*			getTexCoordPointer(uint idx=0, uint8 stage=0);
 	void*					getColorPointer(uint idx=0);
 	void*					getSpecularPointer(uint idx=0);
@@ -854,8 +854,8 @@ public:
 	 *	A call to IDriver::activeVertexBuffer() will change this format to the format returned by IDriver::getVertexColorFormat().
 	 *	So, before each write of vertex color in the vertex buffer, the vertex color format must be checked with CVertexBuffer::getVertexColorFormat().
 	 */
-	const NLMISC::CVector*	getVertexCoordPointer(uint idx=0) const;
-	const NLMISC::CVector*	getNormalCoordPointer(uint idx=0) const;
+	const NLMISC::CVectorPacked*	getVertexCoordPointer(uint idx=0) const;
+	const NLMISC::CVectorPacked*	getNormalCoordPointer(uint idx=0) const;
 	const NLMISC::CUV*		getTexCoordPointer(uint idx=0, uint8 stage=0) const;
 	const void*				getColorPointer(uint idx=0) const;
 	const void*				getSpecularPointer(uint idx=0) const;
diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index 46df7edce..ff3db1312 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -141,6 +141,43 @@ public:		// Methods.
 	friend	CVector	operator*(float f, const CVector &v0);
 };
 
+class CVectorPacked
+{
+public: // Attributes: exactly three floats, nothing else.
+	float	x,y,z;
+
+public:
+	/// @name Object.
+	//@{
+	/// Default constructor: x, y and z are left uninitialised.
+	CVectorPacked() { }
+	/// Build from three explicit components.
+	CVectorPacked(float px, float py, float pz) { x = px; y = py; z = pz; }
+	/// Converting constructor: copies the three components of a CVector.
+	CVectorPacked(const CVector &src) { x = src.x; y = src.y; z = src.z; }
+	//@}
+
+	void set(float px, float py, float pz)
+	{
+		x = px;
+		y = py;
+		z = pz;
+	}
+
+	CVectorPacked &operator += (const CVector &rhs)
+	{
+		x = x + rhs.x;
+		y = y + rhs.y;
+		z = z + rhs.z;
+		return *this;
+	}
+
+	operator CVector () const
+	{
+		return CVector(this->x, this->y, this->z);
+	}
+};
+
 // blend (faster version than the generic version found in algo.h)
 inline CVector blend(const CVector &v0, const CVector &v1, float lambda)
 {
diff --git a/code/nel/src/3d/cloud.cpp b/code/nel/src/3d/cloud.cpp
index 2606f9ad5..280ba2f04 100644
--- a/code/nel/src/3d/cloud.cpp
+++ b/code/nel/src/3d/cloud.cpp
@@ -120,10 +120,10 @@ void CCloud::generate (CNoise3d &noise)
 	{
 		CVertexBufferReadWrite vba;
 		rVB.lock (vba);
-		CVector *pVertices = vba.getVertexCoordPointer (0);
-		*pVertices = CVector(0.0f,				0.0f,				0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)_NbW*_Width,0.0f,				0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)_NbW*_Width,(float)_NbH*_Height,0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+		CVectorPacked *pVertices = vba.getVertexCoordPointer (0);
+		*pVertices = CVector(0.0f,				0.0f,				0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVector((float)_NbW*_Width,0.0f,				0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVector((float)_NbW*_Width,(float)_NbH*_Height,0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 		*pVertices = CVector(0.0f,				(float)_NbH*_Height,0.0f);
 		_CloudScape->_MatClear.setColor (CRGBA(0,0,0,0));
 	}
@@ -197,10 +197,10 @@ void CCloud::light ()
 	{
 		CVertexBufferReadWrite vba;
 		rVB.lock (vba);
-		CVector *pVertices = vba.getVertexCoordPointer (0);
-		*pVertices = CVector((float)0.0f,	(float)0.0f,	0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)1.f,	(float)0.0f,	0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)1.f,	(float)1.f,		0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+		CVectorPacked *pVertices = vba.getVertexCoordPointer (0);
+		*pVertices = CVector((float)0.0f,	(float)0.0f,	0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVector((float)1.f,	(float)0.0f,	0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVector((float)1.f,	(float)1.f,		0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 		*pVertices = CVector((float)0.0f,	(float)1.f,		0.0f);
 	}
 
@@ -340,10 +340,10 @@ void CCloud::reset (NL3D::CCamera *pViewer)
 		CVertexBufferReadWrite vba;
 		rVB.lock (vba);
 		uint32 nVSize = rVB.getVertexSize ();
-		CVector *pVertices = vba.getVertexCoordPointer (0);
-		*pVertices = CVector(0.0f, 0.0f, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector(5.0f, 0.0f, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector(5.0f, 5.0f, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+		CVectorPacked *pVertices = vba.getVertexCoordPointer (0);
+		*pVertices = CVector(0.0f, 0.0f, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVector(5.0f, 0.0f, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVector(5.0f, 5.0f, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 		*pVertices = CVector(0.0f, 5.0f, 0.0f);
 		_CloudScape->_MatClear.setColor (CRGBA(0,0,0,0));
 	}
@@ -469,7 +469,7 @@ void CCloud::dispXYZ (CMaterial *pMat)
 	float oneOverNbWNbH = 1.0f / (_NbW*_NbH);
 	CVertexBuffer &rVB = _CloudScape->_VertexBuffer;
 	uint32 nVSize = rVB.getVertexSize ();
-	CVector *pVertices;
+	CVectorPacked *pVertices;
 	CUV *pUV;
 	_Driver->activeVertexBuffer (rVB);
 
@@ -487,9 +487,9 @@ void CCloud::dispXYZ (CMaterial *pMat)
 				rVB.lock (vba);
 
 				pVertices = vba.getVertexCoordPointer (0);
-				*pVertices = CVector(_Pos.x,			_Pos.y,			_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-				*pVertices = CVector(_Pos.x+_Size.x,	_Pos.y,			_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-				*pVertices = CVector(_Pos.x+_Size.x,	_Pos.y+_Size.y,	_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+				*pVertices = CVector(_Pos.x,			_Pos.y,			_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+				*pVertices = CVector(_Pos.x+_Size.x,	_Pos.y,			_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+				*pVertices = CVector(_Pos.x+_Size.x,	_Pos.y+_Size.y,	_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 				*pVertices = CVector(_Pos.x,			_Pos.y+_Size.y,	_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH);
 
 				pUV = vba.getTexCoordPointer (0, 0);
@@ -512,10 +512,10 @@ void CCloud::dispXYZ (CMaterial *pMat)
 	{
 		CVertexBufferReadWrite vba;
 		rVB.lock (vba);
-		CVector *pVertices = vba.getVertexCoordPointer (0);
-		*pVertices = CVector((float)0.25f,	0, (float)0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)0.75f,	0, (float)0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)0.75f,	0, (float)0.75f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+		CVectorPacked *pVertices = vba.getVertexCoordPointer (0);
+		*pVertices = CVector((float)0.25f,	0, (float)0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVector((float)0.75f,	0, (float)0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVector((float)0.75f,	0, (float)0.75f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 		*pVertices = CVector((float)0.25f,	0, (float)0.75f);
 	}
 }
@@ -664,10 +664,10 @@ void CCloud::genBill (CCamera *pCam, uint32 nBillSize)
 		CVertexBufferReadWrite vba;
 		rVB.lock (vba);
 		{
-			CVector *pVertices = vba.getVertexCoordPointer (0);
-			*pVertices = CVector(0.0f,	0.0f,	0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector(1.0f,	0.0f,	0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector(1.0f,	0.0f,	1.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+			CVectorPacked *pVertices = vba.getVertexCoordPointer (0);
+			*pVertices = CVector(0.0f,	0.0f,	0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector(1.0f,	0.0f,	0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector(1.0f,	0.0f,	1.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 			*pVertices = CVector(0.0f,	0.0f,	1.0f);
 		}
 	}
@@ -782,10 +782,10 @@ void CCloud::dispBill (CCamera *pCam)
 		rVB.lock (vba);
 
 		uint32 nVSize = rVB.getVertexSize ();
-		CVector *pVertices = vba.getVertexCoordPointer (0);
-		*pVertices = qc.V0; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = qc.V1; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = qc.V2; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+		CVectorPacked *pVertices = vba.getVertexCoordPointer (0);
+		*pVertices = qc.V0; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = qc.V1; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = qc.V2; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 		*pVertices = qc.V3;
 
 		CUV *pUV = vba.getTexCoordPointer (0, 0);
diff --git a/code/nel/src/3d/mesh_morpher.cpp b/code/nel/src/3d/mesh_morpher.cpp
index 4d7ed8255..bfca4b7c7 100644
--- a/code/nel/src/3d/mesh_morpher.cpp
+++ b/code/nel/src/3d/mesh_morpher.cpp
@@ -163,14 +163,14 @@ void CMeshMorpher::update (std::vector<CAnimatedMorph> *pBSFactor)
 			if (_VBDst->getVertexFormat() & CVertexBuffer::PositionFlag)
 			if (rBS.deltaPos.size() > 0)
 			{
-				CVector *pV = dstvba.getVertexCoordPointer (vp);
+				CVectorPacked *pV = dstvba.getVertexCoordPointer (vp);
 				*pV += rBS.deltaPos[j] * rFactor;
 			}
 
 			if (_VBDst->getVertexFormat() & CVertexBuffer::NormalFlag)
 			if (rBS.deltaNorm.size() > 0)
 			{
-				CVector *pV = dstvba.getNormalCoordPointer (vp);
+				CVectorPacked *pV = dstvba.getNormalCoordPointer (vp);
 				*pV += rBS.deltaNorm[j] * rFactor;
 			}
 
@@ -264,13 +264,13 @@ void CMeshMorpher::updateSkinned (std::vector<CAnimatedMorph> *pBSFactor)
 			pDst[j+i*VBVertexSize] = pOri[j+i*VBVertexSize];
 
 		if (_Vertices != NULL)
-			_Vertices->operator[](i) = ((CVector*)(pOri+i*VBVertexSize))[0];
+			_Vertices->operator[](i) = ((CVectorPacked*)(pOri+i*VBVertexSize))[0];
 
 		if (_Normals != NULL)
-			_Normals->operator[](i) = ((CVector*)(pOri+i*VBVertexSize))[1];
+			_Normals->operator[](i) = ((CVectorPacked*)(pOri+i*VBVertexSize))[1];
 
 		if (_TgSpace != NULL)
-			(*_TgSpace)[i] = * (CVector*)(pOri + i * VBVertexSize + tgSpaceOff);
+			(*_TgSpace)[i] = * (CVectorPacked*)(pOri + i * VBVertexSize + tgSpaceOff);
 
 		_Flags[i] = OriginalVBDst;
 	}
@@ -388,8 +388,8 @@ void CMeshMorpher::updateRawSkin (CVertexBuffer *vbOri,
 	{
 		if(*vRemap)
 		{
-			(*vRemap)->Pos= *(CVector*)(pOri);
-			(*vRemap)->Normal= *(CVector*)(pOri + NL3D_RAWSKIN_NORMAL_OFF);
+			(*vRemap)->Pos= *(CVectorPacked*)(pOri);
+			(*vRemap)->Normal= *(CVectorPacked*)(pOri + NL3D_RAWSKIN_NORMAL_OFF);
 			(*vRemap)->UV= *(CUV*)(pOri + NL3D_RAWSKIN_UV_OFF);
 		}
 		pOri+= NL3D_RAWSKIN_VERTEX_SIZE;
@@ -420,9 +420,9 @@ void CMeshMorpher::updateRawSkin (CVertexBuffer *vbOri,
 				// If exist in this Lod RawSkin, apply
 				if(rsVert)
 				{
-					if(hasPos)
+					if(hasPos) // FIXME_SSE2: +=
 						rsVert->Pos+= rBS.deltaPos[j] * rFactor;
-					if(hasNorm)
+					if(hasNorm) // FIXME_SSE2: +=
 						rsVert->Normal+= rBS.deltaNorm[j] * rFactor;
 					if(hasUV)
 						rsVert->UV+= rBS.deltaUV[j] * rFactor;
diff --git a/code/nel/src/3d/mesh_mrm_skin_template.cpp b/code/nel/src/3d/mesh_mrm_skin_template.cpp
index 6b85326ee..bda804ebc 100644
--- a/code/nel/src/3d/mesh_mrm_skin_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skin_template.cpp
@@ -78,14 +78,18 @@ static void	applyArraySkinNormalT(uint numMatrixes, uint32 *infPtr, CMesh::CSkin
 			CVector				*srcVertex= srcVertexPtr + index;
 			CVector				*srcNormal= srcNormalPtr + index;
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
+			CVector				tempVertex;
+			CVector				tempNormal;
 
 
 			// Vertex.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, *dstVertex);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, tempVertex);
+			*dstVertex = tempVertex;
 			// Normal.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, *dstNormal);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, tempNormal);
+			*dstNormal = tempNormal;
 		}
 		break;
 
@@ -99,16 +103,20 @@ static void	applyArraySkinNormalT(uint numMatrixes, uint32 *infPtr, CMesh::CSkin
 			CVector				*srcVertex= srcVertexPtr + index;
 			CVector				*srcNormal= srcNormalPtr + index;
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
+			CVector				tempVertex;
+			CVector				tempNormal;
 
 
 			// Vertex.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], tempVertex);
+			*dstVertex = tempVertex;
 			// Normal.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], *dstNormal);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], tempNormal);
+			*dstNormal = tempNormal;
 		}
 		break;
 
@@ -122,18 +130,22 @@ static void	applyArraySkinNormalT(uint numMatrixes, uint32 *infPtr, CMesh::CSkin
 			CVector				*srcVertex= srcVertexPtr + index;
 			CVector				*srcNormal= srcNormalPtr + index;
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
+			CVector				tempVertex;
+			CVector				tempNormal;
 
 
 			// Vertex.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], *dstVertex);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], tempVertex);
+			*dstVertex = tempVertex;
 			// Normal.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcNormal, srcSkin->Weights[2], *dstNormal);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcNormal, srcSkin->Weights[2], tempNormal);
+			*dstNormal = tempNormal;
 		}
 		break;
 
@@ -147,20 +159,24 @@ static void	applyArraySkinNormalT(uint numMatrixes, uint32 *infPtr, CMesh::CSkin
 			CVector				*srcVertex= srcVertexPtr + index;
 			CVector				*srcNormal= srcNormalPtr + index;
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
+			CVector				tempVertex;
+			CVector				tempNormal;
 
 
 			// Vertex.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddPoint( *srcVertex, srcSkin->Weights[3], *dstVertex);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddPoint( *srcVertex, srcSkin->Weights[3], tempVertex);
+			*dstVertex = tempVertex;
 			// Normal.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcNormal, srcSkin->Weights[2], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddVector( *srcNormal, srcSkin->Weights[3], *dstNormal);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcNormal, srcSkin->Weights[2], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddVector( *srcNormal, srcSkin->Weights[3], tempNormal);
+			*dstNormal = tempNormal;
 		}
 		break;
 
@@ -220,18 +236,24 @@ static void	applyArraySkinTangentSpaceT(uint numMatrixes, uint32 *infPtr, CMesh:
 			CVector				*srcTgSpace= tgSpacePtr + index;
 			//
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
-			CVector				*dstTgSpace= (CVector*)(dstVertexVB + tgSpaceOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstTgSpace= (CVectorPacked*)(dstVertexVB + tgSpaceOff);
+			CVector				tempVertex;
+			CVector				tempNormal;
+			CVector				tempTgSpace;
 
 
 
 			// Vertex.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, *dstVertex);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, tempVertex);
+			*dstVertex = tempVertex;
 			// Normal.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, *dstNormal);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, tempNormal);
+			*dstNormal = tempNormal;
 			// Tg space
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcTgSpace, *dstTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcTgSpace, tempTgSpace);
+			*dstTgSpace = tempTgSpace;
 
 		}
 		break;
@@ -248,19 +270,25 @@ static void	applyArraySkinTangentSpaceT(uint numMatrixes, uint32 *infPtr, CMesh:
 			CVector				*srcTgSpace= tgSpacePtr + index;
 			//
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
-			CVector				*dstTgSpace= (CVector*)(dstVertexVB + tgSpaceOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstTgSpace= (CVectorPacked*)(dstVertexVB + tgSpaceOff);
+			CVector				tempVertex;
+			CVector				tempNormal;
+			CVector				tempTgSpace;
 
 			// Vertex.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], tempVertex);
+			*dstVertex = tempVertex;
 			// Normal.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], *dstNormal);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], tempNormal);
+			*dstNormal = tempNormal;
 			// Tg space
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcTgSpace, srcSkin->Weights[0], *dstTgSpace);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcTgSpace, srcSkin->Weights[1], *dstTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcTgSpace, srcSkin->Weights[0], tempTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcTgSpace, srcSkin->Weights[1], tempTgSpace);
+			*dstTgSpace = tempTgSpace;
 		}
 		break;
 
@@ -276,22 +304,28 @@ static void	applyArraySkinTangentSpaceT(uint numMatrixes, uint32 *infPtr, CMesh:
 			CVector				*srcTgSpace= tgSpacePtr + index;
 			//
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
-			CVector				*dstTgSpace= (CVector*)(dstVertexVB + tgSpaceOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstTgSpace= (CVectorPacked*)(dstVertexVB + tgSpaceOff);
+			CVector				tempVertex;
+			CVector				tempNormal;
+			CVector				tempTgSpace;
 
 			// Vertex.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], *dstVertex);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], tempVertex);
+			*dstVertex = tempVertex;
 			// Normal.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcNormal, srcSkin->Weights[2], *dstNormal);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcNormal, srcSkin->Weights[2], tempNormal);
+			*dstNormal = tempNormal;
 			// Tg space
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcTgSpace, srcSkin->Weights[0], *dstTgSpace);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcTgSpace, srcSkin->Weights[1], *dstTgSpace);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcTgSpace, srcSkin->Weights[2], *dstTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcTgSpace, srcSkin->Weights[0], tempTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcTgSpace, srcSkin->Weights[1], tempTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcTgSpace, srcSkin->Weights[2], tempTgSpace);
+			*dstTgSpace = tempTgSpace;
 		}
 		break;
 
@@ -307,25 +341,33 @@ static void	applyArraySkinTangentSpaceT(uint numMatrixes, uint32 *infPtr, CMesh:
 			CVector				*srcTgSpace= tgSpacePtr + index;
 			//
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
-			CVector				*dstTgSpace= (CVector*)(dstVertexVB + tgSpaceOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstTgSpace= (CVectorPacked*)(dstVertexVB + tgSpaceOff);
+
+			CVector				tempVertex;
+			CVector				tempNormal;
+			CVector				tempTgSpace;
 
 			// Vertex.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], *dstVertex);
-			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddPoint( *srcVertex, srcSkin->Weights[3], *dstVertex);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], tempVertex);
+			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddPoint( *srcVertex, srcSkin->Weights[3], tempVertex);
 			// Normal.
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcNormal, srcSkin->Weights[2], *dstNormal);
-			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddVector( *srcNormal, srcSkin->Weights[3], *dstNormal);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcNormal, srcSkin->Weights[0], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcNormal, srcSkin->Weights[1], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcNormal, srcSkin->Weights[2], tempNormal);
+			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddVector( *srcNormal, srcSkin->Weights[3], tempNormal);
 			// Tg space
-			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcTgSpace, srcSkin->Weights[0], *dstTgSpace);
-			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcTgSpace, srcSkin->Weights[1], *dstTgSpace);
-			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcTgSpace, srcSkin->Weights[2], *dstTgSpace);
-			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddVector( *srcTgSpace, srcSkin->Weights[3], *dstTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[0] ].mulSetVector( *srcTgSpace, srcSkin->Weights[0], tempTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[1] ].mulAddVector( *srcTgSpace, srcSkin->Weights[1], tempTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[2] ].mulAddVector( *srcTgSpace, srcSkin->Weights[2], tempTgSpace);
+			boneMat3x4[ srcSkin->MatrixId[3] ].mulAddVector( *srcTgSpace, srcSkin->Weights[3], tempTgSpace);
+
+			*dstVertex = tempVertex;
+			*dstNormal = tempNormal;
+			*dstTgSpace = tempTgSpace;
 		}
 		break;
 
@@ -530,16 +572,19 @@ void		CMeshMRMGeom::applyArrayRawSkinNormal1(CRawVertexNormalSkin1 *src, uint8 *
 
 #ifndef NL3D_RAWSKIN_ASM
 		//  for all InfluencedVertices only.
+		CVector tmp;
 		for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE)
 		{
-			CVector				*dstVertex= (CVector*)(destVertexPtr);
-			CVector				*dstNormal= (CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF);
+			CVectorPacked				*dstVertex= (CVectorPacked*)(destVertexPtr);
+			CVectorPacked				*dstNormal= (CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF);
 
 			// For 1 matrix, can write directly to AGP (if destVertexPtr is AGP...)
 			// Vertex.
-			boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, *(CVector*)(destVertexPtr) );
+			boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, tmp );
+			*(CVectorPacked*)(destVertexPtr) = tmp;
 			// Normal.
-			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) );
+			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, tmp );
+			*(CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) = tmp;
 			// UV copy.
 			*(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV;
 		}
@@ -717,11 +762,11 @@ void		CMeshMRMGeom::applyArrayRawSkinNormal2(CRawVertexNormalSkin2 *src, uint8 *
 			// Vertex.
 			boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex.Pos, src->Weights[1], tmpVert);
-			*(CVector*)(destVertexPtr)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr)= tmpVert;
 			// Normal.
 			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Vertex.Normal, src->Weights[1], tmpVert);
-			*(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
 			// UV copy.
 			*(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV;
 		}
@@ -1021,12 +1066,12 @@ void		CMeshMRMGeom::applyArrayRawSkinNormal3(CRawVertexNormalSkin3 *src, uint8 *
 			boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex.Pos, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex.Pos, src->Weights[1], tmpVert);
 			boneMat3x4[ src->MatrixId[2] ].mulAddPoint( src->Vertex.Pos, src->Weights[2], tmpVert);
-			*(CVector*)(destVertexPtr)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr)= tmpVert;
 			// Normal.
 			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Vertex.Normal, src->Weights[1], tmpVert);
 			boneMat3x4[ src->MatrixId[2] ].mulAddVector( src->Vertex.Normal, src->Weights[2], tmpVert);
-			*(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
 			// UV copy.
 			*(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV;
 		}
@@ -1414,13 +1459,13 @@ void		CMeshMRMGeom::applyArrayRawSkinNormal4(CRawVertexNormalSkin4 *src, uint8 *
 			boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex.Pos, src->Weights[1], tmpVert);
 			boneMat3x4[ src->MatrixId[2] ].mulAddPoint( src->Vertex.Pos, src->Weights[2], tmpVert);
 			boneMat3x4[ src->MatrixId[3] ].mulAddPoint( src->Vertex.Pos, src->Weights[3], tmpVert);
-			*(CVector*)(destVertexPtr)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr)= tmpVert;
 			// Normal.
 			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Vertex.Normal, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Vertex.Normal, src->Weights[1], tmpVert);
 			boneMat3x4[ src->MatrixId[2] ].mulAddVector( src->Vertex.Normal, src->Weights[2], tmpVert);
 			boneMat3x4[ src->MatrixId[3] ].mulAddVector( src->Vertex.Normal, src->Weights[3], tmpVert);
-			*(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
 			// UV copy.
 			*(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->Vertex.UV;
 		}
diff --git a/code/nel/src/3d/mesh_mrm_skinned_template.cpp b/code/nel/src/3d/mesh_mrm_skinned_template.cpp
index e60a5632b..afacd48fb 100644
--- a/code/nel/src/3d/mesh_mrm_skinned_template.cpp
+++ b/code/nel/src/3d/mesh_mrm_skinned_template.cpp
@@ -79,16 +79,19 @@ void		CMeshMRMSkinnedGeom::applyArrayRawSkinNormal1(CRawVertexNormalSkinned1 *sr
 
 #ifndef NL3D_RAWSKIN_ASM
 		//  for all InfluencedVertices only.
+		CVector tmp;
 		for(;nBlockInf>0;nBlockInf--, src++, destVertexPtr+=NL3D_RAWSKIN_VERTEX_SIZE)
 		{
-			CVector				*dstVertex= (CVector*)(destVertexPtr);
-			CVector				*dstNormal= (CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF);
+			CVectorPacked				*dstVertex= (CVectorPacked*)(destVertexPtr);
+			CVectorPacked				*dstNormal= (CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF);
 
 			// For 1 matrix, can write directly to AGP (if destVertexPtr is AGP...)
 			// Vertex.
-			boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, *(CVector*)(destVertexPtr) );
+			boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, tmp );
+			*(CVectorPacked*)(destVertexPtr) = tmp;
 			// Normal.
-			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, *(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) );
+			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, tmp );
+			*(CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF) = tmp;
 			// UV copy.
 			*(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
 		}
@@ -266,11 +269,11 @@ void		CMeshMRMSkinnedGeom::applyArrayRawSkinNormal2(CRawVertexNormalSkinned2 *sr
 			// Vertex.
 			boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex, src->Weights[1], tmpVert);
-			*(CVector*)(destVertexPtr)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr)= tmpVert;
 			// Normal.
 			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Normal, src->Weights[1], tmpVert);
-			*(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
 			// UV copy.
 			*(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
 		}
@@ -570,12 +573,12 @@ void		CMeshMRMSkinnedGeom::applyArrayRawSkinNormal3(CRawVertexNormalSkinned3 *sr
 			boneMat3x4[ src->MatrixId[0] ].mulSetPoint( src->Vertex, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex, src->Weights[1], tmpVert);
 			boneMat3x4[ src->MatrixId[2] ].mulAddPoint( src->Vertex, src->Weights[2], tmpVert);
-			*(CVector*)(destVertexPtr)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr)= tmpVert;
 			// Normal.
 			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Normal, src->Weights[1], tmpVert);
 			boneMat3x4[ src->MatrixId[2] ].mulAddVector( src->Normal, src->Weights[2], tmpVert);
-			*(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
 			// UV copy.
 			*(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
 		}
@@ -963,13 +966,13 @@ void		CMeshMRMSkinnedGeom::applyArrayRawSkinNormal4(CRawVertexNormalSkinned4 *sr
 			boneMat3x4[ src->MatrixId[1] ].mulAddPoint( src->Vertex, src->Weights[1], tmpVert);
 			boneMat3x4[ src->MatrixId[2] ].mulAddPoint( src->Vertex, src->Weights[2], tmpVert);
 			boneMat3x4[ src->MatrixId[3] ].mulAddPoint( src->Vertex, src->Weights[3], tmpVert);
-			*(CVector*)(destVertexPtr)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr)= tmpVert;
 			// Normal.
 			boneMat3x4[ src->MatrixId[0] ].mulSetVector( src->Normal, src->Weights[0], tmpVert);
 			boneMat3x4[ src->MatrixId[1] ].mulAddVector( src->Normal, src->Weights[1], tmpVert);
 			boneMat3x4[ src->MatrixId[2] ].mulAddVector( src->Normal, src->Weights[2], tmpVert);
 			boneMat3x4[ src->MatrixId[3] ].mulAddVector( src->Normal, src->Weights[3], tmpVert);
-			*(CVector*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
+			*(CVectorPacked*)(destVertexPtr + NL3D_RAWSKIN_NORMAL_OFF)= tmpVert;
 			// UV copy.
 			*(CUV*)(destVertexPtr + NL3D_RAWSKIN_UV_OFF)= src->UV;
 		}
diff --git a/code/nel/src/3d/noise_3d.cpp b/code/nel/src/3d/noise_3d.cpp
index 5900e4750..d8e27677b 100644
--- a/code/nel/src/3d/noise_3d.cpp
+++ b/code/nel/src/3d/noise_3d.cpp
@@ -151,14 +151,14 @@ void CNoise3d::render2passes (CQuadUV &qc, float wpos, float alpha)
 	_VertexBuffer.lock (vba);
 
 	uint32 nVSize = _VertexBuffer.getVertexSize ();
-	CVector *pVertices = vba.getVertexCoordPointer(_NbVertices);
-	*pVertices = qc.V0; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-	*pVertices = qc.V1; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-	*pVertices = qc.V2; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-	*pVertices = qc.V3; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-	*pVertices = qc.V0; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-	*pVertices = qc.V1; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-	*pVertices = qc.V2; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+	CVectorPacked *pVertices = vba.getVertexCoordPointer(_NbVertices);
+	*pVertices = qc.V0; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+	*pVertices = qc.V1; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+	*pVertices = qc.V2; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+	*pVertices = qc.V3; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+	*pVertices = qc.V0; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+	*pVertices = qc.V1; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+	*pVertices = qc.V2; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 	*pVertices = qc.V3;
 
 	CUV *pUV = vba.getTexCoordPointer (_NbVertices, 0);
@@ -232,10 +232,10 @@ void CNoise3d::render (CQuadUV &qc, float wpos, float intensity)
 	CVertexBufferReadWrite vba;
 	_VertexBuffer.lock (vba);
 
-	CVector *pVertices = vba.getVertexCoordPointer(_NbVertices);
-	*pVertices = qc.V0; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-	*pVertices = qc.V1; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-	*pVertices = qc.V2; pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+	CVectorPacked *pVertices = vba.getVertexCoordPointer(_NbVertices);
+	*pVertices = qc.V0; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+	*pVertices = qc.V1; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+	*pVertices = qc.V2; pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 	*pVertices = qc.V3;
 
 	CUV *pUV = vba.getTexCoordPointer (_NbVertices, 0);
@@ -281,7 +281,7 @@ void CNoise3d::renderGrid (uint32 nbw, uint32 nbh, uint32 w, uint32 h,
 
 	uint32 i, j, nSlice1, nSlice2;
 	float wpos, oneOverNbWNbH = 1.0f / (nbw*nbh);
-	CVector *pVertices;
+	CVectorPacked *pVertices;
 	CUV *pUV0, *pUV1;
 	uint8 *pColA, nAlphaPos;
 	uint32 nVSize = _VertexBuffer.getVertexSize ();
@@ -319,10 +319,10 @@ void CNoise3d::renderGrid (uint32 nbw, uint32 nbh, uint32 w, uint32 h,
 			// If wpos is just on slice1 alpha must be one
 			nAlphaPos = (uint8)( 255*(1.0f - _Depth*(wpos - (((float)nSlice1) / _Depth))) );
 
-			*pVertices = CVector((float)i*w,	 (float)j*h,	 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)(i+1)*w, (float)j*h,	 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)(i+1)*w, (float)(j+1)*h, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)i*w,	 (float)(j+1)*h, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)i*w,	 (float)j*h,	 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)(i+1)*w, (float)j*h,	 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)(i+1)*w, (float)(j+1)*h, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)i*w,	 (float)(j+1)*h, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 
 			pUV0->U = UStart+_OffS[nSlice1].U;	pUV0->V = VStart+_OffS[nSlice1].V;	pUV0 = (CUV*)( ((uint8*)pUV0) + nVSize );
 			pUV0->U = dU+_OffS[nSlice1].U;		pUV0->V = VStart+_OffS[nSlice1].V;	pUV0 = (CUV*)( ((uint8*)pUV0) + nVSize );
@@ -350,7 +350,7 @@ void CNoise3d::renderGrid2passes (uint32 nbw, uint32 nbh, uint32 w, uint32 h,
 {
 	uint32 i, j, nSlice1, nSlice2;
 	float wpos, oneOverNbWNbH = 1.0f / (nbw*nbh);
-	CVector *pVertices;
+	CVectorPacked *pVertices;
 	CUV *pUV0;
 	uint8 *pColA, nFinalAlpha;
 	uint32 nVSize = _VertexBuffer.getVertexSize ();
@@ -387,14 +387,14 @@ void CNoise3d::renderGrid2passes (uint32 nbw, uint32 nbh, uint32 w, uint32 h,
 			// If wpos is just on slice1 alpha must be one
 			float alphaPos = 1.0f - _Depth*(wpos - (((float)nSlice1) / _Depth));
 
-			*pVertices = CVector((float)i*w,	 (float)j*h,	 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)(i+1)*w, (float)j*h,	 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)(i+1)*w, (float)(j+1)*h, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)i*w,	 (float)(j+1)*h, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)i*w,	 (float)j*h,	 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)(i+1)*w, (float)j*h,	 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)(i+1)*w, (float)(j+1)*h, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
-			*pVertices = CVector((float)i*w,	 (float)(j+1)*h, 0.0f); pVertices = (CVector*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)i*w,	 (float)j*h,	 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)(i+1)*w, (float)j*h,	 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)(i+1)*w, (float)(j+1)*h, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)i*w,	 (float)(j+1)*h, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)i*w,	 (float)j*h,	 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)(i+1)*w, (float)j*h,	 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)(i+1)*w, (float)(j+1)*h, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+			*pVertices = CVector((float)i*w,	 (float)(j+1)*h, 0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
 
 			pUV0->U = UStart+_OffS[nSlice1].U;	pUV0->V = VStart+_OffS[nSlice1].V;	pUV0 = (CUV*)( ((uint8*)pUV0) + nVSize );
 			pUV0->U = dU+_OffS[nSlice1].U;		pUV0->V = VStart+_OffS[nSlice1].V;	pUV0 = (CUV*)( ((uint8*)pUV0) + nVSize );
diff --git a/code/nel/src/3d/packed_zone.cpp b/code/nel/src/3d/packed_zone.cpp
index 1c6bf4817..944e9c8d1 100644
--- a/code/nel/src/3d/packed_zone.cpp
+++ b/code/nel/src/3d/packed_zone.cpp
@@ -576,6 +576,22 @@ void CPackedZone32::unpackTri(const CPackedTri &src, CVector dest[3]) const
 
 }
 
+// ***************************************************************************************
+void CPackedZone32::unpackTri(const CPackedTri &src, CVectorPacked dest[3]) const
+{
+	// TODO: add 'multiply-add' operator
+	dest[0].set(Verts[src.V0].X * _PackedLocalToWorld.x + _Origin.x,
+		        Verts[src.V0].Y * _PackedLocalToWorld.y + _Origin.y,
+				Verts[src.V0].Z * _PackedLocalToWorld.z + _Origin.z);
+	dest[1].set(Verts[src.V1].X * _PackedLocalToWorld.x + _Origin.x,
+		        Verts[src.V1].Y * _PackedLocalToWorld.y + _Origin.y,
+				Verts[src.V1].Z * _PackedLocalToWorld.z + _Origin.z);
+	dest[2].set(Verts[src.V2].X * _PackedLocalToWorld.x + _Origin.x,
+		        Verts[src.V2].Y * _PackedLocalToWorld.y + _Origin.y,
+				Verts[src.V2].Z * _PackedLocalToWorld.z + _Origin.z);
+
+}
+
 uint32 CPackedZone32::UndefIndex = 0xffffffff;
 
 // ***************************************************************************************
@@ -973,8 +989,8 @@ void CPackedZone32::render(CVertexBuffer &vb, IDriver &drv, CMaterial &material,
 		CVertexBufferReadWrite vba;
 		vb.setNumVertices(batchSize * 3);
 		vb.lock(vba);
-		CVector *dest = vba.getVertexCoordPointer(0);
-		const CVector *endDest = dest + batchSize * 3;
+		CVectorPacked *dest = vba.getVertexCoordPointer(0);
+		const CVectorPacked *endDest = dest + batchSize * 3;
 		for(sint y = 0; y < (sint) silhouette.size(); ++y)
 		{
 			sint gridY = y + minY;
@@ -1196,8 +1212,8 @@ void CPackedZone16::render(CVertexBuffer &vb, IDriver &drv, CMaterial &material,
 		CVertexBufferReadWrite vba;
 		vb.setNumVertices(batchSize * 3);
 		vb.lock(vba);
-		CVector *dest = vba.getVertexCoordPointer(0);
-		const CVector *endDest = dest + batchSize * 3;
+		CVectorPacked *dest = vba.getVertexCoordPointer(0);
+		const CVectorPacked *endDest = dest + batchSize * 3;
 		for(sint y = 0; y < (sint) silhouette.size(); ++y)
 		{
 			sint gridY = y + minY;
@@ -1254,6 +1270,23 @@ void CPackedZone16::render(CVertexBuffer &vb, IDriver &drv, CMaterial &material,
 
 
 
+// ***************************************************************************************
+void CPackedZone16::unpackTri(const CPackedTri16 &src, CVectorPacked dest[3]) const
+{
+	// yes this is ugly code duplication of CPackedZone32::unpackTri but this code is temporary anyway...
+	// TODO: add 'multiply-add' operator
+	dest[0].set(Verts[src.V0].X * _PackedLocalToWorld.x + _Origin.x,
+		        Verts[src.V0].Y * _PackedLocalToWorld.y + _Origin.y,
+				Verts[src.V0].Z * _PackedLocalToWorld.z + _Origin.z);
+	dest[1].set(Verts[src.V1].X * _PackedLocalToWorld.x + _Origin.x,
+		        Verts[src.V1].Y * _PackedLocalToWorld.y + _Origin.y,
+				Verts[src.V1].Z * _PackedLocalToWorld.z + _Origin.z);
+	dest[2].set(Verts[src.V2].X * _PackedLocalToWorld.x + _Origin.x,
+		        Verts[src.V2].Y * _PackedLocalToWorld.y + _Origin.y,
+				Verts[src.V2].Z * _PackedLocalToWorld.z + _Origin.z);
+
+}
+
 // ***************************************************************************************
 void CPackedZone16::unpackTri(const CPackedTri16 &src, CVector dest[3]) const
 {
diff --git a/code/nel/src/3d/vegetable_shape.cpp b/code/nel/src/3d/vegetable_shape.cpp
index 7d9991b65..5b08d32ea 100644
--- a/code/nel/src/3d/vegetable_shape.cpp
+++ b/code/nel/src/3d/vegetable_shape.cpp
@@ -128,15 +128,15 @@ void		CVegetableShape::build(CVegetableShapeBuild &vbuild)
 	for(i=0;i<nbVerts;i++)
 	{
 		// Position.
-		const CVector		*srcPos= vba.getVertexCoordPointer(i);
-		CVector		*dstPos= vbaOut.getVertexCoordPointer(i);
+		const CVectorPacked		*srcPos= vba.getVertexCoordPointer(i);
+		CVectorPacked		*dstPos= vbaOut.getVertexCoordPointer(i);
 		*dstPos= *srcPos;
 
 		// Normal
 		if(Lighted)
 		{
-			const CVector *srcNormal= vba.getNormalCoordPointer(i);
-			CVector		*dstNormal= vbaOut.getNormalCoordPointer(i);
+			const CVectorPacked *srcNormal= vba.getNormalCoordPointer(i);
+			CVectorPacked		*dstNormal= vbaOut.getNormalCoordPointer(i);
 			*dstNormal= *srcNormal;
 		}
 
diff --git a/code/nel/src/3d/vertex_buffer.cpp b/code/nel/src/3d/vertex_buffer.cpp
index ea3ec5100..94f269a2a 100644
--- a/code/nel/src/3d/vertex_buffer.cpp
+++ b/code/nel/src/3d/vertex_buffer.cpp
@@ -1155,19 +1155,19 @@ IVBDrvInfos::~IVBDrvInfos()
 // CVertexBufferReadWrite
 // --------------------------------------------------
 
-NLMISC::CVector* CVertexBufferReadWrite::getVertexCoordPointer(uint idx)
+NLMISC::CVectorPacked* CVertexBufferReadWrite::getVertexCoordPointer(uint idx)
 {
 	nlassert (_Parent->checkLockedBuffer());
 	uint8*	ptr;
 
 	ptr=_Parent->_LockedBuffer;
 	ptr+=(idx*_Parent->_VertexSize);
-	return((NLMISC::CVector*)ptr);
+	return((NLMISC::CVectorPacked*)ptr);
 }
 
 // --------------------------------------------------
 
-NLMISC::CVector* CVertexBufferReadWrite::getNormalCoordPointer(uint idx)
+NLMISC::CVectorPacked* CVertexBufferReadWrite::getNormalCoordPointer(uint idx)
 {
 	nlassert (_Parent->checkLockedBuffer());
 	uint8*	ptr;
@@ -1179,7 +1179,7 @@ NLMISC::CVector* CVertexBufferReadWrite::getNormalCoordPointer(uint idx)
 	ptr=_Parent->_LockedBuffer;
 	ptr+=_Parent->_Offset[CVertexBuffer::Normal];
 	ptr+=idx*_Parent->_VertexSize;
-	return((NLMISC::CVector*)ptr);
+	return((NLMISC::CVectorPacked*)ptr);
 }
 
 // --------------------------------------------------
@@ -1280,19 +1280,19 @@ void CVertexBufferReadWrite::touchVertices (uint first, uint last)
 // CVertexBufferRead
 // --------------------------------------------------
 
-const NLMISC::CVector* CVertexBufferRead::getVertexCoordPointer(uint idx) const
+const NLMISC::CVectorPacked* CVertexBufferRead::getVertexCoordPointer(uint idx) const
 {
 	nlassert (_Parent->checkLockedBuffer());
 	const uint8*	ptr;
 
 	ptr=_Parent->_LockedBuffer;
 	ptr+=(idx*_Parent->_VertexSize);
-	return((const NLMISC::CVector*)ptr);
+	return((const NLMISC::CVectorPacked*)ptr);
 }
 
 // --------------------------------------------------
 
-const NLMISC::CVector* CVertexBufferRead::getNormalCoordPointer(uint idx) const
+const NLMISC::CVectorPacked* CVertexBufferRead::getNormalCoordPointer(uint idx) const
 {
 	nlassert (_Parent->checkLockedBuffer());
 	const uint8*	ptr;
@@ -1304,7 +1304,7 @@ const NLMISC::CVector* CVertexBufferRead::getNormalCoordPointer(uint idx) const
 	ptr=_Parent->_LockedBuffer;
 	ptr+=_Parent->_Offset[CVertexBuffer::Normal];
 	ptr+=idx*_Parent->_VertexSize;
-	return((const NLMISC::CVector*)ptr);
+	return((const NLMISC::CVectorPacked*)ptr);
 }
 
 // --------------------------------------------------
diff --git a/code/ryzom/client/src/landscape_poly_drawer.cpp b/code/ryzom/client/src/landscape_poly_drawer.cpp
index e98f17dc7..e8d5ba154 100644
--- a/code/ryzom/client/src/landscape_poly_drawer.cpp
+++ b/code/ryzom/client/src/landscape_poly_drawer.cpp
@@ -488,7 +488,7 @@ void CLandscapePolyDrawer::drawShadowVolume(uint poly, bool firstPass)
 
 		uint i;
 		CVector2f vertex;
-		CVector * vertexVB = NULL;
+		CVectorPacked * vertexVB = NULL;
 		const CVector cameraPos = Scene->getCam().getPos();
 
 		float height = 2000.0;

From 31b2141b129259908575d8c2bf7f261c6b74afa6 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 00:38:24 +0200
Subject: [PATCH 06/21] SSE2: CVector alignment fixes for particle systems

--HG--
branch : sse2
---
 code/nel/include/nel/3d/mesh.h                |   5 +-
 code/nel/include/nel/3d/ps_force.h            |  12 +-
 code/nel/include/nel/3d/ps_located.h          |  11 +-
 code/nel/include/nel/3d/ps_misc.h             |   6 +-
 code/nel/include/nel/3d/ps_ribbon.h           |   2 +-
 code/nel/include/nel/3d/ps_ribbon_base.h      |  20 +-
 .../nel/include/nel/3d/shadow_poly_receiver.h |   4 +-
 code/nel/include/nel/misc/matrix.h            |   2 +-
 code/nel/include/nel/misc/types_nl.h          |   3 +
 code/nel/include/nel/misc/vector.h            |  11 +-
 code/nel/src/3d/computed_string.cpp           |  81 ++---
 code/nel/src/3d/driver_user.cpp               |  88 ++---
 code/nel/src/3d/mesh.cpp                      |  32 +-
 code/nel/src/3d/mesh_morpher.cpp              |   2 +-
 code/nel/src/3d/packed_zone.cpp               |   8 +-
 code/nel/src/3d/patch_render.cpp              |  36 +--
 code/nel/src/3d/ps_dot.cpp                    |   8 +-
 code/nel/src/3d/ps_emitter.cpp                |  16 +-
 code/nel/src/3d/ps_face.cpp                   |  50 +--
 code/nel/src/3d/ps_face_look_at.cpp           | 305 +++++++++---------
 code/nel/src/3d/ps_fan_light.cpp              |  11 +-
 code/nel/src/3d/ps_force.cpp                  |  14 +-
 code/nel/src/3d/ps_located.cpp                |   4 +-
 code/nel/src/3d/ps_mesh.cpp                   |  18 +-
 code/nel/src/3d/ps_ribbon.cpp                 |  40 +--
 code/nel/src/3d/ps_ribbon_base.cpp            |  38 +--
 code/nel/src/3d/ps_ribbon_look_at.cpp         |  26 +-
 code/nel/src/3d/ps_shockwave.cpp              |   5 +-
 code/nel/src/3d/ps_tail_dot.cpp               |   6 +-
 code/nel/src/3d/ps_util.cpp                   |   1 +
 code/nel/src/3d/seg_remanence.cpp             |   5 +-
 code/nel/src/3d/water_model.cpp               |  37 +--
 32 files changed, 470 insertions(+), 437 deletions(-)

diff --git a/code/nel/include/nel/3d/mesh.h b/code/nel/include/nel/3d/mesh.h
index 780a455aa..2d31adbd7 100644
--- a/code/nel/include/nel/3d/mesh.h
+++ b/code/nel/include/nel/3d/mesh.h
@@ -41,6 +41,7 @@ namespace NL3D
 
 
 using	NLMISC::CVector;
+using	NLMISC::CVectorPacked;
 using	NLMISC::CPlane;
 using	NLMISC::CMatrix;
 
@@ -842,8 +843,8 @@ private:
 
 	void	flagSkinVerticesForMatrixBlock(uint8 *skinFlags, CMatrixBlock &mb);
 	void	computeSkinMatrixes(CSkeletonModel *skeleton, CMatrix3x4 *matrixes, CMatrixBlock  *prevBlock, CMatrixBlock &curBlock);
-	void	computeSoftwarePointSkinning(CMatrix3x4 *matrixes, CVector *srcVector, CPaletteSkin *srcPal, float *srcWgt, CVector *dstVector);
-	void	computeSoftwareVectorSkinning(CMatrix3x4 *matrixes, CVector *srcVector, CPaletteSkin *srcPal, float *srcWgt, CVector *dstVector);
+	void	computeSoftwarePointSkinning(CMatrix3x4 *matrixes, CVector *srcVector, CPaletteSkin *srcPal, float *srcWgt, CVectorPacked *dstVector);
+	void	computeSoftwareVectorSkinning(CMatrix3x4 *matrixes, CVector *srcVector, CPaletteSkin *srcPal, float *srcWgt, CVectorPacked *dstVector);
 
 	// Shadow mapping and CMesh. NB: not serialized, but created at each load
 	CShadowSkin				_ShadowSkin;
diff --git a/code/nel/include/nel/3d/ps_force.h b/code/nel/include/nel/3d/ps_force.h
index 76f22f40b..76cf90ee4 100644
--- a/code/nel/include/nel/3d/ps_force.h
+++ b/code/nel/include/nel/3d/ps_force.h
@@ -102,9 +102,9 @@ public:
 	  */
 	virtual void integrateSingle(float /* startDate */, float /* deltaT */, uint /* numStep */,
 								 const CPSLocated * /* src */, uint32 /* indexInLocated */,
-								 NLMISC::CVector * /* destPos */,
+								 NLMISC::CVectorPacked * /* destPos */,
 								 bool /* accumulate */ = false,
-								 uint /* posStride */ = sizeof(NLMISC::CVector)) const
+								 uint /* posStride */ = sizeof(NLMISC::CVectorPacked)) const
 	{
 		 nlassert(0); // not an integrable force
 	}
@@ -412,9 +412,9 @@ public:
 
 	virtual void integrateSingle(float startDate, float deltaT, uint numStep,
 								 const CPSLocated *src, uint32 indexInLocated,
-								 NLMISC::CVector *destPos,
+								 NLMISC::CVectorPacked *destPos,
 								 bool accumulate = false,
-								 uint posStride = sizeof(NLMISC::CVector)) const;
+								 uint posStride = sizeof(NLMISC::CVectorPacked)) const;
 
 protected:
 	/// inherited from CPSForceIntensityHelper
@@ -583,9 +583,9 @@ public:
 
 	virtual void integrateSingle(float startDate, float deltaT, uint numStep,
 								 const CPSLocated *src, uint32 indexInLocated,
-								 NLMISC::CVector *destPos,
+								 NLMISC::CVectorPacked *destPos,
 								 bool accumulate = false,
-								 uint posStride = sizeof(NLMISC::CVector)) const;
+								 uint posStride = sizeof(NLMISC::CVectorPacked)) const;
 
 	/// perform initialisations
 	static void initPrecalc();
diff --git a/code/nel/include/nel/3d/ps_located.h b/code/nel/include/nel/3d/ps_located.h
index 2c4862b63..ca1c86a7b 100644
--- a/code/nel/include/nel/3d/ps_located.h
+++ b/code/nel/include/nel/3d/ps_located.h
@@ -508,11 +508,12 @@ public:
 	  */
 	void integrateSingle(float startDate, float deltaT, uint numStep,
 						 uint32 indexInLocated,
-						 NLMISC::CVector *destPos,
-						 uint posStride = sizeof(NLMISC::CVector)) const;
+						 NLMISC::CVectorPacked *destPos,
+						 uint posStride = sizeof(NLMISC::CVectorPacked)) const;
 
 	// compute position for a single element at the given date
 	// NB : only works with object that have parametric trajectories
+	inline void computeParametricPos(float date, uint indexInLocated, NLMISC::CVectorPacked &dest) const;
 	inline void computeParametricPos(float date, uint indexInLocated, NLMISC::CVector &dest) const;
 
 
@@ -1052,6 +1053,12 @@ inline TAnimationTime	CPSLocated::getAgeInSeconds(uint elementIndex) const
 
 // *****************************************************************************************************
 inline void	CPSLocated::computeParametricPos(float date, uint indexInLocated, NLMISC::CVector &dest) const
+{
+	NLMISC::CVectorPacked temp;
+	integrateSingle(date, 1.f, 1, indexInLocated, &temp);
+	dest = temp;
+}
+inline void	CPSLocated::computeParametricPos(float date, uint indexInLocated, NLMISC::CVectorPacked &dest) const
 {
 	integrateSingle(date, 1.f, 1, indexInLocated, &dest);
 }
diff --git a/code/nel/include/nel/3d/ps_misc.h b/code/nel/include/nel/3d/ps_misc.h
index d425f908a..993df7625 100644
--- a/code/nel/include/nel/3d/ps_misc.h
+++ b/code/nel/include/nel/3d/ps_misc.h
@@ -51,12 +51,12 @@ inline uint ScaleFloatGE(float f, float deltaT, float clampValue, uint numStep)
   * \param destPos		The destination, that will be filled with the given value
   * \param stride		Number of byte between each value to be copied
   */
-inline NLMISC::CVector *FillBufUsingSubdiv(const	NLMISC::CVector &value,
+inline NLMISC::CVectorPacked *FillBufUsingSubdiv(const	NLMISC::CVector &value,
 									  float					clampValue,
 									  float					&startValue,
 									  float					deltaT,
 									  uint					&maxNumStep,
-									  NLMISC::CVector		*destPos,
+									  NLMISC::CVectorPacked	*destPos,
 									  uint32				stride
 									  )
 {
@@ -68,7 +68,7 @@ inline NLMISC::CVector *FillBufUsingSubdiv(const	NLMISC::CVector &value,
 	while (numToFill--)
 	{
 		*destPos = value;
-		destPos = (NLMISC::CVector *) ( (uint8 *) destPos + stride);
+		destPos = (NLMISC::CVectorPacked *) ( (uint8 *) destPos + stride);
 	}
 
 	return destPos;
diff --git a/code/nel/include/nel/3d/ps_ribbon.h b/code/nel/include/nel/3d/ps_ribbon.h
index ccda62021..9c6350d9c 100644
--- a/code/nel/include/nel/3d/ps_ribbon.h
+++ b/code/nel/include/nel/3d/ps_ribbon.h
@@ -247,7 +247,7 @@ private:
 	//@}
 
 	CSmartPtr<ITexture>				_Tex;
-	CPSVector<NLMISC::CVector>::V	_Shape;
+	CPSVector<NLMISC::CVectorPacked>::V	_Shape;
 	float							_UFactor, _VFactor;
 	TOrientation					_Orientation;
 
diff --git a/code/nel/include/nel/3d/ps_ribbon_base.h b/code/nel/include/nel/3d/ps_ribbon_base.h
index ea3443060..793e22142 100644
--- a/code/nel/include/nel/3d/ps_ribbon_base.h
+++ b/code/nel/include/nel/3d/ps_ribbon_base.h
@@ -120,8 +120,8 @@ protected:
 	  * The dest tab must have at least nbSegs + 1 entries.
 	  */
 	void							computeRibbon( uint index,
-												   NLMISC::CVector *dest,
-												   uint stride = sizeof(NLMISC::CVector)
+												   NLMISC::CVectorPacked *dest,
+												   uint stride = sizeof(NLMISC::CVectorPacked)
 												  );
 
 	/// Called each time the time of the system change in order to update the ribbons positions
@@ -168,26 +168,26 @@ private:
 
 	/// Compute the ribbon points using linear interpolation between each sampling point.
 	void					computeLinearRibbon( uint index,
-											     NLMISC::CVector *dest,
-										         uint stride = sizeof(NLMISC::CVector)
+											     NLMISC::CVectorPacked *dest,
+										         uint stride = sizeof(NLMISC::CVectorPacked)
 										       );
 	/// The same as compute linear ribbon but try to make its length constant
 	void					computeLinearCstSizeRibbon( uint index,
-											     NLMISC::CVector *dest,
-										         uint stride = sizeof(NLMISC::CVector)
+											     NLMISC::CVectorPacked *dest,
+										         uint stride = sizeof(NLMISC::CVectorPacked)
 										       );
 	/// Compute the ribbon points using hermitte splines between each sampling point.
 	void					computeHermitteRibbon( uint index,
-											     NLMISC::CVector *dest,
-										         uint stride = sizeof(NLMISC::CVector)
+											     NLMISC::CVectorPacked *dest,
+										         uint stride = sizeof(NLMISC::CVectorPacked)
 										       );
 
 	/** Compute the ribbon points using hermitte splines between each sampling point,
 	  * and make a rough approximation to get a constant length
 	  */
 	void					computeHermitteCstSizeRibbon( uint index,
-											     NLMISC::CVector *dest,
-										         uint stride = sizeof(NLMISC::CVector)
+											     NLMISC::CVectorPacked *dest,
+										         uint stride = sizeof(NLMISC::CVectorPacked)
 										       );
 	// called by the system when its date has been manually changed
 	virtual void			systemDateChanged();
diff --git a/code/nel/include/nel/3d/shadow_poly_receiver.h b/code/nel/include/nel/3d/shadow_poly_receiver.h
index 0d97a00ad..ccf379638 100644
--- a/code/nel/include/nel/3d/shadow_poly_receiver.h
+++ b/code/nel/include/nel/3d/shadow_poly_receiver.h
@@ -83,6 +83,7 @@ public:
 									  );
 
 	// a vertex
+	NL_ALIGN_SSE2(16)
 	struct CRGBAVertex // FIXME_SSE2
 	{
 #if USE_SSE2
@@ -96,8 +97,9 @@ public:
 		CRGBAVertex(const CVector &v, CRGBA c) : X(v.x), Y(v.y), Z(v.z), Color(c) {}
+		// Reinterpret this vertex in place as a CVector (no copy is made).
 		const CVector &asVector() const
 		{
-			//nlctassert(sizeof(CVector) == sizeof(CRGBAVertex));
-			nlctassert(sizeof(CVector) + 4 == sizeof(CRGBAVertex));
+			// The cast below is only meaningful while both types have the same size.
+			nlctassert(sizeof(CVector) == sizeof(CRGBAVertex));
-			*reinterpret_cast<const CVector *>(this);
+			return *reinterpret_cast<const CVector *>(this);
 		}
 #else
diff --git a/code/nel/include/nel/misc/matrix.h b/code/nel/include/nel/misc/matrix.h
index 7c7d7d666..611ca3882 100644
--- a/code/nel/include/nel/misc/matrix.h
+++ b/code/nel/include/nel/misc/matrix.h
@@ -53,7 +53,7 @@ class	CPlane;
  * \author Nevrax France
  * \date 2000
  */
-NL_ALIGN(16)
+NL_ALIGN_SSE2(16)
 class CMatrix
 {
 public:
diff --git a/code/nel/include/nel/misc/types_nl.h b/code/nel/include/nel/misc/types_nl.h
index b5aa77e68..b94ffe50f 100644
--- a/code/nel/include/nel/misc/types_nl.h
+++ b/code/nel/include/nel/misc/types_nl.h
@@ -340,6 +340,9 @@ extern void *operator new(size_t size) throw(std::bad_alloc);
 extern void *operator new[](size_t size) throw(std::bad_alloc);
 extern void operator delete(void *p) throw();
 extern void operator delete[](void *p) throw();
+#define NL_ALIGN_SSE2(nb) NL_ALIGN(nb)
+#else
+#define NL_ALIGN_SSE2(nb) 
 #endif
 
 // CHashMap, CHashSet and CHashMultiMap definitions
diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index ff3db1312..b1e2573d5 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -36,15 +36,15 @@ class IStream;
  * \author Nevrax France
  * \date 2000
  */
-// NL_ALIGN(16) // FIXME_SSE2
+NL_ALIGN_SSE2(16)
 class CVector
 {
 public:		// Attributes.
 	float	x,y,z;
 
-/*#ifdef USE_SSE2 // FIXME_SSE2
+#ifdef USE_SSE2
 	float	w; // Padding
-#endif*/
+#endif
 
 public:		// const.
 	/// Null vector (0,0,0).
@@ -176,6 +176,11 @@ public:
 	{
 		return CVector(x, y, z);
 	}
+
+	void serial(IStream &f)
+	{
+		f.serial(x,y,z);
+	}
 };
 
 // blend (faster version than the generic version found in algo.h)
diff --git a/code/nel/src/3d/computed_string.cpp b/code/nel/src/3d/computed_string.cpp
index 1c8962f5e..ff09c6df8 100644
--- a/code/nel/src/3d/computed_string.cpp
+++ b/code/nel/src/3d/computed_string.cpp
@@ -30,6 +30,7 @@
 #include "nel/misc/fast_mem.h"
 
 using namespace std;
+using NLMISC::CVectorPacked;
 
 namespace NL3D {
 
@@ -270,9 +271,9 @@ void CComputedString::render2DClip (IDriver& driver, CRenderStringBuffer &rdrBuf
 			// copy and translate pos
 			CHECK_VBA_RANGE(srcvba, srcPtr, Vertices.getVertexSize());
 			CHECK_VBA_RANGE(dstvba, dstPtr, rdrBuffer.Vertices.getVertexSize())
-			((CVector*)dstPtr)->x= x + ((CVector*)srcPtr)->x;
-			((CVector*)dstPtr)->y= ((CVector*)srcPtr)->y;
-			((CVector*)dstPtr)->z= z + ((CVector*)srcPtr)->z;
+			((CVectorPacked*)dstPtr)->x= x + ((CVectorPacked*)srcPtr)->x;
+			((CVectorPacked*)dstPtr)->y= ((CVectorPacked*)srcPtr)->y;
+			((CVectorPacked*)dstPtr)->z= z + ((CVectorPacked*)srcPtr)->z;
 			// uv
 			*((CUV*)(dstPtr+ofsDstUV))= *((CUV*)(srcPtr+ofsSrcUV));
 			// color
@@ -298,12 +299,12 @@ void CComputedString::render2DClip (IDriver& driver, CRenderStringBuffer &rdrBuf
 		uint	numVerts= nNumQuadSrc*4;
 
 		// clip into VerticesClipped
-		CVector *pIniPos0 = (CVector*)srcPtr;
-		CVector *pIniPos2 = (CVector*)(((uint8*)pIniPos0) + srcSize*2);
-		CVector *pClipPos0 = (CVector*)dstPtr;
-		CVector *pClipPos1 = (CVector*)(((uint8*)pClipPos0) + dstSize);
-		CVector *pClipPos2 = (CVector*)(((uint8*)pClipPos1) + dstSize);
-		CVector *pClipPos3 = (CVector*)(((uint8*)pClipPos2) + dstSize);
+		CVectorPacked *pIniPos0 = (CVectorPacked*)srcPtr;
+		CVectorPacked *pIniPos2 = (CVectorPacked*)(((uint8*)pIniPos0) + srcSize*2);
+		CVectorPacked *pClipPos0 = (CVectorPacked*)dstPtr;
+		CVectorPacked *pClipPos1 = (CVectorPacked*)(((uint8*)pClipPos0) + dstSize);
+		CVectorPacked *pClipPos2 = (CVectorPacked*)(((uint8*)pClipPos1) + dstSize);
+		CVectorPacked *pClipPos3 = (CVectorPacked*)(((uint8*)pClipPos2) + dstSize);
 		CUV *pClipUV0 = (CUV*)(dstPtr + ofsDstUV );
 		CUV *pClipUV1 = (CUV*)(((uint8*)pClipUV0) + dstSize);
 		CUV *pClipUV2 = (CUV*)(((uint8*)pClipUV1) + dstSize);
@@ -336,28 +337,28 @@ void CComputedString::render2DClip (IDriver& driver, CRenderStringBuffer &rdrBuf
 
 				// copy with no clip
 				// v0
-				*((CVector*) (dstPtr + dstSize*0))= *((CVector*) (srcPtr + srcSize*0));
+				*((CVectorPacked*) (dstPtr + dstSize*0))= *((CVectorPacked*) (srcPtr + srcSize*0));
 				*((CUV*)	 (dstPtr + dstSize*0 + ofsDstUV))= *((CUV*)(srcPtr + srcSize*0 + ofsSrcUV));
 				if (vtype == CVertexBuffer::TRGBA)
 					*((CRGBA*)	 (dstPtr + dstSize*0 + ofsDstColor))= mCol;
 				else
 					*((CBGRA*)	 (dstPtr + dstSize*0 + ofsDstColor))= mCol;
 				// v1
-				*((CVector*) (dstPtr + dstSize*1))= *((CVector*) (srcPtr + srcSize*1));
+				*((CVectorPacked*) (dstPtr + dstSize*1))= *((CVectorPacked*) (srcPtr + srcSize*1));
 				*((CUV*)	 (dstPtr + dstSize*1 + ofsDstUV))= *((CUV*)(srcPtr + srcSize*1 + ofsSrcUV));
 				if (vtype == CVertexBuffer::TRGBA)
 					*((CRGBA*)	 (dstPtr + dstSize*1 + ofsDstColor))= mCol;
 				else
 					*((CBGRA*)	 (dstPtr + dstSize*1 + ofsDstColor))= mCol;
 				// v2
-				*((CVector*) (dstPtr + dstSize*2))= *((CVector*) (srcPtr + srcSize*2));
+				*((CVectorPacked*) (dstPtr + dstSize*2))= *((CVectorPacked*) (srcPtr + srcSize*2));
 				*((CUV*)	 (dstPtr + dstSize*2 + ofsDstUV))= *((CUV*)(srcPtr + srcSize*2 + ofsSrcUV));
 				if (vtype == CVertexBuffer::TRGBA)
 					*((CRGBA*)	 (dstPtr + dstSize*2 + ofsDstColor))= mCol;
 				else
 					*((CBGRA*)	 (dstPtr + dstSize*2 + ofsDstColor))= mCol;
 				// v3
-				*((CVector*) (dstPtr + dstSize*3))= *((CVector*) (srcPtr + srcSize*3));
+				*((CVectorPacked*) (dstPtr + dstSize*3))= *((CVectorPacked*) (srcPtr + srcSize*3));
 				*((CUV*)	 (dstPtr + dstSize*3 + ofsDstUV))= *((CUV*)(srcPtr + srcSize*3 + ofsSrcUV));
 				if (vtype == CVertexBuffer::TRGBA)
 					*((CRGBA*)	 (dstPtr + dstSize*3 + ofsDstColor))= mCol;
@@ -410,10 +411,10 @@ void CComputedString::render2DClip (IDriver& driver, CRenderStringBuffer &rdrBuf
 
 				// next quad out
 				++nNumQuadClipped;
-				pClipPos0 = (CVector*)(((uint8*)pClipPos0) + dstSize*4);
-				pClipPos1 = (CVector*)(((uint8*)pClipPos0) + dstSize);
-				pClipPos2 = (CVector*)(((uint8*)pClipPos1) + dstSize);
-				pClipPos3 = (CVector*)(((uint8*)pClipPos2) + dstSize);
+				pClipPos0 = (CVectorPacked*)(((uint8*)pClipPos0) + dstSize*4);
+				pClipPos1 = (CVectorPacked*)(((uint8*)pClipPos0) + dstSize);
+				pClipPos2 = (CVectorPacked*)(((uint8*)pClipPos1) + dstSize);
+				pClipPos3 = (CVectorPacked*)(((uint8*)pClipPos2) + dstSize);
 				pClipUV0 = (CUV*)( ((uint8*)pClipUV0) + dstSize*4 );
 				pClipUV1 = (CUV*)(((uint8*)pClipUV0) + dstSize);
 				pClipUV2 = (CUV*)(((uint8*)pClipUV1) + dstSize);
@@ -421,8 +422,8 @@ void CComputedString::render2DClip (IDriver& driver, CRenderStringBuffer &rdrBuf
 				dstPtr+=  4*dstSize;
 			}
 			// next quad in
-			pIniPos0 = (CVector*)(((uint8*)pIniPos0) + srcSize*4);
-			pIniPos2 = (CVector*)(((uint8*)pIniPos0) + srcSize*2);
+			pIniPos0 = (CVectorPacked*)(((uint8*)pIniPos0) + srcSize*4);
+			pIniPos2 = (CVectorPacked*)(((uint8*)pIniPos0) + srcSize*2);
 			srcPtr+=  4*srcSize;
 		}
 
@@ -506,8 +507,8 @@ void CComputedString::render2DUnProjected (IDriver& driver, CRenderStringBuffer
 			// copy and translate pos
 			CHECK_VBA_RANGE(dstvba, dstPtr, Vertices.getVertexSize());
 			CHECK_VBA_RANGE(srcvba, srcPtr, rdrBuffer.Vertices.getVertexSize());
-			((CVector*)dstPtr)->x= x + ((CVector*)srcPtr)->x;
-			((CVector*)dstPtr)->z= z + ((CVector*)srcPtr)->z;
+			((CVectorPacked*)dstPtr)->x= x + ((CVectorPacked*)srcPtr)->x;
+			((CVectorPacked*)dstPtr)->z= z + ((CVectorPacked*)srcPtr)->z;
 
 			// uv
 			*((CUV*)(dstPtr+ofsDstUV))= *((CUV*)(srcPtr+ofsSrcUV));
@@ -533,12 +534,12 @@ void CComputedString::render2DUnProjected (IDriver& driver, CRenderStringBuffer
 		uint	numVerts= nNumQuadSrc*4;
 
 		// clip into VerticesClipped
-		CVector *pIniPos0 = (CVector*)srcPtr;
-		CVector *pIniPos2 = (CVector*)(((uint8*)pIniPos0) + srcSize*2);
-		CVector *pClipPos0 = (CVector*)dstPtr;
-		CVector *pClipPos1 = (CVector*)(((uint8*)pClipPos0) + dstSize);
-		CVector *pClipPos2 = (CVector*)(((uint8*)pClipPos1) + dstSize);
-		CVector *pClipPos3 = (CVector*)(((uint8*)pClipPos2) + dstSize);
+		CVectorPacked *pIniPos0 = (CVectorPacked*)srcPtr;
+		CVectorPacked *pIniPos2 = (CVectorPacked*)(((uint8*)pIniPos0) + srcSize*2);
+		CVectorPacked *pClipPos0 = (CVectorPacked*)dstPtr;
+		CVectorPacked *pClipPos1 = (CVectorPacked*)(((uint8*)pClipPos0) + dstSize);
+		CVectorPacked *pClipPos2 = (CVectorPacked*)(((uint8*)pClipPos1) + dstSize);
+		CVectorPacked *pClipPos3 = (CVectorPacked*)(((uint8*)pClipPos2) + dstSize);
 		CUV *pClipUV0 = (CUV*)(dstPtr + ofsDstUV );
 		CUV *pClipUV1 = (CUV*)(((uint8*)pClipUV0) + dstSize);
 		CUV *pClipUV2 = (CUV*)(((uint8*)pClipUV1) + dstSize);
@@ -555,28 +556,28 @@ void CComputedString::render2DUnProjected (IDriver& driver, CRenderStringBuffer
 			{
 				// copy with no clip
 				// v0
-				*((CVector*) (dstPtr + dstSize*0))= *((CVector*) (srcPtr + srcSize*0));
+				*((CVectorPacked*) (dstPtr + dstSize*0))= *((CVectorPacked*) (srcPtr + srcSize*0));
 				*((CUV*)	 (dstPtr + dstSize*0 + ofsDstUV))= *((CUV*)(srcPtr + srcSize*0 + ofsSrcUV));
 				if (vtype == CVertexBuffer::TRGBA)
 					*((CRGBA*)	 (dstPtr + dstSize*0 + ofsDstColor))= Color;
 				else
 					*((CBGRA*)	 (dstPtr + dstSize*0 + ofsDstColor))= Color;
 				// v1
-				*((CVector*) (dstPtr + dstSize*1))= *((CVector*) (srcPtr + srcSize*1));
+				*((CVectorPacked*) (dstPtr + dstSize*1))= *((CVectorPacked*) (srcPtr + srcSize*1));
 				*((CUV*)	 (dstPtr + dstSize*1 + ofsDstUV))= *((CUV*)(srcPtr + srcSize*1 + ofsSrcUV));
 				if (vtype == CVertexBuffer::TRGBA)
 					*((CRGBA*)	 (dstPtr + dstSize*1 + ofsDstColor))= Color;
 				else
 					*((CBGRA*)	 (dstPtr + dstSize*1 + ofsDstColor))= Color;
 				// v2
-				*((CVector*) (dstPtr + dstSize*2))= *((CVector*) (srcPtr + srcSize*2));
+				*((CVectorPacked*) (dstPtr + dstSize*2))= *((CVectorPacked*) (srcPtr + srcSize*2));
 				*((CUV*)	 (dstPtr + dstSize*2 + ofsDstUV))= *((CUV*)(srcPtr + srcSize*2 + ofsSrcUV));
 				if (vtype == CVertexBuffer::TRGBA)
 					*((CRGBA*)	 (dstPtr + dstSize*2 + ofsDstColor))= Color;
 				else
 					*((CBGRA*)	 (dstPtr + dstSize*2 + ofsDstColor))= Color;
 				// v3
-				*((CVector*) (dstPtr + dstSize*3))= *((CVector*) (srcPtr + srcSize*3));
+				*((CVectorPacked*) (dstPtr + dstSize*3))= *((CVectorPacked*) (srcPtr + srcSize*3));
 				*((CUV*)	 (dstPtr + dstSize*3 + ofsDstUV))= *((CUV*)(srcPtr + srcSize*3 + ofsSrcUV));
 				if (vtype == CVertexBuffer::TRGBA)
 					*((CRGBA*)	 (dstPtr + dstSize*3 + ofsDstColor))= Color;
@@ -630,10 +631,10 @@ void CComputedString::render2DUnProjected (IDriver& driver, CRenderStringBuffer
 
 				// next quad out
 				++nNumQuadClipped;
-				pClipPos0 = (CVector*)(((uint8*)pClipPos0) + dstSize*4);
-				pClipPos1 = (CVector*)(((uint8*)pClipPos0) + dstSize);
-				pClipPos2 = (CVector*)(((uint8*)pClipPos1) + dstSize);
-				pClipPos3 = (CVector*)(((uint8*)pClipPos2) + dstSize);
+				pClipPos0 = (CVectorPacked*)(((uint8*)pClipPos0) + dstSize*4);
+				pClipPos1 = (CVectorPacked*)(((uint8*)pClipPos0) + dstSize);
+				pClipPos2 = (CVectorPacked*)(((uint8*)pClipPos1) + dstSize);
+				pClipPos3 = (CVectorPacked*)(((uint8*)pClipPos2) + dstSize);
 				pClipUV0 = (CUV*)( ((uint8*)pClipUV0) + dstSize*4 );
 				pClipUV1 = (CUV*)(((uint8*)pClipUV0) + dstSize);
 				pClipUV2 = (CUV*)(((uint8*)pClipUV1) + dstSize);
@@ -641,8 +642,8 @@ void CComputedString::render2DUnProjected (IDriver& driver, CRenderStringBuffer
 				dstPtr+=  4*dstSize;
 			}
 			// next quad in
-			pIniPos0 = (CVector*)(((uint8*)pIniPos0) + srcSize*4);
-			pIniPos2 = (CVector*)(((uint8*)pIniPos0) + srcSize*2);
+			pIniPos0 = (CVectorPacked*)(((uint8*)pIniPos0) + srcSize*4);
+			pIniPos2 = (CVectorPacked*)(((uint8*)pIniPos0) + srcSize*2);
 			srcPtr+=  4*srcSize;
 		}
 
@@ -657,13 +658,13 @@ void CComputedString::render2DUnProjected (IDriver& driver, CRenderStringBuffer
 	{
 		// preset unprojection
 		CVector tmp;
-		tmp.x = ((CVector*)dstPtrBackup)->x * OOW;
-		tmp.y = ((CVector*)dstPtrBackup)->z * OOH;
+		tmp.x = ((CVectorPacked*)dstPtrBackup)->x * OOW;
+		tmp.y = ((CVectorPacked*)dstPtrBackup)->z * OOH;
 		tmp.z = depth;
 		// mul by user scale matrix
 		tmp= scaleMatrix * tmp;
 		// Unproject it
-		*((CVector*)dstPtrBackup) = frustum.unProjectZ(tmp);
+		*((CVectorPacked*)dstPtrBackup) = frustum.unProjectZ(tmp);
 		dstPtrBackup += dstSize;
 	}
 
diff --git a/code/nel/src/3d/driver_user.cpp b/code/nel/src/3d/driver_user.cpp
index e5d814755..cdfce0ce0 100644
--- a/code/nel/src/3d/driver_user.cpp
+++ b/code/nel/src/3d/driver_user.cpp
@@ -920,29 +920,29 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV *quads, uint32 nbQuads,
 			for (uint32 i = 0; i < nbQuads; ++i)
 			{
 				const NLMISC::CQuadColorUV &qcuv = quads[i];
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V0;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V0;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs)= qcuv.Uv0;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CRGBA*)(dstPtr+colorOfs)= qcuv.Color0;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V1;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V1;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs)= qcuv.Uv1;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CRGBA*)(dstPtr+colorOfs)= qcuv.Color1;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V2;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V2;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs)= qcuv.Uv2;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CRGBA*)(dstPtr+colorOfs)= qcuv.Color2;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V3;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V3;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs)= qcuv.Uv3;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
@@ -955,29 +955,29 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV *quads, uint32 nbQuads,
 			for (uint32 i = 0; i < nbQuads; ++i)
 			{
 				const NLMISC::CQuadColorUV &qcuv = quads[i];
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V0;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V0;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs)= qcuv.Uv0;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CBGRA*)(dstPtr+colorOfs)= qcuv.Color0;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V1;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V1;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs)= qcuv.Uv1;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CBGRA*)(dstPtr+colorOfs)= qcuv.Color1;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V2;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V2;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs)= qcuv.Uv2;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CBGRA))
 				*(CBGRA*)(dstPtr+colorOfs)= qcuv.Color2;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V3;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V3;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs)= qcuv.Uv3;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
@@ -1014,8 +1014,8 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV2 *quads, uint32 nbQuads
 			for (uint32 i = 0; i < nbQuads; ++i)
 			{
 				const NLMISC::CQuadColorUV2 &qcuv = quads[i];
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V0;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V0;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= qcuv.Uv0;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs1, sizeof(CUV))
@@ -1023,8 +1023,8 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV2 *quads, uint32 nbQuads
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CRGBA*)(dstPtr+colorOfs)= qcuv.Color0;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V1;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V1;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= qcuv.Uv1;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs1, sizeof(CUV))
@@ -1032,8 +1032,8 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV2 *quads, uint32 nbQuads
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CRGBA*)(dstPtr+colorOfs)= qcuv.Color1;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V2;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V2;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= qcuv.Uv2;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs1, sizeof(CUV))
@@ -1041,8 +1041,8 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV2 *quads, uint32 nbQuads
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CRGBA*)(dstPtr+colorOfs)= qcuv.Color2;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V3;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V3;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= qcuv.Uv3;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs1, sizeof(CUV))
@@ -1057,8 +1057,8 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV2 *quads, uint32 nbQuads
 			for (uint32 i = 0; i < nbQuads; ++i)
 			{
 				const NLMISC::CQuadColorUV2 &qcuv = quads[i];
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V0;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V0;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= qcuv.Uv0;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs1, sizeof(CUV))
@@ -1066,8 +1066,8 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV2 *quads, uint32 nbQuads
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CBGRA*)(dstPtr+colorOfs)= qcuv.Color0;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V1;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V1;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= qcuv.Uv1;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs1, sizeof(CUV))
@@ -1075,8 +1075,8 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV2 *quads, uint32 nbQuads
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CBGRA*)(dstPtr+colorOfs)= qcuv.Color1;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V2;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V2;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= qcuv.Uv2;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs1, sizeof(CUV))
@@ -1084,8 +1084,8 @@ void			CDriverUser::drawQuads(const NLMISC::CQuadColorUV2 *quads, uint32 nbQuads
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CBGRA*)(dstPtr+colorOfs)= qcuv.Color2;
 				dstPtr+= vSize;
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= qcuv.V3;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= qcuv.V3;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= qcuv.Uv3;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs1, sizeof(CUV))
@@ -1127,24 +1127,24 @@ void CDriverUser::drawTriangles(const NLMISC::CTriangleColorUV *tris, uint32 nbT
 			do
 			{
 				//
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= tris->V0;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= tris->V0;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= tris->Uv0;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CRGBA*)(dstPtr+colorOfs)= tris->Color0;
 				dstPtr+= vSize;
 				//
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= tris->V1;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= tris->V1;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= tris->Uv1;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CRGBA*)(dstPtr+colorOfs)= tris->Color1;
 				dstPtr+= vSize;
 				//
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= tris->V2;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= tris->V2;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= tris->Uv2;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
@@ -1159,24 +1159,24 @@ void CDriverUser::drawTriangles(const NLMISC::CTriangleColorUV *tris, uint32 nbT
 			do
 			{
 				//
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= tris->V0;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= tris->V0;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= tris->Uv0;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CBGRA*)(dstPtr+colorOfs)= tris->Color0;
 				dstPtr+= vSize;
 				//
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= tris->V1;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= tris->V1;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= tris->Uv1;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
 				*(CBGRA*)(dstPtr+colorOfs)= tris->Color1;
 				dstPtr+= vSize;
 				//
-				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVector))
-				*(CVector*)(dstPtr+0)= tris->V2;
+				CHECK_VBA_RANGE(vba, dstPtr+0, sizeof(CVectorPacked))
+				*(CVectorPacked*)(dstPtr+0)= tris->V2;
 				CHECK_VBA_RANGE(vba, dstPtr+uvOfs0, sizeof(CUV))
 				*(CUV*)(dstPtr+uvOfs0)= tris->Uv2;
 				CHECK_VBA_RANGE(vba, dstPtr+colorOfs, sizeof(CRGBA))
diff --git a/code/nel/src/3d/mesh.cpp b/code/nel/src/3d/mesh.cpp
index a533632c5..dfaed0ce4 100644
--- a/code/nel/src/3d/mesh.cpp
+++ b/code/nel/src/3d/mesh.cpp
@@ -1870,15 +1870,15 @@ void	CMeshGeom::applySkin(CSkeletonModel *skeleton)
 				nlassert(psPal->MatrixId[3]<IDriver::MaxModelMatrix);
 
 				// compute vertex part.
-				computeSoftwarePointSkinning(matrixes, srcVector, psPal, (float*)srcWgt, (CVector*)dstVector);
+				computeSoftwarePointSkinning(matrixes, srcVector, psPal, (float*)srcWgt, (CVectorPacked*)dstVector);
 
 				// compute normal part.
 				if(skinType>=SkinWithNormal)
-					computeSoftwareVectorSkinning(matrixes, srcNormal, psPal, (float*)srcWgt, (CVector*)dstNormal);
+					computeSoftwareVectorSkinning(matrixes, srcNormal, psPal, (float*)srcWgt, (CVectorPacked*)dstNormal);
 
 				// compute tg part.
 				if(skinType>=SkinWithTgSpace)
-					computeSoftwareVectorSkinning(matrixes, srcTgSpace, psPal, (float*)srcWgt, (CVector*)dstTgSpace);
+					computeSoftwareVectorSkinning(matrixes, srcTgSpace, psPal, (float*)srcWgt, (CVectorPacked*)dstTgSpace);
 			}
 
 			// inc flags.
@@ -1938,42 +1938,48 @@ void	CMeshGeom::flagSkinVerticesForMatrixBlock(uint8 *skinFlags, CMatrixBlock &m
 
 
 // ***************************************************************************
-void	CMeshGeom::computeSoftwarePointSkinning(CMatrix3x4 *matrixes, CVector *srcVec, CPaletteSkin *srcPal, float *srcWgt, CVector *pDst)
+void	CMeshGeom::computeSoftwarePointSkinning(CMatrix3x4 *matrixes, CVector *srcVec, CPaletteSkin *srcPal, float *srcWgt, CVectorPacked *pDst)
 {
 	CMatrix3x4		*pMat;
+	CVector			temp;
 
 	// 0th matrix influence.
 	pMat= matrixes + srcPal->MatrixId[0];
-	pMat->mulSetPoint(*srcVec, srcWgt[0], *pDst);
+	pMat->mulSetPoint(*srcVec, srcWgt[0], temp);
 	// 1th matrix influence.
 	pMat= matrixes + srcPal->MatrixId[1];
-	pMat->mulAddPoint(*srcVec, srcWgt[1], *pDst);
+	pMat->mulAddPoint(*srcVec, srcWgt[1], temp);
 	// 2th matrix influence.
 	pMat= matrixes + srcPal->MatrixId[2];
-	pMat->mulAddPoint(*srcVec, srcWgt[2], *pDst);
+	pMat->mulAddPoint(*srcVec, srcWgt[2], temp);
 	// 3th matrix influence.
 	pMat= matrixes + srcPal->MatrixId[3];
-	pMat->mulAddPoint(*srcVec, srcWgt[3], *pDst);
+	pMat->mulAddPoint(*srcVec, srcWgt[3], temp);
+
+	*pDst = temp;
 }
 
 
 // ***************************************************************************
-void	CMeshGeom::computeSoftwareVectorSkinning(CMatrix3x4 *matrixes, CVector *srcVec, CPaletteSkin *srcPal, float *srcWgt, CVector *pDst)
+void	CMeshGeom::computeSoftwareVectorSkinning(CMatrix3x4 *matrixes, CVector *srcVec, CPaletteSkin *srcPal, float *srcWgt, CVectorPacked *pDst)
 {
 	CMatrix3x4		*pMat;
+	CVector			temp;
 
 	// 0th matrix influence.
 	pMat= matrixes + srcPal->MatrixId[0];
-	pMat->mulSetVector(*srcVec, srcWgt[0], *pDst);
+	pMat->mulSetVector(*srcVec, srcWgt[0], temp);
 	// 1th matrix influence.
 	pMat= matrixes + srcPal->MatrixId[1];
-	pMat->mulAddVector(*srcVec, srcWgt[1], *pDst);
+	pMat->mulAddVector(*srcVec, srcWgt[1], temp);
 	// 2th matrix influence.
 	pMat= matrixes + srcPal->MatrixId[2];
-	pMat->mulAddVector(*srcVec, srcWgt[2], *pDst);
+	pMat->mulAddVector(*srcVec, srcWgt[2], temp);
 	// 3th matrix influence.
 	pMat= matrixes + srcPal->MatrixId[3];
-	pMat->mulAddVector(*srcVec, srcWgt[3], *pDst);
+	pMat->mulAddVector(*srcVec, srcWgt[3], temp);
+
+	*pDst = temp;
 }
 
 
diff --git a/code/nel/src/3d/mesh_morpher.cpp b/code/nel/src/3d/mesh_morpher.cpp
index bfca4b7c7..14224af2b 100644
--- a/code/nel/src/3d/mesh_morpher.cpp
+++ b/code/nel/src/3d/mesh_morpher.cpp
@@ -177,7 +177,7 @@ void CMeshMorpher::update (std::vector<CAnimatedMorph> *pBSFactor)
 			if (_UseTgSpace)
 			if (rBS.deltaTgSpace.size() > 0)
 			{
-				CVector *pV = (CVector*)dstvba.getTexCoordPointer (vp, tgSpaceStage);
+				CVectorPacked *pV = (CVectorPacked*)dstvba.getTexCoordPointer (vp, tgSpaceStage);
 				*pV += rBS.deltaTgSpace[j] * rFactor;
 			}
 
diff --git a/code/nel/src/3d/packed_zone.cpp b/code/nel/src/3d/packed_zone.cpp
index 944e9c8d1..5b522a656 100644
--- a/code/nel/src/3d/packed_zone.cpp
+++ b/code/nel/src/3d/packed_zone.cpp
@@ -418,7 +418,7 @@ void serialPackedVector12(std::vector<uint16> &v, NLMISC::IStream &f)
 }
 
 // some function to ease writing of some primitives into a vertex buffer
-static inline void pushVBLine2D(NLMISC::CVector *&dest, const NLMISC::CVector &v0, const NLMISC::CVector &v1)
+static inline void pushVBLine2D(NLMISC::CVectorPacked *&dest, const NLMISC::CVector &v0, const NLMISC::CVector &v1)
 {
 	dest->x = v0.x;
 	dest->y = v0.y;
@@ -434,7 +434,7 @@ static inline void pushVBLine2D(NLMISC::CVector *&dest, const NLMISC::CVector &v
 	++ dest;
 }
 
-static inline void pushVBTri2D(NLMISC::CVector *&dest, const NLMISC::CTriangle &tri)
+static inline void pushVBTri2D(NLMISC::CVectorPacked *&dest, const NLMISC::CTriangle &tri)
 {
 	dest->x = tri.V0.x;
 	dest->y = tri.V0.y;
@@ -451,7 +451,7 @@ static inline void pushVBTri2D(NLMISC::CVector *&dest, const NLMISC::CTriangle &
 }
 
 
-static inline void pushVBQuad2D(NLMISC::CVector *&dest, const NLMISC::CQuad &quad)
+static inline void pushVBQuad2D(NLMISC::CVectorPacked *&dest, const NLMISC::CQuad &quad)
 {
 	dest->x = quad.V0.x;
 	dest->y = quad.V0.y;
@@ -471,7 +471,7 @@ static inline void pushVBQuad2D(NLMISC::CVector *&dest, const NLMISC::CQuad &qua
 	++ dest;
 }
 
-static inline void pushVBQuad(NLMISC::CVector *&dest, const NLMISC::CQuad &quad)
+static inline void pushVBQuad(NLMISC::CVectorPacked *&dest, const NLMISC::CQuad &quad)
 {
 	*dest++ = quad.V0;
 	*dest++ = quad.V1;
diff --git a/code/nel/src/3d/patch_render.cpp b/code/nel/src/3d/patch_render.cpp
index bc74648d3..135b9fdb3 100644
--- a/code/nel/src/3d/patch_render.cpp
+++ b/code/nel/src/3d/patch_render.cpp
@@ -1026,8 +1026,8 @@ inline void		CPatch::fillFar0VertexVB(CTessFarVertex *pVert)
 	if( !CLandscapeGlobals::VertexProgramEnabled )
 	{
 		// Set Pos. Set it local to the current center of landscape
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr, sizeof(CVector));
-		*(CVector*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked));
+		*(CVectorPacked*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
 		// Set Uvs.
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentFar0VBInfo.TexCoordOff0, sizeof(CUV));
 		*(CUV*)(CurVBPtr + CLandscapeGlobals::CurrentFar0VBInfo.TexCoordOff0)= uv;
@@ -1038,8 +1038,8 @@ inline void		CPatch::fillFar0VertexVB(CTessFarVertex *pVert)
 	{
 		// Else must setup Vertex program inputs
 		// v[0]== StartPos.
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr, sizeof(CVector));
-		*(CVector*)CurVBPtr= pVert->Src->StartPos;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked));
+		*(CVectorPacked*)CurVBPtr= pVert->Src->StartPos;
 		// v[8]== Tex0
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentFar0VBInfo.TexCoordOff0, sizeof(CUV));
 		*(CUV*)(CurVBPtr + CLandscapeGlobals::CurrentFar0VBInfo.TexCoordOff0)= uv;
@@ -1110,8 +1110,8 @@ inline void		CPatch::fillFar1VertexVB(CTessFarVertex *pVert)
 	if( !CLandscapeGlobals::VertexProgramEnabled )
 	{
 		// Set Pos. Set it local to the current center of landscape
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr, sizeof(CVector));
-		*(CVector*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked));
+		*(CVectorPacked*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
 		// Set Uvs.
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentFar1VBInfo.TexCoordOff0, sizeof(CUV));
 		*(CUV*)(CurVBPtr + CLandscapeGlobals::CurrentFar1VBInfo.TexCoordOff0)= uv;
@@ -1126,8 +1126,8 @@ inline void		CPatch::fillFar1VertexVB(CTessFarVertex *pVert)
 	{
 		// Else must setup Vertex program inputs
 		// v[0]== StartPos.
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr, sizeof(CVector));
-		*(CVector*)CurVBPtr= pVert->Src->StartPos;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked));
+		*(CVectorPacked*)CurVBPtr= pVert->Src->StartPos;
 		// v[8]== Tex0
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentFar1VBInfo.TexCoordOff0, sizeof(CUV));
 		*(CUV*)(CurVBPtr + CLandscapeGlobals::CurrentFar1VBInfo.TexCoordOff0)= uv;
@@ -1179,8 +1179,8 @@ inline void		CPatch::fillTileVertexVB(CTessNearVertex *pVert)
 	if( !CLandscapeGlobals::VertexProgramEnabled )
 	{
 		// Set Pos. Set it local to the current center of landscape
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr, sizeof(CVector))
-		*(CVector*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked))
+		*(CVectorPacked*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
 		// Set Uvs.
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentTileVBInfo.TexCoordOff0, sizeof(CUV))
 		*(CUV*)(CurVBPtr + CLandscapeGlobals::CurrentTileVBInfo.TexCoordOff0)= pVert->PUv0;
@@ -1193,8 +1193,8 @@ inline void		CPatch::fillTileVertexVB(CTessNearVertex *pVert)
 	{
 		// Else must setup Vertex program inputs
 		// v[0]== StartPos.
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr, sizeof(CVector))
-		*(CVector*)CurVBPtr= pVert->Src->StartPos;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked))
+		*(CVectorPacked*)CurVBPtr= pVert->Src->StartPos;
 		// v[8]== Tex0
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentTileVBInfo.TexCoordOff0, sizeof(CUV))
 		*(CUV*)(CurVBPtr + CLandscapeGlobals::CurrentTileVBInfo.TexCoordOff0)= pVert->PUv0;
@@ -1383,8 +1383,8 @@ void		CPatch::computeGeomorphFar0VertexListVB(CTessList<CTessFarVertex>  &vertLi
 		CurVBPtr+= pVert->Index0 * CLandscapeGlobals::CurrentFar0VBInfo.VertexSize;
 
 		// Set Geomorphed Position. Set it local to the current center of landscape
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr, sizeof(CVector))
-		*(CVector*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked))
+		*(CVectorPacked*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
 	}
 }
 
@@ -1404,8 +1404,8 @@ void		CPatch::computeGeomorphAlphaFar1VertexListVB(CTessList<CTessFarVertex>  &v
 		// NB: the filling order of data is important, for AGP write combiners.
 
 		// Set Geomorphed Position. Set it local to the current center of landscape
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr, sizeof(CVector))
-		*(CVector*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked))
+		*(CVectorPacked*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
 
 		// Set Alpha color.
 		static CRGBA	col(255,255,255,255);
@@ -1434,8 +1434,8 @@ void		CPatch::computeGeomorphTileVertexListVB(CTessList<CTessNearVertex>  &vertL
 		CurVBPtr+= pVert->Index * CLandscapeGlobals::CurrentTileVBInfo.VertexSize;
 
 		// Set Geomorphed Position. Set it local to the current center of landscape
-		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr, sizeof(CVector))
-		*(CVector*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
+		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr, sizeof(CVectorPacked))
+		*(CVectorPacked*)CurVBPtr= pVert->Src->Pos - CLandscapeGlobals::PZBModelPosition;
 	}
 }
 
diff --git a/code/nel/src/3d/ps_dot.cpp b/code/nel/src/3d/ps_dot.cpp
index 516cac8f1..6219605f7 100644
--- a/code/nel/src/3d/ps_dot.cpp
+++ b/code/nel/src/3d/ps_dot.cpp
@@ -23,6 +23,8 @@
 #include "nel/3d/particle_system.h"
 #include "nel/misc/fast_mem.h"
 
+using NLMISC::CVectorPacked;
+
 namespace NL3D
 {
 
@@ -84,7 +86,7 @@ inline void DrawDot(T it,
 				do
 				{
 					CHECK_VERTEX_BUFFER(vb, currPos);
-					*((CVector *) currPos) =  *it;
+					*((CVectorPacked *) currPos) =  *it;
 					++it ;
 					currPos += stride;
 				}
@@ -93,7 +95,7 @@ inline void DrawDot(T it,
 			else if (srcStep == (1 << 16)) // make sure we haven't got auto-lod and that the step is 1.0
 			{
 				// there's no color information in the buffer, so we can copy it directly
-				NLMISC::CFastMem::memcpy(vba.getVertexCoordPointer(), &(*it), sizeof(NLMISC::CVector) * toProcess);
+				NLMISC::CFastMem::memcpy(vba.getVertexCoordPointer(), &(*it), sizeof(NLMISC::CVectorPacked) * toProcess);
 				it += toProcess;
 			}
 			else
@@ -103,7 +105,7 @@ inline void DrawDot(T it,
 				do
 				{
 					CHECK_VERTEX_BUFFER(vb, currPos);
-					*((CVector *) currPos) =  *it;
+					*((CVectorPacked *) currPos) =  *it;
 					++it ;
 					currPos += sizeof(float[3]);
 				}
diff --git a/code/nel/src/3d/ps_emitter.cpp b/code/nel/src/3d/ps_emitter.cpp
index c9806722f..0084111a0 100644
--- a/code/nel/src/3d/ps_emitter.cpp
+++ b/code/nel/src/3d/ps_emitter.cpp
@@ -914,7 +914,7 @@ uint GenEmitterPositions(CPSLocated *emitter,
 									   uint numStep,
 									   TAnimationTime deltaT, /* fraction of time needed to reach the first emission */
 									   TAnimationTime step,
-									   std::vector<NLMISC::CVector> &dest
+									   std::vector<NLMISC::CVectorPacked> &dest
 									  )
 {
 	NL_PS_FUNC(GenEmitterPositions)
@@ -930,8 +930,8 @@ uint GenEmitterPositions(CPSLocated *emitter,
 		}
 		else
 		{
-			std::vector<NLMISC::CVector>::iterator outIt = dest.end();
-			std::vector<NLMISC::CVector>::iterator endIt = dest.begin();
+			std::vector<NLMISC::CVectorPacked>::iterator outIt = dest.end();
+			std::vector<NLMISC::CVectorPacked>::iterator endIt = dest.begin();
 			NLMISC::CVector pos = emitter->getPos()[emitterIndex] - deltaT * emitter->getSpeed()[emitterIndex];
 			NLMISC::CVector speed = step * emitter->getSpeed()[emitterIndex];
 			do
@@ -966,7 +966,7 @@ static inline uint GenEmitterPositionsWithLOD(CPSLocated *emitter,
 									   TAnimationTime deltaT, /* fraction of time needed to reach the first emission */
 									   TAnimationTime step,
 									   float invLODRatio,
-									   std::vector<NLMISC::CVector> &dest
+									   std::vector<NLMISC::CVectorPacked> &dest
 									  )
 {
 	NL_PS_FUNC(GenEmitterPositionsWithLOD)
@@ -982,8 +982,8 @@ static inline uint GenEmitterPositionsWithLOD(CPSLocated *emitter,
 		}
 		else
 		{
-			std::vector<NLMISC::CVector>::iterator outIt = dest.end();
-			std::vector<NLMISC::CVector>::iterator endIt = dest.begin();
+			std::vector<NLMISC::CVectorPacked>::iterator outIt = dest.end();
+			std::vector<NLMISC::CVectorPacked>::iterator endIt = dest.begin();
 			NLMISC::CVector pos = emitter->getPos()[emitterIndex] - deltaT * emitter->getSpeed()[emitterIndex];
 			NLMISC::CVector speed = step * invLODRatio * emitter->getSpeed()[emitterIndex];
 			do
@@ -1021,7 +1021,7 @@ void CPSEmitter::processRegularEmissionConsistent(uint firstInstanceIndex, float
 	//
 
 
-	static std::vector<NLMISC::CVector> emitterPositions;
+	static std::vector<NLMISC::CVectorPacked> emitterPositions;
 	// Positions for the emitter. They are computed by using a parametric trajectory or by using integration
 
 	const uint size = _Owner->getSize();
@@ -1454,7 +1454,7 @@ void CPSEmitter::processRegularEmissionConsistentWithNoLOD(uint firstInstanceInd
 	//
 
 
-	static std::vector<NLMISC::CVector> emitterPositions;
+	static std::vector<NLMISC::CVectorPacked> emitterPositions;
 	// Positions for the emitter. They are computed by using a parametric trajectory or by using integration
 
 	const uint size = _Owner->getSize();
diff --git a/code/nel/src/3d/ps_face.cpp b/code/nel/src/3d/ps_face.cpp
index 63909287d..bc60fb313 100644
--- a/code/nel/src/3d/ps_face.cpp
+++ b/code/nel/src/3d/ps_face.cpp
@@ -23,7 +23,7 @@
 #include "nel/3d/particle_system.h"
 #include "nel/misc/quat.h"
 
-
+using NLMISC::CVectorPacked;
 
 namespace NL3D
 {
@@ -96,27 +96,27 @@ public:
 					{
 						const CPlaneBasis &currBasis = f._PrecompBasis[*indexIt].Basis;
 						CHECK_VERTEX_BUFFER(vb, currVertex);
-						((CVector *) currVertex)->x = (*posIt).x  + *ptSize * currBasis.X.x;
-						((CVector *) currVertex)->y = (*posIt).y  + *ptSize * currBasis.X.y;
-						((CVector *) currVertex)->z = (*posIt).z  + *ptSize * currBasis.X.z;
+						((CVectorPacked *) currVertex)->x = (*posIt).x  + *ptSize * currBasis.X.x;
+						((CVectorPacked *) currVertex)->y = (*posIt).y  + *ptSize * currBasis.X.y;
+						((CVectorPacked *) currVertex)->z = (*posIt).z  + *ptSize * currBasis.X.z;
 						currVertex += stride;
 
 						CHECK_VERTEX_BUFFER(vb, currVertex);
-						((CVector *) currVertex)->x = (*posIt).x  + *ptSize * currBasis.Y.x;
-						((CVector *) currVertex)->y = (*posIt).y  + *ptSize * currBasis.Y.y;
-						((CVector *) currVertex)->z = (*posIt).z  + *ptSize * currBasis.Y.z;
+						((CVectorPacked *) currVertex)->x = (*posIt).x  + *ptSize * currBasis.Y.x;
+						((CVectorPacked *) currVertex)->y = (*posIt).y  + *ptSize * currBasis.Y.y;
+						((CVectorPacked *) currVertex)->z = (*posIt).z  + *ptSize * currBasis.Y.z;
 						currVertex += stride;
 
 						CHECK_VERTEX_BUFFER(vb, currVertex);
-						((CVector *) currVertex)->x = (*posIt).x  - *ptSize * currBasis.X.x;
-						((CVector *) currVertex)->y = (*posIt).y  - *ptSize * currBasis.X.y;
-						((CVector *) currVertex)->z = (*posIt).z  - *ptSize * currBasis.X.z;
+						((CVectorPacked *) currVertex)->x = (*posIt).x  - *ptSize * currBasis.X.x;
+						((CVectorPacked *) currVertex)->y = (*posIt).y  - *ptSize * currBasis.X.y;
+						((CVectorPacked *) currVertex)->z = (*posIt).z  - *ptSize * currBasis.X.z;
 						currVertex += stride;
 
 						CHECK_VERTEX_BUFFER(vb, currVertex);
-						((CVector *) currVertex)->x = (*posIt).x  - *ptSize * currBasis.Y.x;
-						((CVector *) currVertex)->y = (*posIt).y  - *ptSize * currBasis.Y.y;
-						((CVector *) currVertex)->z = (*posIt).z  - *ptSize * currBasis.Y.z;
+						((CVectorPacked *) currVertex)->x = (*posIt).x  - *ptSize * currBasis.Y.x;
+						((CVectorPacked *) currVertex)->y = (*posIt).y  - *ptSize * currBasis.Y.y;
+						((CVectorPacked *) currVertex)->z = (*posIt).z  - *ptSize * currBasis.Y.z;
 						currVertex += stride;
 						ptSize += ptSizeIncrement;
 						++indexIt;
@@ -168,27 +168,27 @@ public:
 					{
 						// we use this instead of the + operator, because we avoid 4 constructor calls this way
 						CHECK_VERTEX_BUFFER(vb, currVertex);
-						((CVector *) currVertex)->x = (*posIt).x  + *ptSize * currBasis->X.x;
-						((CVector *) currVertex)->y = (*posIt).y  + *ptSize * currBasis->X.y;
-						((CVector *) currVertex)->z = (*posIt).z  + *ptSize * currBasis->X.z;
+						((CVectorPacked *) currVertex)->x = (*posIt).x  + *ptSize * currBasis->X.x;
+						((CVectorPacked *) currVertex)->y = (*posIt).y  + *ptSize * currBasis->X.y;
+						((CVectorPacked *) currVertex)->z = (*posIt).z  + *ptSize * currBasis->X.z;
 						currVertex += vSize;
 
 						CHECK_VERTEX_BUFFER(vb, currVertex);
-						((CVector *) currVertex)->x = (*posIt).x  + *ptSize * currBasis->Y.x;
-						((CVector *) currVertex)->y = (*posIt).y  + *ptSize * currBasis->Y.y;
-						((CVector *) currVertex)->z = (*posIt).z  + *ptSize * currBasis->Y.z;
+						((CVectorPacked *) currVertex)->x = (*posIt).x  + *ptSize * currBasis->Y.x;
+						((CVectorPacked *) currVertex)->y = (*posIt).y  + *ptSize * currBasis->Y.y;
+						((CVectorPacked *) currVertex)->z = (*posIt).z  + *ptSize * currBasis->Y.z;
 						currVertex += vSize;
 
 						CHECK_VERTEX_BUFFER(vb, currVertex);
-						((CVector *) currVertex)->x = (*posIt).x  - *ptSize * currBasis->X.x;
-						((CVector *) currVertex)->y = (*posIt).y  - *ptSize * currBasis->X.y;
-						((CVector *) currVertex)->z = (*posIt).z  - *ptSize * currBasis->X.z;
+						((CVectorPacked *) currVertex)->x = (*posIt).x  - *ptSize * currBasis->X.x;
+						((CVectorPacked *) currVertex)->y = (*posIt).y  - *ptSize * currBasis->X.y;
+						((CVectorPacked *) currVertex)->z = (*posIt).z  - *ptSize * currBasis->X.z;
 						currVertex += vSize;
 
 						CHECK_VERTEX_BUFFER(vb, currVertex);
-						((CVector *) currVertex)->x = (*posIt).x  - *ptSize * currBasis->Y.x;
-						((CVector *) currVertex)->y = (*posIt).y  - *ptSize * currBasis->Y.y;
-						((CVector *) currVertex)->z = (*posIt).z  - *ptSize * currBasis->Y.z;
+						((CVectorPacked *) currVertex)->x = (*posIt).x  - *ptSize * currBasis->Y.x;
+						((CVectorPacked *) currVertex)->y = (*posIt).y  - *ptSize * currBasis->Y.y;
+						((CVectorPacked *) currVertex)->z = (*posIt).z  - *ptSize * currBasis->Y.z;
 						currVertex += vSize;
 						ptSize += ptSizeIncrement;
 						++posIt;
diff --git a/code/nel/src/3d/ps_face_look_at.cpp b/code/nel/src/3d/ps_face_look_at.cpp
index 5b06f6eb5..ccc5907c5 100644
--- a/code/nel/src/3d/ps_face_look_at.cpp
+++ b/code/nel/src/3d/ps_face_look_at.cpp
@@ -23,6 +23,7 @@
 #include "nel/3d/particle_system.h"
 #include "nel/misc/fast_floor.h"
 
+using NLMISC::CVectorPacked;
 
 namespace NL3D
 {
@@ -147,27 +148,27 @@ public:
 						v1 = rotTable[tabIndex] * currAlign->I + rotTable[tabIndex + 1] * currAlign->K;
 						v2 = rotTable[tabIndex + 2] * currAlign->I + rotTable[tabIndex + 3] * currAlign->K;
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x;
-						((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y;
-						((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  + *currentSize * v2.x;
-						((CVector *) ptPos)->y = (*it).y  + *currentSize * v2.y;
-						((CVector *) ptPos)->z = (*it).z  + *currentSize * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x;
-						((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y;
-						((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  - *currentSize * v2.x;
-						((CVector *) ptPos)->y = (*it).y  - *currentSize * v2.y;
-						((CVector *) ptPos)->z = (*it).z  - *currentSize * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v2.z;
 						ptPos += stride;
 
 						++it;
@@ -199,27 +200,27 @@ public:
 						v1 = CPSUtil::getCos((sint32) la._Angle2D) * currAlign->I  + CPSUtil::getSin((sint32) la._Angle2D) * currAlign->K;
 						v2 = - CPSUtil::getSin((sint32) la._Angle2D) * currAlign->I + CPSUtil::getCos((sint32) la._Angle2D) * currAlign->K;
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x + *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y + *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z + *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x + *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y + *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z + *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x + *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y + *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z + *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x + *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y + *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z + *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x - *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y - *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z - *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x - *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y - *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z - *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x - *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y - *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z - *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x - *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y - *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z - *currentSize2 * v2.z;
 						ptPos += stride;
 						++it;
 						++currAlign;
@@ -283,24 +284,24 @@ public:
 						CHECK_VERTEX_BUFFER(vb, ptPos + stride2);
 						CHECK_VERTEX_BUFFER(vb, ptPos + stride3);
 
-						((CVector *) ptPos)->x  = (*it).x  + v1.x;
-						((CVector *) ptPos)->y  = (*it).y  + v1.y;
-						((CVector *) ptPos)->z = (*it).z  + v1.z;
+						((CVectorPacked *) ptPos)->x  = (*it).x  + v1.x;
+						((CVectorPacked *) ptPos)->y  = (*it).y  + v1.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + v1.z;
 						ptPos += stride;
 
-						((CVector *) ptPos)->x  = (*it).x  + v2.x;
-						((CVector *) ptPos)->y  = (*it).y  + v2.y;
-						((CVector *) ptPos)->z = (*it).z  + v2.z;
+						((CVectorPacked *) ptPos)->x  = (*it).x  + v2.x;
+						((CVectorPacked *) ptPos)->y  = (*it).y  + v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + v2.z;
 						ptPos += stride;
 
-						((CVector *) ptPos)->x  = (*it).x  - v1.x;
-						((CVector *) ptPos)->y  = (*it).y  - v1.y;
-						((CVector *) ptPos)->z = (*it).z  - v1.z;
+						((CVectorPacked *) ptPos)->x  = (*it).x  - v1.x;
+						((CVectorPacked *) ptPos)->y  = (*it).y  - v1.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - v1.z;
 						ptPos += stride;
 
-						((CVector *) ptPos)->x  = (*it).x  - v2.x;
-						((CVector *) ptPos)->y  = (*it).y  - v2.y;
-						((CVector *) ptPos)->z = (*it).z  - v2.z;
+						((CVectorPacked *) ptPos)->x  = (*it).x  - v2.x;
+						((CVectorPacked *) ptPos)->y  = (*it).y  - v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - v2.z;
 						ptPos += stride;
 
 						++it;
@@ -336,27 +337,27 @@ public:
 						v2 = - sinAngle * currAlign->I + cosAngle * currAlign->K;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x + *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y + *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z + *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x + *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y + *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z + *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x + *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y + *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z + *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x + *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y + *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z + *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x - *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y - *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z - *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x - *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y - *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z - *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x - *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y - *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z - *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x - *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y - *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z - *currentSize2 * v2.z;
 						ptPos += stride;
 						++it;
 						++currentAngle;
@@ -462,27 +463,27 @@ public:
 							while (it != endIt)
 							{
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x;
-								((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y;
-								((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z;
 								ptPos += stride;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  + *currentSize * v2.x;
-								((CVector *) ptPos)->y = (*it).y  + *currentSize * v2.y;
-								((CVector *) ptPos)->z = (*it).z  + *currentSize * v2.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v2.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v2.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v2.z;
 								ptPos += stride;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x;
-								((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y;
-								((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z;
 								ptPos += stride;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  - *currentSize * v2.x;
-								((CVector *) ptPos)->y = (*it).y  - *currentSize * v2.y;
-								((CVector *) ptPos)->z = (*it).z  - *currentSize * v2.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v2.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v2.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v2.z;
 								ptPos += stride;
 
 								++it;
@@ -498,27 +499,27 @@ public:
 							while (it != endIt)
 							{
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  + myV1.x;
-								((CVector *) ptPos)->y = (*it).y  + myV1.y;
-								((CVector *) ptPos)->z = (*it).z  + myV1.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  + myV1.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  + myV1.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  + myV1.z;
 								ptPos += stride;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  + myV2.x;
-								((CVector *) ptPos)->y = (*it).y  + myV2.y;
-								((CVector *) ptPos)->z = (*it).z  + myV2.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  + myV2.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  + myV2.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  + myV2.z;
 								ptPos += stride;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  - myV1.x;
-								((CVector *) ptPos)->y = (*it).y  - myV1.y;
-								((CVector *) ptPos)->z = (*it).z  - myV1.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  - myV1.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  - myV1.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  - myV1.z;
 								ptPos += stride;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  - myV2.x;
-								((CVector *) ptPos)->y = (*it).y  - myV2.y;
-								((CVector *) ptPos)->z = (*it).z  - myV2.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  - myV2.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  - myV2.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  - myV2.z;
 								ptPos += stride;
 								++it;
 							}
@@ -548,27 +549,27 @@ public:
 						while (it != endIt)
 						{
 							CHECK_VERTEX_BUFFER(vb, ptPos);
-							((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x + *currentSize2 * v2.x;
-							((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y + *currentSize2 * v2.y;
-							((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z + *currentSize2 * v2.z;
+							((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x + *currentSize2 * v2.x;
+							((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y + *currentSize2 * v2.y;
+							((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z + *currentSize2 * v2.z;
 							ptPos += stride;
 
 							CHECK_VERTEX_BUFFER(vb, ptPos);
-							((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x + *currentSize2 * v2.x;
-							((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y + *currentSize2 * v2.y;
-							((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z + *currentSize2 * v2.z;
+							((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x + *currentSize2 * v2.x;
+							((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y + *currentSize2 * v2.y;
+							((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z + *currentSize2 * v2.z;
 							ptPos += stride;
 
 							CHECK_VERTEX_BUFFER(vb, ptPos);
-							((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x - *currentSize2 * v2.x;
-							((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y - *currentSize2 * v2.y;
-							((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z - *currentSize2 * v2.z;
+							((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x - *currentSize2 * v2.x;
+							((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y - *currentSize2 * v2.y;
+							((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z - *currentSize2 * v2.z;
 							ptPos += stride;
 
 							CHECK_VERTEX_BUFFER(vb, ptPos);
-							((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x - *currentSize2 * v2.x;
-							((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y - *currentSize2 * v2.y;
-							((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z - *currentSize2 * v2.z;
+							((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x - *currentSize2 * v2.x;
+							((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y - *currentSize2 * v2.y;
+							((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z - *currentSize2 * v2.z;
 							ptPos += stride;
 							++it;
 							currentSize += currentSizeStep;
@@ -637,81 +638,81 @@ public:
 								mbv12 = -*currentSize * mbv1n;
 								mbv1 *= *currentSize * (1 + la._MotionBlurCoeff * n * n) / n;
 
-								*(CVector *) ptPos = *it - mbv2;
-								*(CVector *) (ptPos + stride) = *it  + mbv1;
-								*(CVector *) (ptPos + stride2) = *it + mbv2;
-								*(CVector *) (ptPos + stride3) = *it + mbv12;
+								*(CVectorPacked *) ptPos = *it - mbv2;
+								*(CVectorPacked *) (ptPos + stride) = *it  + mbv1;
+								*(CVectorPacked *) (ptPos + stride2) = *it + mbv2;
+								*(CVectorPacked *) (ptPos + stride3) = *it + mbv12;
 
 
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  - mbv2.x;
-								((CVector *) ptPos)->y = (*it).y  - mbv2.y;
-								((CVector *) ptPos)->z = (*it).z  - mbv2.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  - mbv2.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  - mbv2.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  - mbv2.z;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos + stride);
-								((CVector *) (ptPos + stride))->x = (*it).x  + mbv1.x;
-								((CVector *) (ptPos + stride))->y = (*it).y  + mbv1.y;
-								((CVector *) (ptPos + stride))->z = (*it).z  + mbv1.z;
+								((CVectorPacked *) (ptPos + stride))->x = (*it).x  + mbv1.x;
+								((CVectorPacked *) (ptPos + stride))->y = (*it).y  + mbv1.y;
+								((CVectorPacked *) (ptPos + stride))->z = (*it).z  + mbv1.z;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos + stride2);
-								((CVector *) (ptPos + stride2))->x = (*it).x  + mbv2.x;
-								((CVector *) (ptPos + stride2))->y = (*it).y  + mbv2.y;
-								((CVector *) (ptPos + stride2))->z = (*it).z  + mbv2.z;
+								((CVectorPacked *) (ptPos + stride2))->x = (*it).x  + mbv2.x;
+								((CVectorPacked *) (ptPos + stride2))->y = (*it).y  + mbv2.y;
+								((CVectorPacked *) (ptPos + stride2))->z = (*it).z  + mbv2.z;
 
 
 								CHECK_VERTEX_BUFFER(vb, ptPos + stride3);
-								((CVector *) (ptPos + stride3))->x = (*it).x  + mbv12.x;
-								((CVector *) (ptPos + stride3))->y = (*it).y  + mbv12.y;
-								((CVector *) (ptPos + stride3))->z = (*it).z  + mbv12.z;
+								((CVectorPacked *) (ptPos + stride3))->x = (*it).x  + mbv12.x;
+								((CVectorPacked *) (ptPos + stride3))->y = (*it).y  + mbv12.y;
+								((CVectorPacked *) (ptPos + stride3))->z = (*it).z  + mbv12.z;
 
 							}
 							else // speed too small, we must avoid imprecision
 							{
 								CHECK_VERTEX_BUFFER(vb, ptPos);
-								((CVector *) ptPos)->x = (*it).x  - *currentSize * v2.x;
-								((CVector *) ptPos)->y = (*it).y  - *currentSize * v2.y;
-								((CVector *) ptPos)->z = (*it).z  - *currentSize * v2.z;
+								((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v2.x;
+								((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v2.y;
+								((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v2.z;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos + stride);
-								((CVector *) (ptPos + stride))->x = (*it).x  + *currentSize * v1.x;
-								((CVector *) (ptPos + stride))->y = (*it).y  + *currentSize * v1.y;
-								((CVector *) (ptPos + stride))->z = (*it).z  + *currentSize * v1.z;
+								((CVectorPacked *) (ptPos + stride))->x = (*it).x  + *currentSize * v1.x;
+								((CVectorPacked *) (ptPos + stride))->y = (*it).y  + *currentSize * v1.y;
+								((CVectorPacked *) (ptPos + stride))->z = (*it).z  + *currentSize * v1.z;
 
 								CHECK_VERTEX_BUFFER(vb, ptPos + stride2);
-								((CVector *) (ptPos + stride2))->x = (*it).x  + *currentSize * v2.x;
-								((CVector *) (ptPos + stride2))->y = (*it).y  + *currentSize * v2.y;
-								((CVector *) (ptPos + stride2))->z = (*it).z  + *currentSize * v2.z;
+								((CVectorPacked *) (ptPos + stride2))->x = (*it).x  + *currentSize * v2.x;
+								((CVectorPacked *) (ptPos + stride2))->y = (*it).y  + *currentSize * v2.y;
+								((CVectorPacked *) (ptPos + stride2))->z = (*it).z  + *currentSize * v2.z;
 
 
 								CHECK_VERTEX_BUFFER(vb, ptPos + stride3);
-								((CVector *) (ptPos + stride3))->x = (*it).x  - *currentSize * v1.x;
-								((CVector *) (ptPos + stride3))->y = (*it).y  - *currentSize * v1.y;
-								((CVector *) (ptPos + stride3))->z = (*it).z  - *currentSize * v1.z;
+								((CVectorPacked *) (ptPos + stride3))->x = (*it).x  - *currentSize * v1.x;
+								((CVectorPacked *) (ptPos + stride3))->y = (*it).y  - *currentSize * v1.y;
+								((CVectorPacked *) (ptPos + stride3))->z = (*it).z  - *currentSize * v1.z;
 							}
 						}
 						else
 						{
 
 							CHECK_VERTEX_BUFFER(vb, ptPos);
-							((CVector *) ptPos)->x = (*it).x  - *currentSize * v2.x;
-							((CVector *) ptPos)->y = (*it).y  - *currentSize * v2.y;
-							((CVector *) ptPos)->z = (*it).z  - *currentSize * v2.z;
+							((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v2.x;
+							((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v2.y;
+							((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v2.z;
 
 							CHECK_VERTEX_BUFFER(vb, ptPos + stride);
-							((CVector *) (ptPos + stride))->x = (*it).x  + *currentSize * v1.x;
-							((CVector *) (ptPos + stride))->y = (*it).y  + *currentSize * v1.y;
-							((CVector *) (ptPos + stride))->z = (*it).z  + *currentSize * v1.z;
+							((CVectorPacked *) (ptPos + stride))->x = (*it).x  + *currentSize * v1.x;
+							((CVectorPacked *) (ptPos + stride))->y = (*it).y  + *currentSize * v1.y;
+							((CVectorPacked *) (ptPos + stride))->z = (*it).z  + *currentSize * v1.z;
 
 							CHECK_VERTEX_BUFFER(vb, ptPos + stride2);
-							((CVector *) (ptPos + stride2))->x = (*it).x  + *currentSize * v2.x;
-							((CVector *) (ptPos + stride2))->y = (*it).y  + *currentSize * v2.y;
-							((CVector *) (ptPos + stride2))->z = (*it).z  + *currentSize * v2.z;
+							((CVectorPacked *) (ptPos + stride2))->x = (*it).x  + *currentSize * v2.x;
+							((CVectorPacked *) (ptPos + stride2))->y = (*it).y  + *currentSize * v2.y;
+							((CVectorPacked *) (ptPos + stride2))->z = (*it).z  + *currentSize * v2.z;
 
 
 							CHECK_VERTEX_BUFFER(vb, ptPos + stride3);
-							((CVector *) (ptPos + stride3))->x = (*it).x  - *currentSize * v1.x;
-							((CVector *) (ptPos + stride3))->y = (*it).y  - *currentSize * v1.y;
-							((CVector *) (ptPos + stride3))->z = (*it).z  - *currentSize * v1.z;
+							((CVectorPacked *) (ptPos + stride3))->x = (*it).x  - *currentSize * v1.x;
+							((CVectorPacked *) (ptPos + stride3))->y = (*it).y  - *currentSize * v1.y;
+							((CVectorPacked *) (ptPos + stride3))->z = (*it).z  - *currentSize * v1.z;
 						}
 
 						ptPos += stride4;
@@ -793,30 +794,30 @@ public:
 						CHECK_VERTEX_BUFFER(vb, ptPos + stride2);
 						CHECK_VERTEX_BUFFER(vb, ptPos + stride3);
 
-						((CVector *) ptPos)->x  = (*it).x  + v1.x;
-						((CVector *) ptPos)->y  = (*it).y  + v1.y;
-						((CVector *) ptPos)->z = (*it).z  + v1.z;
-						//nlinfo("** %f, %f, %f", ((CVector *) ptPos)->x, ((CVector *) ptPos)->y, ((CVector *) ptPos)->z);
+						((CVectorPacked *) ptPos)->x  = (*it).x  + v1.x;
+						((CVectorPacked *) ptPos)->y  = (*it).y  + v1.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + v1.z;
+						//nlinfo("** %f, %f, %f", ((CVectorPacked *) ptPos)->x, ((CVectorPacked *) ptPos)->y, ((CVectorPacked *) ptPos)->z);
 						ptPos += stride;
 
 
 
-						((CVector *) ptPos)->x  = (*it).x  + v2.x;
-						((CVector *) ptPos)->y  = (*it).y  + v2.y;
-						((CVector *) ptPos)->z = (*it).z  + v2.z;
-						//nlinfo("%f, %f, %f", ((CVector *) ptPos)->x, ((CVector *) ptPos)->y, ((CVector *) ptPos)->z);
+						((CVectorPacked *) ptPos)->x  = (*it).x  + v2.x;
+						((CVectorPacked *) ptPos)->y  = (*it).y  + v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + v2.z;
+						//nlinfo("%f, %f, %f", ((CVectorPacked *) ptPos)->x, ((CVectorPacked *) ptPos)->y, ((CVectorPacked *) ptPos)->z);
 						ptPos += stride;
 
-						((CVector *) ptPos)->x  = (*it).x  - v1.x;
-						((CVector *) ptPos)->y  = (*it).y  - v1.y;
-						((CVector *) ptPos)->z = (*it).z  - v1.z;
-						//nlinfo("%f, %f, %f", ((CVector *) ptPos)->x, ((CVector *) ptPos)->y, ((CVector *) ptPos)->z);
+						((CVectorPacked *) ptPos)->x  = (*it).x  - v1.x;
+						((CVectorPacked *) ptPos)->y  = (*it).y  - v1.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - v1.z;
+						//nlinfo("%f, %f, %f", ((CVectorPacked *) ptPos)->x, ((CVectorPacked *) ptPos)->y, ((CVectorPacked *) ptPos)->z);
 						ptPos += stride;
 
-						((CVector *) ptPos)->x  = (*it).x  - v2.x;
-						((CVector *) ptPos)->y  = (*it).y  - v2.y;
-						((CVector *) ptPos)->z = (*it).z  - v2.z;
-						//nlinfo("%f, %f, %f", ((CVector *) ptPos)->x, ((CVector *) ptPos)->y, ((CVector *) ptPos)->z);
+						((CVectorPacked *) ptPos)->x  = (*it).x  - v2.x;
+						((CVectorPacked *) ptPos)->y  = (*it).y  - v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - v2.z;
+						//nlinfo("%f, %f, %f", ((CVectorPacked *) ptPos)->x, ((CVectorPacked *) ptPos)->y, ((CVectorPacked *) ptPos)->z);
 						ptPos += stride;
 
 						++it;
@@ -851,27 +852,27 @@ public:
 						v2 = - sinAngle * I + cosAngle * K;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x + *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y + *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z + *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x + *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y + *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z + *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x + *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y + *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z + *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x + *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y + *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z + *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  + *currentSize * v1.x - *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  + *currentSize * v1.y - *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  + *currentSize * v1.z - *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  + *currentSize * v1.x - *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  + *currentSize * v1.y - *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  + *currentSize * v1.z - *currentSize2 * v2.z;
 						ptPos += stride;
 
 						CHECK_VERTEX_BUFFER(vb, ptPos);
-						((CVector *) ptPos)->x = (*it).x  - *currentSize * v1.x - *currentSize2 * v2.x;
-						((CVector *) ptPos)->y = (*it).y  - *currentSize * v1.y - *currentSize2 * v2.y;
-						((CVector *) ptPos)->z = (*it).z  - *currentSize * v1.z - *currentSize2 * v2.z;
+						((CVectorPacked *) ptPos)->x = (*it).x  - *currentSize * v1.x - *currentSize2 * v2.x;
+						((CVectorPacked *) ptPos)->y = (*it).y  - *currentSize * v1.y - *currentSize2 * v2.y;
+						((CVectorPacked *) ptPos)->z = (*it).z  - *currentSize * v1.z - *currentSize2 * v2.z;
 						ptPos += stride;
 						++it;
 						++currentAngle;
diff --git a/code/nel/src/3d/ps_fan_light.cpp b/code/nel/src/3d/ps_fan_light.cpp
index cb33fcd20..4ead1f362 100644
--- a/code/nel/src/3d/ps_fan_light.cpp
+++ b/code/nel/src/3d/ps_fan_light.cpp
@@ -23,6 +23,7 @@
 #include "nel/3d/particle_system.h"
 #include "nel/3d/driver.h"
 
+using NLMISC::CVectorPacked;
 
 
 namespace NL3D
@@ -154,7 +155,7 @@ public:
 				{
 
 					CHECK_VERTEX_BUFFER(*vb, ptVect);
-					*(CVector *) ptVect = *posIt;
+					*(CVectorPacked *) ptVect = *posIt;
 					// the start angle
 					currentAngle = *currentAnglePt;
 					const uint8 phaseAdd = (uint8) (f._PhaseSpeed * (*timeIt));
@@ -163,7 +164,7 @@ public:
 					const float moveIntensity = f._MoveIntensity * fanSize;
 					// compute radius & vect for first fan
 					firstSize  = fanSize + (moveIntensity * CPSUtil::getCos(randomPhaseTab[0] + phaseAdd));
-					*(CVector *) ptVect = (*posIt) + I * firstSize * (CPSUtil::getCos((sint32) currentAngle))
+					*(CVectorPacked *) ptVect = (*posIt) + I * firstSize * (CPSUtil::getCos((sint32) currentAngle))
 										  + K * firstSize * (CPSUtil::getSin((sint32) currentAngle));
 					currentAngle += angleStep;
 					ptVect += stride;
@@ -173,7 +174,7 @@ public:
 					for (k = 1; k <= upperBound; ++k)
 					{
 						fSize  = fanSize + (moveIntensity * CPSUtil::getCos(randomPhaseTab[k] + phaseAdd));
-						*(CVector *) ptVect = (*posIt) + I * fSize * (CPSUtil::getCos((sint32) currentAngle))
+						*(CVectorPacked *) ptVect = (*posIt) + I * fSize * (CPSUtil::getCos((sint32) currentAngle))
 											  + K * fSize * (CPSUtil::getSin((sint32) currentAngle));
 						currentAngle += angleStep;
 						ptVect += stride;
@@ -183,14 +184,14 @@ public:
 					sizeStep = sizeStepBase * (firstSize - fSize);
 					for (; k <= (sint32) (f._NbFans - 1); ++k)
 					{
-						*(CVector *) ptVect = (*posIt) + I * fSize * (CPSUtil::getCos((sint32) currentAngle))
+						*(CVectorPacked *) ptVect = (*posIt) + I * fSize * (CPSUtil::getCos((sint32) currentAngle))
 											  + K * fSize * (CPSUtil::getSin((sint32) currentAngle));
 						currentAngle += angleStep;
 						ptVect += stride;
 						fSize  += sizeStep;
 					}
 					// last fan
-					*(CVector *) ptVect = (*posIt) + I * firstSize * (CPSUtil::getCos((sint32) *currentAnglePt))
+					*(CVectorPacked *) ptVect = (*posIt) + I * firstSize * (CPSUtil::getCos((sint32) *currentAnglePt))
 											  + K * firstSize * (CPSUtil::getSin((sint32) *currentAnglePt));
 					ptVect += stride;
 					currentSizePt += currentSizePtIncrement;
diff --git a/code/nel/src/3d/ps_force.cpp b/code/nel/src/3d/ps_force.cpp
index cb3445619..7659ce7af 100644
--- a/code/nel/src/3d/ps_force.cpp
+++ b/code/nel/src/3d/ps_force.cpp
@@ -602,9 +602,9 @@ void CPSGravity::integrate(float date, CPSLocated *src, uint32 startIndex, uint3
 
 void CPSGravity::integrateSingle(float startDate, float deltaT, uint numStep,
 								 const CPSLocated *src, uint32 indexInLocated,
-								 NLMISC::CVector *destPos,
+								 NLMISC::CVectorPacked *destPos,
 								 bool accumulate /*= false*/,
-								 uint stride/* = sizeof(NLMISC::CVector)*/) const
+								 uint stride/* = sizeof(NLMISC::CVectorPacked)*/) const
 {
 	NL_PS_FUNC(CPSGravity_CVector )
 	nlassert(src->isParametricMotionEnabled());
@@ -635,7 +635,7 @@ void CPSGravity::integrateSingle(float startDate, float deltaT, uint numStep,
 					destPos->y = startPos.y + currDate * startSpeed.y;
 					destPos->z = startPos.z + currDate * startSpeed.z - _K * halfTimeSquare;
 					currDate += deltaT;
-					destPos = (NLMISC::CVector *) ( (uint8 *) destPos + stride);
+					destPos = (NLMISC::CVectorPacked *) ( (uint8 *) destPos + stride);
 				}
 				while (--numStep);
 			}
@@ -655,7 +655,7 @@ void CPSGravity::integrateSingle(float startDate, float deltaT, uint numStep,
 					float halfTimeSquare  = 0.5f * currDate * currDate;
 					destPos->z -=  _K * halfTimeSquare;
 					currDate += deltaT;
-					destPos = (NLMISC::CVector *) ( (uint8 *) destPos + stride);
+					destPos = (NLMISC::CVectorPacked *) ( (uint8 *) destPos + stride);
 				}
 				while (--numStep);
 			}
@@ -1146,7 +1146,7 @@ void CPSBrownianForce::integrate(float date, CPSLocated *src,
 ///==========================================================
 void CPSBrownianForce::integrateSingle(float startDate, float deltaT, uint numStep,
 								 const CPSLocated *src, uint32 indexInLocated,
-								 NLMISC::CVector *destPos,
+								 NLMISC::CVectorPacked *destPos,
 								 bool accumulate,
 								 uint stride) const
 {
@@ -1181,7 +1181,7 @@ void CPSBrownianForce::integrateSingle(float startDate, float deltaT, uint numSt
 					destPos->y = startPos.y + currDate * startSpeed.y + _K * PrecomputedPos[index].y;
 					destPos->z = startPos.z + currDate * startSpeed.z + _K * PrecomputedPos[index].z;
 					currDate += deltaT;
-					destPos = (NLMISC::CVector *) ( (uint8 *) destPos + stride);
+					destPos = (NLMISC::CVectorPacked *) ( (uint8 *) destPos + stride);
 				}
 				while (--numStep);
 			}
@@ -1203,7 +1203,7 @@ void CPSBrownianForce::integrateSingle(float startDate, float deltaT, uint numSt
 					destPos->y += _K * PrecomputedPos[index].y;
 					destPos->z += _K * PrecomputedPos[index].z;
 					currDate += deltaT;
-					destPos = (NLMISC::CVector *) ( (uint8 *) destPos + stride);
+					destPos = (NLMISC::CVectorPacked *) ( (uint8 *) destPos + stride);
 				}
 				while (--numStep);
 			}
diff --git a/code/nel/src/3d/ps_located.cpp b/code/nel/src/3d/ps_located.cpp
index 9be0baf41..57997e2be 100644
--- a/code/nel/src/3d/ps_located.cpp
+++ b/code/nel/src/3d/ps_located.cpp
@@ -257,7 +257,7 @@ void CPSLocated::notifyMotionTypeChanged(void)
 /// ***************************************************************************************
 void CPSLocated::integrateSingle(float startDate, float deltaT, uint numStep,
 								uint32 indexInLocated,
-								NLMISC::CVector *destPos,
+								NLMISC::CVectorPacked *destPos,
 								uint stride) const
 {
 	NL_PS_FUNC(CPSLocated_integrateSingle)
@@ -293,7 +293,7 @@ void CPSLocated::integrateSingle(float startDate, float deltaT, uint numStep,
 					destPos->y = pi.Pos.y + currDate * pi.Speed.y;
 					destPos->z = pi.Pos.z + currDate * pi.Speed.z;
 					currDate += deltaT;
-					destPos = (NLMISC::CVector *) ( (uint8 *) destPos + stride);
+					destPos = (NLMISC::CVectorPacked *) ( (uint8 *) destPos + stride);
 				}
 				while (--numStep);
 			}
diff --git a/code/nel/src/3d/ps_mesh.cpp b/code/nel/src/3d/ps_mesh.cpp
index f473a79ee..ddbf024fb 100644
--- a/code/nel/src/3d/ps_mesh.cpp
+++ b/code/nel/src/3d/ps_mesh.cpp
@@ -660,9 +660,9 @@ public:
 								CHECK_VERTEX_BUFFER(outVb,	  outVertex + outNormalOff);
 
 								// translate and resize the vertex (relatively to the mesh origin)
-								*(CVector *) outVertex = *posIt + sM * *(CVector *) inVertex;
+								*(CVectorPacked *) outVertex = *posIt + sM * *(CVector *) inVertex;
 								// copy the normal
-								*(CVector *) (outVertex + outNormalOff) = M * *(CVector *) (inVertex + inNormalOff);
+								*(CVectorPacked *) (outVertex + outNormalOff) = M * *(CVector *) (inVertex + inNormalOff);
 
 
 								inVertex  += inVSize;
@@ -683,7 +683,7 @@ public:
 								CHECK_VERTEX_BUFFER(outVb, outVertex);
 
 								// translate and resize the vertex (relatively to the mesh origin)
-								*(CVector *) outVertex = *posIt + sM * *(CVector *) inVertex;
+								*(CVectorPacked *) outVertex = *posIt + sM * *(CVector *) inVertex;
 
 								inVertex  += inVSize;
 								outVertex += outVSize;
@@ -774,9 +774,9 @@ public:
 								CHECK_VERTEX_BUFFER(outVb,	  outVertex + outNormalOff);
 
 								// morph, and transform the vertex
-								*(CVector *) outVertex = *posIt + sM * (opLambda * *(CVector *) m0 + lambda * *(CVector *) m1);
+								*(CVectorPacked *) outVertex = *posIt + sM * (opLambda * *(CVector *) m0 + lambda * *(CVector *) m1);
 								// morph, and transform the normal
-								*(CVector *) (outVertex + outNormalOff) = M * (opLambda * *(CVector *) (m0 + inNormalOff)
+								*(CVectorPacked *) (outVertex + outNormalOff) = M * (opLambda * *(CVector *) (m0 + inNormalOff)
 																			  + lambda * *(CVector *) (m1 + inNormalOff)).normed();
 
 
@@ -799,7 +799,7 @@ public:
 								CHECK_VERTEX_BUFFER((*inVB1),	  m1);
 								CHECK_VERTEX_BUFFER(outVb, outVertex);
 								// morph, and transform the vertex
-								*(CVector *) outVertex = *posIt + sM * (opLambda * *(CVector *) m0 + opLambda * *(CVector *) m1);
+								*(CVectorPacked *) outVertex = *posIt + sM * (opLambda * *(CVector *) m0 + lambda * *(CVector *) m1); // m1 is weighted by lambda (not opLambda) so the blend matches the with-normals morph path
 
 								m0  += inVSize;
 								m1  += inVSize;
@@ -1684,8 +1684,8 @@ CVertexBuffer &CPSConstraintMesh::makePrerotatedVb(const CVertexBuffer &inVb)
 				CHECK_VERTEX_BUFFER(prerotatedVb, outVertex);
 				CHECK_VERTEX_BUFFER(prerotatedVb, outVertex + pNormalOff);
 
-				* (CVector *) outVertex =  mat.mulVector(* (CVector *) inVertex);
-				* (CVector *) (outVertex + normalOff) =  mat.mulVector(* (CVector *) (inVertex + pNormalOff) );
+				* (CVectorPacked *) outVertex =  mat.mulVector(* (CVector *) inVertex);
+				* (CVectorPacked *) (outVertex + pNormalOff) =  mat.mulVector(* (CVector *) (inVertex + normalOff) ); // write at the prerotated VB's normal offset (the one checked above); NOTE(review): assumes normalOff is inVb's normal offset - confirm
 				outVertex += vpSize;
 				inVertex  += vSize;
 
@@ -1701,7 +1701,7 @@ CVertexBuffer &CPSConstraintMesh::makePrerotatedVb(const CVertexBuffer &inVb)
 				CHECK_VERTEX_BUFFER(prerotatedVb, outVertex);
 				CHECK_VERTEX_BUFFER(inVb, inVertex);
 
-				* (CVector *) outVertex =  mat.mulVector(* (CVector *) inVertex);
+				* (CVectorPacked *) outVertex =  mat.mulVector(* (CVector *) inVertex);
 				outVertex += vpSize;
 				inVertex += vSize;
 			}
diff --git a/code/nel/src/3d/ps_ribbon.cpp b/code/nel/src/3d/ps_ribbon.cpp
index cc7d0bcd7..2bf0a74bd 100644
--- a/code/nel/src/3d/ps_ribbon.cpp
+++ b/code/nel/src/3d/ps_ribbon.cpp
@@ -397,7 +397,7 @@ static inline uint8 *BuildRibbonFirstSlice(const NLMISC::CVector &pos,
 	NL_PS_FUNC(BuildRibbonFirstSlice)
 	do
 	{
-		* (NLMISC::CVector *) dest = pos;
+		* (NLMISC::CVectorPacked *) dest = pos;
 		dest += vertexSize;
 	}
 	while (--numVerts);
@@ -409,7 +409,7 @@ static inline uint8 *BuildRibbonFirstSlice(const NLMISC::CVector &pos,
 // This compute one slice of a ribbon, and return the next vertex to be filled
 static inline uint8 *ComputeRibbonSliceFollowPath(const NLMISC::CVector &prev,
 									    const NLMISC::CVector &next,
-									    const NLMISC::CVector *shape,
+									    const NLMISC::CVectorPacked *shape,
 									    uint  numVerts,
 									    uint8 *dest,
 									    uint  vertexSize,
@@ -430,10 +430,10 @@ static inline uint8 *ComputeRibbonSliceFollowPath(const NLMISC::CVector &prev,
 	}
 	basis.setPos(next);
 
-	const NLMISC::CVector *shapeEnd = shape + numVerts;
+	const NLMISC::CVectorPacked *shapeEnd = shape + numVerts;
 	do
 	{
-		*(NLMISC::CVector *) dest = basis * (size * (*shape));
+		*(NLMISC::CVectorPacked *) dest = basis * (size * CVector(*shape));
 		++shape;
 		dest += vertexSize;
 	}
@@ -445,7 +445,7 @@ static inline uint8 *ComputeRibbonSliceFollowPath(const NLMISC::CVector &prev,
 // This compute one slice of a ribbon, and return the next vertex to be filled
 static inline uint8 *ComputeRibbonSliceIdentity(const NLMISC::CVector &prev,
 											   const NLMISC::CVector &next,
-											   const NLMISC::CVector *shape,
+											   const NLMISC::CVectorPacked *shape,
 											   uint  numVerts,
 											   uint8 *dest,
 											   uint  vertexSize,
@@ -453,10 +453,10 @@ static inline uint8 *ComputeRibbonSliceIdentity(const NLMISC::CVector &prev,
 											  )
 {
 	NL_PS_FUNC(ComputeRibbonSliceIdentity)
-	const NLMISC::CVector *shapeEnd = shape + numVerts;
+	const NLMISC::CVectorPacked *shapeEnd = shape + numVerts;
 	do
 	{
-		((NLMISC::CVector *) dest)->set(size * shape->x + next.x,
+		((NLMISC::CVectorPacked *) dest)->set(size * shape->x + next.x,
 			                            size * shape->y + next.y,
 										size * shape->z + next.z);
 		++shape;
@@ -469,7 +469,7 @@ static inline uint8 *ComputeRibbonSliceIdentity(const NLMISC::CVector &prev,
 ///=========================================================================
 static inline uint8 *ComputeRibbonSliceFollowPathXY(const NLMISC::CVector &prev,
 												  const NLMISC::CVector &next,
-												  const NLMISC::CVector *shape,
+												  const NLMISC::CVectorPacked *shape,
 												  uint  numVerts,
 												  uint8 *dest,
 												  uint  vertexSize,
@@ -492,10 +492,10 @@ static inline uint8 *ComputeRibbonSliceFollowPathXY(const NLMISC::CVector &prev,
 		basis.setRot(I, CVector::K, J, true);
 	}
 	basis.setPos(next);
-	const NLMISC::CVector *shapeEnd = shape + numVerts;
+	const NLMISC::CVectorPacked *shapeEnd = shape + numVerts;
 	do
 	{
-		*(NLMISC::CVector *) dest = basis * (size * (*shape));
+		*(NLMISC::CVectorPacked *) dest = basis * (size * CVector(*shape));
 		++shape;
 		dest += vertexSize;
 	}
@@ -511,8 +511,8 @@ static inline uint8 *ComputeRibbonSliceFollowPathXY(const NLMISC::CVector &prev,
 // This is for untextured versions (no need to duplicate the last vertex of each slice)
 static inline uint8 *ComputeUntexturedRibbonMesh(uint8 *destVb,
 											     uint  vertexSize,
-											     const NLMISC::CVector *curve,
-											     const NLMISC::CVector *shape,
+											     const NLMISC::CVectorPacked *curve,
+											     const NLMISC::CVectorPacked *shape,
 											     uint  numSegs,
 												 uint  numVerticesInShape,
 												 float sizeIncrement,
@@ -585,8 +585,8 @@ static inline uint8 *ComputeUntexturedRibbonMesh(uint8 *destVb,
 // (Textured Version)
 static inline uint8 *ComputeTexturedRibbonMesh(uint8 *destVb,
 											   uint  vertexSize,
-											   const NLMISC::CVector *curve,
-											   const NLMISC::CVector *shape,
+											   const NLMISC::CVectorPacked *curve,
+											   const NLMISC::CVectorPacked *shape,
 											   uint  numSegs,
 											   uint  numVerticesInShape,
 											   float sizeIncrement,
@@ -612,7 +612,7 @@ static inline uint8 *ComputeTexturedRibbonMesh(uint8 *destVb,
 													   basis
 													  );
 				// duplicate last vertex ( equal first)
-				* (NLMISC::CVector *) nextDestVb = * (NLMISC::CVector *) destVb;
+				* (NLMISC::CVectorPacked *) nextDestVb = * (NLMISC::CVectorPacked *) destVb;
 				destVb = nextDestVb + vertexSize;
 				//
 				++ curve;
@@ -633,7 +633,7 @@ static inline uint8 *ComputeTexturedRibbonMesh(uint8 *destVb,
 					basis
 					);
 				// duplicate last vertex ( equal first)
-				* (NLMISC::CVector *) nextDestVb = * (NLMISC::CVector *) destVb;
+				* (NLMISC::CVectorPacked *) nextDestVb = * (NLMISC::CVectorPacked *) destVb;
 				destVb = nextDestVb + vertexSize;
 				//
 				++ curve;
@@ -653,7 +653,7 @@ static inline uint8 *ComputeTexturedRibbonMesh(uint8 *destVb,
 					size
 					);
 				// duplicate last vertex ( equal first)
-				* (NLMISC::CVector *) nextDestVb = * (NLMISC::CVector *) destVb;
+				* (NLMISC::CVectorPacked *) nextDestVb = * (NLMISC::CVectorPacked *) destVb;
 				destVb = nextDestVb + vertexSize;
 				//
 				++ curve;
@@ -727,7 +727,7 @@ void CPSRibbon::displayRibbons(uint32 nbRibbons, uint32 srcStep)
 		const uint numVerticesInShape = (uint)_Shape.size();
 		//
 		static std::vector<float> sizes;
-		static std::vector<NLMISC::CVector> ribbonPos;  // this is where the position of each ribbon slice center i stored
+		static std::vector<NLMISC::CVectorPacked> ribbonPos;  // this is where the position of each ribbon slice center is stored
 		ribbonPos.resize(_UsedNbSegs + 1); // make sure we have enough room
 		sizes.resize(numRibbonBatch);
 
@@ -782,7 +782,7 @@ void CPSRibbon::displayRibbons(uint32 nbRibbons, uint32 srcStep)
 							const float ribbonSizeIncrement = *ptCurrSize / (float) _UsedNbSegs;
 							ptCurrSize += ptCurrSizeIncrement;
 							// the parent class has a method to get the ribbons positions
-							computeRibbon((uint) (fpRibbonIndex >> 16), &ribbonPos[0], sizeof(NLMISC::CVector));
+							computeRibbon((uint) (fpRibbonIndex >> 16), &ribbonPos[0], sizeof(NLMISC::CVectorPacked));
 							currVert = ComputeTexturedRibbonMesh(currVert,
 																 vertexSize,
 																 &ribbonPos[0],
@@ -804,7 +804,7 @@ void CPSRibbon::displayRibbons(uint32 nbRibbons, uint32 srcStep)
 							const float ribbonSizeIncrement = *ptCurrSize / (float) _UsedNbSegs;
 							ptCurrSize += ptCurrSizeIncrement;
 							// the parent class has a method to get the ribbons positions
-							computeRibbon((uint) (fpRibbonIndex >> 16), &ribbonPos[0], sizeof(NLMISC::CVector));
+							computeRibbon((uint) (fpRibbonIndex >> 16), &ribbonPos[0], sizeof(NLMISC::CVectorPacked));
 							currVert = ComputeUntexturedRibbonMesh(currVert,
 																   vertexSize,
 																   &ribbonPos[0],
diff --git a/code/nel/src/3d/ps_ribbon_base.cpp b/code/nel/src/3d/ps_ribbon_base.cpp
index 8f2a64932..212456fcf 100644
--- a/code/nel/src/3d/ps_ribbon_base.cpp
+++ b/code/nel/src/3d/ps_ribbon_base.cpp
@@ -33,7 +33,7 @@ static inline void BuildHermiteVector(const NLMISC::CVector &P0,
 							   const NLMISC::CVector &P1,
 							   const NLMISC::CVector &T0,
 							   const NLMISC::CVector &T1,
-									 NLMISC::CVector &dest,
+									 NLMISC::CVectorPacked &dest,
 							   float lambda
 							   )
 {
@@ -54,7 +54,7 @@ static inline void BuildHermiteVector(const NLMISC::CVector &P0,
 /// for test
 static inline void BuildLinearVector(const NLMISC::CVector &P0,
 									 const NLMISC::CVector &P1,
-									 NLMISC::CVector &dest,
+									 NLMISC::CVectorPacked &dest,
 									 float lambda,
 									 float oneMinusLambda
 							        )
@@ -204,7 +204,7 @@ void	CPSRibbonBase::updateGlobals()
 
 
 //=======================================================
-void	CPSRibbonBase::computeHermitteRibbon(uint index, NLMISC::CVector *dest, uint stride /* = sizeof(NLMISC::CVector)*/)
+void	CPSRibbonBase::computeHermitteRibbon(uint index, NLMISC::CVectorPacked *dest, uint stride /* = sizeof(NLMISC::CVectorPacked)*/)
 {
 	NL_PS_FUNC(CPSRibbonBase_CVector )
 	nlassert(!_Parametric);
@@ -242,7 +242,7 @@ void	CPSRibbonBase::computeHermitteRibbon(uint index, NLMISC::CVector *dest, uin
 					nlassert(NLMISC::isValidDouble(dest->y));
 					nlassert(NLMISC::isValidDouble(dest->z));
 				#endif
-				dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+				dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 			}
 			while (--leftToDo);
 			return;
@@ -262,7 +262,7 @@ void	CPSRibbonBase::computeHermitteRibbon(uint index, NLMISC::CVector *dest, uin
 				nlassert(NLMISC::isValidDouble(dest->y));
 				nlassert(NLMISC::isValidDouble(dest->z));
 			#endif
-			dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+			dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 			-- leftToDo;
 			if (!leftToDo) return;
 			lambda += lambdaStep;
@@ -289,7 +289,7 @@ void	CPSRibbonBase::computeHermitteRibbon(uint index, NLMISC::CVector *dest, uin
 }
 
 //=======================================================
-void CPSRibbonBase::computeLinearRibbon(uint index, NLMISC::CVector *dest, uint stride)
+void CPSRibbonBase::computeLinearRibbon(uint index, NLMISC::CVectorPacked *dest, uint stride)
 {
 	NL_PS_FUNC(CPSRibbonBase_computeLinearRibbon)
 	nlassert(!_Parametric);
@@ -321,7 +321,7 @@ void CPSRibbonBase::computeLinearRibbon(uint index, NLMISC::CVector *dest, uint
 					nlassert(NLMISC::isValidDouble(dest->y));
 					nlassert(NLMISC::isValidDouble(dest->z));
 				#endif
-				dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+				dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 
 			}
 			while (--leftToDo);
@@ -345,7 +345,7 @@ void CPSRibbonBase::computeLinearRibbon(uint index, NLMISC::CVector *dest, uint
 				nlassert(NLMISC::isValidDouble(dest->y));
 				nlassert(NLMISC::isValidDouble(dest->z));
 			#endif
-			dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+			dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 			-- leftToDo;
 			if (!leftToDo) return;
 			lambda += lambdaStep;
@@ -387,14 +387,14 @@ void CPSRibbonBase::computeLinearRibbon(uint index, NLMISC::CVector *dest, uint
 		do
 		{
 			*dest = *currIt;
-			dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+			dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 		}
 		while (--leftToDo);
 		return;
 	}
 	float lambdaStep = _UsedSegDuration / dt;
 	BuildLinearVector(*currIt, *nextIt, *dest, 0.f, 1.f);
-	dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+	dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 	-- leftToDo;
 	// snap lambda to nearest time step
 	lambda = lambdaStep * fmodf(date[0], _UsedSegDuration) / _UsedSegDuration;
@@ -406,7 +406,7 @@ void CPSRibbonBase::computeLinearRibbon(uint index, NLMISC::CVector *dest, uint
 			if (lambda >= 1.f) break;
 			/// compute a location
 			BuildLinearVector(*currIt, *nextIt, *dest, lambda, oneMinusLambda);
-			dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+			dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 			-- leftToDo;
 			if (!leftToDo) return;
 			lambda += lambdaStep;
@@ -426,7 +426,7 @@ void CPSRibbonBase::computeLinearRibbon(uint index, NLMISC::CVector *dest, uint
 			do
 			{
 				*dest = *currIt;
-				dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+				dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 			}
 			while (--leftToDo);
 			return;
@@ -442,7 +442,7 @@ void CPSRibbonBase::computeLinearRibbon(uint index, NLMISC::CVector *dest, uint
 
 
 //=======================================================
-void CPSRibbonBase::computeLinearCstSizeRibbon(uint index, NLMISC::CVector *dest, uint stride /* = sizeof(NLMISC::CVector)*/)
+void CPSRibbonBase::computeLinearCstSizeRibbon(uint index, NLMISC::CVectorPacked *dest, uint stride /* = sizeof(NLMISC::CVectorPacked)*/)
 {
 	NL_PS_FUNC(CPSRibbonBase_CVector )
 	nlassert(!_Parametric);
@@ -485,7 +485,7 @@ void CPSRibbonBase::computeLinearCstSizeRibbon(uint index, NLMISC::CVector *dest
 					nlassert(NLMISC::isValidDouble(dest->y));
 					nlassert(NLMISC::isValidDouble(dest->z));
 				#endif
-				dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+				dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 				-- leftToDo;
 				if (!leftToDo) return;
 				lambda += lambdaStep;
@@ -512,7 +512,7 @@ void CPSRibbonBase::computeLinearCstSizeRibbon(uint index, NLMISC::CVector *dest
 					nlassert(NLMISC::isValidDouble(dest->y));
 					nlassert(NLMISC::isValidDouble(dest->z));
 				#endif
-				dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+				dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 			}
 			return;
 		}
@@ -520,7 +520,7 @@ void CPSRibbonBase::computeLinearCstSizeRibbon(uint index, NLMISC::CVector *dest
 }
 
 //=======================================================
-void CPSRibbonBase::computeHermitteCstSizeRibbon(uint index, NLMISC::CVector *dest, uint stride /* = sizeof(NLMISC::CVector)*/)
+void CPSRibbonBase::computeHermitteCstSizeRibbon(uint index, NLMISC::CVectorPacked *dest, uint stride /* = sizeof(NLMISC::CVectorPacked)*/)
 {
 	NL_PS_FUNC(CPSRibbonBase_CVector )
 	nlassert(!_Parametric);
@@ -567,7 +567,7 @@ void CPSRibbonBase::computeHermitteCstSizeRibbon(uint index, NLMISC::CVector *de
 					nlassert(NLMISC::isValidDouble(dest->z));
 				#endif
 
-				dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+				dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 				-- leftToDo;
 				if (!leftToDo) return;
 				lambda += lambdaStep;
@@ -593,7 +593,7 @@ void CPSRibbonBase::computeHermitteCstSizeRibbon(uint index, NLMISC::CVector *de
 					nlassert(NLMISC::isValidDouble(dest->y));
 					nlassert(NLMISC::isValidDouble(dest->z));
 				#endif
-				dest  = (NLMISC::CVector *) ((uint8 *) dest + stride);
+				dest  = (NLMISC::CVectorPacked *) ((uint8 *) dest + stride);
 			}
 			return;
 		}
@@ -605,7 +605,7 @@ void CPSRibbonBase::computeHermitteCstSizeRibbon(uint index, NLMISC::CVector *de
 
 
 //=======================================================
-void CPSRibbonBase::computeRibbon(uint index, NLMISC::CVector *dest, uint stride /* = sizeof(NLMISC::CVector)*/)
+void CPSRibbonBase::computeRibbon(uint index, NLMISC::CVectorPacked *dest, uint stride /* = sizeof(NLMISC::CVectorPacked)*/)
 {
 	NL_PS_FUNC(CPSRibbonBase_CVector )
 	switch (_InterpolationMode)
diff --git a/code/nel/src/3d/ps_ribbon_look_at.cpp b/code/nel/src/3d/ps_ribbon_look_at.cpp
index 3b22f561e..e682135fe 100644
--- a/code/nel/src/3d/ps_ribbon_look_at.cpp
+++ b/code/nel/src/3d/ps_ribbon_look_at.cpp
@@ -34,7 +34,7 @@ const float NormEpsilon = 10E-8f;
 
 struct CVectInfo
 {
-	NLMISC::CVector Interp;
+	NLMISC::CVectorPacked Interp;
 	NLMISC::CVector Proj;
 };
 typedef std::vector<CVectInfo> TRibbonVect; // a vector used for intermediate computations
@@ -247,8 +247,8 @@ static inline void BuildSlice(const NLMISC::CMatrix &mat, CVertexBuffer &vb, uin
 			invTgNorm = 1.f;
 		}
 		// build orthogonals vectors to tangent
-		*(NLMISC::CVector *) currVert = pos->Interp + ribSize * invTgNorm * (tangent.x * K - tangent.z * I);
-		*(NLMISC::CVector *) (currVert + vertexSize) = pos->Interp + ribSize * invTgNorm * (- tangent.x * K + tangent.z * I);
+		*(NLMISC::CVectorPacked *) currVert = NLMISC::CVector(pos->Interp) + ribSize * invTgNorm * (tangent.x * K - tangent.z * I);
+		*(NLMISC::CVectorPacked *) (currVert + vertexSize) = NLMISC::CVector(pos->Interp) + ribSize * invTgNorm * (- tangent.x * K + tangent.z * I);
 	}
 	else if (prev->Proj.y > ZEpsilon) // second point cross the near plane
 	{
@@ -263,8 +263,8 @@ static inline void BuildSlice(const NLMISC::CMatrix &mat, CVertexBuffer &vb, uin
 		}
 		else //
 		{
-			*(NLMISC::CVector *) currVert = pos->Interp;
-			*(NLMISC::CVector *) (currVert + vertexSize) = pos->Interp;
+			*(NLMISC::CVectorPacked *) currVert = pos->Interp;
+			*(NLMISC::CVectorPacked *) (currVert + vertexSize) = pos->Interp;
 			return;
 		}
 
@@ -282,8 +282,8 @@ static inline void BuildSlice(const NLMISC::CMatrix &mat, CVertexBuffer &vb, uin
 		}
 		// build orthogonals vectors to tangent
 
-		*(NLMISC::CVector *) currVert = inter + ribSize *  invTgNorm * (tangent.x * K - tangent.z * I);
-		*(NLMISC::CVector *) (currVert + vertexSize) = inter + ribSize * invTgNorm * (- tangent.x * K + tangent.z * I);
+		*(NLMISC::CVectorPacked *) currVert = inter + ribSize *  invTgNorm * (tangent.x * K - tangent.z * I);
+		*(NLMISC::CVectorPacked *) (currVert + vertexSize) = inter + ribSize * invTgNorm * (- tangent.x * K + tangent.z * I);
 	}
 	else if (next->Proj.y > ZEpsilon) // first point cross the near plane
 	{
@@ -298,8 +298,8 @@ static inline void BuildSlice(const NLMISC::CMatrix &mat, CVertexBuffer &vb, uin
 		}
 		else //
 		{
-			*(NLMISC::CVector *) currVert = pos->Interp;
-			*(NLMISC::CVector *) (currVert + vertexSize) = pos->Interp;
+			*(NLMISC::CVectorPacked *) currVert = pos->Interp;
+			*(NLMISC::CVectorPacked *) (currVert + vertexSize) = pos->Interp;
 			return;
 		}
 
@@ -316,14 +316,14 @@ static inline void BuildSlice(const NLMISC::CMatrix &mat, CVertexBuffer &vb, uin
 		}
 		// build orthogonals vectors to tangent
 
-		*(NLMISC::CVector *) currVert = inter + ribSize * invTgNorm * (tangent.x * K - tangent.z * I);
-		*(NLMISC::CVector *) (currVert + vertexSize) = inter + ribSize * invTgNorm * (- tangent.x * K + tangent.z * I);
+		*(NLMISC::CVectorPacked *) currVert = inter + ribSize * invTgNorm * (tangent.x * K - tangent.z * I);
+		*(NLMISC::CVectorPacked *) (currVert + vertexSize) = inter + ribSize * invTgNorm * (- tangent.x * K + tangent.z * I);
 
 	}
 	else // two points are not visible
 	{
-		*(NLMISC::CVector *) currVert = pos->Interp;
-		*(NLMISC::CVector *) (currVert + vertexSize) = pos->Interp;
+		*(NLMISC::CVectorPacked *) currVert = pos->Interp;
+		*(NLMISC::CVectorPacked *) (currVert + vertexSize) = pos->Interp;
 	}
 
 }
diff --git a/code/nel/src/3d/ps_shockwave.cpp b/code/nel/src/3d/ps_shockwave.cpp
index 607b2d03e..4421faded 100644
--- a/code/nel/src/3d/ps_shockwave.cpp
+++ b/code/nel/src/3d/ps_shockwave.cpp
@@ -23,6 +23,7 @@
 #include "nel/3d/ps_iterator.h"
 #include "nel/3d/particle_system.h"
 
+using NLMISC::CVectorPacked;
 
 namespace NL3D
 {
@@ -159,10 +160,10 @@ public:
 						radVect = *ptCurrSize * (CPSUtil::getCos((sint32) currAngle) * ptCurrBasis->X + CPSUtil::getSin((sint32) currAngle) * ptCurrBasis->Y);
 						innerVect = radiusRatio * radVect;
 						CHECK_VERTEX_BUFFER(*vb, currVertex);
-						* (CVector *) currVertex = *posIt + radVect;
+						* (CVectorPacked *) currVertex = *posIt + radVect;
 						currVertex += vSize;
 						CHECK_VERTEX_BUFFER(*vb, currVertex);
-						* (CVector *) currVertex = *posIt + innerVect;
+						* (CVectorPacked *) currVertex = *posIt + innerVect;
 						currVertex += vSize;
 						currAngle += angleStep;
 					}
diff --git a/code/nel/src/3d/ps_tail_dot.cpp b/code/nel/src/3d/ps_tail_dot.cpp
index 623b8e7f7..0568528ad 100644
--- a/code/nel/src/3d/ps_tail_dot.cpp
+++ b/code/nel/src/3d/ps_tail_dot.cpp
@@ -25,6 +25,8 @@
 
 #include <memory>
 
+using NLMISC::CVectorPacked;
+
 namespace NL3D
 {
 static NLMISC::CRGBA GradientB2W[] = {NLMISC::CRGBA(0, 0, 0, 0), NLMISC::CRGBA(255, 255, 255, 255) };
@@ -330,7 +332,7 @@ void CPSTailDot::displayRibbons(uint32 nbRibbons, uint32 srcStep)
 				do
 				{
 					// the parent class has a method to get the ribbons positions
-					computeRibbon((uint) (fpRibbonIndex >> 16), (CVector *) currVert, vertexSize);
+					computeRibbon((uint) (fpRibbonIndex >> 16), (CVectorPacked *) currVert, vertexSize);
 					currVert += vertexSize * (_UsedNbSegs + 1);
 					fpRibbonIndex += srcStep;
 				}
@@ -345,7 +347,7 @@ void CPSTailDot::displayRibbons(uint32 nbRibbons, uint32 srcStep)
 				{
 					// we compute each pos thanks to the parametric curve
 					_Owner->integrateSingle(date - _UsedSegDuration * (_UsedNbSegs + 1), _UsedSegDuration, _UsedNbSegs + 1, (uint) (fpRibbonIndex >> 16),
-											(NLMISC::CVector *) currVert, vertexSize);
+											(NLMISC::CVectorPacked *) currVert, vertexSize);
 					currVert += vertexSize * (_UsedNbSegs + 1);
 					fpRibbonIndex += srcStep;
 				}
diff --git a/code/nel/src/3d/ps_util.cpp b/code/nel/src/3d/ps_util.cpp
index 0f7600d0e..211383a61 100644
--- a/code/nel/src/3d/ps_util.cpp
+++ b/code/nel/src/3d/ps_util.cpp
@@ -44,6 +44,7 @@ namespace NL3D {
 
 
 using NLMISC::CVector;
+using NLMISC::CVectorPacked;
 
 
 //#ifdef NL_DEBUG
diff --git a/code/nel/src/3d/seg_remanence.cpp b/code/nel/src/3d/seg_remanence.cpp
index e0f9a52ae..5340811b2 100644
--- a/code/nel/src/3d/seg_remanence.cpp
+++ b/code/nel/src/3d/seg_remanence.cpp
@@ -27,6 +27,7 @@
 #include "nel/3d/dru.h"
 
 
+using NLMISC::CVectorPacked;
 
 
 
@@ -162,8 +163,8 @@ void CSegRemanence::registerBasic()
 // helper functions to fill vb
 static inline void vbPush(uint8 *&dest, const CVector &v)
 {
-	*(CVector *) dest = v;
-	dest +=sizeof(CVector);
+	*(CVectorPacked *) dest = v;
+	dest +=sizeof(CVectorPacked);
 }
 
 static inline void vbPush(uint8 *&dest, float f)
diff --git a/code/nel/src/3d/water_model.cpp b/code/nel/src/3d/water_model.cpp
index eb8ceae27..2a7d80dbb 100644
--- a/code/nel/src/3d/water_model.cpp
+++ b/code/nel/src/3d/water_model.cpp
@@ -33,6 +33,7 @@
 #include "nel/3d/texture_bump.h"
 #include "nel/3d/water_env_map.h"
 
+using NLMISC::CVectorPacked;
 
 using NLMISC::CVector2f;
 
@@ -1450,15 +1451,15 @@ uint CWaterModel::fillVBSoft(void *datas, uint startTri)
 			}
 			for(uint l = 0; l < numVerts - 2; ++l)
 			{
-				*(CVector *) dest = unprojectedTriSoft[0];
+				*(CVectorPacked *) dest = unprojectedTriSoft[0];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[0];
 				dest += sizeof(float[2]);
-				*(CVector *) dest = unprojectedTriSoft[l + 1];
+				*(CVectorPacked *) dest = unprojectedTriSoft[l + 1];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[l + 1];
 				dest += sizeof(float[2]);
-				*(CVector *) dest = unprojectedTriSoft[l + 2];
+				*(CVectorPacked *) dest = unprojectedTriSoft[l + 2];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[l + 2];
 				dest += sizeof(float[2]);
@@ -1481,27 +1482,27 @@ uint CWaterModel::fillVBSoft(void *datas, uint startTri)
 				computeWaterVertexSoft((float) (x + 1), (float) (y + 1), proj[2], envMap[2], camI, camJ, camK, denom, date, camMat.getPos());
 				computeWaterVertexSoft((float) x, (float) (y + 1), proj[3], envMap[3], camI, camJ, camK, denom, date, camMat.getPos());
 				//
-				*(CVector *) dest = proj[0];
+				*(CVectorPacked *) dest = proj[0];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[0];
 				dest += sizeof(float[2]);
-				*(CVector *) dest = proj[2];
+				*(CVectorPacked *) dest = proj[2];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[2];
 				dest += sizeof(float[2]);
-				*(CVector *) dest = proj[1];
+				*(CVectorPacked *) dest = proj[1];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[1];
 				dest += sizeof(float[2]);
-				*(CVector *) dest = proj[0];
+				*(CVectorPacked *) dest = proj[0];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[0];
 				dest += sizeof(float[2]);
-				*(CVector *) dest = proj[3];
+				*(CVectorPacked *) dest = proj[3];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[3];
 				dest += sizeof(float[2]);
-				*(CVector *) dest = proj[2];
+				*(CVectorPacked *) dest = proj[2];
 				dest += sizeof(float[3]);
 				*(CVector2f *) dest = envMap[2];
 				dest += sizeof(float[2]);
@@ -1557,11 +1558,11 @@ uint CWaterModel::fillVBHard(void *datas, uint startTri)
 			}
 			for(uint l = 0; l < numVerts - 2; ++l)
 			{
-				*(CVector *) dest = unprojectedTri[0];
+				*(CVectorPacked *) dest = unprojectedTri[0];
 				dest += WATER_VERTEX_HARD_SIZE;
-				*(CVector *) dest = unprojectedTri[l + 1];
+				*(CVectorPacked *) dest = unprojectedTri[l + 1];
 				dest += WATER_VERTEX_HARD_SIZE;
-				*(CVector *) dest = unprojectedTri[l + 2];
+				*(CVectorPacked *) dest = unprojectedTri[l + 2];
 				dest += WATER_VERTEX_HARD_SIZE;
 			}
 		}
@@ -1581,17 +1582,17 @@ uint CWaterModel::fillVBHard(void *datas, uint startTri)
 				computeWaterVertexHard((float) (x + 1), (float) (y + 1), proj[2], camI, camJ, camK, denom);
 				computeWaterVertexHard((float) x, (float) (y + 1), proj[3], camI, camJ, camK, denom);
 				//
-				*(CVector *) dest = proj[0];
+				*(CVectorPacked *) dest = proj[0];
 				dest += WATER_VERTEX_HARD_SIZE;
-				*(CVector *) dest = proj[2];
+				*(CVectorPacked *) dest = proj[2];
 				dest += WATER_VERTEX_HARD_SIZE;
-				*(CVector *) dest = proj[1];
+				*(CVectorPacked *) dest = proj[1];
 				dest += WATER_VERTEX_HARD_SIZE;
-				*(CVector *) dest = proj[0];
+				*(CVectorPacked *) dest = proj[0];
 				dest += WATER_VERTEX_HARD_SIZE;
-				*(CVector *) dest = proj[3];
+				*(CVectorPacked *) dest = proj[3];
 				dest += WATER_VERTEX_HARD_SIZE;
-				*(CVector *) dest = proj[2];
+				*(CVectorPacked *) dest = proj[2];
 				dest += WATER_VERTEX_HARD_SIZE;
 			}
 		}

From d94a49b3d847aeb235af86ad2b4e07abaf08c767 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 00:53:13 +0200
Subject: [PATCH 07/21] SSE2: More CVector alignment fixes

--HG--
branch : sse2
---
 code/nel/include/nel/3d/shadow_skin.h       |  1 +
 code/nel/src/3d/lod_character_manager.cpp   |  2 +-
 code/nel/src/3d/mesh.cpp                    |  8 ++---
 code/nel/src/3d/mesh_mrm.cpp                | 12 +++----
 code/nel/src/3d/mesh_mrm_skin.cpp           | 36 +++++++++++++--------
 code/nel/src/3d/mesh_mrm_skinned.cpp        |  2 +-
 code/nel/src/3d/mesh_multi_lod_instance.cpp |  2 +-
 code/nel/src/3d/patch_render.cpp            |  6 ++--
 code/nel/src/3d/shadow_skin.cpp             | 28 ++++++++++++++++
 code/nel/src/3d/vegetable_manager.cpp       | 16 ++++-----
 code/nel/src/3d/vertex_buffer.cpp           |  4 +--
 11 files changed, 77 insertions(+), 40 deletions(-)

diff --git a/code/nel/include/nel/3d/shadow_skin.h b/code/nel/include/nel/3d/shadow_skin.h
index 2b63a635d..3ecc56631 100644
--- a/code/nel/include/nel/3d/shadow_skin.h
+++ b/code/nel/include/nel/3d/shadow_skin.h
@@ -74,6 +74,7 @@ public:
 public:
 
 	// skinning
+	void		applySkin(NLMISC::CVectorPacked *dst, std::vector<CMatrix3x4> &boneMat3x4);
 	void		applySkin(NLMISC::CVector *dst, std::vector<CMatrix3x4> &boneMat3x4);
 
 	/** return ray intersection.
diff --git a/code/nel/src/3d/lod_character_manager.cpp b/code/nel/src/3d/lod_character_manager.cpp
index 48c2a500f..46a6bacf8 100644
--- a/code/nel/src/3d/lod_character_manager.cpp
+++ b/code/nel/src/3d/lod_character_manager.cpp
@@ -676,7 +676,7 @@ bool			CLodCharacterManager::addRenderCharacterKey(CLodCharacterInstance &instan
 		{
 			// NB: order is important for AGP filling optimisation
 			// transform vertex, and store.
-			CVector		*dstVector= (CVector*)dstPtr;
+			CVectorPacked		*dstVector= (CVectorPacked*)dstPtr;
 			fVect.x= vertPtr->x; fVect.y= vertPtr->y; fVect.z= vertPtr->z;
 			++vertPtr;
 			dstVector->x= a00 * fVect.x + a01 * fVect.y + a02 * fVect.z + matPos.x;
diff --git a/code/nel/src/3d/mesh.cpp b/code/nel/src/3d/mesh.cpp
index dfaed0ce4..4bd444fb0 100644
--- a/code/nel/src/3d/mesh.cpp
+++ b/code/nel/src/3d/mesh.cpp
@@ -1049,7 +1049,7 @@ bool	CMeshGeom::retrieveVertices(std::vector<NLMISC::CVector> &vertices) const
 		uint		vSize= vb.getVertexSize();
 		for(i=0;i<vertices.size();i++)
 		{
-			vertices[i]= *(const CVector*)pVert;
+			vertices[i]= *(const CVectorPacked*)pVert;
 			pVert+= vSize;
 		}
 	}
@@ -1718,7 +1718,7 @@ void	CMeshGeom::bkupOriginalSkinVertices()
 		_OriginalTGSpace.resize(numVertices);
 		for(uint i=0; i<numVertices;i++)
 		{
-			_OriginalTGSpace[i]= *(CVector*)vba.getTexCoordPointer(i, tgSpaceStage);
+			_OriginalTGSpace[i]= *(CVectorPacked*)vba.getTexCoordPointer(i, tgSpaceStage);
 		}
 	}
 }
@@ -1760,7 +1760,7 @@ void	CMeshGeom::restoreOriginalSkinVertices()
 		// copy tangent space vectors
 		for(uint i = 0; i < numVertices; ++i)
 		{
-			*(CVector*)vba.getTexCoordPointer(i, numTexCoords - 1)= _OriginalTGSpace[i];
+			*(CVectorPacked*)vba.getTexCoordPointer(i, numTexCoords - 1)= _OriginalTGSpace[i];
 		}
 	}
 
@@ -2117,7 +2117,7 @@ void	CMeshGeom::buildShadowSkin()
 		for(uint i=0; i<numVertices;i++)
 		{
 			// Copy Vertex
-			_ShadowSkin.Vertices[i].Vertex= *((CVector*)srcVert);
+			_ShadowSkin.Vertices[i].Vertex= *((CVectorPacked*)srcVert);
 			// Suppose the 0 matrix inf is the highest (we are at least sure it is not 0)
 			// And SkinWeight Export show the 0th is the highest one...
 			_ShadowSkin.Vertices[i].MatrixId= ((CPaletteSkin*)srcPal)->MatrixId[0];
diff --git a/code/nel/src/3d/mesh_mrm.cpp b/code/nel/src/3d/mesh_mrm.cpp
index 999b3b62d..d0d733283 100644
--- a/code/nel/src/3d/mesh_mrm.cpp
+++ b/code/nel/src/3d/mesh_mrm.cpp
@@ -2066,7 +2066,7 @@ void	CMeshMRMGeom::bkupOriginalSkinVerticesSubset(uint wedgeStart, uint wedgeEnd
 		_OriginalTGSpace.resize(_VBufferFinal.getNumVertices());
 		for(uint i=wedgeStart; i<wedgeEnd;i++)
 		{
-			_OriginalTGSpace[i]= *(CVector*)vba.getTexCoordPointer(i, tgSpaceStage);
+			_OriginalTGSpace[i]= *(CVectorPacked*)vba.getTexCoordPointer(i, tgSpaceStage);
 		}
 	}
 }
@@ -2105,7 +2105,7 @@ void	CMeshMRMGeom::restoreOriginalSkinVertices()
 		// copy tangent space vectors
 		for(uint i = 0; i < _VBufferFinal.getNumVertices(); ++i)
 		{
-			*(CVector*)vba.getTexCoordPointer(i, numTexCoords - 1)= _OriginalTGSpace[i];
+			*(CVectorPacked*)vba.getTexCoordPointer(i, numTexCoords - 1)= _OriginalTGSpace[i];
 		}
 	}
 }
@@ -2164,8 +2164,8 @@ void	CMeshMRMGeom::restoreOriginalSkinPart(CLod &lod)
 			CVector				*srcVertex= srcVertexPtr + index;
 			CVector				*srcNormal= srcNormalPtr + index;
 			uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-			CVector				*dstVertex= (CVector*)(dstVertexVB);
-			CVector				*dstNormal= (CVector*)(dstVertexVB + normalOff);
+			CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
+			CVectorPacked		*dstNormal= (CVectorPacked*)(dstVertexVB + normalOff);
 
 
 			// Vertex.
@@ -2621,7 +2621,7 @@ bool	CMeshMRMGeom::buildGeometryForLod(uint lodId, std::vector<CVector> &vertice
 				// Final remaping of vertex to final index
 				vertexRemap[i]= dstIndex;
 				// copy to dest
-				*pDstVert= *(CVector*)pSrcVert;
+				*pDstVert= *(CVectorPacked*)pSrcVert;
 
 				// next dest
 				pDstVert++;
@@ -3467,7 +3467,7 @@ sint			CMeshMRMGeom::renderShadowSkinGeom(CMeshMRMInstance	*mi, uint remainingVe
 	CLod	&lod= _Lods[_Lods.size()-1];
 	computeBoneMatrixes3x4(boneMat3x4, lod.MatrixInfluences, skeleton);
 
-	_ShadowSkin.applySkin((CVector*)vbDest, boneMat3x4);
+	_ShadowSkin.applySkin((CVectorPacked*)vbDest, boneMat3x4);
 
 
 	// How many vertices are added to the VBuffer ???
diff --git a/code/nel/src/3d/mesh_mrm_skin.cpp b/code/nel/src/3d/mesh_mrm_skin.cpp
index 13e8bdd21..d8460a1a5 100644
--- a/code/nel/src/3d/mesh_mrm_skin.cpp
+++ b/code/nel/src/3d/mesh_mrm_skin.cpp
@@ -222,11 +222,13 @@ void	CMeshMRMGeom::applySkin(CLod &lod, const CSkeletonModel *skeleton)
 				CMesh::CSkinWeight	*srcSkin= srcSkinPtr + index;
 				CVector				*srcVertex= srcVertexPtr + index;
 				uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-				CVector				*dstVertex= (CVector*)(dstVertexVB);
+				CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
 
 
 				// Vertex.
-				boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, *dstVertex);
+				CVector temp;
+				boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, temp);
+				*dstVertex = temp;
 			}
 			break;
 
@@ -239,12 +241,14 @@ void	CMeshMRMGeom::applySkin(CLod &lod, const CSkeletonModel *skeleton)
 				CMesh::CSkinWeight	*srcSkin= srcSkinPtr + index;
 				CVector				*srcVertex= srcVertexPtr + index;
 				uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-				CVector				*dstVertex= (CVector*)(dstVertexVB);
+				CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
 
 
 				// Vertex.
-				boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-				boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
+				CVector temp;
+				boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], temp);
+				boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], temp);
+				*dstVertex = temp;
 			}
 			break;
 
@@ -257,13 +261,15 @@ void	CMeshMRMGeom::applySkin(CLod &lod, const CSkeletonModel *skeleton)
 				CMesh::CSkinWeight	*srcSkin= srcSkinPtr + index;
 				CVector				*srcVertex= srcVertexPtr + index;
 				uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-				CVector				*dstVertex= (CVector*)(dstVertexVB);
+				CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
 
 
 				// Vertex.
-				boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-				boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
-				boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], *dstVertex);
+				CVector temp;
+				boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], temp);
+				boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], temp);
+				boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], temp);
+				*dstVertex = temp;
 			}
 			break;
 
@@ -276,14 +282,16 @@ void	CMeshMRMGeom::applySkin(CLod &lod, const CSkeletonModel *skeleton)
 				CMesh::CSkinWeight	*srcSkin= srcSkinPtr + index;
 				CVector				*srcVertex= srcVertexPtr + index;
 				uint8				*dstVertexVB= destVertexPtr + index * vertexSize;
-				CVector				*dstVertex= (CVector*)(dstVertexVB);
+				CVectorPacked		*dstVertex= (CVectorPacked*)(dstVertexVB);
 
 
 				// Vertex.
-				boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], *dstVertex);
-				boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], *dstVertex);
-				boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], *dstVertex);
-				boneMat3x4[ srcSkin->MatrixId[3] ].mulAddPoint( *srcVertex, srcSkin->Weights[3], *dstVertex);
+				CVector temp;
+				boneMat3x4[ srcSkin->MatrixId[0] ].mulSetPoint( *srcVertex, srcSkin->Weights[0], temp);
+				boneMat3x4[ srcSkin->MatrixId[1] ].mulAddPoint( *srcVertex, srcSkin->Weights[1], temp);
+				boneMat3x4[ srcSkin->MatrixId[2] ].mulAddPoint( *srcVertex, srcSkin->Weights[2], temp);
+				boneMat3x4[ srcSkin->MatrixId[3] ].mulAddPoint( *srcVertex, srcSkin->Weights[3], temp);
+				*dstVertex = temp;
 			}
 			break;
 
diff --git a/code/nel/src/3d/mesh_mrm_skinned.cpp b/code/nel/src/3d/mesh_mrm_skinned.cpp
index 2b1c3beb6..c4f795c87 100644
--- a/code/nel/src/3d/mesh_mrm_skinned.cpp
+++ b/code/nel/src/3d/mesh_mrm_skinned.cpp
@@ -1962,7 +1962,7 @@ sint			CMeshMRMSkinnedGeom::renderShadowSkinGeom(CMeshMRMSkinnedInstance	*mi, ui
 	CLod	&lod= _Lods[_Lods.size()-1];
 	computeBoneMatrixes3x4(boneMat3x4, lod.MatrixInfluences, skeleton);
 
-	_ShadowSkin.applySkin((CVector*)vbDest, boneMat3x4);
+	_ShadowSkin.applySkin((CVectorPacked*)vbDest, boneMat3x4);
 
 
 	// How many vertices are added to the VBuffer ???
diff --git a/code/nel/src/3d/mesh_multi_lod_instance.cpp b/code/nel/src/3d/mesh_multi_lod_instance.cpp
index c6c8fa237..f3dbbab93 100644
--- a/code/nel/src/3d/mesh_multi_lod_instance.cpp
+++ b/code/nel/src/3d/mesh_multi_lod_instance.cpp
@@ -302,7 +302,7 @@ void		CMeshMultiLodInstance::setPosCoarseMesh( CMeshGeom &geom, const CMatrix &m
 	for (uint i=0; i<_LastCoarseMeshNumVertices; i++)
 	{
 		// Transform position
-		*(CVector*)vDest = matrix.mulPoint (*(const CVector*)vSrc);
+		*(CVectorPacked*)vDest = matrix.mulPoint (CVector(*(const CVectorPacked*)vSrc));
 
 		// Next point
 		vSrc+=vtSrcSize;
diff --git a/code/nel/src/3d/patch_render.cpp b/code/nel/src/3d/patch_render.cpp
index 135b9fdb3..76687cb38 100644
--- a/code/nel/src/3d/patch_render.cpp
+++ b/code/nel/src/3d/patch_render.cpp
@@ -1056,7 +1056,7 @@ inline void		CPatch::fillFar0VertexVB(CTessFarVertex *pVert)
 
 		// v[11]== EndPos - StartPos
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar0VBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentFar0VBInfo.DeltaPosOff, sizeof(CVector))
-		*(CVector*)(CurVBPtr + CLandscapeGlobals::CurrentFar0VBInfo.DeltaPosOff)=
+		*(CVectorPacked*)(CurVBPtr + CLandscapeGlobals::CurrentFar0VBInfo.DeltaPosOff)=
 			pVert->Src->EndPos - pVert->Src->StartPos;
 	}
 }
@@ -1144,7 +1144,7 @@ inline void		CPatch::fillFar1VertexVB(CTessFarVertex *pVert)
 
 		// v[11]== EndPos - StartPos
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentFar1VBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentFar1VBInfo.DeltaPosOff, sizeof(CVector))
-		*(CVector*)(CurVBPtr + CLandscapeGlobals::CurrentFar1VBInfo.DeltaPosOff)=
+		*(CVectorPacked*)(CurVBPtr + CLandscapeGlobals::CurrentFar1VBInfo.DeltaPosOff)=
 			pVert->Src->EndPos - pVert->Src->StartPos;
 
 		// v[12]== Alpha information
@@ -1214,7 +1214,7 @@ inline void		CPatch::fillTileVertexVB(CTessNearVertex *pVert)
 
 		// v[11]== EndPos - StartPos
 		CHECK_VBA_RANGE(CLandscapeGlobals::CurrentTileVBInfo.Accessor, CurVBPtr + CLandscapeGlobals::CurrentTileVBInfo.DeltaPosOff, sizeof(CVector))
-		*(CVector*)(CurVBPtr + CLandscapeGlobals::CurrentTileVBInfo.DeltaPosOff)=
+		*(CVectorPacked*)(CurVBPtr + CLandscapeGlobals::CurrentTileVBInfo.DeltaPosOff)=
 			pVert->Src->EndPos - pVert->Src->StartPos;
 	}
 }
diff --git a/code/nel/src/3d/shadow_skin.cpp b/code/nel/src/3d/shadow_skin.cpp
index 717d81f74..7aef7821e 100644
--- a/code/nel/src/3d/shadow_skin.cpp
+++ b/code/nel/src/3d/shadow_skin.cpp
@@ -38,6 +38,33 @@ uint	CShadowSkin::NumCacheVertexShadow= NL_BlockByteL1 / sizeof(CShadowVertex);
 
 
 // ***************************************************************************
+void		CShadowSkin::applySkin(CVectorPacked *dst, std::vector<CMatrix3x4> &boneMat3x4)
+{
+	if(Vertices.empty())
+		return;
+	uint	numVerts= (uint)Vertices.size();
+	CShadowVertex	*src= &Vertices[0];
+
+	// Then do the skin
+	for(;numVerts>0;)
+	{
+		// number of vertices to process for this block.
+		uint	nBlockInf= min(NumCacheVertexShadow, numVerts);
+		// next block.
+		numVerts-= nBlockInf;
+
+		// cache the data in L1 cache.
+		CFastMem::precache(src, nBlockInf * sizeof(CShadowVertex));
+
+		CVector temp;
+		//  for all InfluencedVertices only.
+		for(;nBlockInf>0;nBlockInf--, src++, dst++)
+		{
+			boneMat3x4[ src->MatrixId ].mulSetPoint( src->Vertex, temp );
+			*dst = temp;
+		}
+	}
+}
 void		CShadowSkin::applySkin(CVector *dst, std::vector<CMatrix3x4> &boneMat3x4)
 {
 	if(Vertices.empty())
@@ -56,6 +83,7 @@ void		CShadowSkin::applySkin(CVector *dst, std::vector<CMatrix3x4> &boneMat3x4)
 		// cache the data in L1 cache.
 		CFastMem::precache(src, nBlockInf * sizeof(CShadowVertex));
 
+		CVector temp;
 		//  for all InfluencedVertices only.
 		for(;nBlockInf>0;nBlockInf--, src++, dst++)
 		{
diff --git a/code/nel/src/3d/vegetable_manager.cpp b/code/nel/src/3d/vegetable_manager.cpp
index ba44a766f..f860b5b59 100644
--- a/code/nel/src/3d/vegetable_manager.cpp
+++ b/code/nel/src/3d/vegetable_manager.cpp
@@ -1379,21 +1379,21 @@ void			CVegetableManager::addInstance(CVegetableInstanceGroup *ig,
 		// Pos.
 		//-------
 		// Separate Center and relative pos.
-		CVector	relPos= mat.mulVector(*(CVector*)srcPtr);	// mulVector, because translation in v[center]
+		CVector	relPos= mat.mulVector(*(CVectorPacked*)srcPtr);	// mulVector, because translation in v[center]
 		// compute bendCenterPos
 		CVector	bendCenterPos;
 		if(shape->BendCenterMode == CVegetableShapeBuild::BendCenterNull)
 			bendCenterPos= CVector::Null;
 		else
 		{
-			CVector	v= *(CVector*)srcPtr;
+			CVector	v= *(CVectorPacked*)srcPtr;
 			v.z= 0;
 			bendCenterPos= mat.mulVector(v);				// mulVector, because translation in v[center]
 		}
 		// copy
 		deltaPos= relPos-bendCenterPos;
-		*(CVector*)dstPtr= deltaPos;
-		*(CVector*)(dstPtr + dstCenterOff)= instancePos + bendCenterPos;
+		*(CVectorPacked*)dstPtr= deltaPos;
+		*(CVectorPacked*)(dstPtr + dstCenterOff)= instancePos + bendCenterPos;
 		// if !destLighted, then VP is different
 		if(!destLighted)
 		{
@@ -1426,7 +1426,7 @@ void			CVegetableManager::addInstance(CVegetableInstanceGroup *ig,
 			if(destLighted)
 			{
 				// normal
-				*(CVector*)(dstPtr + dstNormalOff)= normalMat.mulVector( *(CVector*)(srcPtr + srcNormalOff) );
+				*(CVectorPacked*)(dstPtr + dstNormalOff)= normalMat.mulVector( *(CVectorPacked*)(srcPtr + srcNormalOff) );
 			}
 			// If destLighted, secondaryRGBA is the ambient
 			// else secondaryRGBA is used only for Alpha (DLM uv.v).
@@ -1437,7 +1437,7 @@ void			CVegetableManager::addInstance(CVegetableInstanceGroup *ig,
 			nlassert(!destLighted);
 
 			// compute normal.
-			CVector		rotNormal= normalMat.mulVector( *(CVector*)(srcPtr + srcNormalOff) );
+			CVector		rotNormal= normalMat.mulVector( *(CVectorPacked*)(srcPtr + srcNormalOff) );
 			// must normalize() because scale is possible.
 			rotNormal.normalize();
 
@@ -1466,7 +1466,7 @@ void			CVegetableManager::addInstance(CVegetableInstanceGroup *ig,
 
 		// Bend.
 		//-------
-		CVector		*dstBendPtr= (CVector*)(dstPtr + dstBendOff);
+		CVectorPacked		*dstBendPtr= (CVectorPacked*)(dstPtr + dstBendOff);
 		// setup bend Phase.
 		dstBendPtr->y= bendPhase;
 		// setup bend Weight.
@@ -2704,7 +2704,7 @@ uint		CVegetableManager::updateInstanceLighting(CVegetableInstanceGroup *ig, uin
 			nlassert(!destLighted);
 
 			// compute normal.
-			CVector		rotNormal= normalMat.mulVector( *(CVector*)(srcPtr + srcNormalOff) );
+			CVector		rotNormal= normalMat.mulVector( *(CVectorPacked*)(srcPtr + srcNormalOff) );
 			// must normalize() because scale is possible.
 			rotNormal.normalize();
 
diff --git a/code/nel/src/3d/vertex_buffer.cpp b/code/nel/src/3d/vertex_buffer.cpp
index 94f269a2a..e8c5ac0c9 100644
--- a/code/nel/src/3d/vertex_buffer.cpp
+++ b/code/nel/src/3d/vertex_buffer.cpp
@@ -674,13 +674,13 @@ void		CVertexBuffer::serialOldV1Minus(NLMISC::IStream &f, sint ver)
 		// XYZ.
 		if(_Flags & PositionFlag)
 		{
-			CVector		&vert= *(CVector*)(pointer + stridedId + _Offset[Position]);
+			CVectorPacked		&vert= *(CVectorPacked*)(pointer + stridedId + _Offset[Position]);
 			f.serial(vert);
 		}
 		// Normal
 		if(_Flags & NormalFlag)
 		{
-			CVector		&norm= *(CVector*)(pointer + stridedId + _Offset[Normal]);
+			CVectorPacked		&norm= *(CVectorPacked*)(pointer + stridedId + _Offset[Normal]);
 			f.serial(norm);
 		}
 		// Uvs.

From ba2231f0683f41e9c51e299a348e525dcbeae1d8 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 01:21:00 +0200
Subject: [PATCH 08/21] SSE2: Some initial CVector SSE2 math

--HG--
branch : sse2
---
 code/nel/include/nel/misc/types_nl.h      | 70 ++++++++++++++++++++++-
 code/nel/include/nel/misc/vector.h        | 27 ++++++++-
 code/nel/include/nel/misc/vector_inline.h | 42 ++++++++++++++
 code/nel/src/misc/common.cpp              | 28 +--------
 4 files changed, 136 insertions(+), 31 deletions(-)

diff --git a/code/nel/include/nel/misc/types_nl.h b/code/nel/include/nel/misc/types_nl.h
index b94ffe50f..21cd8b39e 100644
--- a/code/nel/include/nel/misc/types_nl.h
+++ b/code/nel/include/nel/misc/types_nl.h
@@ -336,14 +336,82 @@ typedef	unsigned	int			uint;			// at least 32bits (depend of processor)
 #endif
 
 #ifdef USE_SSE2
+
 extern void *operator new(size_t size) throw(std::bad_alloc);
 extern void *operator new[](size_t size) throw(std::bad_alloc);
 extern void operator delete(void *p) throw();
 extern void operator delete[](void *p) throw();
+
 #define NL_ALIGN_SSE2(nb) NL_ALIGN(nb)
+
+#	ifdef NL_COMP_VC
+
+inline void *aligned_malloc(size_t size, size_t alignment)
+{
+	return _aligned_malloc(size, alignment);
+}
+
+inline void aligned_free(void *ptr)
+{
+	_aligned_free(ptr);
+}
+
+#	else
+
+inline void *aligned_malloc(size_t size, size_t alignment)
+{
+	return memalign(alignment, size);
+}
+
+inline void aligned_free(void *ptr)
+{
+	free(ptr);
+}
+
+#	endif /* NL_COMP_ */
+
+template<class T>
+class aligned_allocator : public std::allocator<T>
+{
+public:
+	typedef size_t size_type;
+	typedef std::ptrdiff_t difference_type;
+	typedef T* pointer;
+	typedef const T* const_pointer;
+	typedef T& reference;
+	typedef const T& const_reference;
+	typedef T value_type;
+
+	template<class U>
+	struct rebind
+	{
+		typedef aligned_allocator<U> other;
+	};
+
+	aligned_allocator() : std::allocator<T>() {}
+
+	aligned_allocator(const aligned_allocator& other) : std::allocator<T>(other) {}
+
+	template<class U>
+	aligned_allocator(const aligned_allocator<U>& other) : std::allocator<T>(other) {}
+
+	~aligned_allocator() {}
+
+	pointer allocate(size_type num, const void* /*hint*/ = 0)
+	{
+		return static_cast<pointer>(aligned_malloc(num * sizeof(T), NL_DEFAULT_MEMORY_ALIGNMENT)); // aligned_malloc takes (size, alignment)
+	}
+
+	void deallocate(pointer p, size_type /*num*/)
+	{
+		aligned_free(p);
+	}
+};
+
 #else
 #define NL_ALIGN_SSE2(nb) 
-#endif
+#endif /* USE_SSE2 */
+
 
 // CHashMap, CHashSet and CHashMultiMap definitions
 #if defined(_STLPORT_VERSION) // STLport detected
diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index b1e2573d5..f11137764 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -35,15 +35,24 @@ class IStream;
  * \author Lionel Berenguier
  * \author Nevrax France
  * \date 2000
+ * \author Jan Boon
+ * \date 2014
  */
 NL_ALIGN_SSE2(16)
 class CVector
 {
 public:		// Attributes.
-	float	x,y,z;
-
 #ifdef USE_SSE2
-	float	w; // Padding
+	union
+	{
+		struct
+		{
+			float x, y, z, P;
+		};
+		__m128 mm;
+	};
+#else
+	float	x,y,z;
 #endif
 
 public:		// const.
@@ -187,9 +196,21 @@ public:
 inline CVector blend(const CVector &v0, const CVector &v1, float lambda)
 {
 	float invLambda = 1.f - lambda;
+#ifdef USE_SSE2
+	CVector res;
+	__m128 mLambda = _mm_set1_ps(lambda);
+	__m128 mInvLambda = _mm_set1_ps(invLambda);
+	__m128 mv0 = v0.mm;
+	__m128 mv1 = v1.mm;
+	mv0 = _mm_mul_ps(mv0, mInvLambda);
+	mv1 = _mm_mul_ps(mv1, mLambda);
+	res.mm = _mm_add_ps(mv0, mv1);
+	return res;
+#else
 	return CVector(invLambda * v0.x + lambda * v1.x,
 		           invLambda * v0.y + lambda * v1.y,
 				   invLambda * v0.z + lambda * v1.z);
+#endif
 }
 
 
diff --git a/code/nel/include/nel/misc/vector_inline.h b/code/nel/include/nel/misc/vector_inline.h
index 9f890f637..61f20e367 100644
--- a/code/nel/include/nel/misc/vector_inline.h
+++ b/code/nel/include/nel/misc/vector_inline.h
@@ -31,23 +31,35 @@ namespace	NLMISC
 // Base Maths.
 inline	CVector	&CVector::operator+=(const CVector &v)
 {
+#ifdef USE_SSE2
+	mm = _mm_add_ps(mm, v.mm);
+#else
 	x+=v.x;
 	y+=v.y;
 	z+=v.z;
+#endif
 	return *this;
 }
 inline	CVector	&CVector::operator-=(const CVector &v)
 {
+#ifdef USE_SSE2
+	mm = _mm_sub_ps(mm, v.mm);
+#else
 	x-=v.x;
 	y-=v.y;
 	z-=v.z;
+#endif
 	return *this;
 }
 inline	CVector	&CVector::operator*=(float f)
 {
+#ifdef USE_SSE2
+	mm = _mm_mul_ps(mm, _mm_set1_ps(f));
+#else
 	x*=f;
 	y*=f;
 	z*=f;
+#endif
 	return *this;
 }
 inline	CVector	&CVector::operator/=(float f)
@@ -56,18 +68,36 @@ inline	CVector	&CVector::operator/=(float f)
 }
 inline	CVector	CVector::operator+(const CVector &v) const
 {
+#ifdef USE_SSE2
+	CVector res;
+	res.mm = _mm_add_ps(mm, v.mm);
+	return res;
+#else
 	CVector	ret(x+v.x, y+v.y, z+v.z);
 	return ret;
+#endif
 }
 inline	CVector	CVector::operator-(const CVector &v) const
 {
+#ifdef USE_SSE2
+	CVector res;
+	res.mm = _mm_sub_ps(mm, v.mm);
+	return res;
+#else
 	CVector	ret(x-v.x, y-v.y, z-v.z);
 	return ret;
+#endif
 }
 inline	CVector	CVector::operator*(float f) const
 {
+#ifdef USE_SSE2
+	CVector res;
+	res.mm = _mm_mul_ps(mm, _mm_set1_ps(f));
+	return res;
+#else
 	CVector	ret(x*f, y*f, z*f);
 	return ret;
+#endif
 }
 inline	CVector	CVector::operator/(float f) const
 {
@@ -75,12 +105,24 @@ inline	CVector	CVector::operator/(float f) const
 }
 inline	CVector	CVector::operator-() const
 {
+#ifdef USE_SSE2
+	CVector res;
+	res.mm = _mm_mul_ps(mm, _mm_set1_ps(-1.0f));
+	return res;
+#else
 	return CVector(-x,-y,-z);
+#endif
 }
 inline CVector	operator*(float f, const CVector &v)
 {
+#ifdef USE_SSE2
+	CVector res;
+	res.mm = _mm_mul_ps(v.mm, _mm_set1_ps(f));
+	return res;
+#else
 	CVector	ret(v.x*f, v.y*f, v.z*f);
 	return ret;
+#endif
 }
 
 
diff --git a/code/nel/src/misc/common.cpp b/code/nel/src/misc/common.cpp
index b58792a65..dd244667b 100644
--- a/code/nel/src/misc/common.cpp
+++ b/code/nel/src/misc/common.cpp
@@ -71,33 +71,7 @@ extern "C" long _ftol2( double dblSource ) { return _ftol( dblSource ); }
 #endif // NL_OS_WINDOWS
 
 
-#ifdef HAS_SSE2
-
-#	ifdef NL_COMP_VC
-
-inline void *aligned_malloc(size_t size, size_t alignment)
-{
-	return _aligned_malloc(size, alignment);
-}
-
-inline void aligned_free(void *p)
-{
-	_aligned_free(ptr);
-}
-
-#	else
-
-inline void *aligned_malloc(size_t size, size_t alignment)
-{
-	return memalign(alignment, size);
-}
-
-inline void aligned_free(void *ptr)
-{
-	free(ptr);
-}
-
-#	endif /* NL_COMP_ */
+#ifdef USE_SSE2
 
 void *operator new(size_t size) throw(std::bad_alloc)
 {

From f8b6d81b254486d2a71277a8a81e15e5b0d6e515 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 03:30:40 +0200
Subject: [PATCH 09/21] SSE2: More alignment workarounds

--HG--
branch : sse2
---
 code/nel/include/nel/3d/driver.h              |  1 +
 code/nel/include/nel/3d/particle_system.h     |  2 +-
 code/nel/include/nel/3d/ps_attrib.h           | 12 ++---
 .../include/nel/3d/ps_attrib_maker_helper.h   | 12 ++---
 .../nel/3d/ps_attrib_maker_iterators.h        | 12 ++---
 code/nel/include/nel/3d/ps_force.h            | 11 +++--
 code/nel/include/nel/3d/ps_iterator.h         |  4 +-
 code/nel/include/nel/3d/ps_located.h          |  3 +-
 code/nel/include/nel/3d/ps_util.h             |  1 +
 code/nel/include/nel/3d/ps_zone.h             | 12 ++---
 code/nel/include/nel/3d/u_driver.h            |  1 +
 code/nel/include/nel/misc/vector.h            | 40 ++++++++++++++--
 code/nel/src/3d/particle_system.cpp           |  2 +-
 code/nel/src/3d/ps_emitter.cpp                |  4 +-
 code/nel/src/3d/ps_face_look_at.cpp           |  2 +-
 code/nel/src/3d/ps_force.cpp                  |  8 ++--
 code/nel/src/3d/ps_located.cpp                | 24 +++++++---
 code/nel/src/3d/ps_plane_basis_maker.cpp      |  6 +--
 code/nel/src/3d/ps_shockwave.cpp              |  4 +-
 code/nel/src/3d/ps_sound.cpp                  |  4 +-
 code/nel/src/3d/ps_zone.cpp                   | 46 +++++++++----------
 21 files changed, 132 insertions(+), 79 deletions(-)

diff --git a/code/nel/include/nel/3d/driver.h b/code/nel/include/nel/3d/driver.h
index 3eb5823ca..ef846fd2c 100644
--- a/code/nel/include/nel/3d/driver.h
+++ b/code/nel/include/nel/3d/driver.h
@@ -57,6 +57,7 @@ using NLMISC::CRefCount;
 using NLMISC::CSmartPtr;
 using NLMISC::CRGBA;
 using NLMISC::CVector;
+using NLMISC::CVectorPacked;
 using NLMISC::CMatrix;
 using NLMISC::CSynchronized;
 
diff --git a/code/nel/include/nel/3d/particle_system.h b/code/nel/include/nel/3d/particle_system.h
index c1139d070..2cc0f9d79 100644
--- a/code/nel/include/nel/3d/particle_system.h
+++ b/code/nel/include/nel/3d/particle_system.h
@@ -1244,7 +1244,7 @@ public:
 	static std::vector<uint>						   _ParticleToRemove;			// used during the update step, contains the indices of the particles to remove
 	static std::vector<sint>						   _ParticleRemoveListIndex; 	// for each particle, -1 if it hasn't been removed, or else give the insertion number in _ParticleToRemove
 	static std::vector<uint>						   _CollidingParticles; // index of particle that collided
-	static std::vector<NLMISC::CVector>				   _SpawnPos;			// spawn position of newly created particles
+	static std::vector<NLMISC::CVectorPacked>		   _SpawnPos;			// spawn position of newly created particles
 public:
 	// current sim steps infos
 	static TAnimationTime								EllapsedTime;
diff --git a/code/nel/include/nel/3d/ps_attrib.h b/code/nel/include/nel/3d/ps_attrib.h
index cd691e719..a70fa7921 100644
--- a/code/nel/include/nel/3d/ps_attrib.h
+++ b/code/nel/include/nel/3d/ps_attrib.h
@@ -563,12 +563,12 @@ void CPSAttrib<T>::swap(CPSAttrib<T> &other)
 
 // here we give some definition for common types
 
-typedef CPSAttrib<NLMISC::CVector> TPSAttribVector;
-typedef CPSAttrib<NLMISC::CRGBA>   TPSAttribRGBA;
-typedef CPSAttrib<float>		   TPSAttribFloat;
-typedef CPSAttrib<uint32>		   TPSAttribUInt;
-typedef CPSAttrib<uint8>		   TPSAttribUInt8;
-typedef CPSAttrib<TAnimationTime>  TPSAttribTime;
+typedef CPSAttrib<NLMISC::CVectorPacked>	TPSAttribVector;
+typedef CPSAttrib<NLMISC::CRGBA>			TPSAttribRGBA;
+typedef CPSAttrib<float>					TPSAttribFloat;
+typedef CPSAttrib<uint32>					TPSAttribUInt;
+typedef CPSAttrib<uint8>					TPSAttribUInt8;
+typedef CPSAttrib<TAnimationTime>			TPSAttribTime;
 
 } // NL3D
 
diff --git a/code/nel/include/nel/3d/ps_attrib_maker_helper.h b/code/nel/include/nel/3d/ps_attrib_maker_helper.h
index 147d1ae5d..4ae6e65b1 100644
--- a/code/nel/include/nel/3d/ps_attrib_maker_helper.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_helper.h
@@ -1190,10 +1190,10 @@ T  CPSAttribMakerT<T, F>::get(CPSLocated *loc, uint32 index)
 			result=  getInternal(loc->getInvMass()[index]);
 		break;
 		case CPSInputType::attrSpeed:
-			result = getInternal(loc->getSpeed()[index].norm());
+			result = getInternal(NLMISC::CVector(loc->getSpeed()[index]).norm());
 		break;
 		case CPSInputType::attrPosition:
-			result = getInternal(loc->getPos()[index].norm());
+			result = getInternal(NLMISC::CVector(loc->getPos()[index]).norm());
 		break;
 		case CPSInputType::attrUniformRandom:
 		{
@@ -1210,7 +1210,7 @@ T  CPSAttribMakerT<T, F>::get(CPSLocated *loc, uint32 index)
 			static NLMISC::CVector lodVect;
 			float lodOffset;
 			loc->getLODVect(lodVect, lodOffset, loc->getMatrixMode());
-			float r = fabsf(loc->getPos()[index] * lodVect + lodOffset);
+			float r = fabsf(NLMISC::CVector(loc->getPos()[index]) * lodVect + lodOffset);
 			r = this->_NbCycles * r > MaxInputValue ? MaxInputValue : r;
 			if (_Clamp)
 			{
@@ -1224,7 +1224,7 @@ T  CPSAttribMakerT<T, F>::get(CPSLocated *loc, uint32 index)
 			static NLMISC::CVector lodVect;
 			float lodOffset;
 			loc->getLODVect(lodVect, lodOffset, loc->getMatrixMode());
-			float r = loc->getPos()[index] * lodVect + lodOffset;
+			float r = NLMISC::CVector(loc->getPos()[index]) * lodVect + lodOffset;
 			r = this->_NbCycles * (r > MaxInputValue ? MaxInputValue : r * r);
 
 			if (_Clamp)
@@ -1240,7 +1240,7 @@ T  CPSAttribMakerT<T, F>::get(CPSLocated *loc, uint32 index)
 			float lodOffset;
 			loc->getLODVect(lodVect, lodOffset, loc->getMatrixMode());
 
-			float r = loc->getPos()[index] * lodVect + lodOffset;
+			float r = NLMISC::CVector(loc->getPos()[index]) * lodVect + lodOffset;
 			if (r < 0)
 			{
 				result = _F(MaxInputValue);
@@ -1260,7 +1260,7 @@ T  CPSAttribMakerT<T, F>::get(CPSLocated *loc, uint32 index)
 			float lodOffset;
 			loc->getLODVect(lodVect, lodOffset, loc->getMatrixMode());
 
-			float r = loc->getPos()[index] * lodVect + lodOffset;
+			float r = NLMISC::CVector(loc->getPos()[index]) * lodVect + lodOffset;
 			if (r < 0)
 			{
 				result = _F(MaxInputValue);
diff --git a/code/nel/include/nel/3d/ps_attrib_maker_iterators.h b/code/nel/include/nel/3d/ps_attrib_maker_iterators.h
index c2c54d9d8..cbc3231d0 100644
--- a/code/nel/include/nel/3d/ps_attrib_maker_iterators.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_iterators.h
@@ -48,7 +48,7 @@ namespace NL3D
 	template <class TBaseIter>
 	struct CVectNormIterator : CPSBaseIterator<TBaseIter>
 	{
-		GET_INLINE float get() const { return this->Iter.get().norm(); }
+		GET_INLINE float get() const { return CVector(this->Iter.get()).norm(); }
 		CVectNormIterator(const TBaseIter &it) : CPSBaseIterator<TBaseIter>(it) {}
 	};
 
@@ -76,7 +76,7 @@ namespace NL3D
 	template <class TBaseIter>
 	struct CDistIterator : CPSBaseIterator<TBaseIter>
 	{
-		NLMISC::CVector V;
+		NLMISC::CVectorPacked V; // kept in packed form; the get() methods below in this file compute CVector(element) * V + Offset
 		float Offset;
 		CDistIterator(const TBaseIter &it) : CPSBaseIterator<TBaseIter>(it) {}
 	};
@@ -89,7 +89,7 @@ namespace NL3D
 		GET_INLINE
 		float get() const
 		{
-			const float r = fabsf(this->Iter.get() * this->V + this->Offset);
+			const float r = fabsf(CVector(this->Iter.get()) * this->V + this->Offset);
 			return r > MaxInputValue ? MaxInputValue : r;
 		}
 		CFDot3AddIterator(const TBaseIter &it) : CDistIterator<TBaseIter>(it) {}
@@ -101,7 +101,7 @@ namespace NL3D
 	{
 		float get() const
 		{
-			float r = this->Iter.get() * this->V + this->Offset;
+			float r = CVector(this->Iter.get()) * this->V + this->Offset;
 			r *= r;
 			return r > MaxInputValue ? MaxInputValue : r;
 		}
@@ -115,7 +115,7 @@ namespace NL3D
 		GET_INLINE
 		float get() const
 		{
-			const float r = this->Iter.get() * this->V + this->Offset;
+			const float r = CVector(this->Iter.get()) * this->V + this->Offset;
 			if (r < 0.f) return MaxInputValue;
 			return r > MaxInputValue ? MaxInputValue : r;
 		}
@@ -130,7 +130,7 @@ namespace NL3D
 		GET_INLINE
 		float get() const
 		{
-			float r = this->Iter.get() * this->V + this->Offset;
+			float r = CVector(this->Iter.get()) * this->V + this->Offset;
 			if (r < 0) return MaxInputValue;
 			r *= r;
 			return r > MaxInputValue ? MaxInputValue : r;
diff --git a/code/nel/include/nel/3d/ps_force.h b/code/nel/include/nel/3d/ps_force.h
index 76cf90ee4..1739bfa92 100644
--- a/code/nel/include/nel/3d/ps_force.h
+++ b/code/nel/include/nel/3d/ps_force.h
@@ -87,9 +87,9 @@ public:
 	  * 'accumulate' set to false.
 	  * NB : works only with integrable forces
 	  */
-	 virtual void integrate(float /* date */, CPSLocated * /* src */, uint32 /* startIndex */, uint32 /* numObjects */, NLMISC::CVector * /* destPos */ = NULL, NLMISC::CVector * /* destSpeed */ = NULL,
+	 virtual void integrate(float /* date */, CPSLocated * /* src */, uint32 /* startIndex */, uint32 /* numObjects */, NLMISC::CVectorPacked * /* destPos */ = NULL, NLMISC::CVectorPacked * /* destSpeed */ = NULL,
 							bool /* accumulate */ = false,
-							uint /* posStride */ = sizeof(NLMISC::CVector), uint /* speedStride */ = sizeof(NLMISC::CVector)
+							uint /* posStride */ = sizeof(NLMISC::CVectorPacked), uint /* speedStride */ = sizeof(NLMISC::CVectorPacked)
 							) const
 	 {
 		 nlassert(0); // not an integrable force
@@ -325,7 +325,10 @@ template <class T> void CIsotropicForceT<T>::computeForces(CPSLocated &target)
 
 		for (; speedIt != endSpeedIt; ++speedIt, ++posIt, ++invMassIt)
 		{
-			_F(*posIt, *speedIt, *invMassIt);
+			const CVector posv = *posIt;
+			CVector speedv = *speedIt;
+			_F(posv, speedv, *invMassIt);
+			*speedIt = speedv;
 		}
 	}
 }
@@ -770,7 +773,7 @@ protected:
 	virtual CPSLocated *getForceIntensityOwner(void) { return _Owner; }
 
 	// the normal of the vortex
-	CPSAttrib<NLMISC::CVector> _Normal;
+	CPSAttrib<NLMISC::CVectorPacked> _Normal;
 	// radius of the vortex
 	TPSAttribFloat _Radius;
 
diff --git a/code/nel/include/nel/3d/ps_iterator.h b/code/nel/include/nel/3d/ps_iterator.h
index 5a02cecda..5f850e393 100644
--- a/code/nel/include/nel/3d/ps_iterator.h
+++ b/code/nel/include/nel/3d/ps_iterator.h
@@ -134,10 +134,10 @@ namespace NL3D
 	/// Some typedefs
 	typedef CAdvance1Iterator<TPSAttribFloat::const_iterator, float> TIteratorFloatStep1;
 	typedef CAdvance1Iterator<TPSAttribFloat::const_iterator, TAnimationTime> TIteratorFloatStep1;
-	typedef CAdvance1Iterator<TPSAttribVector::const_iterator, NLMISC::CVector> TIteratorVectStep1;
+	typedef CAdvance1Iterator<TPSAttribVector::const_iterator, NLMISC::CVectorPacked> TIteratorVectStep1;
 	typedef CAdvance1616Iterator<TPSAttribFloat::const_iterator, float> TIteratorFloatStep1616;
 	typedef CAdvance1616Iterator<TPSAttribFloat::const_iterator, TAnimationTime> TIteratorTimeStep1616;
-	typedef CAdvance1616Iterator<TPSAttribVector::const_iterator, NLMISC::CVector> TIteratorVectStep1616;
+	typedef CAdvance1616Iterator<TPSAttribVector::const_iterator, NLMISC::CVectorPacked> TIteratorVectStep1616;
 
 } // NL3D
 
diff --git a/code/nel/include/nel/3d/ps_located.h b/code/nel/include/nel/3d/ps_located.h
index ca1c86a7b..2fb210544 100644
--- a/code/nel/include/nel/3d/ps_located.h
+++ b/code/nel/include/nel/3d/ps_located.h
@@ -220,6 +220,7 @@ public:
 	CScene *getScene(void);
 
 	/// shortcut to the same method of the owning particle system
+	void getLODVect(NLMISC::CVectorPacked &v, float &offset, TPSMatrixMode matrixMode);
 	void getLODVect(NLMISC::CVector &v, float &offset, TPSMatrixMode matrixMode);
 
 
@@ -411,7 +412,7 @@ public:
 	void computeForces();
 
 	// compute collisions
-	void computeCollisions(uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter);
+	void computeCollisions(uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter);
 
 	// get a conversion matrix between 2 matrix modes
 	static const NLMISC::CMatrix &getConversionMatrix(const CParticleSystem &ps, TPSMatrixMode to, TPSMatrixMode from);
diff --git a/code/nel/include/nel/3d/ps_util.h b/code/nel/include/nel/3d/ps_util.h
index 7542b7053..64af8a7fa 100644
--- a/code/nel/include/nel/3d/ps_util.h
+++ b/code/nel/include/nel/3d/ps_util.h
@@ -28,6 +28,7 @@ namespace NLMISC
 {
 	class CMatrix;
 	class CVector;
+	class CVectorPacked;
 };
 
 namespace NL3D
diff --git a/code/nel/include/nel/3d/ps_zone.h b/code/nel/include/nel/3d/ps_zone.h
index cf29bc258..72d5a5529 100644
--- a/code/nel/include/nel/3d/ps_zone.h
+++ b/code/nel/include/nel/3d/ps_zone.h
@@ -106,7 +106,7 @@ public:
 	/** Compute collisions for the given target. This will update the collisions infos.
 	  * The caller must provide pointer to arrays positions before and after time step.
 	  */
-	virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter) = 0;
+	virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter) = 0;
 
 protected:
 
@@ -141,7 +141,7 @@ protected:
 class CPSZonePlane : public CPSZone, public IPSMover
 {
 	public:
-		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter);
+		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter);
 		virtual void show();
 
 
@@ -192,7 +192,7 @@ typedef CPSAttrib<CRadiusPair> TPSAttribRadiusPair;
 class CPSZoneSphere : public CPSZone, public IPSMover
 {
 	public:
-		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter);
+		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter);
 		virtual void show();
 
 
@@ -236,7 +236,7 @@ class CPSZoneSphere : public CPSZone, public IPSMover
 class CPSZoneDisc : public CPSZone, public IPSMover
 {
 	public:
-		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter);
+		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter);
 		virtual void show();
 
 		CPSZoneDisc()
@@ -283,7 +283,7 @@ class CPSZoneDisc : public CPSZone, public IPSMover
 class CPSZoneCylinder : public CPSZone, public IPSMover
 {
 	public:
-		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter);
+		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter);
 		virtual void show();
 
 		CPSZoneCylinder()
@@ -335,7 +335,7 @@ class CPSZoneCylinder : public CPSZone, public IPSMover
 class CPSZoneRectangle : public CPSZone, public IPSMover
 {
 	public:
-		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter);
+		virtual	void computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter);
 		virtual void show();
 
 		CPSZoneRectangle()
diff --git a/code/nel/include/nel/3d/u_driver.h b/code/nel/include/nel/3d/u_driver.h
index 2e74ae3fe..73b97d9cd 100644
--- a/code/nel/include/nel/3d/u_driver.h
+++ b/code/nel/include/nel/3d/u_driver.h
@@ -47,6 +47,7 @@ namespace NL3D
 
 
 using NLMISC::CVector;
+using NLMISC::CVectorPacked;
 using NLMISC::CMatrix;
 using NLMISC::CRGBA;
 using NLMISC::CBitmap;
diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index f11137764..d499a5dba 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -69,11 +69,11 @@ public:		// Methods.
 	/// @name Object.
 	//@{
 	/// Constructor which does nothing.
-	CVector() {}
+	CVector() { if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); } // NOTE(review): raises nlerror when `this` has any of its low 4 address bits set (i.e. is not 16-byte aligned); looks like a testing probe — confirm it is meant to run on every construction in all builds
 	/// Constructor .
-	CVector(float	_x, float _y, float _z) : x(_x), y(_y), z(_z) {}
+	CVector(float	_x, float _y, float _z) : x(_x), y(_y), z(_z) { if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); } // same address check as the default constructor, after the members are initialised
 	/// Copy Constructor.
-	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) {}
+	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) { if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); } // same address check; only the object being constructed is tested, not the source `v`
 	//@}
 
 	/// @name Base Maths.
@@ -181,6 +181,14 @@ public:
 		return *this;
 	}
 
+	CVectorPacked &operator -= (const CVector &v) // mixed-type overload: subtracts a CVector from this packed vector, component by component, in place
+	{
+		x -= v.x;
+		y -= v.y;
+		z -= v.z;
+		return *this; // returns the modified packed vector by reference
+	}
+
 	operator CVector () const
 	{
 		return CVector(x, y, z);
@@ -190,6 +198,16 @@ public:
 	{
 		f.serial(x,y,z);
 	}
+
+	CVector	operator+(const CVector &v) const // packed + CVector; the result is returned as a CVector, not a packed vector
+	{
+		return CVector(*this) + v; // convert this packed vector to a CVector, then add with CVector's own operator+
+	}
+
+	CVector	operator-(const CVector &v) const // packed - CVector; the result is returned as a CVector, not a packed vector
+	{
+		return CVector(*this) - v; // convert this packed vector to a CVector, then subtract with CVector's own operator-
+	}
 };
 
 // blend (faster version than the generic version found in algo.h)
@@ -214,9 +232,25 @@ inline CVector blend(const CVector &v0, const CVector &v1, float lambda)
 }
 
 
+
 }
 
 
+namespace std { // NOTE(review): declaring new function overloads inside namespace std is undefined behaviour per the C++ standard; an NLMISC-scope swap found through ADL would avoid that — check how call sites invoke swap before moving it
+	inline void swap(NLMISC::CVectorPacked &v1, NLMISC::CVector &v2) // exchanges the values of a packed vector and a CVector
+	{
+		NLMISC::CVectorPacked temp = v2; // hold v2's value in packed form while v2 is overwritten
+		v2 = NLMISC::CVector(v1);
+		v1 = temp;
+	}
+	inline void swap(NLMISC::CVector &v1,  NLMISC::CVectorPacked &v2) // same exchange with the argument types in the opposite order
+	{
+		NLMISC::CVectorPacked temp = v1; // hold v1's value in packed form while v1 is overwritten
+		v1 = NLMISC::CVector(v2);
+		v2 = temp;
+	}
+}
+
 #include "vector_inline.h"
 
 
diff --git a/code/nel/src/3d/particle_system.cpp b/code/nel/src/3d/particle_system.cpp
index 3b25c3e1e..ce8fbaae4 100644
--- a/code/nel/src/3d/particle_system.cpp
+++ b/code/nel/src/3d/particle_system.cpp
@@ -64,7 +64,7 @@ float CParticleSystem::RealEllapsedTimeRatio = 1.f;
 bool CParticleSystem::InsideSimLoop = false;
 bool CParticleSystem::InsideRemoveLoop = false;
 bool CParticleSystem::InsideNewElementsLoop = false;;
-std::vector<NLMISC::CVector> CParticleSystem::_SpawnPos;
+std::vector<NLMISC::CVectorPacked> CParticleSystem::_SpawnPos;
 
 
 
diff --git a/code/nel/src/3d/ps_emitter.cpp b/code/nel/src/3d/ps_emitter.cpp
index 0084111a0..ef0931eed 100644
--- a/code/nel/src/3d/ps_emitter.cpp
+++ b/code/nel/src/3d/ps_emitter.cpp
@@ -2790,7 +2790,7 @@ void CPSEmitter::doEmitOnce(uint firstInstanceIndex)
 				CVector startPos;
 				if (!_Owner->isParametricMotionEnabled())
 				{
-					startPos = _Owner->getPos()[k] - _Owner->getSpeed()[k] * CParticleSystem::EllapsedTime;
+					startPos = CVector(_Owner->getPos()[k]) - CVector(_Owner->getSpeed()[k]) * CParticleSystem::EllapsedTime;
 				}
 				else
 				{
@@ -2823,7 +2823,7 @@ void CPSEmitter::doEmitOnce(uint firstInstanceIndex)
 			CVector startPos;
 			if (!_Owner->isParametricMotionEnabled())
 			{
-				startPos = _Owner->getPos()[k] - _Owner->getSpeed()[k] * CParticleSystem::EllapsedTime;
+				startPos = CVector(_Owner->getPos()[k]) - CVector(_Owner->getSpeed()[k]) * CParticleSystem::EllapsedTime;
 			}
 			else
 			{
diff --git a/code/nel/src/3d/ps_face_look_at.cpp b/code/nel/src/3d/ps_face_look_at.cpp
index ccc5907c5..782d5c70d 100644
--- a/code/nel/src/3d/ps_face_look_at.cpp
+++ b/code/nel/src/3d/ps_face_look_at.cpp
@@ -65,7 +65,7 @@ public:
 		do
 		{
 			// tmp unoptimized slow version
-			CVector normedSpeed = (*speedIt).normed();
+			CVector normedSpeed = CVector(*speedIt).normed();
 			float iProj = normedSpeed * I;
 			float kProj = normedSpeed * K;
 			dest->I = iProj * I + kProj * K;
diff --git a/code/nel/src/3d/ps_force.cpp b/code/nel/src/3d/ps_force.cpp
index 7659ce7af..f013dcc3a 100644
--- a/code/nel/src/3d/ps_force.cpp
+++ b/code/nel/src/3d/ps_force.cpp
@@ -862,8 +862,8 @@ void CPSCylindricVortex::computeForces(CPSLocated &target)
 					p *= 1.f / d;
 					// compute the speed vect that we should have (normalized)
 					realTangentialSpeed = n ^ p;
-					tangentialSpeed = (*speedIt * realTangentialSpeed) * realTangentialSpeed;
-					radialSpeed =  (p * *speedIt) * p;
+					tangentialSpeed = (CVector(*speedIt) * realTangentialSpeed) * realTangentialSpeed;
+					radialSpeed =  (p * CVector(*speedIt)) * p;
 					// update radial speed;
 					*speedIt -= _RadialViscosity * CParticleSystem::EllapsedTime * radialSpeed;
 					// update tangential speed
@@ -981,7 +981,7 @@ void CPSMagneticForce::computeForces(CPSLocated &target)
 			TPSAttribFloat::const_iterator invMassIt = target.getInvMass().begin();
 			for (; it != itend; ++it, ++invMassIt)
 			{
-				(*it) += intensity * *invMassIt * (*it ^ toAdd);
+				(*it) += intensity * *invMassIt * (CVector(*it) ^ toAdd);
 			}
 		}
 		else
@@ -989,7 +989,7 @@ void CPSMagneticForce::computeForces(CPSLocated &target)
 			float i = intensity / target.getInitialMass();
 			for (; it != itend; ++it)
 			{
-				(*it) += i * (*it ^ toAdd);
+				(*it) += i * (CVector(*it) ^ toAdd);
 			}
 		}
 	}
diff --git a/code/nel/src/3d/ps_located.cpp b/code/nel/src/3d/ps_located.cpp
index 57997e2be..0ef57fab1 100644
--- a/code/nel/src/3d/ps_located.cpp
+++ b/code/nel/src/3d/ps_located.cpp
@@ -502,6 +502,18 @@ bool CPSLocated::hasEmitters(void) const
 	return false;
 }
 
+/// ***************************************************************************************
+void CPSLocated::getLODVect(NLMISC::CVectorPacked &v, float &offset, TPSMatrixMode matrixMode) // CVectorPacked overload: asks _Owner for the LOD vector and offset, then stores the vector into the packed output `v`
+{
+	NL_PS_FUNC(CPSLocated_getLODVect)
+	nlassert(_Owner);
+	CHECK_PS_INTEGRITY
+	CVector temp; // _Owner->getLODVect is called with a CVector, so the result is staged here first
+	_Owner->getLODVect(temp, offset, matrixMode); // fills both `temp` and the caller's `offset`
+	v = temp; // copy the staged CVector into the caller's packed vector
+	CHECK_PS_INTEGRITY
+}
+
 /// ***************************************************************************************
 void CPSLocated::getLODVect(NLMISC::CVector &v, float &offset, TPSMatrixMode matrixMode)
 {
@@ -1866,7 +1878,7 @@ void CPSLocated::updateCollisions()
 			if (_Time[currCollision->Index] >= 1.f)
 			{
 				// check whether particles died before the collision. If so, just continue (particle has already been inserted in the remove list), and cancel the collision
-				float timeToCollision = currCollision->Dist / _Speed[currCollision->Index].norm();
+				float timeToCollision = currCollision->Dist / CVector(_Speed[currCollision->Index]).norm();
 				if (_Time[currCollision->Index] / _TimeIncrement[currCollision->Index] - timeToCollision * CParticleSystem::RealEllapsedTimeRatio >= 1.f)
 				{
 					// says that collision did not occurs
@@ -2196,12 +2208,12 @@ void CPSLocated::removeOldParticles()
 
 					if (_LifeScheme)
 					{
-						_Pos[*it] -= _Speed[*it] * ((_Time[*it] - 1.f) / _TimeIncrement[*it]) * ellapsedTimeRatio;
+						_Pos[*it] -= CVector(_Speed[*it]) * ((_Time[*it] - 1.f) / _TimeIncrement[*it]) * ellapsedTimeRatio;
 						timeUntilNextSimStep = (_Time[*it] - 1.f) / _TimeIncrement[*it];
 					}
 					else
 					{
-						_Pos[*it] -= _Speed[*it] * ((_Time[*it] - 1.f) * _InitialLife) * ellapsedTimeRatio;
+						_Pos[*it] -= CVector(_Speed[*it]) * ((_Time[*it] - 1.f) * _InitialLife) * ellapsedTimeRatio;
 						timeUntilNextSimStep = (_Time[*it] - 1.f) * _InitialLife;
 					}
 					_Time[*it] = 0.9999f;
@@ -2255,7 +2267,7 @@ void CPSLocated::removeOldParticles()
 					{
 						// move position backward (compute its position at death)
 						timeUntilNextSimStep = ((_Time[*it] - 1.f) / _TimeIncrement[*it]) * ellapsedTimeRatio;
-						_Pos[*it] -= _Speed[*it] * timeUntilNextSimStep;
+						_Pos[*it] -= CVector(_Speed[*it]) * timeUntilNextSimStep;
 
 						// force time to 1 because emitter 'on death' may rely on the date of emitter to compute its attributes
 						_Time[*it] = 0.9999f;
@@ -2283,7 +2295,7 @@ void CPSLocated::removeOldParticles()
 					{
 						// move position backward
 						timeUntilNextSimStep = (_Time[*it] - 1.f) * _InitialLife * ellapsedTimeRatio;
-						_Pos[*it] -= _Speed[*it] * timeUntilNextSimStep;
+						_Pos[*it] -= CVector(_Speed[*it]) * timeUntilNextSimStep;
 						// force time to 1 because emitter 'on death' may rely on the date of emitter to compute its attributes
 						_Time[*it] = 0.9999f;
 					}
@@ -3038,7 +3050,7 @@ void CPSLocated::setZBias(float value)
 }
 
 /// ***************************************************************************************
-void CPSLocated::computeCollisions(uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter)
+void CPSLocated::computeCollisions(uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter)
 {
 	NL_PS_FUNC(CPSLocated_computeCollisions)
 	for(TDtorObserversVect::iterator it = _DtorObserversVect.begin(); it != _DtorObserversVect.end(); ++it)
diff --git a/code/nel/src/3d/ps_plane_basis_maker.cpp b/code/nel/src/3d/ps_plane_basis_maker.cpp
index f1ca718b9..2bec4c713 100644
--- a/code/nel/src/3d/ps_plane_basis_maker.cpp
+++ b/code/nel/src/3d/ps_plane_basis_maker.cpp
@@ -130,7 +130,7 @@ void *CPSPlaneBasisFollowSpeed::make(CPSLocated *loc,
 			case XY:
 				while (numAttrib --)
 				{
-					const CVector *speedVect = &(*(speedIt + (fpIndex >> 16)));
+					const NLMISC::CVectorPacked *speedVect = &(*(speedIt + (fpIndex >> 16)));
 					float norm = sqrtf(speedVect->x * speedVect->x + speedVect->y * speedVect->y);
 					float invNorm = (norm != 0.f) ? 1.f / norm : 0.f;
 					CPlaneBasis &pb = *(CPlaneBasis *) ptDat;
@@ -143,7 +143,7 @@ void *CPSPlaneBasisFollowSpeed::make(CPSLocated *loc,
 			case XZ:
 				while (numAttrib --)
 				{
-					const CVector *speedVect = &(*(speedIt + (fpIndex >> 16)));
+					const NLMISC::CVectorPacked *speedVect = &(*(speedIt + (fpIndex >> 16)));
 					float norm = sqrtf(speedVect->x * speedVect->x + speedVect->z * speedVect->z);
 					float invNorm = (norm != 0.f) ? 1.f / norm : 0.f;
 					CPlaneBasis &pb = *(CPlaneBasis *) ptDat;
@@ -156,7 +156,7 @@ void *CPSPlaneBasisFollowSpeed::make(CPSLocated *loc,
 			case YZ:
 				while (numAttrib --)
 				{
-					const CVector *speedVect = &(*(speedIt + (fpIndex >> 16)));
+					const NLMISC::CVectorPacked *speedVect = &(*(speedIt + (fpIndex >> 16)));
 					float norm = sqrtf(speedVect->y * speedVect->y + speedVect->z * speedVect->z);
 					float invNorm = (norm != 0.f) ? 1.f / norm : 0.f;
 					CPlaneBasis &pb = *(CPlaneBasis *) ptDat;
diff --git a/code/nel/src/3d/ps_shockwave.cpp b/code/nel/src/3d/ps_shockwave.cpp
index 4421faded..169ad2ec6 100644
--- a/code/nel/src/3d/ps_shockwave.cpp
+++ b/code/nel/src/3d/ps_shockwave.cpp
@@ -160,10 +160,10 @@ public:
 						radVect = *ptCurrSize * (CPSUtil::getCos((sint32) currAngle) * ptCurrBasis->X + CPSUtil::getSin((sint32) currAngle) * ptCurrBasis->Y);
 						innerVect = radiusRatio * radVect;
 						CHECK_VERTEX_BUFFER(*vb, currVertex);
-						* (CVectorPacked *) currVertex = *posIt + radVect;
+						* (CVectorPacked *) currVertex = CVector(*posIt) + radVect;
 						currVertex += vSize;
 						CHECK_VERTEX_BUFFER(*vb, currVertex);
-						* (CVectorPacked *) currVertex = *posIt + innerVect;
+						* (CVectorPacked *) currVertex = CVector(*posIt) + innerVect;
 						currVertex += vSize;
 						currAngle += angleStep;
 					}
diff --git a/code/nel/src/3d/ps_sound.cpp b/code/nel/src/3d/ps_sound.cpp
index a5ae6ad26..07aefd4cf 100644
--- a/code/nel/src/3d/ps_sound.cpp
+++ b/code/nel/src/3d/ps_sound.cpp
@@ -148,8 +148,8 @@ void			CPSSound::step(TPSProcessPass pass)
 
 	CPSAttrib<UPSSoundInstance *>::iterator it = _Sounds.begin(),
 												 endIt;
-	CPSAttrib<NLMISC::CVector>::const_iterator posIt = _Owner->getPos().begin();
-	CPSAttrib<NLMISC::CVector>::const_iterator speedIt = _Owner->getSpeed().begin();
+	CPSAttrib<NLMISC::CVectorPacked>::const_iterator posIt = _Owner->getPos().begin();
+	CPSAttrib<NLMISC::CVectorPacked>::const_iterator speedIt = _Owner->getSpeed().begin();
 
 	do
 	{
diff --git a/code/nel/src/3d/ps_zone.cpp b/code/nel/src/3d/ps_zone.cpp
index 813103896..250877e28 100644
--- a/code/nel/src/3d/ps_zone.cpp
+++ b/code/nel/src/3d/ps_zone.cpp
@@ -194,7 +194,7 @@ void CPSZonePlane::deleteElement(uint32 index)
 }
 
 
-void CPSZonePlane::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter)
+void CPSZonePlane::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter)
 {
 	NL_PS_FUNC(CPSZonePlane_computeCollisions)
 	MINI_TIMER(PSStatsZonePlane)
@@ -213,9 +213,9 @@ void CPSZonePlane::computeCollisions(CPSLocated &target, uint firstInstanceIndex
 		NLMISC::CPlane p;
 		p.make(m.mulVector(*normalIt), m * (*planePosIt));
 		// deals with each particle
-		const NLMISC::CVector *itPosBefore = posBefore + firstInstanceIndex;
-		const NLMISC::CVector *itPosBeforeEnd = posBefore + target.getSize();
-		const NLMISC::CVector *itPosAfter = posAfter + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBefore = posBefore + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBeforeEnd = posBefore + target.getSize();
+		const NLMISC::CVectorPacked *itPosAfter = posAfter + firstInstanceIndex;
 		while (itPosBefore != itPosBeforeEnd)
 		{
 			float posSide = p * *itPosBefore;
@@ -235,7 +235,7 @@ void CPSZonePlane::computeCollisions(CPSLocated &target, uint firstInstanceIndex
 				ci.Dist = startEnd.norm();
 				// we translate the particle from an epsilon so that it won't get hooked to the plane
 				ci.NewPos = *itPosBefore  + startEnd + PSCollideEpsilon * p.getNormal();
-				const CVector &speed = target.getSpeed()[(uint32)(itPosBefore - posBefore)];
+				const CVector speed = target.getSpeed()[(uint32)(itPosBefore - posBefore)];
 				ci.NewSpeed = _BounceFactor * (speed - 2.0f * (speed * p.getNormal()) * p.getNormal());
 				ci.CollisionZone = this;
 				CPSLocated::_Collisions[itPosBefore - posBefore].update(ci);
@@ -290,7 +290,7 @@ void CPSZonePlane::serial(NLMISC::IStream &f) throw(NLMISC::EStream)
 // sphere implementation //
 ///////////////////////////
 
-void CPSZoneSphere::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter)
+void CPSZoneSphere::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter)
 {
 	NL_PS_FUNC(CPSZoneSphere_computeCollisions)
 	MINI_TIMER(PSStatsZoneSphere)
@@ -308,9 +308,9 @@ void CPSZoneSphere::computeCollisions(CPSLocated &target, uint firstInstanceInde
 		const CMatrix &m = CPSLocated::getConversionMatrix(&target, this->_Owner);
 		CVector center = m * *spherePosIt;
 		// deals with each particle
-		const NLMISC::CVector *itPosBefore = posBefore + firstInstanceIndex;
-		const NLMISC::CVector *itPosBeforeEnd = posBefore + target.getSize();
-		const NLMISC::CVector *itPosAfter = posAfter + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBefore = posBefore + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBeforeEnd = posBefore + target.getSize();
+		const NLMISC::CVectorPacked *itPosAfter = posAfter + firstInstanceIndex;
 		while (itPosBefore != itPosBeforeEnd)
 		{
 			// check whether the located is going through the sphere
@@ -346,7 +346,7 @@ void CPSZoneSphere::computeCollisions(CPSLocated &target, uint firstInstanceInde
 						ci.Dist = startEnd.norm();
 						// we translate the particle from an epsilon so that it won't get hooked to the sphere
 						ci.NewPos = pos  + startEnd + PSCollideEpsilon * normal;
-						const CVector &speed = target.getSpeed()[(uint32)(itPosBefore - posBefore)];
+						const CVector speed = target.getSpeed()[(uint32)(itPosBefore - posBefore)];
 						ci.NewSpeed = _BounceFactor * (speed - 2.0f * (speed * normal) * normal);
 						ci.CollisionZone = this;
 						CPSLocated::_Collisions[itPosBefore - posBefore].update(ci);
@@ -450,7 +450,7 @@ void CPSZoneSphere::deleteElement(uint32 index)
 ////////////////////////////////
 // CPSZoneDisc implementation //
 ////////////////////////////////
-void CPSZoneDisc::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter)
+void CPSZoneDisc::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter)
 {
 	NL_PS_FUNC(CPSZoneDisc_computeCollisions)
 	MINI_TIMER(PSStatsZoneDisc)
@@ -477,9 +477,9 @@ void CPSZoneDisc::computeCollisions(CPSLocated &target, uint firstInstanceIndex,
 		const float epsilon = 0.5f * PSCollideEpsilon;
 
 		// deals with each particle
-		const NLMISC::CVector *itPosBefore = posBefore + firstInstanceIndex;
-		const NLMISC::CVector *itPosBeforeEnd = posBefore + target.getSize();
-		const NLMISC::CVector *itPosAfter = posAfter + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBefore = posBefore + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBeforeEnd = posBefore + target.getSize();
+		const NLMISC::CVectorPacked *itPosAfter = posAfter + firstInstanceIndex;
 		while (itPosBefore != itPosBeforeEnd)
 		{
 			float posSide = p * *itPosBefore;
@@ -503,7 +503,7 @@ void CPSZoneDisc::computeCollisions(CPSLocated &target, uint firstInstanceIndex,
 				hitRadius2 = (ci.NewPos - center) * (ci.NewPos - center);
 				if (hitRadius2 < radiusIt->R2) // check collision against disc
 				{
-					const CVector &speed = target.getSpeed()[(uint32)(itPosBefore - posBefore)];
+					const CVector speed = target.getSpeed()[(uint32)(itPosBefore - posBefore)];
 					ci.NewSpeed = _BounceFactor * (speed - 2.0f * (speed * p.getNormal()) * p.getNormal());
 					ci.CollisionZone = this;
 					CPSLocated::_Collisions[itPosBefore - posBefore].update(ci);
@@ -847,7 +847,7 @@ void CPSZoneCylinder::performMotion(TAnimationTime ellapsedTime)
 */
 
 
-void CPSZoneCylinder::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter)
+void CPSZoneCylinder::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter)
 {
 	NL_PS_FUNC(CPSZoneCylinder_computeCollisions)
 	MINI_TIMER(PSStatsZoneCylinder)
@@ -873,9 +873,9 @@ void CPSZoneCylinder::computeCollisions(CPSLocated &target, uint firstInstanceIn
 		CVector destProjectedPos, destTPos;
 		// deals with each particle
 		// deals with each particle
-		const NLMISC::CVector *itPosBefore = posBefore + firstInstanceIndex;
-		const NLMISC::CVector *itPosBeforeEnd = posBefore + target.getSize();
-		const NLMISC::CVector *itPosAfter = posAfter + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBefore = posBefore + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBeforeEnd = posBefore + target.getSize();
+		const NLMISC::CVectorPacked *itPosAfter = posAfter + firstInstanceIndex;
 		while (itPosBefore != itPosBeforeEnd)
 		{
 			const CVector &pos = *itPosBefore;
@@ -1123,7 +1123,7 @@ void CPSZoneCylinder::deleteElement(uint32 index)
 //	implementation of CPSZoneRectangle      //
 //////////////////////////////////////////////
 
-void CPSZoneRectangle::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVector *posBefore, const NLMISC::CVector *posAfter)
+void CPSZoneRectangle::computeCollisions(CPSLocated &target, uint firstInstanceIndex, const NLMISC::CVectorPacked *posBefore, const NLMISC::CVectorPacked *posAfter)
 {
 	NL_PS_FUNC(CPSZoneRectangle_computeCollisions)
 	MINI_TIMER(PSStatsZoneRectangle)
@@ -1149,9 +1149,9 @@ void CPSZoneRectangle::computeCollisions(CPSLocated &target, uint firstInstanceI
 		p.make(X ^ Y, center);
 		// deals with each particle
 		const float epsilon = 0.5f * PSCollideEpsilon;
-		const NLMISC::CVector *itPosBefore = posBefore + firstInstanceIndex;
-		const NLMISC::CVector *itPosBeforeEnd = posBefore + target.getSize();
-		const NLMISC::CVector *itPosAfter = posAfter + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBefore = posBefore + firstInstanceIndex;
+		const NLMISC::CVectorPacked *itPosBeforeEnd = posBefore + target.getSize();
+		const NLMISC::CVectorPacked *itPosAfter = posAfter + firstInstanceIndex;
 		while (itPosBefore != itPosBeforeEnd)
 		{
 			float posSide = p * *itPosBefore;

From 7c7db53c72e97b9d595ef484b9f93389d5d2b16f Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 04:09:14 +0200
Subject: [PATCH 10/21] SSE2: Another workaround for a stupid uint8 alloc

--HG--
branch : sse2
---
 code/nel/src/3d/ps_face_look_at.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/nel/src/3d/ps_face_look_at.cpp b/code/nel/src/3d/ps_face_look_at.cpp
index 782d5c70d..f5063f7f2 100644
--- a/code/nel/src/3d/ps_face_look_at.cpp
+++ b/code/nel/src/3d/ps_face_look_at.cpp
@@ -33,8 +33,8 @@ namespace NL3D
   */
 struct CLookAtAlign
 {
-	CVector I;
-	CVector K;
+	CVectorPacked I; // NOTE(review): switched to the packed type; presumably instances are placed in storage that is not 16-byte aligned (the commit subject mentions a uint8 allocation) — confirm
+	CVectorPacked K; // packed for the same reason as I
 };
 
 

From 1ceaed828ad41644731403107ca4869e91273b86 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 04:17:38 +0200
Subject: [PATCH 11/21] SSE2: More alignment fixes

--HG--
branch : sse2
---
 code/nel/include/nel/sound/clustered_sound.h |  2 ++
 code/nel/src/3d/ps_mesh.cpp                  | 26 ++++++++++----------
 code/nel/src/sound/clustered_sound.cpp       |  3 ++-
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/code/nel/include/nel/sound/clustered_sound.h b/code/nel/include/nel/sound/clustered_sound.h
index ed674f879..d9d2c47f2 100644
--- a/code/nel/include/nel/sound/clustered_sound.h
+++ b/code/nel/include/nel/sound/clustered_sound.h
@@ -50,6 +50,7 @@ class CClusteredSound
 {
 public:
 	/// This structure contain data about sound status in a cluster
+	NL_ALIGN_SSE2(16)
 	struct CClusterSoundStatus
 	{
 		/// The relative gain of sound in the cluster
@@ -78,6 +79,7 @@ public:
 	typedef std::map<NL3D::CCluster*, CClusterSoundStatus>	TClusterStatusMap;
 
 	/// This structure is used when we traverse the cluster/portal graph.
+	NL_ALIGN_SSE2(16)
 	struct CSoundTravContext
 	{
 		/// The current gain.
diff --git a/code/nel/src/3d/ps_mesh.cpp b/code/nel/src/3d/ps_mesh.cpp
index ddbf024fb..a8c057497 100644
--- a/code/nel/src/3d/ps_mesh.cpp
+++ b/code/nel/src/3d/ps_mesh.cpp
@@ -660,9 +660,9 @@ public:
 								CHECK_VERTEX_BUFFER(outVb,	  outVertex + outNormalOff);
 
 								// translate and resize the vertex (relatively to the mesh origin)
-								*(CVectorPacked *) outVertex = *posIt + sM * *(CVector *) inVertex;
+								*(CVectorPacked *) outVertex = *posIt + sM * CVector(*(CVectorPacked *) inVertex);
 								// copy the normal
-								*(CVectorPacked *) (outVertex + outNormalOff) = M * *(CVector *) (inVertex + inNormalOff);
+								*(CVectorPacked *) (outVertex + outNormalOff) = M * CVector(*(CVectorPacked *) (inVertex + inNormalOff));
 
 
 								inVertex  += inVSize;
@@ -683,7 +683,7 @@ public:
 								CHECK_VERTEX_BUFFER(outVb, outVertex);
 
 								// translate and resize the vertex (relatively to the mesh origin)
-								*(CVectorPacked *) outVertex = *posIt + sM * *(CVector *) inVertex;
+								*(CVectorPacked *) outVertex = *posIt + sM * CVector(*(CVectorPacked *) inVertex);
 
 								inVertex  += inVSize;
 								outVertex += outVSize;
@@ -774,10 +774,10 @@ public:
 								CHECK_VERTEX_BUFFER(outVb,	  outVertex + outNormalOff);
 
 								// morph, and transform the vertex
-								*(CVectorPacked *) outVertex = *posIt + sM * (opLambda * *(CVector *) m0 + lambda * *(CVector *) m1);
+								*(CVectorPacked *) outVertex = *posIt + sM * (opLambda * CVector(*(CVectorPacked *) m0) + lambda * CVector(*(CVectorPacked *) m1));
 								// morph, and transform the normal
-								*(CVectorPacked *) (outVertex + outNormalOff) = M * (opLambda * *(CVector *) (m0 + inNormalOff)
-																			  + lambda * *(CVector *) (m1 + inNormalOff)).normed();
+								*(CVectorPacked *) (outVertex + outNormalOff) = M * (opLambda * CVector(*(CVectorPacked *) (m0 + inNormalOff))
+																			  + lambda * CVector(*(CVectorPacked *) (m1 + inNormalOff))).normed();
 
 
 								m0  += inVSize;
@@ -799,7 +799,7 @@ public:
 								CHECK_VERTEX_BUFFER((*inVB1),	  m1);
 								CHECK_VERTEX_BUFFER(outVb, outVertex);
 								// morph, and transform the vertex
-								*(CVectorPacked *) outVertex = *posIt + sM * (opLambda * *(CVector *) m0 + opLambda * *(CVector *) m1);
+								*(CVectorPacked *) outVertex = *posIt + sM * (opLambda * CVector(*(CVectorPacked *) m0) + lambda * CVector(*(CVectorPacked *) m1)); // weight m1 by lambda, mirroring the morph used in the normal-carrying path
 
 								m0  += inVSize;
 								m1  += inVSize;
@@ -948,9 +948,9 @@ public:
 
 
 							// translate and resize the vertex (relatively to the mesh origin)
-							*(CVector *)  outVertex						 = *posIt + *ptCurrSize * *(CVector *) inVertex;
+							*(CVectorPacked *)  outVertex						 = *posIt + *ptCurrSize * CVector(*(CVectorPacked *) inVertex);
 							// copy the normal
-							*(CVector *)  (outVertex + normalOff ) = *(CVector *) (inVertex + pNormalOff);
+							*(CVectorPacked *)  (outVertex + normalOff ) = *(CVectorPacked *) (inVertex + pNormalOff);
 							inVertex  += inVSize;
 							outVertex += outVSize;
 						}
@@ -963,7 +963,7 @@ public:
 							// translate and resize the vertex (relatively to the mesh origin)
 							CHECK_VERTEX_BUFFER(outVb, outVertex);
 							CHECK_VERTEX_BUFFER(prerotVb, inVertex);
-							*(CVector *)  outVertex = *posIt + *ptCurrSize * *(CVector *) inVertex;
+							*(CVectorPacked *)  outVertex = *posIt + *ptCurrSize * CVector(*(CVectorPacked *) inVertex);
 							inVertex  += inVSize;
 							outVertex += outVSize;
 						}
@@ -1684,8 +1684,8 @@ CVertexBuffer &CPSConstraintMesh::makePrerotatedVb(const CVertexBuffer &inVb)
 				CHECK_VERTEX_BUFFER(prerotatedVb, outVertex);
 				CHECK_VERTEX_BUFFER(prerotatedVb, outVertex + pNormalOff);
 
-				* (CVectorPacked *) outVertex =  mat.mulVector(* (CVector *) inVertex);
-				* (CVectorPacked *) (outVertex + normalOff) =  mat.mulVector(* (CVector *) (inVertex + pNormalOff) );
+				* (CVectorPacked *) outVertex =  mat.mulVector(* (CVectorPacked *) inVertex);
+				* (CVectorPacked *) (outVertex + normalOff) =  mat.mulVector(* (CVectorPacked *) (inVertex + pNormalOff) );
 				outVertex += vpSize;
 				inVertex  += vSize;
 
@@ -1701,7 +1701,7 @@ CVertexBuffer &CPSConstraintMesh::makePrerotatedVb(const CVertexBuffer &inVb)
 				CHECK_VERTEX_BUFFER(prerotatedVb, outVertex);
 				CHECK_VERTEX_BUFFER(inVb, inVertex);
 
-				* (CVectorPacked *) outVertex =  mat.mulVector(* (CVector *) inVertex);
+				* (CVectorPacked *) outVertex =  mat.mulVector(* (CVectorPacked *) inVertex);
 				outVertex += vpSize;
 				inVertex += vSize;
 			}
diff --git a/code/nel/src/sound/clustered_sound.cpp b/code/nel/src/sound/clustered_sound.cpp
index 25f0e5b64..0ccfd0d59 100644
--- a/code/nel/src/sound/clustered_sound.cpp
+++ b/code/nel/src/sound/clustered_sound.cpp
@@ -881,7 +881,8 @@ bool CClusteredSound::addAudibleCluster(CCluster *cluster, CClusterSoundStatus &
 	}
 	else
 	{
-		_AudibleClusters.insert(make_pair(cluster, soundStatus));
+		//_AudibleClusters.insert(make_pair(cluster, soundStatus));
+		_AudibleClusters[cluster] = soundStatus;
 		return true;
 	}
 

From 35737498b5c2460da773a3a9c646affe1c4ec3a4 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 05:03:55 +0200
Subject: [PATCH 12/21] SSE2: Implement CVector

--HG--
branch : sse2
---
 code/nel/include/nel/misc/vector.h        | 10 ++--
 code/nel/include/nel/misc/vector_inline.h | 57 +++++++++++++++++++++++
 2 files changed, 64 insertions(+), 3 deletions(-)

diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index d499a5dba..f9667e1c7 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -69,11 +69,15 @@ public:		// Methods.
 	/// @name Object.
 	//@{
 	/// Constructor which does nothing.
-	CVector() { if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); }
+	CVector() { /*if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error");*/ }
 	/// Constructor .
-	CVector(float	_x, float _y, float _z) : x(_x), y(_y), z(_z) { if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); }
+	CVector(float	_x, float _y, float _z) : x(_x), y(_y), z(_z) { /*if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error");*/ }
 	/// Copy Constructor.
-	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) { if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); }
+#ifdef USE_SSE2
+	CVector(const CVector &v) : mm(v.mm) { /*if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error");*/ }
+#else
+	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) { }
+#endif
 	//@}
 
 	/// @name Base Maths.
diff --git a/code/nel/include/nel/misc/vector_inline.h b/code/nel/include/nel/misc/vector_inline.h
index 61f20e367..270608af0 100644
--- a/code/nel/include/nel/misc/vector_inline.h
+++ b/code/nel/include/nel/misc/vector_inline.h
@@ -125,15 +125,43 @@ inline CVector	operator*(float f, const CVector &v)
 #endif
 }
 
+#ifdef USE_SSE2
+inline __m128 dotsplat(const __m128 &l, const __m128 &r)
+{
+	// TODO: _mm_hadd_ps SSE3
+
+	__m128 mult = _mm_mul_ps(l, r);
+	__m128 vx = _mm_shuffle_ps(mult, mult, _MM_SHUFFLE(0, 0, 0, 0));
+	__m128 vy = _mm_shuffle_ps(mult, mult, _MM_SHUFFLE(1, 1, 1, 1));
+	__m128 vz = _mm_shuffle_ps(mult, mult, _MM_SHUFFLE(2, 2, 2, 2));
+	__m128 result = _mm_add_ps(_mm_add_ps(vx, vy), vz);
+	return result;
+}
+#endif
 
 // ============================================================================================
 // Advanced Maths.
 inline	float	CVector::operator*(const CVector &v) const
 {
+#ifdef USE_SSE2
+	return _mm_cvtss_f32(dotsplat(mm, v.mm));
+#else
 	return x*v.x + y*v.y + z*v.z;
+#endif
 }
 inline	CVector	CVector::operator^(const CVector &v) const
 {
+#ifdef USE_SSE2
+	CVector res;
+	__m128 l = _mm_shuffle_ps(mm, mm, _MM_SHUFFLE(3, 0, 2, 1));
+	__m128 r = _mm_shuffle_ps(v.mm, v.mm, _MM_SHUFFLE(3, 1, 0, 2));
+	__m128 mul1 = _mm_mul_ps(l, r);
+	l = _mm_shuffle_ps(mm, mm, _MM_SHUFFLE(3, 1, 0, 2));
+	r = _mm_shuffle_ps(v.mm, v.mm, _MM_SHUFFLE(3, 0, 2, 1));
+	__m128 mul2 = _mm_mul_ps(l, r);
+	res.mm = _mm_sub_ps(mul1, mul2);
+	return res;
+#else
 	CVector	ret;
 
 	ret.x= y*v.z - z*v.y;
@@ -141,27 +169,48 @@ inline	CVector	CVector::operator^(const CVector &v) const
 	ret.z= x*v.y - y*v.x;
 
 	return ret;
+#endif
 }
 inline	float	CVector::sqrnorm() const
 {
+#ifdef USE_SSE2
+	return _mm_cvtss_f32(dotsplat(mm, mm));
+#else
 	return (float)(x*x + y*y + z*z);
+#endif
 }
 inline	float	CVector::norm() const
 {
+#ifdef USE_SSE2
+	return sqrt(_mm_cvtss_f32(dotsplat(mm, mm)));
+#else
 	return (float)sqrt(x*x + y*y + z*z);
+#endif
 }
 inline	void	CVector::normalize()
 {
+#ifdef USE_SSE2
+	__m128 normsplat = _mm_sqrt_ps(dotsplat(mm, mm));
+	mm = _mm_div_ps(mm, normsplat);
+#else
 	float	n=norm();
 	if(n)
 		*this/=n;
+#endif
 }
 inline	CVector	CVector::normed() const
 {
+#ifdef USE_SSE2
+	CVector res;
+	__m128 normsplat = _mm_sqrt_ps(dotsplat(mm, mm));
+	res.mm = _mm_div_ps(mm, normsplat);
+	return res;
+#else
 	CVector	ret;
 	ret= *this;
 	ret.normalize();
 	return ret;
+#endif
 }
 
 
@@ -219,15 +268,23 @@ inline	void	CVector::sphericToCartesian(float r, float theta,float phi)
 }
 inline	void	CVector::minof(const CVector &a, const CVector &b)
 {
+#ifdef USE_SSE2
+	mm = _mm_min_ps(a.mm, b.mm);
+#else
 	x= std::min(a.x, b.x);
 	y= std::min(a.y, b.y);
 	z= std::min(a.z, b.z);
+#endif
 }
 inline	void	CVector::maxof(const CVector &a, const CVector &b)
 {
+#ifdef USE_SSE2
+	mm = _mm_max_ps(a.mm, b.mm);
+#else
 	x= std::max(a.x, b.x);
 	y= std::max(a.y, b.y);
 	z= std::max(a.z, b.z);
+#endif
 }
 inline	void	CVector::serial(IStream &f)
 {

From 00b8ad4c914ea0ebe967cf9deeadb02f987910e5 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 05:21:27 +0200
Subject: [PATCH 13/21] SSE2: More alignment workarounds

--HG--
branch : sse2
---
 .../nel/3d/ps_attrib_maker_bin_op_inline.h    |  2 +-
 code/nel/include/nel/3d/ps_plane_basis.h      |  6 ++--
 code/nel/include/nel/misc/vector.h            | 28 +++++++++++++++++++
 code/nel/src/3d/ps_mesh.cpp                   |  4 +--
 4 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h b/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h
index 0070ffb38..673925643 100644
--- a/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h
+++ b/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h
@@ -43,7 +43,7 @@ template <>
 inline CPlaneBasis PSBinOpModulate(const CPlaneBasis &p1, const CPlaneBasis &p2)
 {
 	// we compute p1 * p2
-	NLMISC::CVector z = p1.X ^ p1.Y;
+	NLMISC::CVector z = CVector(p1.X) ^ CVector(p1.Y);
 	CPlaneBasis r;
 	r.X.x = p2.X.x * p1.X.x + p2.X.y * p1.Y.x + p2.X.z * z.x;
 	r.X.y = p2.X.x * p1.X.y + p2.X.y * p1.Y.y + p2.X.z * z.y;
diff --git a/code/nel/include/nel/3d/ps_plane_basis.h b/code/nel/include/nel/3d/ps_plane_basis.h
index 41882e148..2c8fd07c4 100644
--- a/code/nel/include/nel/3d/ps_plane_basis.h
+++ b/code/nel/include/nel/3d/ps_plane_basis.h
@@ -37,8 +37,8 @@ namespace NL3D {
 
 struct CPlaneBasis
 {
-	NLMISC::CVector X ;
-	NLMISC::CVector Y ;
+	NLMISC::CVectorPacked X ;
+	NLMISC::CVectorPacked Y ;
 
 
 	// default ctor
@@ -62,7 +62,7 @@ struct CPlaneBasis
 	/// compute the normal of the plane basis
 	NLMISC::CVector getNormal(void) const
 	{
-		return X ^ Y ;
+		return CVector(X) ^ CVector(Y) ;
 	}
 
 
diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index f9667e1c7..1d4ef3fe4 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -212,6 +212,34 @@ public:
 	{
 		return CVector(*this) - v;
 	}
+
+	bool	operator==(const CVectorPacked &v) const
+	{
+		return x==v.x && y==v.y && z==v.z;
+	}
+	bool	operator!=(const CVectorPacked &v) const
+	{
+		return !(*this==v);
+	}
+	bool	operator<(const CVectorPacked &v) const
+	{
+		if(x!=v.x)
+			return x<v.x;
+		if(y!=v.y)
+			return y<v.y;
+		return z<v.z;
+	}
+	
+	CVector	operator^(const CVector &v) const
+	{
+		CVector	ret;
+
+		ret.x= y*v.z - z*v.y;
+		ret.y= z*v.x - x*v.z;
+		ret.z= x*v.y - y*v.x;
+
+		return ret;
+	}
 };
 
 // blend (faster version than the generic version found in algo.h)
diff --git a/code/nel/src/3d/ps_mesh.cpp b/code/nel/src/3d/ps_mesh.cpp
index a8c057497..2a875588a 100644
--- a/code/nel/src/3d/ps_mesh.cpp
+++ b/code/nel/src/3d/ps_mesh.cpp
@@ -404,8 +404,8 @@ void CPSMesh::updatePos()
 
 
 
-			mat.setRot( ptBasis->X * CPSUtil::getCos((sint32) *ptCurrAngle) + ptBasis->Y * CPSUtil::getSin((sint32) *ptCurrAngle)
-						, ptBasis->X * CPSUtil::getCos((sint32) *ptCurrAngle + 64) + ptBasis->Y * CPSUtil::getSin((sint32) *ptCurrAngle + 64)
+			mat.setRot( CVector(ptBasis->X) * CPSUtil::getCos((sint32) *ptCurrAngle) + CVector(ptBasis->Y) * CPSUtil::getSin((sint32) *ptCurrAngle)
+						, CVector(ptBasis->X) * CPSUtil::getCos((sint32) *ptCurrAngle + 64) + CVector(ptBasis->Y) * CPSUtil::getSin((sint32) *ptCurrAngle + 64)
 						, ptBasis->X ^ ptBasis->Y
 					  );
 

From d3847e10ccb132ba2738462a58dc2ef1f6610d8d Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 05:36:43 +0200
Subject: [PATCH 14/21] SSE2: Workaround alignment issue related to std::pair

--HG--
branch : sse2
---
 code/nel/include/nel/sound/clustered_sound.h | 16 +++++++---------
 code/nel/src/sound/audio_mixer_user.cpp      |  2 +-
 code/nel/src/sound/clustered_sound.cpp       |  9 ++++-----
 code/nel/src/sound/simple_source.cpp         |  2 +-
 code/nel/src/sound/stream_source.cpp         |  2 +-
 code/ryzom/client/src/sound_manager.cpp      |  2 +-
 6 files changed, 15 insertions(+), 18 deletions(-)

diff --git a/code/nel/include/nel/sound/clustered_sound.h b/code/nel/include/nel/sound/clustered_sound.h
index d9d2c47f2..da7cdb12f 100644
--- a/code/nel/include/nel/sound/clustered_sound.h
+++ b/code/nel/include/nel/sound/clustered_sound.h
@@ -50,7 +50,6 @@ class CClusteredSound
 {
 public:
 	/// This structure contain data about sound status in a cluster
-	NL_ALIGN_SSE2(16)
 	struct CClusterSoundStatus
 	{
 		/// The relative gain of sound in the cluster
@@ -60,11 +59,11 @@ public:
 		/// The ratio distance/max earing distance
 		float			DistFactor;
 		/// The sound virtual position (in fact Dist * Direction)
-		NLMISC::CVector	Position;
+		NLMISC::CVectorPacked	Position;
 		/// The blending factor between real sound pos and virtual pos (1 mean virtual pos, 0 mean real pos).
 		float			PosAlpha;
 		/// The direction vector for the virtual sound source.
-		NLMISC::CVector	Direction;
+		NLMISC::CVectorPacked	Direction;
 		/// The occlusion att.
 		sint32			Occlusion;
 		/// The occlusion LF factor (see EAX spec)
@@ -79,7 +78,6 @@ public:
 	typedef std::map<NL3D::CCluster*, CClusterSoundStatus>	TClusterStatusMap;
 
 	/// This structure is used when we traverse the cluster/portal graph.
-	NL_ALIGN_SSE2(16)
 	struct CSoundTravContext
 	{
 		/// The current gain.
@@ -99,18 +97,18 @@ public:
 		/// A blending factor to compute virtual source position.
 		float			Alpha;
 		/// The direction vector from listener to the first portal/cluster
-		NLMISC::CVector	Direction1;
+		NLMISC::CVectorPacked	Direction1;
 		/// The direction vector from the first portal/cluster to the second one.
-		NLMISC::CVector	Direction2;
+		NLMISC::CVectorPacked	Direction2;
 		/// The current blended direction used to place vitual source.
-		NLMISC::CVector	Direction;
+		NLMISC::CVectorPacked	Direction;
 		/// The previously traversed cluster. Used to stop back traversal.
 		NL3D::CCluster	*PreviousCluster;
 		/// The previous sound propagation vector
-		NLMISC::CVector	PreviousVector;
+		NLMISC::CVectorPacked	PreviousVector;
 
 		/// The last pseudo listener position
-		NLMISC::CVector	ListenerPos;
+		NLMISC::CVectorPacked	ListenerPos;
 
 		/// Constructor. Init all default value.
 		CSoundTravContext(const NLMISC::CVector &listenerPos,
diff --git a/code/nel/src/sound/audio_mixer_user.cpp b/code/nel/src/sound/audio_mixer_user.cpp
index 3a75433f4..4807581f0 100644
--- a/code/nel/src/sound/audio_mixer_user.cpp
+++ b/code/nel/src/sound/audio_mixer_user.cpp
@@ -1705,7 +1705,7 @@ void				CAudioMixerUser::update()
 						{
 							// there is some data here, update the virtual position of the sound.
 							float dist = (css->Position - source->getPos()).norm();
-							CVector vpos(_ListenPosition + css->Direction * (css->Dist + dist));
+							CVector vpos(_ListenPosition + CVector(css->Direction) * (css->Dist + dist));
 //							_Tracks[i]->DrvSource->setPos(source->getPos() * (1-css->PosAlpha) + css->Position*(css->PosAlpha));
 							_Tracks[i]->getPhysicalSource()->setPos(source->getPos() * (1-css->PosAlpha) + vpos*(css->PosAlpha));
 							// update the relative gain
diff --git a/code/nel/src/sound/clustered_sound.cpp b/code/nel/src/sound/clustered_sound.cpp
index 0ccfd0d59..8dbc12264 100644
--- a/code/nel/src/sound/clustered_sound.cpp
+++ b/code/nel/src/sound/clustered_sound.cpp
@@ -263,7 +263,7 @@ void CClusteredSound::update(const CVector &listenerPos, const CVector &/* view
 				{
 					// this one is better !
 					cs.Distance = css.Dist;
-					cs.Source->setPos(listenerPos + css.Direction * css.Dist + CVector(0,0,2));
+					cs.Source->setPos(listenerPos + CVector(css.Direction) * css.Dist + CVector(0,0,2));
 					if (css.DistFactor < 1.0f)
 						cs.Source->setRelativeGain(css.Gain * (1.0f - (css.DistFactor*css.DistFactor*css.DistFactor*css.DistFactor)));
 					else
@@ -289,7 +289,7 @@ void CClusteredSound::update(const CVector &listenerPos, const CVector &/* view
 					cs.Source = CAudioMixerUser::instance()->createSource(soundName, false, NULL, NULL, cluster);
 					if (cs.Source != 0)
 					{
-						cs.Source->setPos(listenerPos + css.Direction * css.Dist + CVector(0,0,2));
+						cs.Source->setPos(listenerPos + CVector(css.Direction) * css.Dist + CVector(0,0,2));
 						if (css.DistFactor < 1.0f)
 							cs.Source->setRelativeGain(css.Gain * (1.0f - (css.DistFactor*css.DistFactor/**css.DistFactor*css.DistFactor*/)));
 						else
@@ -867,7 +867,7 @@ bool CClusteredSound::addAudibleCluster(CCluster *cluster, CClusterSoundStatus &
 {
 	TClusterStatusMap::iterator it(_AudibleClusters.find(cluster));
 	nlassert(soundStatus.Dist < _MaxEarDistance);
-	nlassert(soundStatus.Direction.norm() <= 1.01f);
+	nlassert(CVector(soundStatus.Direction).norm() <= 1.01f);
 
 	if (it != _AudibleClusters.end())
 	{
@@ -881,8 +881,7 @@ bool CClusteredSound::addAudibleCluster(CCluster *cluster, CClusterSoundStatus &
 	}
 	else
 	{
-		//_AudibleClusters.insert(make_pair(cluster, soundStatus));
-		_AudibleClusters[cluster] = soundStatus;
+		_AudibleClusters.insert(make_pair(cluster, soundStatus));
 		return true;
 	}
 
diff --git a/code/nel/src/sound/simple_source.cpp b/code/nel/src/sound/simple_source.cpp
index 2c9dda86c..0bf4eba8d 100644
--- a/code/nel/src/sound/simple_source.cpp
+++ b/code/nel/src/sound/simple_source.cpp
@@ -109,7 +109,7 @@ CVector CSimpleSource::getVirtualPos() const
 		{
 			// there is some data here, update the virtual position of the sound.
 			float dist = (css->Position - getPos()).norm();
-			CVector vpos(CAudioMixerUser::instance()->getListenPosVector() + css->Direction * (css->Dist + dist));
+			CVector vpos(CAudioMixerUser::instance()->getListenPosVector() + CVector(css->Direction) * (css->Dist + dist));
 			vpos = _Position * (1-css->PosAlpha) + vpos*(css->PosAlpha);
 			return vpos;
 		}
diff --git a/code/nel/src/sound/stream_source.cpp b/code/nel/src/sound/stream_source.cpp
index 9bd48ff25..ece500cd0 100644
--- a/code/nel/src/sound/stream_source.cpp
+++ b/code/nel/src/sound/stream_source.cpp
@@ -134,7 +134,7 @@ CVector CStreamSource::getVirtualPos() const
 		{
 			// there is some data here, update the virtual position of the sound.
 			float dist = (css->Position - getPos()).norm();
-			CVector vpos(CAudioMixerUser::instance()->getListenPosVector() + css->Direction * (css->Dist + dist));
+			CVector vpos(CAudioMixerUser::instance()->getListenPosVector() + CVector(css->Direction) * (css->Dist + dist));
 			vpos = _Position * (1-css->PosAlpha) + vpos*(css->PosAlpha);
 			return vpos;
 		}
diff --git a/code/ryzom/client/src/sound_manager.cpp b/code/ryzom/client/src/sound_manager.cpp
index f87c6cdee..7c5589432 100644
--- a/code/ryzom/client/src/sound_manager.cpp
+++ b/code/ryzom/client/src/sound_manager.cpp
@@ -343,7 +343,7 @@ void CSoundManager::drawSounds(float camHeight)
 			const CClusteredSound::CClusterSoundStatus &css = first->second;
 			if (css.Direction != CVector::Null)
 			{
-				CVector dest = pos+css.Direction*css.Dist;
+				CVector dest = pos+CVector(css.Direction)*css.Dist;
 
 				NL3D::CDRU::drawLine(pos, dest, CRGBA(0,255,255,255), *idriver);
 				NL3D::CDRU::drawLine(dest+CVector(0.5f,0.5f,0), dest+CVector(-0.5f,-0.5f,0), CRGBA(0, 255,255,255), *idriver);

From d787c54567d9ec3ee9dce298be8a9879db049a27 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 14:01:26 +0200
Subject: [PATCH 15/21] SSE2: Implement alignment for arena allocator

--HG--
branch : sse2
---
 .../nel/include/nel/misc/fixed_size_allocator.h |  1 +
 code/nel/src/misc/fixed_size_allocator.cpp      | 17 ++++++++++++-----
 code/nel/src/misc/object_arena_allocator.cpp    | 14 ++++++++------
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/code/nel/include/nel/misc/fixed_size_allocator.h b/code/nel/include/nel/misc/fixed_size_allocator.h
index 9eb1d8a10..80b9ed491 100644
--- a/code/nel/include/nel/misc/fixed_size_allocator.h
+++ b/code/nel/include/nel/misc/fixed_size_allocator.h
@@ -53,6 +53,7 @@ public:
 	uint getNumAllocatedBlocks() const { return _NumAlloc; }
 private:
 	class CChunk;
+	NL_ALIGN(NL_DEFAULT_MEMORY_ALIGNMENT)
 	class CNode
 	{
 	public:
diff --git a/code/nel/src/misc/fixed_size_allocator.cpp b/code/nel/src/misc/fixed_size_allocator.cpp
index 790275ec6..30693ddfd 100644
--- a/code/nel/src/misc/fixed_size_allocator.cpp
+++ b/code/nel/src/misc/fixed_size_allocator.cpp
@@ -33,6 +33,9 @@ CFixedSizeAllocator::CFixedSizeAllocator(uint numBytesPerBlock, uint numBlockPer
 	_NumChunks = 0;
 	nlassert(numBytesPerBlock > 1);
 	_NumBytesPerBlock = numBytesPerBlock;
+	const uint mask = NL_DEFAULT_MEMORY_ALIGNMENT - 1;
+	_NumBytesPerBlock = (_NumBytesPerBlock + mask) & ~mask;
+	nlassert(_NumBytesPerBlock >= numBytesPerBlock);
 	_NumBlockPerChunk = std::max(numBlockPerChunk, (uint) 3);
 	_NumAlloc = 0;
 }
@@ -67,12 +70,14 @@ void *CFixedSizeAllocator::alloc()
 	return _FreeSpace->unlink();
 }
 
+#define aligned_offsetof(s, m) ((offsetof(s, m) + (NL_DEFAULT_MEMORY_ALIGNMENT - 1)) & ~(NL_DEFAULT_MEMORY_ALIGNMENT - 1))
+
 // *****************************************************************************************************************
 void CFixedSizeAllocator::free(void *block)
 {
 	if (!block) return;
 	/// get the node from the object
-	CNode *node = (CNode *) ((uint8 *) block - offsetof(CNode, Next));
+	CNode *node = (CNode *) ((uint8 *) block - aligned_offsetof(CNode, Next));
 	//
 	nlassert(node->Chunk != NULL);
 	nlassert(node->Chunk->Allocator == this);
@@ -84,7 +89,9 @@ void CFixedSizeAllocator::free(void *block)
 // *****************************************************************************************************************
 uint CFixedSizeAllocator::CChunk::getBlockSizeWithOverhead() const
 {
-	return std::max((uint)(sizeof(CNode) - offsetof(CNode, Next)),(uint)(Allocator->getNumBytesPerBlock())) + offsetof(CNode, Next);
+	nlctassert((sizeof(CNode) % NL_DEFAULT_MEMORY_ALIGNMENT) == 0);
+	return std::max((uint)(sizeof(CNode) - aligned_offsetof(CNode, Next)),
+		(uint)(Allocator->getNumBytesPerBlock())) + aligned_offsetof(CNode, Next);
 }
 
 // *****************************************************************************************************************
@@ -105,7 +112,7 @@ CFixedSizeAllocator::CChunk::~CChunk()
 	nlassert(NumFreeObjs == 0);
 	nlassert(Allocator->_NumChunks > 0);
 	-- (Allocator->_NumChunks);
-	delete[] Mem;
+	aligned_free(Mem); //delete[] Mem;
 }
 
 // *****************************************************************************************************************
@@ -115,7 +122,7 @@ void CFixedSizeAllocator::CChunk::init(CFixedSizeAllocator *alloc)
 	nlassert(alloc != NULL);
 	Allocator = alloc;
 	//
-	Mem = new uint8[getBlockSizeWithOverhead() * alloc->getNumBlockPerChunk()];
+	Mem = (uint8 *)aligned_malloc(getBlockSizeWithOverhead() * alloc->getNumBlockPerChunk(), NL_DEFAULT_MEMORY_ALIGNMENT); // new uint8[getBlockSizeWithOverhead() * alloc->getNumBlockPerChunk()];
 	//
 	getNode(0).Chunk = this;
 	getNode(0).Next = &getNode(1);
@@ -179,7 +186,7 @@ void *CFixedSizeAllocator::CNode::unlink()
 	*Prev = Next;
 	nlassert(Chunk->NumFreeObjs > 0);
 	Chunk->grab(); // tells the containing chunk that a node has been allocated
-	return (void *) &Next;
+	return (void *)((uintptr_t)(this) + aligned_offsetof(CNode, Next)); //(void *) &Next;
 }
 
 // *****************************************************************************************************************
diff --git a/code/nel/src/misc/object_arena_allocator.cpp b/code/nel/src/misc/object_arena_allocator.cpp
index 9c73f5059..8084b4ac9 100644
--- a/code/nel/src/misc/object_arena_allocator.cpp
+++ b/code/nel/src/misc/object_arena_allocator.cpp
@@ -68,21 +68,23 @@ void *CObjectArenaAllocator::alloc(uint size)
 	if (size >= _MaxAllocSize)
 	{
 		// use standard allocator
-		uint8 *block = new uint8[size + sizeof(uint)]; // an additionnal uint is needed to store size of block
+		nlctassert(NL_DEFAULT_MEMORY_ALIGNMENT > sizeof(uint));
+		uint8 *block = (uint8 *)aligned_malloc(NL_DEFAULT_MEMORY_ALIGNMENT + size, NL_DEFAULT_MEMORY_ALIGNMENT); // NL_DEFAULT_MEMORY_ALIGNMENT extra bytes are reserved in front of the returned pointer; the block size is stored at their start
 		if (!block) return NULL;
 		#ifdef NL_DEBUG
 			_MemBlockToAllocID[block] = _AllocID;
 		#endif
 		*(uint *) block = size;
-		return block + sizeof(uint);
+		return block + NL_DEFAULT_MEMORY_ALIGNMENT;
 	}
 	uint entry = ((size + (_Granularity - 1)) / _Granularity) ;
 	nlassert(entry < _ObjectSizeToAllocator.size());
 	if (!_ObjectSizeToAllocator[entry])
 	{
-		_ObjectSizeToAllocator[entry] = new CFixedSizeAllocator(entry * _Granularity + sizeof(uint), _MaxAllocSize / size); // an additionnal uint is needed to store size of block
+		_ObjectSizeToAllocator[entry] = new CFixedSizeAllocator(entry * _Granularity + NL_DEFAULT_MEMORY_ALIGNMENT, _MaxAllocSize / size); // NL_DEFAULT_MEMORY_ALIGNMENT extra bytes per block: the block size is stored at their start
 	}
 	void *block = _ObjectSizeToAllocator[entry]->alloc();
+	nlassert(((uintptr_t)block % NL_DEFAULT_MEMORY_ALIGNMENT) == 0);
 	#ifdef NL_DEBUG
 		if (block)
 		{
@@ -91,14 +93,14 @@ void *CObjectArenaAllocator::alloc(uint size)
 		++_AllocID;
 	#endif
 	*(uint *) block = size;
-	return (void *) ((uint8 *) block + sizeof(uint));
+	return (void *) ((uint8 *) block + NL_DEFAULT_MEMORY_ALIGNMENT);
 }
 
 // *****************************************************************************************************************
 void CObjectArenaAllocator::free(void *block)
 {
 	if (!block) return;
-	uint8 *realBlock = (uint8 *) block - sizeof(uint); // a uint is used at start of block to give its size
+	uint8 *realBlock = (uint8 *) block - NL_DEFAULT_MEMORY_ALIGNMENT; // step back over the NL_DEFAULT_MEMORY_ALIGNMENT-byte header; a uint at its start gives the block size
 	uint size = *(uint *) realBlock;
 	if (size >= _MaxAllocSize)
 	{
@@ -107,7 +109,7 @@ void CObjectArenaAllocator::free(void *block)
 				nlassert(it != _MemBlockToAllocID.end());
 				_MemBlockToAllocID.erase(it);
 		#endif
-		delete realBlock;
+		aligned_free(realBlock);
 		return;
 	}
 	uint entry = ((size + (_Granularity - 1)) / _Granularity);

From 7867db46542953b4adee4834b4bcc9f19353a4e6 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 14:23:51 +0200
Subject: [PATCH 16/21] SSE2: Non-SSE2 copy to avoid some issues with STL pairs

--HG--
branch : sse2
---
 code/nel/include/nel/misc/vector.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index 1d4ef3fe4..0d3216a18 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -73,10 +73,10 @@ public:		// Methods.
 	/// Constructor .
 	CVector(float	_x, float _y, float _z) : x(_x), y(_y), z(_z) { /*if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error");*/ }
 	/// Copy Constructor.
-#ifdef USE_SSE2
+#if 0
 	CVector(const CVector &v) : mm(v.mm) { /*if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error");*/ }
 #else
-	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) { }
+	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) { if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); }
 #endif
 	//@}
 

From 5ec363a8a921be49fde3e85c354bce3d2b1c88d0 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 15:05:52 +0200
Subject: [PATCH 17/21] SSE2: Don't divide by zero when normalizing

--HG--
branch : sse2
---
 code/nel/include/nel/misc/vector_inline.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/code/nel/include/nel/misc/vector_inline.h b/code/nel/include/nel/misc/vector_inline.h
index 270608af0..77d827c35 100644
--- a/code/nel/include/nel/misc/vector_inline.h
+++ b/code/nel/include/nel/misc/vector_inline.h
@@ -182,7 +182,7 @@ inline	float	CVector::sqrnorm() const
 inline	float	CVector::norm() const
 {
 #ifdef USE_SSE2
-	return sqrt(_mm_cvtss_f32(dotsplat(mm, mm)));
+	return sqrt(sqrnorm());
 #else
 	return (float)sqrt(x*x + y*y + z*z);
 #endif
@@ -191,7 +191,8 @@ inline	void	CVector::normalize()
 {
 #ifdef USE_SSE2
 	__m128 normsplat = _mm_sqrt_ps(dotsplat(mm, mm));
-	mm = _mm_div_ps(mm, normsplat);
+	if (_mm_cvtss_f32(normsplat))
+		mm = _mm_div_ps(mm, normsplat);
 #else
 	float	n=norm();
 	if(n)
@@ -203,7 +204,8 @@ inline	CVector	CVector::normed() const
 #ifdef USE_SSE2
 	CVector res;
 	__m128 normsplat = _mm_sqrt_ps(dotsplat(mm, mm));
-	res.mm = _mm_div_ps(mm, normsplat);
+	res.mm = mm; // default: a zero-length vector is returned unchanged, like the non-SSE2 path
+	if (_mm_cvtss_f32(normsplat)) res.mm = _mm_div_ps(mm, normsplat);
 	return res;
 #else
 	CVector	ret;

From d18159616e7d34220be85812d5f500fb53450edc Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 16:22:27 +0200
Subject: [PATCH 18/21] SSE2: Fix hopefully the last few alignment issues

--HG--
branch : sse2
---
 code/nel/include/nel/sound/audio_mixer_user.h         |  2 +-
 code/nel/include/nel/sound/background_sound_manager.h | 10 +++++-----
 code/nel/include/nel/sound/u_audio_mixer.h            |  8 +++++++-
 code/nel/src/sound/audio_mixer_user.cpp               | 10 +++++-----
 code/ryzom/client/src/sound_manager.cpp               |  4 ++--
 5 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/code/nel/include/nel/sound/audio_mixer_user.h b/code/nel/include/nel/sound/audio_mixer_user.h
index d8949c3d3..d152cd31b 100644
--- a/code/nel/include/nel/sound/audio_mixer_user.h
+++ b/code/nel/include/nel/sound/audio_mixer_user.h
@@ -471,7 +471,7 @@ protected:
 	/// Returns nb available tracks (or NULL)
 	void						getFreeTracks( uint nb, CTrack **tracks );
 	/// Fill a vector of position and mute flag for all playing sound source.
-	virtual void				getPlayingSoundsPos(bool virtualPos, std::vector<std::pair<bool, NLMISC::CVector> > &pos);
+	virtual void				getPlayingSoundsPos(bool virtualPos, std::vector<CPlayingSoundPos> &pos);
 
 	typedef CHashMap<NLMISC::TStringId, CControledSources, NLMISC::CStringIdHashMapTraits>	TUserVarControlsContainer;
 	/// Container for all user controler and currently controled playing source
diff --git a/code/nel/include/nel/sound/background_sound_manager.h b/code/nel/include/nel/sound/background_sound_manager.h
index 326ece0f3..06f15f80f 100644
--- a/code/nel/include/nel/sound/background_sound_manager.h
+++ b/code/nel/include/nel/sound/background_sound_manager.h
@@ -258,17 +258,17 @@ private:
 	struct TSoundStatus
 	{
 		/// The data of the sound.
-		TSoundData			&SoundData;
+		TSoundData				&SoundData;
 		/// The position of the source.
-		NLMISC::CVector		Position;
+		NLMISC::CVectorPacked	Position;
 		/** The relative gain of the source. This is used for patatoid competition.when
 		  * a smaller patatoid mute bigger one.
 		  */
-		float				Gain;
+		float					Gain;
 		/// The distance beween listener and source.
-		float				Distance;
+		float					Distance;
 		/// flag if inside a sound zone
-		bool				Inside;
+		bool					Inside;
 		/// Constructor.
 		TSoundStatus(TSoundData &sd, const NLMISC::CVector &position, float gain, float distance, bool inside)
 			: SoundData(sd), Position(position), Gain(gain), Distance(distance), Inside(inside)
diff --git a/code/nel/include/nel/sound/u_audio_mixer.h b/code/nel/include/nel/sound/u_audio_mixer.h
index 0d845f9c3..0b1bcc42a 100644
--- a/code/nel/include/nel/sound/u_audio_mixer.h
+++ b/code/nel/include/nel/sound/u_audio_mixer.h
@@ -336,7 +336,13 @@ public:
 	virtual uint		getMutedPlayingSourcesCount() const = 0;
 	/// Return a string showing the playing sources
 	virtual std::string	getSourcesStats() const = 0;
-	virtual void		getPlayingSoundsPos(bool virtualPos, std::vector<std::pair<bool, NLMISC::CVector> > &pos) =0;
+	struct CPlayingSoundPos // element type filled by getPlayingSoundsPos(); takes the place of std::pair<bool, NLMISC::CVector> and keeps its member names
+	{
+		CPlayingSoundPos(bool first_, const NLMISC::CVector &second_) : first(first_), second(second_) { }
+		bool first; // flag of the entry; CAudioMixerUser::getPlayingSoundsPos passes source->getTrack() == 0
+		NLMISC::CVector second; // position of the playing source (virtual or real, depending on the virtualPos argument)
+	};
+	virtual void		getPlayingSoundsPos(bool virtualPos, std::vector<CPlayingSoundPos> &pos) =0;
 	/** Write profiling information about the mixer to the output stream.
 	 *  \param out The output stream to which to write the information
 	 */
diff --git a/code/nel/src/sound/audio_mixer_user.cpp b/code/nel/src/sound/audio_mixer_user.cpp
index 4807581f0..1c9c46cdc 100644
--- a/code/nel/src/sound/audio_mixer_user.cpp
+++ b/code/nel/src/sound/audio_mixer_user.cpp
@@ -1475,7 +1475,7 @@ void CAudioMixerUser::freeTrack(CTrack *track)
 
 // ******************************************************************
 
-void CAudioMixerUser::getPlayingSoundsPos(bool virtualPos, std::vector<std::pair<bool, NLMISC::CVector> > &pos)
+void CAudioMixerUser::getPlayingSoundsPos(bool virtualPos, std::vector<CPlayingSoundPos> &pos)
 {
 	int nbplay = 0;
 	int	nbmute = 0;
@@ -1493,9 +1493,9 @@ void CAudioMixerUser::getPlayingSoundsPos(bool virtualPos, std::vector<std::pair
 			if (source->isPlaying())
 			{
 				if (virtualPos)
-					pos.push_back(make_pair(source->getTrack() == 0, source->getVirtualPos()));
+					pos.push_back(CPlayingSoundPos(source->getTrack() == 0, source->getVirtualPos()));
 				else
-					pos.push_back(make_pair(source->getTrack() == 0,
+					pos.push_back(CPlayingSoundPos(source->getTrack() == 0,
 						source->getSourceRelativeMode()
 						? source->getPos() + _ListenPosition
 						: source->getPos()));
@@ -1517,9 +1517,9 @@ void CAudioMixerUser::getPlayingSoundsPos(bool virtualPos, std::vector<std::pair
 			if (source->isPlaying())
 			{
 				if (virtualPos)
-					pos.push_back(make_pair(source->getTrack() == 0, source->getVirtualPos()));
+					pos.push_back(CPlayingSoundPos(source->getTrack() == 0, source->getVirtualPos()));
 				else
-					pos.push_back(make_pair(source->getTrack() == 0,
+					pos.push_back(CPlayingSoundPos(source->getTrack() == 0,
 						source->getSourceRelativeMode()
 						? source->getPos() + _ListenPosition
 						: source->getPos()));
diff --git a/code/ryzom/client/src/sound_manager.cpp b/code/ryzom/client/src/sound_manager.cpp
index 7c5589432..eddd4ed4c 100644
--- a/code/ryzom/client/src/sound_manager.cpp
+++ b/code/ryzom/client/src/sound_manager.cpp
@@ -363,10 +363,10 @@ void CSoundManager::drawSounds(float camHeight)
 	}
 	// draw the sound source position
 	{
-		std::vector<std::pair<bool, CVector> > soundPos;
+		std::vector<UAudioMixer::CPlayingSoundPos> soundPos;
 		_AudioMixer->getPlayingSoundsPos(true, soundPos);
 
-		std::vector<std::pair<bool, CVector> >::iterator first(soundPos.begin()), last(soundPos.end());
+		std::vector<UAudioMixer::CPlayingSoundPos>::iterator first(soundPos.begin()), last(soundPos.end());
 		for (; first != last; ++first)
 		{
 			NL3D::CDRU::drawLine(first->second + CVector(0.5f,0.5f,0), first->second + CVector(-0.5f,-0.5f,0), CRGBA(255,0,255,255), *idriver);

From afa315b1b5d0a7f9aa939281cc500e52f81c1675 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 16:46:36 +0200
Subject: [PATCH 19/21] SSE2: Implement some more of CVector

--HG--
branch : sse2
---
 code/nel/include/nel/misc/vector.h        |  4 ++--
 code/nel/include/nel/misc/vector_inline.h | 27 +++++++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index 0d3216a18..3be84894d 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -73,10 +73,10 @@ public:		// Methods.
 	/// Constructor .
 	CVector(float	_x, float _y, float _z) : x(_x), y(_y), z(_z) { /*if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error");*/ }
 	/// Copy Constructor.
-#if 0
+#ifdef USE_SSE2 // #ifdef rather than #if: same guard form as the SSE2 branches in vector_inline.h, and valid when USE_SSE2 is defined without a value
 	CVector(const CVector &v) : mm(v.mm) { /*if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error");*/ }
 #else
-	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) { if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error"); }
+	CVector(const CVector &v) : x(v.x), y(v.y), z(v.z) { /*if (((uintptr_t)(void *)(this) & 0xF) != 0) nlerror("Vector alignment error");*/ }
 #endif
 	//@}
 
diff --git a/code/nel/include/nel/misc/vector_inline.h b/code/nel/include/nel/misc/vector_inline.h
index 77d827c35..10351e2c9 100644
--- a/code/nel/include/nel/misc/vector_inline.h
+++ b/code/nel/include/nel/misc/vector_inline.h
@@ -64,7 +64,12 @@ inline	CVector	&CVector::operator*=(float f)
 }
 inline	CVector	&CVector::operator/=(float f)
 {
+#ifdef USE_SSE2
+	mm = _mm_div_ps(mm, _mm_set1_ps(f)); // divides every lane of mm by f; the #else path multiplies by 1.0f/f instead, which may round differently
+	return *this;
+#else
 	return *this*= (1.0f/f);
+#endif
 }
 inline	CVector	CVector::operator+(const CVector &v) const
 {
@@ -101,7 +106,13 @@ inline	CVector	CVector::operator*(float f) const
 }
 inline	CVector	CVector::operator/(float f) const
 {
+#ifdef USE_SSE2
+	CVector res; // result holder; its mm member is fully overwritten on the next line
+	res.mm = _mm_div_ps(mm, _mm_set1_ps(f)); // divides every lane of mm by f (the #else path multiplies by 1.0f/f instead)
+	return res;
+#else
 	return *this*(1.0f/f);
+#endif
 }
 inline	CVector	CVector::operator-() const
 {
@@ -220,19 +231,35 @@ inline	CVector	CVector::normed() const
 // Misc.
 inline	void	CVector::set(float _x, float _y, float _z)
 {
+#ifdef USE_SSE2
+	mm = _mm_setr_ps(_x, _y, _z, 0.0f); // lanes 0..2 = _x, _y, _z and lane 3 = 0; assumes x, y, z alias the first three lanes of mm - TODO confirm against vector.h
+#else
 	x=_x; y=_y; z=_z;
+#endif
 }
 inline	bool	CVector::operator==(const CVector &v) const
 {
+#ifdef USE_SSE2
+	return (_mm_movemask_ps(_mm_cmpeq_ps(mm, v.mm)) & 0x07) == 0x07; // movemask gives one bit per lane; 0x07 keeps lanes 0..2 and ignores the fourth lane
+#else
 	return x==v.x && y==v.y && z==v.z;
+#endif
 }
 inline	bool	CVector::operator!=(const CVector &v) const
 {
+#ifdef USE_SSE2
+	return (_mm_movemask_ps(_mm_cmpneq_ps(mm, v.mm)) & 0x07) != 0; // true when any of lanes 0..2 differs; the fourth lane is masked out by 0x07
+#else
 	return !(*this==v);
+#endif
 }
 inline	bool	CVector::isNull() const
 {
+#ifdef USE_SSE2
+	return (_mm_movemask_ps(_mm_cmpeq_ps(mm, _mm_setzero_ps())) & 0x07) == 0x07; // lanes 0..2 all compare equal to 0.0f; the fourth lane is ignored
+#else
 	return *this==CVector::Null;
+#endif
 }
 inline	bool	CVector::operator<(const CVector &v) const
 {

From 60879d87e44a5b8141a6488147ee1dd3883e5b08 Mon Sep 17 00:00:00 2001
From: kaetemi <kaetemi@gmail.com>
Date: Fri, 13 Jun 2014 16:54:46 +0200
Subject: [PATCH 20/21] SSE2: Vectorize some code

--HG--
branch : sse2
---
 code/nel/include/nel/3d/track_keyframer.h |  8 ++++----
 code/nel/include/nel/misc/vector.h        |  1 +
 code/nel/include/nel/misc/vector_inline.h | 25 +++++++++++++++++++++++
 code/nel/src/3d/bone.cpp                  |  8 ++------
 code/nel/src/3d/cloud.cpp                 | 22 ++++++++++----------
 5 files changed, 43 insertions(+), 21 deletions(-)

diff --git a/code/nel/include/nel/3d/track_keyframer.h b/code/nel/include/nel/3d/track_keyframer.h
index 6cfb6736a..e50b13354 100644
--- a/code/nel/include/nel/3d/track_keyframer.h
+++ b/code/nel/include/nel/3d/track_keyframer.h
@@ -403,10 +403,10 @@ template<class T, class TKeyVal> inline void	copyToValue(T &value, const TKeyVal
 inline void	copyToValue(NLMISC::CRGBA &col, const CVector &v)
 {
 	sint	i;
-
-	i= (sint)(v.x*255); NLMISC::clamp(i,0,255); col.R= (uint8) i;
-	i= (sint)(v.y*255); NLMISC::clamp(i,0,255); col.G= (uint8) i;
-	i= (sint)(v.z*255); NLMISC::clamp(i,0,255); col.B= (uint8) i;
+	const CVector scaled = v * 255.0f; // scale the three components with a single vector multiply
+	i= (sint)(scaled.x); NLMISC::clamp(i,0,255); col.R= (uint8) i;
+	i= (sint)(scaled.y); NLMISC::clamp(i,0,255); col.G= (uint8) i;
+	i= (sint)(scaled.z); NLMISC::clamp(i,0,255); col.B= (uint8) i;
 	col.A=255;
 }
 
diff --git a/code/nel/include/nel/misc/vector.h b/code/nel/include/nel/misc/vector.h
index 3be84894d..62a73e1dd 100644
--- a/code/nel/include/nel/misc/vector.h
+++ b/code/nel/include/nel/misc/vector.h
@@ -152,6 +152,7 @@ public:		// Methods.
 
 	// friends.
 	friend	CVector	operator*(float f, const CVector &v0);
+	friend	CVector	operator/(float f, const CVector &v0);
 };
 
 class CVectorPacked
diff --git a/code/nel/include/nel/misc/vector_inline.h b/code/nel/include/nel/misc/vector_inline.h
index 10351e2c9..d854d7b45 100644
--- a/code/nel/include/nel/misc/vector_inline.h
+++ b/code/nel/include/nel/misc/vector_inline.h
@@ -136,6 +136,18 @@ inline CVector	operator*(float f, const CVector &v)
 #endif
 }
 
+inline CVector	operator/(float f, const CVector &v)
+{
+	// Component-wise quotient: every component of the result is f divided by the matching component of v.
+#ifdef USE_SSE2
+	CVector quotient;
+	quotient.mm = _mm_div_ps(_mm_set1_ps(f), v.mm);
+	return quotient;
+#else
+	return CVector(f/v.x, f/v.y, f/v.z);
+#endif
+}
+
 #ifdef USE_SSE2
 inline __m128 dotsplat(const __m128 &l, const __m128 &r)
 {
@@ -150,6 +162,19 @@ inline __m128 dotsplat(const __m128 &l, const __m128 &r)
 }
 #endif
 
+inline CVector mul(const CVector &l, const CVector &r)
+{
+	// Component-wise product of l and r (not the dot product, which is CVector::operator*).
+	CVector product;
+#ifdef USE_SSE2
+	product.mm = _mm_mul_ps(l.mm, r.mm);
+#else
+	// Scalar build: multiply each component pair and store the three results.
+	product.set(l.x * r.x, l.y * r.y, l.z * r.z);
+#endif
+	return product;
+}
+
 // ============================================================================================
 // Advanced Maths.
 inline	float	CVector::operator*(const CVector &v) const
diff --git a/code/nel/src/3d/bone.cpp b/code/nel/src/3d/bone.cpp
index 9adf64fcd..2d6694e3d 100644
--- a/code/nel/src/3d/bone.cpp
+++ b/code/nel/src/3d/bone.cpp
@@ -189,9 +189,7 @@ void	CBone::compute(CBone *parent, const CMatrix &rootMatrix, CSkeletonModel *sk
 			// retrieve scale from our father.
 			parent->getScale(fatherScale);
 			// inverse this scale.
-			fatherScale.x= 1.0f / fatherScale.x;
-			fatherScale.y= 1.0f / fatherScale.y;
-			fatherScale.z= 1.0f / fatherScale.z;
+			fatherScale = 1.0f / fatherScale;
 
 			// Compute InverseScale compensation:
 			// with UnheritScale, formula per bone should be  T*Sf-1*P*R*S*P-1.
@@ -199,9 +197,7 @@ void	CBone::compute(CBone *parent, const CMatrix &rootMatrix, CSkeletonModel *sk
 			// So we must compute T*Sf-1*T-1, in order to get wanted result.
 			invScaleComp.setScale(fatherScale);
 			// Faster compute of the translation part: just "trans + fatherScale MUL -trans" where MUL is comp mul
-			trans.x-= fatherScale.x * trans.x;
-			trans.y-= fatherScale.y * trans.y;
-			trans.z-= fatherScale.z * trans.z;
+			trans -= mul(trans, fatherScale);
 			invScaleComp.setPos(trans);
 
 
diff --git a/code/nel/src/3d/cloud.cpp b/code/nel/src/3d/cloud.cpp
index 280ba2f04..0da6da45a 100644
--- a/code/nel/src/3d/cloud.cpp
+++ b/code/nel/src/3d/cloud.cpp
@@ -487,10 +487,10 @@ void CCloud::dispXYZ (CMaterial *pMat)
 				rVB.lock (vba);
 
 				pVertices = vba.getVertexCoordPointer (0);
-				*pVertices = CVector(_Pos.x,			_Pos.y,			_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
-				*pVertices = CVector(_Pos.x+_Size.x,	_Pos.y,			_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
-				*pVertices = CVector(_Pos.x+_Size.x,	_Pos.y+_Size.y,	_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
-				*pVertices = CVector(_Pos.x,			_Pos.y+_Size.y,	_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH);
+				*pVertices = CVectorPacked(_Pos.x,			_Pos.y,			_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+				*pVertices = CVectorPacked(_Pos.x+_Size.x,	_Pos.y,			_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+				*pVertices = CVectorPacked(_Pos.x+_Size.x,	_Pos.y+_Size.y,	_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+				*pVertices = CVectorPacked(_Pos.x,			_Pos.y+_Size.y,	_Pos.z+_Size.z*(_NbW*_NbH-d)*oneOverNbWNbH);
 
 				pUV = vba.getTexCoordPointer (0, 0);
 				pUV->U = i*oneOverNbW;		pUV->V = j*oneOverNbH;		pUV = (CUV*)( ((uint8*)pUV) + nVSize );
@@ -513,10 +513,10 @@ void CCloud::dispXYZ (CMaterial *pMat)
 		CVertexBufferReadWrite vba;
 		rVB.lock (vba);
 		CVectorPacked *pVertices = vba.getVertexCoordPointer (0);
-		*pVertices = CVector((float)0.25f,	0, (float)0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)0.75f,	0, (float)0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)0.75f,	0, (float)0.75f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
-		*pVertices = CVector((float)0.25f,	0, (float)0.75f);
+		*pVertices = CVectorPacked((float)0.25f,	0, (float)0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVectorPacked((float)0.75f,	0, (float)0.0f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVectorPacked((float)0.75f,	0, (float)0.75f); pVertices = (CVectorPacked*)( ((uint8*)pVertices) + nVSize );
+		*pVertices = CVectorPacked((float)0.25f,	0, (float)0.75f);
 	}
 }
 
@@ -640,7 +640,7 @@ void CCloud::genBill (CCamera *pCam, uint32 nBillSize)
 	//CMatrix CamMat = pCam->getMatrix();
 	//CVector Viewer = CamMat.getPos();
 	CVector Viewer = CVector (0,0,0);
-	CVector Center = CVector (_Pos.x+_Size.x/2, _Pos.y+_Size.y/2, _Pos.z+_Size.z/2);
+	CVector Center = _Pos + (_Size / 2); //CVector (_Pos.x+_Size.x/2, _Pos.y+_Size.y/2, _Pos.z+_Size.z/2);
 	CVector Size = _Size;
 	CVector I, J, K;
 	float Left, Right, Top, Bottom, Near, Far;
@@ -650,7 +650,7 @@ void CCloud::genBill (CCamera *pCam, uint32 nBillSize)
 	CMatrix mat;
 	mat.identity();
 	mat.setRot(I,J,K, true);
-	mat.setPos(CVector(Viewer.x, Viewer.y, Viewer.z));
+	mat.setPos(Viewer);
 	mat.invert();
 
 	// Clear background for cloud creation
@@ -708,7 +708,7 @@ void CCloud::dispBill (CCamera *pCam)
 //	CMatrix CamMat = pCam->getMatrix();
 //	CVector Viewer = CamMat.getPos();
 	CVector Viewer = CVector (0,0,0);
-	CVector Center = CVector (_Pos.x+_Size.x/2, _Pos.y+_Size.y/2, _Pos.z+_Size.z/2);
+	CVector Center = _Pos + (_Size / 2); //CVector (_Pos.x+_Size.x/2, _Pos.y+_Size.y/2, _Pos.z+_Size.z/2);
 	CVector Size = _Size;
 
 	// Prepare vertices.