From 5f54f75802910517b4248e632d3d8431092707d3 Mon Sep 17 00:00:00 2001 From: kaetemi Date: Thu, 12 Jun 2014 21:57:34 +0200 Subject: [PATCH] SSE2: Initial testing implementation --HG-- branch : sse2 --- code/CMakeLists.txt | 4 + code/CMakeModules/nel.cmake | 2 + code/nel/include/nel/3d/computed_string.h | 2 +- .../nel/3d/ps_attrib_maker_bin_op_inline.h | 20 +-- .../include/nel/3d/ps_attrib_maker_helper.h | 2 +- .../include/nel/3d/ps_attrib_maker_template.h | 6 +- code/nel/include/nel/3d/ps_color.h | 4 +- code/nel/include/nel/3d/ps_edit.h | 2 +- code/nel/include/nel/3d/ps_force.h | 2 +- code/nel/include/nel/3d/ps_located.h | 2 +- code/nel/include/nel/3d/ps_zone.h | 4 +- code/nel/include/nel/3d/quad_grid.h | 8 +- .../nel/include/nel/3d/shadow_poly_receiver.h | 19 ++- code/nel/include/nel/3d/static_quad_grid.h | 4 +- code/nel/include/nel/3d/zone_lighter.h | 2 +- code/nel/include/nel/ligo/primitive.h | 2 +- code/nel/include/nel/misc/types_nl.h | 14 ++ code/nel/include/nel/pacs/chain_quad.h | 2 +- code/nel/include/nel/pacs/edge_quad.h | 2 +- code/nel/include/nel/pacs/local_retriever.h | 6 +- code/nel/include/nel/pacs/quad_grid.h | 8 +- .../nel/sound/background_sound_manager.h | 2 +- code/nel/src/3d/computed_string.cpp | 4 +- code/nel/src/3d/mesh_mrm_skin_template.cpp | 2 +- code/nel/src/3d/mesh_mrm_skinned_template.cpp | 2 +- code/nel/src/3d/ps_zone.cpp | 4 +- code/nel/src/3d/zone_lighter.cpp | 2 +- code/nel/src/ligo/primitive.cpp | 2 +- code/nel/src/misc/common.cpp | 55 +++++++ code/nel/src/misc/matrix.cpp | 146 +++++++++++++++++- code/nel/src/misc/polygon.cpp | 2 +- code/nel/src/pacs/chain_quad.cpp | 5 +- code/nel/src/pacs/edge_quad.cpp | 5 +- code/nel/src/pacs/local_retriever.cpp | 6 +- code/ryzom/client/src/decal.cpp | 10 +- 35 files changed, 307 insertions(+), 57 deletions(-) diff --git a/code/CMakeLists.txt b/code/CMakeLists.txt index 4f0439dfd..f2fb9ac81 100644 --- a/code/CMakeLists.txt +++ b/code/CMakeLists.txt @@ -131,6 +131,10 @@ IF(FINAL_VERSION) 
ADD_DEFINITIONS(-DFINAL_VERSION=1) ENDIF(FINAL_VERSION) +IF(WITH_SSE2) + ADD_DEFINITIONS(-DUSE_SSE2) +ENDIF(WITH_SSE2) + IF(WITH_QT) FIND_PACKAGE(Qt4 COMPONENTS QtCore QtGui QtXml QtOpenGL REQUIRED) ENDIF(WITH_QT) diff --git a/code/CMakeModules/nel.cmake b/code/CMakeModules/nel.cmake index b194b5ff9..5a4002ed4 100644 --- a/code/CMakeModules/nel.cmake +++ b/code/CMakeModules/nel.cmake @@ -324,6 +324,8 @@ MACRO(NL_SETUP_NEL_DEFAULT_OPTIONS) OPTION(WITH_LIBOVR "With LibOVR support" OFF) OPTION(WITH_LIBVR "With LibVR support" OFF) OPTION(WITH_PERFHUD "With NVIDIA PerfHUD support" OFF) + + OPTION(WITH_SSE2 "With SSE2" ON ) ENDMACRO(NL_SETUP_NEL_DEFAULT_OPTIONS) MACRO(NL_SETUP_NELNS_DEFAULT_OPTIONS) diff --git a/code/nel/include/nel/3d/computed_string.h b/code/nel/include/nel/3d/computed_string.h index fcb758da4..517200383 100644 --- a/code/nel/include/nel/3d/computed_string.h +++ b/code/nel/include/nel/3d/computed_string.h @@ -290,7 +290,7 @@ public: * \param matrix transformation matrix * \param hotspot position of string origine */ - void render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot = MiddleMiddle); + void render3D (IDriver& driver, const CMatrix &matrix, THotSpot hotspot = MiddleMiddle); }; diff --git a/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h b/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h index 2a9cbff45..0070ffb38 100644 --- a/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h +++ b/code/nel/include/nel/3d/ps_attrib_maker_bin_op_inline.h @@ -33,14 +33,14 @@ namespace NL3D { */ template -inline T PSBinOpModulate(T arg1, T arg2) { return arg1 * arg2; } +inline T PSBinOpModulate(const T &arg1, const T &arg2) { return arg1 * arg2; } template -inline T PSBinOpAdd(T arg1, T arg2) { return arg1 + arg2; } +inline T PSBinOpAdd(const T &arg1, const T &arg2) { return arg1 + arg2; } template -inline T PSBinOpSubtract(T arg1, T arg2) { return arg1 - arg2; } +inline T PSBinOpSubtract(const T &arg1, const T &arg2) { return arg1 - arg2; 
} template <> -inline CPlaneBasis PSBinOpModulate(CPlaneBasis p1, CPlaneBasis p2) +inline CPlaneBasis PSBinOpModulate(const CPlaneBasis &p1, const CPlaneBasis &p2) { // we compute p1 * p2 NLMISC::CVector z = p1.X ^ p1.Y; @@ -57,13 +57,13 @@ inline CPlaneBasis PSBinOpModulate(CPlaneBasis p1, CPlaneBasis p2) } template <> -inline CPlaneBasis PSBinOpAdd(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */) +inline CPlaneBasis PSBinOpAdd(const CPlaneBasis &/* p1 */, const CPlaneBasis &/* p2 */) { nlassert(0); // not allowed for now return CPlaneBasis(NLMISC::CVector::Null); } template <> -inline CPlaneBasis PSBinOpSubtract(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */) +inline CPlaneBasis PSBinOpSubtract(const CPlaneBasis &/* p1 */, const CPlaneBasis &/* p2 */) { nlassert(0); // not allowed for now return CPlaneBasis(NLMISC::CVector::Null); @@ -71,21 +71,21 @@ inline CPlaneBasis PSBinOpSubtract(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */) template <> -inline uint32 PSBinOpSubtract(uint32 lhs, uint32 rhs) +inline uint32 PSBinOpSubtract(const uint32 &lhs, const uint32 &rhs) { return rhs > lhs ? 
0 : lhs - rhs; // avoid overflow } template <> -inline NLMISC::CRGBA PSBinOpModulate(NLMISC::CRGBA t1, NLMISC::CRGBA t2) +inline NLMISC::CRGBA PSBinOpModulate(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2) { NLMISC::CRGBA result; result.modulateFromColor(t1, t2); return result; } template <> -inline NLMISC::CRGBA PSBinOpAdd(NLMISC::CRGBA t1, NLMISC::CRGBA t2) +inline NLMISC::CRGBA PSBinOpAdd(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2) { NLMISC::CRGBA r; uint S = t1.R + t2.R; if (S > 255) S = 255; r.R = (uint8) S; @@ -94,7 +94,7 @@ inline NLMISC::CRGBA PSBinOpAdd(NLMISC::CRGBA t1, NLMISC::CRGBA t2) return r; } template <> -inline NLMISC::CRGBA PSBinOpSubtract(NLMISC::CRGBA t1, NLMISC::CRGBA t2) +inline NLMISC::CRGBA PSBinOpSubtract(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2) { NLMISC::CRGBA r; sint S = t1.R - t2.R; if (S < 0) S = 0; r.R = (uint8) S; diff --git a/code/nel/include/nel/3d/ps_attrib_maker_helper.h b/code/nel/include/nel/3d/ps_attrib_maker_helper.h index 10d29fe52..147d1ae5d 100644 --- a/code/nel/include/nel/3d/ps_attrib_maker_helper.h +++ b/code/nel/include/nel/3d/ps_attrib_maker_helper.h @@ -1432,7 +1432,7 @@ public: * */ - virtual void setDefaultValue(T defaultValue) { _DefaultValue = defaultValue;} + virtual void setDefaultValue(const T &defaultValue) { _DefaultValue = defaultValue;} /// get the default value : virtual T getDefaultValue(void) const { return _DefaultValue; } diff --git a/code/nel/include/nel/3d/ps_attrib_maker_template.h b/code/nel/include/nel/3d/ps_attrib_maker_template.h index 92953b86f..72bc62df9 100644 --- a/code/nel/include/nel/3d/ps_attrib_maker_template.h +++ b/code/nel/include/nel/3d/ps_attrib_maker_template.h @@ -68,7 +68,7 @@ template struct CPSValueBlendFuncBase { virtual ~CPSValueBlendFuncBase() {} virtual void getValues(T &startValue, T &endValue) const = 0; - virtual void setValues(T startValue, T endValue) = 0; + virtual void setValues(const T &startValue, const T &endValue) = 0; }; @@ -122,7 +122,7 @@ 
public: } /// Set the Values between which to blend. - virtual void setValues(T startValue, T endValue) + virtual void setValues(const T &startValue, const T &endValue) { _StartValue = startValue; _EndValue = endValue; @@ -210,7 +210,7 @@ public: /// set the Values - virtual void setValues(T startValue, T endValue) + virtual void setValues(const T &startValue, const T &endValue) { float step = 1.f / n; float alpha = 0.0f; diff --git a/code/nel/include/nel/3d/ps_color.h b/code/nel/include/nel/3d/ps_color.h index d05d9cf11..bd92bcbe6 100644 --- a/code/nel/include/nel/3d/ps_color.h +++ b/code/nel/include/nel/3d/ps_color.h @@ -62,7 +62,7 @@ public: endValue = convertVBColor(endValue, _ColorType); } - virtual void setValues(NLMISC::CRGBA startValue, NLMISC::CRGBA endValue) + virtual void setValues(const NLMISC::CRGBA &startValue, const NLMISC::CRGBA &endValue) { CPSValueBlendFunc::setValues(convertVBColor(startValue, _ColorType), convertVBColor(endValue, _ColorType)); } @@ -96,7 +96,7 @@ public: endValue = convertVBColor(endValue, _ColorType); } - virtual void setValues(NLMISC::CRGBA startValue, NLMISC::CRGBA endValue) + virtual void setValues(const NLMISC::CRGBA &startValue, const NLMISC::CRGBA &endValue) { CPSValueBlendSampleFunc::setValues(convertVBColor(startValue, _ColorType), convertVBColor(endValue, _ColorType)); } diff --git a/code/nel/include/nel/3d/ps_edit.h b/code/nel/include/nel/3d/ps_edit.h index de7957f54..0c2da9e71 100644 --- a/code/nel/include/nel/3d/ps_edit.h +++ b/code/nel/include/nel/3d/ps_edit.h @@ -82,7 +82,7 @@ struct IPSMover virtual NLMISC::CVector getNormal(uint32 /* index */) { NL_PS_FUNC(getNormal); return NLMISC::CVector::Null ; } /// if the object only stores a normal, this set the normal of the object. 
Otherwise it has no effect - virtual void setNormal(uint32 /* index */, NLMISC::CVector /* n */) { NL_PS_FUNC(setNormal); } + virtual void setNormal(uint32 /* index */, const NLMISC::CVector &/* n */) { NL_PS_FUNC(setNormal); } // set a new orthogonal matrix for the object virtual void setMatrix(uint32 index, const NLMISC::CMatrix &m) = 0 ; diff --git a/code/nel/include/nel/3d/ps_force.h b/code/nel/include/nel/3d/ps_force.h index e93c21361..76f22f40b 100644 --- a/code/nel/include/nel/3d/ps_force.h +++ b/code/nel/include/nel/3d/ps_force.h @@ -741,7 +741,7 @@ public: virtual NLMISC::CVector getScale(uint32 k) const { return NLMISC::CVector(_Radius[k], _Radius[k], _Radius[k]); } virtual bool onlyStoreNormal(void) const { return true; } virtual NLMISC::CVector getNormal(uint32 index) { return _Normal[index]; } - virtual void setNormal(uint32 index, NLMISC::CVector n) { _Normal[index] = n; } + virtual void setNormal(uint32 index, const NLMISC::CVector &n) { _Normal[index] = n; } virtual void setMatrix(uint32 index, const NLMISC::CMatrix &m); virtual NLMISC::CMatrix getMatrix(uint32 index) const; diff --git a/code/nel/include/nel/3d/ps_located.h b/code/nel/include/nel/3d/ps_located.h index 30fa7defa..2c4862b63 100644 --- a/code/nel/include/nel/3d/ps_located.h +++ b/code/nel/include/nel/3d/ps_located.h @@ -613,7 +613,7 @@ public: struct CParametricInfo { CParametricInfo() {} - CParametricInfo(NLMISC::CVector pos, NLMISC::CVector speed, float date) + CParametricInfo(const NLMISC::CVector &pos, const NLMISC::CVector &speed, float date) : Pos(pos), Speed(speed), Date(date) { } diff --git a/code/nel/include/nel/3d/ps_zone.h b/code/nel/include/nel/3d/ps_zone.h index 7289e64e0..cf29bc258 100644 --- a/code/nel/include/nel/3d/ps_zone.h +++ b/code/nel/include/nel/3d/ps_zone.h @@ -153,7 +153,7 @@ class CPSZonePlane : public CPSZone, public IPSMover virtual NLMISC::CMatrix getMatrix(uint32 index) const; virtual bool onlyStoreNormal(void) const { return true; } virtual 
NLMISC::CVector getNormal(uint32 index); - virtual void setNormal(uint32 index, NLMISC::CVector n); + virtual void setNormal(uint32 index, const NLMISC::CVector &n); virtual void serial(NLMISC::IStream &f) throw(NLMISC::EStream); @@ -255,7 +255,7 @@ class CPSZoneDisc : public CPSZone, public IPSMover virtual NLMISC::CVector getScale(uint32 k) const; virtual bool onlyStoreNormal(void) const { return true; } virtual NLMISC::CVector getNormal(uint32 index); - virtual void setNormal(uint32 index, NLMISC::CVector n); + virtual void setNormal(uint32 index, const NLMISC::CVector &n); virtual void serial(NLMISC::IStream &f) throw(NLMISC::EStream); diff --git a/code/nel/include/nel/3d/quad_grid.h b/code/nel/include/nel/3d/quad_grid.h index 12160b540..e97543896 100644 --- a/code/nel/include/nel/3d/quad_grid.h +++ b/code/nel/include/nel/3d/quad_grid.h @@ -314,11 +314,11 @@ private:// Methods. } // return the coordinates on the grid of what include the bbox. - void selectQuads(CVector bmin, CVector bmax, sint &x0, sint &x1, sint &y0, sint &y1) + void selectQuads(const CVector &bminp, const CVector &bmaxp, sint &x0, sint &x1, sint &y0, sint &y1) { - CVector bminp, bmaxp; - bminp= bmin; - bmaxp= bmax; + CVector bmin, bmax; + bmin= bminp; + bmax= bmaxp; bmin.minof(bminp, bmaxp); bmax.maxof(bminp, bmaxp); bmin/= _EltSize; diff --git a/code/nel/include/nel/3d/shadow_poly_receiver.h b/code/nel/include/nel/3d/shadow_poly_receiver.h index 5c9476849..c781578ea 100644 --- a/code/nel/include/nel/3d/shadow_poly_receiver.h +++ b/code/nel/include/nel/3d/shadow_poly_receiver.h @@ -85,10 +85,28 @@ public: // a vertex struct CRGBAVertex { +#if USE_SSE2 + float X, Y, Z; +#else CVector V; +#endif CRGBA Color; CRGBAVertex() {} - CRGBAVertex(const CVector &v, CRGBA c) : V(v), Color(c) {} +#if USE_SSE2 + CRGBAVertex(const CVector &v, CRGBA c) : X(v.x), Y(v.y), Z(v.z), Color(c) {} + const CVector &asVector() const + { + //nlctassert(sizeof(CVector) == sizeof(CRGBAVertex)); +
nlctassert(sizeof(CVector) + 4 == sizeof(CRGBAVertex)); + return *reinterpret_cast<const CVector *>(this); /* view the leading X, Y, Z floats as a CVector; the size assert above guards the layout */ + } +#else + CRGBAVertex(const CVector &v, CRGBA c) : V(v), Color(c) {} /* keep the (position, colour) constructor available when SSE2 is disabled */ + const CVector &asVector() const + { + return V; + } +#endif }; /** Compute list of clipped tri under the shadow mat diff --git a/code/nel/include/nel/3d/static_quad_grid.h b/code/nel/include/nel/3d/static_quad_grid.h index 568ae3c0e..0bc171a74 100644 --- a/code/nel/include/nel/3d/static_quad_grid.h +++ b/code/nel/include/nel/3d/static_quad_grid.h @@ -102,8 +102,10 @@ private:// Atttributes. // return the coordinates on the grid of what include the bbox. - void selectPoint(CVector point, sint &x0, sint &y0) + void selectPoint(const CVector &pointp, sint &x0, sint &y0) { + CVector point = pointp; + point/= _EltSize; x0= (sint)(floor(point.x)); y0= (sint)(floor(point.y)); diff --git a/code/nel/include/nel/3d/zone_lighter.h b/code/nel/include/nel/3d/zone_lighter.h index 4f2910c52..52ef66199 100644 --- a/code/nel/include/nel/3d/zone_lighter.h +++ b/code/nel/include/nel/3d/zone_lighter.h @@ -421,7 +421,7 @@ private: * The vector of water shapes is released then * \param bbox the bbox of the zone containing the water shapes */ - void makeQuadGridFromWaterShapes(NLMISC::CAABBox zoneBBox); + void makeQuadGridFromWaterShapes(const NLMISC::CAABBox &zoneBBox); /** For each tile of the current zone, check whether it below or above water. 
diff --git a/code/nel/include/nel/ligo/primitive.h b/code/nel/include/nel/ligo/primitive.h index c050f14b6..b2f703015 100644 --- a/code/nel/include/nel/ligo/primitive.h +++ b/code/nel/include/nel/ligo/primitive.h @@ -523,7 +523,7 @@ public: std::vector VPoints; - static float getSegmentDist(const NLMISC::CVector v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos); + static float getSegmentDist(const NLMISC::CVector &v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos); public: diff --git a/code/nel/include/nel/misc/types_nl.h b/code/nel/include/nel/misc/types_nl.h index 5c3b80475..b5aa77e68 100644 --- a/code/nel/include/nel/misc/types_nl.h +++ b/code/nel/include/nel/misc/types_nl.h @@ -328,6 +328,20 @@ typedef unsigned int uint; // at least 32bits (depend of processor) #endif // NL_OS_UNIX +#define NL_DEFAULT_MEMORY_ALIGNMENT 16 +#ifdef NL_COMP_VC +#define NL_ALIGN(nb) __declspec(align(nb)) +#else +#define NL_ALIGN(nb) __attribute__((aligned(nb))) +#endif + +#ifdef USE_SSE2 +extern void *operator new(size_t size) throw(std::bad_alloc); +extern void *operator new[](size_t size) throw(std::bad_alloc); +extern void operator delete(void *p) throw(); +extern void operator delete[](void *p) throw(); +#endif + // CHashMap, CHashSet and CHashMultiMap definitions #if defined(_STLPORT_VERSION) // STLport detected # include diff --git a/code/nel/include/nel/pacs/chain_quad.h b/code/nel/include/nel/pacs/chain_quad.h index 0a9df779e..fb332f1f9 100644 --- a/code/nel/include/nel/pacs/chain_quad.h +++ b/code/nel/include/nel/pacs/chain_quad.h @@ -81,7 +81,7 @@ public: * \param cst the array of CEdgeChainEntry to fill. contain also OChainLUT, an array for internal use. In: must be filled with 0xFFFF. Out: still filled with 0xFFFF. * \return number of edgechain found. stored in cst.EdgeChainEntries (array cleared first). 
*/ - sint selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const; + sint selectEdges(const CVector &start, const CVector &end, CCollisionSurfaceTemp &cst) const; /// serial. diff --git a/code/nel/include/nel/pacs/edge_quad.h b/code/nel/include/nel/pacs/edge_quad.h index 71785cb13..95e57042c 100644 --- a/code/nel/include/nel/pacs/edge_quad.h +++ b/code/nel/include/nel/pacs/edge_quad.h @@ -92,7 +92,7 @@ public: * \param cst the array of CExteriorEdgeEntry to fill. contain also OChainLUT, an array for internal use. In: must be filled with 0xFFFF. Out: still filled with 0xFFFF. * \return number of exterioredge found. stored in cst.ExteriorEdgeEntries (array cleared first). */ - sint selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const; + sint selectEdges(const CVector &start, const CVector &end, CCollisionSurfaceTemp &cst) const; /// Get the whole set of edge entries diff --git a/code/nel/include/nel/pacs/local_retriever.h b/code/nel/include/nel/pacs/local_retriever.h index d76aa0891..d90ed6dc8 100644 --- a/code/nel/include/nel/pacs/local_retriever.h +++ b/code/nel/include/nel/pacs/local_retriever.h @@ -548,12 +548,12 @@ public: /** * Check all surfaces integrity */ - bool checkSurfacesIntegrity(NLMISC::CVector translation = NLMISC::CVector::Null, bool verbose = false) const; + bool checkSurfacesIntegrity(const NLMISC::CVector &translation = NLMISC::CVector::Null, bool verbose = false) const; /** * Check surface integrity */ - bool checkSurfaceIntegrity(uint surf, NLMISC::CVector translation = NLMISC::CVector::Null, bool verbose = false) const; + bool checkSurfaceIntegrity(uint surf, const NLMISC::CVector &translation = NLMISC::CVector::Null, bool verbose = false) const; // @} @@ -565,7 +565,7 @@ protected: bool insurePosition(ULocalPosition &local) const; /// Retrieves a position inside the retriever (from the local position), returns true if the position is close to a border - void retrievePosition(NLMISC::CVector estimated, 
CCollisionSurfaceTemp &cst) const; + void retrievePosition(const NLMISC::CVector &estimated, CCollisionSurfaceTemp &cst) const; /// Retrieves a position inside the retriever (from the local position), returns true if the position is close to a border void retrieveAccuratePosition(CVector2s estimated, CCollisionSurfaceTemp &cst, bool &onBorder) const; diff --git a/code/nel/include/nel/pacs/quad_grid.h b/code/nel/include/nel/pacs/quad_grid.h index aa2b383b0..61cf76c6f 100644 --- a/code/nel/include/nel/pacs/quad_grid.h +++ b/code/nel/include/nel/pacs/quad_grid.h @@ -187,11 +187,11 @@ private:// Atttributes. private:// Methods. // return the coordinates on the grid of what include the bbox. - void selectQuads(CVector bmin, CVector bmax, sint &x0, sint &x1, sint &y0, sint &y1) + void selectQuads(const CVector &bminp, const CVector &bmaxp, sint &x0, sint &x1, sint &y0, sint &y1) { - CVector bminp, bmaxp; - bminp= bmin; - bmaxp= bmax; + CVector bmin, bmax; + bmin= bminp; + bmax= bmaxp; bmin.minof(bminp, bmaxp); bmax.maxof(bminp, bmaxp); bmin/= _EltSize; diff --git a/code/nel/include/nel/sound/background_sound_manager.h b/code/nel/include/nel/sound/background_sound_manager.h index 11f33d2be..326ece0f3 100644 --- a/code/nel/include/nel/sound/background_sound_manager.h +++ b/code/nel/include/nel/sound/background_sound_manager.h @@ -270,7 +270,7 @@ private: /// flag if inside a sound zone bool Inside; /// Constructor. 
- TSoundStatus(TSoundData &sd, NLMISC::CVector position, float gain, float distance, bool inside) + TSoundStatus(TSoundData &sd, const NLMISC::CVector &position, float gain, float distance, bool inside) : SoundData(sd), Position(position), Gain(gain), Distance(distance), Inside(inside) {} }; diff --git a/code/nel/src/3d/computed_string.cpp b/code/nel/src/3d/computed_string.cpp index a57191cc0..1c8962f5e 100644 --- a/code/nel/src/3d/computed_string.cpp +++ b/code/nel/src/3d/computed_string.cpp @@ -143,11 +143,13 @@ void CComputedString::render2D (IDriver& driver, /*------------------------------------------------------------------*\ render3D() \*------------------------------------------------------------------*/ -void CComputedString::render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot) +void CComputedString::render3D (IDriver& driver,const CMatrix &matrixp,THotSpot hotspot) { if (Vertices.getNumVertices() == 0) return; + CMatrix matrix = matrixp; + // get window size uint32 wndWidth, wndHeight; driver.getWindowSize(wndWidth, wndHeight); diff --git a/code/nel/src/3d/mesh_mrm_skin_template.cpp b/code/nel/src/3d/mesh_mrm_skin_template.cpp index 1958cae90..808dce31a 100644 --- a/code/nel/src/3d/mesh_mrm_skin_template.cpp +++ b/code/nel/src/3d/mesh_mrm_skin_template.cpp @@ -494,7 +494,7 @@ void CMeshMRMGeom::applySkinWithTangentSpace(CLod &lod, const CSkeletonModel *sk On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms) */ -#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) +#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2) //#define NL3D_RAWSKIN_PRECACHE #define NL3D_RAWSKIN_ASM #endif diff --git a/code/nel/src/3d/mesh_mrm_skinned_template.cpp b/code/nel/src/3d/mesh_mrm_skinned_template.cpp index 5d1b2f582..be072713f 100644 --- a/code/nel/src/3d/mesh_mrm_skinned_template.cpp +++ b/code/nel/src/3d/mesh_mrm_skinned_template.cpp @@ -43,7 +43,7 @@ On a 
P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms) */ -#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) +#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2) //#define NL3D_RAWSKIN_PRECACHE #define NL3D_RAWSKIN_ASM #endif diff --git a/code/nel/src/3d/ps_zone.cpp b/code/nel/src/3d/ps_zone.cpp index 30349906f..813103896 100644 --- a/code/nel/src/3d/ps_zone.cpp +++ b/code/nel/src/3d/ps_zone.cpp @@ -267,7 +267,7 @@ CVector CPSZonePlane::getNormal(uint32 index) NL_PS_FUNC(CPSZonePlane_getNormal) return _Normal[index]; } -void CPSZonePlane::setNormal(uint32 index, CVector n) +void CPSZonePlane::setNormal(uint32 index, const CVector &n) { NL_PS_FUNC(CPSZonePlane_setNormal) _Normal[index] = n; @@ -576,7 +576,7 @@ CVector CPSZoneDisc::getNormal(uint32 index) NL_PS_FUNC(CPSZoneDisc_getNormal) return _Normal[index]; } -void CPSZoneDisc::setNormal(uint32 index, CVector n) +void CPSZoneDisc::setNormal(uint32 index, const CVector &n) { NL_PS_FUNC(CPSZoneDisc_setNormal) _Normal[index] = n; diff --git a/code/nel/src/3d/zone_lighter.cpp b/code/nel/src/3d/zone_lighter.cpp index 1d7ec5a66..b78fa5635 100644 --- a/code/nel/src/3d/zone_lighter.cpp +++ b/code/nel/src/3d/zone_lighter.cpp @@ -3109,7 +3109,7 @@ void CZoneLighter::addWaterShape(CWaterShape *shape, const NLMISC::CMatrix &MT) } // *********************************************************** -void CZoneLighter::makeQuadGridFromWaterShapes(NLMISC::CAABBox zoneBBox) +void CZoneLighter::makeQuadGridFromWaterShapes(const NLMISC::CAABBox &zoneBBox) { if (!_WaterShapes.size()) return; diff --git a/code/nel/src/ligo/primitive.cpp b/code/nel/src/ligo/primitive.cpp index 9cf7df13f..34b650f2a 100644 --- a/code/nel/src/ligo/primitive.cpp +++ b/code/nel/src/ligo/primitive.cpp @@ -875,7 +875,7 @@ bool CPrimZone::contains (const NLMISC::CVector &v, const std::vector & // 
*************************************************************************** -float CPrimZone::getSegmentDist(const NLMISC::CVector v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos) +float CPrimZone::getSegmentDist(const NLMISC::CVector &v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos) { // two points, compute distance to the segment. CVector V = (p2-p1).normed(); diff --git a/code/nel/src/misc/common.cpp b/code/nel/src/misc/common.cpp index 36e167260..b58792a65 100644 --- a/code/nel/src/misc/common.cpp +++ b/code/nel/src/misc/common.cpp @@ -71,6 +71,61 @@ extern "C" long _ftol2( double dblSource ) { return _ftol( dblSource ); } #endif // NL_OS_WINDOWS +#ifdef HAS_SSE2 /* NOTE(review): the build defines USE_SSE2 (CMake option WITH_SSE2), not HAS_SSE2, so this section is compiled out unless HAS_SSE2 is defined elsewhere - confirm intent */ + +# ifdef NL_COMP_VC + +inline void *aligned_malloc(size_t size, size_t alignment) +{ + return _aligned_malloc(size, alignment); +} + +inline void aligned_free(void *p) +{ + _aligned_free(p); /* memory from _aligned_malloc must be released with _aligned_free */ +} + +# else + +inline void *aligned_malloc(size_t size, size_t alignment) +{ + return memalign(alignment, size); +} + +inline void aligned_free(void *ptr) +{ + free(ptr); +} + +# endif /* NL_COMP_ */ + +void *operator new(size_t size) throw(std::bad_alloc) +{ + void *p = aligned_malloc(size, NL_DEFAULT_MEMORY_ALIGNMENT); + if (p == NULL) throw std::bad_alloc(); + return p; +} + +void *operator new[](size_t size) throw(std::bad_alloc) +{ + void *p = aligned_malloc(size, NL_DEFAULT_MEMORY_ALIGNMENT); + if (p == NULL) throw std::bad_alloc(); + return p; +} + +void operator delete(void *p) throw() +{ + aligned_free(p); +} + +void operator delete[](void *p) throw() +{ + aligned_free(p); +} + +#endif /* HAS_SSE2 */ + + #ifdef DEBUG_NEW #define new DEBUG_NEW #endif diff --git a/code/nel/src/misc/matrix.cpp b/code/nel/src/misc/matrix.cpp index dd884f4d5..e99e04304 100644 --- a/code/nel/src/misc/matrix.cpp +++ b/code/nel/src/misc/matrix.cpp @@ -16,6 +16,11 @@ #include "stdmisc.h" +#if (USE_SSE2) +# include +# include +#endif + #include 
"nel/misc/matrix.h" #include "nel/misc/plane.h" #include "nel/misc/debug.h" @@ -690,10 +695,86 @@ void CMatrix::scale(const CVector &v) // ====================================================================================================== // ====================================================================================================== +void CMatrix::setMulMatrixSSE2(const CMatrix &m1, const CMatrix &m2) +{ + m1.testExpandRot(); + m1.testExpandProj(); + m2.testExpandRot(); + m2.testExpandProj(); + + // Use exactly the 8 MMX registers we have + register __m128 in1a = _mm_loadu_ps(&m1.M[0]); + register __m128 in1b = _mm_loadu_ps(&m1.M[4]); + register __m128 in1c = _mm_loadu_ps(&m1.M[8]); + register __m128 in1d = _mm_loadu_ps(&m1.M[12]); + register __m128 in2; + register __m128 outrow; + register __m128 tempsplat; + register __m128 tempmul; + + in2 = _mm_loadu_ps(&m2.M[0]); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0)); + outrow = _mm_mul_ps(in1a, tempsplat); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1)); + tempmul = _mm_mul_ps(in1b, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2)); + tempmul = _mm_mul_ps(in1c, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3)); + tempmul = _mm_mul_ps(in1d, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + _mm_storeu_ps(&M[0], outrow); + + in2 = _mm_loadu_ps(&m2.M[4]); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0)); + outrow = _mm_mul_ps(in1a, tempsplat); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1)); + tempmul = _mm_mul_ps(in1b, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2)); + tempmul = _mm_mul_ps(in1c, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3)); + tempmul = _mm_mul_ps(in1d, 
tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + _mm_storeu_ps(&M[4], outrow); + + in2 = _mm_loadu_ps(&m2.M[8]); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0)); + outrow = _mm_mul_ps(in1a, tempsplat); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1)); + tempmul = _mm_mul_ps(in1b, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2)); + tempmul = _mm_mul_ps(in1c, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3)); + tempmul = _mm_mul_ps(in1d, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + _mm_storeu_ps(&M[8], outrow); + + in2 = _mm_loadu_ps(&m2.M[12]); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0)); + outrow = _mm_mul_ps(in1a, tempsplat); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1)); + tempmul = _mm_mul_ps(in1b, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2)); + tempmul = _mm_mul_ps(in1c, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3)); + tempmul = _mm_mul_ps(in1d, tempsplat); + outrow = _mm_add_ps(outrow, tempmul); + _mm_storeu_ps(&M[12], outrow); +} // *************************************************************************** void CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2) { +#if USE_SSE2 + setMulMatrixSSE2(m1, m2); +#else /* For a fast MulMatrix, it appears to be better to not take State bits into account (no test/if() overhead) Just do heavy mul all the time (common case, and not so slow) @@ -720,6 +801,7 @@ void CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2) a14= m1.a11*m2.a14 + m1.a12*m2.a24 + m1.a13*m2.a34 + m1.a14; a24= m1.a21*m2.a14 + m1.a22*m2.a24 + m1.a23*m2.a34 + m1.a24; a34= m1.a31*m2.a14 + m1.a32*m2.a24 + m1.a33*m2.a34 + m1.a34; +#endif // Setup no proj at all, and force 
valid rot (still may be identity, but 0/1 are filled) StateBit= (m1.StateBit | m2.StateBit | MAT_VALIDROT) & ~(MAT_PROJ|MAT_VALIDPROJ); @@ -737,6 +819,13 @@ void CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2) void CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2) { // Do *this= m1*m2 +#ifdef USE_SSE2 + setMulMatrixSSE2(m1, m2); + StateBit = m1.StateBit | m2.StateBit; + StateBit |= MAT_VALIDALL; + if (m1.hasTrans() && m2.hasProj()) + StateBit |= MAT_ROT | MAT_SCALEANY; +#else identity(); StateBit= m1.StateBit | m2.StateBit; StateBit&= ~MAT_VALIDALL; @@ -824,18 +913,22 @@ void CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2) a32+= m1.a34*m2.a42; a33+= m1.a34*m2.a43; } +#endif // Modify Scale. if( (StateBit & MAT_SCALEUNI) && !(StateBit & MAT_SCALEANY) ) { // Must have correct Scale33 +#ifndef USE_SSE2 m1.testExpandRot(); m2.testExpandRot(); +#endif Scale33= m1.Scale33*m2.Scale33; } else Scale33=1; +#ifndef USE_SSE2 // In every case, I am valid now! StateBit|=MAT_VALIDROT; @@ -902,6 +995,7 @@ void CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2) { // Don't copy proj part, and leave MAT_VALIDPROJ not set } +#endif } // ====================================================================================================== void CMatrix::invert() @@ -1237,11 +1331,36 @@ bool CMatrix::normalize(TRotOrder ro) // ====================================================================================================== // ====================================================================================================== - // ====================================================================================================== CVector CMatrix::mulVector(const CVector &v) const { - +#ifdef USE_SSE2 + if (hasRot()) + { + CVector ret; + register __m128 in1a = _mm_loadu_ps(&M[0]); + register __m128 in1b = _mm_loadu_ps(&M[4]); + register __m128 in1c = _mm_loadu_ps(&M[8]); + register __m128 in2 = _mm_setr_ps(v.x, v.y, v.z, 0.0f); /* load only x, y, z:
   no read past the end of the CVector, the 4th lane is never used */ + register __m128 tempsplat; + register __m128 tempmul; + register __m128 out; + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0)); + out = _mm_mul_ps(in1a, tempsplat); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1)); + tempmul = _mm_mul_ps(in1b, tempsplat); + out = _mm_add_ps(out, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2)); + tempmul = _mm_mul_ps(in1c, tempsplat); + out = _mm_add_ps(out, tempmul); + float res[4]; _mm_storeu_ps(res, out); ret.x = res[0]; ret.y = res[1]; ret.z = res[2]; /* store via a 4-float buffer so the 3-float ret is not overrun */ + return ret; + } + else + { + return v; + } +#else CVector ret; if( hasRot() ) @@ -1253,6 +1372,7 @@ CVector CMatrix::mulVector(const CVector &v) const } else return v; +#endif } // ====================================================================================================== @@ -1263,9 +1383,31 @@ CVector CMatrix::mulPoint(const CVector &v) const if( hasRot() ) { +#ifdef USE_SSE2 + register __m128 in1a = _mm_loadu_ps(&M[0]); + register __m128 in1b = _mm_loadu_ps(&M[4]); + register __m128 in1c = _mm_loadu_ps(&M[8]); + register __m128 in1d = _mm_loadu_ps(&M[12]); + register __m128 in2 = _mm_setr_ps(v.x, v.y, v.z, 0.0f); /* load only x, y, z: no read past the end of the CVector */ + register __m128 tempsplat; + register __m128 tempmul; + register __m128 out; + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0)); + out = _mm_mul_ps(in1a, tempsplat); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1)); + tempmul = _mm_mul_ps(in1b, tempsplat); + out = _mm_add_ps(out, tempmul); + tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2)); + tempmul = _mm_mul_ps(in1c, tempsplat); + out = _mm_add_ps(out, tempmul); + out = _mm_add_ps(out, in1d); + float res[4]; _mm_storeu_ps(res, out); ret.x = res[0]; ret.y = res[1]; ret.z = res[2]; /* store via a 4-float buffer so the 3-float ret is not overrun */ + return ret; 
+#else ret.x= a11*v.x + a12*v.y + a13*v.z; ret.y= a21*v.x + a22*v.y + a23*v.z; ret.z= a31*v.x + a32*v.y + a33*v.z; +#endif } else { diff --git a/code/nel/src/misc/polygon.cpp b/code/nel/src/misc/polygon.cpp index b541d2eba..2cd60058d 100644 --- a/code/nel/src/misc/polygon.cpp +++ b/code/nel/src/misc/polygon.cpp @@ -249,7 +249,7 @@ public: Back = NULL; Front = NULL; } - CBSPNode2v ( const CPlane &plane, CVector p0, CVector p1, uint v0, uint v1 ) : Plane (plane), P0 (p0), P1 (p1) + CBSPNode2v ( const CPlane &plane, const CVector &p0, const CVector &p1, uint v0, uint v1 ) : Plane (plane), P0 (p0), P1 (p1) { Back = NULL; Front = NULL; diff --git a/code/nel/src/pacs/chain_quad.cpp b/code/nel/src/pacs/chain_quad.cpp index 321761953..c7af9785b 100644 --- a/code/nel/src/pacs/chain_quad.cpp +++ b/code/nel/src/pacs/chain_quad.cpp @@ -344,8 +344,11 @@ sint CChainQuad::selectEdges(const NLMISC::CAABBox &bbox, CCollisionSurfaceTem return nRes; } -sint CChainQuad::selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const +sint CChainQuad::selectEdges(const CVector &startp, const CVector &endp, CCollisionSurfaceTemp &cst) const { + CVector start = startp; + CVector end = endp; + sint nRes=0; sint i; uint16 *ochainLUT= cst.OChainLUT; diff --git a/code/nel/src/pacs/edge_quad.cpp b/code/nel/src/pacs/edge_quad.cpp index 1515af075..14082a3b2 100644 --- a/code/nel/src/pacs/edge_quad.cpp +++ b/code/nel/src/pacs/edge_quad.cpp @@ -453,8 +453,11 @@ sint CEdgeQuad::selectEdges(const NLMISC::CAABBox &bbox, CCollisionSurfaceTemp return nRes; } -sint CEdgeQuad::selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const +sint CEdgeQuad::selectEdges(const CVector &startp, const CVector &endp, CCollisionSurfaceTemp &cst) const { + CVector start = startp; + CVector end = endp; + sint nRes=0; sint i; uint16 *indexLUT= cst.OChainLUT; diff --git a/code/nel/src/pacs/local_retriever.cpp b/code/nel/src/pacs/local_retriever.cpp index 7158cee0a..1b18a6052 100644 --- 
a/code/nel/src/pacs/local_retriever.cpp +++ b/code/nel/src/pacs/local_retriever.cpp @@ -1052,7 +1052,7 @@ bool NLPACS::CLocalRetriever::testPosition(NLPACS::ULocalPosition &local, CColli } -void NLPACS::CLocalRetriever::retrievePosition(CVector estimated, CCollisionSurfaceTemp &cst) const +void NLPACS::CLocalRetriever::retrievePosition(const CVector &estimated, CCollisionSurfaceTemp &cst) const { if (!_Loaded) return; @@ -2200,7 +2200,7 @@ void NLPACS::CLocalRetriever::replaceChain(uint32 chainId, const std::vector= _Surfaces.size()) return false; diff --git a/code/ryzom/client/src/decal.cpp b/code/ryzom/client/src/decal.cpp index 1454d9f59..bfcf4dc4b 100644 --- a/code/ryzom/client/src/decal.cpp +++ b/code/ryzom/client/src/decal.cpp @@ -433,10 +433,16 @@ void CDecal::renderTriCache(NL3D::IDriver &drv, NL3D::CShadowPolyReceiver &/* float bottomBlendBias = bottomBlendScale * (_RefPosition.z - _BottomBlendZMin); do { +#if USE_SSE2 + dest->X = srcVert->X; + dest->Y = srcVert->Y; + dest->Z = srcVert->Z; +#else dest->V = srcVert->V; - float dist = (camPos - srcVert->V).norm(); +#endif + float dist = (camPos - srcVert->asVector()).norm(); float intensity = scale * dist + bias; - float bottomBlend = srcVert->V.z * bottomBlendScale + bottomBlendBias; + float bottomBlend = srcVert->asVector().z * bottomBlendScale + bottomBlendBias; clamp(bottomBlend, 0.f, 1.f); clamp(intensity, 0.f, 255.f); intensity *= bottomBlend;