SSE2: Initial testing implementation

--HG--
branch : sse2
hg/feature/sse2
kaetemi 11 years ago
parent 0b4e64f0c9
commit 5f54f75802

@ -131,6 +131,10 @@ IF(FINAL_VERSION)
ADD_DEFINITIONS(-DFINAL_VERSION=1)
ENDIF(FINAL_VERSION)
IF(WITH_SSE2)
ADD_DEFINITIONS(-DUSE_SSE2)
ENDIF(WITH_SSE2)
IF(WITH_QT)
FIND_PACKAGE(Qt4 COMPONENTS QtCore QtGui QtXml QtOpenGL REQUIRED)
ENDIF(WITH_QT)

@ -324,6 +324,8 @@ MACRO(NL_SETUP_NEL_DEFAULT_OPTIONS)
OPTION(WITH_LIBOVR "With LibOVR support" OFF)
OPTION(WITH_LIBVR "With LibVR support" OFF)
OPTION(WITH_PERFHUD "With NVIDIA PerfHUD support" OFF)
OPTION(WITH_SSE2 "With SSE2" ON )
ENDMACRO(NL_SETUP_NEL_DEFAULT_OPTIONS)
MACRO(NL_SETUP_NELNS_DEFAULT_OPTIONS)

@ -290,7 +290,7 @@ public:
* \param matrix transformation matrix
* \param hotspot position of string origin
*/
void render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot = MiddleMiddle);
void render3D (IDriver& driver, const CMatrix &matrix, THotSpot hotspot = MiddleMiddle);
};

@ -33,14 +33,14 @@ namespace NL3D {
*/
template <class T>
inline T PSBinOpModulate(T arg1, T arg2) { return arg1 * arg2; }
inline T PSBinOpModulate(const T &arg1, const T &arg2) { return arg1 * arg2; }
template <class T>
inline T PSBinOpAdd(T arg1, T arg2) { return arg1 + arg2; }
inline T PSBinOpAdd(const T &arg1, const T &arg2) { return arg1 + arg2; }
template <class T>
inline T PSBinOpSubtract(T arg1, T arg2) { return arg1 - arg2; }
inline T PSBinOpSubtract(const T &arg1, const T &arg2) { return arg1 - arg2; }
template <>
inline CPlaneBasis PSBinOpModulate(CPlaneBasis p1, CPlaneBasis p2)
inline CPlaneBasis PSBinOpModulate(const CPlaneBasis &p1, const CPlaneBasis &p2)
{
// we compute p1 * p2
NLMISC::CVector z = p1.X ^ p1.Y;
@ -57,13 +57,13 @@ inline CPlaneBasis PSBinOpModulate(CPlaneBasis p1, CPlaneBasis p2)
}
template <>
inline CPlaneBasis PSBinOpAdd(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)
inline CPlaneBasis PSBinOpAdd(const CPlaneBasis &/* p1 */, const CPlaneBasis &/* p2 */)
{
nlassert(0); // not allowed for now
return CPlaneBasis(NLMISC::CVector::Null);
}
template <>
inline CPlaneBasis PSBinOpSubtract(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)
inline CPlaneBasis PSBinOpSubtract(const CPlaneBasis &/* p1 */, const CPlaneBasis &/* p2 */)
{
nlassert(0); // not allowed for now
return CPlaneBasis(NLMISC::CVector::Null);
@ -71,21 +71,21 @@ inline CPlaneBasis PSBinOpSubtract(CPlaneBasis /* p1 */, CPlaneBasis /* p2 */)
template <>
inline uint32 PSBinOpSubtract(uint32 lhs, uint32 rhs)
inline uint32 PSBinOpSubtract(const uint32 &lhs, const uint32 &rhs)
{
return rhs > lhs ? 0 : lhs - rhs; // avoid overflow
}
template <>
inline NLMISC::CRGBA PSBinOpModulate(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
inline NLMISC::CRGBA PSBinOpModulate(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
{
NLMISC::CRGBA result;
result.modulateFromColor(t1, t2);
return result;
}
template <>
inline NLMISC::CRGBA PSBinOpAdd(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
inline NLMISC::CRGBA PSBinOpAdd(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
{
NLMISC::CRGBA r;
uint S = t1.R + t2.R; if (S > 255) S = 255; r.R = (uint8) S;
@ -94,7 +94,7 @@ inline NLMISC::CRGBA PSBinOpAdd(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
return r;
}
template <>
inline NLMISC::CRGBA PSBinOpSubtract(NLMISC::CRGBA t1, NLMISC::CRGBA t2)
inline NLMISC::CRGBA PSBinOpSubtract(const NLMISC::CRGBA &t1, const NLMISC::CRGBA &t2)
{
NLMISC::CRGBA r;
sint S = t1.R - t2.R; if (S < 0) S = 0; r.R = (uint8) S;

@ -1432,7 +1432,7 @@ public:
*
*/
virtual void setDefaultValue(T defaultValue) { _DefaultValue = defaultValue;}
virtual void setDefaultValue(const T &defaultValue) { _DefaultValue = defaultValue;}
/// get the default value :
virtual T getDefaultValue(void) const { return _DefaultValue; }

@ -68,7 +68,7 @@ template <typename T> struct CPSValueBlendFuncBase
{
virtual ~CPSValueBlendFuncBase() {}
virtual void getValues(T &startValue, T &endValue) const = 0;
virtual void setValues(T startValue, T endValue) = 0;
virtual void setValues(const T &startValue, const T &endValue) = 0;
};
@ -122,7 +122,7 @@ public:
}
/// Set the Values between which to blend.
virtual void setValues(T startValue, T endValue)
virtual void setValues(const T &startValue, const T &endValue)
{
_StartValue = startValue;
_EndValue = endValue;
@ -210,7 +210,7 @@ public:
/// set the Values
virtual void setValues(T startValue, T endValue)
virtual void setValues(const T &startValue, const T &endValue)
{
float step = 1.f / n;
float alpha = 0.0f;

@ -62,7 +62,7 @@ public:
endValue = convertVBColor(endValue, _ColorType);
}
virtual void setValues(NLMISC::CRGBA startValue, NLMISC::CRGBA endValue)
virtual void setValues(const NLMISC::CRGBA &startValue, const NLMISC::CRGBA &endValue)
{
CPSValueBlendFunc<NLMISC::CRGBA>::setValues(convertVBColor(startValue, _ColorType), convertVBColor(endValue, _ColorType));
}
@ -96,7 +96,7 @@ public:
endValue = convertVBColor(endValue, _ColorType);
}
virtual void setValues(NLMISC::CRGBA startValue, NLMISC::CRGBA endValue)
virtual void setValues(const NLMISC::CRGBA &startValue, const NLMISC::CRGBA &endValue)
{
CPSValueBlendSampleFunc<NLMISC::CRGBA, RGBA_BLENDER_NUM_VALUES>::setValues(convertVBColor(startValue, _ColorType), convertVBColor(endValue, _ColorType));
}

@ -82,7 +82,7 @@ struct IPSMover
virtual NLMISC::CVector getNormal(uint32 /* index */) { NL_PS_FUNC(getNormal); return NLMISC::CVector::Null ; }
/// if the object only stores a normal, this set the normal of the object. Otherwise it has no effect
virtual void setNormal(uint32 /* index */, NLMISC::CVector /* n */) { NL_PS_FUNC(setNormal); }
virtual void setNormal(uint32 /* index */, const NLMISC::CVector &/* n */) { NL_PS_FUNC(setNormal); }
// set a new orthogonal matrix for the object
virtual void setMatrix(uint32 index, const NLMISC::CMatrix &m) = 0 ;

@ -741,7 +741,7 @@ public:
virtual NLMISC::CVector getScale(uint32 k) const { return NLMISC::CVector(_Radius[k], _Radius[k], _Radius[k]); }
virtual bool onlyStoreNormal(void) const { return true; }
virtual NLMISC::CVector getNormal(uint32 index) { return _Normal[index]; }
virtual void setNormal(uint32 index, NLMISC::CVector n) { _Normal[index] = n; }
virtual void setNormal(uint32 index, const NLMISC::CVector &n) { _Normal[index] = n; }
virtual void setMatrix(uint32 index, const NLMISC::CMatrix &m);
virtual NLMISC::CMatrix getMatrix(uint32 index) const;

@ -613,7 +613,7 @@ public:
struct CParametricInfo
{
CParametricInfo() {}
CParametricInfo(NLMISC::CVector pos, NLMISC::CVector speed, float date)
CParametricInfo(const NLMISC::CVector &pos, const NLMISC::CVector &speed, float date)
: Pos(pos), Speed(speed), Date(date)
{
}

@ -153,7 +153,7 @@ class CPSZonePlane : public CPSZone, public IPSMover
virtual NLMISC::CMatrix getMatrix(uint32 index) const;
virtual bool onlyStoreNormal(void) const { return true; }
virtual NLMISC::CVector getNormal(uint32 index);
virtual void setNormal(uint32 index, NLMISC::CVector n);
virtual void setNormal(uint32 index, const NLMISC::CVector &n);
virtual void serial(NLMISC::IStream &f) throw(NLMISC::EStream);
@ -255,7 +255,7 @@ class CPSZoneDisc : public CPSZone, public IPSMover
virtual NLMISC::CVector getScale(uint32 k) const;
virtual bool onlyStoreNormal(void) const { return true; }
virtual NLMISC::CVector getNormal(uint32 index);
virtual void setNormal(uint32 index, NLMISC::CVector n);
virtual void setNormal(uint32 index, const NLMISC::CVector &n);
virtual void serial(NLMISC::IStream &f) throw(NLMISC::EStream);

@ -314,11 +314,11 @@ private:// Methods.
}
// return the coordinates on the grid of what include the bbox.
void selectQuads(CVector bmin, CVector bmax, sint &x0, sint &x1, sint &y0, sint &y1)
void selectQuads(const CVector &bminp, const CVector &bmaxp, sint &x0, sint &x1, sint &y0, sint &y1)
{
CVector bminp, bmaxp;
bminp= bmin;
bmaxp= bmax;
CVector bmin, bmax;
bmin= bminp;
bmax= bmaxp;
bmin.minof(bminp, bmaxp);
bmax.maxof(bminp, bmaxp);
bmin/= _EltSize;

@ -85,10 +85,27 @@ public:
// A vertex: a position followed by a color.
// With USE_SSE2 the position is stored as three bare floats (X, Y, Z); otherwise it is a
// CVector member (V). asVector() gives read access to the position as a CVector in both layouts.
struct CRGBAVertex
{
#if USE_SSE2
	float X, Y, Z;
#else
	CVector V;
#endif
	CRGBA Color;
	CRGBAVertex() {}
#if USE_SSE2
	// Build from a position and a color (float layout).
	CRGBAVertex(const CVector &v, CRGBA c) : X(v.x), Y(v.y), Z(v.z), Color(c) {}
	// View the leading X, Y, Z floats as a CVector.
	const CVector &asVector() const
	{
		//nlctassert(sizeof(CVector) == sizeof(CRGBAVertex));
		// Layout check: the vertex must be exactly one CVector plus 4 trailing bytes
		// (presumably Color - confirm sizeof(CRGBA) == 4).
		nlctassert(sizeof(CVector) + 4 == sizeof(CRGBAVertex));
		// The original statement discarded the cast result and fell off the end of a
		// non-void function; the reference must actually be returned.
		return *reinterpret_cast<const CVector *>(this);
	}
#else
	// Build from a position and a color (CVector layout).
	CRGBAVertex(const CVector &v, CRGBA c) : V(v), Color(c) {}
	// The position is already stored as a CVector.
	const CVector &asVector() const
	{
		return V;
	}
#endif
};
/** Compute list of clipped tri under the shadow mat

@ -102,8 +102,10 @@ private:// Atttributes.
// return the coordinates on the grid of what include the bbox.
void selectPoint(CVector point, sint &x0, sint &y0)
void selectPoint(const CVector &pointp, sint &x0, sint &y0)
{
CVector point = pointp;
point/= _EltSize;
x0= (sint)(floor(point.x));
y0= (sint)(floor(point.y));

@ -421,7 +421,7 @@ private:
* The vector of water shapes is released then
* \param bbox the bbox of the zone containing the water shapes
*/
void makeQuadGridFromWaterShapes(NLMISC::CAABBox zoneBBox);
void makeQuadGridFromWaterShapes(const NLMISC::CAABBox &zoneBBox);
/** For each tile of the current zone, check whether it below or above water.

@ -523,7 +523,7 @@ public:
std::vector<CPrimVector> VPoints;
static float getSegmentDist(const NLMISC::CVector v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos);
static float getSegmentDist(const NLMISC::CVector &v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos);
public:

@ -328,6 +328,20 @@ typedef unsigned int uint; // at least 32bits (depend of processor)
#endif // NL_OS_UNIX
#define NL_DEFAULT_MEMORY_ALIGNMENT 16
#ifdef NL_COMP_VC
#define NL_ALIGN(nb) __declspec(align(nb))
#else
#define NL_ALIGN(nb) __attribute__((aligned(nb)))
#endif
#ifdef USE_SSE2
extern void *operator new(size_t size) throw(std::bad_alloc);
extern void *operator new[](size_t size) throw(std::bad_alloc);
extern void operator delete(void *p) throw();
extern void operator delete[](void *p) throw();
#endif
// CHashMap, CHashSet and CHashMultiMap definitions
#if defined(_STLPORT_VERSION) // STLport detected
# include <hash_map>

@ -81,7 +81,7 @@ public:
* \param cst the array of CEdgeChainEntry to fill. contain also OChainLUT, an array for internal use. In: must be filled with 0xFFFF. Out: still filled with 0xFFFF.
* \return number of edgechain found. stored in cst.EdgeChainEntries (array cleared first).
*/
sint selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const;
sint selectEdges(const CVector &start, const CVector &end, CCollisionSurfaceTemp &cst) const;
/// serial.

@ -92,7 +92,7 @@ public:
* \param cst the array of CExteriorEdgeEntry to fill. contain also OChainLUT, an array for internal use. In: must be filled with 0xFFFF. Out: still filled with 0xFFFF.
* \return number of exterioredge found. stored in cst.ExteriorEdgeEntries (array cleared first).
*/
sint selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const;
sint selectEdges(const CVector &start, const CVector &end, CCollisionSurfaceTemp &cst) const;
/// Get the whole set of edge entries

@ -548,12 +548,12 @@ public:
/**
* Check all surfaces integrity
*/
bool checkSurfacesIntegrity(NLMISC::CVector translation = NLMISC::CVector::Null, bool verbose = false) const;
bool checkSurfacesIntegrity(const NLMISC::CVector &translation = NLMISC::CVector::Null, bool verbose = false) const;
/**
* Check surface integrity
*/
bool checkSurfaceIntegrity(uint surf, NLMISC::CVector translation = NLMISC::CVector::Null, bool verbose = false) const;
bool checkSurfaceIntegrity(uint surf, const NLMISC::CVector &translation = NLMISC::CVector::Null, bool verbose = false) const;
// @}
@ -565,7 +565,7 @@ protected:
bool insurePosition(ULocalPosition &local) const;
/// Retrieves a position inside the retriever (from the local position), returns true if the position is close to a border
void retrievePosition(NLMISC::CVector estimated, CCollisionSurfaceTemp &cst) const;
void retrievePosition(const NLMISC::CVector &estimated, CCollisionSurfaceTemp &cst) const;
/// Retrieves a position inside the retriever (from the local position), returns true if the position is close to a border
void retrieveAccuratePosition(CVector2s estimated, CCollisionSurfaceTemp &cst, bool &onBorder) const;

@ -187,11 +187,11 @@ private:// Atttributes.
private:// Methods.
// return the coordinates on the grid of what include the bbox.
void selectQuads(CVector bmin, CVector bmax, sint &x0, sint &x1, sint &y0, sint &y1)
void selectQuads(const CVector &bminp, const CVector &bmaxp, sint &x0, sint &x1, sint &y0, sint &y1)
{
CVector bminp, bmaxp;
bminp= bmin;
bmaxp= bmax;
CVector bmin, bmax;
bmin= bminp;
bmax= bmaxp;
bmin.minof(bminp, bmaxp);
bmax.maxof(bminp, bmaxp);
bmin/= _EltSize;

@ -270,7 +270,7 @@ private:
/// flag if inside a sound zone
bool Inside;
/// Constructor.
TSoundStatus(TSoundData &sd, NLMISC::CVector position, float gain, float distance, bool inside)
TSoundStatus(TSoundData &sd, const NLMISC::CVector &position, float gain, float distance, bool inside)
: SoundData(sd), Position(position), Gain(gain), Distance(distance), Inside(inside)
{}
};

@ -143,11 +143,13 @@ void CComputedString::render2D (IDriver& driver,
/*------------------------------------------------------------------*\
render3D()
\*------------------------------------------------------------------*/
void CComputedString::render3D (IDriver& driver,CMatrix matrix,THotSpot hotspot)
void CComputedString::render3D (IDriver& driver,const CMatrix &matrixp,THotSpot hotspot)
{
if (Vertices.getNumVertices() == 0)
return;
CMatrix matrix = matrixp;
// get window size
uint32 wndWidth, wndHeight;
driver.getWindowSize(wndWidth, wndHeight);

@ -494,7 +494,7 @@ void CMeshMRMGeom::applySkinWithTangentSpace(CLod &lod, const CSkeletonModel *sk
On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm
saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms)
*/
#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2)
//#define NL3D_RAWSKIN_PRECACHE
#define NL3D_RAWSKIN_ASM
#endif

@ -43,7 +43,7 @@
On a P4-2.4Ghz, for 40000 vertices skinned, both no precaching and asm
saves 27% of execution time in the applyRawSkinNormal*() loop (ie 1 ms)
*/
#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM) &&!defined(USE_SSE2)
//#define NL3D_RAWSKIN_PRECACHE
#define NL3D_RAWSKIN_ASM
#endif

@ -267,7 +267,7 @@ CVector CPSZonePlane::getNormal(uint32 index)
NL_PS_FUNC(CPSZonePlane_getNormal)
return _Normal[index];
}
void CPSZonePlane::setNormal(uint32 index, CVector n)
void CPSZonePlane::setNormal(uint32 index, const CVector &n)
{
NL_PS_FUNC(CPSZonePlane_setNormal)
_Normal[index] = n;
@ -576,7 +576,7 @@ CVector CPSZoneDisc::getNormal(uint32 index)
NL_PS_FUNC(CPSZoneDisc_getNormal)
return _Normal[index];
}
void CPSZoneDisc::setNormal(uint32 index, CVector n)
void CPSZoneDisc::setNormal(uint32 index, const CVector &n)
{
NL_PS_FUNC(CPSZoneDisc_setNormal)
_Normal[index] = n;

@ -3109,7 +3109,7 @@ void CZoneLighter::addWaterShape(CWaterShape *shape, const NLMISC::CMatrix &MT)
}
// ***********************************************************
void CZoneLighter::makeQuadGridFromWaterShapes(NLMISC::CAABBox zoneBBox)
void CZoneLighter::makeQuadGridFromWaterShapes(const NLMISC::CAABBox &zoneBBox)
{
if (!_WaterShapes.size()) return;

@ -875,7 +875,7 @@ bool CPrimZone::contains (const NLMISC::CVector &v, const std::vector<CVector> &
// ***************************************************************************
float CPrimZone::getSegmentDist(const NLMISC::CVector v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos)
float CPrimZone::getSegmentDist(const NLMISC::CVector &v, const NLMISC::CVector &p1, const NLMISC::CVector &p2, NLMISC::CVector &nearPos)
{
// two points, compute distance to the segment.
CVector V = (p2-p1).normed();

@ -71,6 +71,61 @@ extern "C" long _ftol2( double dblSource ) { return _ftol( dblSource ); }
#endif // NL_OS_WINDOWS
// Aligned allocation helpers and replacement global operator new/delete.
// Guarded by USE_SSE2: that is the macro the build defines (-DUSE_SSE2) and the one that
// guards the declarations of these operators, so definitions and declarations stay in sync.
#ifdef USE_SSE2
# ifdef NL_COMP_VC
// Allocate 'size' bytes aligned on 'alignment' bytes through the MSVC CRT.
inline void *aligned_malloc(size_t size, size_t alignment)
{
	return _aligned_malloc(size, alignment);
}
// Release a block obtained from aligned_malloc().
inline void aligned_free(void *ptr)
{
	_aligned_free(ptr);
}
# else
// Allocate 'size' bytes aligned on 'alignment' bytes (note the swapped argument order of memalign).
inline void *aligned_malloc(size_t size, size_t alignment)
{
	return memalign(alignment, size);
}
// Release a block obtained from aligned_malloc().
inline void aligned_free(void *ptr)
{
	free(ptr);
}
# endif /* NL_COMP_VC */
// Shared implementation of operator new / operator new[]:
// returns a block aligned on NL_DEFAULT_MEMORY_ALIGNMENT or throws std::bad_alloc.
static inline void *nlAlignedNew(size_t size)
{
	// An allocation function must return a valid non-null pointer even for a zero-size
	// request, so never forward a size of 0 to the underlying allocator.
	if (size == 0) size = 1;
	void *p = aligned_malloc(size, NL_DEFAULT_MEMORY_ALIGNMENT);
	if (p == NULL) throw std::bad_alloc();
	return p;
}
// The exception specifications below are kept identical to the declarations of these operators.
void *operator new(size_t size) throw(std::bad_alloc)
{
	return nlAlignedNew(size);
}
void *operator new[](size_t size) throw(std::bad_alloc)
{
	return nlAlignedNew(size);
}
// Blocks handed out above must be released through aligned_free(), never plain free().
void operator delete(void *p) throw()
{
	aligned_free(p);
}
void operator delete[](void *p) throw()
{
	aligned_free(p);
}
#endif /* USE_SSE2 */
#ifdef DEBUG_NEW
#define new DEBUG_NEW
#endif

@ -16,6 +16,11 @@
#include "stdmisc.h"
#if (USE_SSE2)
# include <xmmintrin.h>
# include <emmintrin.h>
#endif
#include "nel/misc/matrix.h"
#include "nel/misc/plane.h"
#include "nel/misc/debug.h"
@ -690,10 +695,86 @@ void CMatrix::scale(const CVector &v)
// ======================================================================================================
// ======================================================================================================
#if (USE_SSE2)
/**
 * Compute the raw 16-float product of m1 and m2 into this->M using SSE intrinsics.
 * Only M[] is written; StateBit and Scale33 are left for the caller to update.
 * Compiled only when USE_SSE2 is set, because the intrinsic headers are included under the same condition.
 * \param m1 left operand
 * \param m2 right operand
 */
void CMatrix::setMulMatrixSSE2(const CMatrix &m1, const CMatrix &m2)
{
	// Called before the raw M[] arrays of both operands are read.
	m1.testExpandRot();
	m1.testExpandProj();
	m2.testExpandRot();
	m2.testExpandProj();

	// The four 4-float groups of m1 are loaded once, before anything is stored,
	// and stay live across the loop (XMM registers, not MMX).
	const __m128 in1a = _mm_loadu_ps(&m1.M[0]);
	const __m128 in1b = _mm_loadu_ps(&m1.M[4]);
	const __m128 in1c = _mm_loadu_ps(&m1.M[8]);
	const __m128 in1d = _mm_loadu_ps(&m1.M[12]);

	// Each 4-float group of the result depends on the matching group of m2 and on all of m1:
	// out[g] = in1a * m2[g][0] + in1b * m2[g][1] + in1c * m2[g][2] + in1d * m2[g][3]
	// The group of m2 is loaded before the same group of M is stored.
	for (int g = 0; g < 16; g += 4)
	{
		const __m128 in2 = _mm_loadu_ps(&m2.M[g]);
		__m128 outrow = _mm_mul_ps(in1a, _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0)));
		outrow = _mm_add_ps(outrow, _mm_mul_ps(in1b, _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1))));
		outrow = _mm_add_ps(outrow, _mm_mul_ps(in1c, _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2))));
		outrow = _mm_add_ps(outrow, _mm_mul_ps(in1d, _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(3, 3, 3, 3))));
		_mm_storeu_ps(&M[g], outrow);
	}
}
#endif // USE_SSE2
// ***************************************************************************
void CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
{
#if USE_SSE2
setMulMatrixSSE2(m1, m2);
#else
/*
For a fast MulMatrix, it appears to be better to not take State bits into account (no test/if() overhead)
Just do heavy mul all the time (common case, and not so slow)
@ -720,6 +801,7 @@ void CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
a14= m1.a11*m2.a14 + m1.a12*m2.a24 + m1.a13*m2.a34 + m1.a14;
a24= m1.a21*m2.a14 + m1.a22*m2.a24 + m1.a23*m2.a34 + m1.a24;
a34= m1.a31*m2.a14 + m1.a32*m2.a24 + m1.a33*m2.a34 + m1.a34;
#endif
// Setup no proj at all, and force valid rot (still may be identity, but 0/1 are filled)
StateBit= (m1.StateBit | m2.StateBit | MAT_VALIDROT) & ~(MAT_PROJ|MAT_VALIDPROJ);
@ -737,6 +819,13 @@ void CMatrix::setMulMatrixNoProj(const CMatrix &m1, const CMatrix &m2)
void CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
{
// Do *this= m1*m2
#ifdef USE_SSE2
setMulMatrixSSE2(m1, m2);
StateBit = m1.StateBit | m2.StateBit;
StateBit |= MAT_VALIDALL;
if (m1.hasTrans() && m2.hasProj())
StateBit |= MAT_ROT | MAT_SCALEANY;
#else
identity();
StateBit= m1.StateBit | m2.StateBit;
StateBit&= ~MAT_VALIDALL;
@ -824,18 +913,22 @@ void CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
a32+= m1.a34*m2.a42;
a33+= m1.a34*m2.a43;
}
#endif
// Modify Scale.
if( (StateBit & MAT_SCALEUNI) && !(StateBit & MAT_SCALEANY) )
{
// Must have correct Scale33
#ifndef USE_SSE2
m1.testExpandRot();
m2.testExpandRot();
#endif
Scale33= m1.Scale33*m2.Scale33;
}
else
Scale33=1;
#ifndef USE_SSE2
// In every case, I am valid now!
StateBit|=MAT_VALIDROT;
@ -902,6 +995,7 @@ void CMatrix::setMulMatrix(const CMatrix &m1, const CMatrix &m2)
{
// Don't copy proj part, and leave MAT_VALIDPROJ not set
}
#endif
}
// ======================================================================================================
void CMatrix::invert()
@ -1237,11 +1331,36 @@ bool CMatrix::normalize(TRotOrder ro)
// ======================================================================================================
// ======================================================================================================
// ======================================================================================================
CVector CMatrix::mulVector(const CVector &v) const
{
#ifdef USE_SSE2
if (hasRot())
{
	// ret = M[0..3] * v.x + M[4..7] * v.y + M[8..11] * v.z (first three lanes are the result).
	// Each component is broadcast with _mm_set1_ps instead of loading 16 bytes from &v.x,
	// which read past the end of the 3-float CVector.
	__m128 out = _mm_mul_ps(_mm_loadu_ps(&M[0]), _mm_set1_ps(v.x));
	out = _mm_add_ps(out, _mm_mul_ps(_mm_loadu_ps(&M[4]), _mm_set1_ps(v.y)));
	out = _mm_add_ps(out, _mm_mul_ps(_mm_loadu_ps(&M[8]), _mm_set1_ps(v.z)));
	// Store all four lanes into a properly sized scratch buffer; storing straight into a
	// CVector wrote a fourth float past the object and corrupted the stack.
	float lanes[4];
	_mm_storeu_ps(lanes, out);
	return CVector(lanes[0], lanes[1], lanes[2]);
}
else
{
	// No rotation part: the vector is returned unchanged.
	return v;
}
#else
CVector ret;
if( hasRot() )
@ -1253,6 +1372,7 @@ CVector CMatrix::mulVector(const CVector &v) const
}
else
return v;
#endif
}
// ======================================================================================================
@ -1263,9 +1383,31 @@ CVector CMatrix::mulPoint(const CVector &v) const
if( hasRot() )
{
#ifdef USE_SSE2
register __m128 in1a = _mm_loadu_ps(&M[0]);
register __m128 in1b = _mm_loadu_ps(&M[4]);
register __m128 in1c = _mm_loadu_ps(&M[8]);
register __m128 in1d = _mm_loadu_ps(&M[12]);
register __m128 in2 = _mm_loadu_ps(&v.x); // WARNING: Read goes past size of CVector!
register __m128 tempsplat;
register __m128 tempmul;
register __m128 out;
tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(0, 0, 0, 0));
out = _mm_mul_ps(in1a, tempsplat);
tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(1, 1, 1, 1));
tempmul = _mm_mul_ps(in1b, tempsplat);
out = _mm_add_ps(out, tempmul);
tempsplat = _mm_shuffle_ps(in2, in2, _MM_SHUFFLE(2, 2, 2, 2));
tempmul = _mm_mul_ps(in1c, tempsplat);
out = _mm_add_ps(out, tempmul);
out = _mm_add_ps(out, in1d);
_mm_storeu_ps(&ret.x, out);
return ret; // WARNING: Write goes past size of CVector (this occurs on the stack)!
#else
ret.x= a11*v.x + a12*v.y + a13*v.z;
ret.y= a21*v.x + a22*v.y + a23*v.z;
ret.z= a31*v.x + a32*v.y + a33*v.z;
#endif
}
else
{

@ -249,7 +249,7 @@ public:
Back = NULL;
Front = NULL;
}
CBSPNode2v ( const CPlane &plane, CVector p0, CVector p1, uint v0, uint v1 ) : Plane (plane), P0 (p0), P1 (p1)
CBSPNode2v ( const CPlane &plane, const CVector &p0, const CVector &p1, uint v0, uint v1 ) : Plane (plane), P0 (p0), P1 (p1)
{
Back = NULL;
Front = NULL;

@ -344,8 +344,11 @@ sint CChainQuad::selectEdges(const NLMISC::CAABBox &bbox, CCollisionSurfaceTem
return nRes;
}
sint CChainQuad::selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const
sint CChainQuad::selectEdges(const CVector &startp, const CVector &endp, CCollisionSurfaceTemp &cst) const
{
CVector start = startp;
CVector end = endp;
sint nRes=0;
sint i;
uint16 *ochainLUT= cst.OChainLUT;

@ -453,8 +453,11 @@ sint CEdgeQuad::selectEdges(const NLMISC::CAABBox &bbox, CCollisionSurfaceTemp
return nRes;
}
sint CEdgeQuad::selectEdges(CVector start, CVector end, CCollisionSurfaceTemp &cst) const
sint CEdgeQuad::selectEdges(const CVector &startp, const CVector &endp, CCollisionSurfaceTemp &cst) const
{
CVector start = startp;
CVector end = endp;
sint nRes=0;
sint i;
uint16 *indexLUT= cst.OChainLUT;

@ -1052,7 +1052,7 @@ bool NLPACS::CLocalRetriever::testPosition(NLPACS::ULocalPosition &local, CColli
}
void NLPACS::CLocalRetriever::retrievePosition(CVector estimated, CCollisionSurfaceTemp &cst) const
void NLPACS::CLocalRetriever::retrievePosition(const CVector &estimated, CCollisionSurfaceTemp &cst) const
{
if (!_Loaded)
return;
@ -2200,7 +2200,7 @@ void NLPACS::CLocalRetriever::replaceChain(uint32 chainId, const std::vector<NLP
/*
* Check surface integrity
*/
bool NLPACS::CLocalRetriever::checkSurfacesIntegrity(NLMISC::CVector translation, bool verbose) const
bool NLPACS::CLocalRetriever::checkSurfacesIntegrity(const NLMISC::CVector &translation, bool verbose) const
{
bool success = true;
uint surf;
@ -2225,7 +2225,7 @@ bool NLPACS::CLocalRetriever::checkSurfacesIntegrity(NLMISC::CVector translation
/**
* Check surface integrity
*/
bool NLPACS::CLocalRetriever::checkSurfaceIntegrity(uint surf, NLMISC::CVector translation, bool verbose) const
bool NLPACS::CLocalRetriever::checkSurfaceIntegrity(uint surf, const NLMISC::CVector &translation, bool verbose) const
{
if (surf >= _Surfaces.size())
return false;

@ -433,10 +433,16 @@ void CDecal::renderTriCache(NL3D::IDriver &drv, NL3D::CShadowPolyReceiver &/*
float bottomBlendBias = bottomBlendScale * (_RefPosition.z - _BottomBlendZMin);
do
{
#if USE_SSE2
dest->X = srcVert->X;
dest->Y = srcVert->Y;
dest->Z = srcVert->Z;
#else
dest->V = srcVert->V;
float dist = (camPos - srcVert->V).norm();
#endif
float dist = (camPos - srcVert->asVector()).norm();
float intensity = scale * dist + bias;
float bottomBlend = srcVert->V.z * bottomBlendScale + bottomBlendBias;
float bottomBlend = srcVert->asVector().z * bottomBlendScale + bottomBlendBias;
clamp(bottomBlend, 0.f, 1.f);
clamp(intensity, 0.f, 255.f);
intensity *= bottomBlend;

Loading…
Cancel
Save