@ -108,281 +108,6 @@ public:
} ;
// ***************************************************************************
// ***************************************************************************
// SSE Matrix
// ***************************************************************************
// ***************************************************************************
// ***************************************************************************
# if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
/** For fast vector/point multiplication. Special usage for Skinning.
* NB : SSE is no more used ( no speed gain , some memory problem ) , but keep it for possible future usage .
*/
class CMatrix3x4SSE
{
public :
// Order them in memory column first, for SSE column multiplication.
float a11 , a21 , a31 , a41 ;
float a12 , a22 , a32 , a42 ;
float a13 , a23 , a33 , a43 ;
float a14 , a24 , a34 , a44 ;
// Copy from a matrix.
void set ( const CMatrix & mat )
{
const float * m = mat . get ( ) ;
a11 = m [ 0 ] ; a12 = m [ 4 ] ; a13 = m [ 8 ] ; a14 = m [ 12 ] ;
a21 = m [ 1 ] ; a22 = m [ 5 ] ; a23 = m [ 9 ] ; a24 = m [ 13 ] ;
a31 = m [ 2 ] ; a32 = m [ 6 ] ; a33 = m [ 10 ] ; a34 = m [ 14 ] ;
// not used.
a41 = 0 ; a42 = 0 ; a43 = 0 ; a44 = 1 ;
}
// mulSetvector. NB: in should be different as v!! (else don't work).
void mulSetVector ( const CVector & vin , CVector & vout )
{
__asm
{
mov eax , vin
mov ebx , this
mov edi , vout
// Load in vector in op[0]
movss xmm0 , [ eax ] vin . x
movss xmm1 , [ eax ] vin . y
movss xmm2 , [ eax ] vin . z
// Expand op[0] to op[1], op[2], op[3]
shufps xmm0 , xmm0 , 0
shufps xmm1 , xmm1 , 0
shufps xmm2 , xmm2 , 0
// Mul each vector with 3 Matrix column
mulps xmm0 , [ ebx ] this . a11
mulps xmm1 , [ ebx ] this . a12
mulps xmm2 , [ ebx ] this . a13
// Add each column vector.
addps xmm0 , xmm1
addps xmm0 , xmm2
// write the result.
movss [ edi ] vout . x , xmm0
shufps xmm0 , xmm0 , 33
movss [ edi ] vout . y , xmm0
movhlps xmm0 , xmm0
movss [ edi ] vout . z , xmm0
}
}
// mulSetpoint. NB: in should be different as v!! (else don't work).
void mulSetPoint ( const CVector & vin , CVector & vout )
{
__asm
{
mov eax , vin
mov ebx , this
mov edi , vout
// Load in vector in op[0]
movss xmm0 , [ eax ] vin . x
movss xmm1 , [ eax ] vin . y
movss xmm2 , [ eax ] vin . z
// Expand op[0] to op[1], op[2], op[3]
shufps xmm0 , xmm0 , 0
shufps xmm1 , xmm1 , 0
shufps xmm2 , xmm2 , 0
// Mul each vector with 3 Matrix column
mulps xmm0 , [ ebx ] this . a11
mulps xmm1 , [ ebx ] this . a12
mulps xmm2 , [ ebx ] this . a13
// Add each column vector.
addps xmm0 , xmm1
addps xmm0 , xmm2
// Add Matrix translate column vector
addps xmm0 , [ ebx ] this . a14
// write the result.
movss [ edi ] vout . x , xmm0
shufps xmm0 , xmm0 , 33
movss [ edi ] vout . y , xmm0
movhlps xmm0 , xmm0
movss [ edi ] vout . z , xmm0
}
}
// mulSetvector. NB: vin should be different as v!! (else don't work).
void mulSetVector ( const CVector & vin , float scale , CVector & vout )
{
__asm
{
mov eax , vin
mov ebx , this
mov edi , vout
// Load in vector in op[0]
movss xmm0 , [ eax ] vin . x
movss xmm1 , [ eax ] vin . y
movss xmm2 , [ eax ] vin . z
// Load scale in op[0]
movss xmm3 , scale
// Expand op[0] to op[1], op[2], op[3]
shufps xmm0 , xmm0 , 0
shufps xmm1 , xmm1 , 0
shufps xmm2 , xmm2 , 0
shufps xmm3 , xmm3 , 0
// Store vertex column in other regs.
movaps xmm5 , xmm0
movaps xmm6 , xmm1
movaps xmm7 , xmm2
// Mul each vector with 3 Matrix column
mulps xmm0 , [ ebx ] this . a11
mulps xmm1 , [ ebx ] this . a12
mulps xmm2 , [ ebx ] this . a13
// Add each column vector.
addps xmm0 , xmm1
addps xmm0 , xmm2
// mul final result with scale
mulps xmm0 , xmm3
// store it in xmm4 for future use.
movaps xmm4 , xmm0
}
}
// mulSetpoint. NB: vin should be different as v!! (else don't work).
void mulSetPoint ( const CVector & vin , float scale , CVector & vout )
{
__asm
{
mov eax , vin
mov ebx , this
mov edi , vout
// Load in vector in op[0]
movss xmm0 , [ eax ] vin . x
movss xmm1 , [ eax ] vin . y
movss xmm2 , [ eax ] vin . z
// Load scale in op[0]
movss xmm3 , scale
// Expand op[0] to op[1], op[2], op[3]
shufps xmm0 , xmm0 , 0
shufps xmm1 , xmm1 , 0
shufps xmm2 , xmm2 , 0
shufps xmm3 , xmm3 , 0
// Store vertex column in other regs.
movaps xmm5 , xmm0
movaps xmm6 , xmm1
movaps xmm7 , xmm2
// Mul each vector with 3 Matrix column
mulps xmm0 , [ ebx ] this . a11
mulps xmm1 , [ ebx ] this . a12
mulps xmm2 , [ ebx ] this . a13
// Add each column vector.
addps xmm0 , xmm1
addps xmm0 , xmm2
// Add Matrix translate column vector
addps xmm0 , [ ebx ] this . a14
// mul final result with scale
mulps xmm0 , xmm3
// store it in xmm4 for future use.
movaps xmm4 , xmm0
}
}
// mulAddvector. NB: vin should be different as v!! (else don't work).
void mulAddVector ( const CVector & /* vin */ , float scale , CVector & vout )
{
__asm
{
mov ebx , this
mov edi , vout
// Load vin vector loaded in mulSetVector
movaps xmm0 , xmm5
movaps xmm1 , xmm6
movaps xmm2 , xmm7
// Load scale in op[0]
movss xmm3 , scale
// Expand op[0] to op[1], op[2], op[3]
shufps xmm3 , xmm3 , 0
// Mul each vector with 3 Matrix column
mulps xmm0 , [ ebx ] this . a11
mulps xmm1 , [ ebx ] this . a12
mulps xmm2 , [ ebx ] this . a13
// Add each column vector.
addps xmm0 , xmm1
addps xmm0 , xmm2
// mul final result with scale
mulps xmm0 , xmm3
// Add result, with prec sum.
addps xmm0 , xmm4
// store it in xmm4 for future use.
movaps xmm4 , xmm0
// write the result.
movss [ edi ] vout . x , xmm0
shufps xmm0 , xmm0 , 33
movss [ edi ] vout . y , xmm0
movhlps xmm0 , xmm0
movss [ edi ] vout . z , xmm0
}
}
// mulAddpoint. NB: vin should be different as v!! (else don't work).
void mulAddPoint ( const CVector & /* vin */ , float scale , CVector & vout )
{
__asm
{
mov ebx , this
mov edi , vout
// Load vin vector loaded in mulSetPoint
movaps xmm0 , xmm5
movaps xmm1 , xmm6
movaps xmm2 , xmm7
// Load scale in op[0]
movss xmm3 , scale
// Expand op[0] to op[1], op[2], op[3]
shufps xmm3 , xmm3 , 0
// Mul each vector with 3 Matrix column
mulps xmm0 , [ ebx ] this . a11
mulps xmm1 , [ ebx ] this . a12
mulps xmm2 , [ ebx ] this . a13
// Add each column vector.
addps xmm0 , xmm1
addps xmm0 , xmm2
// Add Matrix translate column vector
addps xmm0 , [ ebx ] this . a14
// mul final result with scale
mulps xmm0 , xmm3
// Add result, with prec sum.
addps xmm0 , xmm4
// store it in xmm4 for future use.
movaps xmm4 , xmm0
// write the result.
movss [ edi ] vout . x , xmm0
shufps xmm0 , xmm0 , 33
movss [ edi ] vout . y , xmm0
movhlps xmm0 , xmm0
movss [ edi ] vout . z , xmm0
}
}
} ;
# else // NL_OS_WINDOWS
/// dummy CMatrix3x4SSE for non windows platform
class CMatrix3x4SSE : public CMatrix3x4 { } ;
# endif
} // NL3D