1151 lines
36 KiB
C++
1151 lines
36 KiB
C++
/*
|
|
* Software License Agreement (BSD License)
|
|
*
|
|
* Copyright (c) 2011-2014, Willow Garage, Inc.
|
|
* Copyright (c) 2014-2016, Open Source Robotics Foundation
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* * Redistributions in binary form must reproduce the above
|
|
* copyright notice, this list of conditions and the following
|
|
* disclaimer in the documentation and/or other materials provided
|
|
* with the distribution.
|
|
* * Neither the name of Open Source Robotics Foundation nor the names of its
|
|
* contributors may be used to endorse or promote products derived
|
|
* from this software without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
|
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
|
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
|
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
|
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
|
|
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
*/
|
|
|
|
/** \author Jia Pan */
|
|
|
|
|
|
#ifndef FCL_MATH_SIMD_DETAILS_H
|
|
#define FCL_MATH_SIMD_DETAILS_H
|
|
|
|
#include "fcl/common/data_types.h"
|
|
|
|
#include <xmmintrin.h>
#include <emmintrin.h>
#if defined (__SSE3__)
#include <pmmintrin.h>
#endif
#if defined (__SSE4__)
#include <smmintrin.h>
#endif
|
|
|
|
|
|
namespace fcl
|
|
{
|
|
|
|
/** @brief FCL internals. Ignore this :) unless you are God */
|
|
namespace detail
|
|
{
|
|
|
|
/// All-zero single-precision vector (four lanes); used as the left operand of
/// "0 - x" style negations in this file.
const __m128 xmms_0 = {0.0f, 0.0f, 0.0f, 0.0f};

/// All-zero double-precision vector (two lanes).
const __m128d xmmd_0 = {0.0, 0.0};
|
|
|
|
/**
 * @brief Bitwise select between two vectors.
 *
 * Every result bit comes from @p b where the matching bit of @p mask is set
 * and from @p a where it is clear.
 */
static inline __m128 vec_sel(__m128 a, __m128 b, __m128 mask)
{
  const __m128 taken_from_b = _mm_and_ps(mask, b);
  const __m128 taken_from_a = _mm_andnot_ps(mask, a);
  return _mm_or_ps(taken_from_b, taken_from_a);
}
|
|
/**
 * @brief Bitwise select using a mask stored in memory.
 *
 * @param a     vector supplying the bits where the mask is clear
 * @param b     vector supplying the bits where the mask is set
 * @param mask  four 32-bit words; must be 16-byte aligned because an aligned
 *              load is used
 * @return the blended vector
 */
static inline __m128 vec_sel(__m128 a, __m128 b, const unsigned int* mask)
{
  // Load the raw bits through the integer domain: this keeps the const
  // qualifier and avoids reinterpreting unsigned int storage as float.
  const __m128 m = _mm_castsi128_ps(_mm_load_si128((const __m128i*)mask));
  return _mm_or_ps(_mm_and_ps(m, b), _mm_andnot_ps(m, a));
}
|
|
|
|
/**
 * @brief Bitwise select using one 32-bit mask replicated to all four lanes.
 *
 * @param a     vector supplying the bits where the mask is clear
 * @param b     vector supplying the bits where the mask is set
 * @param mask  bit pattern applied identically to every lane
 * @return the blended vector
 */
static inline __m128 vec_sel(__m128 a, __m128 b, unsigned int mask)
{
  // Broadcast the bits as an integer and reinterpret the register, instead of
  // dereferencing the unsigned int through a float pointer (aliasing
  // violation in the previous form).
  const __m128 m = _mm_castsi128_ps(_mm_set1_epi32((int)mask));
  return _mm_or_ps(_mm_and_ps(m, b), _mm_andnot_ps(m, a));
}
|
|
|
|
/* Broadcast lane e of the four-float vector a into every lane.
 * e must be a compile-time constant. */
#define vec_splat(a, e) \
  _mm_shuffle_ps((a), (a), _MM_SHUFFLE((e), (e), (e), (e)))

/* Broadcast lane e of the two-double vector a into both lanes. */
#define vec_splatd(a, e) \
  _mm_shuffle_pd((a), (a), _MM_SHUFFLE2((e), (e)))

/* Lane rotation: result lane i holds x[(i + e) % 4].
 * A rotation by a multiple of four yields x itself. */
#define _mm_ror_ps(x, e) \
  (((e) % 4) \
     ? _mm_shuffle_ps((x), (x), _MM_SHUFFLE(((e)+3)%4, ((e)+2)%4, ((e)+1)%4, (e)%4)) \
     : (x))

/* Lane rotation in the opposite direction: for e in 1..3, result lane i
 * holds x[(i + 4 - e) % 4].  A multiple of four yields x itself. */
#define _mm_rol_ps(x, e) \
  (((e) % 4) \
     ? _mm_shuffle_ps((x), (x), _MM_SHUFFLE((7-(e))%4, (6-(e))%4, (5-(e))%4, (4-(e))%4)) \
     : (x))
|
|
|
|
/**
 * @brief Reciprocal square root of four floats with one refinement step.
 *
 * Starts from the hardware estimate y = _mm_rsqrt_ps(v) and returns
 * 0.5 * y * (3 - v * y * y) for every lane.
 */
static inline __m128 newtonraphson_rsqrt4(const __m128 v)
{
  const __m128 half = _mm_set1_ps(.5f);
  const __m128 three = _mm_set1_ps(3.f);

  const __m128 estimate = _mm_rsqrt_ps(v);
  // (v * y) * y, in that order
  const __m128 v_est_sq = _mm_mul_ps(_mm_mul_ps(v, estimate), estimate);
  const __m128 half_est = _mm_mul_ps(half, estimate);
  return _mm_mul_ps(half_est, _mm_sub_ps(three, v_est_sq));
}
|
|
|
|
/**
 * @brief Four packed single-precision floats held in one 128-bit SSE register.
 *
 * The storage is reachable lane by lane through `vs` / operator[] and as a
 * whole register through `v`.  The (x, y, z[, w]) constructor and setValue()
 * default the fourth lane to 1.
 */
struct sse_meta_f4
{
  typedef float meta_type;

  /// Same 16 bytes viewed as scalars (vs) or as one register (v).
  union {float vs[4]; __m128 v; };

  /// All lanes zero.
  sse_meta_f4() : v(_mm_set1_ps(0)) {}
  /// Every lane set to @p x.
  sse_meta_f4(float x) : v(_mm_set1_ps(x)) {}
  /// Aligned load of four floats; @p px must be 16-byte aligned.
  sse_meta_f4(float* px) : v(_mm_load_ps(px)) {}
  /// Wrap an existing register.
  sse_meta_f4(__m128 x) : v(x) {}
  /// Lanes 0..3 = x, y, z, w.
  sse_meta_f4(float x, float y, float z, float w = 1) : v(_mm_setr_ps(x, y, z, w)) {}

  inline void setValue(float x, float y, float z, float w = 1) { v = _mm_setr_ps(x, y, z, w); }
  inline void setValue(float x) { v = _mm_set1_ps(x); }
  inline void setValue(__m128 x) { v = x; }

  /// In-place negation computed as 0 - v.
  inline void negate() { v = _mm_sub_ps(_mm_setzero_ps(), v); }

  /// Clamp from above: each lane becomes min(lane, u lane).
  inline sse_meta_f4& ubound(const sse_meta_f4& u)
  {
    v = _mm_min_ps(v, u.v);
    return *this;
  }

  /// Clamp from below: each lane becomes max(lane, l lane).
  inline sse_meta_f4& lbound(const sse_meta_f4& l)
  {
    v = _mm_max_ps(v, l.v);
    return *this;
  }

  /// Array allocation with 16-byte alignment; returns whatever _mm_malloc returns.
  inline void* operator new [] (size_t n) { return _mm_malloc(n, 16); }
  inline void operator delete [] (void* x) { if(x) _mm_free(x); }

  /// Lane access; @p i is not range-checked.
  inline float operator [] (size_t i) const { return vs[i]; }
  inline float& operator [] (size_t i) { return vs[i]; }

  // Lane-wise arithmetic with another vector.
  inline sse_meta_f4 operator + (const sse_meta_f4& other) const { return sse_meta_f4(_mm_add_ps(v, other.v)); }
  inline sse_meta_f4 operator - (const sse_meta_f4& other) const { return sse_meta_f4(_mm_sub_ps(v, other.v)); }
  inline sse_meta_f4 operator * (const sse_meta_f4& other) const { return sse_meta_f4(_mm_mul_ps(v, other.v)); }
  inline sse_meta_f4 operator / (const sse_meta_f4& other) const { return sse_meta_f4(_mm_div_ps(v, other.v)); }
  inline sse_meta_f4& operator += (const sse_meta_f4& other) { v = _mm_add_ps(v, other.v); return *this; }
  inline sse_meta_f4& operator -= (const sse_meta_f4& other) { v = _mm_sub_ps(v, other.v); return *this; }
  inline sse_meta_f4& operator *= (const sse_meta_f4& other) { v = _mm_mul_ps(v, other.v); return *this; }
  inline sse_meta_f4& operator /= (const sse_meta_f4& other) { v = _mm_div_ps(v, other.v); return *this; }

  // Arithmetic with a scalar applied to every lane.
  inline sse_meta_f4 operator + (float t) const { return sse_meta_f4(_mm_add_ps(v, _mm_set1_ps(t))); }
  inline sse_meta_f4 operator - (float t) const { return sse_meta_f4(_mm_sub_ps(v, _mm_set1_ps(t))); }
  inline sse_meta_f4 operator * (float t) const { return sse_meta_f4(_mm_mul_ps(v, _mm_set1_ps(t))); }
  inline sse_meta_f4 operator / (float t) const { return sse_meta_f4(_mm_div_ps(v, _mm_set1_ps(t))); }
  inline sse_meta_f4& operator += (float t) { v = _mm_add_ps(v, _mm_set1_ps(t)); return *this; }
  inline sse_meta_f4& operator -= (float t) { v = _mm_sub_ps(v, _mm_set1_ps(t)); return *this; }
  inline sse_meta_f4& operator *= (float t) { v = _mm_mul_ps(v, _mm_set1_ps(t)); return *this; }
  inline sse_meta_f4& operator /= (float t) { v = _mm_div_ps(v, _mm_set1_ps(t)); return *this; }

  /// Negated copy obtained by flipping the sign bit of every lane.
  inline sse_meta_f4 operator - () const
  {
    // unsigned storage: 0x80000000 does not fit in int, and brace
    // initialisation rejects that narrowing conversion in C++11.
    static const union { unsigned int i[4]; __m128 m; } negativemask __attribute__ ((aligned(16))) = {{0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u}};
    return sse_meta_f4(_mm_xor_ps(negativemask.m, v));
  }
} __attribute__ ((aligned (16)));
|
|
|
|
/**
 * @brief Four packed doubles held in two 128-bit SSE2 registers.
 *
 * v[0] carries elements 0 and 1, v[1] carries elements 2 and 3; `vs` and
 * operator[] expose the same storage element by element.  The (x, y, z[, w])
 * constructor and setValue() default the fourth element to 0.
 */
struct sse_meta_d4
{
  typedef double meta_type;

  /// Same 32 bytes viewed as scalars (vs) or as two registers (v).
  union {double vs[4]; __m128d v[2]; };

  /// All elements zero.
  sse_meta_d4()
  {
    setValue(0.0);
  }

  /// Every element set to @p x.
  sse_meta_d4(double x)
  {
    setValue(x);
  }

  /// Reads px[0..2] (aligned load of the first two, so @p px must be
  /// 16-byte aligned); the fourth element is set to 0.
  sse_meta_d4(double* px)
  {
    v[0] = _mm_load_pd(px);
    v[1] = _mm_set_pd(0, *(px + 2));
  }

  /// Wrap two existing registers (low pair, high pair).
  sse_meta_d4(__m128d x, __m128d y)
  {
    v[0] = x;
    v[1] = y;
  }

  sse_meta_d4(double x, double y, double z, double w = 0)
  {
    setValue(x, y, z, w);
  }

  inline void setValue(double x, double y, double z, double w = 0)
  {
    v[0] = _mm_setr_pd(x, y);
    v[1] = _mm_setr_pd(z, w);
  }

  inline void setValue(double x)
  {
    v[0] = _mm_set1_pd(x);
    v[1] = v[0];
  }

  inline void setValue(__m128d x, __m128d y)
  {
    v[0] = x;
    v[1] = y;
  }

  /// In-place negation computed as 0 - v.
  inline void negate()
  {
    const __m128d zero = _mm_setzero_pd();
    v[0] = _mm_sub_pd(zero, v[0]);
    v[1] = _mm_sub_pd(zero, v[1]);
  }

  /// Clamp from above: each element becomes min(element, u element).
  inline sse_meta_d4& ubound(const sse_meta_d4& u)
  {
    v[0] = _mm_min_pd(v[0], u.v[0]);
    v[1] = _mm_min_pd(v[1], u.v[1]);
    return *this;
  }

  /// Clamp from below: each element becomes max(element, l element).
  inline sse_meta_d4& lbound(const sse_meta_d4& l)
  {
    v[0] = _mm_max_pd(v[0], l.v[0]);
    v[1] = _mm_max_pd(v[1], l.v[1]);
    return *this;
  }

  /// Array allocation with 16-byte alignment; returns whatever _mm_malloc returns.
  inline void* operator new [] (size_t n)
  {
    return _mm_malloc(n, 16);
  }

  inline void operator delete [] (void* x)
  {
    if(x) _mm_free(x);
  }

  /// Element access; @p i is not range-checked.
  inline double operator [] (size_t i) const { return vs[i]; }
  inline double& operator [] (size_t i) { return vs[i]; }

  // Element-wise arithmetic with another vector.
  inline sse_meta_d4 operator + (const sse_meta_d4& other) const { return sse_meta_d4(_mm_add_pd(v[0], other.v[0]), _mm_add_pd(v[1], other.v[1])); }
  inline sse_meta_d4 operator - (const sse_meta_d4& other) const { return sse_meta_d4(_mm_sub_pd(v[0], other.v[0]), _mm_sub_pd(v[1], other.v[1])); }
  inline sse_meta_d4 operator * (const sse_meta_d4& other) const { return sse_meta_d4(_mm_mul_pd(v[0], other.v[0]), _mm_mul_pd(v[1], other.v[1])); }
  inline sse_meta_d4 operator / (const sse_meta_d4& other) const { return sse_meta_d4(_mm_div_pd(v[0], other.v[0]), _mm_div_pd(v[1], other.v[1])); }
  inline sse_meta_d4& operator += (const sse_meta_d4& other) { v[0] = _mm_add_pd(v[0], other.v[0]); v[1] = _mm_add_pd(v[1], other.v[1]); return *this; }
  inline sse_meta_d4& operator -= (const sse_meta_d4& other) { v[0] = _mm_sub_pd(v[0], other.v[0]); v[1] = _mm_sub_pd(v[1], other.v[1]); return *this; }
  inline sse_meta_d4& operator *= (const sse_meta_d4& other) { v[0] = _mm_mul_pd(v[0], other.v[0]); v[1] = _mm_mul_pd(v[1], other.v[1]); return *this; }
  inline sse_meta_d4& operator /= (const sse_meta_d4& other) { v[0] = _mm_div_pd(v[0], other.v[0]); v[1] = _mm_div_pd(v[1], other.v[1]); return *this; }

  // Arithmetic with a scalar applied to every element.  The `register`
  // storage class was dropped from these locals: it is no longer valid C++17.
  inline sse_meta_d4 operator + (double t) const { const __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_add_pd(v[0], d), _mm_add_pd(v[1], d)); }
  inline sse_meta_d4 operator - (double t) const { const __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_sub_pd(v[0], d), _mm_sub_pd(v[1], d)); }
  inline sse_meta_d4 operator * (double t) const { const __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_mul_pd(v[0], d), _mm_mul_pd(v[1], d)); }
  inline sse_meta_d4 operator / (double t) const { const __m128d d = _mm_set1_pd(t); return sse_meta_d4(_mm_div_pd(v[0], d), _mm_div_pd(v[1], d)); }
  inline sse_meta_d4& operator += (double t) { const __m128d d = _mm_set1_pd(t); v[0] = _mm_add_pd(v[0], d); v[1] = _mm_add_pd(v[1], d); return *this; }
  inline sse_meta_d4& operator -= (double t) { const __m128d d = _mm_set1_pd(t); v[0] = _mm_sub_pd(v[0], d); v[1] = _mm_sub_pd(v[1], d); return *this; }
  inline sse_meta_d4& operator *= (double t) { const __m128d d = _mm_set1_pd(t); v[0] = _mm_mul_pd(v[0], d); v[1] = _mm_mul_pd(v[1], d); return *this; }
  inline sse_meta_d4& operator /= (double t) { const __m128d d = _mm_set1_pd(t); v[0] = _mm_div_pd(v[0], d); v[1] = _mm_div_pd(v[1], d); return *this; }

  /// Negated copy obtained by flipping the sign bit of every element.
  inline sse_meta_d4 operator - () const
  {
    // unsigned storage: 0x8000000000000000 does not fit in a signed 64-bit
    // integer, and brace initialisation rejects that narrowing in C++11.
    static const union { unsigned long long i[2]; __m128d m; } negativemask __attribute__ ((aligned(16))) = {{0x8000000000000000ULL, 0x8000000000000000ULL}};
    return sse_meta_d4(_mm_xor_pd(v[0], negativemask.m), _mm_xor_pd(v[1], negativemask.m));
  }
} __attribute__ ((aligned (16)));
|
|
|
|
|
|
|
|
/**
 * @brief Cross product of the first three lanes of @p x and @p y.
 *
 * Lane 3 of the result is x[3]*y[3] - x[3]*y[3].
 */
static inline __m128 cross_prod(__m128 x, __m128 y)
{
  // Reordered copies: "yzx" holds lanes (1, 2, 0, 3), "zxy" holds (2, 0, 1, 3).
  const __m128 x_yzx = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 0, 2, 1));
  const __m128 x_zxy = _mm_shuffle_ps(x, x, _MM_SHUFFLE(3, 1, 0, 2));
  const __m128 y_yzx = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3, 0, 2, 1));
  const __m128 y_zxy = _mm_shuffle_ps(y, y, _MM_SHUFFLE(3, 1, 0, 2));

  const __m128 positive_terms = _mm_mul_ps(x_yzx, y_zxy);
  const __m128 negative_terms = _mm_mul_ps(x_zxy, y_yzx);
  return _mm_sub_ps(positive_terms, negative_terms);
}
|
|
|
|
/** @brief Cross product of two sse_meta_f4 values; forwards to the register overload. */
static inline sse_meta_f4 cross_prod(const sse_meta_f4& x, const sse_meta_f4& y)
{
  const __m128 product = cross_prod(x.v, y.v);
  return sse_meta_f4(product);
}
|
|
|
|
/**
 * @brief Double-precision cross product on vectors split over two registers.
 *
 * With x = (x0[0], x0[1], x1[0]) and y likewise, writes components 0 and 1 of
 * x cross y to @p z0 and component 2 to lane 0 of @p z1.  Lane 1 of @p z1 is
 * x1[1]*y1[1] - x1[1]*y1[1].
 */
static inline void cross_prod(__m128d x0, __m128d x1, __m128d y0, __m128d y1, __m128d* z0, __m128d* z1)
{
  // Components 0 and 1: (x.y*y.z - x.z*y.y, x.z*y.x - x.x*y.z)
  const __m128d x_yz = _mm_shuffle_pd(x0, x1, _MM_SHUFFLE2(0, 1));
  const __m128d y_zx = _mm_shuffle_pd(y1, y0, _MM_SHUFFLE2(0, 0));
  const __m128d x_zx = _mm_shuffle_pd(x1, x0, _MM_SHUFFLE2(0, 0));
  const __m128d y_yz = _mm_shuffle_pd(y0, y1, _MM_SHUFFLE2(0, 1));

  // Component 2 (lane 0): x.x*y.y - x.y*y.x; lane 1 carries the fourth elements.
  const __m128d x_xw = _mm_shuffle_pd(x0, x1, _MM_SHUFFLE2(1, 0));
  const __m128d y_yw = _mm_shuffle_pd(y0, y1, _MM_SHUFFLE2(1, 1));
  const __m128d x_yw = _mm_shuffle_pd(x0, x1, _MM_SHUFFLE2(1, 1));
  const __m128d y_xw = _mm_shuffle_pd(y0, y1, _MM_SHUFFLE2(1, 0));

  *z0 = _mm_sub_pd(_mm_mul_pd(x_yz, y_zx), _mm_mul_pd(x_zx, y_yz));
  *z1 = _mm_sub_pd(_mm_mul_pd(x_xw, y_yw), _mm_mul_pd(x_yw, y_xw));
}
|
|
|
|
/** @brief Cross product of two sse_meta_d4 values; forwards to the register overload. */
static inline sse_meta_d4 cross_prod(const sse_meta_d4& x, const sse_meta_d4& y)
{
  __m128d low_pair, high_pair;
  cross_prod(x.v[0], x.v[1], y.v[0], y.v[1], &low_pair, &high_pair);
  return sse_meta_d4(low_pair, high_pair);
}
|
|
|
|
|
|
/**
 * @brief Dot product of the first three lanes of @p x and @p y.
 *
 * @return a vector whose four lanes all hold x0*y0 + (x1*y1 + x2*y2);
 *         lane 3 of the inputs is ignored.
 */
static inline __m128 dot_prod3(__m128 x, __m128 y)
{
  // Plain local: the `register` storage class is no longer valid C++17.
  const __m128 m = _mm_mul_ps(x, y);
  const __m128 m0 = _mm_shuffle_ps(m, m, _MM_SHUFFLE(0, 0, 0, 0));
  const __m128 m1 = _mm_shuffle_ps(m, m, _MM_SHUFFLE(1, 1, 1, 1));
  const __m128 m2 = _mm_shuffle_ps(m, m, _MM_SHUFFLE(2, 2, 2, 2));
  return _mm_add_ps(m0, _mm_add_ps(m1, m2));
}
|
|
|
|
static inline float dot_prod3(const sse_meta_f4& x, const sse_meta_f4& y)
|
|
{
|
|
return _mm_cvtss_f32(dot_prod3(x.v, y.v));
|
|
}
|
|
|
|
|
|
/**
 * @brief Three-component dot product of double vectors split over two registers.
 *
 * Uses both lanes of (x0, y0) and lane 0 of (x1, y1); lane 1 of x1/y1 is
 * ignored.
 *
 * @return a vector with the sum in both lanes
 */
static inline __m128d dot_prod3(__m128d x0, __m128d x1, __m128d y0, __m128d y1)
{
  // Plain locals: the `register` storage class is no longer valid C++17.
  const __m128d m_lo = _mm_mul_pd(x0, y0);
  const __m128d m_hi = _mm_mul_pd(x1, y1);
  const __m128d term0 = _mm_shuffle_pd(m_lo, m_lo, _MM_SHUFFLE2(0, 0));
  const __m128d term1 = _mm_shuffle_pd(m_lo, m_lo, _MM_SHUFFLE2(1, 1));
  const __m128d term2 = _mm_shuffle_pd(m_hi, m_hi, _MM_SHUFFLE2(0, 0));
  return _mm_add_pd(_mm_add_pd(term0, term1), term2);
}
|
|
|
|
static inline double dot_prod3(const sse_meta_d4& x, const sse_meta_d4& y)
|
|
{
|
|
double d;
|
|
_mm_storel_pd(&d, dot_prod3(x.v[0], x.v[1], y.v[0], y.v[1]));
|
|
return d;
|
|
}
|
|
|
|
/**
 * @brief Four-component dot product of @p x and @p y.
 *
 * @return a vector whose lane 0 holds x0*y0 + x1*y1 + x2*y2 + x3*y3.  The
 *         content of the other lanes differs between the code paths below
 *         and must not be relied upon.
 *
 * NOTE(review): GCC and Clang predefine __SSE4_1__ / __SSE4_2__ rather than
 * __SSE4__, so the first branch is presumably only taken when the build
 * defines __SSE4__ itself -- confirm against the build flags.
 */
static inline __m128 dot_prod4(__m128 x, __m128 y)
{
#if defined (__SSE4__)
  // 0xF1: multiply all four lanes (high nibble) and store the sum in lane 0
  // (low nibble).  The previous 0x71 left lane 3 out of the sum.
  return _mm_dp_ps(x, y, 0xF1);
#elif defined (__SSE3__)
  __m128 t = _mm_mul_ps(x, y);
  t = _mm_hadd_ps(t, t);
  return _mm_hadd_ps(t, t);
#else
  const __m128 s = _mm_mul_ps(x, y);
  // Packed add so lane 1 becomes s1 + s3 as well; a scalar add here would
  // leave s3 out of the final sum.
  const __m128 r = _mm_add_ps(s, _mm_movehl_ps(s, s));
  // lane 0: (s0 + s2) + (s1 + s3)
  return _mm_add_ss(r, _mm_shuffle_ps(r, r, 1));
#endif
}
|
|
|
|
|
|
static inline float dot_prod4(const sse_meta_f4& x, const sse_meta_f4& y)
|
|
{
|
|
return _mm_cvtss_f32(dot_prod4(x.v, y.v));
|
|
}
|
|
|
|
/**
 * @brief Four-component dot product of double vectors split over two registers.
 *
 * @return a vector whose lane 0 holds x0[0]*y0[0] + x0[1]*y0[1] +
 *         x1[0]*y1[0] + x1[1]*y1[1].  Lane 1 differs between the code paths
 *         and must not be relied upon.
 *
 * NOTE(review): GCC and Clang predefine __SSE4_1__ / __SSE4_2__ rather than
 * __SSE4__, so the first branch is presumably only taken when the build
 * defines __SSE4__ itself -- confirm against the build flags.
 */
static inline __m128d dot_prod4(__m128d x0, __m128d x1, __m128d y0, __m128d y1)
{
  // Locals are plain variables: `register` is no longer valid C++17.
#if defined (__SSE4__)
  // 0x31 on both halves: multiply both lanes, store the sum in lane 0.  The
  // previous 0x11 on the upper half dropped the fourth element.
  const __m128d t1 = _mm_dp_pd(x0, y0, 0x31);
  const __m128d t2 = _mm_dp_pd(x1, y1, 0x31);
  return _mm_add_pd(t1, t2);
#elif defined (__SSE3__)
  __m128d t1 = _mm_mul_pd(x0, y0);
  __m128d t2 = _mm_mul_pd(x1, y1);
  t1 = _mm_hadd_pd(t1, t1);
  t2 = _mm_hadd_pd(t2, t2);
  return _mm_add_pd(t1, t2);
#else
  __m128d t1 = _mm_mul_pd(x0, y0);
  const __m128d t2 = _mm_mul_pd(x1, y1);
  t1 = _mm_add_pd(t1, t2);
  // fold lane 1 onto lane 0
  return _mm_add_pd(t1, _mm_shuffle_pd(t1, t1, 1));
#endif
}
|
|
|
|
static inline double dot_prod4(const sse_meta_d4& x, const sse_meta_d4& y)
|
|
{
|
|
double d;
|
|
_mm_storel_pd(&d, dot_prod4(x.v[0], x.v[1], y.v[0], y.v[1]));
|
|
return d;
|
|
}
|
|
|
|
static inline sse_meta_f4 min(const sse_meta_f4& x, const sse_meta_f4& y)
|
|
{
|
|
return sse_meta_f4(_mm_min_ps(x.v, y.v));
|
|
}
|
|
|
|
static inline sse_meta_d4 min(const sse_meta_d4& x, const sse_meta_d4& y)
|
|
{
|
|
return sse_meta_d4(_mm_min_pd(x.v[0], y.v[0]), _mm_min_pd(x.v[1], y.v[1]));
|
|
}
|
|
|
|
static inline sse_meta_f4 max(const sse_meta_f4& x, const sse_meta_f4& y)
|
|
{
|
|
return sse_meta_f4(_mm_max_ps(x.v, y.v));
|
|
}
|
|
|
|
static inline sse_meta_d4 max(const sse_meta_d4& x, const sse_meta_d4& y)
|
|
{
|
|
return sse_meta_d4(_mm_max_pd(x.v[0], y.v[0]), _mm_max_pd(x.v[1], y.v[1]));
|
|
}
|
|
|
|
static inline sse_meta_f4 abs(const sse_meta_f4& x)
|
|
{
|
|
static const union { int i[4]; __m128 m; } abs4mask __attribute__ ((aligned (16))) = {{0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}};
|
|
return sse_meta_f4(_mm_and_ps(x.v, abs4mask.m));
|
|
}
|
|
|
|
static inline sse_meta_d4 abs(const sse_meta_d4& x)
|
|
{
|
|
static const union { FCL_INT64 i[2]; __m128d m; } abs2mask __attribute__ ((aligned (16))) = {{0x7fffffffffffffff, 0x7fffffffffffffff}};
|
|
return sse_meta_d4(_mm_and_pd(x.v[0], abs2mask.m), _mm_and_pd(x.v[1], abs2mask.m));
|
|
}
|
|
|
|
static inline bool equal(const sse_meta_f4& x, const sse_meta_f4& y, float epsilon)
|
|
{
|
|
register __m128 d = _mm_sub_ps(x.v, y.v);
|
|
register __m128 e = _mm_set1_ps(epsilon);
|
|
return ((_mm_movemask_ps(_mm_cmplt_ps(d, e)) & 0x7) == 0x7) && ((_mm_movemask_ps(_mm_cmpgt_ps(d, _mm_sub_ps(xmms_0, e))) & 0x7) == 0x7);
|
|
}
|
|
|
|
static inline bool equal(const sse_meta_d4& x, const sse_meta_d4& y, double epsilon)
|
|
{
|
|
register __m128d d = _mm_sub_pd(x.v[0], y.v[0]);
|
|
register __m128d e = _mm_set1_pd(epsilon);
|
|
|
|
if(_mm_movemask_pd(_mm_cmplt_pd(d, e)) != 0x3) return false;
|
|
if(_mm_movemask_pd(_mm_cmpgt_pd(d, _mm_sub_pd(xmmd_0, e))) != 0x3) return false;
|
|
|
|
d = _mm_sub_pd(x.v[1], y.v[1]);
|
|
if((_mm_movemask_pd(_mm_cmplt_pd(d, e)) & 0x1) != 0x1) return false;
|
|
if((_mm_movemask_pd(_mm_cmpgt_pd(d, _mm_sub_pd(xmmd_0, e))) & 0x1) != 0x1) return false;
|
|
return true;
|
|
}
|
|
|
|
/**
 * @brief Scale @p x by the reciprocal length of its first three lanes.
 *
 * The squared length x0^2 + (x1^2 + x2^2) is broadcast to all lanes and its
 * reciprocal square root comes from newtonraphson_rsqrt4(); all four lanes of
 * @p x, including lane 3, are multiplied by it.
 */
static inline sse_meta_f4 normalize3(const sse_meta_f4& x)
{
  // Plain local: the `register` storage class is no longer valid C++17.
  const __m128 m = _mm_mul_ps(x.v, x.v);
  const __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
  return sse_meta_f4(_mm_mul_ps(x.v, newtonraphson_rsqrt4(r)));
}
|
|
|
|
/**
 * @brief Lower-precision variant of normalize3().
 *
 * Same computation, but the reciprocal square root is the raw _mm_rsqrt_ps
 * hardware estimate without a refinement step.
 */
static inline sse_meta_f4 normalize3_approx(const sse_meta_f4& x)
{
  // Plain local: the `register` storage class is no longer valid C++17.
  const __m128 m = _mm_mul_ps(x.v, x.v);
  const __m128 r = _mm_add_ps(vec_splat(m, 0), _mm_add_ps(vec_splat(m, 1), vec_splat(m, 2)));
  return sse_meta_f4(_mm_mul_ps(x.v, _mm_rsqrt_ps(r)));
}
|
|
|
|
|
|
static inline void transpose(__m128 c0, __m128 c1, __m128 c2, __m128* r0, __m128* r1, __m128* r2)
|
|
{
|
|
static const union { unsigned int i[4]; __m128 m; } selectmask __attribute__ ((aligned(16))) = {{0, 0xffffffff, 0, 0}};
|
|
register __m128 t0, t1;
|
|
t0 = _mm_unpacklo_ps(c0, c2);
|
|
t1 = _mm_unpackhi_ps(c0, c2);
|
|
*r0 = _mm_unpacklo_ps(t0, c1);
|
|
*r1 = _mm_shuffle_ps(t0, t0, _MM_SHUFFLE(0, 3, 2, 2));
|
|
*r1 = vec_sel(*r1, c1, selectmask.i);
|
|
*r2 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 1, 1, 0));
|
|
*r2 = vec_sel(*r2, vec_splat(c1, 2), selectmask.i);
|
|
}
|
|
|
|
|
|
static inline void inverse(__m128 c0, __m128 c1, __m128 c2, __m128* i0, __m128* i1, __m128* i2)
|
|
{
|
|
__m128 t0, t1, t2, d, invd;
|
|
t2 = cross_prod(c0, c1);
|
|
t0 = cross_prod(c1, c2);
|
|
t1 = cross_prod(c2, c0);
|
|
d = dot_prod3(t2, c2);
|
|
d = vec_splat(d, 0);
|
|
invd = _mm_rcp_ps(d); // approximate inverse
|
|
transpose(t0, t1, t2, i0, i1, i2);
|
|
*i0 = _mm_mul_ps(*i0, invd);
|
|
*i1 = _mm_mul_ps(*i1, invd);
|
|
*i2 = _mm_mul_ps(*i2, invd);
|
|
}
|
|
|
|
|
|
struct sse_meta_f12
|
|
{
|
|
typedef float meta_type;
|
|
typedef sse_meta_f4 vector_type;
|
|
sse_meta_f4 c[3];
|
|
|
|
sse_meta_f12() { setZero(); }
|
|
|
|
sse_meta_f12(float xx, float xy, float xz,
|
|
float yx, float yy, float yz,
|
|
float zx, float zy, float zz)
|
|
{ setValue(xx, xy, xz, yx, yy, yz, zx, zy, zz); }
|
|
|
|
sse_meta_f12(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
|
|
{ setColumn(x, y, z); }
|
|
|
|
sse_meta_f12(__m128 x, __m128 y, __m128 z)
|
|
{ setColumn(x, y, z); }
|
|
|
|
inline void setValue(float xx, float xy, float xz,
|
|
float yx, float yy, float yz,
|
|
float zx, float zy, float zz)
|
|
{
|
|
c[0].setValue(xx, yx, zx, 0);
|
|
c[1].setValue(xy, yy, zy, 0);
|
|
c[2].setValue(xz, yz, zz, 0);
|
|
}
|
|
|
|
inline void setIdentity()
|
|
{
|
|
c[0].setValue(1, 0, 0, 0);
|
|
c[1].setValue(0, 1, 0, 0);
|
|
c[2].setValue(0, 0, 1, 0);
|
|
}
|
|
|
|
inline void setZero()
|
|
{
|
|
c[0].setValue(0);
|
|
c[1].setValue(0);
|
|
c[2].setValue(0);
|
|
}
|
|
|
|
inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z)
|
|
{
|
|
c[0] = x; c[1] = y; c[2] = z;
|
|
}
|
|
|
|
inline void setColumn(__m128 x, __m128 y, __m128 z)
|
|
{
|
|
c[0].setValue(x); c[1].setValue(y); c[2].setValue(z);
|
|
}
|
|
|
|
inline const sse_meta_f4& getColumn(size_t i) const
|
|
{
|
|
return c[i];
|
|
}
|
|
|
|
inline sse_meta_f4& getColumn(size_t i)
|
|
{
|
|
return c[i];
|
|
}
|
|
|
|
inline sse_meta_f4 getRow(size_t i) const
|
|
{
|
|
return sse_meta_f4(c[0][i], c[1][i], c[2][i], 0);
|
|
}
|
|
|
|
inline float operator () (size_t i, size_t j) const
|
|
{
|
|
return c[j][i];
|
|
}
|
|
|
|
inline float& operator () (size_t i, size_t j)
|
|
{
|
|
return c[j][i];
|
|
}
|
|
|
|
inline sse_meta_f4 operator * (const sse_meta_f4& v) const
|
|
{
|
|
return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))), _mm_mul_ps(c[2].v, vec_splat(v.v, 2))));
|
|
}
|
|
|
|
inline sse_meta_f12 operator * (const sse_meta_f12& mat) const
|
|
{
|
|
return sse_meta_f12((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2]);
|
|
}
|
|
|
|
inline sse_meta_f12 operator + (const sse_meta_f12& mat) const
|
|
{
|
|
return sse_meta_f12(c[0] + mat.c[0], c[1] + mat.c[1], c[2] + mat.c[2]);
|
|
}
|
|
|
|
inline sse_meta_f12 operator - (const sse_meta_f12& mat) const
|
|
{
|
|
return sse_meta_f12(c[0] - mat.c[0], c[1] - mat.c[1], c[2] - mat.c[2]);
|
|
}
|
|
|
|
inline sse_meta_f12 operator + (float t_) const
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
return sse_meta_f12(c[0] + t, c[1] + t, c[2] + t);
|
|
}
|
|
|
|
inline sse_meta_f12 operator - (float t_) const
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
return sse_meta_f12(c[0] - t, c[1] - t, c[2] - t);
|
|
}
|
|
|
|
inline sse_meta_f12 operator * (float t_) const
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
return sse_meta_f12(c[0] * t, c[1] * t, c[2] * t);
|
|
}
|
|
|
|
inline sse_meta_f12 operator / (float t_) const
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
return sse_meta_f12(c[0] / t, c[1] / t, c[2] / t);
|
|
}
|
|
|
|
inline sse_meta_f12& operator *= (const sse_meta_f12& mat)
|
|
{
|
|
setColumn((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2]);
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& operator += (const sse_meta_f12& mat)
|
|
{
|
|
c[0] += mat.c[0];
|
|
c[1] += mat.c[1];
|
|
c[2] += mat.c[2];
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& operator -= (const sse_meta_f12& mat)
|
|
{
|
|
c[0] -= mat.c[0];
|
|
c[1] -= mat.c[1];
|
|
c[2] -= mat.c[2];
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& operator += (float t_)
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
c[0] += t;
|
|
c[1] += t;
|
|
c[2] += t;
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& operator -= (float t_)
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
c[0] -= t;
|
|
c[1] -= t;
|
|
c[2] -= t;
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& operator *= (float t_)
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
c[0] *= t;
|
|
c[1] *= t;
|
|
c[2] *= t;
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& operator /= (float t_)
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
c[0] /= t;
|
|
c[1] /= t;
|
|
c[2] /= t;
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& inverse()
|
|
{
|
|
__m128 inv0, inv1, inv2;
|
|
detail::inverse(c[0].v, c[1].v, c[2].v, &inv0, &inv1, &inv2);
|
|
setColumn(inv0, inv1, inv2);
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& transpose()
|
|
{
|
|
__m128 r0, r1, r2;
|
|
detail::transpose(c[0].v, c[1].v, c[2].v, &r0, &r1, &r2);
|
|
setColumn(r0, r1, r2);
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f12& abs()
|
|
{
|
|
c[0] = detail::abs(c[0]);
|
|
c[1] = detail::abs(c[1]);
|
|
c[2] = detail::abs(c[2]);
|
|
return *this;
|
|
}
|
|
|
|
inline float determinant() const
|
|
{
|
|
return _mm_cvtss_f32(dot_prod3(c[2].v, cross_prod(c[0].v, c[1].v)));
|
|
}
|
|
|
|
inline sse_meta_f12 transposeTimes(const sse_meta_f12& other) const
|
|
{
|
|
return sse_meta_f12(dot_prod3(c[0], other.c[0]), dot_prod3(c[0], other.c[1]), dot_prod3(c[0], other.c[2]),
|
|
dot_prod3(c[1], other.c[0]), dot_prod3(c[1], other.c[1]), dot_prod3(c[1], other.c[2]),
|
|
dot_prod3(c[2], other.c[0]), dot_prod3(c[2], other.c[1]), dot_prod3(c[2], other.c[2]));
|
|
}
|
|
|
|
inline sse_meta_f12 timesTranspose(const sse_meta_f12& m) const
|
|
{
|
|
sse_meta_f12 tmp(m);
|
|
return (*this) * tmp.transpose();
|
|
}
|
|
|
|
inline sse_meta_f4 transposeTimes(const sse_meta_f4& v) const
|
|
{
|
|
return sse_meta_f4(dot_prod3(c[0], v), dot_prod3(c[1], v), dot_prod3(c[2], v));
|
|
}
|
|
|
|
inline float transposeDot(size_t i, const sse_meta_f4& v) const
|
|
{
|
|
return dot_prod3(c[i], v);
|
|
}
|
|
|
|
inline float dot(size_t i, const sse_meta_f4& v) const
|
|
{
|
|
return v[0] * c[0][i] + v[1] * c[1][i] + v[2] * c[2][i];
|
|
}
|
|
|
|
};
|
|
|
|
/** @brief Copy of @p mat with abs() applied to each of its three columns. */
static inline sse_meta_f12 abs(const sse_meta_f12& mat)
{
  const sse_meta_f4 col0 = abs(mat.getColumn(0));
  const sse_meta_f4 col1 = abs(mat.getColumn(1));
  const sse_meta_f4 col2 = abs(mat.getColumn(2));
  return sse_meta_f12(col0, col1, col2);
}
|
|
|
|
/** @brief Transposed copy of @p mat, computed by the register-level transpose(). */
static inline sse_meta_f12 transpose(const sse_meta_f12& mat)
{
  const __m128 in0 = mat.getColumn(0).v;
  const __m128 in1 = mat.getColumn(1).v;
  const __m128 in2 = mat.getColumn(2).v;

  __m128 out0, out1, out2;
  transpose(in0, in1, in2, &out0, &out1, &out2);
  return sse_meta_f12(out0, out1, out2);
}
|
|
|
|
|
|
/** @brief Inverted copy of @p mat, computed by the register-level inverse(). */
static inline sse_meta_f12 inverse(const sse_meta_f12& mat)
{
  const __m128 in0 = mat.getColumn(0).v;
  const __m128 in1 = mat.getColumn(1).v;
  const __m128 in2 = mat.getColumn(2).v;

  __m128 out0, out1, out2;
  inverse(in0, in1, in2, &out0, &out1, &out2);
  return sse_meta_f12(out0, out1, out2);
}
|
|
|
|
|
|
/**
 * @brief Transpose a 4x4 matrix given as four vectors.
 *
 * Output rK collects lane K of c0, c1, c2, c3 (in that order).
 */
static inline void transpose(__m128 c0, __m128 c1, __m128 c2, __m128 c3,
                             __m128* r0, __m128* r1, __m128* r2, __m128* r3)
{
  // The arguments are by-value copies, so they can be transposed in place.
  _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
  *r0 = c0;
  *r1 = c1;
  *r2 = c2;
  *r3 = c3;
}
|
|
|
|
|
|
static inline void inverse(__m128 c0, __m128 c1, __m128 c2, __m128 c3,
|
|
__m128* res0, __m128* res1, __m128* res2, __m128* res3)
|
|
{
|
|
__m128 Va, Vb, Vc;
|
|
__m128 r1, r2, r3, tt, tt2;
|
|
__m128 sum, Det, RDet;
|
|
__m128 trns0, trns1, trns2, trns3;
|
|
|
|
// Calculating the minterms for the first line.
|
|
|
|
tt = c3; tt2 = _mm_ror_ps(c2,1);
|
|
Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0)); // V3'\B7V4
|
|
Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2)); // V3'\B7V4"
|
|
Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3)); // V3'\B7V4^
|
|
|
|
r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2)); // V3"\B7V4^ - V3^\B7V4"
|
|
r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0)); // V3^\B7V4' - V3'\B7V4^
|
|
r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1)); // V3'\B7V4" - V3"\B7V4'
|
|
|
|
tt = c1;
|
|
Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1);
|
|
Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
|
|
Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
|
|
|
|
// Calculating the determinant.
|
|
Det = _mm_mul_ps(sum,c0);
|
|
Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
|
|
|
|
static const union { int i[4]; __m128 m; } Sign_PNPN __attribute__ ((aligned(16))) = {{0x00000000, 0x80000000, 0x00000000, 0x80000000}};
|
|
static const union { int i[4]; __m128 m; } Sign_NPNP __attribute__ ((aligned(16))) = {{0x80000000, 0x00000000, 0x80000000, 0x00000000}};
|
|
static const union { float i[4]; __m128 m; } ZERONE __attribute__ ((aligned(16))) = {{1.0f, 0.0f, 0.0f, 1.0f}};
|
|
|
|
__m128 mtL1 = _mm_xor_ps(sum,Sign_PNPN.m);
|
|
|
|
// Calculating the minterms of the second line (using previous results).
|
|
tt = _mm_ror_ps(c0,1); sum = _mm_mul_ps(tt,r1);
|
|
tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
|
|
tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
|
|
__m128 mtL2 = _mm_xor_ps(sum,Sign_NPNP.m);
|
|
|
|
// Testing the determinant.
|
|
Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
|
|
|
|
// Calculating the minterms of the third line.
|
|
tt = _mm_ror_ps(c0,1);
|
|
Va = _mm_mul_ps(tt,Vb); // V1'\B7V2"
|
|
Vb = _mm_mul_ps(tt,Vc); // V1'\B7V2^
|
|
Vc = _mm_mul_ps(tt,c1); // V1'\B7V2
|
|
|
|
r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2)); // V1"\B7V2^ - V1^\B7V2"
|
|
r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0)); // V1^\B7V2' - V1'\B7V2^
|
|
r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1)); // V1'\B7V2" - V1"\B7V2'
|
|
|
|
tt = _mm_ror_ps(c3,1); sum = _mm_mul_ps(tt,r1);
|
|
tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
|
|
tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
|
|
__m128 mtL3 = _mm_xor_ps(sum,Sign_PNPN.m);
|
|
|
|
// Dividing is FASTER than rcp_nr! (Because rcp_nr causes many register-memory RWs).
|
|
RDet = _mm_div_ss(ZERONE.m, Det); // TODO: just 1.0f?
|
|
RDet = _mm_shuffle_ps(RDet,RDet,0x00);
|
|
|
|
// Devide the first 12 minterms with the determinant.
|
|
mtL1 = _mm_mul_ps(mtL1, RDet);
|
|
mtL2 = _mm_mul_ps(mtL2, RDet);
|
|
mtL3 = _mm_mul_ps(mtL3, RDet);
|
|
|
|
// Calculate the minterms of the forth line and devide by the determinant.
|
|
tt = _mm_ror_ps(c2,1); sum = _mm_mul_ps(tt,r1);
|
|
tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
|
|
tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
|
|
__m128 mtL4 = _mm_xor_ps(sum,Sign_NPNP.m);
|
|
mtL4 = _mm_mul_ps(mtL4, RDet);
|
|
|
|
// Now we just have to transpose the minterms matrix.
|
|
trns0 = _mm_unpacklo_ps(mtL1,mtL2);
|
|
trns1 = _mm_unpacklo_ps(mtL3,mtL4);
|
|
trns2 = _mm_unpackhi_ps(mtL1,mtL2);
|
|
trns3 = _mm_unpackhi_ps(mtL3,mtL4);
|
|
*res0 = _mm_movelh_ps(trns0,trns1);
|
|
*res1 = _mm_movehl_ps(trns1,trns0);
|
|
*res2 = _mm_movelh_ps(trns2,trns3);
|
|
*res3 = _mm_movehl_ps(trns3,trns2);
|
|
}
|
|
|
|
|
|
struct sse_meta_f16
|
|
{
|
|
typedef float meta_type;
|
|
typedef sse_meta_f4 vector_type;
|
|
sse_meta_f4 c[4];
|
|
|
|
/// Zero matrix.
sse_meta_f16()
{
  setZero();
}
|
|
|
|
sse_meta_f16(float xx, float xy, float xz, float xw,
|
|
float yx, float yy, float yz, float yw,
|
|
float zx, float zy, float zz, float zw,
|
|
float wx, float wy, float wz, float ww)
|
|
{ setValue(xx, xy, xz, xw, yz, yy, yz, yw, zx, zy, zz, zw, wx, wy, wz, ww); }
|
|
|
|
sse_meta_f16(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
|
|
{ setColumn(x, y, z, w); }
|
|
|
|
sse_meta_f16(__m128 x, __m128 y, __m128 z, __m128 w)
|
|
{ setColumn(x, y, z, w); }
|
|
|
|
inline void setValue(float xx, float xy, float xz, float xw,
|
|
float yx, float yy, float yz, float yw,
|
|
float zx, float zy, float zz, float zw,
|
|
float wx, float wy, float wz, float ww)
|
|
{
|
|
c[0].setValue(xx, yx, zx, wx);
|
|
c[1].setValue(xy, yy, zy, wy);
|
|
c[2].setValue(xz, yz, zz, wz);
|
|
c[3].setValue(xw, yw, zw, ww);
|
|
}
|
|
|
|
inline void setColumn(const sse_meta_f4& x, const sse_meta_f4& y, const sse_meta_f4& z, const sse_meta_f4& w)
|
|
{
|
|
c[0] = x; c[1] = y; c[2] = z; c[3] = w;
|
|
}
|
|
|
|
inline void setColumn(__m128 x, __m128 y, __m128 z, __m128 w)
|
|
{
|
|
c[0].setValue(x); c[1].setValue(y); c[2].setValue(z); c[3].setValue(w);
|
|
}
|
|
|
|
inline void setIdentity()
|
|
{
|
|
c[0].setValue(1, 0, 0, 0);
|
|
c[1].setValue(0, 1, 0, 0);
|
|
c[2].setValue(0, 0, 1, 0);
|
|
c[3].setValue(0, 0, 0, 1);
|
|
}
|
|
|
|
inline void setZero()
|
|
{
|
|
c[0].setValue(0);
|
|
c[1].setValue(0);
|
|
c[2].setValue(0);
|
|
c[3].setValue(0);
|
|
}
|
|
|
|
inline const sse_meta_f4& getColumn(size_t i) const
|
|
{
|
|
return c[i];
|
|
}
|
|
|
|
inline sse_meta_f4& getColumn(size_t i)
|
|
{
|
|
return c[i];
|
|
}
|
|
|
|
inline sse_meta_f4 getRow(size_t i) const
|
|
{
|
|
return sse_meta_f4(c[0][i], c[1][i], c[2][i], c[3][i]);
|
|
}
|
|
|
|
inline float operator () (size_t i, size_t j) const
|
|
{
|
|
return c[j][i];
|
|
}
|
|
|
|
inline float& operator () (size_t i, size_t j)
|
|
{
|
|
return c[j][i];
|
|
}
|
|
|
|
inline sse_meta_f4 operator * (const sse_meta_f4& v) const
|
|
{
|
|
return sse_meta_f4(_mm_add_ps(_mm_add_ps(_mm_mul_ps(c[0].v, vec_splat(v.v, 0)), _mm_mul_ps(c[1].v, vec_splat(v.v, 1))),
|
|
_mm_add_ps(_mm_mul_ps(c[2].v, vec_splat(v.v, 2)), _mm_mul_ps(c[3].v, vec_splat(v.v, 3)))
|
|
));
|
|
}
|
|
|
|
inline sse_meta_f16 operator * (const sse_meta_f16& mat) const
|
|
{
|
|
return sse_meta_f16((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2], (*this) * mat.c[3]);
|
|
}
|
|
|
|
|
|
inline sse_meta_f16 operator + (const sse_meta_f16& mat) const
|
|
{
|
|
return sse_meta_f16(c[0] + mat.c[0], c[1] + mat.c[1], c[2] + mat.c[2], c[3] + mat.c[3]);
|
|
}
|
|
|
|
inline sse_meta_f16 operator - (const sse_meta_f16& mat) const
|
|
{
|
|
return sse_meta_f16(c[0] - mat.c[0], c[1] - mat.c[1], c[2] - mat.c[2], c[3] - mat.c[3]);
|
|
}
|
|
|
|
inline sse_meta_f16 operator + (float t_) const
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
return sse_meta_f16(c[0] + t, c[1] + t, c[2] + t, c[3] + t);
|
|
}
|
|
|
|
inline sse_meta_f16 operator - (float t_) const
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
return sse_meta_f16(c[0] - t, c[1] - t, c[2] - t, c[3] - t);
|
|
}
|
|
|
|
inline sse_meta_f16 operator * (float t_) const
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
return sse_meta_f16(c[0] * t, c[1] * t, c[2] * t, c[3] * t);
|
|
}
|
|
|
|
inline sse_meta_f16 operator / (float t_) const
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
return sse_meta_f16(c[0] / t, c[1] / t, c[2] / t, c[3] / t);
|
|
}
|
|
|
|
inline sse_meta_f16& operator *= (const sse_meta_f16& mat)
|
|
{
|
|
setColumn((*this) * mat.c[0], (*this) * mat.c[1], (*this) * mat.c[2], (*this) * mat.c[3]);
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& operator += (const sse_meta_f16& mat)
|
|
{
|
|
c[0] += mat.c[0];
|
|
c[1] += mat.c[1];
|
|
c[2] += mat.c[2];
|
|
c[3] += mat.c[3];
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& operator -= (const sse_meta_f16& mat)
|
|
{
|
|
c[0] -= mat.c[0];
|
|
c[1] -= mat.c[1];
|
|
c[2] -= mat.c[2];
|
|
c[3] -= mat.c[3];
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& operator += (float t_)
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
c[0] += t;
|
|
c[1] += t;
|
|
c[2] += t;
|
|
c[3] += t;
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& operator -= (float t_)
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
c[0] -= t;
|
|
c[1] -= t;
|
|
c[2] -= t;
|
|
c[3] -= t;
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& operator *= (float t_)
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
c[0] *= t;
|
|
c[1] *= t;
|
|
c[2] *= t;
|
|
c[3] *= t;
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& operator /= (float t_)
|
|
{
|
|
sse_meta_f4 t(t_);
|
|
c[0] /= t;
|
|
c[1] /= t;
|
|
c[2] /= t;
|
|
c[3] /= t;
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& abs()
|
|
{
|
|
c[0] = detail::abs(c[0]);
|
|
c[1] = detail::abs(c[1]);
|
|
c[2] = detail::abs(c[2]);
|
|
c[3] = detail::abs(c[3]);
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& inverse()
|
|
{
|
|
__m128 r0, r1, r2, r3;
|
|
detail::inverse(c[0].v, c[1].v, c[2].v, c[3].v, &r0, &r1, &r2, &r3);
|
|
setColumn(r0, r1, r2, r3);
|
|
return *this;
|
|
}
|
|
|
|
inline sse_meta_f16& transpose()
|
|
{
|
|
__m128 r0, r1, r2, r3;
|
|
detail::transpose(c[0].v, c[1].v, c[2].v, c[3].v, &r0, &r1, &r2, &r3);
|
|
setColumn(r0, r1, r2, r3);
|
|
return *this;
|
|
}
|
|
|
|
inline float determinant() const
|
|
{
|
|
__m128 Va, Vb, Vc;
|
|
__m128 r1, r2, r3, tt, tt2;
|
|
__m128 sum, Det;
|
|
|
|
__m128 _L1 = c[0].v;
|
|
__m128 _L2 = c[1].v;
|
|
__m128 _L3 = c[2].v;
|
|
__m128 _L4 = c[3].v;
|
|
// Calculating the minterms for the first line.
|
|
|
|
// _mm_ror_ps is just a macro using _mm_shuffle_ps().
|
|
tt = _L4; tt2 = _mm_ror_ps(_L3,1);
|
|
Vc = _mm_mul_ps(tt2,_mm_ror_ps(tt,0)); // V3'·V4
|
|
Va = _mm_mul_ps(tt2,_mm_ror_ps(tt,2)); // V3'·V4"
|
|
Vb = _mm_mul_ps(tt2,_mm_ror_ps(tt,3)); // V3'·V4^
|
|
|
|
r1 = _mm_sub_ps(_mm_ror_ps(Va,1),_mm_ror_ps(Vc,2)); // V3"·V4^ - V3^·V4"
|
|
r2 = _mm_sub_ps(_mm_ror_ps(Vb,2),_mm_ror_ps(Vb,0)); // V3^·V4' - V3'·V4^
|
|
r3 = _mm_sub_ps(_mm_ror_ps(Va,0),_mm_ror_ps(Vc,1)); // V3'·V4" - V3"·V4'
|
|
|
|
tt = _L2;
|
|
Va = _mm_ror_ps(tt,1); sum = _mm_mul_ps(Va,r1);
|
|
Vb = _mm_ror_ps(tt,2); sum = _mm_add_ps(sum,_mm_mul_ps(Vb,r2));
|
|
Vc = _mm_ror_ps(tt,3); sum = _mm_add_ps(sum,_mm_mul_ps(Vc,r3));
|
|
|
|
// Calculating the determinant.
|
|
Det = _mm_mul_ps(sum,_L1);
|
|
Det = _mm_add_ps(Det,_mm_movehl_ps(Det,Det));
|
|
|
|
// Calculating the minterms of the second line (using previous results).
|
|
tt = _mm_ror_ps(_L1,1); sum = _mm_mul_ps(tt,r1);
|
|
tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r2));
|
|
tt = _mm_ror_ps(tt,1); sum = _mm_add_ps(sum,_mm_mul_ps(tt,r3));
|
|
|
|
// Testing the determinant.
|
|
Det = _mm_sub_ss(Det,_mm_shuffle_ps(Det,Det,1));
|
|
return _mm_cvtss_f32(Det);
|
|
}
|
|
|
|
inline sse_meta_f16 transposeTimes(const sse_meta_f16& other) const
|
|
{
|
|
return sse_meta_f16(dot_prod4(c[0], other.c[0]), dot_prod4(c[0], other.c[1]), dot_prod4(c[0], other.c[2]), dot_prod4(c[0], other.c[3]),
|
|
dot_prod4(c[1], other.c[0]), dot_prod4(c[1], other.c[1]), dot_prod4(c[1], other.c[2]), dot_prod4(c[1], other.c[3]),
|
|
dot_prod4(c[2], other.c[0]), dot_prod4(c[2], other.c[1]), dot_prod4(c[2], other.c[2]), dot_prod4(c[2], other.c[3]),
|
|
dot_prod4(c[3], other.c[0]), dot_prod4(c[3], other.c[1]), dot_prod4(c[3], other.c[2]), dot_prod4(c[3], other.c[3]));
|
|
}
|
|
|
|
inline sse_meta_f16 timesTranspose(const sse_meta_f16& m) const
|
|
{
|
|
sse_meta_f16 tmp(m);
|
|
return (*this) * tmp.transpose();
|
|
}
|
|
|
|
inline sse_meta_f4 transposeTimes(const sse_meta_f4& v) const
|
|
{
|
|
return sse_meta_f4(dot_prod4(c[0], v), dot_prod4(c[1], v), dot_prod4(c[2], v), dot_prod4(c[3], v));
|
|
}
|
|
|
|
inline float transposeDot(size_t i, const sse_meta_f4& v) const
|
|
{
|
|
return dot_prod4(c[i], v);
|
|
}
|
|
|
|
inline float dot(size_t i, const sse_meta_f4& v) const
|
|
{
|
|
return v[0] * c[0][i] + v[1] * c[1][i] + v[2] * c[2][i] + v[3] * c[3][i];
|
|
}
|
|
|
|
};
|
|
|
|
/// @brief Return a copy of mat in which every column has been passed through abs().
static inline sse_meta_f16 abs(const sse_meta_f16& mat)
{
  // Work on a copy so the argument stays untouched; the member abs()
  // applies detail::abs to each of the four columns.
  sse_meta_f16 result(mat);
  result.abs();
  return result;
}
|
|
|
|
/// @brief Return the transpose of mat as a new matrix; mat is not modified.
static inline sse_meta_f16 transpose(const sse_meta_f16& mat)
{
  // Pull the four column registers out of the matrix.
  const __m128 in0 = mat.getColumn(0).v;
  const __m128 in1 = mat.getColumn(1).v;
  const __m128 in2 = mat.getColumn(2).v;
  const __m128 in3 = mat.getColumn(3).v;

  // The register-level overload writes the transposed columns here.
  __m128 out0, out1, out2, out3;
  transpose(in0, in1, in2, in3, &out0, &out1, &out2, &out3);

  return sse_meta_f16(out0, out1, out2, out3);
}
|
|
|
|
|
|
/// @brief Return the inverse of mat as a new matrix; mat is not modified.
///
/// NOTE(review): no singularity check is made here; the outcome for a
/// non-invertible matrix is determined by the register-level inverse().
static inline sse_meta_f16 inverse(const sse_meta_f16& mat)
{
  // Pull the four column registers out of the matrix.
  const __m128 in0 = mat.getColumn(0).v;
  const __m128 in1 = mat.getColumn(1).v;
  const __m128 in2 = mat.getColumn(2).v;
  const __m128 in3 = mat.getColumn(3).v;

  // The register-level overload writes the inverted columns here.
  __m128 out0, out1, out2, out3;
  inverse(in0, in1, in2, in3, &out0, &out1, &out2, &out3);

  return sse_meta_f16(out0, out1, out2, out3);
}
|
|
|
|
|
|
|
|
|
|
} // detail
|
|
} // fcl
|
|
|
|
|
|
#endif
|