AnimTestbed/3rdparty/ozz-animation/include/ozz/base/maths/internal/simd_math_sse-inl.h

2123 lines
76 KiB
C++

//----------------------------------------------------------------------------//
// //
// ozz-animation is hosted at http://github.com/guillaumeblanc/ozz-animation //
// and distributed under the MIT License (MIT). //
// //
// Copyright (c) Guillaume Blanc //
// //
// Permission is hereby granted, free of charge, to any person obtaining a //
// copy of this software and associated documentation files (the "Software"), //
// to deal in the Software without restriction, including without limitation //
// the rights to use, copy, modify, merge, publish, distribute, sublicense, //
// and/or sell copies of the Software, and to permit persons to whom the //
// Software is furnished to do so, subject to the following conditions: //
// //
// The above copyright notice and this permission notice shall be included in //
// all copies or substantial portions of the Software. //
// //
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR //
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, //
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL //
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER //
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING //
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER //
// DEALINGS IN THE SOFTWARE. //
// //
//----------------------------------------------------------------------------//
#ifndef OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_
#define OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_
// SIMD SSE2+ implementation, based on scalar floats.
#include <stdint.h>
#include <cassert>
// Temporarily needed while trigonometric functions aren't implemented.
#include <cmath>
#include "ozz/base/maths/math_constant.h"
namespace ozz {
namespace math {
namespace simd_float4 {
// Internal macros.
// Unused components of the result vector are replicated from the first input
// argument.
// AVX provides a single-input permute; the SSE fallback shuffles a register
// with itself to get the same effect.
#ifdef OZZ_SIMD_AVX
#define OZZ_SHUFFLE_PS1(_v, _m) _mm_permute_ps(_v, _m)
#else // OZZ_SIMD_AVX
#define OZZ_SHUFFLE_PS1(_v, _m) _mm_shuffle_ps(_v, _v, _m)
#endif // OZZ_SIMD_AVX
// Broadcasts lane _i of a float (resp. integer) register to all four lanes.
#define OZZ_SSE_SPLAT_F(_v, _i) OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(_i, _i, _i, _i))
#define OZZ_SSE_SPLAT_I(_v, _i) \
_mm_shuffle_epi32(_v, _MM_SHUFFLE(_i, _i, _i, _i))
// Horizontal adds: only lane x of the result is the sum; other lanes are
// whatever the scalar (_ss) ops leave behind, as documented per macro.
// _v.x + _v.y, _v.y, _v.z, _v.w
#define OZZ_SSE_HADD2_F(_v) _mm_add_ss(_v, OZZ_SSE_SPLAT_F(_v, 1))
// _v.x + _v.y + _v.z, _v.y, _v.z, _v.w
#define OZZ_SSE_HADD3_F(_v) \
_mm_add_ss(_mm_add_ss(_v, OZZ_SSE_SPLAT_F(_v, 2)), OZZ_SSE_SPLAT_F(_v, 1))
// _v.x + _v.y + _v.z + _v.w, ?, ?, ?
// movehl folds {z,w} onto {x,y}, then the remaining pair is summed in x.
#define OZZ_SSE_HADD4_F(_v, _r) \
do { \
const __m128 haddxyzw = _mm_add_ps(_v, _mm_movehl_ps(_v, _v)); \
_r = _mm_add_ss(haddxyzw, OZZ_SSE_SPLAT_F(haddxyzw, 1)); \
} while (void(0), 0)
// dot2, ?, ?, ?
#define OZZ_SSE_DOT2_F(_a, _b, _r) \
do { \
const __m128 ab = _mm_mul_ps(_a, _b); \
_r = _mm_add_ss(ab, OZZ_SSE_SPLAT_F(ab, 1)); \
\
} while (void(0), 0)
#ifdef OZZ_SIMD_SSE4_1
// SSE4.1 dpps computes the dot product in one instruction; masks 0x7f/0xff
// select xyz/xyzw inputs and broadcast the sum to all lanes.
// dot3, ?, ?, ?
#define OZZ_SSE_DOT3_F(_a, _b, _r) \
do { \
_r = _mm_dp_ps(_a, _b, 0x7f); \
} while (void(0), 0)
// dot4, ?, ?, ?
#define OZZ_SSE_DOT4_F(_a, _b, _r) \
do { \
_r = _mm_dp_ps(_a, _b, 0xff); \
} while (void(0), 0)
#else // OZZ_SIMD_SSE4_1
// Pre-SSE4.1 fallback: multiply then horizontal-add.
// dot3, ?, ?, ?
#define OZZ_SSE_DOT3_F(_a, _b, _r) \
do { \
const __m128 ab = _mm_mul_ps(_a, _b); \
_r = OZZ_SSE_HADD3_F(ab); \
} while (void(0), 0)
// dot4, ?, ?, ?
#define OZZ_SSE_DOT4_F(_a, _b, _r) \
do { \
const __m128 ab = _mm_mul_ps(_a, _b); \
OZZ_SSE_HADD4_F(ab, _r); \
} while (void(0), 0)
#endif // OZZ_SIMD_SSE4_1
// FMA operations
// MADD = a*b+c, MSUB = a*b-c, NMADD = c-a*b, NMSUB = -(a*b+c).
// The *X variants operate on lane x only (scalar _ss forms).
// Without hardware FMA the fallbacks round twice (mul then add), so results
// may differ in the last ulp from the fused versions.
#ifdef OZZ_SIMD_FMA
#define OZZ_MADD(_a, _b, _c) _mm_fmadd_ps(_a, _b, _c)
#define OZZ_MSUB(_a, _b, _c) _mm_fmsub_ps(_a, _b, _c)
#define OZZ_NMADD(_a, _b, _c) _mm_fnmadd_ps(_a, _b, _c)
#define OZZ_NMSUB(_a, _b, _c) _mm_fnmsub_ps(_a, _b, _c)
#define OZZ_MADDX(_a, _b, _c) _mm_fmadd_ss(_a, _b, _c)
#define OZZ_MSUBX(_a, _b, _c) _mm_fmsub_ss(_a, _b, _c)
#define OZZ_NMADDX(_a, _b, _c) _mm_fnmadd_ss(_a, _b, _c)
#define OZZ_NMSUBX(_a, _b, _c) _mm_fnmsub_ss(_a, _b, _c)
#else // OZZ_SIMD_FMA
#define OZZ_MADD(_a, _b, _c) _mm_add_ps(_mm_mul_ps(_a, _b), _c)
#define OZZ_MSUB(_a, _b, _c) _mm_sub_ps(_mm_mul_ps(_a, _b), _c)
#define OZZ_NMADD(_a, _b, _c) _mm_sub_ps(_c, _mm_mul_ps(_a, _b))
#define OZZ_NMSUB(_a, _b, _c) (-_mm_add_ps(_mm_mul_ps(_a, _b), _c))
#define OZZ_MADDX(_a, _b, _c) _mm_add_ss(_mm_mul_ss(_a, _b), _c)
#define OZZ_MSUBX(_a, _b, _c) _mm_sub_ss(_mm_mul_ss(_a, _b), _c)
#define OZZ_NMADDX(_a, _b, _c) _mm_sub_ss(_c, _mm_mul_ss(_a, _b))
#define OZZ_NMSUBX(_a, _b, _c) (-_mm_add_ss(_mm_mul_ss(_a, _b), _c))
#endif // OZZ_SIMD_FMA
// Divides the x lane of _a by the x lane of _b; lanes y, z, w are copied
// from _a (semantics of _mm_div_ss).
// NOTE(review): this duplicates ozz::math::DivX defined later in this file,
// and sits among the internal macros inside namespace simd_float4 — confirm
// it is intentional and that callers actually use simd_float4::DivX.
OZZ_INLINE SimdFloat4 DivX(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_div_ss(_a, _b);
}
// Per-lane select: lanes of _true where mask _b is set, _false elsewhere.
#ifdef OZZ_SIMD_SSE4_1
// blendv keys off the mask's sign bit only.
#define OZZ_SSE_SELECT_F(_b, _true, _false) \
_mm_blendv_ps(_false, _true, _mm_castsi128_ps(_b))
#define OZZ_SSE_SELECT_I(_b, _true, _false) _mm_blendv_epi8(_false, _true, _b)
#else // OZZ_SIMD_SSE4_1
// and/andnot fallback: requires _b lanes to be all-ones or all-zeros.
#define OZZ_SSE_SELECT_F(_b, _true, _false) \
_mm_or_ps(_mm_and_ps(_true, _mm_castsi128_ps(_b)), \
_mm_andnot_ps(_mm_castsi128_ps(_b), _false))
#define OZZ_SSE_SELECT_I(_b, _true, _false) \
_mm_or_si128(_mm_and_si128(_true, _b), _mm_andnot_si128(_b, _false))
#endif // OZZ_SIMD_SSE4_1
// Constant vectors are synthesized from an all-ones register
// (_mm_cmpeq_epi32(zero, zero)) and shifts, avoiding memory loads:
// all-ones << 25 >> 2 yields 0x3f800000, the bit pattern of 1.f.
OZZ_INLINE SimdFloat4 zero() { return _mm_setzero_ps(); }
// {1.f, 1.f, 1.f, 1.f}
OZZ_INLINE SimdFloat4 one() {
const __m128i zero = _mm_setzero_si128();
return _mm_castsi128_ps(
_mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2));
}
// {1.f, 0, 0, 0}: byte-shift right by 12 keeps 1.f in lane x only.
OZZ_INLINE SimdFloat4 x_axis() {
const __m128i zero = _mm_setzero_si128();
const __m128i one =
_mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2);
return _mm_castsi128_ps(_mm_srli_si128(one, 12));
}
// {0, 1.f, 0, 0}: isolate 1.f in lane x, then byte-shift left into lane y.
OZZ_INLINE SimdFloat4 y_axis() {
const __m128i zero = _mm_setzero_si128();
const __m128i one =
_mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2);
return _mm_castsi128_ps(_mm_slli_si128(_mm_srli_si128(one, 12), 4));
}
// {0, 0, 1.f, 0}
OZZ_INLINE SimdFloat4 z_axis() {
const __m128i zero = _mm_setzero_si128();
const __m128i one =
_mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2);
return _mm_castsi128_ps(_mm_slli_si128(_mm_srli_si128(one, 12), 8));
}
// {0, 0, 0, 1.f}
OZZ_INLINE SimdFloat4 w_axis() {
const __m128i zero = _mm_setzero_si128();
const __m128i one =
_mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2);
return _mm_castsi128_ps(_mm_slli_si128(one, 12));
}
// Builds a vector from 4 scalar components {_x, _y, _z, _w}.
OZZ_INLINE SimdFloat4 Load(float _x, float _y, float _z, float _w) {
  // _mm_setr_ps takes lanes in memory order, so the arguments need no
  // reversal; identical to _mm_set_ps(_w, _z, _y, _x).
  return _mm_setr_ps(_x, _y, _z, _w);
}
// {_x, 0, 0, 0}
OZZ_INLINE SimdFloat4 LoadX(float _x) { return _mm_set_ss(_x); }
// {_x, _x, _x, _x}
OZZ_INLINE SimdFloat4 Load1(float _x) { return _mm_set_ps1(_x); }
// Aligned load of 4 floats; _f must be 16-byte aligned.
OZZ_INLINE SimdFloat4 LoadPtr(const float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
return _mm_load_ps(_f);
}
// Unaligned load of 4 floats; _f only needs float (4-byte) alignment.
OZZ_INLINE SimdFloat4 LoadPtrU(const float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
return _mm_loadu_ps(_f);
}
// Loads _f[0] into lane x; y, z, w are zeroed.
OZZ_INLINE SimdFloat4 LoadXPtrU(const float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
return _mm_load_ss(_f);
}
// Broadcasts _f[0] to all four lanes.
OZZ_INLINE SimdFloat4 Load1PtrU(const float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
return _mm_load_ps1(_f);
}
// {_f[0], _f[1], 0, 0}: two scalar loads interleaved so only 2 floats are
// read (no out-of-bounds access past _f[1]).
OZZ_INLINE SimdFloat4 Load2PtrU(const float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
return _mm_unpacklo_ps(_mm_load_ss(_f + 0), _mm_load_ss(_f + 1));
}
// {_f[0], _f[1], _f[2], 0}: three scalar loads, never reads _f[3].
OZZ_INLINE SimdFloat4 Load3PtrU(const float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
return _mm_movelh_ps(
_mm_unpacklo_ps(_mm_load_ss(_f + 0), _mm_load_ss(_f + 1)),
_mm_load_ss(_f + 2));
}
// Per-lane int -> float conversion.
OZZ_INLINE SimdFloat4 FromInt(_SimdInt4 _i) { return _mm_cvtepi32_ps(_i); }
} // namespace simd_float4
// Extracts a single lane as a scalar float. Lanes other than x are first
// moved into lane x (splat / movehl) because cvtss only reads lane x.
OZZ_INLINE float GetX(_SimdFloat4 _v) { return _mm_cvtss_f32(_v); }
OZZ_INLINE float GetY(_SimdFloat4 _v) {
return _mm_cvtss_f32(OZZ_SSE_SPLAT_F(_v, 1));
}
OZZ_INLINE float GetZ(_SimdFloat4 _v) {
return _mm_cvtss_f32(_mm_movehl_ps(_v, _v));
}
OZZ_INLINE float GetW(_SimdFloat4 _v) {
return _mm_cvtss_f32(OZZ_SSE_SPLAT_F(_v, 3));
}
// Each Set* returns _v with one lane replaced by lane x of _f; the other
// lanes of _v are preserved.
OZZ_INLINE SimdFloat4 SetX(_SimdFloat4 _v, _SimdFloat4 _f) {
return _mm_move_ss(_v, _f);
}
OZZ_INLINE SimdFloat4 SetY(_SimdFloat4 _v, _SimdFloat4 _f) {
// xfnn = {v.x, f.x, v.y, f.y}; shuffle keeps {v.x, f.x} low, {v.z, v.w} high.
const __m128 xfnn = _mm_unpacklo_ps(_v, _f);
return _mm_shuffle_ps(xfnn, _v, _MM_SHUFFLE(3, 2, 1, 0));
}
OZZ_INLINE SimdFloat4 SetZ(_SimdFloat4 _v, _SimdFloat4 _f) {
// ffww = {f.x, f.x, v.w, v.w}; result = {v.x, v.y, f.x, v.w}.
const __m128 ffww = _mm_shuffle_ps(_f, _v, _MM_SHUFFLE(3, 3, 0, 0));
return _mm_shuffle_ps(_v, ffww, _MM_SHUFFLE(2, 0, 1, 0));
}
OZZ_INLINE SimdFloat4 SetW(_SimdFloat4 _v, _SimdFloat4 _f) {
// ffzz = {f.x, f.x, v.z, v.z}; result = {v.x, v.y, v.z, f.x}.
const __m128 ffzz = _mm_shuffle_ps(_f, _v, _MM_SHUFFLE(2, 2, 0, 0));
return _mm_shuffle_ps(_v, ffzz, _MM_SHUFFLE(0, 2, 1, 0));
}
// Runtime-indexed variant: spills to memory through a union because SSE has
// no variable-lane insert; _ith must be in [0, 3].
OZZ_INLINE SimdFloat4 SetI(_SimdFloat4 _v, _SimdFloat4 _f, int _ith) {
assert(_ith >= 0 && _ith <= 3 && "Invalid index, out of range.");
union {
SimdFloat4 ret;
float af[4];
} u = {_v};
u.af[_ith] = _mm_cvtss_f32(_f);
return u.ret;
}
// Aligned stores (Store*Ptr) require 16-byte alignment; unaligned variants
// (Store*PtrU) require only float alignment. StoreN writes exactly N floats
// and never touches memory past them.
OZZ_INLINE void StorePtr(_SimdFloat4 _v, float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
_mm_store_ps(_f, _v);
}
OZZ_INLINE void Store1Ptr(_SimdFloat4 _v, float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
_mm_store_ss(_f, _v);
}
OZZ_INLINE void Store2Ptr(_SimdFloat4 _v, float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
_mm_storel_pi(reinterpret_cast<__m64*>(_f), _v);
}
OZZ_INLINE void Store3Ptr(_SimdFloat4 _v, float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
// Low pair {x, y} via storel, then z via movehl + scalar store.
_mm_storel_pi(reinterpret_cast<__m64*>(_f), _v);
_mm_store_ss(_f + 2, _mm_movehl_ps(_v, _v));
}
OZZ_INLINE void StorePtrU(_SimdFloat4 _v, float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
_mm_storeu_ps(_f, _v);
}
OZZ_INLINE void Store1PtrU(_SimdFloat4 _v, float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
_mm_store_ss(_f, _v);
}
OZZ_INLINE void Store2PtrU(_SimdFloat4 _v, float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
_mm_store_ss(_f + 0, _v);
_mm_store_ss(_f + 1, OZZ_SSE_SPLAT_F(_v, 1));
}
OZZ_INLINE void Store3PtrU(_SimdFloat4 _v, float* _f) {
assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
_mm_store_ss(_f + 0, _v);
_mm_store_ss(_f + 1, OZZ_SSE_SPLAT_F(_v, 1));
_mm_store_ss(_f + 2, _mm_movehl_ps(_v, _v));
}
// Broadcasts one lane of _v to all four lanes.
OZZ_INLINE SimdFloat4 SplatX(_SimdFloat4 _v) { return OZZ_SSE_SPLAT_F(_v, 0); }
OZZ_INLINE SimdFloat4 SplatY(_SimdFloat4 _v) { return OZZ_SSE_SPLAT_F(_v, 1); }
OZZ_INLINE SimdFloat4 SplatZ(_SimdFloat4 _v) { return OZZ_SSE_SPLAT_F(_v, 2); }
OZZ_INLINE SimdFloat4 SplatW(_SimdFloat4 _v) { return OZZ_SSE_SPLAT_F(_v, 3); }
// Compile-time lane permutation: returns {_v[_X], _v[_Y], _v[_Z], _v[_W]}.
template <size_t _X, size_t _Y, size_t _Z, size_t _W>
OZZ_INLINE SimdFloat4 Swizzle(_SimdFloat4 _v) {
static_assert(_X <= 3 && _Y <= 3 && _Z <= 3 && _W <= 3,
"Indices must be between 0 and 3");
return OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(_W, _Z, _Y, _X));
}
// Specializations below map common patterns to cheaper single instructions
// (identity, movelh/movehl, unpacklo/unpackhi) instead of a shuffle.
template <>
OZZ_INLINE SimdFloat4 Swizzle<0, 1, 2, 3>(_SimdFloat4 _v) {
return _v;
}
template <>
OZZ_INLINE SimdFloat4 Swizzle<0, 1, 0, 1>(_SimdFloat4 _v) {
return _mm_movelh_ps(_v, _v);
}
template <>
OZZ_INLINE SimdFloat4 Swizzle<2, 3, 2, 3>(_SimdFloat4 _v) {
return _mm_movehl_ps(_v, _v);
}
template <>
OZZ_INLINE SimdFloat4 Swizzle<0, 0, 1, 1>(_SimdFloat4 _v) {
return _mm_unpacklo_ps(_v, _v);
}
template <>
OZZ_INLINE SimdFloat4 Swizzle<2, 2, 3, 3>(_SimdFloat4 _v) {
return _mm_unpackhi_ps(_v, _v);
}
// TransposeRxC treats _in as an R-vectors-by-C-components matrix and writes
// its transpose to _out. All use the classic unpacklo/unpackhi interleave;
// missing rows/columns are padded with zeros.
// 4 vectors' x components gathered into one vector.
OZZ_INLINE void Transpose4x1(const SimdFloat4 _in[4], SimdFloat4 _out[1]) {
const __m128 xz = _mm_unpacklo_ps(_in[0], _in[2]);
const __m128 yw = _mm_unpacklo_ps(_in[1], _in[3]);
_out[0] = _mm_unpacklo_ps(xz, yw);
}
// One vector scattered into the x lanes of 4 vectors (y, z, w zeroed).
OZZ_INLINE void Transpose1x4(const SimdFloat4 _in[1], SimdFloat4 _out[4]) {
const __m128 zwzw = _mm_movehl_ps(_in[0], _in[0]);
const __m128 yyyy = OZZ_SSE_SPLAT_F(_in[0], 1);
const __m128 wwww = OZZ_SSE_SPLAT_F(_in[0], 3);
const __m128 zero = _mm_setzero_ps();
_out[0] = _mm_move_ss(zero, _in[0]);
_out[1] = _mm_move_ss(zero, yyyy);
_out[2] = _mm_move_ss(zero, zwzw);
_out[3] = _mm_move_ss(zero, wwww);
}
// 4 vectors' xy components -> 2 vectors.
OZZ_INLINE void Transpose4x2(const SimdFloat4 _in[4], SimdFloat4 _out[2]) {
const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[2]);
const __m128 tmp1 = _mm_unpacklo_ps(_in[1], _in[3]);
_out[0] = _mm_unpacklo_ps(tmp0, tmp1);
_out[1] = _mm_unpackhi_ps(tmp0, tmp1);
}
// 2 vectors -> 4 vectors holding xy pairs (z, w zeroed).
OZZ_INLINE void Transpose2x4(const SimdFloat4 _in[2], SimdFloat4 _out[4]) {
const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[1]);
const __m128 tmp1 = _mm_unpackhi_ps(_in[0], _in[1]);
const __m128 zero = _mm_setzero_ps();
_out[0] = _mm_movelh_ps(tmp0, zero);
_out[1] = _mm_movehl_ps(zero, tmp0);
_out[2] = _mm_movelh_ps(tmp1, zero);
_out[3] = _mm_movehl_ps(zero, tmp1);
}
// 4 vectors' xyz components -> 3 vectors.
OZZ_INLINE void Transpose4x3(const SimdFloat4 _in[4], SimdFloat4 _out[3]) {
const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[2]);
const __m128 tmp1 = _mm_unpacklo_ps(_in[1], _in[3]);
const __m128 tmp2 = _mm_unpackhi_ps(_in[0], _in[2]);
const __m128 tmp3 = _mm_unpackhi_ps(_in[1], _in[3]);
_out[0] = _mm_unpacklo_ps(tmp0, tmp1);
_out[1] = _mm_unpackhi_ps(tmp0, tmp1);
_out[2] = _mm_unpacklo_ps(tmp2, tmp3);
}
// 3 vectors -> 4 vectors holding xyz triples (w zeroed).
OZZ_INLINE void Transpose3x4(const SimdFloat4 _in[3], SimdFloat4 _out[4]) {
const __m128 zero = _mm_setzero_ps();
const __m128 temp0 = _mm_unpacklo_ps(_in[0], _in[1]);
const __m128 temp1 = _mm_unpacklo_ps(_in[2], zero);
const __m128 temp2 = _mm_unpackhi_ps(_in[0], _in[1]);
const __m128 temp3 = _mm_unpackhi_ps(_in[2], zero);
_out[0] = _mm_movelh_ps(temp0, temp1);
_out[1] = _mm_movehl_ps(temp1, temp0);
_out[2] = _mm_movelh_ps(temp2, temp3);
_out[3] = _mm_movehl_ps(temp3, temp2);
}
// Full 4x4 transpose (same network as _MM_TRANSPOSE4_PS).
OZZ_INLINE void Transpose4x4(const SimdFloat4 _in[4], SimdFloat4 _out[4]) {
const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[2]);
const __m128 tmp1 = _mm_unpacklo_ps(_in[1], _in[3]);
const __m128 tmp2 = _mm_unpackhi_ps(_in[0], _in[2]);
const __m128 tmp3 = _mm_unpackhi_ps(_in[1], _in[3]);
_out[0] = _mm_unpacklo_ps(tmp0, tmp1);
_out[1] = _mm_unpackhi_ps(tmp0, tmp1);
_out[2] = _mm_unpacklo_ps(tmp2, tmp3);
_out[3] = _mm_unpackhi_ps(tmp2, tmp3);
}
// Transposes a 16x16 float matrix stored as 16 SimdFloat4 (4 vectors per
// matrix row, row-major). Implemented as four independent 4x4 transposes;
// block (r, c) of the input lands at block (c, r) of the output, hence the
// strided _out indices (0/4/8/12, 1/5/9/13, ...).
OZZ_INLINE void Transpose16x16(const SimdFloat4 _in[16], SimdFloat4 _out[16]) {
const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[2]);
const __m128 tmp1 = _mm_unpacklo_ps(_in[1], _in[3]);
_out[0] = _mm_unpacklo_ps(tmp0, tmp1);
_out[4] = _mm_unpackhi_ps(tmp0, tmp1);
const __m128 tmp2 = _mm_unpackhi_ps(_in[0], _in[2]);
const __m128 tmp3 = _mm_unpackhi_ps(_in[1], _in[3]);
_out[8] = _mm_unpacklo_ps(tmp2, tmp3);
_out[12] = _mm_unpackhi_ps(tmp2, tmp3);
const __m128 tmp4 = _mm_unpacklo_ps(_in[4], _in[6]);
const __m128 tmp5 = _mm_unpacklo_ps(_in[5], _in[7]);
_out[1] = _mm_unpacklo_ps(tmp4, tmp5);
_out[5] = _mm_unpackhi_ps(tmp4, tmp5);
const __m128 tmp6 = _mm_unpackhi_ps(_in[4], _in[6]);
const __m128 tmp7 = _mm_unpackhi_ps(_in[5], _in[7]);
_out[9] = _mm_unpacklo_ps(tmp6, tmp7);
_out[13] = _mm_unpackhi_ps(tmp6, tmp7);
const __m128 tmp8 = _mm_unpacklo_ps(_in[8], _in[10]);
const __m128 tmp9 = _mm_unpacklo_ps(_in[9], _in[11]);
_out[2] = _mm_unpacklo_ps(tmp8, tmp9);
_out[6] = _mm_unpackhi_ps(tmp8, tmp9);
const __m128 tmp10 = _mm_unpackhi_ps(_in[8], _in[10]);
const __m128 tmp11 = _mm_unpackhi_ps(_in[9], _in[11]);
_out[10] = _mm_unpacklo_ps(tmp10, tmp11);
_out[14] = _mm_unpackhi_ps(tmp10, tmp11);
const __m128 tmp12 = _mm_unpacklo_ps(_in[12], _in[14]);
const __m128 tmp13 = _mm_unpacklo_ps(_in[13], _in[15]);
_out[3] = _mm_unpacklo_ps(tmp12, tmp13);
_out[7] = _mm_unpackhi_ps(tmp12, tmp13);
const __m128 tmp14 = _mm_unpackhi_ps(_in[12], _in[14]);
const __m128 tmp15 = _mm_unpackhi_ps(_in[13], _in[15]);
_out[11] = _mm_unpacklo_ps(tmp14, tmp15);
_out[15] = _mm_unpackhi_ps(tmp14, tmp15);
}
// Thin wrappers over the OZZ_*ADD/SUB macros; fused single-instruction when
// OZZ_SIMD_FMA is defined, two-instruction (double-rounded) otherwise.
// _a * _b + _c
OZZ_INLINE SimdFloat4 MAdd(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
return OZZ_MADD(_a, _b, _c);
}
// _a * _b - _c
OZZ_INLINE SimdFloat4 MSub(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
return OZZ_MSUB(_a, _b, _c);
}
// _c - _a * _b
OZZ_INLINE SimdFloat4 NMAdd(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
return OZZ_NMADD(_a, _b, _c);
}
// -(_a * _b + _c)
OZZ_INLINE SimdFloat4 NMSub(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
return OZZ_NMSUB(_a, _b, _c);
}
// Divides lane x of _a by lane x of _b; lanes y, z, w come from _a.
OZZ_INLINE SimdFloat4 DivX(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_div_ss(_a, _b);
}
// Horizontal adds and dot products: the sum/dot is valid in lane x only;
// remaining lanes are undefined (see the macro comments above).
OZZ_INLINE SimdFloat4 HAdd2(_SimdFloat4 _v) { return OZZ_SSE_HADD2_F(_v); }
OZZ_INLINE SimdFloat4 HAdd3(_SimdFloat4 _v) { return OZZ_SSE_HADD3_F(_v); }
OZZ_INLINE SimdFloat4 HAdd4(_SimdFloat4 _v) {
__m128 hadd4;
OZZ_SSE_HADD4_F(_v, hadd4);
return hadd4;
}
OZZ_INLINE SimdFloat4 Dot2(_SimdFloat4 _a, _SimdFloat4 _b) {
__m128 dot2;
OZZ_SSE_DOT2_F(_a, _b, dot2);
return dot2;
}
OZZ_INLINE SimdFloat4 Dot3(_SimdFloat4 _a, _SimdFloat4 _b) {
__m128 dot3;
OZZ_SSE_DOT3_F(_a, _b, dot3);
return dot3;
}
OZZ_INLINE SimdFloat4 Dot4(_SimdFloat4 _a, _SimdFloat4 _b) {
__m128 dot4;
OZZ_SSE_DOT4_F(_a, _b, dot4);
return dot4;
}
// 3D cross product of the xyz lanes (w is don't-care).
OZZ_INLINE SimdFloat4 Cross3(_SimdFloat4 _a, _SimdFloat4 _b) {
// Implementation with 3 shuffles only is based on:
// https://geometrian.com/programming/tutorials/cross-product
// shufc = a * b.yzxw - b * a.yzxw, then rotate the result back.
const __m128 shufa = OZZ_SHUFFLE_PS1(_a, _MM_SHUFFLE(3, 0, 2, 1));
const __m128 shufb = OZZ_SHUFFLE_PS1(_b, _MM_SHUFFLE(3, 0, 2, 1));
const __m128 shufc = OZZ_MSUB(_a, shufb, _mm_mul_ps(_b, shufa));
return OZZ_SHUFFLE_PS1(shufc, _MM_SHUFFLE(3, 0, 2, 1));
}
// Est variants use the hardware approximation instructions (~12-bit
// precision); the *NR variants refine them with one Newton-Raphson step.
// *X variants only compute lane x, passing y, z, w through from _v.
OZZ_INLINE SimdFloat4 RcpEst(_SimdFloat4 _v) { return _mm_rcp_ps(_v); }
OZZ_INLINE SimdFloat4 RcpEstNR(_SimdFloat4 _v) {
const __m128 nr = _mm_rcp_ps(_v);
// Do one more Newton-Raphson step to improve precision.
// x1 = 2*x0 - v*x0^2
return OZZ_NMADD(_mm_mul_ps(nr, nr), _v, _mm_add_ps(nr, nr));
}
OZZ_INLINE SimdFloat4 RcpEstX(_SimdFloat4 _v) { return _mm_rcp_ss(_v); }
OZZ_INLINE SimdFloat4 RcpEstXNR(_SimdFloat4 _v) {
const __m128 nr = _mm_rcp_ss(_v);
// Do one more Newton-Raphson step to improve precision.
return OZZ_NMADDX(_mm_mul_ss(nr, nr), _v, _mm_add_ss(nr, nr));
}
OZZ_INLINE SimdFloat4 Sqrt(_SimdFloat4 _v) { return _mm_sqrt_ps(_v); }
OZZ_INLINE SimdFloat4 SqrtX(_SimdFloat4 _v) { return _mm_sqrt_ss(_v); }
OZZ_INLINE SimdFloat4 RSqrtEst(_SimdFloat4 _v) { return _mm_rsqrt_ps(_v); }
OZZ_INLINE SimdFloat4 RSqrtEstNR(_SimdFloat4 _v) {
const __m128 nr = _mm_rsqrt_ps(_v);
// Do one more Newton-Raphson step to improve precision.
// x1 = 0.5*x0*(3 - v*x0^2)
return _mm_mul_ps(_mm_mul_ps(_mm_set_ps1(.5f), nr),
OZZ_NMADD(_mm_mul_ps(_v, nr), nr, _mm_set_ps1(3.f)));
}
OZZ_INLINE SimdFloat4 RSqrtEstX(_SimdFloat4 _v) { return _mm_rsqrt_ss(_v); }
OZZ_INLINE SimdFloat4 RSqrtEstXNR(_SimdFloat4 _v) {
const __m128 nr = _mm_rsqrt_ss(_v);
// Do one more Newton-Raphson step to improve precision.
return _mm_mul_ss(_mm_mul_ss(_mm_set_ps1(.5f), nr),
OZZ_NMADDX(_mm_mul_ss(_v, nr), nr, _mm_set_ps1(3.f)));
}
// Per-lane absolute value: clears the sign bit with a 0x7fffffff mask.
OZZ_INLINE SimdFloat4 Abs(_SimdFloat4 _v) {
const __m128i zero = _mm_setzero_si128();
return _mm_and_ps(
_mm_castsi128_ps(_mm_srli_epi32(_mm_cmpeq_epi32(zero, zero), 1)), _v);
}
// Isolates each lane's sign bit (0x80000000 if negative/-0.f, 0 otherwise).
OZZ_INLINE SimdInt4 Sign(_SimdFloat4 _v) {
return _mm_slli_epi32(_mm_srli_epi32(_mm_castps_si128(_v), 31), 31);
}
// LengthN / LengthNSqr: (squared) Euclidean norm of the first N lanes,
// valid in lane x of the result only; other lanes are undefined.
OZZ_INLINE SimdFloat4 Length2(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT2_F(_v, _v, sq_len);
return _mm_sqrt_ss(sq_len);
}
OZZ_INLINE SimdFloat4 Length3(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT3_F(_v, _v, sq_len);
return _mm_sqrt_ss(sq_len);
}
OZZ_INLINE SimdFloat4 Length4(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT4_F(_v, _v, sq_len);
return _mm_sqrt_ss(sq_len);
}
OZZ_INLINE SimdFloat4 Length2Sqr(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT2_F(_v, _v, sq_len);
return sq_len;
}
OZZ_INLINE SimdFloat4 Length3Sqr(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT3_F(_v, _v, sq_len);
return sq_len;
}
OZZ_INLINE SimdFloat4 Length4Sqr(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT4_F(_v, _v, sq_len);
return sq_len;
}
// NormalizeN scales the first N lanes to unit length; the remaining lanes
// are passed through from _v unchanged. Asserts (debug only) that _v is
// normalizable. Est variants trade precision for speed via rsqrt.
OZZ_INLINE SimdFloat4 Normalize2(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT2_F(_v, _v, sq_len);
assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len));
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128 norm = _mm_mul_ps(_v, inv_lenxxxx);
// Keep normalized xy, restore original zw.
return _mm_movelh_ps(norm, _mm_movehl_ps(_v, _v));
}
OZZ_INLINE SimdFloat4 Normalize3(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT3_F(_v, _v, sq_len);
assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len));
// Reverse lanes so w sits in lane x, scale, then move_ss restores the
// original w and the final shuffle reverses back: w is left untouched.
const __m128 vwxyz = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3));
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128 normwxyz = _mm_move_ss(_mm_mul_ps(vwxyz, inv_lenxxxx), vwxyz);
return OZZ_SHUFFLE_PS1(normwxyz, _MM_SHUFFLE(0, 1, 2, 3));
}
OZZ_INLINE SimdFloat4 Normalize4(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT4_F(_v, _v, sq_len);
assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len));
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
return _mm_mul_ps(_v, inv_lenxxxx);
}
OZZ_INLINE SimdFloat4 NormalizeEst2(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT2_F(_v, _v, sq_len);
assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
const __m128 inv_len = _mm_rsqrt_ss(sq_len);
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128 norm = _mm_mul_ps(_v, inv_lenxxxx);
return _mm_movelh_ps(norm, _mm_movehl_ps(_v, _v));
}
OZZ_INLINE SimdFloat4 NormalizeEst3(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT3_F(_v, _v, sq_len);
assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
const __m128 inv_len = _mm_rsqrt_ss(sq_len);
const __m128 vwxyz = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3));
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128 normwxyz = _mm_move_ss(_mm_mul_ps(vwxyz, inv_lenxxxx), vwxyz);
return OZZ_SHUFFLE_PS1(normwxyz, _MM_SHUFFLE(0, 1, 2, 3));
}
OZZ_INLINE SimdFloat4 NormalizeEst4(_SimdFloat4 _v) {
__m128 sq_len;
OZZ_SSE_DOT4_F(_v, _v, sq_len);
assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
const __m128 inv_len = _mm_rsqrt_ss(sq_len);
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
return _mm_mul_ps(_v, inv_lenxxxx);
}
// IsNormalizedN: tests |dotN(_v, _v) - 1| against a tolerance; the boolean
// result is in lane x only, lanes y, z, w are forced to 0 (false) by moving
// the dot into a zeroed register before the scalar compares. Est variants
// use the looser tolerance matching the rsqrt-based NormalizeEst*.
OZZ_INLINE SimdInt4 IsNormalized2(_SimdFloat4 _v) {
const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceSq);
const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceSq);
__m128 dot;
OZZ_SSE_DOT2_F(_v, _v, dot);
__m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot);
return _mm_castps_si128(
_mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min)));
}
OZZ_INLINE SimdInt4 IsNormalized3(_SimdFloat4 _v) {
const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceSq);
const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceSq);
__m128 dot;
OZZ_SSE_DOT3_F(_v, _v, dot);
__m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot);
return _mm_castps_si128(
_mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min)));
}
OZZ_INLINE SimdInt4 IsNormalized4(_SimdFloat4 _v) {
const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceSq);
const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceSq);
__m128 dot;
OZZ_SSE_DOT4_F(_v, _v, dot);
__m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot);
return _mm_castps_si128(
_mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min)));
}
OZZ_INLINE SimdInt4 IsNormalizedEst2(_SimdFloat4 _v) {
const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceEstSq);
const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceEstSq);
__m128 dot;
OZZ_SSE_DOT2_F(_v, _v, dot);
__m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot);
return _mm_castps_si128(
_mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min)));
}
OZZ_INLINE SimdInt4 IsNormalizedEst3(_SimdFloat4 _v) {
const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceEstSq);
const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceEstSq);
__m128 dot;
OZZ_SSE_DOT3_F(_v, _v, dot);
__m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot);
return _mm_castps_si128(
_mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min)));
}
OZZ_INLINE SimdInt4 IsNormalizedEst4(_SimdFloat4 _v) {
const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceEstSq);
const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceEstSq);
__m128 dot;
OZZ_SSE_DOT4_F(_v, _v, dot);
__m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot);
return _mm_castps_si128(
_mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min)));
}
// NormalizeSafeN: like NormalizeN, but when the squared length is <= 0 the
// result is _safe instead (branchless: both paths computed, then selected).
// The unconditional div/rsqrt may produce inf/NaN for degenerate _v, but
// those lanes are discarded by the select. Unnormalized lanes (z, w / w)
// pass through from _v as in NormalizeN.
OZZ_INLINE SimdFloat4 NormalizeSafe2(_SimdFloat4 _v, _SimdFloat4 _safe) {
// assert(AreAllTrue1(IsNormalized2(_safe)) && "_safe is not normalized");
__m128 sq_len;
OZZ_SSE_DOT2_F(_v, _v, sq_len);
const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len));
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128 norm = _mm_mul_ps(_v, inv_lenxxxx);
const __m128i cond = _mm_castps_si128(
_mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps()));
const __m128 cfalse = _mm_movelh_ps(norm, _mm_movehl_ps(_v, _v));
return OZZ_SSE_SELECT_F(cond, _safe, cfalse);
}
OZZ_INLINE SimdFloat4 NormalizeSafe3(_SimdFloat4 _v, _SimdFloat4 _safe) {
// assert(AreAllTrue1(IsNormalized3(_safe)) && "_safe is not normalized");
__m128 sq_len;
OZZ_SSE_DOT3_F(_v, _v, sq_len);
const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len));
// Lane-reversal trick (see Normalize3) keeps w untouched.
const __m128 vwxyz = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3));
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128 normwxyz = _mm_move_ss(_mm_mul_ps(vwxyz, inv_lenxxxx), vwxyz);
const __m128i cond = _mm_castps_si128(
_mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps()));
const __m128 cfalse = OZZ_SHUFFLE_PS1(normwxyz, _MM_SHUFFLE(0, 1, 2, 3));
return OZZ_SSE_SELECT_F(cond, _safe, cfalse);
}
OZZ_INLINE SimdFloat4 NormalizeSafe4(_SimdFloat4 _v, _SimdFloat4 _safe) {
// assert(AreAllTrue1(IsNormalized4(_safe)) && "_safe is not normalized");
__m128 sq_len;
OZZ_SSE_DOT4_F(_v, _v, sq_len);
const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len));
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128i cond = _mm_castps_si128(
_mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps()));
const __m128 cfalse = _mm_mul_ps(_v, inv_lenxxxx);
return OZZ_SSE_SELECT_F(cond, _safe, cfalse);
}
// Est variants: rsqrt approximation instead of exact 1/sqrt.
OZZ_INLINE SimdFloat4 NormalizeSafeEst2(_SimdFloat4 _v, _SimdFloat4 _safe) {
// assert(AreAllTrue1(IsNormalizedEst2(_safe)) && "_safe is not normalized");
__m128 sq_len;
OZZ_SSE_DOT2_F(_v, _v, sq_len);
const __m128 inv_len = _mm_rsqrt_ss(sq_len);
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128 norm = _mm_mul_ps(_v, inv_lenxxxx);
const __m128i cond = _mm_castps_si128(
_mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps()));
const __m128 cfalse = _mm_movelh_ps(norm, _mm_movehl_ps(_v, _v));
return OZZ_SSE_SELECT_F(cond, _safe, cfalse);
}
OZZ_INLINE SimdFloat4 NormalizeSafeEst3(_SimdFloat4 _v, _SimdFloat4 _safe) {
// assert(AreAllTrue1(IsNormalizedEst3(_safe)) && "_safe is not normalized");
__m128 sq_len;
OZZ_SSE_DOT3_F(_v, _v, sq_len);
const __m128 inv_len = _mm_rsqrt_ss(sq_len);
const __m128 vwxyz = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3));
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128 normwxyz = _mm_move_ss(_mm_mul_ps(vwxyz, inv_lenxxxx), vwxyz);
const __m128i cond = _mm_castps_si128(
_mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps()));
const __m128 cfalse = OZZ_SHUFFLE_PS1(normwxyz, _MM_SHUFFLE(0, 1, 2, 3));
return OZZ_SSE_SELECT_F(cond, _safe, cfalse);
}
OZZ_INLINE SimdFloat4 NormalizeSafeEst4(_SimdFloat4 _v, _SimdFloat4 _safe) {
// assert(AreAllTrue1(IsNormalizedEst4(_safe)) && "_safe is not normalized");
__m128 sq_len;
OZZ_SSE_DOT4_F(_v, _v, sq_len);
const __m128 inv_len = _mm_rsqrt_ss(sq_len);
const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0);
const __m128i cond = _mm_castps_si128(
_mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps()));
const __m128 cfalse = _mm_mul_ps(_v, inv_lenxxxx);
return OZZ_SSE_SELECT_F(cond, _safe, cfalse);
}
// Per-lane linear interpolation: _a + _alpha * (_b - _a).
OZZ_INLINE SimdFloat4 Lerp(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _alpha) {
return OZZ_MADD(_alpha, _mm_sub_ps(_b, _a), _a);
}
// Per-lane min/max (SSE min/max return the second operand on NaN).
OZZ_INLINE SimdFloat4 Min(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_min_ps(_a, _b);
}
OZZ_INLINE SimdFloat4 Max(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_max_ps(_a, _b);
}
// Per-lane clamp against 0: Min0 = min(_v, 0), Max0 = max(_v, 0).
OZZ_INLINE SimdFloat4 Min0(_SimdFloat4 _v) {
return _mm_min_ps(_mm_setzero_ps(), _v);
}
OZZ_INLINE SimdFloat4 Max0(_SimdFloat4 _v) {
return _mm_max_ps(_mm_setzero_ps(), _v);
}
// Per-lane clamp of _v to [_a, _b].
OZZ_INLINE SimdFloat4 Clamp(_SimdFloat4 _a, _SimdFloat4 _v, _SimdFloat4 _b) {
return _mm_max_ps(_a, _mm_min_ps(_v, _b));
}
// Per-lane select: _true where mask _b is set, otherwise _false.
OZZ_INLINE SimdFloat4 Select(_SimdInt4 _b, _SimdFloat4 _true,
_SimdFloat4 _false) {
return OZZ_SSE_SELECT_F(_b, _true, _false);
}
// Per-lane comparisons: each lane of the result is all-ones (true) or zero
// (false), reinterpreted as SimdInt4. Note CmpNe (cmpneq) is true for NaN
// operands, while the ordered compares below are false.
OZZ_INLINE SimdInt4 CmpEq(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_castps_si128(_mm_cmpeq_ps(_a, _b));
}
OZZ_INLINE SimdInt4 CmpNe(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_castps_si128(_mm_cmpneq_ps(_a, _b));
}
OZZ_INLINE SimdInt4 CmpLt(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_castps_si128(_mm_cmplt_ps(_a, _b));
}
OZZ_INLINE SimdInt4 CmpLe(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_castps_si128(_mm_cmple_ps(_a, _b));
}
OZZ_INLINE SimdInt4 CmpGt(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_castps_si128(_mm_cmpgt_ps(_a, _b));
}
OZZ_INLINE SimdInt4 CmpGe(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_castps_si128(_mm_cmpge_ps(_a, _b));
}
// Bitwise operations on the raw float bit patterns (float/float and
// float/int overloads). Used e.g. for sign manipulation with the masks
// from simd_int4.
OZZ_INLINE SimdFloat4 And(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_and_ps(_a, _b);
}
OZZ_INLINE SimdFloat4 Or(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_or_ps(_a, _b);
}
OZZ_INLINE SimdFloat4 Xor(_SimdFloat4 _a, _SimdFloat4 _b) {
return _mm_xor_ps(_a, _b);
}
OZZ_INLINE SimdFloat4 And(_SimdFloat4 _a, _SimdInt4 _b) {
return _mm_and_ps(_a, _mm_castsi128_ps(_b));
}
// _a & ~_b
OZZ_INLINE SimdFloat4 AndNot(_SimdFloat4 _a, _SimdInt4 _b) {
return _mm_andnot_ps(_mm_castsi128_ps(_b), _a);
}
OZZ_INLINE SimdFloat4 Or(_SimdFloat4 _a, _SimdInt4 _b) {
return _mm_or_ps(_a, _mm_castsi128_ps(_b));
}
OZZ_INLINE SimdFloat4 Xor(_SimdFloat4 _a, _SimdInt4 _b) {
return _mm_xor_ps(_a, _mm_castsi128_ps(_b));
}
// Trigonometric functions fall back to scalar <cmath> per lane (see the
// "temporarily needed" note at the top of the file): each lane is extracted,
// evaluated, and repacked. *X variants evaluate lane x only and pass y, z, w
// through from _v via _mm_move_ss.
OZZ_INLINE SimdFloat4 Cos(_SimdFloat4 _v) {
return _mm_set_ps(std::cos(GetW(_v)), std::cos(GetZ(_v)), std::cos(GetY(_v)),
std::cos(GetX(_v)));
}
OZZ_INLINE SimdFloat4 CosX(_SimdFloat4 _v) {
return _mm_move_ss(_v, _mm_set_ps1(std::cos(GetX(_v))));
}
OZZ_INLINE SimdFloat4 ACos(_SimdFloat4 _v) {
return _mm_set_ps(std::acos(GetW(_v)), std::acos(GetZ(_v)),
std::acos(GetY(_v)), std::acos(GetX(_v)));
}
OZZ_INLINE SimdFloat4 ACosX(_SimdFloat4 _v) {
return _mm_move_ss(_v, _mm_set_ps1(std::acos(GetX(_v))));
}
OZZ_INLINE SimdFloat4 Sin(_SimdFloat4 _v) {
return _mm_set_ps(std::sin(GetW(_v)), std::sin(GetZ(_v)), std::sin(GetY(_v)),
std::sin(GetX(_v)));
}
OZZ_INLINE SimdFloat4 SinX(_SimdFloat4 _v) {
return _mm_move_ss(_v, _mm_set_ps1(std::sin(GetX(_v))));
}
OZZ_INLINE SimdFloat4 ASin(_SimdFloat4 _v) {
return _mm_set_ps(std::asin(GetW(_v)), std::asin(GetZ(_v)),
std::asin(GetY(_v)), std::asin(GetX(_v)));
}
OZZ_INLINE SimdFloat4 ASinX(_SimdFloat4 _v) {
return _mm_move_ss(_v, _mm_set_ps1(std::asin(GetX(_v))));
}
OZZ_INLINE SimdFloat4 Tan(_SimdFloat4 _v) {
return _mm_set_ps(std::tan(GetW(_v)), std::tan(GetZ(_v)), std::tan(GetY(_v)),
std::tan(GetX(_v)));
}
OZZ_INLINE SimdFloat4 TanX(_SimdFloat4 _v) {
return _mm_move_ss(_v, _mm_set_ps1(std::tan(GetX(_v))));
}
OZZ_INLINE SimdFloat4 ATan(_SimdFloat4 _v) {
return _mm_set_ps(std::atan(GetW(_v)), std::atan(GetZ(_v)),
std::atan(GetY(_v)), std::atan(GetX(_v)));
}
OZZ_INLINE SimdFloat4 ATanX(_SimdFloat4 _v) {
return _mm_move_ss(_v, _mm_set_ps1(std::atan(GetX(_v))));
}
namespace simd_int4 {
// Integer constants and lane masks, all synthesized from an all-ones
// register (_mm_cmpeq_epi32(zero, zero)) plus element/byte shifts to avoid
// memory loads. In mask names, 'f' = all-ones lane, '0' = zero lane, listed
// in xyzw order.
OZZ_INLINE SimdInt4 zero() { return _mm_setzero_si128(); }
// {1, 1, 1, 1}: 0 - (-1) per lane.
OZZ_INLINE SimdInt4 one() {
const __m128i zero = _mm_setzero_si128();
return _mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero));
}
// {1, 0, 0, 0}
OZZ_INLINE SimdInt4 x_axis() {
const __m128i zero = _mm_setzero_si128();
return _mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12);
}
// {0, 1, 0, 0}
OZZ_INLINE SimdInt4 y_axis() {
const __m128i zero = _mm_setzero_si128();
return _mm_slli_si128(
_mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12), 4);
}
// {0, 0, 1, 0}
OZZ_INLINE SimdInt4 z_axis() {
const __m128i zero = _mm_setzero_si128();
return _mm_slli_si128(
_mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12), 8);
}
// {0, 0, 0, 1}
OZZ_INLINE SimdInt4 w_axis() {
const __m128i zero = _mm_setzero_si128();
return _mm_slli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12);
}
// All lanes true (all-ones).
OZZ_INLINE SimdInt4 all_true() {
const __m128i zero = _mm_setzero_si128();
return _mm_cmpeq_epi32(zero, zero);
}
OZZ_INLINE SimdInt4 all_false() { return _mm_setzero_si128(); }
// 0x80000000 in every lane.
OZZ_INLINE SimdInt4 mask_sign() {
const __m128i zero = _mm_setzero_si128();
return _mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31);
}
// Sign bits in xyz only; w lane is zero.
OZZ_INLINE SimdInt4 mask_sign_xyz() {
const __m128i zero = _mm_setzero_si128();
return _mm_srli_si128(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31), 4);
}
// Sign bit in w only.
OZZ_INLINE SimdInt4 mask_sign_w() {
const __m128i zero = _mm_setzero_si128();
return _mm_slli_si128(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31), 12);
}
// 0x7fffffff in every lane (clears sign bits).
OZZ_INLINE SimdInt4 mask_not_sign() {
const __m128i zero = _mm_setzero_si128();
return _mm_srli_epi32(_mm_cmpeq_epi32(zero, zero), 1);
}
OZZ_INLINE SimdInt4 mask_ffff() {
const __m128i zero = _mm_setzero_si128();
return _mm_cmpeq_epi32(zero, zero);
}
OZZ_INLINE SimdInt4 mask_0000() { return _mm_setzero_si128(); }
OZZ_INLINE SimdInt4 mask_fff0() {
const __m128i zero = _mm_setzero_si128();
return _mm_srli_si128(_mm_cmpeq_epi32(zero, zero), 4);
}
OZZ_INLINE SimdInt4 mask_f000() {
const __m128i zero = _mm_setzero_si128();
return _mm_srli_si128(_mm_cmpeq_epi32(zero, zero), 12);
}
OZZ_INLINE SimdInt4 mask_0f00() {
const __m128i zero = _mm_setzero_si128();
return _mm_srli_si128(_mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12), 8);
}
OZZ_INLINE SimdInt4 mask_00f0() {
const __m128i zero = _mm_setzero_si128();
return _mm_srli_si128(_mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12), 4);
}
OZZ_INLINE SimdInt4 mask_000f() {
const __m128i zero = _mm_setzero_si128();
return _mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12);
}
OZZ_INLINE SimdInt4 Load(int _x, int _y, int _z, int _w) {
return _mm_set_epi32(_w, _z, _y, _x);
}
OZZ_INLINE SimdInt4 LoadX(int _x) { return _mm_set_epi32(0, 0, 0, _x); }
OZZ_INLINE SimdInt4 Load1(int _x) { return _mm_set1_epi32(_x); }
OZZ_INLINE SimdInt4 Load(bool _x, bool _y, bool _z, bool _w) {
return _mm_sub_epi32(_mm_setzero_si128(), _mm_set_epi32(_w, _z, _y, _x));
}
OZZ_INLINE SimdInt4 LoadX(bool _x) {
return _mm_sub_epi32(_mm_setzero_si128(), _mm_set_epi32(0, 0, 0, _x));
}
OZZ_INLINE SimdInt4 Load1(bool _x) {
return _mm_sub_epi32(_mm_setzero_si128(), _mm_set1_epi32(_x));
}
OZZ_INLINE SimdInt4 LoadPtr(const int* _i) {
assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
return _mm_load_si128(reinterpret_cast<const __m128i*>(_i));
}
OZZ_INLINE SimdInt4 LoadXPtr(const int* _i) {
assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
return _mm_cvtsi32_si128(*_i);
}
OZZ_INLINE SimdInt4 Load1Ptr(const int* _i) {
assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
return _mm_shuffle_epi32(
_mm_loadl_epi64(reinterpret_cast<const __m128i*>(_i)),
_MM_SHUFFLE(0, 0, 0, 0));
}
OZZ_INLINE SimdInt4 Load2Ptr(const int* _i) {
assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(_i));
}
OZZ_INLINE SimdInt4 Load3Ptr(const int* _i) {
assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
return _mm_set_epi32(0, _i[2], _i[1], _i[0]);
}
OZZ_INLINE SimdInt4 LoadPtrU(const int* _i) {
assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
return _mm_loadu_si128(reinterpret_cast<const __m128i*>(_i));
}
OZZ_INLINE SimdInt4 LoadXPtrU(const int* _i) {
assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
return _mm_cvtsi32_si128(*_i);
}
OZZ_INLINE SimdInt4 Load1PtrU(const int* _i) {
assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
return _mm_set1_epi32(*_i);
}
OZZ_INLINE SimdInt4 Load2PtrU(const int* _i) {
assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
return _mm_set_epi32(0, 0, _i[1], _i[0]);
}
OZZ_INLINE SimdInt4 Load3PtrU(const int* _i) {
assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
return _mm_set_epi32(0, _i[2], _i[1], _i[0]);
}
OZZ_INLINE SimdInt4 FromFloatRound(_SimdFloat4 _f) {
return _mm_cvtps_epi32(_f);
}
OZZ_INLINE SimdInt4 FromFloatTrunc(_SimdFloat4 _f) {
return _mm_cvttps_epi32(_f);
}
} // namespace simd_int4
// Extracts the x (lowest) lane of _v.
OZZ_INLINE int GetX(_SimdInt4 _v) { return _mm_cvtsi128_si32(_v); }
// Extracts the y lane (splatted to lane 0 first, then moved to scalar).
OZZ_INLINE int GetY(_SimdInt4 _v) {
  return _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1));
}
// Extracts the z lane: unpackhi brings lane 2 down to lane 0.
OZZ_INLINE int GetZ(_SimdInt4 _v) {
  return _mm_cvtsi128_si32(_mm_unpackhi_epi32(_v, _v));
}
// Extracts the w lane.
OZZ_INLINE int GetW(_SimdInt4 _v) {
  return _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 3));
}
// Returns _v with its x lane replaced by the x lane of _i.
// Lane surgery is done through float casts because move_ss/shuffle_ps offer
// the needed lane moves; casts are bitwise no-ops.
OZZ_INLINE SimdInt4 SetX(_SimdInt4 _v, _SimdInt4 _i) {
  return _mm_castps_si128(
      _mm_move_ss(_mm_castsi128_ps(_v), _mm_castsi128_ps(_i)));
}
// Returns _v with its y lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetY(_SimdInt4 _v, _SimdInt4 _i) {
  const __m128 xfnn = _mm_castsi128_ps(_mm_unpacklo_epi32(_v, _i));
  return _mm_castps_si128(
      _mm_shuffle_ps(xfnn, _mm_castsi128_ps(_v), _MM_SHUFFLE(3, 2, 1, 0)));
}
// Returns _v with its z lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetZ(_SimdInt4 _v, _SimdInt4 _i) {
  const __m128 ffww = _mm_shuffle_ps(_mm_castsi128_ps(_i), _mm_castsi128_ps(_v),
                                     _MM_SHUFFLE(3, 3, 0, 0));
  return _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(_v), ffww, _MM_SHUFFLE(2, 0, 1, 0)));
}
// Returns _v with its w lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetW(_SimdInt4 _v, _SimdInt4 _i) {
  const __m128 ffzz = _mm_shuffle_ps(_mm_castsi128_ps(_i), _mm_castsi128_ps(_v),
                                     _MM_SHUFFLE(2, 2, 0, 0));
  return _mm_castps_si128(
      _mm_shuffle_ps(_mm_castsi128_ps(_v), ffzz, _MM_SHUFFLE(0, 2, 1, 0)));
}
// Returns _v with lane _ith (runtime index, 0-3) replaced by the x lane of
// _i. Uses union type-punning to address lanes with a runtime index; this is
// the documented idiom on the compilers ozz supports.
OZZ_INLINE SimdInt4 SetI(_SimdInt4 _v, _SimdInt4 _i, int _ith) {
  assert(_ith >= 0 && _ith <= 3 && "Invalid index, out of range.");
  union {
    SimdInt4 ret;
    int af[4];
  } u = {_v};
  u.af[_ith] = GetX(_i);
  return u.ret;
}
// Aligned store of all 4 lanes. _i must be 16-byte aligned.
OZZ_INLINE void StorePtr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  _mm_store_si128(reinterpret_cast<__m128i*>(_i), _v);
}
// Stores only the x lane. _i must be 16-byte aligned.
OZZ_INLINE void Store1Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  *_i = _mm_cvtsi128_si32(_v);
}
// Stores lanes x and y. _i must be 16-byte aligned.
OZZ_INLINE void Store2Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  _i[0] = _mm_cvtsi128_si32(_v);
  _i[1] = _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1));
}
// Stores lanes x, y and z. _i must be 16-byte aligned.
OZZ_INLINE void Store3Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  _i[0] = _mm_cvtsi128_si32(_v);
  _i[1] = _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1));
  _i[2] = _mm_cvtsi128_si32(_mm_unpackhi_epi32(_v, _v));
}
// Unaligned store variants: only natural int (4-byte) alignment required.
OZZ_INLINE void StorePtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  _mm_storeu_si128(reinterpret_cast<__m128i*>(_i), _v);
}
OZZ_INLINE void Store1PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  *_i = _mm_cvtsi128_si32(_v);
}
OZZ_INLINE void Store2PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  _i[0] = _mm_cvtsi128_si32(_v);
  _i[1] = _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1));
}
OZZ_INLINE void Store3PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  _i[0] = _mm_cvtsi128_si32(_v);
  _i[1] = _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1));
  _i[2] = _mm_cvtsi128_si32(_mm_unpackhi_epi32(_v, _v));
}
// Broadcasts a single lane of _a to all 4 lanes.
OZZ_INLINE SimdInt4 SplatX(_SimdInt4 _a) { return OZZ_SSE_SPLAT_I(_a, 0); }
OZZ_INLINE SimdInt4 SplatY(_SimdInt4 _a) { return OZZ_SSE_SPLAT_I(_a, 1); }
OZZ_INLINE SimdInt4 SplatZ(_SimdInt4 _a) { return OZZ_SSE_SPLAT_I(_a, 2); }
OZZ_INLINE SimdInt4 SplatW(_SimdInt4 _a) { return OZZ_SSE_SPLAT_I(_a, 3); }
// Compile-time lane permutation: result lane i takes _v's lane given by the
// corresponding template argument.
template <size_t _X, size_t _Y, size_t _Z, size_t _W>
OZZ_INLINE SimdInt4 Swizzle(_SimdInt4 _v) {
  static_assert(_X <= 3 && _Y <= 3 && _Z <= 3 && _W <= 3,
                "Indices must be between 0 and 3");
  return _mm_shuffle_epi32(_v, _MM_SHUFFLE(_W, _Z, _Y, _X));
}
// Identity swizzle specialization: avoids emitting a useless shuffle.
template <>
OZZ_INLINE SimdInt4 Swizzle<0, 1, 2, 3>(_SimdInt4 _v) {
  return _v;
}
// Packs the sign bit of each lane into bits 0-3 of the returned int.
// A lane is "true" when its sign bit is set (all-ones mask convention).
OZZ_INLINE int MoveMask(_SimdInt4 _v) {
  return _mm_movemask_ps(_mm_castsi128_ps(_v));
}
// True when all 4 lanes are true.
OZZ_INLINE bool AreAllTrue(_SimdInt4 _v) {
  return _mm_movemask_ps(_mm_castsi128_ps(_v)) == 0xf;
}
// True when lanes x, y, z are true (w is ignored).
OZZ_INLINE bool AreAllTrue3(_SimdInt4 _v) {
  return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x7) == 0x7;
}
// True when lanes x, y are true (z, w ignored).
OZZ_INLINE bool AreAllTrue2(_SimdInt4 _v) {
  return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x3) == 0x3;
}
// True when lane x is true.
OZZ_INLINE bool AreAllTrue1(_SimdInt4 _v) {
  return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x1) == 0x1;
}
// True when all 4 lanes are false.
OZZ_INLINE bool AreAllFalse(_SimdInt4 _v) {
  return _mm_movemask_ps(_mm_castsi128_ps(_v)) == 0;
}
// True when lanes x, y, z are false (w is ignored).
OZZ_INLINE bool AreAllFalse3(_SimdInt4 _v) {
  return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x7) == 0;
}
// True when lanes x, y are false (z, w ignored).
OZZ_INLINE bool AreAllFalse2(_SimdInt4 _v) {
  return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x3) == 0;
}
// True when lane x is false.
OZZ_INLINE bool AreAllFalse1(_SimdInt4 _v) {
  return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x1) == 0;
}
// Horizontal sum of lanes x and y, returned in lane x; lanes y, z, w keep
// the original values of _v.
OZZ_INLINE SimdInt4 HAdd2(_SimdInt4 _v) {
  const __m128i hadd = _mm_add_epi32(_v, OZZ_SSE_SPLAT_I(_v, 1));
  return _mm_castps_si128(
      _mm_move_ss(_mm_castsi128_ps(_v), _mm_castsi128_ps(hadd)));
}
// Horizontal sum of lanes x, y, z in lane x; y, z, w keep _v's values.
OZZ_INLINE SimdInt4 HAdd3(_SimdInt4 _v) {
  const __m128i hadd = _mm_add_epi32(_mm_add_epi32(_v, OZZ_SSE_SPLAT_I(_v, 1)),
                                     _mm_unpackhi_epi32(_v, _v));
  return _mm_castps_si128(
      _mm_move_ss(_mm_castsi128_ps(_v), _mm_castsi128_ps(hadd)));
}
// Horizontal sum of all 4 lanes in lane x; y, z, w keep _v's values.
// movehl folds (z, w) onto (x, y), then lane 1 (y+w) is folded onto lane 0.
OZZ_INLINE SimdInt4 HAdd4(_SimdInt4 _v) {
  const __m128 v = _mm_castsi128_ps(_v);
  const __m128i haddxyzw =
      _mm_add_epi32(_v, _mm_castps_si128(_mm_movehl_ps(v, v)));
  return _mm_castps_si128(_mm_move_ss(
      v,
      _mm_castsi128_ps(_mm_add_epi32(haddxyzw, OZZ_SSE_SPLAT_I(haddxyzw, 1)))));
}
// Per-lane absolute value. Uses the dedicated SSSE3 instruction when
// available, otherwise a compare + select of the negated value.
OZZ_INLINE SimdInt4 Abs(_SimdInt4 _v) {
#ifdef OZZ_SIMD_SSSE3
  return _mm_abs_epi32(_v);
#else  // OZZ_SIMD_SSSE3
  const __m128i zero = _mm_setzero_si128();
  return OZZ_SSE_SELECT_I(_mm_cmplt_epi32(_v, zero), _mm_sub_epi32(zero, _v),
                          _v);
#endif  // OZZ_SIMD_SSSE3
}
// Keeps only the sign bit of each lane (0x80000000 for negatives, 0
// otherwise).
OZZ_INLINE SimdInt4 Sign(_SimdInt4 _v) {
  return _mm_slli_epi32(_mm_srli_epi32(_v, 31), 31);
}
// Per-lane signed minimum of _a and _b.
OZZ_INLINE SimdInt4 Min(_SimdInt4 _a, _SimdInt4 _b) {
#ifdef OZZ_SIMD_SSE4_1
  return _mm_min_epi32(_a, _b);
#else  // OZZ_SIMD_SSE4_1
  return OZZ_SSE_SELECT_I(_mm_cmplt_epi32(_a, _b), _a, _b);
#endif  // OZZ_SIMD_SSE4_1
}
// Per-lane signed maximum of _a and _b.
OZZ_INLINE SimdInt4 Max(_SimdInt4 _a, _SimdInt4 _b) {
#ifdef OZZ_SIMD_SSE4_1
  return _mm_max_epi32(_a, _b);
#else  // OZZ_SIMD_SSE4_1
  return OZZ_SSE_SELECT_I(_mm_cmpgt_epi32(_a, _b), _a, _b);
#endif  // OZZ_SIMD_SSE4_1
}
// Per-lane minimum of _v and 0 (clamps positives to 0).
OZZ_INLINE SimdInt4 Min0(_SimdInt4 _v) {
  const __m128i zero = _mm_setzero_si128();
#ifdef OZZ_SIMD_SSE4_1
  return _mm_min_epi32(zero, _v);
#else  // OZZ_SIMD_SSE4_1
  return OZZ_SSE_SELECT_I(_mm_cmplt_epi32(zero, _v), zero, _v);
#endif  // OZZ_SIMD_SSE4_1
}
// Per-lane maximum of _v and 0 (clamps negatives to 0).
OZZ_INLINE SimdInt4 Max0(_SimdInt4 _v) {
  const __m128i zero = _mm_setzero_si128();
#ifdef OZZ_SIMD_SSE4_1
  return _mm_max_epi32(zero, _v);
#else  // OZZ_SIMD_SSE4_1
  return OZZ_SSE_SELECT_I(_mm_cmpgt_epi32(zero, _v), zero, _v);
#endif  // OZZ_SIMD_SSE4_1
}
// Per-lane clamp of _v to the range [_a, _b]: max(_a, min(_v, _b)).
OZZ_INLINE SimdInt4 Clamp(_SimdInt4 _a, _SimdInt4 _v, _SimdInt4 _b) {
#ifdef OZZ_SIMD_SSE4_1
  return _mm_min_epi32(_mm_max_epi32(_a, _v), _b);
#else  // OZZ_SIMD_SSE4_1
  const __m128i min = OZZ_SSE_SELECT_I(_mm_cmplt_epi32(_v, _b), _v, _b);
  return OZZ_SSE_SELECT_I(_mm_cmpgt_epi32(_a, min), _a, min);
#endif  // OZZ_SIMD_SSE4_1
}
// Per-bit select: returns _true bits where _b is set, _false bits elsewhere.
OZZ_INLINE SimdInt4 Select(_SimdInt4 _b, _SimdInt4 _true, _SimdInt4 _false) {
  return OZZ_SSE_SELECT_I(_b, _true, _false);
}
// Bitwise and of _a and _b.
OZZ_INLINE SimdInt4 And(_SimdInt4 _a, _SimdInt4 _b) {
  return _mm_and_si128(_a, _b);
}
// Returns _a & ~_b. Note the operand swap: the intrinsic
// _mm_andnot_si128(x, y) computes ~x & y, so _b goes first.
OZZ_INLINE SimdInt4 AndNot(_SimdInt4 _a, _SimdInt4 _b) {
  return _mm_andnot_si128(_b, _a);
}
// Bitwise or of _a and _b.
OZZ_INLINE SimdInt4 Or(_SimdInt4 _a, _SimdInt4 _b) {
  return _mm_or_si128(_a, _b);
}
// Bitwise exclusive or of _a and _b.
OZZ_INLINE SimdInt4 Xor(_SimdInt4 _a, _SimdInt4 _b) {
  return _mm_xor_si128(_a, _b);
}
// Bitwise complement: xor with an all-ones register.
OZZ_INLINE SimdInt4 Not(_SimdInt4 _v) {
  return _mm_xor_si128(_v, _mm_cmpeq_epi32(_v, _v));
}
// Per-lane left shift by _bits (zeros shifted in).
OZZ_INLINE SimdInt4 ShiftL(_SimdInt4 _v, int _bits) {
  return _mm_slli_epi32(_v, _bits);
}
// Per-lane arithmetic (sign-extending) right shift by _bits.
OZZ_INLINE SimdInt4 ShiftR(_SimdInt4 _v, int _bits) {
  return _mm_srai_epi32(_v, _bits);
}
// Per-lane logical (zero-filling) right shift by _bits.
OZZ_INLINE SimdInt4 ShiftRu(_SimdInt4 _v, int _bits) {
  return _mm_srli_epi32(_v, _bits);
}
// Per-lane comparisons, returning all-ones lanes where the predicate holds.
OZZ_INLINE SimdInt4 CmpEq(_SimdInt4 _a, _SimdInt4 _b) {
  return _mm_cmpeq_epi32(_a, _b);
}
// a != b, built as the complement of the equality mask.
OZZ_INLINE SimdInt4 CmpNe(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i eq = _mm_cmpeq_epi32(_a, _b);
  return _mm_xor_si128(eq, _mm_cmpeq_epi32(_a, _a));
}
// a < b, expressed with the only available signed compare (b > a).
OZZ_INLINE SimdInt4 CmpLt(_SimdInt4 _a, _SimdInt4 _b) {
  return _mm_cmpgt_epi32(_b, _a);
}
// a <= b, built as the complement of a > b.
OZZ_INLINE SimdInt4 CmpLe(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i gt = _mm_cmpgt_epi32(_a, _b);
  return _mm_xor_si128(gt, _mm_cmpeq_epi32(_a, _a));
}
OZZ_INLINE SimdInt4 CmpGt(_SimdInt4 _a, _SimdInt4 _b) {
  return _mm_cmpgt_epi32(_a, _b);
}
// a >= b, built as the complement of a < b.
OZZ_INLINE SimdInt4 CmpGe(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i lt = _mm_cmpgt_epi32(_b, _a);
  return _mm_xor_si128(lt, _mm_cmpeq_epi32(_a, _a));
}
// Builds the identity matrix without any memory load: 1.f (0x3f800000) is
// synthesized by shifting an all-ones register, then byte-shifted into the
// diagonal position of each column.
OZZ_INLINE Float4x4 Float4x4::identity() {
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  // (0xffffffff << 25) >> 2 == 0x3f800000 == 1.f in every lane.
  const __m128i one = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  const __m128i x = _mm_srli_si128(one, 12);  // {1.f, 0, 0, 0}
  const Float4x4 ret = {{_mm_castsi128_ps(x),
                         _mm_castsi128_ps(_mm_slli_si128(x, 4)),
                         _mm_castsi128_ps(_mm_slli_si128(x, 8)),
                         _mm_castsi128_ps(_mm_slli_si128(one, 12))}};
  return ret;
}
// Returns the transpose of matrix _m (columns become rows).
// Delegates the 4x4 lane rotation to the standard _MM_TRANSPOSE4_PS macro,
// which performs the same unpack/shuffle sequence in-place on registers.
OZZ_INLINE Float4x4 Transpose(const Float4x4& _m) {
  __m128 c0 = _m.cols[0];
  __m128 c1 = _m.cols[1];
  __m128 c2 = _m.cols[2];
  __m128 c3 = _m.cols[3];
  _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
  const Float4x4 transposed = {{c0, c1, c2, c3}};
  return transposed;
}
// Computes the inverse of _m via Cramer's rule (cofactor expansion).
// NOTE(review): the minor/determinant sequence appears to follow Intel's
// classic "Inverse of 4x4 Matrix" SSE sample — TODO confirm. Statement order
// matters throughout; do not reorder.
// If _invertible is non-null, its x lane receives the invertibility test
// result instead of asserting; a non-invertible input then yields a zero
// matrix (det reciprocal is forced to 0).
inline Float4x4 Invert(const Float4x4& _m, SimdInt4* _invertible) {
  // Transposes _m into row vectors c0..c3 (Cramer's rule operates on rows).
  const __m128 _t0 =
      _mm_shuffle_ps(_m.cols[0], _m.cols[1], _MM_SHUFFLE(1, 0, 1, 0));
  const __m128 _t1 =
      _mm_shuffle_ps(_m.cols[2], _m.cols[3], _MM_SHUFFLE(1, 0, 1, 0));
  const __m128 _t2 =
      _mm_shuffle_ps(_m.cols[0], _m.cols[1], _MM_SHUFFLE(3, 2, 3, 2));
  const __m128 _t3 =
      _mm_shuffle_ps(_m.cols[2], _m.cols[3], _MM_SHUFFLE(3, 2, 3, 2));
  const __m128 c0 = _mm_shuffle_ps(_t0, _t1, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128 c1 = _mm_shuffle_ps(_t1, _t0, _MM_SHUFFLE(3, 1, 3, 1));
  const __m128 c2 = _mm_shuffle_ps(_t2, _t3, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128 c3 = _mm_shuffle_ps(_t3, _t2, _MM_SHUFFLE(3, 1, 3, 1));
  // Accumulates the 4 columns of cofactors (minors) with fused
  // multiply-add/sub helpers. 0xB1 swaps pairs within halves, 0x4E swaps the
  // two halves of a register.
  __m128 minor0, minor1, minor2, minor3, tmp1, tmp2;
  tmp1 = _mm_mul_ps(c2, c3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor0 = _mm_mul_ps(c1, tmp1);
  minor1 = _mm_mul_ps(c0, tmp1);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor0 = OZZ_MSUB(c1, tmp1, minor0);
  minor1 = OZZ_MSUB(c0, tmp1, minor1);
  minor1 = OZZ_SHUFFLE_PS1(minor1, 0x4E);
  tmp1 = _mm_mul_ps(c1, c2);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor0 = OZZ_MADD(c3, tmp1, minor0);
  minor3 = _mm_mul_ps(c0, tmp1);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor0 = OZZ_NMADD(c3, tmp1, minor0);
  minor3 = OZZ_MSUB(c0, tmp1, minor3);
  minor3 = OZZ_SHUFFLE_PS1(minor3, 0x4E);
  tmp1 = _mm_mul_ps(OZZ_SHUFFLE_PS1(c1, 0x4E), c3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  tmp2 = OZZ_SHUFFLE_PS1(c2, 0x4E);
  minor0 = OZZ_MADD(tmp2, tmp1, minor0);
  minor2 = _mm_mul_ps(c0, tmp1);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor0 = OZZ_NMADD(tmp2, tmp1, minor0);
  minor2 = OZZ_MSUB(c0, tmp1, minor2);
  minor2 = OZZ_SHUFFLE_PS1(minor2, 0x4E);
  tmp1 = _mm_mul_ps(c0, c1);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor2 = OZZ_MADD(c3, tmp1, minor2);
  minor3 = OZZ_MSUB(tmp2, tmp1, minor3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor2 = OZZ_MSUB(c3, tmp1, minor2);
  minor3 = OZZ_NMADD(tmp2, tmp1, minor3);
  tmp1 = _mm_mul_ps(c0, c3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor1 = OZZ_NMADD(tmp2, tmp1, minor1);
  minor2 = OZZ_MADD(c1, tmp1, minor2);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor1 = OZZ_MADD(tmp2, tmp1, minor1);
  minor2 = OZZ_NMADD(c1, tmp1, minor2);
  tmp1 = _mm_mul_ps(c0, tmp2);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor1 = OZZ_MADD(c3, tmp1, minor1);
  minor3 = OZZ_NMADD(c1, tmp1, minor3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor1 = OZZ_NMADD(c3, tmp1, minor1);
  minor3 = OZZ_MADD(c1, tmp1, minor3);
  // Determinant: horizontal sum of c0 * minor0, broadcast in lane x.
  __m128 det;
  det = _mm_mul_ps(c0, minor0);
  det = _mm_add_ps(OZZ_SHUFFLE_PS1(det, 0x4E), det);
  det = _mm_add_ss(OZZ_SHUFFLE_PS1(det, 0xB1), det);
  const SimdInt4 invertible = CmpNe(det, simd_float4::zero());
  // Asserts only when the caller didn't ask for the invertibility result.
  assert((_invertible || AreAllTrue1(invertible)) &&
         "Matrix is not invertible");
  if (_invertible != nullptr) {
    *_invertible = invertible;
  }
  // 1/det with a Newton-Raphson refined reciprocal estimate, forced to 0 for
  // non-invertible matrices so the output stays finite.
  tmp1 = OZZ_SSE_SELECT_F(invertible, RcpEstNR(det), simd_float4::zero());
  det = OZZ_NMADDX(det, _mm_mul_ss(tmp1, tmp1), _mm_add_ss(tmp1, tmp1));
  det = OZZ_SHUFFLE_PS1(det, 0x00);
  // Copy the final columns
  const Float4x4 ret = {{_mm_mul_ps(det, minor0), _mm_mul_ps(det, minor1),
                         _mm_mul_ps(det, minor2), _mm_mul_ps(det, minor3)}};
  return ret;
}
Float4x4 Float4x4::Translation(_SimdFloat4 _v) {
const __m128i zero = _mm_setzero_si128();
const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
const __m128i mask000f = _mm_slli_si128(ffff, 12);
const __m128i one = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
const __m128i x = _mm_srli_si128(one, 12);
const Float4x4 ret = {
{_mm_castsi128_ps(x), _mm_castsi128_ps(_mm_slli_si128(x, 4)),
_mm_castsi128_ps(_mm_slli_si128(x, 8)),
OZZ_SSE_SELECT_F(mask000f, _mm_castsi128_ps(one), _v)}};
return ret;
} // math
Float4x4 Float4x4::Scaling(_SimdFloat4 _v) {
const __m128i zero = _mm_setzero_si128();
const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
const __m128i if000 = _mm_srli_si128(ffff, 12);
const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
const Float4x4 ret = {
{_mm_and_ps(_v, _mm_castsi128_ps(if000)),
_mm_and_ps(_v, _mm_castsi128_ps(_mm_slli_si128(if000, 4))),
_mm_and_ps(_v, _mm_castsi128_ps(_mm_slli_si128(if000, 8))),
_mm_castsi128_ps(_mm_slli_si128(ione, 12))}};
return ret;
} // math
// Post-multiplies _m by a translation of _v's xyz: only the last column
// changes, becoming m0*vx + m1*vy + m2*vz + m3.
OZZ_INLINE Float4x4 Translate(const Float4x4& _m, _SimdFloat4 _v) {
  const __m128 a01 = OZZ_MADD(_m.cols[0], OZZ_SSE_SPLAT_F(_v, 0),
                              _mm_mul_ps(_m.cols[1], OZZ_SSE_SPLAT_F(_v, 1)));
  const __m128 m3 = OZZ_MADD(_m.cols[2], OZZ_SSE_SPLAT_F(_v, 2), _m.cols[3]);
  const Float4x4 ret = {
      {_m.cols[0], _m.cols[1], _m.cols[2], _mm_add_ps(a01, m3)}};
  return ret;
}
// Post-multiplies _m by a scaling of _v's xyz: scales the first three
// columns component-wise, last column untouched.
OZZ_INLINE Float4x4 Scale(const Float4x4& _m, _SimdFloat4 _v) {
  const Float4x4 ret = {{_mm_mul_ps(_m.cols[0], OZZ_SSE_SPLAT_F(_v, 0)),
                         _mm_mul_ps(_m.cols[1], OZZ_SSE_SPLAT_F(_v, 1)),
                         _mm_mul_ps(_m.cols[2], OZZ_SSE_SPLAT_F(_v, 2)),
                         _m.cols[3]}};
  return ret;
}
// Multiplies every column of _m component-wise by _v (all 4 columns,
// including translation, unlike Scale).
OZZ_INLINE Float4x4 ColumnMultiply(const Float4x4& _m, _SimdFloat4 _v) {
  const Float4x4 ret = {{_mm_mul_ps(_m.cols[0], _v), _mm_mul_ps(_m.cols[1], _v),
                         _mm_mul_ps(_m.cols[2], _v),
                         _mm_mul_ps(_m.cols[3], _v)}};
  return ret;
}
// Tests whether the 3 axis columns of _m are unit length, within
// kNormalizationToleranceSq. Returns per-lane masks for x, y, z; the w lane
// is forced to false. Transposes the upper-left 3x3 so one vectorized dot
// product yields the 3 squared lengths at once.
inline SimdInt4 IsNormalized(const Float4x4& _m) {
  const __m128 max = _mm_set_ps1(1.f + kNormalizationToleranceSq);
  const __m128 min = _mm_set_ps1(1.f - kNormalizationToleranceSq);
  const __m128 tmp0 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]);
  const __m128 tmp1 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]);
  const __m128 tmp2 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]);
  const __m128 tmp3 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]);
  const __m128 row0 = _mm_unpacklo_ps(tmp0, tmp1);
  const __m128 row1 = _mm_unpackhi_ps(tmp0, tmp1);
  const __m128 row2 = _mm_unpacklo_ps(tmp2, tmp3);
  // dot lane i == squared length of column i (for i in x, y, z, w).
  const __m128 dot =
      OZZ_MADD(row0, row0, OZZ_MADD(row1, row1, _mm_mul_ps(row2, row2)));
  const __m128 normalized =
      _mm_and_ps(_mm_cmplt_ps(dot, max), _mm_cmpgt_ps(dot, min));
  return _mm_castps_si128(
      _mm_and_ps(normalized, _mm_castsi128_ps(simd_int4::mask_fff0())));
}
// Same test as IsNormalized but with the coarser
// kNormalizationToleranceEstSq, suited to estimate-based math (RSqrtEst...).
inline SimdInt4 IsNormalizedEst(const Float4x4& _m) {
  const __m128 max = _mm_set_ps1(1.f + kNormalizationToleranceEstSq);
  const __m128 min = _mm_set_ps1(1.f - kNormalizationToleranceEstSq);
  const __m128 tmp0 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]);
  const __m128 tmp1 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]);
  const __m128 tmp2 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]);
  const __m128 tmp3 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]);
  const __m128 row0 = _mm_unpacklo_ps(tmp0, tmp1);
  const __m128 row1 = _mm_unpackhi_ps(tmp0, tmp1);
  const __m128 row2 = _mm_unpacklo_ps(tmp2, tmp3);
  const __m128 dot =
      OZZ_MADD(row0, row0, OZZ_MADD(row1, row1, _mm_mul_ps(row2, row2)));
  const __m128 normalized =
      _mm_and_ps(_mm_cmplt_ps(dot, max), _mm_cmpgt_ps(dot, min));
  return _mm_castps_si128(
      _mm_and_ps(normalized, _mm_castsi128_ps(simd_int4::mask_fff0())));
}
// Tests that _m's rotation part is orthogonal (right-handed, no reflection):
// dot(normalize(cross(col0, col1)), normalize(col2)) must be ~1. Only the x
// lane of the result is meaningful.
OZZ_INLINE SimdInt4 IsOrthogonal(const Float4x4& _m) {
  const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceSq);
  const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceSq);
  const __m128 zero = _mm_setzero_ps();
  // Use simd_float4::zero() if one of the normalization fails. _m will then be
  // considered not orthogonal.
  const SimdFloat4 cross = NormalizeSafe3(Cross3(_m.cols[0], _m.cols[1]), zero);
  const SimdFloat4 at = NormalizeSafe3(_m.cols[2], zero);
  SimdFloat4 dot;
  OZZ_SSE_DOT3_F(cross, at, dot);
  __m128 dotx000 = _mm_move_ss(zero, dot);
  return _mm_castps_si128(
      _mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min)));
}
// Extracts a normalized quaternion from matrix _m, which must have a
// normalized, orthogonal rotation part (asserted). This is a branchless SIMD
// version of the classic trace-based conversion: the four candidate results
// (one per dominant diagonal/trace case) are all computed, then the right one
// is picked with compare masks. Do not reorder.
inline SimdFloat4 ToQuaternion(const Float4x4& _m) {
  assert(AreAllTrue3(IsNormalizedEst(_m)));
  assert(AreAllTrue1(IsOrthogonal(_m)));
  // Prepares constants.
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  const __m128 half = _mm_set1_ps(0.5f);
  const __m128i mask_f000 = _mm_srli_si128(ffff, 12);
  const __m128i mask_000f = _mm_slli_si128(ffff, 12);
  const __m128 one =
      _mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2));
  const __m128i mask_0f00 = _mm_slli_si128(mask_f000, 4);
  const __m128i mask_00f0 = _mm_slli_si128(mask_f000, 8);
  // Gathers the diagonal terms m00, m11, m22 and forms their rotated sums
  // and differences, used as radicands of the 4 candidate cases.
  const __m128 xx_yy = OZZ_SSE_SELECT_F(mask_0f00, _m.cols[1], _m.cols[0]);
  const __m128 xx_yy_0010 = OZZ_SHUFFLE_PS1(xx_yy, _MM_SHUFFLE(0, 0, 1, 0));
  const __m128 xx_yy_zz_xx =
      OZZ_SSE_SELECT_F(mask_00f0, _m.cols[2], xx_yy_0010);
  const __m128 yy_zz_xx_yy =
      OZZ_SHUFFLE_PS1(xx_yy_zz_xx, _MM_SHUFFLE(1, 0, 2, 1));
  const __m128 zz_xx_yy_zz =
      OZZ_SHUFFLE_PS1(xx_yy_zz_xx, _MM_SHUFFLE(2, 1, 0, 2));
  const __m128 diag_sum =
      _mm_add_ps(_mm_add_ps(xx_yy_zz_xx, yy_zz_xx_yy), zz_xx_yy_zz);
  const __m128 diag_diff =
      _mm_sub_ps(_mm_sub_ps(xx_yy_zz_xx, yy_zz_xx_yy), zz_xx_yy_zz);
  const __m128 radicand =
      _mm_add_ps(OZZ_SSE_SELECT_F(mask_000f, diag_sum, diag_diff), one);
  // NOTE(review): relies on the SimdFloat4 operator/ overload for __m128.
  const __m128 invSqrt = one / _mm_sqrt_ps(radicand);
  // Gathers the off-diagonal terms (m21, m02, m10) and (m12, m20, m01).
  __m128 zy_xz_yx = OZZ_SSE_SELECT_F(mask_00f0, _m.cols[1], _m.cols[0]);
  zy_xz_yx = OZZ_SHUFFLE_PS1(zy_xz_yx, _MM_SHUFFLE(0, 1, 2, 2));
  zy_xz_yx =
      OZZ_SSE_SELECT_F(mask_0f00, OZZ_SSE_SPLAT_F(_m.cols[2], 0), zy_xz_yx);
  __m128 yz_zx_xy = OZZ_SSE_SELECT_F(mask_f000, _m.cols[1], _m.cols[0]);
  yz_zx_xy = OZZ_SHUFFLE_PS1(yz_zx_xy, _MM_SHUFFLE(0, 0, 2, 0));
  yz_zx_xy =
      OZZ_SSE_SELECT_F(mask_f000, OZZ_SSE_SPLAT_F(_m.cols[2], 1), yz_zx_xy);
  const __m128 sum = _mm_add_ps(zy_xz_yx, yz_zx_xy);
  const __m128 diff = _mm_sub_ps(zy_xz_yx, yz_zx_xy);
  const __m128 scale = _mm_mul_ps(invSqrt, half);
  // Builds the 4 candidate quaternions res0..res3, one per dominant lane.
  const __m128 sum0 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 1, 2, 0));
  const __m128 sum1 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 0, 0, 2));
  const __m128 sum2 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 0, 0, 1));
  __m128 res0 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 0), sum0);
  __m128 res1 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 1), sum1);
  __m128 res2 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 2), sum2);
  res0 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_f000, radicand, res0),
                    OZZ_SSE_SPLAT_F(scale, 0));
  res1 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_0f00, radicand, res1),
                    OZZ_SSE_SPLAT_F(scale, 1));
  res2 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_00f0, radicand, res2),
                    OZZ_SSE_SPLAT_F(scale, 2));
  __m128 res3 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_000f, radicand, diff),
                           OZZ_SSE_SPLAT_F(scale, 3));
  // Picks the candidate branchlessly: trace > 0 wins, otherwise the largest
  // diagonal element decides.
  const __m128 xx = OZZ_SSE_SPLAT_F(_m.cols[0], 0);
  const __m128 yy = OZZ_SSE_SPLAT_F(_m.cols[1], 1);
  const __m128 zz = OZZ_SSE_SPLAT_F(_m.cols[2], 2);
  const __m128i cond0 = _mm_castps_si128(_mm_cmpgt_ps(yy, xx));
  const __m128i cond1 =
      _mm_castps_si128(_mm_and_ps(_mm_cmpgt_ps(zz, xx), _mm_cmpgt_ps(zz, yy)));
  const __m128i cond2 = _mm_castps_si128(
      _mm_cmpgt_ps(OZZ_SSE_SPLAT_F(diag_sum, 0), _mm_castsi128_ps(zero)));
  __m128 res = OZZ_SSE_SELECT_F(cond0, res1, res0);
  res = OZZ_SSE_SELECT_F(cond1, res2, res);
  res = OZZ_SSE_SELECT_F(cond2, res3, res);
  assert(AreAllTrue1(IsNormalizedEst4(res)));
  return res;
}
// Decomposes affine matrix _m into translation, rotation (quaternion) and
// scale. Returns false when the decomposition is not possible (two or more
// degenerate/zero-scaled axes). A single zero-scaled axis is recovered by
// rebuilding it from the cross product of the two valid ones.
inline bool ToAffine(const Float4x4& _m, SimdFloat4* _translation,
                     SimdFloat4* _quaternion, SimdFloat4* _scale) {
  const __m128 zero = _mm_setzero_ps();
  const __m128 one = simd_float4::one();
  const __m128i fff0 = simd_int4::mask_fff0();
  const __m128 max = _mm_set_ps1(kOrthogonalisationToleranceSq);
  const __m128 min = _mm_set_ps1(-kOrthogonalisationToleranceSq);
  // Extracts translation.
  *_translation = OZZ_SSE_SELECT_F(fff0, _m.cols[3], one);
  // Extracts scale.
  // Transposes the 3x3 rotation part so a single vectorized dot product
  // yields all squared axis lengths at once.
  const __m128 m_tmp0 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]);
  const __m128 m_tmp1 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]);
  const __m128 m_tmp2 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]);
  const __m128 m_tmp3 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]);
  const __m128 m_row0 = _mm_unpacklo_ps(m_tmp0, m_tmp1);
  const __m128 m_row1 = _mm_unpackhi_ps(m_tmp0, m_tmp1);
  const __m128 m_row2 = _mm_unpacklo_ps(m_tmp2, m_tmp3);
  const __m128 dot = OZZ_MADD(
      m_row0, m_row0, OZZ_MADD(m_row1, m_row1, _mm_mul_ps(m_row2, m_row2)));
  const __m128 abs_scale = _mm_sqrt_ps(dot);
  // zero_axis lane i is set when axis i's squared length is ~0.
  const __m128 zero_axis =
      _mm_and_ps(_mm_cmplt_ps(dot, max), _mm_cmpgt_ps(dot, min));
  // Builds an orthonormal matrix in order to support quaternion extraction.
  Float4x4 orthonormal;
  int mask = _mm_movemask_ps(zero_axis);
  if (mask & 1) {
    // x axis is degenerate: rebuild the basis from y (fails if y or z is
    // degenerate too).
    if (mask & 6) {
      return false;
    }
    orthonormal.cols[1] = _mm_div_ps(_m.cols[1], OZZ_SSE_SPLAT_F(abs_scale, 1));
    orthonormal.cols[0] = Normalize3(Cross3(orthonormal.cols[1], _m.cols[2]));
    orthonormal.cols[2] =
        Normalize3(Cross3(orthonormal.cols[0], orthonormal.cols[1]));
  } else if (mask & 4) {
    // z axis is degenerate: rebuild the basis from x.
    if (mask & 3) {
      return false;
    }
    orthonormal.cols[0] = _mm_div_ps(_m.cols[0], OZZ_SSE_SPLAT_F(abs_scale, 0));
    orthonormal.cols[2] = Normalize3(Cross3(orthonormal.cols[0], _m.cols[1]));
    orthonormal.cols[1] =
        Normalize3(Cross3(orthonormal.cols[2], orthonormal.cols[0]));
  } else {  // Favor z axis in the default case
    if (mask & 5) {
      return false;
    }
    orthonormal.cols[2] = _mm_div_ps(_m.cols[2], OZZ_SSE_SPLAT_F(abs_scale, 2));
    orthonormal.cols[1] = Normalize3(Cross3(orthonormal.cols[2], _m.cols[0]));
    orthonormal.cols[0] =
        Normalize3(Cross3(orthonormal.cols[1], orthonormal.cols[2]));
  }
  orthonormal.cols[3] = simd_float4::w_axis();
  // Get back scale signs in case of reflexions
  // (dot of each original axis against its orthonormalized counterpart).
  const __m128 o_tmp0 =
      _mm_unpacklo_ps(orthonormal.cols[0], orthonormal.cols[2]);
  const __m128 o_tmp1 =
      _mm_unpacklo_ps(orthonormal.cols[1], orthonormal.cols[3]);
  const __m128 o_tmp2 =
      _mm_unpackhi_ps(orthonormal.cols[0], orthonormal.cols[2]);
  const __m128 o_tmp3 =
      _mm_unpackhi_ps(orthonormal.cols[1], orthonormal.cols[3]);
  const __m128 o_row0 = _mm_unpacklo_ps(o_tmp0, o_tmp1);
  const __m128 o_row1 = _mm_unpackhi_ps(o_tmp0, o_tmp1);
  const __m128 o_row2 = _mm_unpacklo_ps(o_tmp2, o_tmp3);
  const __m128 scale_dot = OZZ_MADD(
      o_row0, m_row0, OZZ_MADD(o_row1, m_row1, _mm_mul_ps(o_row2, m_row2)));
  const __m128i cond = _mm_castps_si128(_mm_cmpgt_ps(scale_dot, zero));
  const __m128 cfalse = _mm_sub_ps(zero, abs_scale);
  const __m128 scale = OZZ_SSE_SELECT_F(cond, abs_scale, cfalse);
  *_scale = OZZ_SSE_SELECT_F(fff0, scale, one);
  // Extracts quaternion.
  *_quaternion = ToQuaternion(orthonormal);
  return true;
}
// Builds a rotation matrix from Euler angles stored in _v's x, y, z lanes
// (w is unused). The columns are assembled from scalar products of the
// per-lane sines and cosines.
inline Float4x4 Float4x4::FromEuler(_SimdFloat4 _v) {
  const __m128 cos = Cos(_v);
  const __m128 sin = Sin(_v);
  const float cx = GetX(cos);
  const float sx = GetX(sin);
  const float cy = GetY(cos);
  const float sy = GetY(sin);
  const float cz = GetZ(cos);
  const float sz = GetZ(sin);
  // Common sub-products, reused in two matrix entries each.
  const float sycz = sy * cz;
  const float sysz = sy * sz;
  const Float4x4 ret = {{simd_float4::Load(cx * cy, sx * sz - cx * sycz,
                                           cx * sysz + sx * cz, 0.f),
                         simd_float4::Load(sy, cy * cz, -cy * sz, 0.f),
                         simd_float4::Load(-sx * cy, sx * sycz + cx * sz,
                                           -sx * sysz + cx * cz, 0.f),
                         simd_float4::w_axis()}};
  return ret;
}
// Builds a rotation matrix of _angle radians (x lane of _angle) around
// _axis, which must be normalized (asserted). Vectorized Rodrigues-style
// construction: r0 holds the diagonal terms, r1/r2 the two off-diagonal
// triples, then shuffles scatter them into columns.
inline Float4x4 Float4x4::FromAxisAngle(_SimdFloat4 _axis, _SimdFloat4 _angle) {
  assert(AreAllTrue1(IsNormalizedEst3(_axis)));
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  // (0xffffffff << 25) >> 2 == 0x3f800000 == 1.f in every lane.
  const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4));
  const __m128 one = _mm_castsi128_ps(ione);
  const __m128 w_axis = _mm_castsi128_ps(_mm_slli_si128(ione, 12));
  const __m128 sin = SplatX(SinX(_angle));
  const __m128 cos = SplatX(CosX(_angle));
  const __m128 one_minus_cos = _mm_sub_ps(one, cos);
  // v0 = (1-cos) * (ay*az, az*ax, ax*ay) style cross terms.
  const __m128 v0 =
      _mm_mul_ps(_mm_mul_ps(one_minus_cos,
                            OZZ_SHUFFLE_PS1(_axis, _MM_SHUFFLE(3, 0, 2, 1))),
                 OZZ_SHUFFLE_PS1(_axis, _MM_SHUFFLE(3, 1, 0, 2)));
  const __m128 r0 =
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(one_minus_cos, _axis), _axis), cos);
  const __m128 r1 = _mm_add_ps(_mm_mul_ps(sin, _axis), v0);
  const __m128 r2 = _mm_sub_ps(v0, _mm_mul_ps(sin, _axis));
  const __m128 r0fff0 = _mm_and_ps(r0, fff0);
  // Scatters r0/r1/r2 lanes into the 3 rotation columns.
  const __m128 r1r22120 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(2, 1, 2, 0));
  const __m128 v1 = OZZ_SHUFFLE_PS1(r1r22120, _MM_SHUFFLE(0, 3, 2, 1));
  const __m128 r1r20011 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1));
  const __m128 v2 = OZZ_SHUFFLE_PS1(r1r20011, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128 t0 = _mm_shuffle_ps(r0fff0, v1, _MM_SHUFFLE(1, 0, 3, 0));
  const __m128 t1 = _mm_shuffle_ps(r0fff0, v1, _MM_SHUFFLE(3, 2, 3, 1));
  const Float4x4 ret = {{OZZ_SHUFFLE_PS1(t0, _MM_SHUFFLE(1, 3, 2, 0)),
                         OZZ_SHUFFLE_PS1(t1, _MM_SHUFFLE(1, 3, 0, 2)),
                         _mm_shuffle_ps(v2, r0fff0, _MM_SHUFFLE(3, 2, 1, 0)),
                         w_axis}};
  return ret;
}
// Builds a rotation matrix from _quaternion, which must be normalized
// (asserted). r0 accumulates the 1-2(q?^2+q?^2) diagonal terms, r1/r2 the
// off-diagonal 2(qa*qb +/- qw*qc) terms, which are then scattered into
// columns by shuffles.
inline Float4x4 Float4x4::FromQuaternion(_SimdFloat4 _quaternion) {
  assert(AreAllTrue1(IsNormalizedEst4(_quaternion)));
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  // (0xffffffff << 25) >> 2 == 0x3f800000 == 1.f in every lane.
  const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4));
  const __m128 c1110 = _mm_castsi128_ps(_mm_srli_si128(ione, 4));
  const __m128 w_axis = _mm_castsi128_ps(_mm_slli_si128(ione, 12));
  const __m128 vsum = _mm_add_ps(_quaternion, _quaternion);
  const __m128 vms = _mm_mul_ps(_quaternion, vsum);
  const __m128 r0 = _mm_sub_ps(
      _mm_sub_ps(
          c1110,
          _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 0, 0, 1)), fff0)),
      _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 1, 2, 2)), fff0));
  const __m128 v0 =
      _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 1, 0, 0)),
                 OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 2, 1, 2)));
  const __m128 v1 =
      _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 3, 3, 3)),
                 OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 0, 2, 1)));
  const __m128 r1 = _mm_add_ps(v0, v1);
  const __m128 r2 = _mm_sub_ps(v0, v1);
  // Scatters r1/r2 lanes into column order.
  const __m128 r1r21021 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 2, 1));
  const __m128 v2 = OZZ_SHUFFLE_PS1(r1r21021, _MM_SHUFFLE(1, 3, 2, 0));
  const __m128 r1r22200 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(2, 2, 0, 0));
  const __m128 v3 = OZZ_SHUFFLE_PS1(r1r22200, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128 q0 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(1, 0, 3, 0));
  const __m128 q1 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(3, 2, 3, 1));
  const Float4x4 ret = {{OZZ_SHUFFLE_PS1(q0, _MM_SHUFFLE(1, 3, 2, 0)),
                         OZZ_SHUFFLE_PS1(q1, _MM_SHUFFLE(1, 3, 0, 2)),
                         _mm_shuffle_ps(v3, r0, _MM_SHUFFLE(3, 2, 1, 0)),
                         w_axis}};
  return ret;
}
// Builds an affine matrix from _translation (xyz), _quaternion (must be
// normalized, asserted) and _scale (xyz). Same rotation construction as
// FromQuaternion, with each rotation column scaled by the matching _scale
// lane, and the last column set to (_translation.xyz, 1).
inline Float4x4 Float4x4::FromAffine(_SimdFloat4 _translation,
                                     _SimdFloat4 _quaternion,
                                     _SimdFloat4 _scale) {
  assert(AreAllTrue1(IsNormalizedEst4(_quaternion)));
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  // (0xffffffff << 25) >> 2 == 0x3f800000 == 1.f in every lane.
  const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4));
  const __m128 c1110 = _mm_castsi128_ps(_mm_srli_si128(ione, 4));
  const __m128 vsum = _mm_add_ps(_quaternion, _quaternion);
  const __m128 vms = _mm_mul_ps(_quaternion, vsum);
  const __m128 r0 = _mm_sub_ps(
      _mm_sub_ps(
          c1110,
          _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 0, 0, 1)), fff0)),
      _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 1, 2, 2)), fff0));
  const __m128 v0 =
      _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 1, 0, 0)),
                 OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 2, 1, 2)));
  const __m128 v1 =
      _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 3, 3, 3)),
                 OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 0, 2, 1)));
  const __m128 r1 = _mm_add_ps(v0, v1);
  const __m128 r2 = _mm_sub_ps(v0, v1);
  const __m128 r1r21021 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 2, 1));
  const __m128 v2 = OZZ_SHUFFLE_PS1(r1r21021, _MM_SHUFFLE(1, 3, 2, 0));
  const __m128 r1r22200 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(2, 2, 0, 0));
  const __m128 v3 = OZZ_SHUFFLE_PS1(r1r22200, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128 q0 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(1, 0, 3, 0));
  const __m128 q1 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(3, 2, 3, 1));
  const Float4x4 ret = {
      {_mm_mul_ps(OZZ_SHUFFLE_PS1(q0, _MM_SHUFFLE(1, 3, 2, 0)),
                  OZZ_SSE_SPLAT_F(_scale, 0)),
       _mm_mul_ps(OZZ_SHUFFLE_PS1(q1, _MM_SHUFFLE(1, 3, 0, 2)),
                  OZZ_SSE_SPLAT_F(_scale, 1)),
       _mm_mul_ps(_mm_shuffle_ps(v3, r0, _MM_SHUFFLE(3, 2, 1, 0)),
                  OZZ_SSE_SPLAT_F(_scale, 2)),
       // Last column: _translation.xyz with w forced to 1.f (from c1110).
       _mm_movelh_ps(_translation, _mm_unpackhi_ps(_translation, c1110))}};
  return ret;
}
// Transforms a point by an affine matrix: w is implicitly 1, so the
// translation column (_m.cols[3]) is folded into the z accumulation.
OZZ_INLINE ozz::math::SimdFloat4 TransformPoint(const ozz::math::Float4x4& _m,
                                                ozz::math::_SimdFloat4 _v) {
  const __m128 vx = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 0), _m.cols[0]);
  const __m128 zw_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 2), _m.cols[2], _m.cols[3]);
  const __m128 xy_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 1), _m.cols[1], vx);
  return _mm_add_ps(xy_sum, zw_sum);
}
// Transforms a direction vector by a matrix: only the rotation/scale part
// (columns 0-2) is applied, translation and w are ignored.
OZZ_INLINE ozz::math::SimdFloat4 TransformVector(const ozz::math::Float4x4& _m,
                                                 ozz::math::_SimdFloat4 _v) {
  const __m128 vx = _mm_mul_ps(_m.cols[0], OZZ_SSE_SPLAT_F(_v, 0));
  const __m128 vy = _mm_mul_ps(_m.cols[1], OZZ_SSE_SPLAT_F(_v, 1));
  const __m128 vxz = OZZ_MADD(_m.cols[2], OZZ_SSE_SPLAT_F(_v, 2), vx);
  return _mm_add_ps(vy, vxz);
}
// Full 4 component matrix * vector product, accumulated as two independent
// pairs (xy and zw) to shorten the dependency chain.
OZZ_INLINE ozz::math::SimdFloat4 operator*(const ozz::math::Float4x4& _m,
                                           ozz::math::_SimdFloat4 _v) {
  const __m128 vx = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 0), _m.cols[0]);
  const __m128 vz = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 2), _m.cols[2]);
  const __m128 xy_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 1), _m.cols[1], vx);
  const __m128 zw_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 3), _m.cols[3], vz);
  return _mm_add_ps(xy_sum, zw_sum);
}
// Matrix product: result column i is _a transformed by _b.cols[i]. Each
// column is accumulated as two independent pairs to shorten the dependency
// chain, exactly like the matrix * vector operator above.
inline ozz::math::Float4x4 operator*(const ozz::math::Float4x4& _a,
                                     const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 product;
  {
    const __m128 c = _b.cols[0];
    const __m128 mx = _mm_mul_ps(OZZ_SSE_SPLAT_F(c, 0), _a.cols[0]);
    const __m128 mz = _mm_mul_ps(OZZ_SSE_SPLAT_F(c, 2), _a.cols[2]);
    const __m128 sum_xy = OZZ_MADD(OZZ_SSE_SPLAT_F(c, 1), _a.cols[1], mx);
    const __m128 sum_zw = OZZ_MADD(OZZ_SSE_SPLAT_F(c, 3), _a.cols[3], mz);
    product.cols[0] = _mm_add_ps(sum_xy, sum_zw);
  }
  {
    const __m128 c = _b.cols[1];
    const __m128 mx = _mm_mul_ps(OZZ_SSE_SPLAT_F(c, 0), _a.cols[0]);
    const __m128 mz = _mm_mul_ps(OZZ_SSE_SPLAT_F(c, 2), _a.cols[2]);
    const __m128 sum_xy = OZZ_MADD(OZZ_SSE_SPLAT_F(c, 1), _a.cols[1], mx);
    const __m128 sum_zw = OZZ_MADD(OZZ_SSE_SPLAT_F(c, 3), _a.cols[3], mz);
    product.cols[1] = _mm_add_ps(sum_xy, sum_zw);
  }
  {
    const __m128 c = _b.cols[2];
    const __m128 mx = _mm_mul_ps(OZZ_SSE_SPLAT_F(c, 0), _a.cols[0]);
    const __m128 mz = _mm_mul_ps(OZZ_SSE_SPLAT_F(c, 2), _a.cols[2]);
    const __m128 sum_xy = OZZ_MADD(OZZ_SSE_SPLAT_F(c, 1), _a.cols[1], mx);
    const __m128 sum_zw = OZZ_MADD(OZZ_SSE_SPLAT_F(c, 3), _a.cols[3], mz);
    product.cols[2] = _mm_add_ps(sum_xy, sum_zw);
  }
  {
    const __m128 c = _b.cols[3];
    const __m128 mx = _mm_mul_ps(OZZ_SSE_SPLAT_F(c, 0), _a.cols[0]);
    const __m128 mz = _mm_mul_ps(OZZ_SSE_SPLAT_F(c, 2), _a.cols[2]);
    const __m128 sum_xy = OZZ_MADD(OZZ_SSE_SPLAT_F(c, 1), _a.cols[1], mx);
    const __m128 sum_zw = OZZ_MADD(OZZ_SSE_SPLAT_F(c, 3), _a.cols[3], mz);
    product.cols[3] = _mm_add_ps(sum_xy, sum_zw);
  }
  return product;
}
// Per-component matrix addition, column by column.
OZZ_INLINE ozz::math::Float4x4 operator+(const ozz::math::Float4x4& _a,
                                         const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 sum;
  sum.cols[0] = _mm_add_ps(_a.cols[0], _b.cols[0]);
  sum.cols[1] = _mm_add_ps(_a.cols[1], _b.cols[1]);
  sum.cols[2] = _mm_add_ps(_a.cols[2], _b.cols[2]);
  sum.cols[3] = _mm_add_ps(_a.cols[3], _b.cols[3]);
  return sum;
}
// Per-component matrix subtraction, column by column.
OZZ_INLINE ozz::math::Float4x4 operator-(const ozz::math::Float4x4& _a,
                                         const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 diff;
  diff.cols[0] = _mm_sub_ps(_a.cols[0], _b.cols[0]);
  diff.cols[1] = _mm_sub_ps(_a.cols[1], _b.cols[1]);
  diff.cols[2] = _mm_sub_ps(_a.cols[2], _b.cols[2]);
  diff.cols[3] = _mm_sub_ps(_a.cols[3], _b.cols[3]);
  return diff;
}
} // namespace math
} // namespace ozz
#if !defined(OZZ_DISABLE_SSE_NATIVE_OPERATORS)
// Per-component addition of two SIMD float4 values.
OZZ_INLINE ozz::math::SimdFloat4 operator+(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  const __m128 sum = _mm_add_ps(_a, _b);
  return sum;
}
// Per-component subtraction of two SIMD float4 values.
OZZ_INLINE ozz::math::SimdFloat4 operator-(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  const __m128 diff = _mm_sub_ps(_a, _b);
  return diff;
}
// Per-component negation, computed as 0 - v (not a sign-bit flip, which
// would differ for zero lanes: 0 - (+0) is +0 while flipping gives -0).
OZZ_INLINE ozz::math::SimdFloat4 operator-(ozz::math::_SimdFloat4 _v) {
  const __m128 zero = _mm_setzero_ps();
  return _mm_sub_ps(zero, _v);
}
// Per-component multiplication of two SIMD float4 values.
OZZ_INLINE ozz::math::SimdFloat4 operator*(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  const __m128 product = _mm_mul_ps(_a, _b);
  return product;
}
// Per-component division of two SIMD float4 values.
OZZ_INLINE ozz::math::SimdFloat4 operator/(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  const __m128 quotient = _mm_div_ps(_a, _b);
  return quotient;
}
#endif // !defined(OZZ_DISABLE_SSE_NATIVE_OPERATORS)
namespace ozz {
namespace math {
// Scalar float -> half conversion: broadcasts _f to all lanes, delegates to
// the SIMD conversion below, then extracts the low lane.
OZZ_INLINE uint16_t FloatToHalf(float _f) {
  const __m128i half4 = FloatToHalf(_mm_set1_ps(_f));
  return static_cast<uint16_t>(_mm_cvtsi128_si32(half4));
}
// Scalar half -> float conversion: broadcasts _h to all lanes, delegates to
// the SIMD conversion below, then extracts the low lane.
OZZ_INLINE float HalfToFloat(uint16_t _h) {
  const __m128 float4 = HalfToFloat(_mm_set1_epi32(_h));
  return _mm_cvtss_f32(float4);
}
// Half <-> Float implementation is based on:
// http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/.
// Converts 4 floats to 4 half-floats (result in the low 16 bits of each
// lane). Bit-trick implementation from the fgiesen article linked above:
// the exponent is rebiased with a float multiply so denormal halves fall
// out naturally, while inf/NaN are patched in via integer selects.
inline SimdInt4 FloatToHalf(_SimdFloat4 _f) {
  const __m128i mask_sign = _mm_set1_epi32(0x80000000u);
  // Drops the 12 low mantissa bits before the rescale.
  const __m128i mask_round = _mm_set1_epi32(~0xfffu);
  const __m128i f32infty = _mm_set1_epi32(255 << 23);
  // magic is 2^-112 as a float: multiplying rebases the exponent from the
  // 8-bit bias (127) toward the 5-bit half bias (15).
  const __m128 magic = _mm_castsi128_ps(_mm_set1_epi32(15 << 23));
  // Quiet-NaN bit for the half-float payload.
  const __m128i nanbit = _mm_set1_epi32(0x200);
  const __m128i infty_as_fp16 = _mm_set1_epi32(0x7c00);
  // Largest scaled value that still maps below half infinity.
  const __m128 clamp = _mm_castsi128_ps(_mm_set1_epi32((31 << 23) - 0x1000));
  // Splits input into sign (justsign) and absolute value (absf).
  const __m128 msign = _mm_castsi128_ps(mask_sign);
  const __m128 justsign = _mm_and_ps(msign, _f);
  const __m128 absf = _mm_xor_ps(_f, justsign);
  const __m128 mround = _mm_castsi128_ps(mask_round);
  const __m128i absf_int = _mm_castps_si128(absf);
  // Integer compares classify lanes: > inf bit pattern means NaN; < means a
  // finite value that takes the normal rescale path.
  const __m128i b_isnan = _mm_cmpgt_epi32(absf_int, f32infty);
  const __m128i b_isnormal = _mm_cmpgt_epi32(f32infty, _mm_castps_si128(absf));
  // Inf/NaN result: 0x7c00, with the NaN payload bit set where needed.
  const __m128i inf_or_nan =
      _mm_or_si128(_mm_and_si128(b_isnan, nanbit), infty_as_fp16);
  const __m128 fnosticky = _mm_and_ps(absf, mround);
  const __m128 scaled = _mm_mul_ps(fnosticky, magic);
  // Logically, we want PMINSD on "biased", but this should gen better code
  const __m128 clamped = _mm_min_ps(scaled, clamp);
  // Re-add the rounding-mask bits as an integer to finish the bias shift,
  // then move the half mantissa/exponent into the low 13+ bits.
  const __m128i biased =
      _mm_sub_epi32(_mm_castps_si128(clamped), _mm_castps_si128(mround));
  const __m128i shifted = _mm_srli_epi32(biased, 13);
  // Selects between the finite path and the inf/NaN encoding per lane.
  const __m128i normal = _mm_and_si128(shifted, b_isnormal);
  const __m128i not_normal = _mm_andnot_si128(b_isnormal, inf_or_nan);
  const __m128i joined = _mm_or_si128(normal, not_normal);
  // Moves the float sign (bit 31) down to the half sign (bit 15).
  const __m128i sign_shift = _mm_srli_epi32(_mm_castps_si128(justsign), 16);
  return _mm_or_si128(joined, sign_shift);
}
// Converts 4 half-floats (low 16 bits of each lane) to 4 floats. Inverse of
// the trick above: shift exponent+mantissa into float position, rescale with
// a float multiply to fix the bias, then patch inf/NaN exponents.
OZZ_INLINE SimdFloat4 HalfToFloat(_SimdInt4 _h) {
  // 0x7fff masks off the half sign bit, keeping exponent + mantissa.
  const __m128i mask_nosign = _mm_set1_epi32(0x7fff);
  // magic is 2^112 as a float: rebias exponent from half (15) to float (127).
  const __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((254 - 15) << 23));
  // expmant > 0x7bff means the half exponent field was all ones (inf/NaN).
  const __m128i was_infnan = _mm_set1_epi32(0x7bff);
  const __m128 exp_infnan = _mm_castsi128_ps(_mm_set1_epi32(255 << 23));
  const __m128i expmant = _mm_and_si128(mask_nosign, _h);
  // Aligns the half exponent/mantissa with the float bit layout.
  const __m128i shifted = _mm_slli_epi32(expmant, 13);
  // Float multiply fixes the bias and normalizes denormal halves.
  const __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), magic);
  const __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant, was_infnan);
  // _h ^ expmant isolates the sign bit (bit 15); shift moves it to bit 31.
  const __m128i sign = _mm_slli_epi32(_mm_xor_si128(_h, expmant), 16);
  // Forces the float exponent to 255 on lanes that were inf/NaN.
  const __m128 infnanexp =
      _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), exp_infnan);
  const __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
  return _mm_or_ps(scaled, sign_inf);
}
} // namespace math
} // namespace ozz
#undef OZZ_SHUFFLE_PS1
#undef OZZ_SSE_SPLAT_F
#undef OZZ_SSE_HADD2_F
#undef OZZ_SSE_HADD3_F
#undef OZZ_SSE_HADD4_F
#undef OZZ_SSE_DOT2_F
#undef OZZ_SSE_DOT3_F
#undef OZZ_SSE_DOT4_F
#undef OZZ_MADD
#undef OZZ_MSUB
#undef OZZ_NMADD
#undef OZZ_NMSUB
#undef OZZ_MADDX
#undef OZZ_MSUBX
#undef OZZ_NMADDX
#undef OZZ_NMSUBX
#undef OZZ_SSE_SELECT_F
#undef OZZ_SSE_SPLAT_I
#undef OZZ_SSE_SELECT_I
#endif // OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_