//----------------------------------------------------------------------------//
//                                                                            //
// ozz-animation is hosted at http://github.com/guillaumeblanc/ozz-animation  //
// and distributed under the MIT License (MIT).                               //
//                                                                            //
// Copyright (c) Guillaume Blanc                                              //
//                                                                            //
// Permission is hereby granted, free of charge, to any person obtaining a    //
// copy of this software and associated documentation files (the "Software"), //
// to deal in the Software without restriction, including without limitation  //
// the rights to use, copy, modify, merge, publish, distribute, sublicense,   //
// and/or sell copies of the Software, and to permit persons to whom the      //
// Software is furnished to do so, subject to the following conditions:       //
//                                                                            //
// The above copyright notice and this permission notice shall be included in //
// all copies or substantial portions of the Software.                        //
//                                                                            //
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR //
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   //
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    //
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER //
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING    //
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER        //
// DEALINGS IN THE SOFTWARE.                                                  //
//                                                                            //
//----------------------------------------------------------------------------//

#ifndef OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_
#define OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_

// SIMD SSE2+ implementation, based on scalar floats.

#include <stdint.h>
#include <cassert>
#include <cstddef>

// Temporarily needed while trigonometric functions aren't implemented.
#include <cmath>

#include "ozz/base/maths/math_constant.h"

namespace ozz {
namespace math {
namespace simd_float4 {
// Internal macros.

// Unused components of the result vector are replicated from the first input
// argument.
#ifdef OZZ_SIMD_AVX #define OZZ_SHUFFLE_PS1(_v, _m) _mm_permute_ps(_v, _m) #else // OZZ_SIMD_AVX #define OZZ_SHUFFLE_PS1(_v, _m) _mm_shuffle_ps(_v, _v, _m) #endif // OZZ_SIMD_AVX #define OZZ_SSE_SPLAT_F(_v, _i) OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(_i, _i, _i, _i)) #define OZZ_SSE_SPLAT_I(_v, _i) \ _mm_shuffle_epi32(_v, _MM_SHUFFLE(_i, _i, _i, _i)) // _v.x + _v.y, _v.y, _v.z, _v.w #define OZZ_SSE_HADD2_F(_v) _mm_add_ss(_v, OZZ_SSE_SPLAT_F(_v, 1)) // _v.x + _v.y + _v.z, _v.y, _v.z, _v.w #define OZZ_SSE_HADD3_F(_v) \ _mm_add_ss(_mm_add_ss(_v, OZZ_SSE_SPLAT_F(_v, 2)), OZZ_SSE_SPLAT_F(_v, 1)) // _v.x + _v.y + _v.z + _v.w, ?, ?, ? #define OZZ_SSE_HADD4_F(_v, _r) \ do { \ const __m128 haddxyzw = _mm_add_ps(_v, _mm_movehl_ps(_v, _v)); \ _r = _mm_add_ss(haddxyzw, OZZ_SSE_SPLAT_F(haddxyzw, 1)); \ } while (void(0), 0) // dot2, ?, ?, ? #define OZZ_SSE_DOT2_F(_a, _b, _r) \ do { \ const __m128 ab = _mm_mul_ps(_a, _b); \ _r = _mm_add_ss(ab, OZZ_SSE_SPLAT_F(ab, 1)); \ \ } while (void(0), 0) #ifdef OZZ_SIMD_SSE4_1 // dot3, ?, ?, ? #define OZZ_SSE_DOT3_F(_a, _b, _r) \ do { \ _r = _mm_dp_ps(_a, _b, 0x7f); \ } while (void(0), 0) // dot4, ?, ?, ? #define OZZ_SSE_DOT4_F(_a, _b, _r) \ do { \ _r = _mm_dp_ps(_a, _b, 0xff); \ } while (void(0), 0) #else // OZZ_SIMD_SSE4_1 // dot3, ?, ?, ? #define OZZ_SSE_DOT3_F(_a, _b, _r) \ do { \ const __m128 ab = _mm_mul_ps(_a, _b); \ _r = OZZ_SSE_HADD3_F(ab); \ } while (void(0), 0) // dot4, ?, ?, ? 
#define OZZ_SSE_DOT4_F(_a, _b, _r) \ do { \ const __m128 ab = _mm_mul_ps(_a, _b); \ OZZ_SSE_HADD4_F(ab, _r); \ } while (void(0), 0) #endif // OZZ_SIMD_SSE4_1 // FMA operations #ifdef OZZ_SIMD_FMA #define OZZ_MADD(_a, _b, _c) _mm_fmadd_ps(_a, _b, _c) #define OZZ_MSUB(_a, _b, _c) _mm_fmsub_ps(_a, _b, _c) #define OZZ_NMADD(_a, _b, _c) _mm_fnmadd_ps(_a, _b, _c) #define OZZ_NMSUB(_a, _b, _c) _mm_fnmsub_ps(_a, _b, _c) #define OZZ_MADDX(_a, _b, _c) _mm_fmadd_ss(_a, _b, _c) #define OZZ_MSUBX(_a, _b, _c) _mm_fmsub_ss(_a, _b, _c) #define OZZ_NMADDX(_a, _b, _c) _mm_fnmadd_ss(_a, _b, _c) #define OZZ_NMSUBX(_a, _b, _c) _mm_fnmsub_ss(_a, _b, _c) #else // OZZ_SIMD_FMA #define OZZ_MADD(_a, _b, _c) _mm_add_ps(_mm_mul_ps(_a, _b), _c) #define OZZ_MSUB(_a, _b, _c) _mm_sub_ps(_mm_mul_ps(_a, _b), _c) #define OZZ_NMADD(_a, _b, _c) _mm_sub_ps(_c, _mm_mul_ps(_a, _b)) #define OZZ_NMSUB(_a, _b, _c) (-_mm_add_ps(_mm_mul_ps(_a, _b), _c)) #define OZZ_MADDX(_a, _b, _c) _mm_add_ss(_mm_mul_ss(_a, _b), _c) #define OZZ_MSUBX(_a, _b, _c) _mm_sub_ss(_mm_mul_ss(_a, _b), _c) #define OZZ_NMADDX(_a, _b, _c) _mm_sub_ss(_c, _mm_mul_ss(_a, _b)) #define OZZ_NMSUBX(_a, _b, _c) (-_mm_add_ss(_mm_mul_ss(_a, _b), _c)) #endif // OZZ_SIMD_FMA OZZ_INLINE SimdFloat4 DivX(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_div_ss(_a, _b); } #ifdef OZZ_SIMD_SSE4_1 #define OZZ_SSE_SELECT_F(_b, _true, _false) \ _mm_blendv_ps(_false, _true, _mm_castsi128_ps(_b)) #define OZZ_SSE_SELECT_I(_b, _true, _false) _mm_blendv_epi8(_false, _true, _b) #else // OZZ_SIMD_SSE4_1 #define OZZ_SSE_SELECT_F(_b, _true, _false) \ _mm_or_ps(_mm_and_ps(_true, _mm_castsi128_ps(_b)), \ _mm_andnot_ps(_mm_castsi128_ps(_b), _false)) #define OZZ_SSE_SELECT_I(_b, _true, _false) \ _mm_or_si128(_mm_and_si128(_true, _b), _mm_andnot_si128(_b, _false)) #endif // OZZ_SIMD_SSE4_1 OZZ_INLINE SimdFloat4 zero() { return _mm_setzero_ps(); } OZZ_INLINE SimdFloat4 one() { const __m128i zero = _mm_setzero_si128(); return _mm_castsi128_ps( 
_mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2)); } OZZ_INLINE SimdFloat4 x_axis() { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2); return _mm_castsi128_ps(_mm_srli_si128(one, 12)); } OZZ_INLINE SimdFloat4 y_axis() { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2); return _mm_castsi128_ps(_mm_slli_si128(_mm_srli_si128(one, 12), 4)); } OZZ_INLINE SimdFloat4 z_axis() { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2); return _mm_castsi128_ps(_mm_slli_si128(_mm_srli_si128(one, 12), 8)); } OZZ_INLINE SimdFloat4 w_axis() { const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_srli_epi32(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 25), 2); return _mm_castsi128_ps(_mm_slli_si128(one, 12)); } OZZ_INLINE SimdFloat4 Load(float _x, float _y, float _z, float _w) { return _mm_set_ps(_w, _z, _y, _x); } OZZ_INLINE SimdFloat4 LoadX(float _x) { return _mm_set_ss(_x); } OZZ_INLINE SimdFloat4 Load1(float _x) { return _mm_set_ps1(_x); } OZZ_INLINE SimdFloat4 LoadPtr(const float* _f) { assert(!(reinterpret_cast(_f) & 0xf) && "Invalid alignment"); return _mm_load_ps(_f); } OZZ_INLINE SimdFloat4 LoadPtrU(const float* _f) { assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); return _mm_loadu_ps(_f); } OZZ_INLINE SimdFloat4 LoadXPtrU(const float* _f) { assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); return _mm_load_ss(_f); } OZZ_INLINE SimdFloat4 Load1PtrU(const float* _f) { assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); return _mm_load_ps1(_f); } OZZ_INLINE SimdFloat4 Load2PtrU(const float* _f) { assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); return _mm_unpacklo_ps(_mm_load_ss(_f + 0), _mm_load_ss(_f + 1)); } OZZ_INLINE SimdFloat4 Load3PtrU(const float* _f) { 
assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); return _mm_movelh_ps( _mm_unpacklo_ps(_mm_load_ss(_f + 0), _mm_load_ss(_f + 1)), _mm_load_ss(_f + 2)); } OZZ_INLINE SimdFloat4 FromInt(_SimdInt4 _i) { return _mm_cvtepi32_ps(_i); } } // namespace simd_float4 OZZ_INLINE float GetX(_SimdFloat4 _v) { return _mm_cvtss_f32(_v); } OZZ_INLINE float GetY(_SimdFloat4 _v) { return _mm_cvtss_f32(OZZ_SSE_SPLAT_F(_v, 1)); } OZZ_INLINE float GetZ(_SimdFloat4 _v) { return _mm_cvtss_f32(_mm_movehl_ps(_v, _v)); } OZZ_INLINE float GetW(_SimdFloat4 _v) { return _mm_cvtss_f32(OZZ_SSE_SPLAT_F(_v, 3)); } OZZ_INLINE SimdFloat4 SetX(_SimdFloat4 _v, _SimdFloat4 _f) { return _mm_move_ss(_v, _f); } OZZ_INLINE SimdFloat4 SetY(_SimdFloat4 _v, _SimdFloat4 _f) { const __m128 xfnn = _mm_unpacklo_ps(_v, _f); return _mm_shuffle_ps(xfnn, _v, _MM_SHUFFLE(3, 2, 1, 0)); } OZZ_INLINE SimdFloat4 SetZ(_SimdFloat4 _v, _SimdFloat4 _f) { const __m128 ffww = _mm_shuffle_ps(_f, _v, _MM_SHUFFLE(3, 3, 0, 0)); return _mm_shuffle_ps(_v, ffww, _MM_SHUFFLE(2, 0, 1, 0)); } OZZ_INLINE SimdFloat4 SetW(_SimdFloat4 _v, _SimdFloat4 _f) { const __m128 ffzz = _mm_shuffle_ps(_f, _v, _MM_SHUFFLE(2, 2, 0, 0)); return _mm_shuffle_ps(_v, ffzz, _MM_SHUFFLE(0, 2, 1, 0)); } OZZ_INLINE SimdFloat4 SetI(_SimdFloat4 _v, _SimdFloat4 _f, int _ith) { assert(_ith >= 0 && _ith <= 3 && "Invalid index, out of range."); union { SimdFloat4 ret; float af[4]; } u = {_v}; u.af[_ith] = _mm_cvtss_f32(_f); return u.ret; } OZZ_INLINE void StorePtr(_SimdFloat4 _v, float* _f) { assert(!(reinterpret_cast(_f) & 0xf) && "Invalid alignment"); _mm_store_ps(_f, _v); } OZZ_INLINE void Store1Ptr(_SimdFloat4 _v, float* _f) { assert(!(reinterpret_cast(_f) & 0xf) && "Invalid alignment"); _mm_store_ss(_f, _v); } OZZ_INLINE void Store2Ptr(_SimdFloat4 _v, float* _f) { assert(!(reinterpret_cast(_f) & 0xf) && "Invalid alignment"); _mm_storel_pi(reinterpret_cast<__m64*>(_f), _v); } OZZ_INLINE void Store3Ptr(_SimdFloat4 _v, float* _f) { 
assert(!(reinterpret_cast(_f) & 0xf) && "Invalid alignment"); _mm_storel_pi(reinterpret_cast<__m64*>(_f), _v); _mm_store_ss(_f + 2, _mm_movehl_ps(_v, _v)); } OZZ_INLINE void StorePtrU(_SimdFloat4 _v, float* _f) { assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); _mm_storeu_ps(_f, _v); } OZZ_INLINE void Store1PtrU(_SimdFloat4 _v, float* _f) { assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); _mm_store_ss(_f, _v); } OZZ_INLINE void Store2PtrU(_SimdFloat4 _v, float* _f) { assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); _mm_store_ss(_f + 0, _v); _mm_store_ss(_f + 1, OZZ_SSE_SPLAT_F(_v, 1)); } OZZ_INLINE void Store3PtrU(_SimdFloat4 _v, float* _f) { assert(!(reinterpret_cast(_f) & 0x3) && "Invalid alignment"); _mm_store_ss(_f + 0, _v); _mm_store_ss(_f + 1, OZZ_SSE_SPLAT_F(_v, 1)); _mm_store_ss(_f + 2, _mm_movehl_ps(_v, _v)); } OZZ_INLINE SimdFloat4 SplatX(_SimdFloat4 _v) { return OZZ_SSE_SPLAT_F(_v, 0); } OZZ_INLINE SimdFloat4 SplatY(_SimdFloat4 _v) { return OZZ_SSE_SPLAT_F(_v, 1); } OZZ_INLINE SimdFloat4 SplatZ(_SimdFloat4 _v) { return OZZ_SSE_SPLAT_F(_v, 2); } OZZ_INLINE SimdFloat4 SplatW(_SimdFloat4 _v) { return OZZ_SSE_SPLAT_F(_v, 3); } template OZZ_INLINE SimdFloat4 Swizzle(_SimdFloat4 _v) { static_assert(_X <= 3 && _Y <= 3 && _Z <= 3 && _W <= 3, "Indices must be between 0 and 3"); return OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(_W, _Z, _Y, _X)); } template <> OZZ_INLINE SimdFloat4 Swizzle<0, 1, 2, 3>(_SimdFloat4 _v) { return _v; } template <> OZZ_INLINE SimdFloat4 Swizzle<0, 1, 0, 1>(_SimdFloat4 _v) { return _mm_movelh_ps(_v, _v); } template <> OZZ_INLINE SimdFloat4 Swizzle<2, 3, 2, 3>(_SimdFloat4 _v) { return _mm_movehl_ps(_v, _v); } template <> OZZ_INLINE SimdFloat4 Swizzle<0, 0, 1, 1>(_SimdFloat4 _v) { return _mm_unpacklo_ps(_v, _v); } template <> OZZ_INLINE SimdFloat4 Swizzle<2, 2, 3, 3>(_SimdFloat4 _v) { return _mm_unpackhi_ps(_v, _v); } OZZ_INLINE void Transpose4x1(const SimdFloat4 _in[4], SimdFloat4 _out[1]) { const __m128 xz = 
_mm_unpacklo_ps(_in[0], _in[2]); const __m128 yw = _mm_unpacklo_ps(_in[1], _in[3]); _out[0] = _mm_unpacklo_ps(xz, yw); } OZZ_INLINE void Transpose1x4(const SimdFloat4 _in[1], SimdFloat4 _out[4]) { const __m128 zwzw = _mm_movehl_ps(_in[0], _in[0]); const __m128 yyyy = OZZ_SSE_SPLAT_F(_in[0], 1); const __m128 wwww = OZZ_SSE_SPLAT_F(_in[0], 3); const __m128 zero = _mm_setzero_ps(); _out[0] = _mm_move_ss(zero, _in[0]); _out[1] = _mm_move_ss(zero, yyyy); _out[2] = _mm_move_ss(zero, zwzw); _out[3] = _mm_move_ss(zero, wwww); } OZZ_INLINE void Transpose4x2(const SimdFloat4 _in[4], SimdFloat4 _out[2]) { const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[2]); const __m128 tmp1 = _mm_unpacklo_ps(_in[1], _in[3]); _out[0] = _mm_unpacklo_ps(tmp0, tmp1); _out[1] = _mm_unpackhi_ps(tmp0, tmp1); } OZZ_INLINE void Transpose2x4(const SimdFloat4 _in[2], SimdFloat4 _out[4]) { const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[1]); const __m128 tmp1 = _mm_unpackhi_ps(_in[0], _in[1]); const __m128 zero = _mm_setzero_ps(); _out[0] = _mm_movelh_ps(tmp0, zero); _out[1] = _mm_movehl_ps(zero, tmp0); _out[2] = _mm_movelh_ps(tmp1, zero); _out[3] = _mm_movehl_ps(zero, tmp1); } OZZ_INLINE void Transpose4x3(const SimdFloat4 _in[4], SimdFloat4 _out[3]) { const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[2]); const __m128 tmp1 = _mm_unpacklo_ps(_in[1], _in[3]); const __m128 tmp2 = _mm_unpackhi_ps(_in[0], _in[2]); const __m128 tmp3 = _mm_unpackhi_ps(_in[1], _in[3]); _out[0] = _mm_unpacklo_ps(tmp0, tmp1); _out[1] = _mm_unpackhi_ps(tmp0, tmp1); _out[2] = _mm_unpacklo_ps(tmp2, tmp3); } OZZ_INLINE void Transpose3x4(const SimdFloat4 _in[3], SimdFloat4 _out[4]) { const __m128 zero = _mm_setzero_ps(); const __m128 temp0 = _mm_unpacklo_ps(_in[0], _in[1]); const __m128 temp1 = _mm_unpacklo_ps(_in[2], zero); const __m128 temp2 = _mm_unpackhi_ps(_in[0], _in[1]); const __m128 temp3 = _mm_unpackhi_ps(_in[2], zero); _out[0] = _mm_movelh_ps(temp0, temp1); _out[1] = _mm_movehl_ps(temp1, temp0); _out[2] = _mm_movelh_ps(temp2, 
temp3); _out[3] = _mm_movehl_ps(temp3, temp2); } OZZ_INLINE void Transpose4x4(const SimdFloat4 _in[4], SimdFloat4 _out[4]) { const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[2]); const __m128 tmp1 = _mm_unpacklo_ps(_in[1], _in[3]); const __m128 tmp2 = _mm_unpackhi_ps(_in[0], _in[2]); const __m128 tmp3 = _mm_unpackhi_ps(_in[1], _in[3]); _out[0] = _mm_unpacklo_ps(tmp0, tmp1); _out[1] = _mm_unpackhi_ps(tmp0, tmp1); _out[2] = _mm_unpacklo_ps(tmp2, tmp3); _out[3] = _mm_unpackhi_ps(tmp2, tmp3); } OZZ_INLINE void Transpose16x16(const SimdFloat4 _in[16], SimdFloat4 _out[16]) { const __m128 tmp0 = _mm_unpacklo_ps(_in[0], _in[2]); const __m128 tmp1 = _mm_unpacklo_ps(_in[1], _in[3]); _out[0] = _mm_unpacklo_ps(tmp0, tmp1); _out[4] = _mm_unpackhi_ps(tmp0, tmp1); const __m128 tmp2 = _mm_unpackhi_ps(_in[0], _in[2]); const __m128 tmp3 = _mm_unpackhi_ps(_in[1], _in[3]); _out[8] = _mm_unpacklo_ps(tmp2, tmp3); _out[12] = _mm_unpackhi_ps(tmp2, tmp3); const __m128 tmp4 = _mm_unpacklo_ps(_in[4], _in[6]); const __m128 tmp5 = _mm_unpacklo_ps(_in[5], _in[7]); _out[1] = _mm_unpacklo_ps(tmp4, tmp5); _out[5] = _mm_unpackhi_ps(tmp4, tmp5); const __m128 tmp6 = _mm_unpackhi_ps(_in[4], _in[6]); const __m128 tmp7 = _mm_unpackhi_ps(_in[5], _in[7]); _out[9] = _mm_unpacklo_ps(tmp6, tmp7); _out[13] = _mm_unpackhi_ps(tmp6, tmp7); const __m128 tmp8 = _mm_unpacklo_ps(_in[8], _in[10]); const __m128 tmp9 = _mm_unpacklo_ps(_in[9], _in[11]); _out[2] = _mm_unpacklo_ps(tmp8, tmp9); _out[6] = _mm_unpackhi_ps(tmp8, tmp9); const __m128 tmp10 = _mm_unpackhi_ps(_in[8], _in[10]); const __m128 tmp11 = _mm_unpackhi_ps(_in[9], _in[11]); _out[10] = _mm_unpacklo_ps(tmp10, tmp11); _out[14] = _mm_unpackhi_ps(tmp10, tmp11); const __m128 tmp12 = _mm_unpacklo_ps(_in[12], _in[14]); const __m128 tmp13 = _mm_unpacklo_ps(_in[13], _in[15]); _out[3] = _mm_unpacklo_ps(tmp12, tmp13); _out[7] = _mm_unpackhi_ps(tmp12, tmp13); const __m128 tmp14 = _mm_unpackhi_ps(_in[12], _in[14]); const __m128 tmp15 = _mm_unpackhi_ps(_in[13], _in[15]); 
_out[11] = _mm_unpacklo_ps(tmp14, tmp15); _out[15] = _mm_unpackhi_ps(tmp14, tmp15); } OZZ_INLINE SimdFloat4 MAdd(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) { return OZZ_MADD(_a, _b, _c); } OZZ_INLINE SimdFloat4 MSub(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) { return OZZ_MSUB(_a, _b, _c); } OZZ_INLINE SimdFloat4 NMAdd(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) { return OZZ_NMADD(_a, _b, _c); } OZZ_INLINE SimdFloat4 NMSub(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) { return OZZ_NMSUB(_a, _b, _c); } OZZ_INLINE SimdFloat4 DivX(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_div_ss(_a, _b); } OZZ_INLINE SimdFloat4 HAdd2(_SimdFloat4 _v) { return OZZ_SSE_HADD2_F(_v); } OZZ_INLINE SimdFloat4 HAdd3(_SimdFloat4 _v) { return OZZ_SSE_HADD3_F(_v); } OZZ_INLINE SimdFloat4 HAdd4(_SimdFloat4 _v) { __m128 hadd4; OZZ_SSE_HADD4_F(_v, hadd4); return hadd4; } OZZ_INLINE SimdFloat4 Dot2(_SimdFloat4 _a, _SimdFloat4 _b) { __m128 dot2; OZZ_SSE_DOT2_F(_a, _b, dot2); return dot2; } OZZ_INLINE SimdFloat4 Dot3(_SimdFloat4 _a, _SimdFloat4 _b) { __m128 dot3; OZZ_SSE_DOT3_F(_a, _b, dot3); return dot3; } OZZ_INLINE SimdFloat4 Dot4(_SimdFloat4 _a, _SimdFloat4 _b) { __m128 dot4; OZZ_SSE_DOT4_F(_a, _b, dot4); return dot4; } OZZ_INLINE SimdFloat4 Cross3(_SimdFloat4 _a, _SimdFloat4 _b) { // Implementation with 3 shuffles only is based on: // https://geometrian.com/programming/tutorials/cross-product const __m128 shufa = OZZ_SHUFFLE_PS1(_a, _MM_SHUFFLE(3, 0, 2, 1)); const __m128 shufb = OZZ_SHUFFLE_PS1(_b, _MM_SHUFFLE(3, 0, 2, 1)); const __m128 shufc = OZZ_MSUB(_a, shufb, _mm_mul_ps(_b, shufa)); return OZZ_SHUFFLE_PS1(shufc, _MM_SHUFFLE(3, 0, 2, 1)); } OZZ_INLINE SimdFloat4 RcpEst(_SimdFloat4 _v) { return _mm_rcp_ps(_v); } OZZ_INLINE SimdFloat4 RcpEstNR(_SimdFloat4 _v) { const __m128 nr = _mm_rcp_ps(_v); // Do one more Newton-Raphson step to improve precision. 
return OZZ_NMADD(_mm_mul_ps(nr, nr), _v, _mm_add_ps(nr, nr)); } OZZ_INLINE SimdFloat4 RcpEstX(_SimdFloat4 _v) { return _mm_rcp_ss(_v); } OZZ_INLINE SimdFloat4 RcpEstXNR(_SimdFloat4 _v) { const __m128 nr = _mm_rcp_ss(_v); // Do one more Newton-Raphson step to improve precision. return OZZ_NMADDX(_mm_mul_ss(nr, nr), _v, _mm_add_ss(nr, nr)); } OZZ_INLINE SimdFloat4 Sqrt(_SimdFloat4 _v) { return _mm_sqrt_ps(_v); } OZZ_INLINE SimdFloat4 SqrtX(_SimdFloat4 _v) { return _mm_sqrt_ss(_v); } OZZ_INLINE SimdFloat4 RSqrtEst(_SimdFloat4 _v) { return _mm_rsqrt_ps(_v); } OZZ_INLINE SimdFloat4 RSqrtEstNR(_SimdFloat4 _v) { const __m128 nr = _mm_rsqrt_ps(_v); // Do one more Newton-Raphson step to improve precision. return _mm_mul_ps(_mm_mul_ps(_mm_set_ps1(.5f), nr), OZZ_NMADD(_mm_mul_ps(_v, nr), nr, _mm_set_ps1(3.f))); } OZZ_INLINE SimdFloat4 RSqrtEstX(_SimdFloat4 _v) { return _mm_rsqrt_ss(_v); } OZZ_INLINE SimdFloat4 RSqrtEstXNR(_SimdFloat4 _v) { const __m128 nr = _mm_rsqrt_ss(_v); // Do one more Newton-Raphson step to improve precision. 
return _mm_mul_ss(_mm_mul_ss(_mm_set_ps1(.5f), nr), OZZ_NMADDX(_mm_mul_ss(_v, nr), nr, _mm_set_ps1(3.f))); } OZZ_INLINE SimdFloat4 Abs(_SimdFloat4 _v) { const __m128i zero = _mm_setzero_si128(); return _mm_and_ps( _mm_castsi128_ps(_mm_srli_epi32(_mm_cmpeq_epi32(zero, zero), 1)), _v); } OZZ_INLINE SimdInt4 Sign(_SimdFloat4 _v) { return _mm_slli_epi32(_mm_srli_epi32(_mm_castps_si128(_v), 31), 31); } OZZ_INLINE SimdFloat4 Length2(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT2_F(_v, _v, sq_len); return _mm_sqrt_ss(sq_len); } OZZ_INLINE SimdFloat4 Length3(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT3_F(_v, _v, sq_len); return _mm_sqrt_ss(sq_len); } OZZ_INLINE SimdFloat4 Length4(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT4_F(_v, _v, sq_len); return _mm_sqrt_ss(sq_len); } OZZ_INLINE SimdFloat4 Length2Sqr(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT2_F(_v, _v, sq_len); return sq_len; } OZZ_INLINE SimdFloat4 Length3Sqr(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT3_F(_v, _v, sq_len); return sq_len; } OZZ_INLINE SimdFloat4 Length4Sqr(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT4_F(_v, _v, sq_len); return sq_len; } OZZ_INLINE SimdFloat4 Normalize2(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT2_F(_v, _v, sq_len); assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable"); const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len)); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128 norm = _mm_mul_ps(_v, inv_lenxxxx); return _mm_movelh_ps(norm, _mm_movehl_ps(_v, _v)); } OZZ_INLINE SimdFloat4 Normalize3(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT3_F(_v, _v, sq_len); assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable"); const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len)); const __m128 vwxyz = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3)); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128 normwxyz = _mm_move_ss(_mm_mul_ps(vwxyz, inv_lenxxxx), vwxyz); return OZZ_SHUFFLE_PS1(normwxyz, 
_MM_SHUFFLE(0, 1, 2, 3)); } OZZ_INLINE SimdFloat4 Normalize4(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT4_F(_v, _v, sq_len); assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable"); const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len)); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); return _mm_mul_ps(_v, inv_lenxxxx); } OZZ_INLINE SimdFloat4 NormalizeEst2(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT2_F(_v, _v, sq_len); assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable"); const __m128 inv_len = _mm_rsqrt_ss(sq_len); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128 norm = _mm_mul_ps(_v, inv_lenxxxx); return _mm_movelh_ps(norm, _mm_movehl_ps(_v, _v)); } OZZ_INLINE SimdFloat4 NormalizeEst3(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT3_F(_v, _v, sq_len); assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable"); const __m128 inv_len = _mm_rsqrt_ss(sq_len); const __m128 vwxyz = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3)); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128 normwxyz = _mm_move_ss(_mm_mul_ps(vwxyz, inv_lenxxxx), vwxyz); return OZZ_SHUFFLE_PS1(normwxyz, _MM_SHUFFLE(0, 1, 2, 3)); } OZZ_INLINE SimdFloat4 NormalizeEst4(_SimdFloat4 _v) { __m128 sq_len; OZZ_SSE_DOT4_F(_v, _v, sq_len); assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable"); const __m128 inv_len = _mm_rsqrt_ss(sq_len); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); return _mm_mul_ps(_v, inv_lenxxxx); } OZZ_INLINE SimdInt4 IsNormalized2(_SimdFloat4 _v) { const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceSq); const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceSq); __m128 dot; OZZ_SSE_DOT2_F(_v, _v, dot); __m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot); return _mm_castps_si128( _mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min))); } OZZ_INLINE SimdInt4 IsNormalized3(_SimdFloat4 _v) { const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceSq); 
const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceSq); __m128 dot; OZZ_SSE_DOT3_F(_v, _v, dot); __m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot); return _mm_castps_si128( _mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min))); } OZZ_INLINE SimdInt4 IsNormalized4(_SimdFloat4 _v) { const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceSq); const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceSq); __m128 dot; OZZ_SSE_DOT4_F(_v, _v, dot); __m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot); return _mm_castps_si128( _mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min))); } OZZ_INLINE SimdInt4 IsNormalizedEst2(_SimdFloat4 _v) { const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceEstSq); const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceEstSq); __m128 dot; OZZ_SSE_DOT2_F(_v, _v, dot); __m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot); return _mm_castps_si128( _mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min))); } OZZ_INLINE SimdInt4 IsNormalizedEst3(_SimdFloat4 _v) { const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceEstSq); const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceEstSq); __m128 dot; OZZ_SSE_DOT3_F(_v, _v, dot); __m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot); return _mm_castps_si128( _mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min))); } OZZ_INLINE SimdInt4 IsNormalizedEst4(_SimdFloat4 _v) { const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceEstSq); const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceEstSq); __m128 dot; OZZ_SSE_DOT4_F(_v, _v, dot); __m128 dotx000 = _mm_move_ss(_mm_setzero_ps(), dot); return _mm_castps_si128( _mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min))); } OZZ_INLINE SimdFloat4 NormalizeSafe2(_SimdFloat4 _v, _SimdFloat4 _safe) { // assert(AreAllTrue1(IsNormalized2(_safe)) && "_safe is not normalized"); __m128 sq_len; OZZ_SSE_DOT2_F(_v, _v, sq_len); const __m128 inv_len = 
_mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len)); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128 norm = _mm_mul_ps(_v, inv_lenxxxx); const __m128i cond = _mm_castps_si128( _mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps())); const __m128 cfalse = _mm_movelh_ps(norm, _mm_movehl_ps(_v, _v)); return OZZ_SSE_SELECT_F(cond, _safe, cfalse); } OZZ_INLINE SimdFloat4 NormalizeSafe3(_SimdFloat4 _v, _SimdFloat4 _safe) { // assert(AreAllTrue1(IsNormalized3(_safe)) && "_safe is not normalized"); __m128 sq_len; OZZ_SSE_DOT3_F(_v, _v, sq_len); const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len)); const __m128 vwxyz = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3)); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128 normwxyz = _mm_move_ss(_mm_mul_ps(vwxyz, inv_lenxxxx), vwxyz); const __m128i cond = _mm_castps_si128( _mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps())); const __m128 cfalse = OZZ_SHUFFLE_PS1(normwxyz, _MM_SHUFFLE(0, 1, 2, 3)); return OZZ_SSE_SELECT_F(cond, _safe, cfalse); } OZZ_INLINE SimdFloat4 NormalizeSafe4(_SimdFloat4 _v, _SimdFloat4 _safe) { // assert(AreAllTrue1(IsNormalized4(_safe)) && "_safe is not normalized"); __m128 sq_len; OZZ_SSE_DOT4_F(_v, _v, sq_len); const __m128 inv_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(sq_len)); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128i cond = _mm_castps_si128( _mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps())); const __m128 cfalse = _mm_mul_ps(_v, inv_lenxxxx); return OZZ_SSE_SELECT_F(cond, _safe, cfalse); } OZZ_INLINE SimdFloat4 NormalizeSafeEst2(_SimdFloat4 _v, _SimdFloat4 _safe) { // assert(AreAllTrue1(IsNormalizedEst2(_safe)) && "_safe is not normalized"); __m128 sq_len; OZZ_SSE_DOT2_F(_v, _v, sq_len); const __m128 inv_len = _mm_rsqrt_ss(sq_len); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128 norm = _mm_mul_ps(_v, inv_lenxxxx); const __m128i cond = _mm_castps_si128( 
_mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps())); const __m128 cfalse = _mm_movelh_ps(norm, _mm_movehl_ps(_v, _v)); return OZZ_SSE_SELECT_F(cond, _safe, cfalse); } OZZ_INLINE SimdFloat4 NormalizeSafeEst3(_SimdFloat4 _v, _SimdFloat4 _safe) { // assert(AreAllTrue1(IsNormalizedEst3(_safe)) && "_safe is not normalized"); __m128 sq_len; OZZ_SSE_DOT3_F(_v, _v, sq_len); const __m128 inv_len = _mm_rsqrt_ss(sq_len); const __m128 vwxyz = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3)); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128 normwxyz = _mm_move_ss(_mm_mul_ps(vwxyz, inv_lenxxxx), vwxyz); const __m128i cond = _mm_castps_si128( _mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps())); const __m128 cfalse = OZZ_SHUFFLE_PS1(normwxyz, _MM_SHUFFLE(0, 1, 2, 3)); return OZZ_SSE_SELECT_F(cond, _safe, cfalse); } OZZ_INLINE SimdFloat4 NormalizeSafeEst4(_SimdFloat4 _v, _SimdFloat4 _safe) { // assert(AreAllTrue1(IsNormalizedEst4(_safe)) && "_safe is not normalized"); __m128 sq_len; OZZ_SSE_DOT4_F(_v, _v, sq_len); const __m128 inv_len = _mm_rsqrt_ss(sq_len); const __m128 inv_lenxxxx = OZZ_SSE_SPLAT_F(inv_len, 0); const __m128i cond = _mm_castps_si128( _mm_cmple_ps(OZZ_SSE_SPLAT_F(sq_len, 0), _mm_setzero_ps())); const __m128 cfalse = _mm_mul_ps(_v, inv_lenxxxx); return OZZ_SSE_SELECT_F(cond, _safe, cfalse); } OZZ_INLINE SimdFloat4 Lerp(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _alpha) { return OZZ_MADD(_alpha, _mm_sub_ps(_b, _a), _a); } OZZ_INLINE SimdFloat4 Min(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_min_ps(_a, _b); } OZZ_INLINE SimdFloat4 Max(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_max_ps(_a, _b); } OZZ_INLINE SimdFloat4 Min0(_SimdFloat4 _v) { return _mm_min_ps(_mm_setzero_ps(), _v); } OZZ_INLINE SimdFloat4 Max0(_SimdFloat4 _v) { return _mm_max_ps(_mm_setzero_ps(), _v); } OZZ_INLINE SimdFloat4 Clamp(_SimdFloat4 _a, _SimdFloat4 _v, _SimdFloat4 _b) { return _mm_max_ps(_a, _mm_min_ps(_v, _b)); } OZZ_INLINE SimdFloat4 Select(_SimdInt4 _b, 
_SimdFloat4 _true, _SimdFloat4 _false) { return OZZ_SSE_SELECT_F(_b, _true, _false); } OZZ_INLINE SimdInt4 CmpEq(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_castps_si128(_mm_cmpeq_ps(_a, _b)); } OZZ_INLINE SimdInt4 CmpNe(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_castps_si128(_mm_cmpneq_ps(_a, _b)); } OZZ_INLINE SimdInt4 CmpLt(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_castps_si128(_mm_cmplt_ps(_a, _b)); } OZZ_INLINE SimdInt4 CmpLe(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_castps_si128(_mm_cmple_ps(_a, _b)); } OZZ_INLINE SimdInt4 CmpGt(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_castps_si128(_mm_cmpgt_ps(_a, _b)); } OZZ_INLINE SimdInt4 CmpGe(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_castps_si128(_mm_cmpge_ps(_a, _b)); } OZZ_INLINE SimdFloat4 And(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_and_ps(_a, _b); } OZZ_INLINE SimdFloat4 Or(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_or_ps(_a, _b); } OZZ_INLINE SimdFloat4 Xor(_SimdFloat4 _a, _SimdFloat4 _b) { return _mm_xor_ps(_a, _b); } OZZ_INLINE SimdFloat4 And(_SimdFloat4 _a, _SimdInt4 _b) { return _mm_and_ps(_a, _mm_castsi128_ps(_b)); } OZZ_INLINE SimdFloat4 AndNot(_SimdFloat4 _a, _SimdInt4 _b) { return _mm_andnot_ps(_mm_castsi128_ps(_b), _a); } OZZ_INLINE SimdFloat4 Or(_SimdFloat4 _a, _SimdInt4 _b) { return _mm_or_ps(_a, _mm_castsi128_ps(_b)); } OZZ_INLINE SimdFloat4 Xor(_SimdFloat4 _a, _SimdInt4 _b) { return _mm_xor_ps(_a, _mm_castsi128_ps(_b)); } OZZ_INLINE SimdFloat4 Cos(_SimdFloat4 _v) { return _mm_set_ps(std::cos(GetW(_v)), std::cos(GetZ(_v)), std::cos(GetY(_v)), std::cos(GetX(_v))); } OZZ_INLINE SimdFloat4 CosX(_SimdFloat4 _v) { return _mm_move_ss(_v, _mm_set_ps1(std::cos(GetX(_v)))); } OZZ_INLINE SimdFloat4 ACos(_SimdFloat4 _v) { return _mm_set_ps(std::acos(GetW(_v)), std::acos(GetZ(_v)), std::acos(GetY(_v)), std::acos(GetX(_v))); } OZZ_INLINE SimdFloat4 ACosX(_SimdFloat4 _v) { return _mm_move_ss(_v, _mm_set_ps1(std::acos(GetX(_v)))); } OZZ_INLINE SimdFloat4 Sin(_SimdFloat4 _v) { return 
_mm_set_ps(std::sin(GetW(_v)), std::sin(GetZ(_v)), std::sin(GetY(_v)), std::sin(GetX(_v))); } OZZ_INLINE SimdFloat4 SinX(_SimdFloat4 _v) { return _mm_move_ss(_v, _mm_set_ps1(std::sin(GetX(_v)))); } OZZ_INLINE SimdFloat4 ASin(_SimdFloat4 _v) { return _mm_set_ps(std::asin(GetW(_v)), std::asin(GetZ(_v)), std::asin(GetY(_v)), std::asin(GetX(_v))); } OZZ_INLINE SimdFloat4 ASinX(_SimdFloat4 _v) { return _mm_move_ss(_v, _mm_set_ps1(std::asin(GetX(_v)))); } OZZ_INLINE SimdFloat4 Tan(_SimdFloat4 _v) { return _mm_set_ps(std::tan(GetW(_v)), std::tan(GetZ(_v)), std::tan(GetY(_v)), std::tan(GetX(_v))); } OZZ_INLINE SimdFloat4 TanX(_SimdFloat4 _v) { return _mm_move_ss(_v, _mm_set_ps1(std::tan(GetX(_v)))); } OZZ_INLINE SimdFloat4 ATan(_SimdFloat4 _v) { return _mm_set_ps(std::atan(GetW(_v)), std::atan(GetZ(_v)), std::atan(GetY(_v)), std::atan(GetX(_v))); } OZZ_INLINE SimdFloat4 ATanX(_SimdFloat4 _v) { return _mm_move_ss(_v, _mm_set_ps1(std::atan(GetX(_v)))); } namespace simd_int4 { OZZ_INLINE SimdInt4 zero() { return _mm_setzero_si128(); } OZZ_INLINE SimdInt4 one() { const __m128i zero = _mm_setzero_si128(); return _mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)); } OZZ_INLINE SimdInt4 x_axis() { const __m128i zero = _mm_setzero_si128(); return _mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12); } OZZ_INLINE SimdInt4 y_axis() { const __m128i zero = _mm_setzero_si128(); return _mm_slli_si128( _mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12), 4); } OZZ_INLINE SimdInt4 z_axis() { const __m128i zero = _mm_setzero_si128(); return _mm_slli_si128( _mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12), 8); } OZZ_INLINE SimdInt4 w_axis() { const __m128i zero = _mm_setzero_si128(); return _mm_slli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12); } OZZ_INLINE SimdInt4 all_true() { const __m128i zero = _mm_setzero_si128(); return _mm_cmpeq_epi32(zero, zero); } OZZ_INLINE SimdInt4 all_false() { return _mm_setzero_si128(); } 
OZZ_INLINE SimdInt4 mask_sign() { const __m128i zero = _mm_setzero_si128(); return _mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31); } OZZ_INLINE SimdInt4 mask_sign_xyz() { const __m128i zero = _mm_setzero_si128(); return _mm_srli_si128(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31), 4); } OZZ_INLINE SimdInt4 mask_sign_w() { const __m128i zero = _mm_setzero_si128(); return _mm_slli_si128(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31), 12); } OZZ_INLINE SimdInt4 mask_not_sign() { const __m128i zero = _mm_setzero_si128(); return _mm_srli_epi32(_mm_cmpeq_epi32(zero, zero), 1); } OZZ_INLINE SimdInt4 mask_ffff() { const __m128i zero = _mm_setzero_si128(); return _mm_cmpeq_epi32(zero, zero); } OZZ_INLINE SimdInt4 mask_0000() { return _mm_setzero_si128(); } OZZ_INLINE SimdInt4 mask_fff0() { const __m128i zero = _mm_setzero_si128(); return _mm_srli_si128(_mm_cmpeq_epi32(zero, zero), 4); } OZZ_INLINE SimdInt4 mask_f000() { const __m128i zero = _mm_setzero_si128(); return _mm_srli_si128(_mm_cmpeq_epi32(zero, zero), 12); } OZZ_INLINE SimdInt4 mask_0f00() { const __m128i zero = _mm_setzero_si128(); return _mm_srli_si128(_mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12), 8); } OZZ_INLINE SimdInt4 mask_00f0() { const __m128i zero = _mm_setzero_si128(); return _mm_srli_si128(_mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12), 4); } OZZ_INLINE SimdInt4 mask_000f() { const __m128i zero = _mm_setzero_si128(); return _mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12); } OZZ_INLINE SimdInt4 Load(int _x, int _y, int _z, int _w) { return _mm_set_epi32(_w, _z, _y, _x); } OZZ_INLINE SimdInt4 LoadX(int _x) { return _mm_set_epi32(0, 0, 0, _x); } OZZ_INLINE SimdInt4 Load1(int _x) { return _mm_set1_epi32(_x); } OZZ_INLINE SimdInt4 Load(bool _x, bool _y, bool _z, bool _w) { return _mm_sub_epi32(_mm_setzero_si128(), _mm_set_epi32(_w, _z, _y, _x)); } OZZ_INLINE SimdInt4 LoadX(bool _x) { return _mm_sub_epi32(_mm_setzero_si128(), _mm_set_epi32(0, 0, 0, _x)); } OZZ_INLINE SimdInt4 Load1(bool _x) { return 
_mm_sub_epi32(_mm_setzero_si128(), _mm_set1_epi32(_x)); } OZZ_INLINE SimdInt4 LoadPtr(const int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); return _mm_load_si128(reinterpret_cast(_i)); } OZZ_INLINE SimdInt4 LoadXPtr(const int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); return _mm_cvtsi32_si128(*_i); } OZZ_INLINE SimdInt4 Load1Ptr(const int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); return _mm_shuffle_epi32( _mm_loadl_epi64(reinterpret_cast(_i)), _MM_SHUFFLE(0, 0, 0, 0)); } OZZ_INLINE SimdInt4 Load2Ptr(const int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); return _mm_loadl_epi64(reinterpret_cast(_i)); } OZZ_INLINE SimdInt4 Load3Ptr(const int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); return _mm_set_epi32(0, _i[2], _i[1], _i[0]); } OZZ_INLINE SimdInt4 LoadPtrU(const int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); return _mm_loadu_si128(reinterpret_cast(_i)); } OZZ_INLINE SimdInt4 LoadXPtrU(const int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); return _mm_cvtsi32_si128(*_i); } OZZ_INLINE SimdInt4 Load1PtrU(const int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); return _mm_set1_epi32(*_i); } OZZ_INLINE SimdInt4 Load2PtrU(const int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); return _mm_set_epi32(0, 0, _i[1], _i[0]); } OZZ_INLINE SimdInt4 Load3PtrU(const int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); return _mm_set_epi32(0, _i[2], _i[1], _i[0]); } OZZ_INLINE SimdInt4 FromFloatRound(_SimdFloat4 _f) { return _mm_cvtps_epi32(_f); } OZZ_INLINE SimdInt4 FromFloatTrunc(_SimdFloat4 _f) { return _mm_cvttps_epi32(_f); } } // namespace simd_int4 OZZ_INLINE int GetX(_SimdInt4 _v) { return _mm_cvtsi128_si32(_v); } OZZ_INLINE int GetY(_SimdInt4 _v) { return _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1)); } OZZ_INLINE int GetZ(_SimdInt4 _v) { return _mm_cvtsi128_si32(_mm_unpackhi_epi32(_v, _v)); } OZZ_INLINE int 
GetW(_SimdInt4 _v) { return _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 3)); } OZZ_INLINE SimdInt4 SetX(_SimdInt4 _v, _SimdInt4 _i) { return _mm_castps_si128( _mm_move_ss(_mm_castsi128_ps(_v), _mm_castsi128_ps(_i))); } OZZ_INLINE SimdInt4 SetY(_SimdInt4 _v, _SimdInt4 _i) { const __m128 xfnn = _mm_castsi128_ps(_mm_unpacklo_epi32(_v, _i)); return _mm_castps_si128( _mm_shuffle_ps(xfnn, _mm_castsi128_ps(_v), _MM_SHUFFLE(3, 2, 1, 0))); } OZZ_INLINE SimdInt4 SetZ(_SimdInt4 _v, _SimdInt4 _i) { const __m128 ffww = _mm_shuffle_ps(_mm_castsi128_ps(_i), _mm_castsi128_ps(_v), _MM_SHUFFLE(3, 3, 0, 0)); return _mm_castps_si128( _mm_shuffle_ps(_mm_castsi128_ps(_v), ffww, _MM_SHUFFLE(2, 0, 1, 0))); } OZZ_INLINE SimdInt4 SetW(_SimdInt4 _v, _SimdInt4 _i) { const __m128 ffzz = _mm_shuffle_ps(_mm_castsi128_ps(_i), _mm_castsi128_ps(_v), _MM_SHUFFLE(2, 2, 0, 0)); return _mm_castps_si128( _mm_shuffle_ps(_mm_castsi128_ps(_v), ffzz, _MM_SHUFFLE(0, 2, 1, 0))); } OZZ_INLINE SimdInt4 SetI(_SimdInt4 _v, _SimdInt4 _i, int _ith) { assert(_ith >= 0 && _ith <= 3 && "Invalid index, out of range."); union { SimdInt4 ret; int af[4]; } u = {_v}; u.af[_ith] = GetX(_i); return u.ret; } OZZ_INLINE void StorePtr(_SimdInt4 _v, int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); _mm_store_si128(reinterpret_cast<__m128i*>(_i), _v); } OZZ_INLINE void Store1Ptr(_SimdInt4 _v, int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); *_i = _mm_cvtsi128_si32(_v); } OZZ_INLINE void Store2Ptr(_SimdInt4 _v, int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); _i[0] = _mm_cvtsi128_si32(_v); _i[1] = _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1)); } OZZ_INLINE void Store3Ptr(_SimdInt4 _v, int* _i) { assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment"); _i[0] = _mm_cvtsi128_si32(_v); _i[1] = _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1)); _i[2] = _mm_cvtsi128_si32(_mm_unpackhi_epi32(_v, _v)); } OZZ_INLINE void StorePtrU(_SimdInt4 _v, int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); 
_mm_storeu_si128(reinterpret_cast<__m128i*>(_i), _v); } OZZ_INLINE void Store1PtrU(_SimdInt4 _v, int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); *_i = _mm_cvtsi128_si32(_v); } OZZ_INLINE void Store2PtrU(_SimdInt4 _v, int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); _i[0] = _mm_cvtsi128_si32(_v); _i[1] = _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1)); } OZZ_INLINE void Store3PtrU(_SimdInt4 _v, int* _i) { assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment"); _i[0] = _mm_cvtsi128_si32(_v); _i[1] = _mm_cvtsi128_si32(OZZ_SSE_SPLAT_I(_v, 1)); _i[2] = _mm_cvtsi128_si32(_mm_unpackhi_epi32(_v, _v)); } OZZ_INLINE SimdInt4 SplatX(_SimdInt4 _a) { return OZZ_SSE_SPLAT_I(_a, 0); } OZZ_INLINE SimdInt4 SplatY(_SimdInt4 _a) { return OZZ_SSE_SPLAT_I(_a, 1); } OZZ_INLINE SimdInt4 SplatZ(_SimdInt4 _a) { return OZZ_SSE_SPLAT_I(_a, 2); } OZZ_INLINE SimdInt4 SplatW(_SimdInt4 _a) { return OZZ_SSE_SPLAT_I(_a, 3); } template OZZ_INLINE SimdInt4 Swizzle(_SimdInt4 _v) { static_assert(_X <= 3 && _Y <= 3 && _Z <= 3 && _W <= 3, "Indices must be between 0 and 3"); return _mm_shuffle_epi32(_v, _MM_SHUFFLE(_W, _Z, _Y, _X)); } template <> OZZ_INLINE SimdInt4 Swizzle<0, 1, 2, 3>(_SimdInt4 _v) { return _v; } OZZ_INLINE int MoveMask(_SimdInt4 _v) { return _mm_movemask_ps(_mm_castsi128_ps(_v)); } OZZ_INLINE bool AreAllTrue(_SimdInt4 _v) { return _mm_movemask_ps(_mm_castsi128_ps(_v)) == 0xf; } OZZ_INLINE bool AreAllTrue3(_SimdInt4 _v) { return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x7) == 0x7; } OZZ_INLINE bool AreAllTrue2(_SimdInt4 _v) { return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x3) == 0x3; } OZZ_INLINE bool AreAllTrue1(_SimdInt4 _v) { return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x1) == 0x1; } OZZ_INLINE bool AreAllFalse(_SimdInt4 _v) { return _mm_movemask_ps(_mm_castsi128_ps(_v)) == 0; } OZZ_INLINE bool AreAllFalse3(_SimdInt4 _v) { return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x7) == 0; } OZZ_INLINE bool AreAllFalse2(_SimdInt4 _v) { return 
(_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x3) == 0; } OZZ_INLINE bool AreAllFalse1(_SimdInt4 _v) { return (_mm_movemask_ps(_mm_castsi128_ps(_v)) & 0x1) == 0; } OZZ_INLINE SimdInt4 HAdd2(_SimdInt4 _v) { const __m128i hadd = _mm_add_epi32(_v, OZZ_SSE_SPLAT_I(_v, 1)); return _mm_castps_si128( _mm_move_ss(_mm_castsi128_ps(_v), _mm_castsi128_ps(hadd))); } OZZ_INLINE SimdInt4 HAdd3(_SimdInt4 _v) { const __m128i hadd = _mm_add_epi32(_mm_add_epi32(_v, OZZ_SSE_SPLAT_I(_v, 1)), _mm_unpackhi_epi32(_v, _v)); return _mm_castps_si128( _mm_move_ss(_mm_castsi128_ps(_v), _mm_castsi128_ps(hadd))); } OZZ_INLINE SimdInt4 HAdd4(_SimdInt4 _v) { const __m128 v = _mm_castsi128_ps(_v); const __m128i haddxyzw = _mm_add_epi32(_v, _mm_castps_si128(_mm_movehl_ps(v, v))); return _mm_castps_si128(_mm_move_ss( v, _mm_castsi128_ps(_mm_add_epi32(haddxyzw, OZZ_SSE_SPLAT_I(haddxyzw, 1))))); } OZZ_INLINE SimdInt4 Abs(_SimdInt4 _v) { #ifdef OZZ_SIMD_SSSE3 return _mm_abs_epi32(_v); #else // OZZ_SIMD_SSSE3 const __m128i zero = _mm_setzero_si128(); return OZZ_SSE_SELECT_I(_mm_cmplt_epi32(_v, zero), _mm_sub_epi32(zero, _v), _v); #endif // OZZ_SIMD_SSSE3 } OZZ_INLINE SimdInt4 Sign(_SimdInt4 _v) { return _mm_slli_epi32(_mm_srli_epi32(_v, 31), 31); } OZZ_INLINE SimdInt4 Min(_SimdInt4 _a, _SimdInt4 _b) { #ifdef OZZ_SIMD_SSE4_1 return _mm_min_epi32(_a, _b); #else // OZZ_SIMD_SSE4_1 return OZZ_SSE_SELECT_I(_mm_cmplt_epi32(_a, _b), _a, _b); #endif // OZZ_SIMD_SSE4_1 } OZZ_INLINE SimdInt4 Max(_SimdInt4 _a, _SimdInt4 _b) { #ifdef OZZ_SIMD_SSE4_1 return _mm_max_epi32(_a, _b); #else // OZZ_SIMD_SSE4_1 return OZZ_SSE_SELECT_I(_mm_cmpgt_epi32(_a, _b), _a, _b); #endif // OZZ_SIMD_SSE4_1 } OZZ_INLINE SimdInt4 Min0(_SimdInt4 _v) { const __m128i zero = _mm_setzero_si128(); #ifdef OZZ_SIMD_SSE4_1 return _mm_min_epi32(zero, _v); #else // OZZ_SIMD_SSE4_1 return OZZ_SSE_SELECT_I(_mm_cmplt_epi32(zero, _v), zero, _v); #endif // OZZ_SIMD_SSE4_1 } OZZ_INLINE SimdInt4 Max0(_SimdInt4 _v) { const __m128i zero = _mm_setzero_si128(); 
#ifdef OZZ_SIMD_SSE4_1 return _mm_max_epi32(zero, _v); #else // OZZ_SIMD_SSE4_1 return OZZ_SSE_SELECT_I(_mm_cmpgt_epi32(zero, _v), zero, _v); #endif // OZZ_SIMD_SSE4_1 } OZZ_INLINE SimdInt4 Clamp(_SimdInt4 _a, _SimdInt4 _v, _SimdInt4 _b) { #ifdef OZZ_SIMD_SSE4_1 return _mm_min_epi32(_mm_max_epi32(_a, _v), _b); #else // OZZ_SIMD_SSE4_1 const __m128i min = OZZ_SSE_SELECT_I(_mm_cmplt_epi32(_v, _b), _v, _b); return OZZ_SSE_SELECT_I(_mm_cmpgt_epi32(_a, min), _a, min); #endif // OZZ_SIMD_SSE4_1 } OZZ_INLINE SimdInt4 Select(_SimdInt4 _b, _SimdInt4 _true, _SimdInt4 _false) { return OZZ_SSE_SELECT_I(_b, _true, _false); } OZZ_INLINE SimdInt4 And(_SimdInt4 _a, _SimdInt4 _b) { return _mm_and_si128(_a, _b); } OZZ_INLINE SimdInt4 AndNot(_SimdInt4 _a, _SimdInt4 _b) { return _mm_andnot_si128(_b, _a); } OZZ_INLINE SimdInt4 Or(_SimdInt4 _a, _SimdInt4 _b) { return _mm_or_si128(_a, _b); } OZZ_INLINE SimdInt4 Xor(_SimdInt4 _a, _SimdInt4 _b) { return _mm_xor_si128(_a, _b); } OZZ_INLINE SimdInt4 Not(_SimdInt4 _v) { return _mm_xor_si128(_v, _mm_cmpeq_epi32(_v, _v)); } OZZ_INLINE SimdInt4 ShiftL(_SimdInt4 _v, int _bits) { return _mm_slli_epi32(_v, _bits); } OZZ_INLINE SimdInt4 ShiftR(_SimdInt4 _v, int _bits) { return _mm_srai_epi32(_v, _bits); } OZZ_INLINE SimdInt4 ShiftRu(_SimdInt4 _v, int _bits) { return _mm_srli_epi32(_v, _bits); } OZZ_INLINE SimdInt4 CmpEq(_SimdInt4 _a, _SimdInt4 _b) { return _mm_cmpeq_epi32(_a, _b); } OZZ_INLINE SimdInt4 CmpNe(_SimdInt4 _a, _SimdInt4 _b) { const __m128i eq = _mm_cmpeq_epi32(_a, _b); return _mm_xor_si128(eq, _mm_cmpeq_epi32(_a, _a)); } OZZ_INLINE SimdInt4 CmpLt(_SimdInt4 _a, _SimdInt4 _b) { return _mm_cmpgt_epi32(_b, _a); } OZZ_INLINE SimdInt4 CmpLe(_SimdInt4 _a, _SimdInt4 _b) { const __m128i gt = _mm_cmpgt_epi32(_a, _b); return _mm_xor_si128(gt, _mm_cmpeq_epi32(_a, _a)); } OZZ_INLINE SimdInt4 CmpGt(_SimdInt4 _a, _SimdInt4 _b) { return _mm_cmpgt_epi32(_a, _b); } OZZ_INLINE SimdInt4 CmpGe(_SimdInt4 _a, _SimdInt4 _b) { const __m128i lt = 
_mm_cmpgt_epi32(_b, _a); return _mm_xor_si128(lt, _mm_cmpeq_epi32(_a, _a)); } OZZ_INLINE Float4x4 Float4x4::identity() { const __m128i zero = _mm_setzero_si128(); const __m128i ffff = _mm_cmpeq_epi32(zero, zero); const __m128i one = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2); const __m128i x = _mm_srli_si128(one, 12); const Float4x4 ret = {{_mm_castsi128_ps(x), _mm_castsi128_ps(_mm_slli_si128(x, 4)), _mm_castsi128_ps(_mm_slli_si128(x, 8)), _mm_castsi128_ps(_mm_slli_si128(one, 12))}}; return ret; } OZZ_INLINE Float4x4 Transpose(const Float4x4& _m) { const __m128 tmp0 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]); const __m128 tmp1 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]); const __m128 tmp2 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]); const __m128 tmp3 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]); const Float4x4 ret = { {_mm_unpacklo_ps(tmp0, tmp1), _mm_unpackhi_ps(tmp0, tmp1), _mm_unpacklo_ps(tmp2, tmp3), _mm_unpackhi_ps(tmp2, tmp3)}}; return ret; } inline Float4x4 Invert(const Float4x4& _m, SimdInt4* _invertible) { const __m128 _t0 = _mm_shuffle_ps(_m.cols[0], _m.cols[1], _MM_SHUFFLE(1, 0, 1, 0)); const __m128 _t1 = _mm_shuffle_ps(_m.cols[2], _m.cols[3], _MM_SHUFFLE(1, 0, 1, 0)); const __m128 _t2 = _mm_shuffle_ps(_m.cols[0], _m.cols[1], _MM_SHUFFLE(3, 2, 3, 2)); const __m128 _t3 = _mm_shuffle_ps(_m.cols[2], _m.cols[3], _MM_SHUFFLE(3, 2, 3, 2)); const __m128 c0 = _mm_shuffle_ps(_t0, _t1, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 c1 = _mm_shuffle_ps(_t1, _t0, _MM_SHUFFLE(3, 1, 3, 1)); const __m128 c2 = _mm_shuffle_ps(_t2, _t3, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 c3 = _mm_shuffle_ps(_t3, _t2, _MM_SHUFFLE(3, 1, 3, 1)); __m128 minor0, minor1, minor2, minor3, tmp1, tmp2; tmp1 = _mm_mul_ps(c2, c3); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1); minor0 = _mm_mul_ps(c1, tmp1); minor1 = _mm_mul_ps(c0, tmp1); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E); minor0 = OZZ_MSUB(c1, tmp1, minor0); minor1 = OZZ_MSUB(c0, tmp1, minor1); minor1 = OZZ_SHUFFLE_PS1(minor1, 0x4E); tmp1 = _mm_mul_ps(c1, c2); tmp1 = 
OZZ_SHUFFLE_PS1(tmp1, 0xB1); minor0 = OZZ_MADD(c3, tmp1, minor0); minor3 = _mm_mul_ps(c0, tmp1); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E); minor0 = OZZ_NMADD(c3, tmp1, minor0); minor3 = OZZ_MSUB(c0, tmp1, minor3); minor3 = OZZ_SHUFFLE_PS1(minor3, 0x4E); tmp1 = _mm_mul_ps(OZZ_SHUFFLE_PS1(c1, 0x4E), c3); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1); tmp2 = OZZ_SHUFFLE_PS1(c2, 0x4E); minor0 = OZZ_MADD(tmp2, tmp1, minor0); minor2 = _mm_mul_ps(c0, tmp1); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E); minor0 = OZZ_NMADD(tmp2, tmp1, minor0); minor2 = OZZ_MSUB(c0, tmp1, minor2); minor2 = OZZ_SHUFFLE_PS1(minor2, 0x4E); tmp1 = _mm_mul_ps(c0, c1); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1); minor2 = OZZ_MADD(c3, tmp1, minor2); minor3 = OZZ_MSUB(tmp2, tmp1, minor3); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E); minor2 = OZZ_MSUB(c3, tmp1, minor2); minor3 = OZZ_NMADD(tmp2, tmp1, minor3); tmp1 = _mm_mul_ps(c0, c3); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1); minor1 = OZZ_NMADD(tmp2, tmp1, minor1); minor2 = OZZ_MADD(c1, tmp1, minor2); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E); minor1 = OZZ_MADD(tmp2, tmp1, minor1); minor2 = OZZ_NMADD(c1, tmp1, minor2); tmp1 = _mm_mul_ps(c0, tmp2); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1); minor1 = OZZ_MADD(c3, tmp1, minor1); minor3 = OZZ_NMADD(c1, tmp1, minor3); tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E); minor1 = OZZ_NMADD(c3, tmp1, minor1); minor3 = OZZ_MADD(c1, tmp1, minor3); __m128 det; det = _mm_mul_ps(c0, minor0); det = _mm_add_ps(OZZ_SHUFFLE_PS1(det, 0x4E), det); det = _mm_add_ss(OZZ_SHUFFLE_PS1(det, 0xB1), det); const SimdInt4 invertible = CmpNe(det, simd_float4::zero()); assert((_invertible || AreAllTrue1(invertible)) && "Matrix is not invertible"); if (_invertible != nullptr) { *_invertible = invertible; } tmp1 = OZZ_SSE_SELECT_F(invertible, RcpEstNR(det), simd_float4::zero()); det = OZZ_NMADDX(det, _mm_mul_ss(tmp1, tmp1), _mm_add_ss(tmp1, tmp1)); det = OZZ_SHUFFLE_PS1(det, 0x00); // Copy the final columns const Float4x4 ret = {{_mm_mul_ps(det, minor0), _mm_mul_ps(det, minor1), _mm_mul_ps(det, minor2), 
_mm_mul_ps(det, minor3)}}; return ret; } Float4x4 Float4x4::Translation(_SimdFloat4 _v) { const __m128i zero = _mm_setzero_si128(); const __m128i ffff = _mm_cmpeq_epi32(zero, zero); const __m128i mask000f = _mm_slli_si128(ffff, 12); const __m128i one = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2); const __m128i x = _mm_srli_si128(one, 12); const Float4x4 ret = { {_mm_castsi128_ps(x), _mm_castsi128_ps(_mm_slli_si128(x, 4)), _mm_castsi128_ps(_mm_slli_si128(x, 8)), OZZ_SSE_SELECT_F(mask000f, _mm_castsi128_ps(one), _v)}}; return ret; } // math Float4x4 Float4x4::Scaling(_SimdFloat4 _v) { const __m128i zero = _mm_setzero_si128(); const __m128i ffff = _mm_cmpeq_epi32(zero, zero); const __m128i if000 = _mm_srli_si128(ffff, 12); const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2); const Float4x4 ret = { {_mm_and_ps(_v, _mm_castsi128_ps(if000)), _mm_and_ps(_v, _mm_castsi128_ps(_mm_slli_si128(if000, 4))), _mm_and_ps(_v, _mm_castsi128_ps(_mm_slli_si128(if000, 8))), _mm_castsi128_ps(_mm_slli_si128(ione, 12))}}; return ret; } // math OZZ_INLINE Float4x4 Translate(const Float4x4& _m, _SimdFloat4 _v) { const __m128 a01 = OZZ_MADD(_m.cols[0], OZZ_SSE_SPLAT_F(_v, 0), _mm_mul_ps(_m.cols[1], OZZ_SSE_SPLAT_F(_v, 1))); const __m128 m3 = OZZ_MADD(_m.cols[2], OZZ_SSE_SPLAT_F(_v, 2), _m.cols[3]); const Float4x4 ret = { {_m.cols[0], _m.cols[1], _m.cols[2], _mm_add_ps(a01, m3)}}; return ret; } OZZ_INLINE Float4x4 Scale(const Float4x4& _m, _SimdFloat4 _v) { const Float4x4 ret = {{_mm_mul_ps(_m.cols[0], OZZ_SSE_SPLAT_F(_v, 0)), _mm_mul_ps(_m.cols[1], OZZ_SSE_SPLAT_F(_v, 1)), _mm_mul_ps(_m.cols[2], OZZ_SSE_SPLAT_F(_v, 2)), _m.cols[3]}}; return ret; } OZZ_INLINE Float4x4 ColumnMultiply(const Float4x4& _m, _SimdFloat4 _v) { const Float4x4 ret = {{_mm_mul_ps(_m.cols[0], _v), _mm_mul_ps(_m.cols[1], _v), _mm_mul_ps(_m.cols[2], _v), _mm_mul_ps(_m.cols[3], _v)}}; return ret; } inline SimdInt4 IsNormalized(const Float4x4& _m) { const __m128 max = _mm_set_ps1(1.f + 
kNormalizationToleranceSq); const __m128 min = _mm_set_ps1(1.f - kNormalizationToleranceSq); const __m128 tmp0 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]); const __m128 tmp1 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]); const __m128 tmp2 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]); const __m128 tmp3 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]); const __m128 row0 = _mm_unpacklo_ps(tmp0, tmp1); const __m128 row1 = _mm_unpackhi_ps(tmp0, tmp1); const __m128 row2 = _mm_unpacklo_ps(tmp2, tmp3); const __m128 dot = OZZ_MADD(row0, row0, OZZ_MADD(row1, row1, _mm_mul_ps(row2, row2))); const __m128 normalized = _mm_and_ps(_mm_cmplt_ps(dot, max), _mm_cmpgt_ps(dot, min)); return _mm_castps_si128( _mm_and_ps(normalized, _mm_castsi128_ps(simd_int4::mask_fff0()))); } inline SimdInt4 IsNormalizedEst(const Float4x4& _m) { const __m128 max = _mm_set_ps1(1.f + kNormalizationToleranceEstSq); const __m128 min = _mm_set_ps1(1.f - kNormalizationToleranceEstSq); const __m128 tmp0 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]); const __m128 tmp1 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]); const __m128 tmp2 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]); const __m128 tmp3 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]); const __m128 row0 = _mm_unpacklo_ps(tmp0, tmp1); const __m128 row1 = _mm_unpackhi_ps(tmp0, tmp1); const __m128 row2 = _mm_unpacklo_ps(tmp2, tmp3); const __m128 dot = OZZ_MADD(row0, row0, OZZ_MADD(row1, row1, _mm_mul_ps(row2, row2))); const __m128 normalized = _mm_and_ps(_mm_cmplt_ps(dot, max), _mm_cmpgt_ps(dot, min)); return _mm_castps_si128( _mm_and_ps(normalized, _mm_castsi128_ps(simd_int4::mask_fff0()))); } OZZ_INLINE SimdInt4 IsOrthogonal(const Float4x4& _m) { const __m128 max = _mm_set_ss(1.f + kNormalizationToleranceSq); const __m128 min = _mm_set_ss(1.f - kNormalizationToleranceSq); const __m128 zero = _mm_setzero_ps(); // Use simd_float4::zero() if one of the normalization fails. _m will then be // considered not orthogonal. 
const SimdFloat4 cross = NormalizeSafe3(Cross3(_m.cols[0], _m.cols[1]), zero); const SimdFloat4 at = NormalizeSafe3(_m.cols[2], zero); SimdFloat4 dot; OZZ_SSE_DOT3_F(cross, at, dot); __m128 dotx000 = _mm_move_ss(zero, dot); return _mm_castps_si128( _mm_and_ps(_mm_cmplt_ss(dotx000, max), _mm_cmpgt_ss(dotx000, min))); } inline SimdFloat4 ToQuaternion(const Float4x4& _m) { assert(AreAllTrue3(IsNormalizedEst(_m))); assert(AreAllTrue1(IsOrthogonal(_m))); // Prepares constants. const __m128i zero = _mm_setzero_si128(); const __m128i ffff = _mm_cmpeq_epi32(zero, zero); const __m128 half = _mm_set1_ps(0.5f); const __m128i mask_f000 = _mm_srli_si128(ffff, 12); const __m128i mask_000f = _mm_slli_si128(ffff, 12); const __m128 one = _mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2)); const __m128i mask_0f00 = _mm_slli_si128(mask_f000, 4); const __m128i mask_00f0 = _mm_slli_si128(mask_f000, 8); const __m128 xx_yy = OZZ_SSE_SELECT_F(mask_0f00, _m.cols[1], _m.cols[0]); const __m128 xx_yy_0010 = OZZ_SHUFFLE_PS1(xx_yy, _MM_SHUFFLE(0, 0, 1, 0)); const __m128 xx_yy_zz_xx = OZZ_SSE_SELECT_F(mask_00f0, _m.cols[2], xx_yy_0010); const __m128 yy_zz_xx_yy = OZZ_SHUFFLE_PS1(xx_yy_zz_xx, _MM_SHUFFLE(1, 0, 2, 1)); const __m128 zz_xx_yy_zz = OZZ_SHUFFLE_PS1(xx_yy_zz_xx, _MM_SHUFFLE(2, 1, 0, 2)); const __m128 diag_sum = _mm_add_ps(_mm_add_ps(xx_yy_zz_xx, yy_zz_xx_yy), zz_xx_yy_zz); const __m128 diag_diff = _mm_sub_ps(_mm_sub_ps(xx_yy_zz_xx, yy_zz_xx_yy), zz_xx_yy_zz); const __m128 radicand = _mm_add_ps(OZZ_SSE_SELECT_F(mask_000f, diag_sum, diag_diff), one); const __m128 invSqrt = one / _mm_sqrt_ps(radicand); __m128 zy_xz_yx = OZZ_SSE_SELECT_F(mask_00f0, _m.cols[1], _m.cols[0]); zy_xz_yx = OZZ_SHUFFLE_PS1(zy_xz_yx, _MM_SHUFFLE(0, 1, 2, 2)); zy_xz_yx = OZZ_SSE_SELECT_F(mask_0f00, OZZ_SSE_SPLAT_F(_m.cols[2], 0), zy_xz_yx); __m128 yz_zx_xy = OZZ_SSE_SELECT_F(mask_f000, _m.cols[1], _m.cols[0]); yz_zx_xy = OZZ_SHUFFLE_PS1(yz_zx_xy, _MM_SHUFFLE(0, 0, 2, 0)); yz_zx_xy = 
OZZ_SSE_SELECT_F(mask_f000, OZZ_SSE_SPLAT_F(_m.cols[2], 1), yz_zx_xy); const __m128 sum = _mm_add_ps(zy_xz_yx, yz_zx_xy); const __m128 diff = _mm_sub_ps(zy_xz_yx, yz_zx_xy); const __m128 scale = _mm_mul_ps(invSqrt, half); const __m128 sum0 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 1, 2, 0)); const __m128 sum1 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 0, 0, 2)); const __m128 sum2 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 0, 0, 1)); __m128 res0 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 0), sum0); __m128 res1 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 1), sum1); __m128 res2 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 2), sum2); res0 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_f000, radicand, res0), OZZ_SSE_SPLAT_F(scale, 0)); res1 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_0f00, radicand, res1), OZZ_SSE_SPLAT_F(scale, 1)); res2 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_00f0, radicand, res2), OZZ_SSE_SPLAT_F(scale, 2)); __m128 res3 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_000f, radicand, diff), OZZ_SSE_SPLAT_F(scale, 3)); const __m128 xx = OZZ_SSE_SPLAT_F(_m.cols[0], 0); const __m128 yy = OZZ_SSE_SPLAT_F(_m.cols[1], 1); const __m128 zz = OZZ_SSE_SPLAT_F(_m.cols[2], 2); const __m128i cond0 = _mm_castps_si128(_mm_cmpgt_ps(yy, xx)); const __m128i cond1 = _mm_castps_si128(_mm_and_ps(_mm_cmpgt_ps(zz, xx), _mm_cmpgt_ps(zz, yy))); const __m128i cond2 = _mm_castps_si128( _mm_cmpgt_ps(OZZ_SSE_SPLAT_F(diag_sum, 0), _mm_castsi128_ps(zero))); __m128 res = OZZ_SSE_SELECT_F(cond0, res1, res0); res = OZZ_SSE_SELECT_F(cond1, res2, res); res = OZZ_SSE_SELECT_F(cond2, res3, res); assert(AreAllTrue1(IsNormalizedEst4(res))); return res; } inline bool ToAffine(const Float4x4& _m, SimdFloat4* _translation, SimdFloat4* _quaternion, SimdFloat4* _scale) { const __m128 zero = _mm_setzero_ps(); const __m128 one = simd_float4::one(); const __m128i fff0 = simd_int4::mask_fff0(); const __m128 max = _mm_set_ps1(kOrthogonalisationToleranceSq); const __m128 min = _mm_set_ps1(-kOrthogonalisationToleranceSq); // 
Extracts translation. *_translation = OZZ_SSE_SELECT_F(fff0, _m.cols[3], one); // Extracts scale. const __m128 m_tmp0 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]); const __m128 m_tmp1 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]); const __m128 m_tmp2 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]); const __m128 m_tmp3 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]); const __m128 m_row0 = _mm_unpacklo_ps(m_tmp0, m_tmp1); const __m128 m_row1 = _mm_unpackhi_ps(m_tmp0, m_tmp1); const __m128 m_row2 = _mm_unpacklo_ps(m_tmp2, m_tmp3); const __m128 dot = OZZ_MADD( m_row0, m_row0, OZZ_MADD(m_row1, m_row1, _mm_mul_ps(m_row2, m_row2))); const __m128 abs_scale = _mm_sqrt_ps(dot); const __m128 zero_axis = _mm_and_ps(_mm_cmplt_ps(dot, max), _mm_cmpgt_ps(dot, min)); // Builds an orthonormal matrix in order to support quaternion extraction. Float4x4 orthonormal; int mask = _mm_movemask_ps(zero_axis); if (mask & 1) { if (mask & 6) { return false; } orthonormal.cols[1] = _mm_div_ps(_m.cols[1], OZZ_SSE_SPLAT_F(abs_scale, 1)); orthonormal.cols[0] = Normalize3(Cross3(orthonormal.cols[1], _m.cols[2])); orthonormal.cols[2] = Normalize3(Cross3(orthonormal.cols[0], orthonormal.cols[1])); } else if (mask & 4) { if (mask & 3) { return false; } orthonormal.cols[0] = _mm_div_ps(_m.cols[0], OZZ_SSE_SPLAT_F(abs_scale, 0)); orthonormal.cols[2] = Normalize3(Cross3(orthonormal.cols[0], _m.cols[1])); orthonormal.cols[1] = Normalize3(Cross3(orthonormal.cols[2], orthonormal.cols[0])); } else { // Favor z axis in the default case if (mask & 5) { return false; } orthonormal.cols[2] = _mm_div_ps(_m.cols[2], OZZ_SSE_SPLAT_F(abs_scale, 2)); orthonormal.cols[1] = Normalize3(Cross3(orthonormal.cols[2], _m.cols[0])); orthonormal.cols[0] = Normalize3(Cross3(orthonormal.cols[1], orthonormal.cols[2])); } orthonormal.cols[3] = simd_float4::w_axis(); // Get back scale signs in case of reflexions const __m128 o_tmp0 = _mm_unpacklo_ps(orthonormal.cols[0], orthonormal.cols[2]); const __m128 o_tmp1 = _mm_unpacklo_ps(orthonormal.cols[1], 
orthonormal.cols[3]); const __m128 o_tmp2 = _mm_unpackhi_ps(orthonormal.cols[0], orthonormal.cols[2]); const __m128 o_tmp3 = _mm_unpackhi_ps(orthonormal.cols[1], orthonormal.cols[3]); const __m128 o_row0 = _mm_unpacklo_ps(o_tmp0, o_tmp1); const __m128 o_row1 = _mm_unpackhi_ps(o_tmp0, o_tmp1); const __m128 o_row2 = _mm_unpacklo_ps(o_tmp2, o_tmp3); const __m128 scale_dot = OZZ_MADD( o_row0, m_row0, OZZ_MADD(o_row1, m_row1, _mm_mul_ps(o_row2, m_row2))); const __m128i cond = _mm_castps_si128(_mm_cmpgt_ps(scale_dot, zero)); const __m128 cfalse = _mm_sub_ps(zero, abs_scale); const __m128 scale = OZZ_SSE_SELECT_F(cond, abs_scale, cfalse); *_scale = OZZ_SSE_SELECT_F(fff0, scale, one); // Extracts quaternion. *_quaternion = ToQuaternion(orthonormal); return true; } inline Float4x4 Float4x4::FromEuler(_SimdFloat4 _v) { const __m128 cos = Cos(_v); const __m128 sin = Sin(_v); const float cx = GetX(cos); const float sx = GetX(sin); const float cy = GetY(cos); const float sy = GetY(sin); const float cz = GetZ(cos); const float sz = GetZ(sin); const float sycz = sy * cz; const float sysz = sy * sz; const Float4x4 ret = {{simd_float4::Load(cx * cy, sx * sz - cx * sycz, cx * sysz + sx * cz, 0.f), simd_float4::Load(sy, cy * cz, -cy * sz, 0.f), simd_float4::Load(-sx * cy, sx * sycz + cx * sz, -sx * sysz + cx * cz, 0.f), simd_float4::w_axis()}}; return ret; } inline Float4x4 Float4x4::FromAxisAngle(_SimdFloat4 _axis, _SimdFloat4 _angle) { assert(AreAllTrue1(IsNormalizedEst3(_axis))); const __m128i zero = _mm_setzero_si128(); const __m128i ffff = _mm_cmpeq_epi32(zero, zero); const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2); const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4)); const __m128 one = _mm_castsi128_ps(ione); const __m128 w_axis = _mm_castsi128_ps(_mm_slli_si128(ione, 12)); const __m128 sin = SplatX(SinX(_angle)); const __m128 cos = SplatX(CosX(_angle)); const __m128 one_minus_cos = _mm_sub_ps(one, cos); const __m128 v0 = 
_mm_mul_ps(_mm_mul_ps(one_minus_cos, OZZ_SHUFFLE_PS1(_axis, _MM_SHUFFLE(3, 0, 2, 1))), OZZ_SHUFFLE_PS1(_axis, _MM_SHUFFLE(3, 1, 0, 2))); const __m128 r0 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(one_minus_cos, _axis), _axis), cos); const __m128 r1 = _mm_add_ps(_mm_mul_ps(sin, _axis), v0); const __m128 r2 = _mm_sub_ps(v0, _mm_mul_ps(sin, _axis)); const __m128 r0fff0 = _mm_and_ps(r0, fff0); const __m128 r1r22120 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(2, 1, 2, 0)); const __m128 v1 = OZZ_SHUFFLE_PS1(r1r22120, _MM_SHUFFLE(0, 3, 2, 1)); const __m128 r1r20011 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1)); const __m128 v2 = OZZ_SHUFFLE_PS1(r1r20011, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 t0 = _mm_shuffle_ps(r0fff0, v1, _MM_SHUFFLE(1, 0, 3, 0)); const __m128 t1 = _mm_shuffle_ps(r0fff0, v1, _MM_SHUFFLE(3, 2, 3, 1)); const Float4x4 ret = {{OZZ_SHUFFLE_PS1(t0, _MM_SHUFFLE(1, 3, 2, 0)), OZZ_SHUFFLE_PS1(t1, _MM_SHUFFLE(1, 3, 0, 2)), _mm_shuffle_ps(v2, r0fff0, _MM_SHUFFLE(3, 2, 1, 0)), w_axis}}; return ret; } inline Float4x4 Float4x4::FromQuaternion(_SimdFloat4 _quaternion) { assert(AreAllTrue1(IsNormalizedEst4(_quaternion))); const __m128i zero = _mm_setzero_si128(); const __m128i ffff = _mm_cmpeq_epi32(zero, zero); const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2); const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4)); const __m128 c1110 = _mm_castsi128_ps(_mm_srli_si128(ione, 4)); const __m128 w_axis = _mm_castsi128_ps(_mm_slli_si128(ione, 12)); const __m128 vsum = _mm_add_ps(_quaternion, _quaternion); const __m128 vms = _mm_mul_ps(_quaternion, vsum); const __m128 r0 = _mm_sub_ps( _mm_sub_ps( c1110, _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 0, 0, 1)), fff0)), _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 1, 2, 2)), fff0)); const __m128 v0 = _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 1, 0, 0)), OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 2, 1, 2))); const __m128 v1 = _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 3, 3, 3)), 
OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 0, 2, 1))); const __m128 r1 = _mm_add_ps(v0, v1); const __m128 r2 = _mm_sub_ps(v0, v1); const __m128 r1r21021 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 2, 1)); const __m128 v2 = OZZ_SHUFFLE_PS1(r1r21021, _MM_SHUFFLE(1, 3, 2, 0)); const __m128 r1r22200 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(2, 2, 0, 0)); const __m128 v3 = OZZ_SHUFFLE_PS1(r1r22200, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 q0 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(1, 0, 3, 0)); const __m128 q1 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(3, 2, 3, 1)); const Float4x4 ret = {{OZZ_SHUFFLE_PS1(q0, _MM_SHUFFLE(1, 3, 2, 0)), OZZ_SHUFFLE_PS1(q1, _MM_SHUFFLE(1, 3, 0, 2)), _mm_shuffle_ps(v3, r0, _MM_SHUFFLE(3, 2, 1, 0)), w_axis}}; return ret; } inline Float4x4 Float4x4::FromAffine(_SimdFloat4 _translation, _SimdFloat4 _quaternion, _SimdFloat4 _scale) { assert(AreAllTrue1(IsNormalizedEst4(_quaternion))); const __m128i zero = _mm_setzero_si128(); const __m128i ffff = _mm_cmpeq_epi32(zero, zero); const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2); const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4)); const __m128 c1110 = _mm_castsi128_ps(_mm_srli_si128(ione, 4)); const __m128 vsum = _mm_add_ps(_quaternion, _quaternion); const __m128 vms = _mm_mul_ps(_quaternion, vsum); const __m128 r0 = _mm_sub_ps( _mm_sub_ps( c1110, _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 0, 0, 1)), fff0)), _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 1, 2, 2)), fff0)); const __m128 v0 = _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 1, 0, 0)), OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 2, 1, 2))); const __m128 v1 = _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 3, 3, 3)), OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 0, 2, 1))); const __m128 r1 = _mm_add_ps(v0, v1); const __m128 r2 = _mm_sub_ps(v0, v1); const __m128 r1r21021 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 2, 1)); const __m128 v2 = OZZ_SHUFFLE_PS1(r1r21021, _MM_SHUFFLE(1, 3, 2, 0)); const __m128 r1r22200 = _mm_shuffle_ps(r1, 
r2, _MM_SHUFFLE(2, 2, 0, 0)); const __m128 v3 = OZZ_SHUFFLE_PS1(r1r22200, _MM_SHUFFLE(2, 0, 2, 0)); const __m128 q0 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(1, 0, 3, 0)); const __m128 q1 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(3, 2, 3, 1)); const Float4x4 ret = { {_mm_mul_ps(OZZ_SHUFFLE_PS1(q0, _MM_SHUFFLE(1, 3, 2, 0)), OZZ_SSE_SPLAT_F(_scale, 0)), _mm_mul_ps(OZZ_SHUFFLE_PS1(q1, _MM_SHUFFLE(1, 3, 0, 2)), OZZ_SSE_SPLAT_F(_scale, 1)), _mm_mul_ps(_mm_shuffle_ps(v3, r0, _MM_SHUFFLE(3, 2, 1, 0)), OZZ_SSE_SPLAT_F(_scale, 2)), _mm_movelh_ps(_translation, _mm_unpackhi_ps(_translation, c1110))}}; return ret; } OZZ_INLINE ozz::math::SimdFloat4 TransformPoint(const ozz::math::Float4x4& _m, ozz::math::_SimdFloat4 _v) { const __m128 xxxx = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 0), _m.cols[0]); const __m128 a23 = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 2), _m.cols[2], _m.cols[3]); const __m128 a01 = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 1), _m.cols[1], xxxx); return _mm_add_ps(a01, a23); } OZZ_INLINE ozz::math::SimdFloat4 TransformVector(const ozz::math::Float4x4& _m, ozz::math::_SimdFloat4 _v) { const __m128 xxxx = _mm_mul_ps(_m.cols[0], OZZ_SSE_SPLAT_F(_v, 0)); const __m128 zzzz = _mm_mul_ps(_m.cols[1], OZZ_SSE_SPLAT_F(_v, 1)); const __m128 a21 = OZZ_MADD(_m.cols[2], OZZ_SSE_SPLAT_F(_v, 2), xxxx); return _mm_add_ps(zzzz, a21); } OZZ_INLINE ozz::math::SimdFloat4 operator*(const ozz::math::Float4x4& _m, ozz::math::_SimdFloat4 _v) { const __m128 xxxx = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 0), _m.cols[0]); const __m128 zzzz = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 2), _m.cols[2]); const __m128 a01 = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 1), _m.cols[1], xxxx); const __m128 a23 = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 3), _m.cols[3], zzzz); return _mm_add_ps(a01, a23); } inline ozz::math::Float4x4 operator*(const ozz::math::Float4x4& _a, const ozz::math::Float4x4& _b) { ozz::math::Float4x4 ret; { const __m128 xxxx = _mm_mul_ps(OZZ_SSE_SPLAT_F(_b.cols[0], 0), _a.cols[0]); const __m128 zzzz = _mm_mul_ps(OZZ_SSE_SPLAT_F(_b.cols[0], 2), _a.cols[2]); 
const __m128 a01 = OZZ_MADD(OZZ_SSE_SPLAT_F(_b.cols[0], 1), _a.cols[1], xxxx); const __m128 a23 = OZZ_MADD(OZZ_SSE_SPLAT_F(_b.cols[0], 3), _a.cols[3], zzzz); ret.cols[0] = _mm_add_ps(a01, a23); } { const __m128 xxxx = _mm_mul_ps(OZZ_SSE_SPLAT_F(_b.cols[1], 0), _a.cols[0]); const __m128 zzzz = _mm_mul_ps(OZZ_SSE_SPLAT_F(_b.cols[1], 2), _a.cols[2]); const __m128 a01 = OZZ_MADD(OZZ_SSE_SPLAT_F(_b.cols[1], 1), _a.cols[1], xxxx); const __m128 a23 = OZZ_MADD(OZZ_SSE_SPLAT_F(_b.cols[1], 3), _a.cols[3], zzzz); ret.cols[1] = _mm_add_ps(a01, a23); } { const __m128 xxxx = _mm_mul_ps(OZZ_SSE_SPLAT_F(_b.cols[2], 0), _a.cols[0]); const __m128 zzzz = _mm_mul_ps(OZZ_SSE_SPLAT_F(_b.cols[2], 2), _a.cols[2]); const __m128 a01 = OZZ_MADD(OZZ_SSE_SPLAT_F(_b.cols[2], 1), _a.cols[1], xxxx); const __m128 a23 = OZZ_MADD(OZZ_SSE_SPLAT_F(_b.cols[2], 3), _a.cols[3], zzzz); ret.cols[2] = _mm_add_ps(a01, a23); } { const __m128 xxxx = _mm_mul_ps(OZZ_SSE_SPLAT_F(_b.cols[3], 0), _a.cols[0]); const __m128 zzzz = _mm_mul_ps(OZZ_SSE_SPLAT_F(_b.cols[3], 2), _a.cols[2]); const __m128 a01 = OZZ_MADD(OZZ_SSE_SPLAT_F(_b.cols[3], 1), _a.cols[1], xxxx); const __m128 a23 = OZZ_MADD(OZZ_SSE_SPLAT_F(_b.cols[3], 3), _a.cols[3], zzzz); ret.cols[3] = _mm_add_ps(a01, a23); } return ret; } OZZ_INLINE ozz::math::Float4x4 operator+(const ozz::math::Float4x4& _a, const ozz::math::Float4x4& _b) { const ozz::math::Float4x4 ret = { {_mm_add_ps(_a.cols[0], _b.cols[0]), _mm_add_ps(_a.cols[1], _b.cols[1]), _mm_add_ps(_a.cols[2], _b.cols[2]), _mm_add_ps(_a.cols[3], _b.cols[3])}}; return ret; } OZZ_INLINE ozz::math::Float4x4 operator-(const ozz::math::Float4x4& _a, const ozz::math::Float4x4& _b) { const ozz::math::Float4x4 ret = { {_mm_sub_ps(_a.cols[0], _b.cols[0]), _mm_sub_ps(_a.cols[1], _b.cols[1]), _mm_sub_ps(_a.cols[2], _b.cols[2]), _mm_sub_ps(_a.cols[3], _b.cols[3])}}; return ret; } } // namespace math } // namespace ozz #if !defined(OZZ_DISABLE_SSE_NATIVE_OPERATORS) OZZ_INLINE ozz::math::SimdFloat4 
operator+(ozz::math::_SimdFloat4 _a, ozz::math::_SimdFloat4 _b) { return _mm_add_ps(_a, _b); } OZZ_INLINE ozz::math::SimdFloat4 operator-(ozz::math::_SimdFloat4 _a, ozz::math::_SimdFloat4 _b) { return _mm_sub_ps(_a, _b); } OZZ_INLINE ozz::math::SimdFloat4 operator-(ozz::math::_SimdFloat4 _v) { return _mm_sub_ps(_mm_setzero_ps(), _v); } OZZ_INLINE ozz::math::SimdFloat4 operator*(ozz::math::_SimdFloat4 _a, ozz::math::_SimdFloat4 _b) { return _mm_mul_ps(_a, _b); } OZZ_INLINE ozz::math::SimdFloat4 operator/(ozz::math::_SimdFloat4 _a, ozz::math::_SimdFloat4 _b) { return _mm_div_ps(_a, _b); } #endif // !defined(OZZ_DISABLE_SSE_NATIVE_OPERATORS) namespace ozz { namespace math { OZZ_INLINE uint16_t FloatToHalf(float _f) { const int h = _mm_cvtsi128_si32(FloatToHalf(_mm_set1_ps(_f))); return static_cast(h); } OZZ_INLINE float HalfToFloat(uint16_t _h) { return _mm_cvtss_f32(HalfToFloat(_mm_set1_epi32(_h))); } // Half <-> Float implementation is based on: // http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/. 
inline SimdInt4 FloatToHalf(_SimdFloat4 _f) { const __m128i mask_sign = _mm_set1_epi32(0x80000000u); const __m128i mask_round = _mm_set1_epi32(~0xfffu); const __m128i f32infty = _mm_set1_epi32(255 << 23); const __m128 magic = _mm_castsi128_ps(_mm_set1_epi32(15 << 23)); const __m128i nanbit = _mm_set1_epi32(0x200); const __m128i infty_as_fp16 = _mm_set1_epi32(0x7c00); const __m128 clamp = _mm_castsi128_ps(_mm_set1_epi32((31 << 23) - 0x1000)); const __m128 msign = _mm_castsi128_ps(mask_sign); const __m128 justsign = _mm_and_ps(msign, _f); const __m128 absf = _mm_xor_ps(_f, justsign); const __m128 mround = _mm_castsi128_ps(mask_round); const __m128i absf_int = _mm_castps_si128(absf); const __m128i b_isnan = _mm_cmpgt_epi32(absf_int, f32infty); const __m128i b_isnormal = _mm_cmpgt_epi32(f32infty, _mm_castps_si128(absf)); const __m128i inf_or_nan = _mm_or_si128(_mm_and_si128(b_isnan, nanbit), infty_as_fp16); const __m128 fnosticky = _mm_and_ps(absf, mround); const __m128 scaled = _mm_mul_ps(fnosticky, magic); // Logically, we want PMINSD on "biased", but this should gen better code const __m128 clamped = _mm_min_ps(scaled, clamp); const __m128i biased = _mm_sub_epi32(_mm_castps_si128(clamped), _mm_castps_si128(mround)); const __m128i shifted = _mm_srli_epi32(biased, 13); const __m128i normal = _mm_and_si128(shifted, b_isnormal); const __m128i not_normal = _mm_andnot_si128(b_isnormal, inf_or_nan); const __m128i joined = _mm_or_si128(normal, not_normal); const __m128i sign_shift = _mm_srli_epi32(_mm_castps_si128(justsign), 16); return _mm_or_si128(joined, sign_shift); } OZZ_INLINE SimdFloat4 HalfToFloat(_SimdInt4 _h) { const __m128i mask_nosign = _mm_set1_epi32(0x7fff); const __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((254 - 15) << 23)); const __m128i was_infnan = _mm_set1_epi32(0x7bff); const __m128 exp_infnan = _mm_castsi128_ps(_mm_set1_epi32(255 << 23)); const __m128i expmant = _mm_and_si128(mask_nosign, _h); const __m128i shifted = _mm_slli_epi32(expmant, 13); 
const __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), magic); const __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant, was_infnan); const __m128i sign = _mm_slli_epi32(_mm_xor_si128(_h, expmant), 16); const __m128 infnanexp = _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), exp_infnan); const __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp); return _mm_or_ps(scaled, sign_inf); } } // namespace math } // namespace ozz #undef OZZ_SHUFFLE_PS1 #undef OZZ_SSE_SPLAT_F #undef OZZ_SSE_HADD2_F #undef OZZ_SSE_HADD3_F #undef OZZ_SSE_HADD4_F #undef OZZ_SSE_DOT2_F #undef OZZ_SSE_DOT3_F #undef OZZ_SSE_DOT4_F #undef OZZ_MADD #undef OZZ_MSUB #undef OZZ_NMADD #undef OZZ_NMSUB #undef OZZ_MADDX #undef OZZ_MSUBX #undef OZZ_NMADDX #undef OZZ_NMSUBX #undef OZZ_SSE_SELECT_F #undef OZZ_SSE_SPLAT_I #undef OZZ_SSE_SELECT_I #endif // OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_