//----------------------------------------------------------------------------//
//                                                                            //
// ozz-animation is hosted at http://github.com/guillaumeblanc/ozz-animation  //
// and distributed under the MIT License (MIT).                               //
//                                                                            //
// Copyright (c) Guillaume Blanc                                              //
//                                                                            //
// Permission is hereby granted, free of charge, to any person obtaining a    //
// copy of this software and associated documentation files (the "Software"), //
// to deal in the Software without restriction, including without limitation  //
// the rights to use, copy, modify, merge, publish, distribute, sublicense,   //
// and/or sell copies of the Software, and to permit persons to whom the      //
// Software is furnished to do so, subject to the following conditions:       //
//                                                                            //
// The above copyright notice and this permission notice shall be included in //
// all copies or substantial portions of the Software.                        //
//                                                                            //
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR //
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   //
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    //
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER //
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING    //
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER        //
// DEALINGS IN THE SOFTWARE.                                                  //
//                                                                            //
//----------------------------------------------------------------------------//

#ifndef OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_
#define OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_

// SIMD SSE2+ implementation, based on scalar floats.

#include <stdint.h>
#include <cassert>

// Temporarily needed while trigonometric functions aren't implemented.
#include <cmath>

#include "ozz/base/maths/math_constant.h"

namespace ozz {
namespace math {

namespace simd_float4 {

// Internal macros.
// Unused components of the result vector are replicated from the first input
// argument.

// Single-source shuffle: reorders the lanes of _v according to the immediate
// mask _m. Uses _mm_permute_ps when OZZ_SIMD_AVX is defined, otherwise
// _mm_shuffle_ps with _v given as both operands.
#ifdef OZZ_SIMD_AVX
#define OZZ_SHUFFLE_PS1(_v, _m) _mm_permute_ps(_v, _m)
#else  // OZZ_SIMD_AVX
#define OZZ_SHUFFLE_PS1(_v, _m) _mm_shuffle_ps(_v, _v, _m)
#endif  // OZZ_SIMD_AVX

// Replicates float lane _i of _v into all four lanes.
#define OZZ_SSE_SPLAT_F(_v, _i) OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(_i, _i, _i, _i))

// Replicates 32-bit integer lane _i of _v into all four lanes.
#define OZZ_SSE_SPLAT_I(_v, _i) \
  _mm_shuffle_epi32(_v, _MM_SHUFFLE(_i, _i, _i, _i))

// Horizontal add of the first two lanes. This is an expression macro.
// Result: _v.x + _v.y, _v.y, _v.z, _v.w
#define OZZ_SSE_HADD2_F(_v) _mm_add_ss(_v, OZZ_SSE_SPLAT_F(_v, 1))

// Horizontal add of the first three lanes. This is an expression macro.
// Result: _v.x + _v.y + _v.z, _v.y, _v.z, _v.w
#define OZZ_SSE_HADD3_F(_v) \
  _mm_add_ss(_mm_add_ss(_v, OZZ_SSE_SPLAT_F(_v, 2)), OZZ_SSE_SPLAT_F(_v, 1))

// Horizontal add of all four lanes. This is a statement macro: the sum is
// assigned to _r, it is not an expression. It declares a local named
// "haddxyzw", so _v and _r must not use that name.
// Result: _v.x + _v.y + _v.z + _v.w, ?, ?, ?
#define OZZ_SSE_HADD4_F(_v, _r)                                    \
  do {                                                             \
    const __m128 haddxyzw = _mm_add_ps(_v, _mm_movehl_ps(_v, _v)); \
    _r = _mm_add_ss(haddxyzw, OZZ_SSE_SPLAT_F(haddxyzw, 1));       \
  } while (void(0), 0)

// Dot product macros. All of them are statement macros that assign the result
// to _r; only the x lane of _r is meaningful. The non-SSE4.1 variants declare
// a local named "ab", so arguments must not use that name.

// 2-component dot product of _a and _b.
// Result: dot2, ?, ?, ?
#define OZZ_SSE_DOT2_F(_a, _b, _r)               \
  do {                                           \
    const __m128 ab = _mm_mul_ps(_a, _b);        \
    _r = _mm_add_ss(ab, OZZ_SSE_SPLAT_F(ab, 1)); \
                                                 \
  } while (void(0), 0)

#ifdef OZZ_SIMD_SSE4_1
// 3-component dot product, using the SSE4.1 _mm_dp_ps instruction.
// Result: dot3, ?, ?, ?
#define OZZ_SSE_DOT3_F(_a, _b, _r) \
  do {                             \
    _r = _mm_dp_ps(_a, _b, 0x7f);  \
  } while (void(0), 0)

// 4-component dot product, using the SSE4.1 _mm_dp_ps instruction.
// Result: dot4, ?, ?, ?
#define OZZ_SSE_DOT4_F(_a, _b, _r) \
  do {                             \
    _r = _mm_dp_ps(_a, _b, 0xff);  \
  } while (void(0), 0)

#else  // OZZ_SIMD_SSE4_1
// 3-component dot product: component-wise multiply followed by a horizontal
// add of the first three lanes.
// Result: dot3, ?, ?, ?
#define OZZ_SSE_DOT3_F(_a, _b, _r)        \
  do {                                    \
    const __m128 ab = _mm_mul_ps(_a, _b); \
    _r = OZZ_SSE_HADD3_F(ab);             \
  } while (void(0), 0)

// 4-component dot product: component-wise multiply followed by a horizontal
// add of all four lanes.
// Result: dot4, ?, ?, ?
#define OZZ_SSE_DOT4_F(_a, _b, _r)        \
  do {                                    \
    const __m128 ab = _mm_mul_ps(_a, _b); \
    OZZ_SSE_HADD4_F(ab, _r);              \
  } while (void(0), 0)
#endif  // OZZ_SIMD_SSE4_1

// FMA operations
//   OZZ_MADD  :   _a * _b + _c
//   OZZ_MSUB  :   _a * _b - _c
//   OZZ_NMADD : -(_a * _b) + _c
//   OZZ_NMSUB : -(_a * _b) - _c
// The *X variants use the scalar (_ss) intrinsics, which operate on the x lane.
// With OZZ_SIMD_FMA the hardware fused instructions are used; otherwise the
// operations are emulated with separate multiply and add/subtract intrinsics.
#ifdef OZZ_SIMD_FMA
#define OZZ_MADD(_a, _b, _c) _mm_fmadd_ps(_a, _b, _c)
#define OZZ_MSUB(_a, _b, _c) _mm_fmsub_ps(_a, _b, _c)
#define OZZ_NMADD(_a, _b, _c) _mm_fnmadd_ps(_a, _b, _c)
#define OZZ_NMSUB(_a, _b, _c) _mm_fnmsub_ps(_a, _b, _c)
#define OZZ_MADDX(_a, _b, _c) _mm_fmadd_ss(_a, _b, _c)
#define OZZ_MSUBX(_a, _b, _c) _mm_fmsub_ss(_a, _b, _c)
#define OZZ_NMADDX(_a, _b, _c) _mm_fnmadd_ss(_a, _b, _c)
#define OZZ_NMSUBX(_a, _b, _c) _mm_fnmsub_ss(_a, _b, _c)
#else  //  OZZ_SIMD_FMA
#define OZZ_MADD(_a, _b, _c) _mm_add_ps(_mm_mul_ps(_a, _b), _c)
#define OZZ_MSUB(_a, _b, _c) _mm_sub_ps(_mm_mul_ps(_a, _b), _c)
#define OZZ_NMADD(_a, _b, _c) _mm_sub_ps(_c, _mm_mul_ps(_a, _b))
// NOTE(review): the two NMSUB fallbacks apply unary minus to a __m128 value,
// which needs a compiler vector extension or an operator- overload in scope.
#define OZZ_NMSUB(_a, _b, _c) (-_mm_add_ps(_mm_mul_ps(_a, _b), _c))
#define OZZ_MADDX(_a, _b, _c) _mm_add_ss(_mm_mul_ss(_a, _b), _c)
#define OZZ_MSUBX(_a, _b, _c) _mm_sub_ss(_mm_mul_ss(_a, _b), _c)
#define OZZ_NMADDX(_a, _b, _c) _mm_sub_ss(_c, _mm_mul_ss(_a, _b))
#define OZZ_NMSUBX(_a, _b, _c) (-_mm_add_ss(_mm_mul_ss(_a, _b), _c))
#endif  // OZZ_SIMD_FMA

// Scalar division of the x lanes (_a.x / _b.x) through _mm_div_ss. The
// remaining lanes of the result come from _a.
OZZ_INLINE SimdFloat4 DivX(_SimdFloat4 _a, _SimdFloat4 _b) {
  const SimdFloat4 quotient = _mm_div_ss(_a, _b);
  return quotient;
}

// Selection macros: pick _true where the mask _b is set and _false elsewhere.
// _b is an integer (__m128i) mask in both variants.
#ifdef OZZ_SIMD_SSE4_1

// SSE4.1 path, implemented with the blendv instructions.
#define OZZ_SSE_SELECT_F(_b, _true, _false) \
  _mm_blendv_ps(_false, _true, _mm_castsi128_ps(_b))

#define OZZ_SSE_SELECT_I(_b, _true, _false) _mm_blendv_epi8(_false, _true, _b)

#else  // OZZ_SIMD_SSE4_1

// Fallback path: (_true & _b) | (~_b & _false). Note that _b is expanded twice.
#define OZZ_SSE_SELECT_F(_b, _true, _false)          \
  _mm_or_ps(_mm_and_ps(_true, _mm_castsi128_ps(_b)), \
            _mm_andnot_ps(_mm_castsi128_ps(_b), _false))

#define OZZ_SSE_SELECT_I(_b, _true, _false) \
  _mm_or_si128(_mm_and_si128(_true, _b), _mm_andnot_si128(_b, _false))

#endif  // OZZ_SIMD_SSE4_1

// Returns a vector whose four lanes are all 0.f.
OZZ_INLINE SimdFloat4 zero() {
  const SimdFloat4 zeros = _mm_setzero_ps();
  return zeros;
}

// Returns a vector whose four lanes are all 1.f. The constant is synthesized
// without a memory load: an all-ones mask shifted left by 25 then right by 2
// leaves 0x3f800000 (the bit pattern of 1.f) in each 32-bit lane.
OZZ_INLINE SimdFloat4 one() {
  const __m128i seed = _mm_setzero_si128();
  const __m128i all_bits = _mm_cmpeq_epi32(seed, seed);
  const __m128i one_bits = _mm_srli_epi32(_mm_slli_epi32(all_bits, 25), 2);
  return _mm_castsi128_ps(one_bits);
}

// Returns (1.f, 0.f, 0.f, 0.f). A register of 1.f values is built first, then
// a 12-byte right shift leaves a single 1.f in the x lane.
OZZ_INLINE SimdFloat4 x_axis() {
  const __m128i seed = _mm_setzero_si128();
  const __m128i all_bits = _mm_cmpeq_epi32(seed, seed);
  const __m128i ones = _mm_srli_epi32(_mm_slli_epi32(all_bits, 25), 2);
  const __m128i x_only = _mm_srli_si128(ones, 12);
  return _mm_castsi128_ps(x_only);
}

// Returns (0.f, 1.f, 0.f, 0.f). A single 1.f is isolated in the x lane, then
// moved up by one lane (4 bytes).
OZZ_INLINE SimdFloat4 y_axis() {
  const __m128i seed = _mm_setzero_si128();
  const __m128i all_bits = _mm_cmpeq_epi32(seed, seed);
  const __m128i ones = _mm_srli_epi32(_mm_slli_epi32(all_bits, 25), 2);
  const __m128i x_only = _mm_srli_si128(ones, 12);
  const __m128i y_only = _mm_slli_si128(x_only, 4);
  return _mm_castsi128_ps(y_only);
}

// Returns (0.f, 0.f, 1.f, 0.f). A single 1.f is isolated in the x lane, then
// moved up by two lanes (8 bytes).
OZZ_INLINE SimdFloat4 z_axis() {
  const __m128i seed = _mm_setzero_si128();
  const __m128i all_bits = _mm_cmpeq_epi32(seed, seed);
  const __m128i ones = _mm_srli_epi32(_mm_slli_epi32(all_bits, 25), 2);
  const __m128i x_only = _mm_srli_si128(ones, 12);
  const __m128i z_only = _mm_slli_si128(x_only, 8);
  return _mm_castsi128_ps(z_only);
}

// Returns (0.f, 0.f, 0.f, 1.f). A register of 1.f values is shifted left by
// 12 bytes so that only the w lane keeps its 1.f.
OZZ_INLINE SimdFloat4 w_axis() {
  const __m128i seed = _mm_setzero_si128();
  const __m128i all_bits = _mm_cmpeq_epi32(seed, seed);
  const __m128i ones = _mm_srli_epi32(_mm_slli_epi32(all_bits, 25), 2);
  const __m128i w_only = _mm_slli_si128(ones, 12);
  return _mm_castsi128_ps(w_only);
}

// Builds the vector (_x, _y, _z, _w).
OZZ_INLINE SimdFloat4 Load(float _x, float _y, float _z, float _w) {
  // _mm_set_ps lists its arguments from the highest lane down to the lowest.
  const SimdFloat4 xyzw = _mm_set_ps(_w, _z, _y, _x);
  return xyzw;
}

// Builds the vector (_x, 0.f, 0.f, 0.f).
OZZ_INLINE SimdFloat4 LoadX(float _x) {
  const SimdFloat4 x000 = _mm_set_ss(_x);
  return x000;
}

// Builds the vector (_x, _x, _x, _x).
OZZ_INLINE SimdFloat4 Load1(float _x) {
  const SimdFloat4 xxxx = _mm_set_ps1(_x);
  return xxxx;
}

// Loads four consecutive floats from _f, which must be 16-byte aligned
// (checked by the assertion).
OZZ_INLINE SimdFloat4 LoadPtr(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  const SimdFloat4 loaded = _mm_load_ps(_f);
  return loaded;
}

// Loads four consecutive floats from _f. Only 4-byte (float) alignment is
// required, which the assertion checks.
OZZ_INLINE SimdFloat4 LoadPtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  const SimdFloat4 loaded = _mm_loadu_ps(_f);
  return loaded;
}

// Loads a single float from _f into the x lane; the other lanes are zero.
// _f must be 4-byte aligned (checked by the assertion).
OZZ_INLINE SimdFloat4 LoadXPtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  const SimdFloat4 x000 = _mm_load_ss(_f);
  return x000;
}

// Loads a single float from _f and replicates it to all four lanes.
// _f must be 4-byte aligned (checked by the assertion).
OZZ_INLINE SimdFloat4 Load1PtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  const SimdFloat4 xxxx = _mm_load_ps1(_f);
  return xxxx;
}

// Loads two consecutive floats from _f into the x and y lanes; z and w are
// zero. _f must be 4-byte aligned (checked by the assertion).
OZZ_INLINE SimdFloat4 Load2PtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  const SimdFloat4 first = _mm_load_ss(_f + 0);
  const SimdFloat4 second = _mm_load_ss(_f + 1);
  // Interleaving the two low halves yields (first.x, second.x, 0, 0).
  return _mm_unpacklo_ps(first, second);
}

// Loads three consecutive floats from _f into the x, y and z lanes; w is
// zero. _f must be 4-byte aligned (checked by the assertion).
OZZ_INLINE SimdFloat4 Load3PtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  const SimdFloat4 first = _mm_load_ss(_f + 0);
  const SimdFloat4 second = _mm_load_ss(_f + 1);
  const SimdFloat4 third = _mm_load_ss(_f + 2);
  const SimdFloat4 first_second = _mm_unpacklo_ps(first, second);
  // Low half from (first, second, ..), high half from (third, 0, ..).
  return _mm_movelh_ps(first_second, third);
}

// Converts each 32-bit integer lane of _i to a float.
OZZ_INLINE SimdFloat4 FromInt(_SimdInt4 _i) {
  const SimdFloat4 converted = _mm_cvtepi32_ps(_i);
  return converted;
}
}  // namespace simd_float4

// Extracts the x lane of _v as a scalar.
OZZ_INLINE float GetX(_SimdFloat4 _v) {
  const float x = _mm_cvtss_f32(_v);
  return x;
}

// Extracts the y lane of _v as a scalar.
OZZ_INLINE float GetY(_SimdFloat4 _v) {
  // Bring y into the x lane so that it can be read as a scalar.
  const auto yyyy = OZZ_SSE_SPLAT_F(_v, 1);
  return _mm_cvtss_f32(yyyy);
}

// Extracts the z lane of _v as a scalar.
OZZ_INLINE float GetZ(_SimdFloat4 _v) {
  // Moving the high half down puts z into the x lane.
  const auto zwzw = _mm_movehl_ps(_v, _v);
  return _mm_cvtss_f32(zwzw);
}

// Extracts the w lane of _v as a scalar.
OZZ_INLINE float GetW(_SimdFloat4 _v) {
  // Bring w into the x lane so that it can be read as a scalar.
  const auto wwww = OZZ_SSE_SPLAT_F(_v, 3);
  return _mm_cvtss_f32(wwww);
}

// Returns (_f.x, _v.y, _v.z, _v.w).
OZZ_INLINE SimdFloat4 SetX(_SimdFloat4 _v, _SimdFloat4 _f) {
  const SimdFloat4 replaced = _mm_move_ss(_v, _f);
  return replaced;
}

// Returns (_v.x, _f.x, _v.z, _v.w).
OZZ_INLINE SimdFloat4 SetY(_SimdFloat4 _v, _SimdFloat4 _f) {
  // Interleave the low halves: (_v.x, _f.x, _v.y, _f.y).
  const __m128 vx_fx = _mm_unpacklo_ps(_v, _f);
  // Keep the first two lanes of the interleave and the z, w lanes of _v.
  const __m128 replaced = _mm_shuffle_ps(vx_fx, _v, _MM_SHUFFLE(3, 2, 1, 0));
  return replaced;
}

// Returns (_v.x, _v.y, _f.x, _v.w).
OZZ_INLINE SimdFloat4 SetZ(_SimdFloat4 _v, _SimdFloat4 _f) {
  // Gather the two lanes needed for the upper half: (_f.x, _f.x, _v.w, _v.w).
  const __m128 fx_vw = _mm_shuffle_ps(_f, _v, _MM_SHUFFLE(3, 3, 0, 0));
  // Lower half from _v (x, y); upper half picks _f.x then _v.w.
  const __m128 replaced = _mm_shuffle_ps(_v, fx_vw, _MM_SHUFFLE(2, 0, 1, 0));
  return replaced;
}

// Returns (_v.x, _v.y, _v.z, _f.x).
OZZ_INLINE SimdFloat4 SetW(_SimdFloat4 _v, _SimdFloat4 _f) {
  // Gather the two lanes needed for the upper half: (_f.x, _f.x, _v.z, _v.z).
  const __m128 fx_vz = _mm_shuffle_ps(_f, _v, _MM_SHUFFLE(2, 2, 0, 0));
  // Lower half from _v (x, y); upper half picks _v.z then _f.x.
  const __m128 replaced = _mm_shuffle_ps(_v, fx_vz, _MM_SHUFFLE(0, 2, 1, 0));
  return replaced;
}

// Returns a copy of _v whose lane number _ith (0 to 3) is replaced by _f.x.
// The lane index is a runtime value, so the vector is accessed through a
// float array overlay rather than through shuffles.
OZZ_INLINE SimdFloat4 SetI(_SimdFloat4 _v, _SimdFloat4 _f, int _ith) {
  assert(_ith >= 0 && _ith <= 3 && "Invalid index, out of range.");
  union Overlay {
    SimdFloat4 vec;
    float lane[4];
  };
  Overlay overlay = {_v};
  const float replacement = _mm_cvtss_f32(_f);
  overlay.lane[_ith] = replacement;
  return overlay.vec;
}

// Writes the four lanes of _v to _f[0..3]. _f must be 16-byte aligned
// (checked by the assertion).
OZZ_INLINE void StorePtr(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  float* const dest = _f;
  _mm_store_ps(dest, _v);
}

// Writes the x lane of _v to _f[0]. _f must be 16-byte aligned (checked by
// the assertion).
OZZ_INLINE void Store1Ptr(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  float* const dest = _f;
  _mm_store_ss(dest, _v);
}

// Writes the x and y lanes of _v to _f[0..1]. _f must be 16-byte aligned
// (checked by the assertion).
OZZ_INLINE void Store2Ptr(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  // The low 64 bits (x, y) are stored in one operation.
  __m64* const dest_xy = reinterpret_cast<__m64*>(_f);
  _mm_storel_pi(dest_xy, _v);
}

// Writes the x, y and z lanes of _v to _f[0..2]. _f must be 16-byte aligned
// (checked by the assertion).
OZZ_INLINE void Store3Ptr(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  // x and y are stored together as the low 64 bits.
  __m64* const dest_xy = reinterpret_cast<__m64*>(_f);
  _mm_storel_pi(dest_xy, _v);
  // z is moved down to the x lane, then stored as a scalar.
  const auto zwzw = _mm_movehl_ps(_v, _v);
  _mm_store_ss(_f + 2, zwzw);
}

// Writes the four lanes of _v to _f[0..3]. Only 4-byte (float) alignment is
// required, which the assertion checks.
OZZ_INLINE void StorePtrU(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  float* const dest = _f;
  _mm_storeu_ps(dest, _v);
}

// Writes the x lane of _v to _f[0]. _f must be 4-byte aligned (checked by
// the assertion).
OZZ_INLINE void Store1PtrU(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  float* const dest = _f;
  _mm_store_ss(dest, _v);
}

// Writes the x and y lanes of _v to _f[0..1], one scalar store per lane.
// _f must be 4-byte aligned (checked by the assertion).
OZZ_INLINE void Store2PtrU(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  _mm_store_ss(_f + 0, _v);
  const auto yyyy = OZZ_SSE_SPLAT_F(_v, 1);
  _mm_store_ss(_f + 1, yyyy);
}

// Writes the x, y and z lanes of _v to _f[0..2], one scalar store per lane.
// _f must be 4-byte aligned (checked by the assertion).
OZZ_INLINE void Store3PtrU(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  _mm_store_ss(_f + 0, _v);
  const auto yyyy = OZZ_SSE_SPLAT_F(_v, 1);
  _mm_store_ss(_f + 1, yyyy);
  const auto zwzw = _mm_movehl_ps(_v, _v);
  _mm_store_ss(_f + 2, zwzw);
}

// Returns (_v.x, _v.x, _v.x, _v.x).
OZZ_INLINE SimdFloat4 SplatX(_SimdFloat4 _v) {
  const SimdFloat4 xxxx = OZZ_SSE_SPLAT_F(_v, 0);
  return xxxx;
}

// Returns (_v.y, _v.y, _v.y, _v.y).
OZZ_INLINE SimdFloat4 SplatY(_SimdFloat4 _v) {
  const SimdFloat4 yyyy = OZZ_SSE_SPLAT_F(_v, 1);
  return yyyy;
}

// Returns (_v.z, _v.z, _v.z, _v.z).
OZZ_INLINE SimdFloat4 SplatZ(_SimdFloat4 _v) {
  const SimdFloat4 zzzz = OZZ_SSE_SPLAT_F(_v, 2);
  return zzzz;
}

// Returns (_v.w, _v.w, _v.w, _v.w).
OZZ_INLINE SimdFloat4 SplatW(_SimdFloat4 _v) {
  const SimdFloat4 wwww = OZZ_SSE_SPLAT_F(_v, 3);
  return wwww;
}

// Generic swizzle: returns (_v[_X], _v[_Y], _v[_Z], _v[_W]). Each index is a
// compile-time constant that must be in range [0, 3].
template <size_t _X, size_t _Y, size_t _Z, size_t _W>
OZZ_INLINE SimdFloat4 Swizzle(_SimdFloat4 _v) {
  static_assert(_X <= 3 && _Y <= 3 && _Z <= 3 && _W <= 3,
                "Indices must be between 0 and 3");
  // _MM_SHUFFLE takes lane selectors from the highest lane to the lowest.
  const SimdFloat4 shuffled =
      OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(_W, _Z, _Y, _X));
  return shuffled;
}

// Identity swizzle (x, y, z, w): no shuffle is needed.
template <>
OZZ_INLINE SimdFloat4 Swizzle<0, 1, 2, 3>(_SimdFloat4 _v) {
  const SimdFloat4 unchanged = _v;
  return unchanged;
}

// (x, y, x, y) swizzle, done by copying the low half into the high half.
template <>
OZZ_INLINE SimdFloat4 Swizzle<0, 1, 0, 1>(_SimdFloat4 _v) {
  const SimdFloat4 xyxy = _mm_movelh_ps(_v, _v);
  return xyxy;
}

// (z, w, z, w) swizzle, done by copying the high half into the low half.
template <>
OZZ_INLINE SimdFloat4 Swizzle<2, 3, 2, 3>(_SimdFloat4 _v) {
  const SimdFloat4 zwzw = _mm_movehl_ps(_v, _v);
  return zwzw;
}

// (x, x, y, y) swizzle, done by interleaving the low half with itself.
template <>
OZZ_INLINE SimdFloat4 Swizzle<0, 0, 1, 1>(_SimdFloat4 _v) {
  const SimdFloat4 xxyy = _mm_unpacklo_ps(_v, _v);
  return xxyy;
}

// (z, z, w, w) swizzle, done by interleaving the high half with itself.
template <>
OZZ_INLINE SimdFloat4 Swizzle<2, 2, 3, 3>(_SimdFloat4 _v) {
  const SimdFloat4 zzww = _mm_unpackhi_ps(_v, _v);
  return zzww;
}

// Gathers the x lanes of the four input vectors into a single vector:
// _out[0] = (_in[0].x, _in[1].x, _in[2].x, _in[3].x).
OZZ_INLINE void Transpose4x1(const SimdFloat4 _in[4], SimdFloat4 _out[1]) {
  // (in0.x, in2.x, in0.y, in2.y)
  const __m128 in02_lo = _mm_unpacklo_ps(_in[0], _in[2]);
  // (in1.x, in3.x, in1.y, in3.y)
  const __m128 in13_lo = _mm_unpacklo_ps(_in[1], _in[3]);
  _out[0] = _mm_unpacklo_ps(in02_lo, in13_lo);
}

// Spreads the four lanes of _in[0] over the x lanes of four output vectors:
// _out[i] = (_in[0][i], 0, 0, 0).
OZZ_INLINE void Transpose1x4(const SimdFloat4 _in[1], SimdFloat4 _out[4]) {
  // Bring each of y, z and w down to the x lane of a temporary.
  const __m128 z_in_x = _mm_movehl_ps(_in[0], _in[0]);
  const __m128 y_in_x = OZZ_SSE_SPLAT_F(_in[0], 1);
  const __m128 w_in_x = OZZ_SSE_SPLAT_F(_in[0], 3);
  const __m128 zeros = _mm_setzero_ps();
  // _mm_move_ss keeps the x lane of its second operand and zeros elsewhere.
  _out[0] = _mm_move_ss(zeros, _in[0]);
  _out[1] = _mm_move_ss(zeros, y_in_x);
  _out[2] = _mm_move_ss(zeros, z_in_x);
  _out[3] = _mm_move_ss(zeros, w_in_x);
}

// Gathers the x lanes of the four inputs into _out[0] and their y lanes into
// _out[1].
OZZ_INLINE void Transpose4x2(const SimdFloat4 _in[4], SimdFloat4 _out[2]) {
  // (in0.x, in2.x, in0.y, in2.y)
  const __m128 in02_lo = _mm_unpacklo_ps(_in[0], _in[2]);
  // (in1.x, in3.x, in1.y, in3.y)
  const __m128 in13_lo = _mm_unpacklo_ps(_in[1], _in[3]);
  _out[0] = _mm_unpacklo_ps(in02_lo, in13_lo);
  _out[1] = _mm_unpackhi_ps(in02_lo, in13_lo);
}

// Transposes two input vectors into four outputs:
// _out[i] = (_in[0][i], _in[1][i], 0, 0).
OZZ_INLINE void Transpose2x4(const SimdFloat4 _in[2], SimdFloat4 _out[4]) {
  // (in0.x, in1.x, in0.y, in1.y)
  const __m128 xy_pairs = _mm_unpacklo_ps(_in[0], _in[1]);
  // (in0.z, in1.z, in0.w, in1.w)
  const __m128 zw_pairs = _mm_unpackhi_ps(_in[0], _in[1]);
  const __m128 zeros = _mm_setzero_ps();
  _out[0] = _mm_movelh_ps(xy_pairs, zeros);
  _out[1] = _mm_movehl_ps(zeros, xy_pairs);
  _out[2] = _mm_movelh_ps(zw_pairs, zeros);
  _out[3] = _mm_movehl_ps(zeros, zw_pairs);
}

// Gathers the x, y and z lanes of the four inputs into _out[0], _out[1] and
// _out[2] respectively.
OZZ_INLINE void Transpose4x3(const SimdFloat4 _in[4], SimdFloat4 _out[3]) {
  const __m128 in02_lo = _mm_unpacklo_ps(_in[0], _in[2]);
  const __m128 in13_lo = _mm_unpacklo_ps(_in[1], _in[3]);
  const __m128 in02_hi = _mm_unpackhi_ps(_in[0], _in[2]);
  const __m128 in13_hi = _mm_unpackhi_ps(_in[1], _in[3]);
  _out[0] = _mm_unpacklo_ps(in02_lo, in13_lo);
  _out[1] = _mm_unpackhi_ps(in02_lo, in13_lo);
  _out[2] = _mm_unpacklo_ps(in02_hi, in13_hi);
}

// Transposes three input vectors into four outputs:
// _out[i] = (_in[0][i], _in[1][i], _in[2][i], 0).
OZZ_INLINE void Transpose3x4(const SimdFloat4 _in[3], SimdFloat4 _out[4]) {
  const __m128 zeros = _mm_setzero_ps();
  // (in0.x, in1.x, in0.y, in1.y)
  const __m128 in01_lo = _mm_unpacklo_ps(_in[0], _in[1]);
  // (in2.x, 0, in2.y, 0)
  const __m128 in2_lo = _mm_unpacklo_ps(_in[2], zeros);
  // (in0.z, in1.z, in0.w, in1.w)
  const __m128 in01_hi = _mm_unpackhi_ps(_in[0], _in[1]);
  // (in2.z, 0, in2.w, 0)
  const __m128 in2_hi = _mm_unpackhi_ps(_in[2], zeros);
  _out[0] = _mm_movelh_ps(in01_lo, in2_lo);
  _out[1] = _mm_movehl_ps(in2_lo, in01_lo);
  _out[2] = _mm_movelh_ps(in01_hi, in2_hi);
  _out[3] = _mm_movehl_ps(in2_hi, in01_hi);
}

// Full 4x4 transpose: _out[i] = (_in[0][i], _in[1][i], _in[2][i], _in[3][i]).
// All inputs are read before the first output is written.
OZZ_INLINE void Transpose4x4(const SimdFloat4 _in[4], SimdFloat4 _out[4]) {
  const __m128 in02_lo = _mm_unpacklo_ps(_in[0], _in[2]);
  const __m128 in13_lo = _mm_unpacklo_ps(_in[1], _in[3]);
  const __m128 in02_hi = _mm_unpackhi_ps(_in[0], _in[2]);
  const __m128 in13_hi = _mm_unpackhi_ps(_in[1], _in[3]);
  _out[0] = _mm_unpacklo_ps(in02_lo, in13_lo);
  _out[1] = _mm_unpackhi_ps(in02_lo, in13_lo);
  _out[2] = _mm_unpacklo_ps(in02_hi, in13_hi);
  _out[3] = _mm_unpackhi_ps(in02_hi, in13_hi);
}

// Transposes 16 input vectors into 16 output vectors. Each group of four
// consecutive inputs (_in[4b] .. _in[4b + 3]) is transposed as a 4x4 block,
// and lane i of that block is written to _out[4 * i + b].
OZZ_INLINE void Transpose16x16(const SimdFloat4 _in[16], SimdFloat4 _out[16]) {
  for (int block = 0; block < 4; ++block) {
    const SimdFloat4* src = _in + block * 4;
    SimdFloat4* dst = _out + block;
    // x and y lanes of the block.
    const __m128 in02_lo = _mm_unpacklo_ps(src[0], src[2]);
    const __m128 in13_lo = _mm_unpacklo_ps(src[1], src[3]);
    dst[0] = _mm_unpacklo_ps(in02_lo, in13_lo);
    dst[4] = _mm_unpackhi_ps(in02_lo, in13_lo);
    // z and w lanes of the block.
    const __m128 in02_hi = _mm_unpackhi_ps(src[0], src[2]);
    const __m128 in13_hi = _mm_unpackhi_ps(src[1], src[3]);
    dst[8] = _mm_unpacklo_ps(in02_hi, in13_hi);
    dst[12] = _mm_unpackhi_ps(in02_hi, in13_hi);
  }
}

// Per-lane multiply-add: _a * _b + _c.
OZZ_INLINE SimdFloat4 MAdd(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
  const SimdFloat4 result = OZZ_MADD(_a, _b, _c);
  return result;
}

// Per-lane multiply-subtract: _a * _b - _c.
OZZ_INLINE SimdFloat4 MSub(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
  const SimdFloat4 result = OZZ_MSUB(_a, _b, _c);
  return result;
}

// Per-lane negated multiply-add: -(_a * _b) + _c.
OZZ_INLINE SimdFloat4 NMAdd(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
  const SimdFloat4 result = OZZ_NMADD(_a, _b, _c);
  return result;
}

// Per-lane negated multiply-subtract: -(_a * _b) - _c.
OZZ_INLINE SimdFloat4 NMSub(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
  const SimdFloat4 result = OZZ_NMSUB(_a, _b, _c);
  return result;
}

// Divides the x lanes (_a.x / _b.x). The y, z and w lanes of the result are
// copied from _a.
OZZ_INLINE SimdFloat4 DivX(_SimdFloat4 _a, _SimdFloat4 _b) {
  const SimdFloat4 quotient = _mm_div_ss(_a, _b);
  return quotient;
}

// Returns (_v.x + _v.y, _v.y, _v.z, _v.w).
OZZ_INLINE SimdFloat4 HAdd2(_SimdFloat4 _v) {
  const SimdFloat4 sum_xy = OZZ_SSE_HADD2_F(_v);
  return sum_xy;
}

// Returns (_v.x + _v.y + _v.z, _v.y, _v.z, _v.w).
OZZ_INLINE SimdFloat4 HAdd3(_SimdFloat4 _v) {
  const SimdFloat4 sum_xyz = OZZ_SSE_HADD3_F(_v);
  return sum_xyz;
}

// Returns the sum of the four lanes of _v in the x lane. The other lanes of
// the result hold intermediate values and should not be relied upon.
OZZ_INLINE SimdFloat4 HAdd4(_SimdFloat4 _v) {
  __m128 sum_xyzw;
  OZZ_SSE_HADD4_F(_v, sum_xyzw);  // Statement macro, writes to sum_xyzw.
  return sum_xyzw;
}

// 2-component dot product of _a and _b, returned in the x lane. The other
// lanes of the result should not be relied upon.
OZZ_INLINE SimdFloat4 Dot2(_SimdFloat4 _a, _SimdFloat4 _b) {
  __m128 result;
  OZZ_SSE_DOT2_F(_a, _b, result);  // Statement macro, writes to result.
  return result;
}

// 3-component dot product of _a and _b, returned in the x lane. The other
// lanes of the result should not be relied upon.
OZZ_INLINE SimdFloat4 Dot3(_SimdFloat4 _a, _SimdFloat4 _b) {
  __m128 result;
  OZZ_SSE_DOT3_F(_a, _b, result);  // Statement macro, writes to result.
  return result;
}

// 4-component dot product of _a and _b, returned in the x lane. The other
// lanes of the result should not be relied upon.
OZZ_INLINE SimdFloat4 Dot4(_SimdFloat4 _a, _SimdFloat4 _b) {
  __m128 result;
  OZZ_SSE_DOT4_F(_a, _b, result);  // Statement macro, writes to result.
  return result;
}

// 3-component cross product of _a and _b, stored in the x, y and z lanes.
OZZ_INLINE SimdFloat4 Cross3(_SimdFloat4 _a, _SimdFloat4 _b) {
  // Three-shuffle formulation, see:
  // https://geometrian.com/programming/tutorials/cross-product
  // Both inputs are rotated to (y, z, x, w).
  const __m128 a_yzxw = OZZ_SHUFFLE_PS1(_a, _MM_SHUFFLE(3, 0, 2, 1));
  const __m128 b_yzxw = OZZ_SHUFFLE_PS1(_b, _MM_SHUFFLE(3, 0, 2, 1));
  // _a * b_yzxw - _b * a_yzxw gives the cross product in (z, x, y) order.
  const __m128 b_times_a_yzxw = _mm_mul_ps(_b, a_yzxw);
  const __m128 cross_zxyw = OZZ_MSUB(_a, b_yzxw, b_times_a_yzxw);
  // The same rotation puts the components back in (x, y, z) order.
  return OZZ_SHUFFLE_PS1(cross_zxyw, _MM_SHUFFLE(3, 0, 2, 1));
}

// Per-lane reciprocal estimate (1 / _v), as computed by _mm_rcp_ps.
OZZ_INLINE SimdFloat4 RcpEst(_SimdFloat4 _v) {
  const SimdFloat4 estimate = _mm_rcp_ps(_v);
  return estimate;
}

// Per-lane reciprocal estimate refined by one Newton-Raphson iteration:
// e' = 2 * e - e * e * _v, where e is the hardware estimate.
OZZ_INLINE SimdFloat4 RcpEstNR(_SimdFloat4 _v) {
  const __m128 estimate = _mm_rcp_ps(_v);
  const __m128 estimate_sq = _mm_mul_ps(estimate, estimate);
  const __m128 twice_estimate = _mm_add_ps(estimate, estimate);
  return OZZ_NMADD(estimate_sq, _v, twice_estimate);
}

// Reciprocal estimate of the x lane only, as computed by _mm_rcp_ss.
OZZ_INLINE SimdFloat4 RcpEstX(_SimdFloat4 _v) {
  const SimdFloat4 estimate = _mm_rcp_ss(_v);
  return estimate;
}

// Reciprocal estimate of the x lane refined by one Newton-Raphson iteration:
// e' = 2 * e - e * e * _v.x, where e is the hardware estimate.
OZZ_INLINE SimdFloat4 RcpEstXNR(_SimdFloat4 _v) {
  const __m128 estimate = _mm_rcp_ss(_v);
  const __m128 estimate_sq = _mm_mul_ss(estimate, estimate);
  const __m128 twice_estimate = _mm_add_ss(estimate, estimate);
  return OZZ_NMADDX(estimate_sq, _v, twice_estimate);
}

// Per-lane square root.
OZZ_INLINE SimdFloat4 Sqrt(_SimdFloat4 _v) {
  const SimdFloat4 root = _mm_sqrt_ps(_v);
  return root;
}

// Square root of the x lane only; y, z and w are copied from _v.
OZZ_INLINE SimdFloat4 SqrtX(_SimdFloat4 _v) {
  const SimdFloat4 root = _mm_sqrt_ss(_v);
  return root;
}

// Per-lane reciprocal square root estimate, as computed by _mm_rsqrt_ps.
OZZ_INLINE SimdFloat4 RSqrtEst(_SimdFloat4 _v) {
  const SimdFloat4 estimate = _mm_rsqrt_ps(_v);
  return estimate;
}

// Per-lane reciprocal square root estimate refined by one Newton-Raphson
// iteration: e' = 0.5 * e * (3 - _v * e * e), where e is the hardware estimate.
OZZ_INLINE SimdFloat4 RSqrtEstNR(_SimdFloat4 _v) {
  const __m128 estimate = _mm_rsqrt_ps(_v);
  const __m128 half_estimate = _mm_mul_ps(_mm_set_ps1(.5f), estimate);
  const __m128 v_estimate = _mm_mul_ps(_v, estimate);
  const __m128 correction = OZZ_NMADD(v_estimate, estimate, _mm_set_ps1(3.f));
  return _mm_mul_ps(half_estimate, correction);
}

// Reciprocal square root estimate of the x lane only, as computed by
// _mm_rsqrt_ss.
OZZ_INLINE SimdFloat4 RSqrtEstX(_SimdFloat4 _v) {
  const SimdFloat4 estimate = _mm_rsqrt_ss(_v);
  return estimate;
}

// Reciprocal square root estimate of the x lane refined by one Newton-Raphson
// iteration: e' = 0.5 * e * (3 - _v.x * e * e). Only the x lane of the result
// is meaningful.
OZZ_INLINE SimdFloat4 RSqrtEstXNR(_SimdFloat4 _v) {
  const __m128 estimate = _mm_rsqrt_ss(_v);
  const __m128 half_estimate = _mm_mul_ss(_mm_set_ps1(.5f), estimate);
  const __m128 v_estimate = _mm_mul_ss(_v, estimate);
  const __m128 correction = OZZ_NMADDX(v_estimate, estimate, _mm_set_ps1(3.f));
  return _mm_mul_ss(half_estimate, correction);
}

// Per-lane absolute value, obtained by clearing the sign bit of each lane.
OZZ_INLINE SimdFloat4 Abs(_SimdFloat4 _v) {
  const __m128i seed = _mm_setzero_si128();
  const __m128i all_bits = _mm_cmpeq_epi32(seed, seed);
  // 0x7fffffff in every lane: every bit but the sign.
  const __m128i not_sign = _mm_srli_epi32(all_bits, 1);
  return _mm_and_ps(_mm_castsi128_ps(not_sign), _v);
}

// Returns, for each lane, an integer holding only the sign bit of _v
// (0x80000000 when the sign bit is set, 0 otherwise).
OZZ_INLINE SimdInt4 Sign(_SimdFloat4 _v) {
  const SimdInt4 bits = _mm_castps_si128(_v);
  // Shift the sign bit down to bit 0, then back up to bit 31, which clears
  // every other bit.
  const SimdInt4 sign_in_bit0 = _mm_srli_epi32(bits, 31);
  return _mm_slli_epi32(sign_in_bit0, 31);
}

// Length of the 2-component vector (_v.x, _v.y), returned in the x lane.
OZZ_INLINE SimdFloat4 Length2(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT2_F(_v, _v, len_sq);
  const __m128 len = _mm_sqrt_ss(len_sq);
  return len;
}

// Length of the 3-component vector (_v.x, _v.y, _v.z), returned in the x lane.
OZZ_INLINE SimdFloat4 Length3(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT3_F(_v, _v, len_sq);
  const __m128 len = _mm_sqrt_ss(len_sq);
  return len;
}

// Length of the 4-component vector _v, returned in the x lane.
OZZ_INLINE SimdFloat4 Length4(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT4_F(_v, _v, len_sq);
  const __m128 len = _mm_sqrt_ss(len_sq);
  return len;
}

// Squared length of the 2-component vector (_v.x, _v.y), returned in the x
// lane.
OZZ_INLINE SimdFloat4 Length2Sqr(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT2_F(_v, _v, len_sq);  // dot(_v, _v) is the squared length.
  return len_sq;
}

// Squared length of the 3-component vector (_v.x, _v.y, _v.z), returned in
// the x lane.
OZZ_INLINE SimdFloat4 Length3Sqr(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT3_F(_v, _v, len_sq);  // dot(_v, _v) is the squared length.
  return len_sq;
}

// Squared length of the 4-component vector _v, returned in the x lane.
OZZ_INLINE SimdFloat4 Length4Sqr(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT4_F(_v, _v, len_sq);  // dot(_v, _v) is the squared length.
  return len_sq;
}

// Normalizes the x and y lanes of _v by the 2-component length; z and w are
// returned unchanged. Asserts that the squared length is not zero.
OZZ_INLINE SimdFloat4 Normalize2(_SimdFloat4 _v) {
  __m128 sq_len;
  OZZ_SSE_DOT2_F(_v, _v, sq_len);
  assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
  const __m128 len = _mm_sqrt_ss(sq_len);
  const __m128 rcp_len = _mm_div_ss(simd_float4::one(), len);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  const __m128 scaled = _mm_mul_ps(_v, scale);
  // Low half from the scaled vector, high half (z, w) from the input.
  const __m128 zwzw = _mm_movehl_ps(_v, _v);
  return _mm_movelh_ps(scaled, zwzw);
}

// Normalizes the x, y and z lanes of _v by the 3-component length; w is
// returned unchanged. Asserts that the squared length is not zero.
OZZ_INLINE SimdFloat4 Normalize3(_SimdFloat4 _v) {
  __m128 sq_len;
  OZZ_SSE_DOT3_F(_v, _v, sq_len);
  assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
  const __m128 len = _mm_sqrt_ss(sq_len);
  const __m128 rcp_len = _mm_div_ss(simd_float4::one(), len);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  // Work on the reversed vector (w, z, y, x) so that w sits in the first
  // lane, where _mm_move_ss can restore its original value.
  const __m128 reversed = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3));
  const __m128 reversed_scaled = _mm_mul_ps(reversed, scale);
  const __m128 reversed_norm = _mm_move_ss(reversed_scaled, reversed);
  return OZZ_SHUFFLE_PS1(reversed_norm, _MM_SHUFFLE(0, 1, 2, 3));
}

// Normalizes all four lanes of _v by the 4-component length. Asserts that
// the squared length is not zero.
OZZ_INLINE SimdFloat4 Normalize4(_SimdFloat4 _v) {
  __m128 sq_len;
  OZZ_SSE_DOT4_F(_v, _v, sq_len);
  assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
  const __m128 len = _mm_sqrt_ss(sq_len);
  const __m128 rcp_len = _mm_div_ss(simd_float4::one(), len);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  return _mm_mul_ps(_v, scale);
}

// Estimated normalization of the x and y lanes of _v, using the reciprocal
// square root estimate of the 2-component squared length. z and w are returned
// unchanged. Asserts that the squared length is not zero.
OZZ_INLINE SimdFloat4 NormalizeEst2(_SimdFloat4 _v) {
  __m128 sq_len;
  OZZ_SSE_DOT2_F(_v, _v, sq_len);
  assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
  const __m128 rcp_len = _mm_rsqrt_ss(sq_len);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  const __m128 scaled = _mm_mul_ps(_v, scale);
  // Low half from the scaled vector, high half (z, w) from the input.
  const __m128 zwzw = _mm_movehl_ps(_v, _v);
  return _mm_movelh_ps(scaled, zwzw);
}

// Estimated normalization of the x, y and z lanes of _v, using the reciprocal
// square root estimate of the 3-component squared length. w is returned
// unchanged. Asserts that the squared length is not zero.
OZZ_INLINE SimdFloat4 NormalizeEst3(_SimdFloat4 _v) {
  __m128 sq_len;
  OZZ_SSE_DOT3_F(_v, _v, sq_len);
  assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
  const __m128 rcp_len = _mm_rsqrt_ss(sq_len);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  // Work on the reversed vector (w, z, y, x) so that w sits in the first
  // lane, where _mm_move_ss can restore its original value.
  const __m128 reversed = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3));
  const __m128 reversed_scaled = _mm_mul_ps(reversed, scale);
  const __m128 reversed_norm = _mm_move_ss(reversed_scaled, reversed);
  return OZZ_SHUFFLE_PS1(reversed_norm, _MM_SHUFFLE(0, 1, 2, 3));
}

// Estimated normalization of all four lanes of _v, using the reciprocal
// square root estimate of the 4-component squared length. Asserts that the
// squared length is not zero.
OZZ_INLINE SimdFloat4 NormalizeEst4(_SimdFloat4 _v) {
  __m128 sq_len;
  OZZ_SSE_DOT4_F(_v, _v, sq_len);
  assert(_mm_cvtss_f32(sq_len) != 0.f && "_v is not normalizable");
  const __m128 rcp_len = _mm_rsqrt_ss(sq_len);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  return _mm_mul_ps(_v, scale);
}

// Tests whether the 2-component squared length of _v lies strictly within
// 1 +/- kNormalizationToleranceSq. The outcome is in the x lane of the
// returned mask; y, z and w are zero.
OZZ_INLINE SimdInt4 IsNormalized2(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT2_F(_v, _v, len_sq);
  // Only x takes part in the scalar comparisons, the other lanes are cleared.
  const __m128 len_sq_x000 = _mm_move_ss(_mm_setzero_ps(), len_sq);
  const __m128 upper = _mm_set_ss(1.f + kNormalizationToleranceSq);
  const __m128 lower = _mm_set_ss(1.f - kNormalizationToleranceSq);
  const __m128 below_upper = _mm_cmplt_ss(len_sq_x000, upper);
  const __m128 above_lower = _mm_cmpgt_ss(len_sq_x000, lower);
  return _mm_castps_si128(_mm_and_ps(below_upper, above_lower));
}

// Tests whether the 3-component squared length of _v lies strictly within
// 1 +/- kNormalizationToleranceSq. The outcome is in the x lane of the
// returned mask; y, z and w are zero.
OZZ_INLINE SimdInt4 IsNormalized3(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT3_F(_v, _v, len_sq);
  // Only x takes part in the scalar comparisons, the other lanes are cleared.
  const __m128 len_sq_x000 = _mm_move_ss(_mm_setzero_ps(), len_sq);
  const __m128 upper = _mm_set_ss(1.f + kNormalizationToleranceSq);
  const __m128 lower = _mm_set_ss(1.f - kNormalizationToleranceSq);
  const __m128 below_upper = _mm_cmplt_ss(len_sq_x000, upper);
  const __m128 above_lower = _mm_cmpgt_ss(len_sq_x000, lower);
  return _mm_castps_si128(_mm_and_ps(below_upper, above_lower));
}

// Tests whether the 4-component squared length of _v lies strictly within
// 1 +/- kNormalizationToleranceSq. The outcome is in the x lane of the
// returned mask; y, z and w are zero.
OZZ_INLINE SimdInt4 IsNormalized4(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT4_F(_v, _v, len_sq);
  // Only x takes part in the scalar comparisons, the other lanes are cleared.
  const __m128 len_sq_x000 = _mm_move_ss(_mm_setzero_ps(), len_sq);
  const __m128 upper = _mm_set_ss(1.f + kNormalizationToleranceSq);
  const __m128 lower = _mm_set_ss(1.f - kNormalizationToleranceSq);
  const __m128 below_upper = _mm_cmplt_ss(len_sq_x000, upper);
  const __m128 above_lower = _mm_cmpgt_ss(len_sq_x000, lower);
  return _mm_castps_si128(_mm_and_ps(below_upper, above_lower));
}

// Tests whether the 2-component squared length of _v lies strictly within
// 1 +/- kNormalizationToleranceEstSq. The outcome is in the x lane of the
// returned mask; y, z and w are zero.
OZZ_INLINE SimdInt4 IsNormalizedEst2(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT2_F(_v, _v, len_sq);
  // Only x takes part in the scalar comparisons, the other lanes are cleared.
  const __m128 len_sq_x000 = _mm_move_ss(_mm_setzero_ps(), len_sq);
  const __m128 upper = _mm_set_ss(1.f + kNormalizationToleranceEstSq);
  const __m128 lower = _mm_set_ss(1.f - kNormalizationToleranceEstSq);
  const __m128 below_upper = _mm_cmplt_ss(len_sq_x000, upper);
  const __m128 above_lower = _mm_cmpgt_ss(len_sq_x000, lower);
  return _mm_castps_si128(_mm_and_ps(below_upper, above_lower));
}

// Tests whether the 3-component squared length of _v lies strictly within
// 1 +/- kNormalizationToleranceEstSq. The outcome is in the x lane of the
// returned mask; y, z and w are zero.
OZZ_INLINE SimdInt4 IsNormalizedEst3(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT3_F(_v, _v, len_sq);
  // Only x takes part in the scalar comparisons, the other lanes are cleared.
  const __m128 len_sq_x000 = _mm_move_ss(_mm_setzero_ps(), len_sq);
  const __m128 upper = _mm_set_ss(1.f + kNormalizationToleranceEstSq);
  const __m128 lower = _mm_set_ss(1.f - kNormalizationToleranceEstSq);
  const __m128 below_upper = _mm_cmplt_ss(len_sq_x000, upper);
  const __m128 above_lower = _mm_cmpgt_ss(len_sq_x000, lower);
  return _mm_castps_si128(_mm_and_ps(below_upper, above_lower));
}

// Tests whether the 4-component squared length of _v lies strictly within
// 1 +/- kNormalizationToleranceEstSq. The outcome is in the x lane of the
// returned mask; y, z and w are zero.
OZZ_INLINE SimdInt4 IsNormalizedEst4(_SimdFloat4 _v) {
  __m128 len_sq;
  OZZ_SSE_DOT4_F(_v, _v, len_sq);
  // Only x takes part in the scalar comparisons, the other lanes are cleared.
  const __m128 len_sq_x000 = _mm_move_ss(_mm_setzero_ps(), len_sq);
  const __m128 upper = _mm_set_ss(1.f + kNormalizationToleranceEstSq);
  const __m128 lower = _mm_set_ss(1.f - kNormalizationToleranceEstSq);
  const __m128 below_upper = _mm_cmplt_ss(len_sq_x000, upper);
  const __m128 above_lower = _mm_cmpgt_ss(len_sq_x000, lower);
  return _mm_castps_si128(_mm_and_ps(below_upper, above_lower));
}

// Normalizes the x and y lanes of _v (z and w unchanged), or returns _safe
// when the 2-component squared length of _v is less than or equal to zero.
OZZ_INLINE SimdFloat4 NormalizeSafe2(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalized2(_safe)) && "_safe is not normalized");
  __m128 len_sq;
  OZZ_SSE_DOT2_F(_v, _v, len_sq);
  // All lanes set when _v cannot be normalized.
  const __m128i use_safe = _mm_castps_si128(
      _mm_cmple_ps(OZZ_SSE_SPLAT_F(len_sq, 0), _mm_setzero_ps()));
  const __m128 rcp_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(len_sq));
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  const __m128 scaled = _mm_mul_ps(_v, scale);
  // Low half from the scaled vector, high half (z, w) from the input.
  const __m128 normalized = _mm_movelh_ps(scaled, _mm_movehl_ps(_v, _v));
  return OZZ_SSE_SELECT_F(use_safe, _safe, normalized);
}

// Normalizes the x, y and z lanes of _v (w unchanged), or returns _safe when
// the 3-component squared length of _v is less than or equal to zero.
OZZ_INLINE SimdFloat4 NormalizeSafe3(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalized3(_safe)) && "_safe is not normalized");
  __m128 len_sq;
  OZZ_SSE_DOT3_F(_v, _v, len_sq);
  // All lanes set when _v cannot be normalized.
  const __m128i use_safe = _mm_castps_si128(
      _mm_cmple_ps(OZZ_SSE_SPLAT_F(len_sq, 0), _mm_setzero_ps()));
  const __m128 rcp_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(len_sq));
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  // Work on the reversed vector (w, z, y, x) so that w sits in the first
  // lane, where _mm_move_ss can restore its original value.
  const __m128 reversed = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3));
  const __m128 reversed_norm =
      _mm_move_ss(_mm_mul_ps(reversed, scale), reversed);
  const __m128 normalized =
      OZZ_SHUFFLE_PS1(reversed_norm, _MM_SHUFFLE(0, 1, 2, 3));
  return OZZ_SSE_SELECT_F(use_safe, _safe, normalized);
}

// Normalizes all four lanes of _v, or returns _safe when the 4-component
// squared length of _v is less than or equal to zero.
OZZ_INLINE SimdFloat4 NormalizeSafe4(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalized4(_safe)) && "_safe is not normalized");
  __m128 len_sq;
  OZZ_SSE_DOT4_F(_v, _v, len_sq);
  // All lanes set when _v cannot be normalized.
  const __m128i use_safe = _mm_castps_si128(
      _mm_cmple_ps(OZZ_SSE_SPLAT_F(len_sq, 0), _mm_setzero_ps()));
  const __m128 rcp_len = _mm_div_ss(simd_float4::one(), _mm_sqrt_ss(len_sq));
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  const __m128 normalized = _mm_mul_ps(_v, scale);
  return OZZ_SSE_SELECT_F(use_safe, _safe, normalized);
}

// Estimated normalization of the x and y lanes of _v (z and w unchanged), or
// _safe when the 2-component squared length of _v is less than or equal to
// zero.
OZZ_INLINE SimdFloat4 NormalizeSafeEst2(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalizedEst2(_safe)) && "_safe is not normalized");
  __m128 len_sq;
  OZZ_SSE_DOT2_F(_v, _v, len_sq);
  // All lanes set when _v cannot be normalized.
  const __m128i use_safe = _mm_castps_si128(
      _mm_cmple_ps(OZZ_SSE_SPLAT_F(len_sq, 0), _mm_setzero_ps()));
  const __m128 rcp_len = _mm_rsqrt_ss(len_sq);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  const __m128 scaled = _mm_mul_ps(_v, scale);
  // Low half from the scaled vector, high half (z, w) from the input.
  const __m128 normalized = _mm_movelh_ps(scaled, _mm_movehl_ps(_v, _v));
  return OZZ_SSE_SELECT_F(use_safe, _safe, normalized);
}

// Estimated normalization of the x, y and z lanes of _v (w unchanged), or
// _safe when the 3-component squared length of _v is less than or equal to
// zero.
OZZ_INLINE SimdFloat4 NormalizeSafeEst3(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalizedEst3(_safe)) && "_safe is not normalized");
  __m128 len_sq;
  OZZ_SSE_DOT3_F(_v, _v, len_sq);
  // All lanes set when _v cannot be normalized.
  const __m128i use_safe = _mm_castps_si128(
      _mm_cmple_ps(OZZ_SSE_SPLAT_F(len_sq, 0), _mm_setzero_ps()));
  const __m128 rcp_len = _mm_rsqrt_ss(len_sq);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  // Work on the reversed vector (w, z, y, x) so that w sits in the first
  // lane, where _mm_move_ss can restore its original value.
  const __m128 reversed = OZZ_SHUFFLE_PS1(_v, _MM_SHUFFLE(0, 1, 2, 3));
  const __m128 reversed_norm =
      _mm_move_ss(_mm_mul_ps(reversed, scale), reversed);
  const __m128 normalized =
      OZZ_SHUFFLE_PS1(reversed_norm, _MM_SHUFFLE(0, 1, 2, 3));
  return OZZ_SSE_SELECT_F(use_safe, _safe, normalized);
}

// Estimated normalization of all four lanes of _v, or _safe when the
// 4-component squared length of _v is less than or equal to zero.
OZZ_INLINE SimdFloat4 NormalizeSafeEst4(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalizedEst4(_safe)) && "_safe is not normalized");
  __m128 len_sq;
  OZZ_SSE_DOT4_F(_v, _v, len_sq);
  // All lanes set when _v cannot be normalized.
  const __m128i use_safe = _mm_castps_si128(
      _mm_cmple_ps(OZZ_SSE_SPLAT_F(len_sq, 0), _mm_setzero_ps()));
  const __m128 rcp_len = _mm_rsqrt_ss(len_sq);
  const __m128 scale = OZZ_SSE_SPLAT_F(rcp_len, 0);
  const __m128 normalized = _mm_mul_ps(_v, scale);
  return OZZ_SSE_SELECT_F(use_safe, _safe, normalized);
}

// Per-lane linear interpolation: _a + _alpha * (_b - _a).
OZZ_INLINE SimdFloat4 Lerp(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _alpha) {
  const SimdFloat4 delta = _mm_sub_ps(_b, _a);
  return OZZ_MADD(_alpha, delta, _a);
}

// Per-lane minimum of _a and _b.
OZZ_INLINE SimdFloat4 Min(_SimdFloat4 _a, _SimdFloat4 _b) {
  const SimdFloat4 smallest = _mm_min_ps(_a, _b);
  return smallest;
}

// Per-lane maximum of _a and _b.
OZZ_INLINE SimdFloat4 Max(_SimdFloat4 _a, _SimdFloat4 _b) {
  const SimdFloat4 largest = _mm_max_ps(_a, _b);
  return largest;
}

// Per-lane minimum of 0.f and _v.
OZZ_INLINE SimdFloat4 Min0(_SimdFloat4 _v) {
  const SimdFloat4 zeros = _mm_setzero_ps();
  return _mm_min_ps(zeros, _v);
}

// Per-lane maximum of 0.f and _v.
OZZ_INLINE SimdFloat4 Max0(_SimdFloat4 _v) {
  const SimdFloat4 zeros = _mm_setzero_ps();
  return _mm_max_ps(zeros, _v);
}

// Per-lane clamp of _v to the range [_a, _b], computed as
// max(_a, min(_v, _b)).
OZZ_INLINE SimdFloat4 Clamp(_SimdFloat4 _a, _SimdFloat4 _v, _SimdFloat4 _b) {
  const SimdFloat4 capped = _mm_min_ps(_v, _b);
  return _mm_max_ps(_a, capped);
}

// Selection driven by the integer mask _b: takes _true where the mask is set
// and _false elsewhere.
OZZ_INLINE SimdFloat4 Select(_SimdInt4 _b, _SimdFloat4 _true,
                             _SimdFloat4 _false) {
  const SimdFloat4 picked = OZZ_SSE_SELECT_F(_b, _true, _false);
  return picked;
}

// Per-lane _a == _b. Lanes where the test holds are all ones, others are zero.
OZZ_INLINE SimdInt4 CmpEq(_SimdFloat4 _a, _SimdFloat4 _b) {
  const auto mask = _mm_cmpeq_ps(_a, _b);
  return _mm_castps_si128(mask);
}

// Per-lane _a != _b. Lanes where the test holds are all ones, others are zero.
OZZ_INLINE SimdInt4 CmpNe(_SimdFloat4 _a, _SimdFloat4 _b) {
  const auto mask = _mm_cmpneq_ps(_a, _b);
  return _mm_castps_si128(mask);
}

// Per-lane _a < _b. Lanes where the test holds are all ones, others are zero.
OZZ_INLINE SimdInt4 CmpLt(_SimdFloat4 _a, _SimdFloat4 _b) {
  const auto mask = _mm_cmplt_ps(_a, _b);
  return _mm_castps_si128(mask);
}

// Per-lane _a <= _b. Lanes where the test holds are all ones, others are zero.
OZZ_INLINE SimdInt4 CmpLe(_SimdFloat4 _a, _SimdFloat4 _b) {
  const auto mask = _mm_cmple_ps(_a, _b);
  return _mm_castps_si128(mask);
}

// Per-lane _a > _b. Lanes where the test holds are all ones, others are zero.
OZZ_INLINE SimdInt4 CmpGt(_SimdFloat4 _a, _SimdFloat4 _b) {
  const auto mask = _mm_cmpgt_ps(_a, _b);
  return _mm_castps_si128(mask);
}

// Per-lane _a >= _b. Lanes where the test holds are all ones, others are zero.
OZZ_INLINE SimdInt4 CmpGe(_SimdFloat4 _a, _SimdFloat4 _b) {
  const auto mask = _mm_cmpge_ps(_a, _b);
  return _mm_castps_si128(mask);
}

// Bitwise AND of two float vectors.
OZZ_INLINE SimdFloat4 And(_SimdFloat4 _a, _SimdFloat4 _b) {
  const SimdFloat4 bits = _mm_and_ps(_a, _b);
  return bits;
}

// Bitwise OR of two float vectors.
OZZ_INLINE SimdFloat4 Or(_SimdFloat4 _a, _SimdFloat4 _b) {
  const SimdFloat4 bits = _mm_or_ps(_a, _b);
  return bits;
}

// Bitwise XOR of two float vectors.
OZZ_INLINE SimdFloat4 Xor(_SimdFloat4 _a, _SimdFloat4 _b) {
  const SimdFloat4 bits = _mm_xor_ps(_a, _b);
  return bits;
}

// Bitwise AND of a float vector with an integer mask.
OZZ_INLINE SimdFloat4 And(_SimdFloat4 _a, _SimdInt4 _b) {
  const SimdFloat4 mask = _mm_castsi128_ps(_b);
  return _mm_and_ps(_a, mask);
}

// Bitwise _a & ~_b, where _b is an integer mask.
OZZ_INLINE SimdFloat4 AndNot(_SimdFloat4 _a, _SimdInt4 _b) {
  const SimdFloat4 mask = _mm_castsi128_ps(_b);
  // _mm_andnot_ps complements its first operand.
  return _mm_andnot_ps(mask, _a);
}

// Bitwise OR of a float vector with an integer mask.
OZZ_INLINE SimdFloat4 Or(_SimdFloat4 _a, _SimdInt4 _b) {
  const SimdFloat4 mask = _mm_castsi128_ps(_b);
  return _mm_or_ps(_a, mask);
}

// Bitwise XOR of a float vector with an integer mask.
OZZ_INLINE SimdFloat4 Xor(_SimdFloat4 _a, _SimdInt4 _b) {
  const SimdFloat4 mask = _mm_castsi128_ps(_b);
  return _mm_xor_ps(_a, mask);
}

// Per-lane cosine. Each lane is extracted and evaluated with the scalar
// std::cos, then the four results are packed back.
OZZ_INLINE SimdFloat4 Cos(_SimdFloat4 _v) {
  const float x = std::cos(GetX(_v));
  const float y = std::cos(GetY(_v));
  const float z = std::cos(GetZ(_v));
  const float w = std::cos(GetW(_v));
  return _mm_set_ps(w, z, y, x);
}

// Cosine of the x lane, computed with the scalar std::cos. y, z and w are
// copied from _v.
OZZ_INLINE SimdFloat4 CosX(_SimdFloat4 _v) {
  const float x = std::cos(GetX(_v));
  return _mm_move_ss(_v, _mm_set_ps1(x));
}

// Per-lane arc cosine. Each lane is extracted and evaluated with the scalar
// std::acos, then the four results are packed back.
OZZ_INLINE SimdFloat4 ACos(_SimdFloat4 _v) {
  const float x = std::acos(GetX(_v));
  const float y = std::acos(GetY(_v));
  const float z = std::acos(GetZ(_v));
  const float w = std::acos(GetW(_v));
  return _mm_set_ps(w, z, y, x);
}

// Arc cosine of the x lane, computed with the scalar std::acos. y, z and w
// are copied from _v.
OZZ_INLINE SimdFloat4 ACosX(_SimdFloat4 _v) {
  const float x = std::acos(GetX(_v));
  return _mm_move_ss(_v, _mm_set_ps1(x));
}

// Per-lane sine. Each lane is extracted and evaluated with the scalar
// std::sin, then the four results are packed back.
OZZ_INLINE SimdFloat4 Sin(_SimdFloat4 _v) {
  const float x = std::sin(GetX(_v));
  const float y = std::sin(GetY(_v));
  const float z = std::sin(GetZ(_v));
  const float w = std::sin(GetW(_v));
  return _mm_set_ps(w, z, y, x);
}

// Sine of the x lane, computed with the scalar std::sin. y, z and w are
// copied from _v.
OZZ_INLINE SimdFloat4 SinX(_SimdFloat4 _v) {
  const float x = std::sin(GetX(_v));
  return _mm_move_ss(_v, _mm_set_ps1(x));
}

// Per-lane arc sine. Each lane is extracted and evaluated with the scalar
// std::asin, then the four results are packed back.
OZZ_INLINE SimdFloat4 ASin(_SimdFloat4 _v) {
  const float x = std::asin(GetX(_v));
  const float y = std::asin(GetY(_v));
  const float z = std::asin(GetZ(_v));
  const float w = std::asin(GetW(_v));
  return _mm_set_ps(w, z, y, x);
}

// Arc sine of the x lane, computed with the scalar std::asin. y, z and w are
// copied from _v.
OZZ_INLINE SimdFloat4 ASinX(_SimdFloat4 _v) {
  const float x = std::asin(GetX(_v));
  return _mm_move_ss(_v, _mm_set_ps1(x));
}

// Per-lane tangent. Each lane is extracted and evaluated with the scalar
// std::tan, then the four results are packed back.
OZZ_INLINE SimdFloat4 Tan(_SimdFloat4 _v) {
  const float x = std::tan(GetX(_v));
  const float y = std::tan(GetY(_v));
  const float z = std::tan(GetZ(_v));
  const float w = std::tan(GetW(_v));
  return _mm_set_ps(w, z, y, x);
}

// Tangent of the x lane, computed with the scalar std::tan. y, z and w are
// copied from _v.
OZZ_INLINE SimdFloat4 TanX(_SimdFloat4 _v) {
  const float x = std::tan(GetX(_v));
  return _mm_move_ss(_v, _mm_set_ps1(x));
}

// Per-lane arc tangent. Each lane is extracted and evaluated with the scalar
// std::atan, then the four results are packed back.
OZZ_INLINE SimdFloat4 ATan(_SimdFloat4 _v) {
  const float x = std::atan(GetX(_v));
  const float y = std::atan(GetY(_v));
  const float z = std::atan(GetZ(_v));
  const float w = std::atan(GetW(_v));
  return _mm_set_ps(w, z, y, x);
}

// Arc tangent of the x lane, computed with the scalar std::atan. y, z and w
// are copied from _v.
OZZ_INLINE SimdFloat4 ATanX(_SimdFloat4 _v) {
  const float x = std::atan(GetX(_v));
  return _mm_move_ss(_v, _mm_set_ps1(x));
}

// Constants, masks and loaders producing 4-lane 32-bit integer vectors.
// Lanes are named x, y, z, w from the lowest to the highest 32 bits.
// Most constants are derived with shifts from an "all bits set" register
// (built by comparing a zeroed register with itself) rather than being loaded
// from memory.
namespace simd_int4 {

// Returns (0, 0, 0, 0).
OZZ_INLINE SimdInt4 zero() { return _mm_setzero_si128(); }

// Returns (1, 1, 1, 1), computed as 0 - (-1) in every lane.
OZZ_INLINE SimdInt4 one() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero));
}

// Returns (1, 0, 0, 0): the 12-byte right shift only keeps the w lane of the
// "one" vector and moves it to x.
OZZ_INLINE SimdInt4 x_axis() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12);
}

// Returns (0, 1, 0, 0).
OZZ_INLINE SimdInt4 y_axis() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_slli_si128(
      _mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12), 4);
}

// Returns (0, 0, 1, 0).
OZZ_INLINE SimdInt4 z_axis() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_slli_si128(
      _mm_srli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12), 8);
}

// Returns (0, 0, 0, 1).
OZZ_INLINE SimdInt4 w_axis() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_slli_si128(_mm_sub_epi32(zero, _mm_cmpeq_epi32(zero, zero)), 12);
}

// Returns a vector with all bits set in every lane.
OZZ_INLINE SimdInt4 all_true() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_cmpeq_epi32(zero, zero);
}

// Returns a vector with all bits cleared in every lane.
OZZ_INLINE SimdInt4 all_false() { return _mm_setzero_si128(); }

// Returns 0x80000000 (sign bit only) in every lane.
OZZ_INLINE SimdInt4 mask_sign() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31);
}

// Returns 0x80000000 in lanes x, y, z and 0 in lane w.
OZZ_INLINE SimdInt4 mask_sign_xyz() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_srli_si128(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31), 4);
}

// Returns 0x80000000 in lane w and 0 in lanes x, y, z.
OZZ_INLINE SimdInt4 mask_sign_w() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_slli_si128(_mm_slli_epi32(_mm_cmpeq_epi32(zero, zero), 31), 12);
}

// Returns 0x7fffffff (every bit but the sign bit) in every lane.
OZZ_INLINE SimdInt4 mask_not_sign() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_srli_epi32(_mm_cmpeq_epi32(zero, zero), 1);
}

// Returns a mask with all bits set in x, y, z and w.
OZZ_INLINE SimdInt4 mask_ffff() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_cmpeq_epi32(zero, zero);
}

// Returns a mask with all bits cleared.
OZZ_INLINE SimdInt4 mask_0000() { return _mm_setzero_si128(); }

// Returns a mask with all bits set in x, y, z and cleared in w.
OZZ_INLINE SimdInt4 mask_fff0() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_srli_si128(_mm_cmpeq_epi32(zero, zero), 4);
}

// Returns a mask with all bits set in x only.
OZZ_INLINE SimdInt4 mask_f000() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_srli_si128(_mm_cmpeq_epi32(zero, zero), 12);
}

// Returns a mask with all bits set in y only.
OZZ_INLINE SimdInt4 mask_0f00() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_srli_si128(_mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12), 8);
}

// Returns a mask with all bits set in z only.
OZZ_INLINE SimdInt4 mask_00f0() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_srli_si128(_mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12), 4);
}

// Returns a mask with all bits set in w only.
OZZ_INLINE SimdInt4 mask_000f() {
  const __m128i zero = _mm_setzero_si128();
  return _mm_slli_si128(_mm_cmpeq_epi32(zero, zero), 12);
}

// Returns (_x, _y, _z, _w). _mm_set_epi32 takes its arguments from the highest
// lane down to the lowest, hence the reversed order.
OZZ_INLINE SimdInt4 Load(int _x, int _y, int _z, int _w) {
  return _mm_set_epi32(_w, _z, _y, _x);
}

// Returns (_x, 0, 0, 0).
OZZ_INLINE SimdInt4 LoadX(int _x) { return _mm_set_epi32(0, 0, 0, _x); }

// Returns (_x, _x, _x, _x).
OZZ_INLINE SimdInt4 Load1(int _x) { return _mm_set1_epi32(_x); }

// Builds a boolean mask: each lane has all bits set if the matching argument
// is true (0 - 1), all bits cleared otherwise (0 - 0).
OZZ_INLINE SimdInt4 Load(bool _x, bool _y, bool _z, bool _w) {
  return _mm_sub_epi32(_mm_setzero_si128(), _mm_set_epi32(_w, _z, _y, _x));
}

// Builds a boolean mask from _x in lane x; lanes y, z, w are cleared.
OZZ_INLINE SimdInt4 LoadX(bool _x) {
  return _mm_sub_epi32(_mm_setzero_si128(), _mm_set_epi32(0, 0, 0, _x));
}

// Builds a boolean mask from _x, replicated to the 4 lanes.
OZZ_INLINE SimdInt4 Load1(bool _x) {
  return _mm_sub_epi32(_mm_setzero_si128(), _mm_set1_epi32(_x));
}

// Loads the 4 integers _i[0..3] to x, y, z, w.
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE SimdInt4 LoadPtr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  return _mm_load_si128(reinterpret_cast<const __m128i*>(_i));
}

// Loads _i[0] to x; lanes y, z, w are set to 0.
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE SimdInt4 LoadXPtr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  return _mm_cvtsi32_si128(*_i);
}

// Loads _i[0] to the 4 lanes.
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE SimdInt4 Load1Ptr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  // Only _i[0] is dereferenced: a 64-bit load (_mm_loadl_epi64) would also
  // read _i[1], which the caller is not required to provide.
  return _mm_set1_epi32(*_i);
}

// Loads _i[0..1] to x, y with a single 64-bit load; lanes z, w are set to 0.
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE SimdInt4 Load2Ptr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(_i));
}

// Loads _i[0..2] to x, y, z; lane w is set to 0.
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE SimdInt4 Load3Ptr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  return _mm_set_epi32(0, _i[2], _i[1], _i[0]);
}

// Loads the 4 integers _i[0..3] to x, y, z, w.
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE SimdInt4 LoadPtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(_i));
}

// Loads _i[0] to x; lanes y, z, w are set to 0.
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE SimdInt4 LoadXPtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  return _mm_cvtsi32_si128(*_i);
}

// Loads _i[0] to the 4 lanes.
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE SimdInt4 Load1PtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  return _mm_set1_epi32(*_i);
}

// Loads _i[0..1] to x, y; lanes z, w are set to 0.
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE SimdInt4 Load2PtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  return _mm_set_epi32(0, 0, _i[1], _i[0]);
}

// Loads _i[0..2] to x, y, z; lane w is set to 0.
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE SimdInt4 Load3PtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  return _mm_set_epi32(0, _i[2], _i[1], _i[0]);
}

// Converts each float lane of _f to a 32-bit integer with _mm_cvtps_epi32,
// which rounds according to the current SSE rounding mode.
OZZ_INLINE SimdInt4 FromFloatRound(_SimdFloat4 _f) {
  return _mm_cvtps_epi32(_f);
}

// Converts each float lane of _f to a 32-bit integer with truncation
// (_mm_cvttps_epi32).
OZZ_INLINE SimdInt4 FromFloatTrunc(_SimdFloat4 _f) {
  return _mm_cvttps_epi32(_f);
}
}  // namespace simd_int4

// Returns the x (lowest) lane of _v.
OZZ_INLINE int GetX(_SimdInt4 _v) {
  const int x = _mm_cvtsi128_si32(_v);
  return x;
}

// Returns the y lane of _v.
OZZ_INLINE int GetY(_SimdInt4 _v) {
  // Brings y down to the lowest lane, the only one that can be extracted.
  const __m128i yyyy = OZZ_SSE_SPLAT_I(_v, 1);
  return _mm_cvtsi128_si32(yyyy);
}

// Returns the z lane of _v.
OZZ_INLINE int GetZ(_SimdInt4 _v) {
  // Interleaving the high halves yields (z, z, w, w): z lands in lane x.
  const __m128i zzww = _mm_unpackhi_epi32(_v, _v);
  return _mm_cvtsi128_si32(zzww);
}

// Returns the w (highest) lane of _v.
OZZ_INLINE int GetW(_SimdInt4 _v) {
  // Brings w down to the lowest lane, the only one that can be extracted.
  const __m128i wwww = OZZ_SSE_SPLAT_I(_v, 3);
  return _mm_cvtsi128_si32(wwww);
}

// Returns _v with its x lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetX(_SimdInt4 _v, _SimdInt4 _i) {
  // The float "move scalar" instruction is used as a pure bit move: the casts
  // only reinterpret the registers.
  const __m128 dest = _mm_castsi128_ps(_v);
  const __m128 src = _mm_castsi128_ps(_i);
  return _mm_castps_si128(_mm_move_ss(dest, src));
}

// Returns _v with its y lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetY(_SimdInt4 _v, _SimdInt4 _i) {
  // (v.x, i.x, v.y, i.y): the low half is already the expected result.
  const __m128 low_half = _mm_castsi128_ps(_mm_unpacklo_epi32(_v, _i));
  const __m128 dest = _mm_castsi128_ps(_v);
  // Low half from low_half, high half (z, w) from _v.
  return _mm_castps_si128(
      _mm_shuffle_ps(low_half, dest, _MM_SHUFFLE(3, 2, 1, 0)));
}

// Returns _v with its z lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetZ(_SimdInt4 _v, _SimdInt4 _i) {
  const __m128 dest = _mm_castsi128_ps(_v);
  const __m128 src = _mm_castsi128_ps(_i);
  // (i.x, i.x, v.w, v.w)
  const __m128 ix_ix_vw_vw = _mm_shuffle_ps(src, dest, _MM_SHUFFLE(3, 3, 0, 0));
  // (v.x, v.y) from _v, then (i.x, v.w) from the temporary.
  return _mm_castps_si128(
      _mm_shuffle_ps(dest, ix_ix_vw_vw, _MM_SHUFFLE(2, 0, 1, 0)));
}

// Returns _v with its w lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetW(_SimdInt4 _v, _SimdInt4 _i) {
  const __m128 dest = _mm_castsi128_ps(_v);
  const __m128 src = _mm_castsi128_ps(_i);
  // (i.x, i.x, v.z, v.z)
  const __m128 ix_ix_vz_vz = _mm_shuffle_ps(src, dest, _MM_SHUFFLE(2, 2, 0, 0));
  // (v.x, v.y) from _v, then (v.z, i.x) from the temporary.
  return _mm_castps_si128(
      _mm_shuffle_ps(dest, ix_ix_vz_vz, _MM_SHUFFLE(0, 2, 1, 0)));
}

// Returns _v with its lane number _ith (0 for x ... 3 for w) replaced by the x
// lane of _i. _ith must be in range [0, 3] (asserted).
OZZ_INLINE SimdInt4 SetI(_SimdInt4 _v, _SimdInt4 _i, int _ith) {
  assert(_ith >= 0 && _ith <= 3 && "Invalid index, out of range.");
  const int value = GetX(_i);
  // The lane index is only known at runtime, so the vector is edited through
  // an overlapping array of 4 integers.
  union {
    SimdInt4 vec;
    int lanes[4];
  } pun = {_v};
  pun.lanes[_ith] = value;
  return pun.vec;
}

// Stores the 4 lanes of _v to _i[0..3].
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE void StorePtr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  __m128i* const dest = reinterpret_cast<__m128i*>(_i);
  _mm_store_si128(dest, _v);
}

// Stores the x lane of _v to _i[0].
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE void Store1Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  const int x = _mm_cvtsi128_si32(_v);
  *_i = x;
}

// Stores the x and y lanes of _v to _i[0..1].
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE void Store2Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  // y is moved to the lowest lane so it can be extracted as a scalar.
  const __m128i yyyy = OZZ_SSE_SPLAT_I(_v, 1);
  _i[0] = _mm_cvtsi128_si32(_v);
  _i[1] = _mm_cvtsi128_si32(yyyy);
}

// Stores the x, y and z lanes of _v to _i[0..2].
// _i must be aligned to 16 bytes (asserted).
OZZ_INLINE void Store3Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  // y and z are moved to the lowest lane so they can be extracted as scalars.
  const __m128i yyyy = OZZ_SSE_SPLAT_I(_v, 1);
  const __m128i zzww = _mm_unpackhi_epi32(_v, _v);
  _i[0] = _mm_cvtsi128_si32(_v);
  _i[1] = _mm_cvtsi128_si32(yyyy);
  _i[2] = _mm_cvtsi128_si32(zzww);
}

// Stores the 4 lanes of _v to _i[0..3].
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE void StorePtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  __m128i* const dest = reinterpret_cast<__m128i*>(_i);
  _mm_storeu_si128(dest, _v);
}

// Stores the x lane of _v to _i[0].
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE void Store1PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  const int x = _mm_cvtsi128_si32(_v);
  *_i = x;
}

// Stores the x and y lanes of _v to _i[0..1].
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE void Store2PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  // y is moved to the lowest lane so it can be extracted as a scalar.
  const __m128i yyyy = OZZ_SSE_SPLAT_I(_v, 1);
  _i[0] = _mm_cvtsi128_si32(_v);
  _i[1] = _mm_cvtsi128_si32(yyyy);
}

// Stores the x, y and z lanes of _v to _i[0..2].
// _i only needs to be aligned to 4 bytes (asserted).
OZZ_INLINE void Store3PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  // y and z are moved to the lowest lane so they can be extracted as scalars.
  const __m128i yyyy = OZZ_SSE_SPLAT_I(_v, 1);
  const __m128i zzww = _mm_unpackhi_epi32(_v, _v);
  _i[0] = _mm_cvtsi128_si32(_v);
  _i[1] = _mm_cvtsi128_si32(yyyy);
  _i[2] = _mm_cvtsi128_si32(zzww);
}

// Replicates the x lane of _a to the 4 lanes.
OZZ_INLINE SimdInt4 SplatX(_SimdInt4 _a) {
  const __m128i xxxx = OZZ_SSE_SPLAT_I(_a, 0);
  return xxxx;
}

// Replicates the y lane of _a to the 4 lanes.
OZZ_INLINE SimdInt4 SplatY(_SimdInt4 _a) {
  const __m128i yyyy = OZZ_SSE_SPLAT_I(_a, 1);
  return yyyy;
}

// Replicates the z lane of _a to the 4 lanes.
OZZ_INLINE SimdInt4 SplatZ(_SimdInt4 _a) {
  const __m128i zzzz = OZZ_SSE_SPLAT_I(_a, 2);
  return zzzz;
}

// Replicates the w lane of _a to the 4 lanes.
OZZ_INLINE SimdInt4 SplatW(_SimdInt4 _a) {
  const __m128i wwww = OZZ_SSE_SPLAT_I(_a, 3);
  return wwww;
}

// Returns a vector whose x, y, z, w lanes are respectively the lanes number
// _X, _Y, _Z, _W of _v. Indices are checked at compile time.
template <size_t _X, size_t _Y, size_t _Z, size_t _W>
OZZ_INLINE SimdInt4 Swizzle(_SimdInt4 _v) {
  static_assert(_X < 4 && _Y < 4 && _Z < 4 && _W < 4,
                "Indices must be between 0 and 3");
  // _MM_SHUFFLE takes the lane selectors from the highest lane to the lowest.
  const __m128i shuffled = _mm_shuffle_epi32(_v, _MM_SHUFFLE(_W, _Z, _Y, _X));
  return shuffled;
}

// Identity swizzle: no shuffle is needed, the input is returned as is.
template <>
OZZ_INLINE SimdInt4 Swizzle<0, 1, 2, 3>(_SimdInt4 _v) {
  return _v;
}

// Gathers the most significant bit of each lane of _v into the 4 low bits of
// the returned integer (bit 0 for x ... bit 3 for w).
OZZ_INLINE int MoveMask(_SimdInt4 _v) {
  const __m128 as_float = _mm_castsi128_ps(_v);
  return _mm_movemask_ps(as_float);
}

// Returns true if the most significant bit of the 4 lanes of _v is set.
OZZ_INLINE bool AreAllTrue(_SimdInt4 _v) {
  const int sign_bits = _mm_movemask_ps(_mm_castsi128_ps(_v));
  return sign_bits == 0xf;
}

// Returns true if the most significant bit of lanes x, y and z of _v is set.
// Lane w is ignored.
OZZ_INLINE bool AreAllTrue3(_SimdInt4 _v) {
  const int sign_bits = _mm_movemask_ps(_mm_castsi128_ps(_v));
  const int xyz_bits = sign_bits & 0x7;
  return xyz_bits == 0x7;
}

// Returns true if the most significant bit of lanes x and y of _v is set.
// Lanes z and w are ignored.
OZZ_INLINE bool AreAllTrue2(_SimdInt4 _v) {
  const int sign_bits = _mm_movemask_ps(_mm_castsi128_ps(_v));
  const int xy_bits = sign_bits & 0x3;
  return xy_bits == 0x3;
}

// Returns true if the most significant bit of lane x of _v is set.
// Lanes y, z and w are ignored.
OZZ_INLINE bool AreAllTrue1(_SimdInt4 _v) {
  const int sign_bits = _mm_movemask_ps(_mm_castsi128_ps(_v));
  const int x_bit = sign_bits & 0x1;
  return x_bit == 0x1;
}

// Returns true if the most significant bit of the 4 lanes of _v is cleared.
OZZ_INLINE bool AreAllFalse(_SimdInt4 _v) {
  const int sign_bits = _mm_movemask_ps(_mm_castsi128_ps(_v));
  return sign_bits == 0;
}

// Returns true if the most significant bit of lanes x, y and z of _v is
// cleared. Lane w is ignored.
OZZ_INLINE bool AreAllFalse3(_SimdInt4 _v) {
  const int sign_bits = _mm_movemask_ps(_mm_castsi128_ps(_v));
  const int xyz_bits = sign_bits & 0x7;
  return xyz_bits == 0;
}

// Returns true if the most significant bit of lanes x and y of _v is cleared.
// Lanes z and w are ignored.
OZZ_INLINE bool AreAllFalse2(_SimdInt4 _v) {
  const int sign_bits = _mm_movemask_ps(_mm_castsi128_ps(_v));
  const int xy_bits = sign_bits & 0x3;
  return xy_bits == 0;
}

// Returns true if the most significant bit of lane x of _v is cleared.
// Lanes y, z and w are ignored.
OZZ_INLINE bool AreAllFalse1(_SimdInt4 _v) {
  const int sign_bits = _mm_movemask_ps(_mm_castsi128_ps(_v));
  const int x_bit = sign_bits & 0x1;
  return x_bit == 0;
}

// Horizontal add of the 2 first lanes: returns (x + y, y, z, w).
OZZ_INLINE SimdInt4 HAdd2(_SimdInt4 _v) {
  const __m128i yyyy = OZZ_SSE_SPLAT_I(_v, 1);
  const __m128i sum = _mm_add_epi32(_v, yyyy);
  // Only lane x of the sum is kept, y, z and w are copied from _v.
  const __m128 merged =
      _mm_move_ss(_mm_castsi128_ps(_v), _mm_castsi128_ps(sum));
  return _mm_castps_si128(merged);
}

// Horizontal add of the 3 first lanes: returns (x + y + z, y, z, w).
OZZ_INLINE SimdInt4 HAdd3(_SimdInt4 _v) {
  const __m128i yyyy = OZZ_SSE_SPLAT_I(_v, 1);
  const __m128i zzww = _mm_unpackhi_epi32(_v, _v);
  const __m128i x_plus_y = _mm_add_epi32(_v, yyyy);
  const __m128i sum = _mm_add_epi32(x_plus_y, zzww);
  // Only lane x of the sum is kept, y, z and w are copied from _v.
  const __m128 merged =
      _mm_move_ss(_mm_castsi128_ps(_v), _mm_castsi128_ps(sum));
  return _mm_castps_si128(merged);
}

// Horizontal add of the 4 lanes: returns (x + y + z + w, y, z, w).
OZZ_INLINE SimdInt4 HAdd4(_SimdInt4 _v) {
  const __m128 as_float = _mm_castsi128_ps(_v);
  // (z, w, z, w): high half moved over the low half.
  const __m128i zwzw = _mm_castps_si128(_mm_movehl_ps(as_float, as_float));
  // Lane x holds x + z, lane y holds y + w.
  const __m128i pairs = _mm_add_epi32(_v, zwzw);
  const __m128i total = _mm_add_epi32(pairs, OZZ_SSE_SPLAT_I(pairs, 1));
  // Only lane x of the total is kept, y, z and w are copied from _v.
  return _mm_castps_si128(_mm_move_ss(as_float, _mm_castsi128_ps(total)));
}

// Returns the per-lane absolute value of _v.
OZZ_INLINE SimdInt4 Abs(_SimdInt4 _v) {
#ifdef OZZ_SIMD_SSSE3
  return _mm_abs_epi32(_v);
#else   // OZZ_SIMD_SSSE3
  // Without SSSE3, negative lanes are replaced by 0 - _v.
  const __m128i zeros = _mm_setzero_si128();
  const __m128i is_negative = _mm_cmplt_epi32(_v, zeros);
  const __m128i negated = _mm_sub_epi32(zeros, _v);
  return OZZ_SSE_SELECT_I(is_negative, negated, _v);
#endif  // OZZ_SIMD_SSSE3
}

// Returns the sign bit of each lane of _v: 0x80000000 for negative lanes, 0
// otherwise.
OZZ_INLINE SimdInt4 Sign(_SimdInt4 _v) {
  // The logical right shift isolates the sign bit as bit 0, the left shift
  // puts it back in place with every other bit cleared.
  const __m128i sign_as_lsb = _mm_srli_epi32(_v, 31);
  return _mm_slli_epi32(sign_as_lsb, 31);
}

// Returns the per-lane signed minimum of _a and _b.
OZZ_INLINE SimdInt4 Min(_SimdInt4 _a, _SimdInt4 _b) {
#ifdef OZZ_SIMD_SSE4_1
  return _mm_min_epi32(_a, _b);
#else  // OZZ_SIMD_SSE4_1
  // Without SSE4.1, picks _a in the lanes where _a < _b and _b elsewhere.
  const __m128i a_is_less = _mm_cmplt_epi32(_a, _b);
  return OZZ_SSE_SELECT_I(a_is_less, _a, _b);
#endif  // OZZ_SIMD_SSE4_1
}

// Returns the per-lane signed maximum of _a and _b.
OZZ_INLINE SimdInt4 Max(_SimdInt4 _a, _SimdInt4 _b) {
#ifdef OZZ_SIMD_SSE4_1
  return _mm_max_epi32(_a, _b);
#else  // OZZ_SIMD_SSE4_1
  // Without SSE4.1, picks _a in the lanes where _a > _b and _b elsewhere.
  const __m128i a_is_greater = _mm_cmpgt_epi32(_a, _b);
  return OZZ_SSE_SELECT_I(a_is_greater, _a, _b);
#endif  // OZZ_SIMD_SSE4_1
}

// Returns the per-lane signed minimum of _v and 0.
OZZ_INLINE SimdInt4 Min0(_SimdInt4 _v) {
  const __m128i zeros = _mm_setzero_si128();
#ifdef OZZ_SIMD_SSE4_1
  return _mm_min_epi32(zeros, _v);
#else   // OZZ_SIMD_SSE4_1
  // Without SSE4.1, picks 0 in the lanes where 0 < _v and _v elsewhere.
  const __m128i is_positive = _mm_cmplt_epi32(zeros, _v);
  return OZZ_SSE_SELECT_I(is_positive, zeros, _v);
#endif  // OZZ_SIMD_SSE4_1
}

// Returns the per-lane signed maximum of _v and 0.
OZZ_INLINE SimdInt4 Max0(_SimdInt4 _v) {
  const __m128i zeros = _mm_setzero_si128();
#ifdef OZZ_SIMD_SSE4_1
  return _mm_max_epi32(zeros, _v);
#else   // OZZ_SIMD_SSE4_1
  // Without SSE4.1, picks 0 in the lanes where 0 > _v and _v elsewhere.
  const __m128i is_negative = _mm_cmpgt_epi32(zeros, _v);
  return OZZ_SSE_SELECT_I(is_negative, zeros, _v);
#endif  // OZZ_SIMD_SSE4_1
}

// Clamps each lane of _v between the lower bound _a and the upper bound _b.
// NOTE(review): the SSE4.1 path computes min(max(_a, _v), _b) while the
// fallback computes max(_a, min(_v, _b)). Both agree when _a <= _b, but they
// return _b and _a respectively in lanes where _a > _b.
OZZ_INLINE SimdInt4 Clamp(_SimdInt4 _a, _SimdInt4 _v, _SimdInt4 _b) {
#ifdef OZZ_SIMD_SSE4_1
  return _mm_min_epi32(_mm_max_epi32(_a, _v), _b);
#else   // OZZ_SIMD_SSE4_1
  // Applies the upper bound first...
  const __m128i v_is_less = _mm_cmplt_epi32(_v, _b);
  const __m128i upper_clamped = OZZ_SSE_SELECT_I(v_is_less, _v, _b);
  // ... then the lower bound.
  const __m128i a_is_greater = _mm_cmpgt_epi32(_a, upper_clamped);
  return OZZ_SSE_SELECT_I(a_is_greater, _a, upper_clamped);
#endif  // OZZ_SIMD_SSE4_1
}

// Bitwise selection driven by the mask _b: delegates to OZZ_SSE_SELECT_I with
// _true as the "mask set" operand and _false as the "mask cleared" operand.
OZZ_INLINE SimdInt4 Select(_SimdInt4 _b, _SimdInt4 _true, _SimdInt4 _false) {
  const __m128i selected = OZZ_SSE_SELECT_I(_b, _true, _false);
  return selected;
}

// Returns the bitwise and of _a and _b.
OZZ_INLINE SimdInt4 And(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i result = _mm_and_si128(_a, _b);
  return result;
}

// Returns the bitwise "_a and (not _b)".
OZZ_INLINE SimdInt4 AndNot(_SimdInt4 _a, _SimdInt4 _b) {
  // _mm_andnot_si128 complements its first operand, hence the swapped order.
  const __m128i result = _mm_andnot_si128(_b, _a);
  return result;
}

// Returns the bitwise or of _a and _b.
OZZ_INLINE SimdInt4 Or(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i result = _mm_or_si128(_a, _b);
  return result;
}

// Returns the bitwise exclusive or of _a and _b.
OZZ_INLINE SimdInt4 Xor(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i result = _mm_xor_si128(_a, _b);
  return result;
}

// Returns the bitwise complement of _v.
OZZ_INLINE SimdInt4 Not(_SimdInt4 _v) {
  // Comparing a register with itself sets all its bits; xor-ing with this
  // value flips every bit of _v.
  const __m128i all_bits = _mm_cmpeq_epi32(_v, _v);
  return _mm_xor_si128(_v, all_bits);
}

// Shifts each 32-bit lane of _v left by _bits, shifting in zeros.
OZZ_INLINE SimdInt4 ShiftL(_SimdInt4 _v, int _bits) {
  const __m128i shifted = _mm_slli_epi32(_v, _bits);
  return shifted;
}

// Shifts each 32-bit lane of _v right by _bits, shifting in the sign bit
// (arithmetic shift).
OZZ_INLINE SimdInt4 ShiftR(_SimdInt4 _v, int _bits) {
  const __m128i shifted = _mm_srai_epi32(_v, _bits);
  return shifted;
}

// Shifts each 32-bit lane of _v right by _bits, shifting in zeros (logical
// shift).
OZZ_INLINE SimdInt4 ShiftRu(_SimdInt4 _v, int _bits) {
  const __m128i shifted = _mm_srli_epi32(_v, _bits);
  return shifted;
}

// Per-lane "_a == _b" test. Lanes where the test passes have all bits set,
// the others are cleared.
OZZ_INLINE SimdInt4 CmpEq(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i equal = _mm_cmpeq_epi32(_a, _b);
  return equal;
}

// Per-lane "_a != _b" test. Lanes where the test passes have all bits set,
// the others are cleared.
OZZ_INLINE SimdInt4 CmpNe(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i equal = _mm_cmpeq_epi32(_a, _b);
  // There is no "not equal" instruction: the "equal" mask is complemented by
  // xor-ing it with an all-bits-set register.
  const __m128i all_bits = _mm_cmpeq_epi32(_a, _a);
  return _mm_xor_si128(equal, all_bits);
}

// Per-lane signed "_a < _b" test, evaluated as "_b > _a". Lanes where the test
// passes have all bits set, the others are cleared.
OZZ_INLINE SimdInt4 CmpLt(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i b_greater = _mm_cmpgt_epi32(_b, _a);
  return b_greater;
}

// Per-lane signed "_a <= _b" test, evaluated as "not (_a > _b)". Lanes where
// the test passes have all bits set, the others are cleared.
OZZ_INLINE SimdInt4 CmpLe(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i a_greater = _mm_cmpgt_epi32(_a, _b);
  // Complements the mask by xor-ing it with an all-bits-set register.
  const __m128i all_bits = _mm_cmpeq_epi32(_a, _a);
  return _mm_xor_si128(a_greater, all_bits);
}

// Per-lane signed "_a > _b" test. Lanes where the test passes have all bits
// set, the others are cleared.
OZZ_INLINE SimdInt4 CmpGt(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i a_greater = _mm_cmpgt_epi32(_a, _b);
  return a_greater;
}

// Per-lane signed "_a >= _b" test, evaluated as "not (_b > _a)". Lanes where
// the test passes have all bits set, the others are cleared.
OZZ_INLINE SimdInt4 CmpGe(_SimdInt4 _a, _SimdInt4 _b) {
  const __m128i b_greater = _mm_cmpgt_epi32(_b, _a);
  // Complements the mask by xor-ing it with an all-bits-set register.
  const __m128i all_bits = _mm_cmpeq_epi32(_a, _a);
  return _mm_xor_si128(b_greater, all_bits);
}

// Returns the identity matrix, with columns (1,0,0,0), (0,1,0,0), (0,0,1,0)
// and (0,0,0,1). Constants are built in registers rather than loaded.
OZZ_INLINE Float4x4 Float4x4::identity() {
  const __m128i zeros = _mm_setzero_si128();
  const __m128i all_bits = _mm_cmpeq_epi32(zeros, zeros);
  // 0xffffffff << 25 >> 2 is 0x3f800000, the bit pattern of 1.f.
  const __m128i ones = _mm_srli_epi32(_mm_slli_epi32(all_bits, 25), 2);
  // Byte shifts move a single 1.f to the expected lane and zero the others.
  const __m128i col_x = _mm_srli_si128(ones, 12);
  const __m128i col_y = _mm_slli_si128(col_x, 4);
  const __m128i col_z = _mm_slli_si128(col_x, 8);
  const __m128i col_w = _mm_slli_si128(ones, 12);
  const Float4x4 ret = {{_mm_castsi128_ps(col_x), _mm_castsi128_ps(col_y),
                         _mm_castsi128_ps(col_z), _mm_castsi128_ps(col_w)}};
  return ret;
}

// Returns the transpose of _m: column i of the result is made of the lane i
// of the 4 columns of _m.
OZZ_INLINE Float4x4 Transpose(const Float4x4& _m) {
  // First interleaving pass, on column pairs (0, 2) and (1, 3).
  const __m128 lo_02 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]);
  const __m128 lo_13 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]);
  const __m128 hi_02 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]);
  const __m128 hi_13 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]);
  // Second interleaving pass gathers a full row in each register.
  const __m128 row_x = _mm_unpacklo_ps(lo_02, lo_13);
  const __m128 row_y = _mm_unpackhi_ps(lo_02, lo_13);
  const __m128 row_z = _mm_unpacklo_ps(hi_02, hi_13);
  const __m128 row_w = _mm_unpackhi_ps(hi_02, hi_13);
  const Float4x4 ret = {{row_x, row_y, row_z, row_w}};
  return ret;
}

// Computes the inverse of _m from its minors and determinant.
// _invertible is optional: if not nullptr, it receives the result of the
// "determinant != 0" comparison (only lane x is relevant, see the assertion).
// If it is nullptr, a null determinant triggers the assertion.
// When the determinant is null, the reciprocal estimate is replaced by 0
// before the minors are scaled.
inline Float4x4 Invert(const Float4x4& _m, SimdInt4* _invertible) {
  // Rearranges the matrix by rows:
  // c0 = (m0.x, m1.x, m2.x, m3.x), c1 = (m2.y, m3.y, m0.y, m1.y),
  // c2 = (m0.z, m1.z, m2.z, m3.z), c3 = (m2.w, m3.w, m0.w, m1.w),
  // where mi stands for _m.cols[i]. Note that c1 and c3 are rotated by 2 lanes.
  const __m128 _t0 =
      _mm_shuffle_ps(_m.cols[0], _m.cols[1], _MM_SHUFFLE(1, 0, 1, 0));
  const __m128 _t1 =
      _mm_shuffle_ps(_m.cols[2], _m.cols[3], _MM_SHUFFLE(1, 0, 1, 0));
  const __m128 _t2 =
      _mm_shuffle_ps(_m.cols[0], _m.cols[1], _MM_SHUFFLE(3, 2, 3, 2));
  const __m128 _t3 =
      _mm_shuffle_ps(_m.cols[2], _m.cols[3], _MM_SHUFFLE(3, 2, 3, 2));
  const __m128 c0 = _mm_shuffle_ps(_t0, _t1, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128 c1 = _mm_shuffle_ps(_t1, _t0, _MM_SHUFFLE(3, 1, 3, 1));
  const __m128 c2 = _mm_shuffle_ps(_t2, _t3, _MM_SHUFFLE(2, 0, 2, 0));
  const __m128 c3 = _mm_shuffle_ps(_t3, _t2, _MM_SHUFFLE(3, 1, 3, 1));

  // Accumulates the 4 minor vectors. Each of the 6 sections below multiplies a
  // pair of rows, then uses the product with two lane permutations: 0xB1 swaps
  // lanes inside each pair, 0x4E swaps the low and high halves.
  __m128 minor0, minor1, minor2, minor3, tmp1, tmp2;
  // Products of c2 and c3.
  tmp1 = _mm_mul_ps(c2, c3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor0 = _mm_mul_ps(c1, tmp1);
  minor1 = _mm_mul_ps(c0, tmp1);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor0 = OZZ_MSUB(c1, tmp1, minor0);
  minor1 = OZZ_MSUB(c0, tmp1, minor1);
  minor1 = OZZ_SHUFFLE_PS1(minor1, 0x4E);

  // Products of c1 and c2.
  tmp1 = _mm_mul_ps(c1, c2);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor0 = OZZ_MADD(c3, tmp1, minor0);
  minor3 = _mm_mul_ps(c0, tmp1);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor0 = OZZ_NMADD(c3, tmp1, minor0);
  minor3 = OZZ_MSUB(c0, tmp1, minor3);
  minor3 = OZZ_SHUFFLE_PS1(minor3, 0x4E);

  // Products of c1 (halves swapped) and c3. tmp2 keeps c2 with its halves
  // swapped, it is reused by the following sections.
  tmp1 = _mm_mul_ps(OZZ_SHUFFLE_PS1(c1, 0x4E), c3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  tmp2 = OZZ_SHUFFLE_PS1(c2, 0x4E);
  minor0 = OZZ_MADD(tmp2, tmp1, minor0);
  minor2 = _mm_mul_ps(c0, tmp1);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor0 = OZZ_NMADD(tmp2, tmp1, minor0);
  minor2 = OZZ_MSUB(c0, tmp1, minor2);
  minor2 = OZZ_SHUFFLE_PS1(minor2, 0x4E);

  // Products of c0 and c1.
  tmp1 = _mm_mul_ps(c0, c1);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor2 = OZZ_MADD(c3, tmp1, minor2);
  minor3 = OZZ_MSUB(tmp2, tmp1, minor3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor2 = OZZ_MSUB(c3, tmp1, minor2);
  minor3 = OZZ_NMADD(tmp2, tmp1, minor3);

  // Products of c0 and c3.
  tmp1 = _mm_mul_ps(c0, c3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor1 = OZZ_NMADD(tmp2, tmp1, minor1);
  minor2 = OZZ_MADD(c1, tmp1, minor2);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor1 = OZZ_MADD(tmp2, tmp1, minor1);
  minor2 = OZZ_NMADD(c1, tmp1, minor2);

  // Products of c0 and c2 (halves swapped).
  tmp1 = _mm_mul_ps(c0, tmp2);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0xB1);
  minor1 = OZZ_MADD(c3, tmp1, minor1);
  minor3 = OZZ_NMADD(c1, tmp1, minor3);
  tmp1 = OZZ_SHUFFLE_PS1(tmp1, 0x4E);
  minor1 = OZZ_NMADD(c3, tmp1, minor1);
  minor3 = OZZ_MADD(c1, tmp1, minor3);

  // The determinant is the horizontal sum of c0 * minor0. After the two adds
  // below, it is available in lane x of det.
  __m128 det;
  det = _mm_mul_ps(c0, minor0);
  det = _mm_add_ps(OZZ_SHUFFLE_PS1(det, 0x4E), det);
  det = _mm_add_ss(OZZ_SHUFFLE_PS1(det, 0xB1), det);
  const SimdInt4 invertible = CmpNe(det, simd_float4::zero());
  // Without an output flag, the caller has no way to detect the failure: a
  // non-invertible matrix is then considered a programming error.
  assert((_invertible || AreAllTrue1(invertible)) &&
         "Matrix is not invertible");
  if (_invertible != nullptr) {
    *_invertible = invertible;
  }
  // Reciprocal estimate of the determinant, forced to 0 where det is 0.
  tmp1 = OZZ_SSE_SELECT_F(invertible, RcpEstNR(det), simd_float4::zero());
  // NOTE(review): presumably evaluates 2 * r - det * r * r in lane x, i.e. one
  // more Newton-Raphson refinement of the reciprocal r - confirm against the
  // definition of OZZ_NMADDX.
  det = OZZ_NMADDX(det, _mm_mul_ss(tmp1, tmp1), _mm_add_ss(tmp1, tmp1));
  // Replicates lane x to the 4 lanes, to scale the minors.
  det = OZZ_SHUFFLE_PS1(det, 0x00);

  // Copy the final columns
  const Float4x4 ret = {{_mm_mul_ps(det, minor0), _mm_mul_ps(det, minor1),
                         _mm_mul_ps(det, minor2), _mm_mul_ps(det, minor3)}};
  return ret;
}

Float4x4 Float4x4::Translation(_SimdFloat4 _v) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  const __m128i mask000f = _mm_slli_si128(ffff, 12);
  const __m128i one = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  const __m128i x = _mm_srli_si128(one, 12);
  const Float4x4 ret = {
      {_mm_castsi128_ps(x), _mm_castsi128_ps(_mm_slli_si128(x, 4)),
       _mm_castsi128_ps(_mm_slli_si128(x, 8)),
       OZZ_SSE_SELECT_F(mask000f, _mm_castsi128_ps(one), _v)}};
  return ret;
}  // math

Float4x4 Float4x4::Scaling(_SimdFloat4 _v) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  const __m128i if000 = _mm_srli_si128(ffff, 12);
  const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  const Float4x4 ret = {
      {_mm_and_ps(_v, _mm_castsi128_ps(if000)),
       _mm_and_ps(_v, _mm_castsi128_ps(_mm_slli_si128(if000, 4))),
       _mm_and_ps(_v, _mm_castsi128_ps(_mm_slli_si128(if000, 8))),
       _mm_castsi128_ps(_mm_slli_si128(ione, 12))}};
  return ret;
}  // math

// Returns _m with its last column replaced by
// cols[0] * _v.x + cols[1] * _v.y + cols[2] * _v.z + cols[3].
// The 3 first columns are copied from _m unchanged.
OZZ_INLINE Float4x4 Translate(const Float4x4& _m, _SimdFloat4 _v) {
  const __m128 vx = OZZ_SSE_SPLAT_F(_v, 0);
  const __m128 vy = OZZ_SSE_SPLAT_F(_v, 1);
  const __m128 vz = OZZ_SSE_SPLAT_F(_v, 2);
  // The sum is split in two independent halves that are added at the end.
  const __m128 xy_part = OZZ_MADD(_m.cols[0], vx, _mm_mul_ps(_m.cols[1], vy));
  const __m128 zw_part = OZZ_MADD(_m.cols[2], vz, _m.cols[3]);
  const __m128 translation = _mm_add_ps(xy_part, zw_part);
  const Float4x4 ret = {{_m.cols[0], _m.cols[1], _m.cols[2], translation}};
  return ret;
}

// Returns _m with its 3 first columns respectively multiplied by _v.x, _v.y
// and _v.z. The last column is copied from _m unchanged.
OZZ_INLINE Float4x4 Scale(const Float4x4& _m, _SimdFloat4 _v) {
  const __m128 scaled_x = _mm_mul_ps(_m.cols[0], OZZ_SSE_SPLAT_F(_v, 0));
  const __m128 scaled_y = _mm_mul_ps(_m.cols[1], OZZ_SSE_SPLAT_F(_v, 1));
  const __m128 scaled_z = _mm_mul_ps(_m.cols[2], OZZ_SSE_SPLAT_F(_v, 2));
  const Float4x4 ret = {{scaled_x, scaled_y, scaled_z, _m.cols[3]}};
  return ret;
}

// Multiplies, lane by lane, each of the 4 columns of _m by _v.
OZZ_INLINE Float4x4 ColumnMultiply(const Float4x4& _m, _SimdFloat4 _v) {
  const __m128 col0 = _mm_mul_ps(_m.cols[0], _v);
  const __m128 col1 = _mm_mul_ps(_m.cols[1], _v);
  const __m128 col2 = _mm_mul_ps(_m.cols[2], _v);
  const __m128 col3 = _mm_mul_ps(_m.cols[3], _v);
  const Float4x4 ret = {{col0, col1, col2, col3}};
  return ret;
}

// Tests if the x, y, z part of the 3 first columns of _m is normalized.
// Lanes x, y, z of the result have all bits set if the squared length of the
// matching column is strictly within 1 +/- kNormalizationToleranceSq. Lane w
// is always cleared.
inline SimdInt4 IsNormalized(const Float4x4& _m) {
  // Transposes the x, y and z rows, so that lane i of each row comes from
  // column i.
  const __m128 lo_02 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]);
  const __m128 lo_13 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]);
  const __m128 hi_02 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]);
  const __m128 hi_13 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]);
  const __m128 row_x = _mm_unpacklo_ps(lo_02, lo_13);
  const __m128 row_y = _mm_unpackhi_ps(lo_02, lo_13);
  const __m128 row_z = _mm_unpacklo_ps(hi_02, hi_13);

  // Squared length of the 4 columns at once.
  const __m128 zz = _mm_mul_ps(row_z, row_z);
  const __m128 yy_zz = OZZ_MADD(row_y, row_y, zz);
  const __m128 sq_len = OZZ_MADD(row_x, row_x, yy_zz);

  const __m128 upper = _mm_set_ps1(1.f + kNormalizationToleranceSq);
  const __m128 lower = _mm_set_ps1(1.f - kNormalizationToleranceSq);
  const __m128 in_range =
      _mm_and_ps(_mm_cmplt_ps(sq_len, upper), _mm_cmpgt_ps(sq_len, lower));

  // The last column is not tested: its lane is masked out.
  const __m128 xyz_only = _mm_castsi128_ps(simd_int4::mask_fff0());
  return _mm_castps_si128(_mm_and_ps(in_range, xyz_only));
}

// Tests if the x, y, z part of the 3 first columns of _m is normalized, using
// the estimation tolerance. Lanes x, y, z of the result have all bits set if
// the squared length of the matching column is strictly within
// 1 +/- kNormalizationToleranceEstSq. Lane w is always cleared.
inline SimdInt4 IsNormalizedEst(const Float4x4& _m) {
  // Transposes the x, y and z rows, so that lane i of each row comes from
  // column i.
  const __m128 lo_02 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]);
  const __m128 lo_13 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]);
  const __m128 hi_02 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]);
  const __m128 hi_13 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]);
  const __m128 row_x = _mm_unpacklo_ps(lo_02, lo_13);
  const __m128 row_y = _mm_unpackhi_ps(lo_02, lo_13);
  const __m128 row_z = _mm_unpacklo_ps(hi_02, hi_13);

  // Squared length of the 4 columns at once.
  const __m128 zz = _mm_mul_ps(row_z, row_z);
  const __m128 yy_zz = OZZ_MADD(row_y, row_y, zz);
  const __m128 sq_len = OZZ_MADD(row_x, row_x, yy_zz);

  const __m128 upper = _mm_set_ps1(1.f + kNormalizationToleranceEstSq);
  const __m128 lower = _mm_set_ps1(1.f - kNormalizationToleranceEstSq);
  const __m128 in_range =
      _mm_and_ps(_mm_cmplt_ps(sq_len, upper), _mm_cmpgt_ps(sq_len, lower));

  // The last column is not tested: its lane is masked out.
  const __m128 xyz_only = _mm_castsi128_ps(simd_int4::mask_fff0());
  return _mm_castps_si128(_mm_and_ps(in_range, xyz_only));
}

// Tests if the 3 first columns of _m are orthogonal: the normalized cross
// product of columns 0 and 1 is compared with the normalized column 2 through
// a 3D dot product, which must be strictly within
// 1 +/- kNormalizationToleranceSq. Only lane x of the result is relevant.
OZZ_INLINE SimdInt4 IsOrthogonal(const Float4x4& _m) {
  const __m128 zeros = _mm_setzero_ps();

  // Zero is used as the fallback value if one of the normalizations fails.
  // The dot product is then 0 and _m is considered not orthogonal.
  const SimdFloat4 cross = NormalizeSafe3(Cross3(_m.cols[0], _m.cols[1]), zeros);
  const SimdFloat4 at = NormalizeSafe3(_m.cols[2], zeros);

  SimdFloat4 dot;
  OZZ_SSE_DOT3_F(cross, at, dot);

  // Keeps lane x of the dot product only, the other lanes are zeroed.
  const __m128 dot_x = _mm_move_ss(zeros, dot);
  const __m128 upper = _mm_set_ss(1.f + kNormalizationToleranceSq);
  const __m128 lower = _mm_set_ss(1.f - kNormalizationToleranceSq);
  const __m128 below_upper = _mm_cmplt_ss(dot_x, upper);
  const __m128 above_lower = _mm_cmpgt_ss(dot_x, lower);
  return _mm_castps_si128(_mm_and_ps(below_upper, above_lower));
}

// Converts the rotation part of _m to a quaternion.
// _m must be normalized (as tested by IsNormalizedEst) and orthogonal (as
// tested by IsOrthogonal), both are asserted.
// The 4 candidate results (res0 to res3) are all computed, then the returned
// one is selected without branching from comparisons of the diagonal terms of
// _m and from the sign of its trace.
inline SimdFloat4 ToQuaternion(const Float4x4& _m) {
  assert(AreAllTrue3(IsNormalizedEst(_m)));
  assert(AreAllTrue1(IsOrthogonal(_m)));

  // Prepares constants.
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  const __m128 half = _mm_set1_ps(0.5f);
  const __m128i mask_f000 = _mm_srli_si128(ffff, 12);
  const __m128i mask_000f = _mm_slli_si128(ffff, 12);
  // 0xffffffff << 25 >> 2 is 0x3f800000, the bit pattern of 1.f.
  const __m128 one =
      _mm_castsi128_ps(_mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2));
  const __m128i mask_0f00 = _mm_slli_si128(mask_f000, 4);
  const __m128i mask_00f0 = _mm_slli_si128(mask_f000, 8);

  // Gathers the diagonal terms (cols[0].x, cols[1].y, cols[2].z) in a single
  // register, along with two rotated versions of it.
  const __m128 xx_yy = OZZ_SSE_SELECT_F(mask_0f00, _m.cols[1], _m.cols[0]);
  const __m128 xx_yy_0010 = OZZ_SHUFFLE_PS1(xx_yy, _MM_SHUFFLE(0, 0, 1, 0));
  const __m128 xx_yy_zz_xx =
      OZZ_SSE_SELECT_F(mask_00f0, _m.cols[2], xx_yy_0010);
  const __m128 yy_zz_xx_yy =
      OZZ_SHUFFLE_PS1(xx_yy_zz_xx, _MM_SHUFFLE(1, 0, 2, 1));
  const __m128 zz_xx_yy_zz =
      OZZ_SHUFFLE_PS1(xx_yy_zz_xx, _MM_SHUFFLE(2, 1, 0, 2));

  // Lanes x, y, z of the radicand use "one diagonal term minus the two others"
  // (diag_diff), lane w uses the trace (diag_sum), all of them plus 1.
  const __m128 diag_sum =
      _mm_add_ps(_mm_add_ps(xx_yy_zz_xx, yy_zz_xx_yy), zz_xx_yy_zz);
  const __m128 diag_diff =
      _mm_sub_ps(_mm_sub_ps(xx_yy_zz_xx, yy_zz_xx_yy), zz_xx_yy_zz);
  const __m128 radicand =
      _mm_add_ps(OZZ_SSE_SELECT_F(mask_000f, diag_sum, diag_diff), one);
  // Explicit intrinsic, like every other operation of this function, rather
  // than an arithmetic operator applied to __m128 operands.
  const __m128 invSqrt = _mm_div_ps(one, _mm_sqrt_ps(radicand));

  // Gathers off-diagonal terms, to compute their sums and differences.
  __m128 zy_xz_yx = OZZ_SSE_SELECT_F(mask_00f0, _m.cols[1], _m.cols[0]);
  zy_xz_yx = OZZ_SHUFFLE_PS1(zy_xz_yx, _MM_SHUFFLE(0, 1, 2, 2));
  zy_xz_yx =
      OZZ_SSE_SELECT_F(mask_0f00, OZZ_SSE_SPLAT_F(_m.cols[2], 0), zy_xz_yx);
  __m128 yz_zx_xy = OZZ_SSE_SELECT_F(mask_f000, _m.cols[1], _m.cols[0]);
  yz_zx_xy = OZZ_SHUFFLE_PS1(yz_zx_xy, _MM_SHUFFLE(0, 0, 2, 0));
  yz_zx_xy =
      OZZ_SSE_SELECT_F(mask_f000, OZZ_SSE_SPLAT_F(_m.cols[2], 1), yz_zx_xy);
  const __m128 sum = _mm_add_ps(zy_xz_yx, yz_zx_xy);
  const __m128 diff = _mm_sub_ps(zy_xz_yx, yz_zx_xy);
  // 0.5 / sqrt(radicand), one scale per candidate.
  const __m128 scale = _mm_mul_ps(invSqrt, half);

  // Builds the 4 candidates. Candidate i uses lane i of radicand and scale.
  const __m128 sum0 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 1, 2, 0));
  const __m128 sum1 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 0, 0, 2));
  const __m128 sum2 = OZZ_SHUFFLE_PS1(sum, _MM_SHUFFLE(0, 0, 0, 1));
  __m128 res0 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 0), sum0);
  __m128 res1 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 1), sum1);
  __m128 res2 = OZZ_SSE_SELECT_F(mask_000f, OZZ_SSE_SPLAT_F(diff, 2), sum2);
  res0 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_f000, radicand, res0),
                    OZZ_SSE_SPLAT_F(scale, 0));
  res1 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_0f00, radicand, res1),
                    OZZ_SSE_SPLAT_F(scale, 1));
  res2 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_00f0, radicand, res2),
                    OZZ_SSE_SPLAT_F(scale, 2));
  __m128 res3 = _mm_mul_ps(OZZ_SSE_SELECT_F(mask_000f, radicand, diff),
                           OZZ_SSE_SPLAT_F(scale, 3));

  // Selects the candidate. Later selections override the previous ones:
  // res1 if yy > xx, res2 if zz is greater than both xx and yy, and res3
  // whenever the trace (lane x of diag_sum) is positive.
  const __m128 xx = OZZ_SSE_SPLAT_F(_m.cols[0], 0);
  const __m128 yy = OZZ_SSE_SPLAT_F(_m.cols[1], 1);
  const __m128 zz = OZZ_SSE_SPLAT_F(_m.cols[2], 2);
  const __m128i cond0 = _mm_castps_si128(_mm_cmpgt_ps(yy, xx));
  const __m128i cond1 =
      _mm_castps_si128(_mm_and_ps(_mm_cmpgt_ps(zz, xx), _mm_cmpgt_ps(zz, yy)));
  const __m128i cond2 = _mm_castps_si128(
      _mm_cmpgt_ps(OZZ_SSE_SPLAT_F(diag_sum, 0), _mm_castsi128_ps(zero)));
  __m128 res = OZZ_SSE_SELECT_F(cond0, res1, res0);
  res = OZZ_SSE_SELECT_F(cond1, res2, res);
  res = OZZ_SSE_SELECT_F(cond2, res3, res);

  assert(AreAllTrue1(IsNormalizedEst4(res)));
  return res;
}

// Decomposes the affine matrix _m into a translation, a rotation quaternion
// and a scale, respectively written to *_translation, *_quaternion and
// *_scale.
// Returns false if the decomposition fails, which happens when more than one
// of the x, y, z axes has a (nearly) null scale. Note that *_translation is
// written even if the function fails, while *_quaternion and *_scale are only
// written on success.
// The w lane of *_translation and *_scale is set to 1.
inline bool ToAffine(const Float4x4& _m, SimdFloat4* _translation,
                     SimdFloat4* _quaternion, SimdFloat4* _scale) {
  const __m128 zero = _mm_setzero_ps();
  const __m128 one = simd_float4::one();
  const __m128i fff0 = simd_int4::mask_fff0();
  const __m128 max = _mm_set_ps1(kOrthogonalisationToleranceSq);
  const __m128 min = _mm_set_ps1(-kOrthogonalisationToleranceSq);

  // Extracts translation.
  *_translation = OZZ_SSE_SELECT_F(fff0, _m.cols[3], one);

  // Extracts scale.
  // Transposes the x, y and z rows of _m, so that lane i of each row comes
  // from column i.
  const __m128 m_tmp0 = _mm_unpacklo_ps(_m.cols[0], _m.cols[2]);
  const __m128 m_tmp1 = _mm_unpacklo_ps(_m.cols[1], _m.cols[3]);
  const __m128 m_tmp2 = _mm_unpackhi_ps(_m.cols[0], _m.cols[2]);
  const __m128 m_tmp3 = _mm_unpackhi_ps(_m.cols[1], _m.cols[3]);
  const __m128 m_row0 = _mm_unpacklo_ps(m_tmp0, m_tmp1);
  const __m128 m_row1 = _mm_unpackhi_ps(m_tmp0, m_tmp1);
  const __m128 m_row2 = _mm_unpacklo_ps(m_tmp2, m_tmp3);

  // Lane i of dot is the squared length of the x, y, z part of column i. Its
  // square root is the absolute value of the scale of axis i.
  const __m128 dot = OZZ_MADD(
      m_row0, m_row0, OZZ_MADD(m_row1, m_row1, _mm_mul_ps(m_row2, m_row2)));
  const __m128 abs_scale = _mm_sqrt_ps(dot);

  // Flags the axes whose squared length is within the tolerance around 0.
  const __m128 zero_axis =
      _mm_and_ps(_mm_cmplt_ps(dot, max), _mm_cmpgt_ps(dot, min));

  // Builds an orthonormal matrix in order to support quaternion extraction.
  // The axes are rebuilt from one normalized axis of _m and cross products,
  // starting from an axis that is not flagged as null. Bit i of mask is set if
  // axis i is null (bit 0 for x, bit 1 for y, bit 2 for z). Bit 3 is never
  // tested.
  Float4x4 orthonormal;
  int mask = _mm_movemask_ps(zero_axis);
  if (mask & 1) {
    // x axis is null: fails if y or z is null also, otherwise starts from y.
    if (mask & 6) {
      return false;
    }
    orthonormal.cols[1] = _mm_div_ps(_m.cols[1], OZZ_SSE_SPLAT_F(abs_scale, 1));
    orthonormal.cols[0] = Normalize3(Cross3(orthonormal.cols[1], _m.cols[2]));
    orthonormal.cols[2] =
        Normalize3(Cross3(orthonormal.cols[0], orthonormal.cols[1]));
  } else if (mask & 4) {
    // z axis is null: fails if x or y is null also, otherwise starts from x.
    if (mask & 3) {
      return false;
    }
    orthonormal.cols[0] = _mm_div_ps(_m.cols[0], OZZ_SSE_SPLAT_F(abs_scale, 0));
    orthonormal.cols[2] = Normalize3(Cross3(orthonormal.cols[0], _m.cols[1]));
    orthonormal.cols[1] =
        Normalize3(Cross3(orthonormal.cols[2], orthonormal.cols[0]));
  } else {  // Favor z axis in the default case
    // Bits 0 and 2 were both found cleared by the two conditions above, so
    // this test cannot succeed here.
    if (mask & 5) {
      return false;
    }
    orthonormal.cols[2] = _mm_div_ps(_m.cols[2], OZZ_SSE_SPLAT_F(abs_scale, 2));
    orthonormal.cols[1] = Normalize3(Cross3(orthonormal.cols[2], _m.cols[0]));
    orthonormal.cols[0] =
        Normalize3(Cross3(orthonormal.cols[1], orthonormal.cols[2]));
  }
  orthonormal.cols[3] = simd_float4::w_axis();

  // Get back scale signs in case of reflections
  // Transposes the x, y and z rows of the orthonormal matrix, as done for _m.
  const __m128 o_tmp0 =
      _mm_unpacklo_ps(orthonormal.cols[0], orthonormal.cols[2]);
  const __m128 o_tmp1 =
      _mm_unpacklo_ps(orthonormal.cols[1], orthonormal.cols[3]);
  const __m128 o_tmp2 =
      _mm_unpackhi_ps(orthonormal.cols[0], orthonormal.cols[2]);
  const __m128 o_tmp3 =
      _mm_unpackhi_ps(orthonormal.cols[1], orthonormal.cols[3]);
  const __m128 o_row0 = _mm_unpacklo_ps(o_tmp0, o_tmp1);
  const __m128 o_row1 = _mm_unpackhi_ps(o_tmp0, o_tmp1);
  const __m128 o_row2 = _mm_unpacklo_ps(o_tmp2, o_tmp3);

  // Lane i is the dot product of axis i of the orthonormal matrix with axis i
  // of _m.
  const __m128 scale_dot = OZZ_MADD(
      o_row0, m_row0, OZZ_MADD(o_row1, m_row1, _mm_mul_ps(o_row2, m_row2)));

  // The scale of an axis is negated if this dot product is not strictly
  // positive.
  const __m128i cond = _mm_castps_si128(_mm_cmpgt_ps(scale_dot, zero));
  const __m128 cfalse = _mm_sub_ps(zero, abs_scale);
  const __m128 scale = OZZ_SSE_SELECT_F(cond, abs_scale, cfalse);
  *_scale = OZZ_SSE_SELECT_F(fff0, scale, one);

  // Extracts quaternion.
  *_quaternion = ToQuaternion(orthonormal);
  return true;
}

// Returns a rotation matrix built from the 3 angles stored in lanes x, y and z
// of _v. Angles are consumed by std::cos and std::sin (through Cos and Sin),
// hence expressed in radians. The last column of the matrix is (0,0,0,1).
inline Float4x4 Float4x4::FromEuler(_SimdFloat4 _v) {
  const __m128 cosines = Cos(_v);
  const __m128 sines = Sin(_v);

  // The matrix terms are computed with scalar arithmetic.
  const float cos_x = GetX(cosines);
  const float sin_x = GetX(sines);
  const float cos_y = GetY(cosines);
  const float sin_y = GetY(sines);
  const float cos_z = GetZ(cosines);
  const float sin_z = GetZ(sines);

  // Products shared by several terms.
  const float sin_y_cos_z = sin_y * cos_z;
  const float sin_y_sin_z = sin_y * sin_z;

  const SimdFloat4 col0 =
      simd_float4::Load(cos_x * cos_y, sin_x * sin_z - cos_x * sin_y_cos_z,
                        cos_x * sin_y_sin_z + sin_x * cos_z, 0.f);
  const SimdFloat4 col1 =
      simd_float4::Load(sin_y, cos_y * cos_z, -cos_y * sin_z, 0.f);
  const SimdFloat4 col2 =
      simd_float4::Load(-sin_x * cos_y, sin_x * sin_y_cos_z + cos_x * sin_z,
                        -sin_x * sin_y_sin_z + cos_x * cos_z, 0.f);

  const Float4x4 ret = {{col0, col1, col2, simd_float4::w_axis()}};
  return ret;
}

// Returns a rotation matrix built from the axis stored in lanes x, y, z of
// _axis and the angle stored in lane x of _angle.
// _axis must be normalized, as tested by IsNormalizedEst3 (asserted).
// The angle is consumed by std::sin and std::cos (through SinX and CosX),
// hence expressed in radians. The last column of the matrix is (0,0,0,1).
inline Float4x4 Float4x4::FromAxisAngle(_SimdFloat4 _axis, _SimdFloat4 _angle) {
  assert(AreAllTrue1(IsNormalizedEst3(_axis)));

  // Prepares constants, built in registers rather than loaded.
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  // 0xffffffff << 25 >> 2 is 0x3f800000, the bit pattern of 1.f.
  const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4));
  const __m128 one = _mm_castsi128_ps(ione);
  const __m128 w_axis = _mm_castsi128_ps(_mm_slli_si128(ione, 12));

  // Sine and cosine of the angle, replicated to the 4 lanes.
  const __m128 sin = SplatX(SinX(_angle));
  const __m128 cos = SplatX(CosX(_angle));
  const __m128 one_minus_cos = _mm_sub_ps(one, cos);

  // v0 = (1 - cos) * (y * z, z * x, x * y, w * w) of the axis.
  const __m128 v0 =
      _mm_mul_ps(_mm_mul_ps(one_minus_cos,
                            OZZ_SHUFFLE_PS1(_axis, _MM_SHUFFLE(3, 0, 2, 1))),
                 OZZ_SHUFFLE_PS1(_axis, _MM_SHUFFLE(3, 1, 0, 2)));
  // r0 holds the diagonal terms: (1 - cos) * axis * axis + cos.
  const __m128 r0 =
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(one_minus_cos, _axis), _axis), cos);
  // r1 and r2 hold the off-diagonal terms: v0 +/- sin * axis.
  const __m128 r1 = _mm_add_ps(_mm_mul_ps(sin, _axis), v0);
  const __m128 r2 = _mm_sub_ps(v0, _mm_mul_ps(sin, _axis));
  // Clears the w lane of r0, which provides the 0 of the 3 first columns.
  const __m128 r0fff0 = _mm_and_ps(r0, fff0);
  // Shuffles the terms of r0, r1 and r2 to their final column and lane.
  const __m128 r1r22120 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(2, 1, 2, 0));
  const __m128 v1 = OZZ_SHUFFLE_PS1(r1r22120, _MM_SHUFFLE(0, 3, 2, 1));
  const __m128 r1r20011 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(0, 0, 1, 1));
  const __m128 v2 = OZZ_SHUFFLE_PS1(r1r20011, _MM_SHUFFLE(2, 0, 2, 0));

  const __m128 t0 = _mm_shuffle_ps(r0fff0, v1, _MM_SHUFFLE(1, 0, 3, 0));
  const __m128 t1 = _mm_shuffle_ps(r0fff0, v1, _MM_SHUFFLE(3, 2, 3, 1));
  const Float4x4 ret = {{OZZ_SHUFFLE_PS1(t0, _MM_SHUFFLE(1, 3, 2, 0)),
                         OZZ_SHUFFLE_PS1(t1, _MM_SHUFFLE(1, 3, 0, 2)),
                         _mm_shuffle_ps(v2, r0fff0, _MM_SHUFFLE(3, 2, 1, 0)),
                         w_axis}};
  return ret;
}

// Builds a rotation matrix from a normalized quaternion q = (x, y, z, w).
// The columns assembled by this function are:
//   col0 = (1 - 2yy - 2zz, 2xy + 2wz,     2xz - 2wy,     0)
//   col1 = (2xy - 2wz,     1 - 2xx - 2zz, 2yz + 2wx,     0)
//   col2 = (2xz + 2wy,     2yz - 2wx,     1 - 2xx - 2yy, 0)
//   col3 = (0, 0, 0, 1)
// Lane annotations below assume OZZ_SHUFFLE_PS1(v, imm) shuffles v with itself
// (macro defined outside this function).
inline Float4x4 Float4x4::FromQuaternion(_SimdFloat4 _quaternion) {
  // Precondition check only: the quaternion is expected to be normalized.
  assert(AreAllTrue1(IsNormalizedEst4(_quaternion)));

  // Constants are generated in registers from an all-ones pattern.
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  // (0xffffffff << 25) >> 2 == 0x3f800000, the bit pattern of 1.f, per lane.
  const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  // All-ones in x, y, z and zero in w (xyz mask).
  const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4));
  // (1, 1, 1, 0)
  const __m128 c1110 = _mm_castsi128_ps(_mm_srli_si128(ione, 4));
  // (0, 0, 0, 1)
  const __m128 w_axis = _mm_castsi128_ps(_mm_slli_si128(ione, 12));

  // vsum = 2q, vms = (2xx, 2yy, 2zz, 2ww).
  const __m128 vsum = _mm_add_ps(_quaternion, _quaternion);
  const __m128 vms = _mm_mul_ps(_quaternion, vsum);

  // r0 = diagonal terms:
  //   (1, 1, 1, 0) - (2yy, 2xx, 2xx, 0) - (2zz, 2zz, 2yy, 0)
  // The w lane ends up as 0 and later supplies the 0 of each column.
  const __m128 r0 = _mm_sub_ps(
      _mm_sub_ps(
          c1110,
          _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 0, 0, 1)), fff0)),
      _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 1, 2, 2)), fff0));
  // v0 = (x, x, y, w) * (2z, 2y, 2z, 2w) = (2xz, 2xy, 2yz, 2ww).
  const __m128 v0 =
      _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 1, 0, 0)),
                 OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 2, 1, 2)));
  // v1 = (w, w, w, w) * (2y, 2z, 2x, 2w) = (2wy, 2wz, 2wx, 2ww).
  const __m128 v1 =
      _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 3, 3, 3)),
                 OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 0, 2, 1)));

  // r1 = (2xz + 2wy, 2xy + 2wz, 2yz + 2wx, ...).
  const __m128 r1 = _mm_add_ps(v0, v1);
  // r2 = (2xz - 2wy, 2xy - 2wz, 2yz - 2wx, ...).
  const __m128 r2 = _mm_sub_ps(v0, v1);

  // (r1.y, r1.z, r2.x, r2.y)
  const __m128 r1r21021 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 2, 1));
  // v2 = (r1.y, r2.x, r2.y, r1.z)
  const __m128 v2 = OZZ_SHUFFLE_PS1(r1r21021, _MM_SHUFFLE(1, 3, 2, 0));
  // (r1.x, r1.x, r2.z, r2.z)
  const __m128 r1r22200 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(2, 2, 0, 0));
  // v3 = (r1.x, r2.z, r1.x, r2.z)
  const __m128 v3 = OZZ_SHUFFLE_PS1(r1r22200, _MM_SHUFFLE(2, 0, 2, 0));

  // q0 = (r0.x, 0, r1.y, r2.x), q1 = (r0.y, 0, r2.y, r1.z).
  const __m128 q0 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(1, 0, 3, 0));
  const __m128 q1 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(3, 2, 3, 1));
  // Final permutations give:
  //   col0 = (r0.x, r1.y, r2.x, 0)
  //   col1 = (r2.y, r0.y, r1.z, 0)
  //   col2 = (r1.x, r2.z, r0.z, 0)
  const Float4x4 ret = {{OZZ_SHUFFLE_PS1(q0, _MM_SHUFFLE(1, 3, 2, 0)),
                         OZZ_SHUFFLE_PS1(q1, _MM_SHUFFLE(1, 3, 0, 2)),
                         _mm_shuffle_ps(v3, r0, _MM_SHUFFLE(3, 2, 1, 0)),
                         w_axis}};
  return ret;
}

// Builds an affine matrix from a translation, a normalized quaternion and a
// scale. The three rotation columns are computed exactly as in the quaternion
// conversion (q = (x, y, z, w)):
//   rot0 = (1 - 2yy - 2zz, 2xy + 2wz,     2xz - 2wy,     0)
//   rot1 = (2xy - 2wz,     1 - 2xx - 2zz, 2yz + 2wx,     0)
//   rot2 = (2xz + 2wy,     2yz - 2wx,     1 - 2xx - 2yy, 0)
// then col0/col1/col2 = rot0 * scale.x, rot1 * scale.y, rot2 * scale.z and
// col3 = (translation.x, translation.y, translation.z, 1).
// Lane annotations below assume OZZ_SHUFFLE_PS1(v, imm) shuffles v with itself
// and OZZ_SSE_SPLAT_F(v, i) broadcasts lane i of v (macros defined outside
// this function).
inline Float4x4 Float4x4::FromAffine(_SimdFloat4 _translation,
                                     _SimdFloat4 _quaternion,
                                     _SimdFloat4 _scale) {
  // Precondition check only: the quaternion is expected to be normalized.
  assert(AreAllTrue1(IsNormalizedEst4(_quaternion)));

  // Constants are generated in registers from an all-ones pattern.
  const __m128i zero = _mm_setzero_si128();
  const __m128i ffff = _mm_cmpeq_epi32(zero, zero);
  // (0xffffffff << 25) >> 2 == 0x3f800000, the bit pattern of 1.f, per lane.
  const __m128i ione = _mm_srli_epi32(_mm_slli_epi32(ffff, 25), 2);
  // All-ones in x, y, z and zero in w (xyz mask).
  const __m128 fff0 = _mm_castsi128_ps(_mm_srli_si128(ffff, 4));
  // (1, 1, 1, 0)
  const __m128 c1110 = _mm_castsi128_ps(_mm_srli_si128(ione, 4));

  // vsum = 2q, vms = (2xx, 2yy, 2zz, 2ww).
  const __m128 vsum = _mm_add_ps(_quaternion, _quaternion);
  const __m128 vms = _mm_mul_ps(_quaternion, vsum);

  // r0 = diagonal terms:
  //   (1, 1, 1, 0) - (2yy, 2xx, 2xx, 0) - (2zz, 2zz, 2yy, 0)
  // The w lane ends up as 0 and later supplies the 0 of each column.
  const __m128 r0 = _mm_sub_ps(
      _mm_sub_ps(
          c1110,
          _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 0, 0, 1)), fff0)),
      _mm_and_ps(OZZ_SHUFFLE_PS1(vms, _MM_SHUFFLE(3, 1, 2, 2)), fff0));
  // v0 = (x, x, y, w) * (2z, 2y, 2z, 2w) = (2xz, 2xy, 2yz, 2ww).
  const __m128 v0 =
      _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 1, 0, 0)),
                 OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 2, 1, 2)));
  // v1 = (w, w, w, w) * (2y, 2z, 2x, 2w) = (2wy, 2wz, 2wx, 2ww).
  const __m128 v1 =
      _mm_mul_ps(OZZ_SHUFFLE_PS1(_quaternion, _MM_SHUFFLE(3, 3, 3, 3)),
                 OZZ_SHUFFLE_PS1(vsum, _MM_SHUFFLE(3, 0, 2, 1)));

  // r1 = (2xz + 2wy, 2xy + 2wz, 2yz + 2wx, ...).
  const __m128 r1 = _mm_add_ps(v0, v1);
  // r2 = (2xz - 2wy, 2xy - 2wz, 2yz - 2wx, ...).
  const __m128 r2 = _mm_sub_ps(v0, v1);

  // (r1.y, r1.z, r2.x, r2.y)
  const __m128 r1r21021 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1, 0, 2, 1));
  // v2 = (r1.y, r2.x, r2.y, r1.z)
  const __m128 v2 = OZZ_SHUFFLE_PS1(r1r21021, _MM_SHUFFLE(1, 3, 2, 0));
  // (r1.x, r1.x, r2.z, r2.z)
  const __m128 r1r22200 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(2, 2, 0, 0));
  // v3 = (r1.x, r2.z, r1.x, r2.z)
  const __m128 v3 = OZZ_SHUFFLE_PS1(r1r22200, _MM_SHUFFLE(2, 0, 2, 0));

  // q0 = (r0.x, 0, r1.y, r2.x), q1 = (r0.y, 0, r2.y, r1.z).
  const __m128 q0 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(1, 0, 3, 0));
  const __m128 q1 = _mm_shuffle_ps(r0, v2, _MM_SHUFFLE(3, 2, 3, 1));

  // Each rotation column is permuted into place, then scaled by one scale
  // component. The last column is built as:
  //   unpackhi(t, c1110) = (t.z, 1, t.w, 0)
  //   movelh(t, that)    = (t.x, t.y, t.z, 1)
  // so the w lane of _translation is ignored.
  const Float4x4 ret = {
      {_mm_mul_ps(OZZ_SHUFFLE_PS1(q0, _MM_SHUFFLE(1, 3, 2, 0)),
                  OZZ_SSE_SPLAT_F(_scale, 0)),
       _mm_mul_ps(OZZ_SHUFFLE_PS1(q1, _MM_SHUFFLE(1, 3, 0, 2)),
                  OZZ_SSE_SPLAT_F(_scale, 1)),
       _mm_mul_ps(_mm_shuffle_ps(v3, r0, _MM_SHUFFLE(3, 2, 1, 0)),
                  OZZ_SSE_SPLAT_F(_scale, 2)),
       _mm_movelh_ps(_translation, _mm_unpackhi_ps(_translation, c1110))}};
  return ret;
}

// Transforms _v as a point: computes
//   _v.x * col0 + _v.y * col1 + _v.z * col2 + col3
// The w lane of _v is not read; column 3 is added unscaled.
OZZ_INLINE ozz::math::SimdFloat4 TransformPoint(const ozz::math::Float4x4& _m,
                                                ozz::math::_SimdFloat4 _v) {
  // x * col0, then accumulate y * col1 on top of it.
  const __m128 x_term = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 0), _m.cols[0]);
  const __m128 xy_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 1), _m.cols[1], x_term);
  // z * col2 accumulated onto the last column.
  const __m128 z_plus_col3 =
      OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 2), _m.cols[2], _m.cols[3]);
  return _mm_add_ps(xy_sum, z_plus_col3);
}

// Transforms _v as a direction: computes
//   col0 * _v.x + col1 * _v.y + col2 * _v.z
// Neither the w lane of _v nor column 3 of _m is read.
OZZ_INLINE ozz::math::SimdFloat4 TransformVector(const ozz::math::Float4x4& _m,
                                                 ozz::math::_SimdFloat4 _v) {
  const __m128 x_term = _mm_mul_ps(_m.cols[0], OZZ_SSE_SPLAT_F(_v, 0));
  const __m128 y_term = _mm_mul_ps(_m.cols[1], OZZ_SSE_SPLAT_F(_v, 1));
  // z contribution is accumulated onto the x contribution first.
  const __m128 xz_sum = OZZ_MADD(_m.cols[2], OZZ_SSE_SPLAT_F(_v, 2), x_term);
  return _mm_add_ps(y_term, xz_sum);
}

// Matrix * 4-component vector: computes
//   (_v.x * col0 + _v.y * col1) + (_v.z * col2 + _v.w * col3)
OZZ_INLINE ozz::math::SimdFloat4 operator*(const ozz::math::Float4x4& _m,
                                           ozz::math::_SimdFloat4 _v) {
  // First half: x and y contributions.
  const __m128 x_term = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 0), _m.cols[0]);
  const __m128 xy_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 1), _m.cols[1], x_term);
  // Second half: z and w contributions.
  const __m128 z_term = _mm_mul_ps(OZZ_SSE_SPLAT_F(_v, 2), _m.cols[2]);
  const __m128 zw_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(_v, 3), _m.cols[3], z_term);
  return _mm_add_ps(xy_sum, zw_sum);
}

// Matrix * matrix. Each column of the result is _a applied to the matching
// column of _b:
//   result.cols[c] = (b.x * a.col0 + b.y * a.col1) + (b.z * a.col2 + b.w * a.col3)
// where b = _b.cols[c].
inline ozz::math::Float4x4 operator*(const ozz::math::Float4x4& _a,
                                     const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 product;
  for (int c = 0; c < 4; ++c) {
    const __m128 b_col = _b.cols[c];
    // x and y contributions.
    const __m128 x_term = _mm_mul_ps(OZZ_SSE_SPLAT_F(b_col, 0), _a.cols[0]);
    const __m128 xy_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(b_col, 1), _a.cols[1], x_term);
    // z and w contributions.
    const __m128 z_term = _mm_mul_ps(OZZ_SSE_SPLAT_F(b_col, 2), _a.cols[2]);
    const __m128 zw_sum = OZZ_MADD(OZZ_SSE_SPLAT_F(b_col, 3), _a.cols[3], z_term);
    product.cols[c] = _mm_add_ps(xy_sum, zw_sum);
  }
  return product;
}

// Component-wise matrix addition, performed column by column.
OZZ_INLINE ozz::math::Float4x4 operator+(const ozz::math::Float4x4& _a,
                                         const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 sum;
  for (int c = 0; c < 4; ++c) {
    sum.cols[c] = _mm_add_ps(_a.cols[c], _b.cols[c]);
  }
  return sum;
}

// Component-wise matrix subtraction (_a - _b), performed column by column.
OZZ_INLINE ozz::math::Float4x4 operator-(const ozz::math::Float4x4& _a,
                                         const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 difference;
  for (int c = 0; c < 4; ++c) {
    difference.cols[c] = _mm_sub_ps(_a.cols[c], _b.cols[c]);
  }
  return difference;
}
}  // namespace math
}  // namespace ozz

#if !defined(OZZ_DISABLE_SSE_NATIVE_OPERATORS)
// Per-lane addition of two simd float vectors.
OZZ_INLINE ozz::math::SimdFloat4 operator+(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  const __m128 sum = _mm_add_ps(_a, _b);
  return sum;
}

// Per-lane subtraction (_a - _b) of two simd float vectors.
OZZ_INLINE ozz::math::SimdFloat4 operator-(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  const __m128 difference = _mm_sub_ps(_a, _b);
  return difference;
}

// Per-lane negation of a simd float vector.
// Implemented by flipping the IEEE-754 sign bit of every lane rather than
// computing 0 - _v: the subtraction returns +0 for a +0 input (losing the
// sign of zero) and does not flip the sign of NaN, whereas the sign-bit flip
// is an exact negation for every input.
// NOTE(review): this presumably matches the compiler's built-in vector
// negation used when OZZ_DISABLE_SSE_NATIVE_OPERATORS is defined - confirm.
OZZ_INLINE ozz::math::SimdFloat4 operator-(ozz::math::_SimdFloat4 _v) {
  // Builds 0x80000000 in every lane from an all-ones pattern.
  const __m128i zero = _mm_setzero_si128();
  const __m128i all_ones = _mm_cmpeq_epi32(zero, zero);
  const __m128 sign_mask = _mm_castsi128_ps(_mm_slli_epi32(all_ones, 31));
  return _mm_xor_ps(_v, sign_mask);
}

// Per-lane multiplication of two simd float vectors.
OZZ_INLINE ozz::math::SimdFloat4 operator*(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  const __m128 product = _mm_mul_ps(_a, _b);
  return product;
}

// Per-lane division (_a / _b) of two simd float vectors.
OZZ_INLINE ozz::math::SimdFloat4 operator/(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  const __m128 quotient = _mm_div_ps(_a, _b);
  return quotient;
}
#endif  // !defined(OZZ_DISABLE_SSE_NATIVE_OPERATORS)

namespace ozz {
namespace math {
// Scalar float -> half conversion. Broadcasts _f to every lane, runs the simd
// conversion and keeps the first lane of the result.
OZZ_INLINE uint16_t FloatToHalf(float _f) {
  const SimdInt4 halves = FloatToHalf(_mm_set1_ps(_f));
  return static_cast<uint16_t>(_mm_cvtsi128_si32(halves));
}

// Scalar half -> float conversion. Broadcasts _h to every 32-bit lane, runs
// the simd conversion and returns the first lane of the result.
OZZ_INLINE float HalfToFloat(uint16_t _h) {
  const SimdFloat4 floats = HalfToFloat(_mm_set1_epi32(_h));
  return _mm_cvtss_f32(floats);
}

// Half <-> Float implementation is based on:
// http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/.
// Converts the 4 floats of _f to half precision bit patterns. Each 32-bit lane
// of the result holds one half value: exponent/mantissa in bits 0-14 and the
// sign in bit 15.
// Finite inputs are rescaled, biased for rounding and shifted; inputs too
// large for a half saturate to the half infinity pattern (0x7c00); infinities
// map to 0x7c00 and NaNs to 0x7e00 (before the sign is applied).
inline SimdInt4 FloatToHalf(_SimdFloat4 _f) {
  const __m128i mask_sign = _mm_set1_epi32(0x80000000u);
  // Clears the 12 lowest mantissa bits. Also reused below as an integer bias.
  const __m128i mask_round = _mm_set1_epi32(~0xfffu);
  // Bit pattern of float +infinity (exponent all ones, mantissa zero).
  const __m128i f32infty = _mm_set1_epi32(255 << 23);
  // Float with biased exponent 15, i.e. 2^(15 - 127): multiplying by it
  // rebases the float exponent onto the half exponent bias.
  const __m128 magic = _mm_castsi128_ps(_mm_set1_epi32(15 << 23));
  // Mantissa bit set in the half output for NaN inputs.
  const __m128i nanbit = _mm_set1_epi32(0x200);
  const __m128i infty_as_fp16 = _mm_set1_epi32(0x7c00);
  // Largest allowed scaled value: once the 0x1000 rounding bias is added and
  // the result shifted right by 13 it yields exactly 0x7c00.
  const __m128 clamp = _mm_castsi128_ps(_mm_set1_epi32((31 << 23) - 0x1000));

  // Splits the input into its sign bit and its absolute value.
  const __m128 msign = _mm_castsi128_ps(mask_sign);
  const __m128 justsign = _mm_and_ps(msign, _f);
  const __m128 absf = _mm_xor_ps(_f, justsign);
  const __m128 mround = _mm_castsi128_ps(mask_round);
  const __m128i absf_int = _mm_castps_si128(absf);
  // Integer compares on the absolute value bits: above infinity means NaN.
  const __m128i b_isnan = _mm_cmpgt_epi32(absf_int, f32infty);
  // Strictly below infinity means finite (this includes zero and denormals).
  const __m128i b_isnormal = _mm_cmpgt_epi32(f32infty, _mm_castps_si128(absf));
  // 0x7c00 for infinity, 0x7e00 for NaN.
  const __m128i inf_or_nan =
      _mm_or_si128(_mm_and_si128(b_isnan, nanbit), infty_as_fp16);
  const __m128 fnosticky = _mm_and_ps(absf, mround);
  const __m128 scaled = _mm_mul_ps(fnosticky, magic);
  // Logically, we want PMINSD on "biased", but this should gen better code
  const __m128 clamped = _mm_min_ps(scaled, clamp);
  // Subtracting 0xfffff000 in 32-bit arithmetic adds 0x1000, half of the
  // 13 bits dropped by the shift below (rounding bias).
  const __m128i biased =
      _mm_sub_epi32(_mm_castps_si128(clamped), _mm_castps_si128(mround));
  const __m128i shifted = _mm_srli_epi32(biased, 13);
  // Selects the finite result or the inf/NaN pattern per lane.
  const __m128i normal = _mm_and_si128(shifted, b_isnormal);
  const __m128i not_normal = _mm_andnot_si128(b_isnormal, inf_or_nan);
  const __m128i joined = _mm_or_si128(normal, not_normal);

  // Moves the sign from bit 31 to bit 15.
  const __m128i sign_shift = _mm_srli_epi32(_mm_castps_si128(justsign), 16);
  return _mm_or_si128(joined, sign_shift);
}

// Converts 4 half precision bit patterns to floats. Each 32-bit lane of _h is
// read as one half value: exponent/mantissa in bits 0-14, sign in bit 15.
// Bits 16-31 of each lane do not affect the result.
OZZ_INLINE SimdFloat4 HalfToFloat(_SimdInt4 _h) {
  const __m128i mask_nosign = _mm_set1_epi32(0x7fff);
  // Float with biased exponent 254 - 15, i.e. 2^112 = 2^(127 - 15):
  // multiplying by it rebases the half exponent onto the float exponent bias.
  const __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((254 - 15) << 23));
  // Largest exponent/mantissa pattern of a finite half.
  const __m128i was_infnan = _mm_set1_epi32(0x7bff);
  // Float exponent field with all bits set (infinity / NaN).
  const __m128 exp_infnan = _mm_castsi128_ps(_mm_set1_epi32(255 << 23));

  // Exponent and mantissa without the sign.
  const __m128i expmant = _mm_and_si128(mask_nosign, _h);
  // Aligns the half exponent/mantissa with the float exponent/mantissa fields.
  const __m128i shifted = _mm_slli_epi32(expmant, 13);
  const __m128 scaled = _mm_mul_ps(_mm_castsi128_ps(shifted), magic);
  // Half exponent all ones: the input was infinity or NaN.
  const __m128i b_wasinfnan = _mm_cmpgt_epi32(expmant, was_infnan);
  // _h ^ expmant keeps only the bits above bit 14; after the shift only the
  // original bit 15 remains, now at bit 31 (the float sign bit).
  const __m128i sign = _mm_slli_epi32(_mm_xor_si128(_h, expmant), 16);
  // Forces the float exponent to all ones for infinity / NaN inputs.
  const __m128 infnanexp =
      _mm_and_ps(_mm_castsi128_ps(b_wasinfnan), exp_infnan);
  const __m128 sign_inf = _mm_or_ps(_mm_castsi128_ps(sign), infnanexp);
  return _mm_or_ps(scaled, sign_inf);
}
}  // namespace math
}  // namespace ozz

#undef OZZ_SHUFFLE_PS1
#undef OZZ_SSE_SPLAT_F
#undef OZZ_SSE_HADD2_F
#undef OZZ_SSE_HADD3_F
#undef OZZ_SSE_HADD4_F
#undef OZZ_SSE_DOT2_F
#undef OZZ_SSE_DOT3_F
#undef OZZ_SSE_DOT4_F
#undef OZZ_MADD
#undef OZZ_MSUB
#undef OZZ_NMADD
#undef OZZ_NMSUB
#undef OZZ_MADDX
#undef OZZ_MSUBX
#undef OZZ_NMADDX
#undef OZZ_NMSUBX
#undef OZZ_SSE_SELECT_F
#undef OZZ_SSE_SPLAT_I
#undef OZZ_SSE_SELECT_I
#endif  // OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_SSE_INL_H_