AnimTestbed/3rdparty/ozz-animation/include/ozz/base/maths/internal/simd_math_ref-inl.h

//----------------------------------------------------------------------------//
//                                                                            //
// ozz-animation is hosted at http://github.com/guillaumeblanc/ozz-animation  //
// and distributed under the MIT License (MIT).                               //
//                                                                            //
// Copyright (c) Guillaume Blanc                                              //
//                                                                            //
// Permission is hereby granted, free of charge, to any person obtaining a    //
// copy of this software and associated documentation files (the "Software"), //
// to deal in the Software without restriction, including without limitation  //
// the rights to use, copy, modify, merge, publish, distribute, sublicense,   //
// and/or sell copies of the Software, and to permit persons to whom the      //
// Software is furnished to do so, subject to the following conditions:       //
//                                                                            //
// The above copyright notice and this permission notice shall be included in //
// all copies or substantial portions of the Software.                        //
//                                                                            //
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR //
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   //
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    //
// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER //
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING    //
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER        //
// DEALINGS IN THE SOFTWARE.                                                  //
//                                                                            //
//----------------------------------------------------------------------------//

#ifndef OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_REF_INL_H_
#define OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_REF_INL_H_

// SIMD reference implementation, based on scalar floats.

#include <stdint.h>
#include <cassert>
#include <cmath>
#include <cstddef>

#include "ozz/base/maths/math_constant.h"

namespace ozz {
namespace math {

namespace internal {
// Defines union cast helpers that are used internally for binary logical
// operations.
// Both unions overlay the same two members; they differ only in which member
// comes first, because brace-initialization fills the first member.

// Float-first variant: brace-initialized from a SimdFloat4, then read back
// through .i to access the lanes as integers.
union SimdFI4 {
  SimdFloat4 f;
  SimdInt4 i;
};
// Int-first variant: brace-initialized from integer lanes, then read back
// through .f to obtain the result as floats.
union SimdIF4 {
  SimdInt4 i;
  SimdFloat4 f;
};
}  // namespace internal

// Scalar estimate of 1 / _in, written to _out.
// The initial guess is obtained by an integer subtraction on the raw float
// bits ((0x3f800000 * 2) - bits), and is then refined by two Newton-Raphson
// steps of the form x' = x * (2 - in * x).
// _in is parenthesized and evaluated exactly once, so any expression may be
// passed. Locals carry an ozz_ prefix so they cannot capture identifiers that
// appear in the caller's _in / _out expressions.
#define OZZ_RCP_EST(_in, _out)                             \
  do {                                                     \
    const float ozz_rcp_in = (_in);                        \
    const union {                                          \
      float f;                                             \
      int i;                                               \
    } ozz_rcp_uf = {ozz_rcp_in};                           \
    const union {                                          \
      int i;                                               \
      float f;                                             \
    } ozz_rcp_ui = {(0x3f800000 * 2) - ozz_rcp_uf.i};      \
    const float ozz_rcp_fp =                               \
        ozz_rcp_ui.f * (2.f - ozz_rcp_in * ozz_rcp_ui.f);  \
    (_out) = ozz_rcp_fp * (2.f - ozz_rcp_in * ozz_rcp_fp); \
  } while (void(0), 0)

// Same as OZZ_RCP_EST with one more Newton-Raphson step applied to the result.
// _in is captured once into a local before use: the previous form expanded the
// raw argument text inside the refinement expression, which gave a wrong
// result for arguments such as `a + b` and evaluated the argument twice.
#define OZZ_RCP_EST_NR(_in, _out)                                   \
  do {                                                              \
    const float ozz_rcp_nr_in = (_in);                              \
    float ozz_rcp_nr_fp;                                            \
    OZZ_RCP_EST(ozz_rcp_nr_in, ozz_rcp_nr_fp);                      \
    (_out) = ozz_rcp_nr_fp * (2.f - ozz_rcp_nr_in * ozz_rcp_nr_fp); \
  } while (void(0), 0)

// Scalar estimate of 1 / sqrt(_in), written to _out.
// The initial guess is obtained by integer arithmetic on the raw float bits
// (0x5f3759df - bits / 2), and is then refined by two Newton-Raphson steps of
// the form x' = x * (1.5 - in * 0.5 * x * x).
// _in is parenthesized and evaluated exactly once, so any expression may be
// passed. Locals carry an ozz_ prefix so they cannot capture identifiers that
// appear in the caller's _in / _out expressions.
#define OZZ_RSQRT_EST(_in, _out)                                       \
  do {                                                                 \
    const float ozz_rsqrt_in = (_in);                                  \
    union {                                                            \
      float f;                                                         \
      int i;                                                           \
    } ozz_rsqrt_uf = {ozz_rsqrt_in};                                   \
    union {                                                            \
      int i;                                                           \
      float f;                                                         \
    } ozz_rsqrt_ui = {0x5f3759df - (ozz_rsqrt_uf.i / 2)};              \
    const float ozz_rsqrt_fp =                                         \
        ozz_rsqrt_ui.f *                                               \
        (1.5f - (ozz_rsqrt_in * .5f * ozz_rsqrt_ui.f * ozz_rsqrt_ui.f)); \
    (_out) = ozz_rsqrt_fp *                                            \
             (1.5f - (ozz_rsqrt_in * .5f * ozz_rsqrt_fp * ozz_rsqrt_fp)); \
  } while (void(0), 0)

// Same as OZZ_RSQRT_EST with one more Newton-Raphson step applied to the
// result. _in is captured once into a local before use: the previous form
// expanded the raw argument text inside the refinement expression, which gave
// a wrong result for arguments such as `a + b` and evaluated the argument
// twice.
#define OZZ_RSQRT_EST_NR(_in, _out)                                     \
  do {                                                                  \
    const float ozz_rsqrt_nr_in = (_in);                                \
    float ozz_rsqrt_nr_fp;                                              \
    OZZ_RSQRT_EST(ozz_rsqrt_nr_in, ozz_rsqrt_nr_fp);                    \
    (_out) = ozz_rsqrt_nr_fp *                                          \
             (1.5f -                                                    \
              (ozz_rsqrt_nr_in * .5f * ozz_rsqrt_nr_fp * ozz_rsqrt_nr_fp)); \
  } while (void(0), 0)

// Functions that build SimdFloat4 values from constants, scalars, float
// pointers or integer vectors.
namespace simd_float4 {

// Returns a vector whose four lanes are 0.
OZZ_INLINE SimdFloat4 zero() {
  SimdFloat4 v;
  v.x = v.y = v.z = v.w = 0.f;
  return v;
}

// Returns a vector whose four lanes are 1.
OZZ_INLINE SimdFloat4 one() {
  SimdFloat4 v;
  v.x = v.y = v.z = v.w = 1.f;
  return v;
}

// Returns (1, 0, 0, 0).
OZZ_INLINE SimdFloat4 x_axis() {
  SimdFloat4 v;
  v.x = 1.f;
  v.y = v.z = v.w = 0.f;
  return v;
}

// Returns (0, 1, 0, 0).
OZZ_INLINE SimdFloat4 y_axis() {
  SimdFloat4 v;
  v.y = 1.f;
  v.x = v.z = v.w = 0.f;
  return v;
}

// Returns (0, 0, 1, 0).
OZZ_INLINE SimdFloat4 z_axis() {
  SimdFloat4 v;
  v.z = 1.f;
  v.x = v.y = v.w = 0.f;
  return v;
}

// Returns (0, 0, 0, 1).
OZZ_INLINE SimdFloat4 w_axis() {
  SimdFloat4 v;
  v.w = 1.f;
  v.x = v.y = v.z = 0.f;
  return v;
}

// Returns (_x, _y, _z, _w).
OZZ_INLINE SimdFloat4 Load(float _x, float _y, float _z, float _w) {
  SimdFloat4 v;
  v.x = _x;
  v.y = _y;
  v.z = _z;
  v.w = _w;
  return v;
}

// Returns (_x, 0, 0, 0).
OZZ_INLINE SimdFloat4 LoadX(float _x) {
  SimdFloat4 v;
  v.x = _x;
  v.y = v.z = v.w = 0.f;
  return v;
}

// Returns (_x, _x, _x, _x).
OZZ_INLINE SimdFloat4 Load1(float _x) {
  SimdFloat4 v;
  v.x = v.y = v.z = v.w = _x;
  return v;
}

// Loads four consecutive floats from _f. Asserts that _f is 16-byte aligned.
OZZ_INLINE SimdFloat4 LoadPtr(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  SimdFloat4 v;
  v.x = _f[0];
  v.y = _f[1];
  v.z = _f[2];
  v.w = _f[3];
  return v;
}

// Loads four consecutive floats from _f. Asserts that _f is 4-byte aligned.
OZZ_INLINE SimdFloat4 LoadPtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  SimdFloat4 v;
  v.x = _f[0];
  v.y = _f[1];
  v.z = _f[2];
  v.w = _f[3];
  return v;
}

// Loads one float from _f into x; y, z and w are 0. Asserts that _f is 4-byte
// aligned.
OZZ_INLINE SimdFloat4 LoadXPtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  SimdFloat4 v;
  v.x = *_f;
  v.y = v.z = v.w = 0.f;
  return v;
}

// Loads one float from _f and copies it to all four lanes. Asserts that _f is
// 4-byte aligned.
OZZ_INLINE SimdFloat4 Load1PtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  const float value = *_f;
  SimdFloat4 v;
  v.x = v.y = v.z = v.w = value;
  return v;
}

// Loads two consecutive floats from _f into x and y; z and w are 0. Asserts
// that _f is 4-byte aligned.
OZZ_INLINE SimdFloat4 Load2PtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  SimdFloat4 v;
  v.x = _f[0];
  v.y = _f[1];
  v.z = v.w = 0.f;
  return v;
}

// Loads three consecutive floats from _f into x, y and z; w is 0. Asserts that
// _f is 4-byte aligned.
OZZ_INLINE SimdFloat4 Load3PtrU(const float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  SimdFloat4 v;
  v.x = _f[0];
  v.y = _f[1];
  v.z = _f[2];
  v.w = 0.f;
  return v;
}

// Converts each integer lane of _i to float with static_cast.
OZZ_INLINE SimdFloat4 FromInt(_SimdInt4 _i) {
  SimdFloat4 v;
  v.x = static_cast<float>(_i.x);
  v.y = static_cast<float>(_i.y);
  v.z = static_cast<float>(_i.z);
  v.w = static_cast<float>(_i.w);
  return v;
}
}  // namespace simd_float4

// Returns the x lane of _v.
OZZ_INLINE float GetX(_SimdFloat4 _v) {
  const float lane = _v.x;
  return lane;
}

// Returns the y lane of _v.
OZZ_INLINE float GetY(_SimdFloat4 _v) {
  const float lane = _v.y;
  return lane;
}

// Returns the z lane of _v.
OZZ_INLINE float GetZ(_SimdFloat4 _v) {
  const float lane = _v.z;
  return lane;
}

// Returns the w lane of _v.
OZZ_INLINE float GetW(_SimdFloat4 _v) {
  const float lane = _v.w;
  return lane;
}
// Returns a copy of _v whose x lane is replaced by the x lane of _f.
OZZ_INLINE SimdFloat4 SetX(_SimdFloat4 _v, _SimdFloat4 _f) {
  SimdFloat4 updated = _v;
  updated.x = _f.x;
  return updated;
}

// Returns a copy of _v whose y lane is replaced by the x lane of _f.
OZZ_INLINE SimdFloat4 SetY(_SimdFloat4 _v, _SimdFloat4 _f) {
  SimdFloat4 updated = _v;
  updated.y = _f.x;
  return updated;
}

// Returns a copy of _v whose z lane is replaced by the x lane of _f.
OZZ_INLINE SimdFloat4 SetZ(_SimdFloat4 _v, _SimdFloat4 _f) {
  SimdFloat4 updated = _v;
  updated.z = _f.x;
  return updated;
}

// Returns a copy of _v whose w lane is replaced by the x lane of _f.
OZZ_INLINE SimdFloat4 SetW(_SimdFloat4 _v, _SimdFloat4 _f) {
  SimdFloat4 updated = _v;
  updated.w = _f.x;
  return updated;
}

// Returns a copy of _v whose lane number _ith (0 = x ... 3 = w) is replaced by
// the x lane of _f. Asserts that _ith is within [0, 3].
OZZ_INLINE SimdFloat4 SetI(_SimdFloat4 _v, _SimdFloat4 _f, int _ith) {
  assert(_ith >= 0 && _ith <= 3 && "Invalid index, out of range.");
  SimdFloat4 updated = _v;
  // Lanes are addressed as consecutive floats starting at x.
  float* const lanes = &updated.x;
  lanes[_ith] = _f.x;
  return updated;
}

// Writes the four lanes of _v to _f[0..3]. Asserts that _f is 16-byte aligned.
OZZ_INLINE void StorePtr(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  float* dst = _f;
  *dst++ = _v.x;
  *dst++ = _v.y;
  *dst++ = _v.z;
  *dst = _v.w;
}

// Writes the x lane of _v to _f[0]. Asserts that _f is 16-byte aligned.
OZZ_INLINE void Store1Ptr(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  *_f = _v.x;
}

// Writes the x and y lanes of _v to _f[0..1]. Asserts that _f is 16-byte
// aligned.
OZZ_INLINE void Store2Ptr(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  float* dst = _f;
  *dst++ = _v.x;
  *dst = _v.y;
}

// Writes the x, y and z lanes of _v to _f[0..2]. Asserts that _f is 16-byte
// aligned.
OZZ_INLINE void Store3Ptr(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0xf) && "Invalid alignment");
  float* dst = _f;
  *dst++ = _v.x;
  *dst++ = _v.y;
  *dst = _v.z;
}

// Writes the four lanes of _v to _f[0..3]. Asserts that _f is 4-byte aligned.
OZZ_INLINE void StorePtrU(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  float* dst = _f;
  *dst++ = _v.x;
  *dst++ = _v.y;
  *dst++ = _v.z;
  *dst = _v.w;
}

// Writes the x lane of _v to _f[0]. Asserts that _f is 4-byte aligned.
OZZ_INLINE void Store1PtrU(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  *_f = _v.x;
}

// Writes the x and y lanes of _v to _f[0..1]. Asserts that _f is 4-byte
// aligned.
OZZ_INLINE void Store2PtrU(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  float* dst = _f;
  *dst++ = _v.x;
  *dst = _v.y;
}

// Writes the x, y and z lanes of _v to _f[0..2]. Asserts that _f is 4-byte
// aligned.
OZZ_INLINE void Store3PtrU(_SimdFloat4 _v, float* _f) {
  assert(!(reinterpret_cast<uintptr_t>(_f) & 0x3) && "Invalid alignment");
  float* dst = _f;
  *dst++ = _v.x;
  *dst++ = _v.y;
  *dst = _v.z;
}

// Returns a vector whose four lanes all equal the x lane of _v.
OZZ_INLINE SimdFloat4 SplatX(_SimdFloat4 _v) {
  const float lane = _v.x;
  SimdFloat4 splat;
  splat.x = splat.y = splat.z = splat.w = lane;
  return splat;
}

// Returns a vector whose four lanes all equal the y lane of _v.
OZZ_INLINE SimdFloat4 SplatY(_SimdFloat4 _v) {
  const float lane = _v.y;
  SimdFloat4 splat;
  splat.x = splat.y = splat.z = splat.w = lane;
  return splat;
}

// Returns a vector whose four lanes all equal the z lane of _v.
OZZ_INLINE SimdFloat4 SplatZ(_SimdFloat4 _v) {
  const float lane = _v.z;
  SimdFloat4 splat;
  splat.x = splat.y = splat.z = splat.w = lane;
  return splat;
}

// Returns a vector whose four lanes all equal the w lane of _v.
OZZ_INLINE SimdFloat4 SplatW(_SimdFloat4 _v) {
  const float lane = _v.w;
  SimdFloat4 splat;
  splat.x = splat.y = splat.z = splat.w = lane;
  return splat;
}

// Returns a vector whose x, y, z and w lanes are respectively lanes _X, _Y, _Z
// and _W of _v (0 = x ... 3 = w). Indices are checked at compile time.
template <size_t _X, size_t _Y, size_t _Z, size_t _W>
OZZ_INLINE SimdFloat4 Swizzle(_SimdFloat4 _v) {
  static_assert(_X <= 3 && _Y <= 3 && _Z <= 3 && _W <= 3,
                "Indices must be between 0 and 3");
  // Lanes are addressed as consecutive floats starting at x.
  const float* const lanes = &_v.x;
  SimdFloat4 shuffled;
  shuffled.x = lanes[_X];
  shuffled.y = lanes[_Y];
  shuffled.z = lanes[_Z];
  shuffled.w = lanes[_W];
  return shuffled;
}

// Gathers the x lane of each of the four input vectors into the single output
// vector: _out[0] = (_in[0].x, _in[1].x, _in[2].x, _in[3].x).
OZZ_INLINE void Transpose4x1(const SimdFloat4 _in[4], SimdFloat4 _out[1]) {
  _out[0].x = _in[0].x;
  _out[0].y = _in[1].x;
  _out[0].z = _in[2].x;
  _out[0].w = _in[3].x;
}

// Scatters the four lanes of _in[0] to the x lanes of the four output vectors:
// _out[i].x receives lane i of _in[0]. The y, z and w lanes of every output
// vector are set to 0.
OZZ_INLINE void Transpose1x4(const SimdFloat4 _in[1], SimdFloat4 _out[4]) {
  _out[0].x = _in[0].x;
  _out[0].y = _out[0].z = _out[0].w = 0.f;
  _out[1].x = _in[0].y;
  _out[1].y = _out[1].z = _out[1].w = 0.f;
  _out[2].x = _in[0].z;
  _out[2].y = _out[2].z = _out[2].w = 0.f;
  _out[3].x = _in[0].w;
  _out[3].y = _out[3].z = _out[3].w = 0.f;
}

// Gathers the x and y lanes of the four input vectors into two output vectors:
// _out[0] holds the four x lanes and _out[1] holds the four y lanes, in input
// order.
OZZ_INLINE void Transpose4x2(const SimdFloat4 _in[4], SimdFloat4 _out[2]) {
  _out[0].x = _in[0].x;
  _out[0].y = _in[1].x;
  _out[0].z = _in[2].x;
  _out[0].w = _in[3].x;
  _out[1].x = _in[0].y;
  _out[1].y = _in[1].y;
  _out[1].z = _in[2].y;
  _out[1].w = _in[3].y;
}

// Transposes two input vectors into four output vectors: _out[i].x and
// _out[i].y receive lane i of _in[0] and _in[1] respectively. The z and w
// lanes of every output vector are set to 0.
OZZ_INLINE void Transpose2x4(const SimdFloat4 _in[2], SimdFloat4 _out[4]) {
  _out[0].x = _in[0].x;
  _out[0].y = _in[1].x;
  _out[0].z = _out[0].w = 0.f;
  _out[1].x = _in[0].y;
  _out[1].y = _in[1].y;
  _out[1].z = _out[1].w = 0.f;
  _out[2].x = _in[0].z;
  _out[2].y = _in[1].z;
  _out[2].z = _out[2].w = 0.f;
  _out[3].x = _in[0].w;
  _out[3].y = _in[1].w;
  _out[3].z = _out[3].w = 0.f;
}

// Gathers the x, y and z lanes of the four input vectors into three output
// vectors: _out[0] holds the four x lanes, _out[1] the four y lanes and
// _out[2] the four z lanes, in input order.
OZZ_INLINE void Transpose4x3(const SimdFloat4 _in[4], SimdFloat4 _out[3]) {
  _out[0].x = _in[0].x;
  _out[0].y = _in[1].x;
  _out[0].z = _in[2].x;
  _out[0].w = _in[3].x;
  _out[1].x = _in[0].y;
  _out[1].y = _in[1].y;
  _out[1].z = _in[2].y;
  _out[1].w = _in[3].y;
  _out[2].x = _in[0].z;
  _out[2].y = _in[1].z;
  _out[2].z = _in[2].z;
  _out[2].w = _in[3].z;
}

// Transposes three input vectors into four output vectors: _out[i].x, .y and
// .z receive lane i of _in[0], _in[1] and _in[2] respectively. The w lane of
// every output vector is set to 0.
OZZ_INLINE void Transpose3x4(const SimdFloat4 _in[3], SimdFloat4 _out[4]) {
  _out[0].x = _in[0].x;
  _out[0].y = _in[1].x;
  _out[0].z = _in[2].x;
  _out[0].w = 0.f;
  _out[1].x = _in[0].y;
  _out[1].y = _in[1].y;
  _out[1].z = _in[2].y;
  _out[1].w = 0.f;
  _out[2].x = _in[0].z;
  _out[2].y = _in[1].z;
  _out[2].z = _in[2].z;
  _out[2].w = 0.f;
  _out[3].x = _in[0].w;
  _out[3].y = _in[1].w;
  _out[3].z = _in[2].w;
  _out[3].w = 0.f;
}

// Transposes a 4x4 block: lane j of _out[i] receives lane i of _in[j].
// Outputs are written one input vector at a time while inputs are still being
// read, so the result is only a true transpose when _in and _out do not
// overlap.
OZZ_INLINE void Transpose4x4(const SimdFloat4 _in[4], SimdFloat4 _out[4]) {
  _out[0].x = _in[0].x;
  _out[1].x = _in[0].y;
  _out[2].x = _in[0].z;
  _out[3].x = _in[0].w;
  _out[0].y = _in[1].x;
  _out[1].y = _in[1].y;
  _out[2].y = _in[1].z;
  _out[3].y = _in[1].w;
  _out[0].z = _in[2].x;
  _out[1].z = _in[2].y;
  _out[2].z = _in[2].z;
  _out[3].z = _in[2].w;
  _out[0].w = _in[3].x;
  _out[1].w = _in[3].y;
  _out[2].w = _in[3].z;
  _out[3].w = _in[3].w;
}

// Rearranges 16 input vectors into 16 output vectors. For each lane index i
// (0 = x ... 3 = w), the four outputs _out[4 * i + k] (k = 0..3) collect lane
// i of inputs _in[4 * k + 0] .. _in[4 * k + 3] into their x, y, z and w lanes.
OZZ_INLINE void Transpose16x16(const SimdFloat4 _in[16], SimdFloat4 _out[16]) {
  for (int i = 0; i < 4; ++i) {
    // First output of the group of four that receives lane i.
    const int i4 = i * 4;
    // *(&_in[n].x + i) reads lane i of _in[n], addressing the lanes as
    // consecutive floats starting at x.
    _out[i4 + 0].x = *(&_in[0].x + i);
    _out[i4 + 0].y = *(&_in[1].x + i);
    _out[i4 + 0].z = *(&_in[2].x + i);
    _out[i4 + 0].w = *(&_in[3].x + i);
    _out[i4 + 1].x = *(&_in[4].x + i);
    _out[i4 + 1].y = *(&_in[5].x + i);
    _out[i4 + 1].z = *(&_in[6].x + i);
    _out[i4 + 1].w = *(&_in[7].x + i);
    _out[i4 + 2].x = *(&_in[8].x + i);
    _out[i4 + 2].y = *(&_in[9].x + i);
    _out[i4 + 2].z = *(&_in[10].x + i);
    _out[i4 + 2].w = *(&_in[11].x + i);
    _out[i4 + 3].x = *(&_in[12].x + i);
    _out[i4 + 3].y = *(&_in[13].x + i);
    _out[i4 + 3].z = *(&_in[14].x + i);
    _out[i4 + 3].w = *(&_in[15].x + i);
  }
}

// Per-lane multiply-add: returns _a * _b + _c.
OZZ_INLINE SimdFloat4 MAdd(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
  SimdFloat4 r;
  r.x = _a.x * _b.x + _c.x;
  r.y = _a.y * _b.y + _c.y;
  r.z = _a.z * _b.z + _c.z;
  r.w = _a.w * _b.w + _c.w;
  return r;
}

// Per-lane multiply-subtract: returns _a * _b - _c.
OZZ_INLINE SimdFloat4 MSub(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
  SimdFloat4 r;
  r.x = _a.x * _b.x - _c.x;
  r.y = _a.y * _b.y - _c.y;
  r.z = _a.z * _b.z - _c.z;
  r.w = _a.w * _b.w - _c.w;
  return r;
}

// Per-lane negated multiply-add: returns _c - _a * _b.
OZZ_INLINE SimdFloat4 NMAdd(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
  SimdFloat4 r;
  r.x = _c.x - _a.x * _b.x;
  r.y = _c.y - _a.y * _b.y;
  r.z = _c.z - _a.z * _b.z;
  r.w = _c.w - _a.w * _b.w;
  return r;
}

// Per-lane negated multiply-subtract: returns -_a * _b - _c.
OZZ_INLINE SimdFloat4 NMSub(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _c) {
  SimdFloat4 r;
  r.x = -_a.x * _b.x - _c.x;
  r.y = -_a.y * _b.y - _c.y;
  r.z = -_a.z * _b.z - _c.z;
  r.w = -_a.w * _b.w - _c.w;
  return r;
}

// Divides the x lane of _a by the x lane of _b. The y, z and w lanes of the
// result are copied from _a.
OZZ_INLINE SimdFloat4 DivX(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdFloat4 r;
  r.x = _a.x / _b.x;
  r.y = _a.y;
  r.z = _a.z;
  r.w = _a.w;
  return r;
}

// Stores x + y of _v in the x lane of the result. The y, z and w lanes are
// copied from _v.
OZZ_INLINE SimdFloat4 HAdd2(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = _v.x + _v.y;
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Stores x + y + z of _v in the x lane of the result. The y, z and w lanes are
// copied from _v.
OZZ_INLINE SimdFloat4 HAdd3(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = _v.x + _v.y + _v.z;
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Stores x + y + z + w of _v in the x lane of the result. The y, z and w lanes
// are each set to _v.x.
OZZ_INLINE SimdFloat4 HAdd4(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = _v.x + _v.y + _v.z + _v.w;
  r.y = _v.x;
  r.z = _v.x;
  r.w = _v.x;
  return r;
}

// Stores the dot product of the x and y lanes of _a and _b in the x lane of
// the result. The y, z and w lanes are each set to _a.x.
OZZ_INLINE SimdFloat4 Dot2(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdFloat4 r;
  r.x = _a.x * _b.x + _a.y * _b.y;
  r.y = _a.x;
  r.z = _a.x;
  r.w = _a.x;
  return r;
}

// Stores the dot product of the x, y and z lanes of _a and _b in the x lane of
// the result. The y, z and w lanes are each set to _a.x.
OZZ_INLINE SimdFloat4 Dot3(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdFloat4 r;
  r.x = _a.x * _b.x + _a.y * _b.y + _a.z * _b.z;
  r.y = _a.x;
  r.z = _a.x;
  r.w = _a.x;
  return r;
}

// Stores the dot product of all four lanes of _a and _b in the x lane of the
// result. The y, z and w lanes are each set to _a.x.
OZZ_INLINE SimdFloat4 Dot4(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdFloat4 r;
  r.x = _a.x * _b.x + _a.y * _b.y + _a.z * _b.z + _a.w * _b.w;
  r.y = _a.x;
  r.z = _a.x;
  r.w = _a.x;
  return r;
}

// Stores the cross product of the x, y, z lanes of _a and _b in the x, y, z
// lanes of the result. The w lane is set to _a.x.
OZZ_INLINE SimdFloat4 Cross3(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdFloat4 r;
  r.x = _a.y * _b.z - _a.z * _b.y;
  r.y = _a.z * _b.x - _a.x * _b.z;
  r.z = _a.x * _b.y - _a.y * _b.x;
  r.w = _a.x;
  return r;
}

// Per-lane estimated reciprocal of _v, computed with OZZ_RCP_EST.
OZZ_INLINE SimdFloat4 RcpEst(_SimdFloat4 _v) {
  float rcp_x, rcp_y, rcp_z, rcp_w;
  OZZ_RCP_EST(_v.x, rcp_x);
  OZZ_RCP_EST(_v.y, rcp_y);
  OZZ_RCP_EST(_v.z, rcp_z);
  OZZ_RCP_EST(_v.w, rcp_w);
  const SimdFloat4 estimate = {rcp_x, rcp_y, rcp_z, rcp_w};
  return estimate;
}

// Per-lane estimated reciprocal of _v, computed with OZZ_RCP_EST_NR (the
// estimate followed by one more Newton-Raphson step).
OZZ_INLINE SimdFloat4 RcpEstNR(_SimdFloat4 _v) {
  float rcp_x, rcp_y, rcp_z, rcp_w;
  OZZ_RCP_EST_NR(_v.x, rcp_x);
  OZZ_RCP_EST_NR(_v.y, rcp_y);
  OZZ_RCP_EST_NR(_v.z, rcp_z);
  OZZ_RCP_EST_NR(_v.w, rcp_w);
  const SimdFloat4 estimate = {rcp_x, rcp_y, rcp_z, rcp_w};
  return estimate;
}

// Estimated reciprocal of the x lane of _v, computed with OZZ_RCP_EST. The y,
// z and w lanes of the result are copied from _v.
OZZ_INLINE SimdFloat4 RcpEstX(_SimdFloat4 _v) {
  float rcp_x;
  OZZ_RCP_EST(_v.x, rcp_x);
  const SimdFloat4 estimate = {rcp_x, _v.y, _v.z, _v.w};
  return estimate;
}

// Estimated reciprocal of the x lane of _v, refined with one more
// Newton-Raphson step. It uses OZZ_RCP_EST_NR, the same macro as the 4-lane
// RcpEstNR, so that this "NR" variant is actually more refined than RcpEstX
// (it previously expanded the plain OZZ_RCP_EST estimate).
// The y, z and w lanes of the result are each set to _v.x.
OZZ_INLINE SimdFloat4 RcpEstXNR(_SimdFloat4 _v) {
  SimdFloat4 ret;
  OZZ_RCP_EST_NR(_v.x, ret.x);
  ret.y = _v.x;
  ret.z = _v.x;
  ret.w = _v.x;
  return ret;
}

// Per-lane square root of _v, computed with std::sqrt.
OZZ_INLINE SimdFloat4 Sqrt(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::sqrt(_v.x);
  r.y = std::sqrt(_v.y);
  r.z = std::sqrt(_v.z);
  r.w = std::sqrt(_v.w);
  return r;
}

// Square root of the x lane of _v. The y, z and w lanes of the result are
// copied from _v.
OZZ_INLINE SimdFloat4 SqrtX(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::sqrt(_v.x);
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Per-lane estimated reciprocal square root of _v, computed with
// OZZ_RSQRT_EST.
OZZ_INLINE SimdFloat4 RSqrtEst(_SimdFloat4 _v) {
  float rsqrt_x, rsqrt_y, rsqrt_z, rsqrt_w;
  OZZ_RSQRT_EST(_v.x, rsqrt_x);
  OZZ_RSQRT_EST(_v.y, rsqrt_y);
  OZZ_RSQRT_EST(_v.z, rsqrt_z);
  OZZ_RSQRT_EST(_v.w, rsqrt_w);
  const SimdFloat4 estimate = {rsqrt_x, rsqrt_y, rsqrt_z, rsqrt_w};
  return estimate;
}

// Per-lane estimated reciprocal square root of _v, computed with
// OZZ_RSQRT_EST_NR (the estimate followed by one more Newton-Raphson step).
OZZ_INLINE SimdFloat4 RSqrtEstNR(_SimdFloat4 _v) {
  float rsqrt_x, rsqrt_y, rsqrt_z, rsqrt_w;
  OZZ_RSQRT_EST_NR(_v.x, rsqrt_x);
  OZZ_RSQRT_EST_NR(_v.y, rsqrt_y);
  OZZ_RSQRT_EST_NR(_v.z, rsqrt_z);
  OZZ_RSQRT_EST_NR(_v.w, rsqrt_w);
  const SimdFloat4 estimate = {rsqrt_x, rsqrt_y, rsqrt_z, rsqrt_w};
  return estimate;
}

// Estimated reciprocal square root of the x lane of _v, computed with
// OZZ_RSQRT_EST. The y, z and w lanes of the result are copied from _v.
OZZ_INLINE SimdFloat4 RSqrtEstX(_SimdFloat4 _v) {
  float rsqrt_x;
  OZZ_RSQRT_EST(_v.x, rsqrt_x);
  const SimdFloat4 estimate = {rsqrt_x, _v.y, _v.z, _v.w};
  return estimate;
}

// Estimated reciprocal square root of the x lane of _v, refined with one more
// Newton-Raphson step. It uses OZZ_RSQRT_EST_NR, the same macro as the 4-lane
// RSqrtEstNR, so that this "NR" variant is actually more refined than
// RSqrtEstX (it previously expanded the plain OZZ_RSQRT_EST estimate).
// The y, z and w lanes of the result are each set to _v.x.
OZZ_INLINE SimdFloat4 RSqrtEstXNR(_SimdFloat4 _v) {
  SimdFloat4 ret;
  OZZ_RSQRT_EST_NR(_v.x, ret.x);
  ret.y = _v.x;
  ret.z = _v.x;
  ret.w = _v.x;
  return ret;
}

// Per-lane absolute value of _v, computed with std::abs.
OZZ_INLINE SimdFloat4 Abs(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::abs(_v.x);
  r.y = std::abs(_v.y);
  r.z = std::abs(_v.z);
  r.w = std::abs(_v.w);
  return r;
}

// Returns, for each lane, the raw bits of _v masked with 0x80000000: only the
// top bit of each float lane is kept.
OZZ_INLINE SimdInt4 Sign(_SimdFloat4 _v) {
  const internal::SimdFI4 bits = {_v};
  const int top_bit = static_cast<int>(0x80000000);
  SimdInt4 signs;
  signs.x = bits.i.x & top_bit;
  signs.y = bits.i.y & top_bit;
  signs.z = bits.i.z & top_bit;
  signs.w = bits.i.w & top_bit;
  return signs;
}

// Stores the length of the (x, y) part of _v in the x lane of the result. The
// y, z and w lanes are each set to _v.x.
OZZ_INLINE SimdFloat4 Length2(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y;
  SimdFloat4 r;
  r.x = std::sqrt(sq_len);
  r.y = r.z = r.w = _v.x;
  return r;
}

// Stores the length of the (x, y, z) part of _v in the x lane of the result.
// The y, z and w lanes are each set to _v.x.
OZZ_INLINE SimdFloat4 Length3(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z;
  SimdFloat4 r;
  r.x = std::sqrt(sq_len);
  r.y = r.z = r.w = _v.x;
  return r;
}

// Stores the length of _v (all four lanes) in the x lane of the result. The y,
// z and w lanes are each set to _v.x.
OZZ_INLINE SimdFloat4 Length4(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z + _v.w * _v.w;
  SimdFloat4 r;
  r.x = std::sqrt(sq_len);
  r.y = r.z = r.w = _v.x;
  return r;
}

// Stores the squared length of the (x, y) part of _v in the x lane of the
// result. The y, z and w lanes are each set to _v.x.
OZZ_INLINE SimdFloat4 Length2Sqr(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = _v.x * _v.x + _v.y * _v.y;
  r.y = r.z = r.w = _v.x;
  return r;
}

// Stores the squared length of the (x, y, z) part of _v in the x lane of the
// result. The y, z and w lanes are each set to _v.x.
OZZ_INLINE SimdFloat4 Length3Sqr(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z;
  r.y = r.z = r.w = _v.x;
  return r;
}

// Stores the squared length of _v (all four lanes) in the x lane of the
// result. The y, z and w lanes are each set to _v.x.
OZZ_INLINE SimdFloat4 Length4Sqr(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z + _v.w * _v.w;
  r.y = r.z = r.w = _v.x;
  return r;
}

// Scales the x and y lanes of _v by 1 / length(x, y). The z and w lanes are
// copied from _v. Asserts that the squared length is not 0.
OZZ_INLINE SimdFloat4 Normalize2(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y;
  assert(sq_len != 0.f && "_v is not normalizable");
  const float inv_len = 1.f / std::sqrt(sq_len);
  SimdFloat4 unit;
  unit.x = _v.x * inv_len;
  unit.y = _v.y * inv_len;
  unit.z = _v.z;
  unit.w = _v.w;
  return unit;
}

// Scales the x, y and z lanes of _v by 1 / length(x, y, z). The w lane is
// copied from _v. Asserts that the squared length is not 0.
OZZ_INLINE SimdFloat4 Normalize3(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z;
  assert(sq_len != 0.f && "_v is not normalizable");
  const float inv_len = 1.f / std::sqrt(sq_len);
  SimdFloat4 unit;
  unit.x = _v.x * inv_len;
  unit.y = _v.y * inv_len;
  unit.z = _v.z * inv_len;
  unit.w = _v.w;
  return unit;
}

// Scales all four lanes of _v by 1 / length(_v). Asserts that the squared
// length is not 0.
OZZ_INLINE SimdFloat4 Normalize4(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z + _v.w * _v.w;
  assert(sq_len != 0.f && "_v is not normalizable");
  const float inv_len = 1.f / std::sqrt(sq_len);
  SimdFloat4 unit;
  unit.x = _v.x * inv_len;
  unit.y = _v.y * inv_len;
  unit.z = _v.z * inv_len;
  unit.w = _v.w * inv_len;
  return unit;
}

// Scales the x and y lanes of _v by the OZZ_RSQRT_EST estimate of
// 1 / length(x, y). The z and w lanes are copied from _v. Asserts that the
// squared length is not 0.
OZZ_INLINE SimdFloat4 NormalizeEst2(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y;
  assert(sq_len != 0.f && "_v is not normalizable");
  float inv_len;
  OZZ_RSQRT_EST(sq_len, inv_len);
  SimdFloat4 unit;
  unit.x = _v.x * inv_len;
  unit.y = _v.y * inv_len;
  unit.z = _v.z;
  unit.w = _v.w;
  return unit;
}

// Scales the x, y and z lanes of _v by the OZZ_RSQRT_EST estimate of
// 1 / length(x, y, z). The w lane is copied from _v. Asserts that the squared
// length is not 0.
OZZ_INLINE SimdFloat4 NormalizeEst3(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z;
  assert(sq_len != 0.f && "_v is not normalizable");
  float inv_len;
  OZZ_RSQRT_EST(sq_len, inv_len);
  SimdFloat4 unit;
  unit.x = _v.x * inv_len;
  unit.y = _v.y * inv_len;
  unit.z = _v.z * inv_len;
  unit.w = _v.w;
  return unit;
}

// Scales all four lanes of _v by the OZZ_RSQRT_EST estimate of
// 1 / length(_v). Asserts that the squared length is not 0.
OZZ_INLINE SimdFloat4 NormalizeEst4(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z + _v.w * _v.w;
  assert(sq_len != 0.f && "_v is not normalizable");
  float inv_len;
  OZZ_RSQRT_EST(sq_len, inv_len);
  SimdFloat4 unit;
  unit.x = _v.x * inv_len;
  unit.y = _v.y * inv_len;
  unit.z = _v.z * inv_len;
  unit.w = _v.w * inv_len;
  return unit;
}

// Tests whether the squared length of the (x, y) part of _v is within
// kNormalizationToleranceSq of 1. The x lane of the result is -1 if so and 0
// otherwise; y, z and w are 0.
OZZ_INLINE SimdInt4 IsNormalized2(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y;
  const float deviation = std::abs(sq_len - 1.f);
  SimdInt4 flag = {0, 0, 0, 0};
  if (deviation < kNormalizationToleranceSq) {
    flag.x = -1;
  }
  return flag;
}

// Tests whether the squared length of the (x, y, z) part of _v is within
// kNormalizationToleranceSq of 1. The x lane of the result is -1 if so and 0
// otherwise; y, z and w are 0.
OZZ_INLINE SimdInt4 IsNormalized3(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z;
  const float deviation = std::abs(sq_len - 1.f);
  SimdInt4 flag = {0, 0, 0, 0};
  if (deviation < kNormalizationToleranceSq) {
    flag.x = -1;
  }
  return flag;
}

// Tests whether the squared length of _v (all four lanes) is within
// kNormalizationToleranceSq of 1. The x lane of the result is -1 if so and 0
// otherwise; y, z and w are 0.
OZZ_INLINE SimdInt4 IsNormalized4(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z + _v.w * _v.w;
  const float deviation = std::abs(sq_len - 1.f);
  SimdInt4 flag = {0, 0, 0, 0};
  if (deviation < kNormalizationToleranceSq) {
    flag.x = -1;
  }
  return flag;
}

// Tests whether the squared length of the (x, y) part of _v is within
// kNormalizationToleranceEstSq of 1. The x lane of the result is -1 if so and
// 0 otherwise; y, z and w are 0.
OZZ_INLINE SimdInt4 IsNormalizedEst2(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y;
  const float deviation = std::abs(sq_len - 1.f);
  SimdInt4 flag = {0, 0, 0, 0};
  if (deviation < kNormalizationToleranceEstSq) {
    flag.x = -1;
  }
  return flag;
}

// Tests whether the squared length of the (x, y, z) part of _v is within
// kNormalizationToleranceEstSq of 1. The x lane of the result is -1 if so and
// 0 otherwise; y, z and w are 0.
OZZ_INLINE SimdInt4 IsNormalizedEst3(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z;
  const float deviation = std::abs(sq_len - 1.f);
  SimdInt4 flag = {0, 0, 0, 0};
  if (deviation < kNormalizationToleranceEstSq) {
    flag.x = -1;
  }
  return flag;
}

// Tests whether the squared length of _v (all four lanes) is within
// kNormalizationToleranceEstSq of 1. The x lane of the result is -1 if so and
// 0 otherwise; y, z and w are 0.
OZZ_INLINE SimdInt4 IsNormalizedEst4(_SimdFloat4 _v) {
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z + _v.w * _v.w;
  const float deviation = std::abs(sq_len - 1.f);
  SimdInt4 flag = {0, 0, 0, 0};
  if (deviation < kNormalizationToleranceEstSq) {
    flag.x = -1;
  }
  return flag;
}

// Normalizes the (x, y) part of _v. When its squared length is exactly 0, the
// x and y lanes of _safe are used instead. The z and w lanes are copied from
// _v in both cases.
OZZ_INLINE SimdFloat4 NormalizeSafe2(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalized2(_safe)) && "_safe is not normalized");
  const float sq_len = _v.x * _v.x + _v.y * _v.y;
  SimdFloat4 unit;
  if (sq_len == 0.f) {
    unit.x = _safe.x;
    unit.y = _safe.y;
  } else {
    const float inv_len = 1.f / std::sqrt(sq_len);
    unit.x = _v.x * inv_len;
    unit.y = _v.y * inv_len;
  }
  unit.z = _v.z;
  unit.w = _v.w;
  return unit;
}

// Normalizes the (x, y, z) part of _v. When its squared length is exactly 0,
// the x, y and z lanes of _safe are used instead. The w lane is copied from _v
// in both cases.
OZZ_INLINE SimdFloat4 NormalizeSafe3(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalized3(_safe)) && "_safe is not normalized");
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z;
  SimdFloat4 unit;
  if (sq_len == 0.f) {
    unit.x = _safe.x;
    unit.y = _safe.y;
    unit.z = _safe.z;
  } else {
    const float inv_len = 1.f / std::sqrt(sq_len);
    unit.x = _v.x * inv_len;
    unit.y = _v.y * inv_len;
    unit.z = _v.z * inv_len;
  }
  unit.w = _v.w;
  return unit;
}

// Normalizes all four lanes of _v. When the squared length is exactly 0, _safe
// is returned unchanged.
OZZ_INLINE SimdFloat4 NormalizeSafe4(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalized4(_safe)) && "_safe is not normalized");
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z + _v.w * _v.w;
  if (sq_len != 0.f) {
    const float inv_len = 1.f / std::sqrt(sq_len);
    SimdFloat4 unit;
    unit.x = _v.x * inv_len;
    unit.y = _v.y * inv_len;
    unit.z = _v.z * inv_len;
    unit.w = _v.w * inv_len;
    return unit;
  }
  return _safe;
}

// Normalizes the (x, y) part of _v using the OZZ_RSQRT_EST estimate. When its
// squared length is exactly 0, the x and y lanes of _safe are used instead.
// The z and w lanes are copied from _v in both cases.
OZZ_INLINE SimdFloat4 NormalizeSafeEst2(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalizedEst2(_safe)) && "_safe is not normalized");
  const float sq_len = _v.x * _v.x + _v.y * _v.y;
  SimdFloat4 unit;
  if (sq_len == 0.f) {
    unit.x = _safe.x;
    unit.y = _safe.y;
  } else {
    float inv_len;
    OZZ_RSQRT_EST(sq_len, inv_len);
    unit.x = _v.x * inv_len;
    unit.y = _v.y * inv_len;
  }
  unit.z = _v.z;
  unit.w = _v.w;
  return unit;
}

// Normalizes the (x, y, z) part of _v using the OZZ_RSQRT_EST estimate. When
// its squared length is exactly 0, the x, y and z lanes of _safe are used
// instead. The w lane is copied from _v in both cases.
OZZ_INLINE SimdFloat4 NormalizeSafeEst3(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalizedEst3(_safe)) && "_safe is not normalized");
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z;
  SimdFloat4 unit;
  if (sq_len == 0.f) {
    unit.x = _safe.x;
    unit.y = _safe.y;
    unit.z = _safe.z;
  } else {
    float inv_len;
    OZZ_RSQRT_EST(sq_len, inv_len);
    unit.x = _v.x * inv_len;
    unit.y = _v.y * inv_len;
    unit.z = _v.z * inv_len;
  }
  unit.w = _v.w;
  return unit;
}

// Normalizes all four lanes of _v using the OZZ_RSQRT_EST estimate. When the
// squared length is exactly 0, _safe is returned unchanged.
OZZ_INLINE SimdFloat4 NormalizeSafeEst4(_SimdFloat4 _v, _SimdFloat4 _safe) {
  // assert(AreAllTrue1(IsNormalizedEst4(_safe)) && "_safe is not normalized");
  const float sq_len = _v.x * _v.x + _v.y * _v.y + _v.z * _v.z + _v.w * _v.w;
  if (sq_len != 0.f) {
    float inv_len;
    OZZ_RSQRT_EST(sq_len, inv_len);
    SimdFloat4 unit;
    unit.x = _v.x * inv_len;
    unit.y = _v.y * inv_len;
    unit.z = _v.z * inv_len;
    unit.w = _v.w * inv_len;
    return unit;
  }
  return _safe;
}

// Per-lane linear interpolation: returns (_b - _a) * _alpha + _a.
OZZ_INLINE SimdFloat4 Lerp(_SimdFloat4 _a, _SimdFloat4 _b, _SimdFloat4 _alpha) {
  SimdFloat4 r;
  r.x = (_b.x - _a.x) * _alpha.x + _a.x;
  r.y = (_b.y - _a.y) * _alpha.y + _a.y;
  r.z = (_b.z - _a.z) * _alpha.z + _a.z;
  r.w = (_b.w - _a.w) * _alpha.w + _a.w;
  return r;
}

// Per-lane minimum: each lane is _a's when _a < _b, otherwise _b's.
OZZ_INLINE SimdFloat4 Min(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdFloat4 r;
  r.x = _a.x < _b.x ? _a.x : _b.x;
  r.y = _a.y < _b.y ? _a.y : _b.y;
  r.z = _a.z < _b.z ? _a.z : _b.z;
  r.w = _a.w < _b.w ? _a.w : _b.w;
  return r;
}

// Per-lane maximum: each lane is _a's when _a > _b, otherwise _b's.
OZZ_INLINE SimdFloat4 Max(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdFloat4 r;
  r.x = _a.x > _b.x ? _a.x : _b.x;
  r.y = _a.y > _b.y ? _a.y : _b.y;
  r.z = _a.z > _b.z ? _a.z : _b.z;
  r.w = _a.w > _b.w ? _a.w : _b.w;
  return r;
}

// Per-lane minimum of _v and 0: each lane is _v's when _v < 0, otherwise 0.
OZZ_INLINE SimdFloat4 Min0(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = _v.x < 0.f ? _v.x : 0.f;
  r.y = _v.y < 0.f ? _v.y : 0.f;
  r.z = _v.z < 0.f ? _v.z : 0.f;
  r.w = _v.w < 0.f ? _v.w : 0.f;
  return r;
}

// Per-lane maximum of _v and 0: each lane is _v's when _v > 0, otherwise 0.
OZZ_INLINE SimdFloat4 Max0(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = _v.x > 0.f ? _v.x : 0.f;
  r.y = _v.y > 0.f ? _v.y : 0.f;
  r.z = _v.z > 0.f ? _v.z : 0.f;
  r.w = _v.w > 0.f ? _v.w : 0.f;
  return r;
}

// Per-lane clamp of _v between lower bound _a and upper bound _b. _v is first
// limited from above by _b, then the result is limited from below by _a.
OZZ_INLINE SimdFloat4 Clamp(_SimdFloat4 _a, _SimdFloat4 _v, _SimdFloat4 _b) {
  // Upper bound: min(_v, _b).
  SimdFloat4 capped;
  capped.x = _v.x < _b.x ? _v.x : _b.x;
  capped.y = _v.y < _b.y ? _v.y : _b.y;
  capped.z = _v.z < _b.z ? _v.z : _b.z;
  capped.w = _v.w < _b.w ? _v.w : _b.w;
  // Lower bound: max(_a, capped).
  SimdFloat4 clamped;
  clamped.x = _a.x > capped.x ? _a.x : capped.x;
  clamped.y = _a.y > capped.y ? _a.y : capped.y;
  clamped.z = _a.z > capped.z ? _a.z : capped.z;
  clamped.w = _a.w > capped.w ? _a.w : capped.w;
  return clamped;
}

// Bitwise per-lane select: for every bit set in _b the result takes the
// corresponding bit of _true, otherwise the bit of _false. It is computed on
// the raw lane bits as false ^ (mask & (true ^ false)).
OZZ_INLINE SimdFloat4 Select(_SimdInt4 _b, _SimdFloat4 _true,
                             _SimdFloat4 _false) {
  const internal::SimdFI4 on = {_true};
  const internal::SimdFI4 off = {_false};
  SimdInt4 bits;
  bits.x = off.i.x ^ (_b.x & (on.i.x ^ off.i.x));
  bits.y = off.i.y ^ (_b.y & (on.i.y ^ off.i.y));
  bits.z = off.i.z ^ (_b.z & (on.i.z ^ off.i.z));
  bits.w = off.i.w ^ (_b.w & (on.i.w ^ off.i.w));
  const internal::SimdIF4 cast = {bits};
  return cast.f;
}

// Per-lane equality test: each lane is -1 when _a == _b, otherwise 0.
OZZ_INLINE SimdInt4 CmpEq(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdInt4 mask;
  mask.x = (_a.x == _b.x) ? -1 : 0;
  mask.y = (_a.y == _b.y) ? -1 : 0;
  mask.z = (_a.z == _b.z) ? -1 : 0;
  mask.w = (_a.w == _b.w) ? -1 : 0;
  return mask;
}

// Per-lane inequality test: each lane is -1 when _a != _b, otherwise 0.
OZZ_INLINE SimdInt4 CmpNe(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdInt4 mask;
  mask.x = (_a.x != _b.x) ? -1 : 0;
  mask.y = (_a.y != _b.y) ? -1 : 0;
  mask.z = (_a.z != _b.z) ? -1 : 0;
  mask.w = (_a.w != _b.w) ? -1 : 0;
  return mask;
}

// Per-lane less-than test: each lane is -1 when _a < _b, otherwise 0.
OZZ_INLINE SimdInt4 CmpLt(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdInt4 mask;
  mask.x = (_a.x < _b.x) ? -1 : 0;
  mask.y = (_a.y < _b.y) ? -1 : 0;
  mask.z = (_a.z < _b.z) ? -1 : 0;
  mask.w = (_a.w < _b.w) ? -1 : 0;
  return mask;
}

// Per-lane less-or-equal test: each lane is -1 when _a <= _b, otherwise 0.
OZZ_INLINE SimdInt4 CmpLe(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdInt4 mask;
  mask.x = (_a.x <= _b.x) ? -1 : 0;
  mask.y = (_a.y <= _b.y) ? -1 : 0;
  mask.z = (_a.z <= _b.z) ? -1 : 0;
  mask.w = (_a.w <= _b.w) ? -1 : 0;
  return mask;
}

// Per-lane greater-than test: each lane is -1 when _a > _b, otherwise 0.
OZZ_INLINE SimdInt4 CmpGt(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdInt4 mask;
  mask.x = (_a.x > _b.x) ? -1 : 0;
  mask.y = (_a.y > _b.y) ? -1 : 0;
  mask.z = (_a.z > _b.z) ? -1 : 0;
  mask.w = (_a.w > _b.w) ? -1 : 0;
  return mask;
}

// Per-lane greater-or-equal test: each lane is -1 when _a >= _b, otherwise 0.
OZZ_INLINE SimdInt4 CmpGe(_SimdFloat4 _a, _SimdFloat4 _b) {
  SimdInt4 mask;
  mask.x = (_a.x >= _b.x) ? -1 : 0;
  mask.y = (_a.y >= _b.y) ? -1 : 0;
  mask.z = (_a.z >= _b.z) ? -1 : 0;
  mask.w = (_a.w >= _b.w) ? -1 : 0;
  return mask;
}

// Bitwise AND of the raw lane bits of _a and _b, returned as floats.
OZZ_INLINE SimdFloat4 And(_SimdFloat4 _a, _SimdFloat4 _b) {
  const internal::SimdFI4 lhs = {_a};
  const internal::SimdFI4 rhs = {_b};
  SimdInt4 bits;
  bits.x = lhs.i.x & rhs.i.x;
  bits.y = lhs.i.y & rhs.i.y;
  bits.z = lhs.i.z & rhs.i.z;
  bits.w = lhs.i.w & rhs.i.w;
  const internal::SimdIF4 cast = {bits};
  return cast.f;
}

// Bitwise OR of the raw lane bits of _a and _b, returned as floats.
OZZ_INLINE SimdFloat4 Or(_SimdFloat4 _a, _SimdFloat4 _b) {
  const internal::SimdFI4 lhs = {_a};
  const internal::SimdFI4 rhs = {_b};
  SimdInt4 bits;
  bits.x = lhs.i.x | rhs.i.x;
  bits.y = lhs.i.y | rhs.i.y;
  bits.z = lhs.i.z | rhs.i.z;
  bits.w = lhs.i.w | rhs.i.w;
  const internal::SimdIF4 cast = {bits};
  return cast.f;
}

// Bitwise XOR of the raw lane bits of _a and _b, returned as floats.
OZZ_INLINE SimdFloat4 Xor(_SimdFloat4 _a, _SimdFloat4 _b) {
  const internal::SimdFI4 lhs = {_a};
  const internal::SimdFI4 rhs = {_b};
  SimdInt4 bits;
  bits.x = lhs.i.x ^ rhs.i.x;
  bits.y = lhs.i.y ^ rhs.i.y;
  bits.z = lhs.i.z ^ rhs.i.z;
  bits.w = lhs.i.w ^ rhs.i.w;
  const internal::SimdIF4 cast = {bits};
  return cast.f;
}

// Bitwise AND of the raw lane bits of _a with the integer lanes of _b,
// returned as floats.
OZZ_INLINE SimdFloat4 And(_SimdFloat4 _a, _SimdInt4 _b) {
  const internal::SimdFI4 lhs = {_a};
  SimdInt4 bits;
  bits.x = lhs.i.x & _b.x;
  bits.y = lhs.i.y & _b.y;
  bits.z = lhs.i.z & _b.z;
  bits.w = lhs.i.w & _b.w;
  const internal::SimdIF4 cast = {bits};
  return cast.f;
}

// Bitwise AND of the raw lane bits of _a with the complement of the integer
// lanes of _b (_a & ~_b), returned as floats.
OZZ_INLINE SimdFloat4 AndNot(_SimdFloat4 _a, _SimdInt4 _b) {
  const internal::SimdFI4 lhs = {_a};
  SimdInt4 bits;
  bits.x = lhs.i.x & ~_b.x;
  bits.y = lhs.i.y & ~_b.y;
  bits.z = lhs.i.z & ~_b.z;
  bits.w = lhs.i.w & ~_b.w;
  const internal::SimdIF4 cast = {bits};
  return cast.f;
}

// Bitwise OR of the raw lane bits of _a with the integer lanes of _b, returned
// as floats.
OZZ_INLINE SimdFloat4 Or(_SimdFloat4 _a, _SimdInt4 _b) {
  const internal::SimdFI4 lhs = {_a};
  SimdInt4 bits;
  bits.x = lhs.i.x | _b.x;
  bits.y = lhs.i.y | _b.y;
  bits.z = lhs.i.z | _b.z;
  bits.w = lhs.i.w | _b.w;
  const internal::SimdIF4 cast = {bits};
  return cast.f;
}

// Bitwise XOR of the raw lane bits of _a with the integer lanes of _b,
// returned as floats.
OZZ_INLINE SimdFloat4 Xor(_SimdFloat4 _a, _SimdInt4 _b) {
  const internal::SimdFI4 lhs = {_a};
  SimdInt4 bits;
  bits.x = lhs.i.x ^ _b.x;
  bits.y = lhs.i.y ^ _b.y;
  bits.z = lhs.i.z ^ _b.z;
  bits.w = lhs.i.w ^ _b.w;
  const internal::SimdIF4 cast = {bits};
  return cast.f;
}

// Per-lane cosine of _v, computed with std::cos.
OZZ_INLINE SimdFloat4 Cos(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::cos(_v.x);
  r.y = std::cos(_v.y);
  r.z = std::cos(_v.z);
  r.w = std::cos(_v.w);
  return r;
}

// Cosine of the x lane of _v. The y, z and w lanes are copied from _v.
OZZ_INLINE SimdFloat4 CosX(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::cos(_v.x);
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Per-lane arc cosine of _v, computed with std::acos.
OZZ_INLINE SimdFloat4 ACos(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::acos(_v.x);
  r.y = std::acos(_v.y);
  r.z = std::acos(_v.z);
  r.w = std::acos(_v.w);
  return r;
}

// Arc cosine of the x lane of _v. The y, z and w lanes are copied from _v.
OZZ_INLINE SimdFloat4 ACosX(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::acos(_v.x);
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Per-lane sine of _v, computed with std::sin.
OZZ_INLINE SimdFloat4 Sin(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::sin(_v.x);
  r.y = std::sin(_v.y);
  r.z = std::sin(_v.z);
  r.w = std::sin(_v.w);
  return r;
}

// Sine of the x lane of _v. The y, z and w lanes are copied from _v.
OZZ_INLINE SimdFloat4 SinX(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::sin(_v.x);
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Per-lane arc sine of _v, computed with std::asin.
OZZ_INLINE SimdFloat4 ASin(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::asin(_v.x);
  r.y = std::asin(_v.y);
  r.z = std::asin(_v.z);
  r.w = std::asin(_v.w);
  return r;
}

// Arc sine of the x lane of _v. The y, z and w lanes are copied from _v.
OZZ_INLINE SimdFloat4 ASinX(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::asin(_v.x);
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Per-lane tangent of _v, computed with std::tan.
OZZ_INLINE SimdFloat4 Tan(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::tan(_v.x);
  r.y = std::tan(_v.y);
  r.z = std::tan(_v.z);
  r.w = std::tan(_v.w);
  return r;
}

// Tangent of the x lane of _v. The y, z and w lanes are copied from _v.
OZZ_INLINE SimdFloat4 TanX(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::tan(_v.x);
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Per-lane arc tangent of _v, computed with std::atan.
OZZ_INLINE SimdFloat4 ATan(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::atan(_v.x);
  r.y = std::atan(_v.y);
  r.z = std::atan(_v.z);
  r.w = std::atan(_v.w);
  return r;
}

// Arc tangent of the x lane of _v. The y, z and w lanes are copied from _v.
OZZ_INLINE SimdFloat4 ATanX(_SimdFloat4 _v) {
  SimdFloat4 r;
  r.x = std::atan(_v.x);
  r.y = _v.y;
  r.z = _v.z;
  r.w = _v.w;
  return r;
}

// Constructors and loaders for SimdInt4 values (scalar reference version).
namespace simd_int4 {

// Returns {0, 0, 0, 0}.
OZZ_INLINE SimdInt4 zero() {
  SimdInt4 v;
  v.x = v.y = v.z = v.w = 0;
  return v;
}

// Returns {1, 1, 1, 1}.
OZZ_INLINE SimdInt4 one() {
  SimdInt4 v;
  v.x = v.y = v.z = v.w = 1;
  return v;
}

// Returns {1, 0, 0, 0}.
OZZ_INLINE SimdInt4 x_axis() {
  SimdInt4 v;
  v.x = 1;
  v.y = v.z = v.w = 0;
  return v;
}

// Returns {0, 1, 0, 0}.
OZZ_INLINE SimdInt4 y_axis() {
  SimdInt4 v;
  v.y = 1;
  v.x = v.z = v.w = 0;
  return v;
}

// Returns {0, 0, 1, 0}.
OZZ_INLINE SimdInt4 z_axis() {
  SimdInt4 v;
  v.z = 1;
  v.x = v.y = v.w = 0;
  return v;
}

// Returns {0, 0, 0, 1}.
OZZ_INLINE SimdInt4 w_axis() {
  SimdInt4 v;
  v.w = 1;
  v.x = v.y = v.z = 0;
  return v;
}

// Returns a vector with every bit of every lane set.
OZZ_INLINE SimdInt4 all_true() {
  SimdInt4 v;
  v.x = v.y = v.z = v.w = ~0;
  return v;
}

// Returns a vector with every bit of every lane cleared.
OZZ_INLINE SimdInt4 all_false() {
  SimdInt4 v;
  v.x = v.y = v.z = v.w = 0;
  return v;
}

// Returns a vector with only bit 31 (0x80000000) set in each lane.
OZZ_INLINE SimdInt4 mask_sign() {
  const int sign_bit = static_cast<int>(0x80000000);
  SimdInt4 v;
  v.x = v.y = v.z = v.w = sign_bit;
  return v;
}

// Returns a vector with 0x80000000 in the x, y and z lanes and 0 in w.
OZZ_INLINE SimdInt4 mask_sign_xyz() {
  const int sign_bit = static_cast<int>(0x80000000);
  SimdInt4 v;
  v.x = v.y = v.z = sign_bit;
  v.w = static_cast<int>(0x00000000);
  return v;
}

// Returns a vector with 0x80000000 in the w lane and 0 in x, y and z.
OZZ_INLINE SimdInt4 mask_sign_w() {
  SimdInt4 v;
  v.x = v.y = v.z = static_cast<int>(0x00000000);
  v.w = static_cast<int>(0x80000000);
  return v;
}

// Returns a vector with 0x7fffffff (every bit but bit 31) in each lane.
OZZ_INLINE SimdInt4 mask_not_sign() {
  const int low_bits = static_cast<int>(0x7fffffff);
  SimdInt4 v;
  v.x = v.y = v.z = v.w = low_bits;
  return v;
}

// Returns a mask with all bits set in x, y, z and w.
OZZ_INLINE SimdInt4 mask_ffff() {
  SimdInt4 v;
  v.x = v.y = v.z = v.w = ~0;
  return v;
}

// Returns a mask with all bits set in x, y and z, and w cleared.
OZZ_INLINE SimdInt4 mask_fff0() {
  SimdInt4 v;
  v.x = v.y = v.z = ~0;
  v.w = 0;
  return v;
}

// Returns a mask with every lane cleared.
OZZ_INLINE SimdInt4 mask_0000() {
  SimdInt4 v;
  v.x = v.y = v.z = v.w = 0;
  return v;
}

// Returns a mask with all bits set in x only.
OZZ_INLINE SimdInt4 mask_f000() {
  SimdInt4 v;
  v.x = ~0;
  v.y = v.z = v.w = 0;
  return v;
}

// Returns a mask with all bits set in y only.
OZZ_INLINE SimdInt4 mask_0f00() {
  SimdInt4 v;
  v.y = ~0;
  v.x = v.z = v.w = 0;
  return v;
}

// Returns a mask with all bits set in z only.
OZZ_INLINE SimdInt4 mask_00f0() {
  SimdInt4 v;
  v.z = ~0;
  v.x = v.y = v.w = 0;
  return v;
}

// Returns a mask with all bits set in w only.
OZZ_INLINE SimdInt4 mask_000f() {
  SimdInt4 v;
  v.w = ~0;
  v.x = v.y = v.z = 0;
  return v;
}

// Returns {_x, _y, _z, _w}.
OZZ_INLINE SimdInt4 Load(int _x, int _y, int _z, int _w) {
  SimdInt4 v;
  v.x = _x;
  v.y = _y;
  v.z = _z;
  v.w = _w;
  return v;
}

// Returns {_x, 0, 0, 0}.
OZZ_INLINE SimdInt4 LoadX(int _x) {
  SimdInt4 v;
  v.x = _x;
  v.y = v.z = v.w = 0;
  return v;
}

// Returns {_x, _x, _x, _x}.
OZZ_INLINE SimdInt4 Load1(int _x) {
  SimdInt4 v;
  v.x = v.y = v.z = v.w = _x;
  return v;
}

// Converts four booleans to lanes: true becomes -1, false becomes 0.
OZZ_INLINE SimdInt4 Load(bool _x, bool _y, bool _z, bool _w) {
  SimdInt4 v;
  v.x = _x ? -1 : 0;
  v.y = _y ? -1 : 0;
  v.z = _z ? -1 : 0;
  v.w = _w ? -1 : 0;
  return v;
}

// Converts _x to the x lane (true -> -1, false -> 0); y, z and w are 0.
OZZ_INLINE SimdInt4 LoadX(bool _x) {
  SimdInt4 v;
  v.x = _x ? -1 : 0;
  v.y = v.z = v.w = 0;
  return v;
}

// Replicates _x to all four lanes (true -> -1, false -> 0).
OZZ_INLINE SimdInt4 Load1(bool _x) {
  const int lane = _x ? -1 : 0;
  SimdInt4 v;
  v.x = v.y = v.z = v.w = lane;
  return v;
}

// Loads _i[0..3]. Asserts that _i is 16-byte aligned.
OZZ_INLINE SimdInt4 LoadPtr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  SimdInt4 v;
  v.x = _i[0];
  v.y = _i[1];
  v.z = _i[2];
  v.w = _i[3];
  return v;
}

// Loads *_i into x and zeroes y, z, w. Asserts that _i is 16-byte aligned.
OZZ_INLINE SimdInt4 LoadXPtr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  SimdInt4 v;
  v.x = *_i;
  v.y = v.z = v.w = 0;
  return v;
}

// Replicates *_i to all four lanes. Asserts that _i is 16-byte aligned.
OZZ_INLINE SimdInt4 Load1Ptr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  const int value = *_i;
  SimdInt4 v;
  v.x = v.y = v.z = v.w = value;
  return v;
}

// Loads _i[0..1] into x, y and zeroes z, w. Asserts 16-byte alignment.
OZZ_INLINE SimdInt4 Load2Ptr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  SimdInt4 v;
  v.x = _i[0];
  v.y = _i[1];
  v.z = v.w = 0;
  return v;
}

// Loads _i[0..2] into x, y, z and zeroes w. Asserts 16-byte alignment.
OZZ_INLINE SimdInt4 Load3Ptr(const int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  SimdInt4 v;
  v.x = _i[0];
  v.y = _i[1];
  v.z = _i[2];
  v.w = 0;
  return v;
}

// Loads _i[0..3]. Only asserts 4-byte alignment of _i.
OZZ_INLINE SimdInt4 LoadPtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  SimdInt4 v;
  v.x = _i[0];
  v.y = _i[1];
  v.z = _i[2];
  v.w = _i[3];
  return v;
}

// Loads *_i into x and zeroes y, z, w. Only asserts 4-byte alignment.
OZZ_INLINE SimdInt4 LoadXPtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  SimdInt4 v;
  v.x = *_i;
  v.y = v.z = v.w = 0;
  return v;
}

// Replicates *_i to all four lanes. Only asserts 4-byte alignment.
OZZ_INLINE SimdInt4 Load1PtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  const int value = *_i;
  SimdInt4 v;
  v.x = v.y = v.z = v.w = value;
  return v;
}

// Loads _i[0..1] into x, y and zeroes z, w. Only asserts 4-byte alignment.
OZZ_INLINE SimdInt4 Load2PtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  SimdInt4 v;
  v.x = _i[0];
  v.y = _i[1];
  v.z = v.w = 0;
  return v;
}

// Loads _i[0..2] into x, y, z and zeroes w. Only asserts 4-byte alignment.
OZZ_INLINE SimdInt4 Load3PtrU(const int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  SimdInt4 v;
  v.x = _i[0];
  v.y = _i[1];
  v.z = _i[2];
  v.w = 0;
  return v;
}

// Converts each float lane to int as floor(lane + 0.5).
OZZ_INLINE SimdInt4 FromFloatRound(_SimdFloat4 _f) {
  SimdInt4 v;
  v.x = static_cast<int>(floor(_f.x + .5f));
  v.y = static_cast<int>(floor(_f.y + .5f));
  v.z = static_cast<int>(floor(_f.z + .5f));
  v.w = static_cast<int>(floor(_f.w + .5f));
  return v;
}

// Converts each float lane to int with a plain static_cast (truncation).
OZZ_INLINE SimdInt4 FromFloatTrunc(_SimdFloat4 _f) {
  SimdInt4 v;
  v.x = static_cast<int>(_f.x);
  v.y = static_cast<int>(_f.y);
  v.z = static_cast<int>(_f.z);
  v.w = static_cast<int>(_f.w);
  return v;
}
}  // namespace simd_int4

// Returns the x lane of _v.
OZZ_INLINE int GetX(_SimdInt4 _v) {
  const int lane = _v.x;
  return lane;
}

// Returns the y lane of _v.
OZZ_INLINE int GetY(_SimdInt4 _v) {
  const int lane = _v.y;
  return lane;
}

// Returns the z lane of _v.
OZZ_INLINE int GetZ(_SimdInt4 _v) {
  const int lane = _v.z;
  return lane;
}

// Returns the w lane of _v.
OZZ_INLINE int GetW(_SimdInt4 _v) {
  const int lane = _v.w;
  return lane;
}

// Returns _v with its x lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetX(_SimdInt4 _v, _SimdInt4 _i) {
  SimdInt4 out = _v;
  out.x = _i.x;
  return out;
}

// Returns _v with its y lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetY(_SimdInt4 _v, _SimdInt4 _i) {
  SimdInt4 out = _v;
  out.y = _i.x;
  return out;
}

// Returns _v with its z lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetZ(_SimdInt4 _v, _SimdInt4 _i) {
  SimdInt4 out = _v;
  out.z = _i.x;
  return out;
}

// Returns _v with its w lane replaced by the x lane of _i.
OZZ_INLINE SimdInt4 SetW(_SimdInt4 _v, _SimdInt4 _i) {
  SimdInt4 out = _v;
  out.w = _i.x;
  return out;
}

// Returns _v with lane number _ith (0 = x ... 3 = w) replaced by the x lane of
// _i. Asserts that _ith is within [0, 3].
OZZ_INLINE SimdInt4 SetI(_SimdInt4 _v, _SimdInt4 _i, int _ith) {
  assert(_ith >= 0 && _ith <= 3 && "Invalid index, out of range.");
  SimdInt4 out = {_v.x, _v.y, _v.z, _v.w};
  // Lanes are addressed as an array starting at the x member.
  int* const lanes = &out.x;
  lanes[_ith] = _i.x;
  return out;
}

// Writes x, y, z, w of _v to _i[0..3]. Asserts that _i is 16-byte aligned.
OZZ_INLINE void StorePtr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  int* dst = _i;
  *dst++ = _v.x;
  *dst++ = _v.y;
  *dst++ = _v.z;
  *dst = _v.w;
}

// Writes the x lane of _v to *_i. Asserts that _i is 16-byte aligned.
OZZ_INLINE void Store1Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  *_i = _v.x;
}

// Writes x and y of _v to _i[0..1]. Asserts that _i is 16-byte aligned.
OZZ_INLINE void Store2Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  int* dst = _i;
  *dst++ = _v.x;
  *dst = _v.y;
}

// Writes x, y and z of _v to _i[0..2]. Asserts that _i is 16-byte aligned.
OZZ_INLINE void Store3Ptr(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0xf) && "Invalid alignment");
  int* dst = _i;
  *dst++ = _v.x;
  *dst++ = _v.y;
  *dst = _v.z;
}

// Writes x, y, z, w of _v to _i[0..3]. Only asserts 4-byte alignment of _i.
OZZ_INLINE void StorePtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  int* dst = _i;
  *dst++ = _v.x;
  *dst++ = _v.y;
  *dst++ = _v.z;
  *dst = _v.w;
}

// Writes the x lane of _v to *_i. Only asserts 4-byte alignment of _i.
OZZ_INLINE void Store1PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  *_i = _v.x;
}

// Writes x and y of _v to _i[0..1]. Only asserts 4-byte alignment of _i.
OZZ_INLINE void Store2PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  int* dst = _i;
  *dst++ = _v.x;
  *dst = _v.y;
}

// Writes x, y and z of _v to _i[0..2]. Only asserts 4-byte alignment of _i.
OZZ_INLINE void Store3PtrU(_SimdInt4 _v, int* _i) {
  assert(!(uintptr_t(_i) & 0x3) && "Invalid alignment");
  int* dst = _i;
  *dst++ = _v.x;
  *dst++ = _v.y;
  *dst = _v.z;
}

// Replicates the x lane of _a to all four lanes.
OZZ_INLINE SimdInt4 SplatX(_SimdInt4 _a) {
  const int lane = _a.x;
  SimdInt4 out;
  out.x = out.y = out.z = out.w = lane;
  return out;
}

// Replicates the y lane of _a to all four lanes.
OZZ_INLINE SimdInt4 SplatY(_SimdInt4 _a) {
  const int lane = _a.y;
  SimdInt4 out;
  out.x = out.y = out.z = out.w = lane;
  return out;
}

// Replicates the z lane of _a to all four lanes.
OZZ_INLINE SimdInt4 SplatZ(_SimdInt4 _a) {
  const int lane = _a.z;
  SimdInt4 out;
  out.x = out.y = out.z = out.w = lane;
  return out;
}

// Replicates the w lane of _a to all four lanes.
OZZ_INLINE SimdInt4 SplatW(_SimdInt4 _a) {
  const int lane = _a.w;
  SimdInt4 out;
  out.x = out.y = out.z = out.w = lane;
  return out;
}

// Returns a vector whose x, y, z, w lanes are lanes number _X, _Y, _Z, _W of
// _v (0 = x ... 3 = w). Indices are checked at compile time.
template <size_t _X, size_t _Y, size_t _Z, size_t _W>
OZZ_INLINE SimdInt4 Swizzle(_SimdInt4 _v) {
  static_assert(_X <= 3 && _Y <= 3 && _Z <= 3 && _W <= 3,
                "Indices must be between 0 and 3");
  // Lanes are addressed as an array starting at the x member.
  const int* lanes = &_v.x;
  SimdInt4 out;
  out.x = lanes[_X];
  out.y = lanes[_Y];
  out.z = lanes[_Z];
  out.w = lanes[_W];
  return out;
}

// Packs bit 31 of each lane into the low four bits of the result:
// bit 0 comes from x, bit 1 from y, bit 2 from z and bit 3 from w.
OZZ_INLINE int MoveMask(_SimdInt4 _v) {
  const unsigned int x_bit = (_v.x & 0x80000000) >> 31;
  const unsigned int y_bit = (_v.y & 0x80000000) >> 30;
  const unsigned int z_bit = (_v.z & 0x80000000) >> 29;
  const unsigned int w_bit = (_v.w & 0x80000000) >> 28;
  return x_bit | y_bit | z_bit | w_bit;
}

// Returns true if none of the x, y, z, w lanes of _v is zero.
OZZ_INLINE bool AreAllTrue(_SimdInt4 _v) {
  const bool any_zero = _v.x == 0 || _v.y == 0 || _v.z == 0 || _v.w == 0;
  return !any_zero;
}

// Returns true if none of the x, y, z lanes of _v is zero (w is ignored).
OZZ_INLINE bool AreAllTrue3(_SimdInt4 _v) {
  const bool any_zero = _v.x == 0 || _v.y == 0 || _v.z == 0;
  return !any_zero;
}

// Returns true if neither the x nor the y lane of _v is zero.
OZZ_INLINE bool AreAllTrue2(_SimdInt4 _v) {
  const bool any_zero = _v.x == 0 || _v.y == 0;
  return !any_zero;
}

// Returns true if the x lane of _v is not zero.
OZZ_INLINE bool AreAllTrue1(_SimdInt4 _v) {
  const bool x_is_zero = _v.x == 0;
  return !x_is_zero;
}

// Returns true if the x, y, z and w lanes of _v are all zero.
OZZ_INLINE bool AreAllFalse(_SimdInt4 _v) {
  const bool any_set = _v.x != 0 || _v.y != 0 || _v.z != 0 || _v.w != 0;
  return !any_set;
}

// Returns true if the x, y and z lanes of _v are all zero (w is ignored).
OZZ_INLINE bool AreAllFalse3(_SimdInt4 _v) {
  const bool any_set = _v.x != 0 || _v.y != 0 || _v.z != 0;
  return !any_set;
}

// Returns true if both the x and y lanes of _v are zero.
OZZ_INLINE bool AreAllFalse2(_SimdInt4 _v) {
  const bool any_set = _v.x != 0 || _v.y != 0;
  return !any_set;
}

// Returns true if the x lane of _v is zero.
OZZ_INLINE bool AreAllFalse1(_SimdInt4 _v) {
  const bool x_is_set = _v.x != 0;
  return !x_is_set;
}

// Per-lane multiply-add: returns _a * _b + _addend.
OZZ_INLINE SimdInt4 MAdd(_SimdInt4 _a, _SimdInt4 _b, _SimdInt4 _addend) {
  SimdInt4 out;
  out.x = _a.x * _b.x + _addend.x;
  out.y = _a.y * _b.y + _addend.y;
  out.z = _a.z * _b.z + _addend.z;
  out.w = _a.w * _b.w + _addend.w;
  return out;
}

// Returns _a with its x lane replaced by _a.x / _b.x (integer division);
// y, z and w are copied from _a.
OZZ_INLINE SimdInt4 DivX(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out = _a;
  out.x = _a.x / _b.x;
  return out;
}

// Returns _v with its x lane replaced by x + y; y, z and w are copied.
OZZ_INLINE SimdInt4 HAdd2(_SimdInt4 _v) {
  SimdInt4 out = _v;
  out.x = _v.x + _v.y;
  return out;
}

// Returns _v with its x lane replaced by x + y + z; y, z and w are copied.
OZZ_INLINE SimdInt4 HAdd3(_SimdInt4 _v) {
  SimdInt4 out = _v;
  out.x = _v.x + _v.y + _v.z;
  return out;
}

// Returns _v with its x lane replaced by x + y + z + w; y, z and w are copied.
OZZ_INLINE SimdInt4 HAdd4(_SimdInt4 _v) {
  SimdInt4 out = _v;
  out.x = _v.x + _v.y + _v.z + _v.w;
  return out;
}

// Stores the 2-lane dot product of _a and _b in the x lane of the result;
// y, z and w are copied from _a.
OZZ_INLINE SimdInt4 Dot2(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out = _a;
  out.x = _a.x * _b.x + _a.y * _b.y;
  return out;
}

// Stores the 3-lane dot product of _a and _b in the x lane of the result;
// y, z and w are copied from _a.
OZZ_INLINE SimdInt4 Dot3(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out = _a;
  out.x = _a.x * _b.x + _a.y * _b.y + _a.z * _b.z;
  return out;
}

// Stores the 4-lane dot product of _a and _b in the x lane of the result;
// y, z and w are copied from _a.
OZZ_INLINE SimdInt4 Dot4(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out = _a;
  out.x = _a.x * _b.x + _a.y * _b.y + _a.z * _b.z + _a.w * _b.w;
  return out;
}
// Per-lane absolute value, computed without branches as (v + m) ^ m where
// m = v >> 31.
OZZ_INLINE SimdInt4 Abs(_SimdInt4 _v) {
  const int mask_x = _v.x >> 31;
  const int mask_y = _v.y >> 31;
  const int mask_z = _v.z >> 31;
  const int mask_w = _v.w >> 31;
  SimdInt4 out;
  out.x = (_v.x + mask_x) ^ mask_x;
  out.y = (_v.y + mask_y) ^ mask_y;
  out.z = (_v.z + mask_z) ^ mask_z;
  out.w = (_v.w + mask_w) ^ mask_w;
  return out;
}

// Keeps only bit 31 (0x80000000) of each lane of _v.
OZZ_INLINE SimdInt4 Sign(_SimdInt4 _v) {
  const int sign_bit = static_cast<int>(0x80000000);
  SimdInt4 out;
  out.x = _v.x & sign_bit;
  out.y = _v.y & sign_bit;
  out.z = _v.z & sign_bit;
  out.w = _v.w & sign_bit;
  return out;
}

// Returns the per-lane minimum of _a and _b.
OZZ_INLINE SimdInt4 Min(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = (_b.x <= _a.x) ? _b.x : _a.x;
  out.y = (_b.y <= _a.y) ? _b.y : _a.y;
  out.z = (_b.z <= _a.z) ? _b.z : _a.z;
  out.w = (_b.w <= _a.w) ? _b.w : _a.w;
  return out;
}

// Returns the per-lane maximum of _a and _b.
OZZ_INLINE SimdInt4 Max(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = (_b.x >= _a.x) ? _b.x : _a.x;
  out.y = (_b.y >= _a.y) ? _b.y : _a.y;
  out.z = (_b.z >= _a.z) ? _b.z : _a.z;
  out.w = (_b.w >= _a.w) ? _b.w : _a.w;
  return out;
}

// Returns the per-lane minimum of _v and 0.
OZZ_INLINE SimdInt4 Min0(_SimdInt4 _v) {
  SimdInt4 out;
  out.x = (_v.x >= 0) ? 0 : _v.x;
  out.y = (_v.y >= 0) ? 0 : _v.y;
  out.z = (_v.z >= 0) ? 0 : _v.z;
  out.w = (_v.w >= 0) ? 0 : _v.w;
  return out;
}

// Returns the per-lane maximum of _v and 0.
OZZ_INLINE SimdInt4 Max0(_SimdInt4 _v) {
  SimdInt4 out;
  out.x = (_v.x <= 0) ? 0 : _v.x;
  out.y = (_v.y <= 0) ? 0 : _v.y;
  out.z = (_v.z <= 0) ? 0 : _v.z;
  out.w = (_v.w <= 0) ? 0 : _v.w;
  return out;
}

// Per-lane clamp of _v: computes max(_a, min(_v, _b)), so _a acts as the lower
// bound and _b as the upper bound.
OZZ_INLINE SimdInt4 Clamp(_SimdInt4 _a, _SimdInt4 _v, _SimdInt4 _b) {
  // First cap each lane by the upper bound.
  const int capped_x = (_b.x <= _v.x) ? _b.x : _v.x;
  const int capped_y = (_b.y <= _v.y) ? _b.y : _v.y;
  const int capped_z = (_b.z <= _v.z) ? _b.z : _v.z;
  const int capped_w = (_b.w <= _v.w) ? _b.w : _v.w;
  // Then raise each lane to the lower bound.
  SimdInt4 out;
  out.x = (capped_x >= _a.x) ? capped_x : _a.x;
  out.y = (capped_y >= _a.y) ? capped_y : _a.y;
  out.z = (capped_z >= _a.z) ? capped_z : _a.z;
  out.w = (capped_w >= _a.w) ? capped_w : _a.w;
  return out;
}

// Bitwise select: for every bit, takes the bit of _true where the matching bit
// of _b is set, and the bit of _false where it is cleared.
OZZ_INLINE SimdInt4 Select(_SimdInt4 _b, _SimdInt4 _true, _SimdInt4 _false) {
  SimdInt4 out;
  out.x = (_b.x & _true.x) | (~_b.x & _false.x);
  out.y = (_b.y & _true.y) | (~_b.y & _false.y);
  out.z = (_b.z & _true.z) | (~_b.z & _false.z);
  out.w = (_b.w & _true.w) | (~_b.w & _false.w);
  return out;
}

// Returns the per-lane bitwise and of _a and _b.
OZZ_INLINE SimdInt4 And(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = _a.x & _b.x;
  out.y = _a.y & _b.y;
  out.z = _a.z & _b.z;
  out.w = _a.w & _b.w;
  return out;
}

// Returns the per-lane bitwise and of _a with the complement of _b (_a & ~_b).
OZZ_INLINE SimdInt4 AndNot(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = _a.x & ~_b.x;
  out.y = _a.y & ~_b.y;
  out.z = _a.z & ~_b.z;
  out.w = _a.w & ~_b.w;
  return out;
}

// Returns the per-lane bitwise or of _a and _b.
OZZ_INLINE SimdInt4 Or(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = _a.x | _b.x;
  out.y = _a.y | _b.y;
  out.z = _a.z | _b.z;
  out.w = _a.w | _b.w;
  return out;
}

// Returns the per-lane bitwise xor of _a and _b.
OZZ_INLINE SimdInt4 Xor(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = _a.x ^ _b.x;
  out.y = _a.y ^ _b.y;
  out.z = _a.z ^ _b.z;
  out.w = _a.w ^ _b.w;
  return out;
}

// Returns the per-lane bitwise complement of _v.
OZZ_INLINE SimdInt4 Not(_SimdInt4 _v) {
  SimdInt4 out;
  out.x = ~_v.x;
  out.y = ~_v.y;
  out.z = ~_v.z;
  out.w = ~_v.w;
  return out;
}

// Shifts each lane of _v left by _bits.
OZZ_INLINE SimdInt4 ShiftL(_SimdInt4 _v, int _bits) {
  SimdInt4 out;
  out.x = _v.x << _bits;
  out.y = _v.y << _bits;
  out.z = _v.z << _bits;
  out.w = _v.w << _bits;
  return out;
}

// Shifts each (signed) lane of _v right by _bits using the built-in >>.
OZZ_INLINE SimdInt4 ShiftR(_SimdInt4 _v, int _bits) {
  SimdInt4 out;
  out.x = _v.x >> _bits;
  out.y = _v.y >> _bits;
  out.z = _v.z >> _bits;
  out.w = _v.w >> _bits;
  return out;
}

// Shifts each lane of _v right by _bits, treating the lane as unsigned. The
// int <-> unsigned int reinterpretation goes through a union.
OZZ_INLINE SimdInt4 ShiftRu(_SimdInt4 _v, int _bits) {
  union Lanes {
    int i[4];
    unsigned int u[4];
  };
  Lanes lanes = {{_v.x, _v.y, _v.z, _v.w}};
  for (int n = 0; n < 4; ++n) {
    lanes.u[n] = lanes.u[n] >> _bits;
  }
  SimdInt4 out;
  out.x = lanes.i[0];
  out.y = lanes.i[1];
  out.z = lanes.i[2];
  out.w = lanes.i[3];
  return out;
}

// Per-lane _a == _b: a lane is -1 where the test holds and 0 otherwise.
OZZ_INLINE SimdInt4 CmpEq(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = (_a.x == _b.x) ? -1 : 0;
  out.y = (_a.y == _b.y) ? -1 : 0;
  out.z = (_a.z == _b.z) ? -1 : 0;
  out.w = (_a.w == _b.w) ? -1 : 0;
  return out;
}

// Per-lane _a != _b: a lane is -1 where the test holds and 0 otherwise.
OZZ_INLINE SimdInt4 CmpNe(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = (_a.x != _b.x) ? -1 : 0;
  out.y = (_a.y != _b.y) ? -1 : 0;
  out.z = (_a.z != _b.z) ? -1 : 0;
  out.w = (_a.w != _b.w) ? -1 : 0;
  return out;
}

// Per-lane _a < _b: a lane is -1 where the test holds and 0 otherwise.
OZZ_INLINE SimdInt4 CmpLt(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = (_a.x < _b.x) ? -1 : 0;
  out.y = (_a.y < _b.y) ? -1 : 0;
  out.z = (_a.z < _b.z) ? -1 : 0;
  out.w = (_a.w < _b.w) ? -1 : 0;
  return out;
}

// Per-lane _a <= _b: a lane is -1 where the test holds and 0 otherwise.
OZZ_INLINE SimdInt4 CmpLe(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = (_a.x <= _b.x) ? -1 : 0;
  out.y = (_a.y <= _b.y) ? -1 : 0;
  out.z = (_a.z <= _b.z) ? -1 : 0;
  out.w = (_a.w <= _b.w) ? -1 : 0;
  return out;
}

// Per-lane _a > _b: a lane is -1 where the test holds and 0 otherwise.
OZZ_INLINE SimdInt4 CmpGt(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = (_a.x > _b.x) ? -1 : 0;
  out.y = (_a.y > _b.y) ? -1 : 0;
  out.z = (_a.z > _b.z) ? -1 : 0;
  out.w = (_a.w > _b.w) ? -1 : 0;
  return out;
}

// Per-lane _a >= _b: a lane is -1 where the test holds and 0 otherwise.
OZZ_INLINE SimdInt4 CmpGe(_SimdInt4 _a, _SimdInt4 _b) {
  SimdInt4 out;
  out.x = (_a.x >= _b.x) ? -1 : 0;
  out.y = (_a.y >= _b.y) ? -1 : 0;
  out.z = (_a.z >= _b.z) ? -1 : 0;
  out.w = (_a.w >= _b.w) ? -1 : 0;
  return out;
}

// Returns the 4x4 identity matrix (columns are the x, y, z and w unit axes).
OZZ_INLINE Float4x4 Float4x4::identity() {
  const SimdFloat4 x_col = {1.f, 0.f, 0.f, 0.f};
  const SimdFloat4 y_col = {0.f, 1.f, 0.f, 0.f};
  const SimdFloat4 z_col = {0.f, 0.f, 1.f, 0.f};
  const SimdFloat4 w_col = {0.f, 0.f, 0.f, 1.f};
  const Float4x4 m = {{x_col, y_col, z_col, w_col}};
  return m;
}

// Returns the transpose of _m: output column i gathers component i of every
// input column.
OZZ_INLINE Float4x4 Transpose(const Float4x4& _m) {
  const SimdFloat4& c0 = _m.cols[0];
  const SimdFloat4& c1 = _m.cols[1];
  const SimdFloat4& c2 = _m.cols[2];
  const SimdFloat4& c3 = _m.cols[3];
  const SimdFloat4 xs = {c0.x, c1.x, c2.x, c3.x};
  const SimdFloat4 ys = {c0.y, c1.y, c2.y, c3.y};
  const SimdFloat4 zs = {c0.z, c1.z, c2.z, c3.z};
  const SimdFloat4 ws = {c0.w, c1.w, c2.w, c3.w};
  const Float4x4 transposed = {{xs, ys, zs, ws}};
  return transposed;
}

// Returns the inverse of _m, computed from 2x2 sub-determinants (a00..a18),
// cofactors (b??) and the determinant.
// _invertible is optional. When non-null, its x lane receives -1 if the
// determinant is non-zero and 0 otherwise (y, z, w are set to 0). When null,
// a zero determinant triggers an assertion.
// If the determinant is zero, every cofactor is multiplied by 0 instead of
// 1 / det.
OZZ_INLINE Float4x4 Invert(const Float4x4& _m, SimdInt4* _invertible) {
  const SimdFloat4* cols = _m.cols;
  // 2x2 sub-determinants built from pairs of columns.
  const float a00 = cols[2].z * cols[3].w - cols[3].z * cols[2].w;
  const float a01 = cols[2].y * cols[3].w - cols[3].y * cols[2].w;
  const float a02 = cols[2].y * cols[3].z - cols[3].y * cols[2].z;
  const float a03 = cols[2].x * cols[3].w - cols[3].x * cols[2].w;
  const float a04 = cols[2].x * cols[3].z - cols[3].x * cols[2].z;
  const float a05 = cols[2].x * cols[3].y - cols[3].x * cols[2].y;
  const float a06 = cols[1].z * cols[3].w - cols[3].z * cols[1].w;
  const float a07 = cols[1].y * cols[3].w - cols[3].y * cols[1].w;
  const float a08 = cols[1].y * cols[3].z - cols[3].y * cols[1].z;
  const float a09 = cols[1].x * cols[3].w - cols[3].x * cols[1].w;
  const float a10 = cols[1].x * cols[3].z - cols[3].x * cols[1].z;
  // Note: a11 is the same expression as a07.
  const float a11 = cols[1].y * cols[3].w - cols[3].y * cols[1].w;
  const float a12 = cols[1].x * cols[3].y - cols[3].x * cols[1].y;
  const float a13 = cols[1].z * cols[2].w - cols[2].z * cols[1].w;
  const float a14 = cols[1].y * cols[2].w - cols[2].y * cols[1].w;
  const float a15 = cols[1].y * cols[2].z - cols[2].y * cols[1].z;
  const float a16 = cols[1].x * cols[2].w - cols[2].x * cols[1].w;
  const float a17 = cols[1].x * cols[2].z - cols[2].x * cols[1].z;
  const float a18 = cols[1].x * cols[2].y - cols[2].x * cols[1].y;

  // Cofactors. bNx ends up in component x of output column N (see ret below).
  const float b0x = cols[1].y * a00 - cols[1].z * a01 + cols[1].w * a02;
  const float b1x = -cols[1].x * a00 + cols[1].z * a03 - cols[1].w * a04;
  const float b2x = cols[1].x * a01 - cols[1].y * a03 + cols[1].w * a05;
  const float b3x = -cols[1].x * a02 + cols[1].y * a04 - cols[1].z * a05;

  const float b0y = -cols[0].y * a00 + cols[0].z * a01 - cols[0].w * a02;
  const float b1y = cols[0].x * a00 - cols[0].z * a03 + cols[0].w * a04;
  const float b2y = -cols[0].x * a01 + cols[0].y * a03 - cols[0].w * a05;
  const float b3y = cols[0].x * a02 - cols[0].y * a04 + cols[0].z * a05;

  const float b0z = cols[0].y * a06 - cols[0].z * a07 + cols[0].w * a08;
  const float b1z = -cols[0].x * a06 + cols[0].z * a09 - cols[0].w * a10;
  const float b2z = cols[0].x * a11 - cols[0].y * a09 + cols[0].w * a12;
  const float b3z = -cols[0].x * a08 + cols[0].y * a10 - cols[0].z * a12;

  const float b0w = -cols[0].y * a13 + cols[0].z * a14 - cols[0].w * a15;
  const float b1w = cols[0].x * a13 - cols[0].z * a16 + cols[0].w * a17;
  const float b2w = -cols[0].x * a14 + cols[0].y * a16 - cols[0].w * a18;
  const float b3w = cols[0].x * a15 - cols[0].y * a17 + cols[0].z * a18;

  // Determinant: first column of _m dotted with the matching cofactors.
  const float det =
      cols[0].x * b0x + cols[0].y * b1x + cols[0].z * b2x + cols[0].w * b3x;
  const bool invertible = det != 0.f;
  // Only asserts when the caller did not ask for the invertibility flag.
  assert((_invertible || invertible) && "Matrix is not invertible");
  if (_invertible != nullptr) {
    *_invertible = simd_int4::LoadX(invertible);
  }
  // Avoids the division by zero for a non-invertible matrix.
  const float inv_det = invertible ? 1.f / det : 0.f;

  const Float4x4 ret = {
      {{b0x * inv_det, b0y * inv_det, b0z * inv_det, b0w * inv_det},
       {b1x * inv_det, b1y * inv_det, b1z * inv_det, b1w * inv_det},
       {b2x * inv_det, b2y * inv_det, b2z * inv_det, b2w * inv_det},
       {b3x * inv_det, b3y * inv_det, b3z * inv_det, b3w * inv_det}}};
  return ret;
}

Float4x4 Float4x4::Scaling(_SimdFloat4 _v) {
  const Float4x4 ret = {{{_v.x, 0.f, 0.f, 0.f},
                         {0.f, _v.y, 0.f, 0.f},
                         {0.f, 0.f, _v.z, 0.f},
                         {0.f, 0.f, 0.f, 1.f}}};
  return ret;
}

Float4x4 Float4x4::Translation(_SimdFloat4 _v) {
  const Float4x4 ret = {{{1.f, 0.f, 0.f, 0.f},
                         {0.f, 1.f, 0.f, 0.f},
                         {0.f, 0.f, 1.f, 0.f},
                         {_v.x, _v.y, _v.z, 1.f}}};
  return ret;
}

// Returns _m with its fourth column replaced by
// cols[0] * _v.x + cols[1] * _v.y + cols[2] * _v.z + cols[3]; the first three
// columns are copied unchanged. The w lane of _v is not used.
OZZ_INLINE Float4x4 Translate(const Float4x4& _m, _SimdFloat4 _v) {
  const SimdFloat4& c0 = _m.cols[0];
  const SimdFloat4& c1 = _m.cols[1];
  const SimdFloat4& c2 = _m.cols[2];
  const SimdFloat4& c3 = _m.cols[3];

  SimdFloat4 moved;
  moved.x = c0.x * _v.x + c1.x * _v.y + c2.x * _v.z + c3.x;
  moved.y = c0.y * _v.x + c1.y * _v.y + c2.y * _v.z + c3.y;
  moved.z = c0.z * _v.x + c1.z * _v.y + c2.z * _v.z + c3.z;
  moved.w = c0.w * _v.x + c1.w * _v.y + c2.w * _v.z + c3.w;

  const Float4x4 out = {{c0, c1, c2, moved}};
  return out;
}

// Returns _m with column 0 multiplied by _v.x, column 1 by _v.y and column 2
// by _v.z. The fourth column is copied unchanged; the w lane of _v is not
// used.
OZZ_INLINE Float4x4 Scale(const Float4x4& _m, _SimdFloat4 _v) {
  const SimdFloat4& c0 = _m.cols[0];
  const SimdFloat4& c1 = _m.cols[1];
  const SimdFloat4& c2 = _m.cols[2];

  const SimdFloat4 sx = {c0.x * _v.x, c0.y * _v.x, c0.z * _v.x, c0.w * _v.x};
  const SimdFloat4 sy = {c1.x * _v.y, c1.y * _v.y, c1.z * _v.y, c1.w * _v.y};
  const SimdFloat4 sz = {c2.x * _v.z, c2.y * _v.z, c2.z * _v.z, c2.w * _v.z};

  const Float4x4 out = {{sx, sy, sz, _m.cols[3]}};
  return out;
}

// Multiplies every column of _m component-wise by _v.
OZZ_INLINE Float4x4 ColumnMultiply(const Float4x4& _m, _SimdFloat4 _v) {
  Float4x4 out;
  for (int c = 0; c < 4; ++c) {
    const SimdFloat4& col = _m.cols[c];
    out.cols[c].x = col.x * _v.x;
    out.cols[c].y = col.y * _v.y;
    out.cols[c].z = col.z * _v.z;
    out.cols[c].w = col.w * _v.w;
  }
  return out;
}

// Runs IsNormalized3 on the first three columns of _m: lanes x, y and z of the
// result hold the x lane returned for columns 0, 1 and 2. The w lane is 0.
OZZ_INLINE SimdInt4 IsNormalized(const Float4x4& _m) {
  SimdInt4 flags;
  flags.x = IsNormalized3(_m.cols[0]).x;
  flags.y = IsNormalized3(_m.cols[1]).x;
  flags.z = IsNormalized3(_m.cols[2]).x;
  flags.w = 0;
  return flags;
}

// Runs IsNormalizedEst3 on the first three columns of _m: lanes x, y and z of
// the result hold the x lane returned for columns 0, 1 and 2. The w lane is 0.
OZZ_INLINE SimdInt4 IsNormalizedEst(const Float4x4& _m) {
  SimdInt4 flags;
  flags.x = IsNormalizedEst3(_m.cols[0]).x;
  flags.y = IsNormalizedEst3(_m.cols[1]).x;
  flags.z = IsNormalizedEst3(_m.cols[2]).x;
  flags.w = 0;
  return flags;
}

// Tests whether the normalized cross product of columns 0 and 1 matches the
// normalized column 2. The x lane of the result is -1 when their dot product
// is within kNormalizationToleranceSq of 1, and 0 otherwise; y, z, w are 0.
OZZ_INLINE SimdInt4 IsOrthogonal(const Float4x4& _m) {
  // A failed normalization falls back to simd_float4::zero(), which makes the
  // dot product below fail the test: _m is then reported as not orthogonal.
  const SimdFloat4 raw_cross = Cross3(_m.cols[0], _m.cols[1]);
  const SimdFloat4 expected = NormalizeSafe3(raw_cross, simd_float4::zero());
  const SimdFloat4 actual = NormalizeSafe3(_m.cols[2], simd_float4::zero());

  const float dot =
      expected.x * actual.x + expected.y * actual.y + expected.z * actual.z;
  const bool aligned = std::abs(dot - 1.f) < kNormalizationToleranceSq;

  SimdInt4 out;
  out.x = aligned ? -1 : 0;
  out.y = out.z = out.w = 0;
  return out;
}

// Converts the rotation stored in the first three columns of _m to a
// quaternion (x, y, z, w). Asserts that _m is normalized and orthogonal, and
// that the resulting quaternion is normalized.
// Cf From Quaternion to Matrix and Back, J.M.P. van Waveren 2005.
OZZ_INLINE SimdFloat4 ToQuaternion(const Float4x4& _m) {
  assert(AreAllTrue3(IsNormalized(_m)));
  assert(AreAllTrue1(IsOrthogonal(_m)));

  // Local aliases, named after <column><component>.
  const float c0x = _m.cols[0].x;
  const float c0y = _m.cols[0].y;
  const float c0z = _m.cols[0].z;
  const float c1x = _m.cols[1].x;
  const float c1y = _m.cols[1].y;
  const float c1z = _m.cols[1].z;
  const float c2x = _m.cols[2].x;
  const float c2y = _m.cols[2].y;
  const float c2z = _m.cols[2].z;

  SimdFloat4 quat;
  if (c0x + c1y + c2z > .0f) {
    // Positive trace.
    const float t = c0x + c1y + c2z + 1.0f;
    const float s = (1.f / std::sqrt(t)) * .5f;
    quat.x = (c1z - c2y) * s;
    quat.y = (c2x - c0z) * s;
    quat.z = (c0y - c1x) * s;
    quat.w = s * t;
  } else if (c0x > c1y && c0x > c2z) {
    // c0x is the largest diagonal term.
    const float t = c0x - c1y - c2z + 1.0f;
    const float s = (1.f / std::sqrt(t)) * .5f;
    quat.x = s * t;
    quat.y = (c0y + c1x) * s;
    quat.z = (c2x + c0z) * s;
    quat.w = (c1z - c2y) * s;
  } else if (c1y > c2z) {
    // c1y is the largest diagonal term.
    const float t = -c0x + c1y - c2z + 1.0f;
    const float s = (1.f / std::sqrt(t)) * .5f;
    quat.x = (c0y + c1x) * s;
    quat.y = s * t;
    quat.z = (c1z + c2y) * s;
    quat.w = (c2x - c0z) * s;
  } else {
    // Remaining case, built around c2z.
    const float t = -c0x - c1y + c2z + 1.0f;
    const float s = (1.f / std::sqrt(t)) * .5f;
    quat.x = (c2x + c0z) * s;
    quat.y = (c1z + c2y) * s;
    quat.z = s * t;
    quat.w = (c0y - c1x) * s;
  }
  assert(AreAllTrue1(IsNormalizedEst4(quat)));
  return quat;
}

// Decomposes _m into a translation, a rotation quaternion and a scale.
// _translation receives {cols[3].x, cols[3].y, cols[3].z, 1} and is written
// even when the function fails.
// _scale receives the signed lengths of the first three columns, with w = 1.
// _quaternion receives ToQuaternion() of the orthonormal basis rebuilt from
// the first three columns.
// Returns false, without writing _quaternion nor _scale, when the x column and
// at least one other column, or the z and y columns, have a squared length
// below kOrthogonalisationToleranceSq. Returns true otherwise.
// None of the output pointers is checked against nullptr.
OZZ_INLINE bool ToAffine(const Float4x4& _m, SimdFloat4* _translation,
                         SimdFloat4* _quaternion, SimdFloat4* _scale) {
  _translation->x = _m.cols[3].x;
  _translation->y = _m.cols[3].y;
  _translation->z = _m.cols[3].z;
  _translation->w = 1.f;

  // Extracts scale.
  const float sq_scale_x = Length3Sqr(_m.cols[0]).x;
  const float scale_x = std::sqrt(sq_scale_x);
  const float sq_scale_y = Length3Sqr(_m.cols[1]).x;
  const float scale_y = std::sqrt(sq_scale_y);
  const float sq_scale_z = Length3Sqr(_m.cols[2]).x;
  const float scale_z = std::sqrt(sq_scale_z);

  // Builds an orthonormal matrix in order to support quaternion extraction.
  const bool x_zero = std::abs(sq_scale_x) < kOrthogonalisationToleranceSq;
  const bool y_zero = std::abs(sq_scale_y) < kOrthogonalisationToleranceSq;
  const bool z_zero = std::abs(sq_scale_z) < kOrthogonalisationToleranceSq;

  Float4x4 orthonormal;
  if (x_zero) {
    // x axis is degenerate: rebuilds it from y and z, which must both be valid.
    if (y_zero || z_zero) {
      return false;
    }
    orthonormal.cols[1].x = _m.cols[1].x / scale_y;
    orthonormal.cols[1].y = _m.cols[1].y / scale_y;
    orthonormal.cols[1].z = _m.cols[1].z / scale_y;
    orthonormal.cols[1].w = 0.f;
    orthonormal.cols[0] = Normalize3(Cross3(orthonormal.cols[1], _m.cols[2]));
    orthonormal.cols[2] =
        Normalize3(Cross3(orthonormal.cols[0], orthonormal.cols[1]));
  } else if (z_zero) {
    // z axis is degenerate: rebuilds it from x and y. x_zero is already known
    // to be false here, so only y_zero can trigger the early return.
    if (x_zero || y_zero) {
      return false;
    }
    orthonormal.cols[0].x = _m.cols[0].x / scale_x;
    orthonormal.cols[0].y = _m.cols[0].y / scale_x;
    orthonormal.cols[0].z = _m.cols[0].z / scale_x;
    orthonormal.cols[0].w = 0.f;
    orthonormal.cols[2] = Normalize3(Cross3(orthonormal.cols[0], _m.cols[1]));
    orthonormal.cols[1] =
        Normalize3(Cross3(orthonormal.cols[2], orthonormal.cols[0]));
  } else {  // Favor z axis in the default case
    // NOTE(review): x_zero and z_zero are both false in this branch, so this
    // test never triggers. y is rebuilt from z and x below, hence a degenerate
    // y column is accepted here.
    if (x_zero || z_zero) {
      return false;
    }
    orthonormal.cols[2].x = _m.cols[2].x / scale_z;
    orthonormal.cols[2].y = _m.cols[2].y / scale_z;
    orthonormal.cols[2].z = _m.cols[2].z / scale_z;
    orthonormal.cols[2].w = 0.f;
    orthonormal.cols[1] = Normalize3(Cross3(orthonormal.cols[2], _m.cols[0]));
    orthonormal.cols[0] =
        Normalize3(Cross3(orthonormal.cols[1], orthonormal.cols[2]));
  }

  // orthonormal.cols[3] = simd_float4::w_axis();  Not used by ToQuaternion.

  // Gets back scale signs in case of reflections: a scale is negated when the
  // rebuilt axis does not have a positive dot product with the original
  // column.
  _scale->x =
      Dot3(orthonormal.cols[0], _m.cols[0]).x > 0.f ? scale_x : -scale_x;
  _scale->y =
      Dot3(orthonormal.cols[1], _m.cols[1]).x > 0.f ? scale_y : -scale_y;
  _scale->z =
      Dot3(orthonormal.cols[2], _m.cols[2]).x > 0.f ? scale_z : -scale_z;
  _scale->w = 1.f;

  // Extracts quaternion.
  *_quaternion = ToQuaternion(orthonormal);
  return true;
}

// Builds a rotation matrix from the three angles stored in the x, y and z
// lanes of _v (passed to std::cos / std::sin). The w lane is not used.
OZZ_INLINE Float4x4 Float4x4::FromEuler(_SimdFloat4 _v) {
  // Cosine and sine of each angle.
  const float cos_x = std::cos(_v.x);
  const float sin_x = std::sin(_v.x);
  const float cos_y = std::cos(_v.y);
  const float sin_y = std::sin(_v.y);
  const float cos_z = std::cos(_v.z);
  const float sin_z = std::sin(_v.z);

  // Products shared by several terms.
  const float sin_y_cos_z = sin_y * cos_z;
  const float sin_y_sin_z = sin_y * sin_z;

  const SimdFloat4 col0 = {cos_x * cos_y,
                           sin_x * sin_z - cos_x * sin_y_cos_z,
                           cos_x * sin_y_sin_z + sin_x * cos_z, 0.f};
  const SimdFloat4 col1 = {sin_y, cos_y * cos_z, -cos_y * sin_z, 0.f};
  const SimdFloat4 col2 = {-sin_x * cos_y,
                           sin_x * sin_y_cos_z + cos_x * sin_z,
                           -sin_x * sin_y_sin_z + cos_x * cos_z, 0.f};
  const SimdFloat4 col3 = {0.f, 0.f, 0.f, 1.f};

  const Float4x4 rotation = {{col0, col1, col2, col3}};
  return rotation;
}

// Builds the matrix of a rotation of _angle.x around _axis (x, y, z lanes).
// Asserts that _axis is normalized (IsNormalizedEst3).
OZZ_INLINE Float4x4 Float4x4::FromAxisAngle(_SimdFloat4 _axis,
                                            _SimdFloat4 _angle) {
  assert(AreAllTrue1(IsNormalizedEst3(_axis)));

  const float cos_a = std::cos(_angle.x);
  const float sin_a = std::sin(_angle.x);
  const float one_minus_cos = 1.f - cos_a;

  // Off-diagonal building blocks.
  const float xy_t = _axis.x * _axis.y * one_minus_cos;
  const float z_s = _axis.z * sin_a;
  const float xz_t = _axis.x * _axis.z * one_minus_cos;
  const float y_s = _axis.y * sin_a;
  const float yz_t = _axis.y * _axis.z * one_minus_cos;
  const float x_s = _axis.x * sin_a;

  const SimdFloat4 col0 = {cos_a + _axis.x * _axis.x * one_minus_cos,
                           xy_t + z_s, xz_t - y_s, 0.f};
  const SimdFloat4 col1 = {xy_t - z_s,
                           cos_a + _axis.y * _axis.y * one_minus_cos,
                           yz_t + x_s, 0.f};
  const SimdFloat4 col2 = {xz_t + y_s, yz_t - x_s,
                           cos_a + _axis.z * _axis.z * one_minus_cos, 0.f};
  const SimdFloat4 col3 = {0.f, 0.f, 0.f, 1.f};

  const Float4x4 rotation = {{col0, col1, col2, col3}};
  return rotation;
}

// Builds the rotation matrix of the quaternion _v (x, y, z, w lanes).
// Asserts that _v is normalized (IsNormalizedEst4).
OZZ_INLINE Float4x4 Float4x4::FromQuaternion(_SimdFloat4 _v) {
  assert(AreAllTrue1(IsNormalizedEst4(_v)));

  // Pairwise products of the quaternion components.
  const float x_x = _v.x * _v.x;
  const float x_y = _v.x * _v.y;
  const float x_z = _v.x * _v.z;
  const float x_w = _v.x * _v.w;
  const float y_y = _v.y * _v.y;
  const float y_z = _v.y * _v.z;
  const float y_w = _v.y * _v.w;
  const float z_z = _v.z * _v.z;
  const float z_w = _v.z * _v.w;

  const SimdFloat4 col0 = {1.f - 2.f * (y_y + z_z), 2.f * (x_y + z_w),
                           2.f * (x_z - y_w), 0.f};
  const SimdFloat4 col1 = {2.f * (x_y - z_w), 1.f - 2.f * (x_x + z_z),
                           2.f * (y_z + x_w), 0.f};
  const SimdFloat4 col2 = {2.f * (x_z + y_w), 2.f * (y_z - x_w),
                           1.f - 2.f * (x_x + y_y), 0.f};
  const SimdFloat4 col3 = {0.f, 0.f, 0.f, 1.f};

  const Float4x4 rotation = {{col0, col1, col2, col3}};
  return rotation;
}

// Builds an affine matrix from a translation, a rotation quaternion and a
// scale: rotation columns 0, 1 and 2 are multiplied by _scale.x, _scale.y and
// _scale.z, and the fourth column is {_translation.x, .y, .z, 1}.
// Asserts that _quaternion is normalized (IsNormalizedEst4).
OZZ_INLINE Float4x4 Float4x4::FromAffine(_SimdFloat4 _translation,
                                         _SimdFloat4 _quaternion,
                                         _SimdFloat4 _scale) {
  assert(AreAllTrue1(IsNormalizedEst4(_quaternion)));

  // Pairwise products of the quaternion components.
  const float x_x = _quaternion.x * _quaternion.x;
  const float x_y = _quaternion.x * _quaternion.y;
  const float x_z = _quaternion.x * _quaternion.z;
  const float x_w = _quaternion.x * _quaternion.w;
  const float y_y = _quaternion.y * _quaternion.y;
  const float y_z = _quaternion.y * _quaternion.z;
  const float y_w = _quaternion.y * _quaternion.w;
  const float z_z = _quaternion.z * _quaternion.z;
  const float z_w = _quaternion.z * _quaternion.w;

  const SimdFloat4 col0 = {_scale.x * (1.f - 2.f * (y_y + z_z)),
                           _scale.x * 2.f * (x_y + z_w),
                           _scale.x * 2.f * (x_z - y_w), 0.f};
  const SimdFloat4 col1 = {_scale.y * 2.f * (x_y - z_w),
                           _scale.y * (1.f - 2.f * (x_x + z_z)),
                           _scale.y * (2.f * (y_z + x_w)), 0.f};
  const SimdFloat4 col2 = {_scale.z * 2.f * (x_z + y_w),
                           _scale.z * 2.f * (y_z - x_w),
                           _scale.z * (1.f - 2.f * (x_x + y_y)), 0.f};
  const SimdFloat4 col3 = {_translation.x, _translation.y, _translation.z,
                           1.f};

  const Float4x4 affine = {{col0, col1, col2, col3}};
  return affine;
}

// Transforms _v by _m as a point: computes
// cols[0] * _v.x + cols[1] * _v.y + cols[2] * _v.z + cols[3]. The w lane of
// _v is not used.
OZZ_INLINE ozz::math::SimdFloat4 TransformPoint(const ozz::math::Float4x4& _m,
                                                ozz::math::_SimdFloat4 _v) {
  const ozz::math::SimdFloat4& c0 = _m.cols[0];
  const ozz::math::SimdFloat4& c1 = _m.cols[1];
  const ozz::math::SimdFloat4& c2 = _m.cols[2];
  const ozz::math::SimdFloat4& c3 = _m.cols[3];

  ozz::math::SimdFloat4 out;
  out.x = c0.x * _v.x + c1.x * _v.y + c2.x * _v.z + c3.x;
  out.y = c0.y * _v.x + c1.y * _v.y + c2.y * _v.z + c3.y;
  out.z = c0.z * _v.x + c1.z * _v.y + c2.z * _v.z + c3.z;
  out.w = c0.w * _v.x + c1.w * _v.y + c2.w * _v.z + c3.w;
  return out;
}

// Transforms _v by _m as a vector: computes
// cols[0] * _v.x + cols[1] * _v.y + cols[2] * _v.z, without adding the fourth
// column. The w lane of _v is not used.
OZZ_INLINE ozz::math::SimdFloat4 TransformVector(const ozz::math::Float4x4& _m,
                                                 ozz::math::_SimdFloat4 _v) {
  const ozz::math::SimdFloat4& c0 = _m.cols[0];
  const ozz::math::SimdFloat4& c1 = _m.cols[1];
  const ozz::math::SimdFloat4& c2 = _m.cols[2];

  ozz::math::SimdFloat4 out;
  out.x = c0.x * _v.x + c1.x * _v.y + c2.x * _v.z;
  out.y = c0.y * _v.x + c1.y * _v.y + c2.y * _v.z;
  out.z = c0.z * _v.x + c1.z * _v.y + c2.z * _v.z;
  out.w = c0.w * _v.x + c1.w * _v.y + c2.w * _v.z;
  return out;
}

// Matrix * vector product: computes
// cols[0] * _v.x + cols[1] * _v.y + cols[2] * _v.z + cols[3] * _v.w.
OZZ_INLINE ozz::math::SimdFloat4 operator*(const ozz::math::Float4x4& _m,
                                           ozz::math::_SimdFloat4 _v) {
  const ozz::math::SimdFloat4& c0 = _m.cols[0];
  const ozz::math::SimdFloat4& c1 = _m.cols[1];
  const ozz::math::SimdFloat4& c2 = _m.cols[2];
  const ozz::math::SimdFloat4& c3 = _m.cols[3];

  ozz::math::SimdFloat4 out;
  out.x = c0.x * _v.x + c1.x * _v.y + c2.x * _v.z + c3.x * _v.w;
  out.y = c0.y * _v.x + c1.y * _v.y + c2.y * _v.z + c3.y * _v.w;
  out.z = c0.z * _v.x + c1.z * _v.y + c2.z * _v.z + c3.z * _v.w;
  out.w = c0.w * _v.x + c1.w * _v.y + c2.w * _v.z + c3.w * _v.w;
  return out;
}

// Matrix * matrix product: each column of the result is _a multiplied by the
// matching column of _b.
OZZ_INLINE ozz::math::Float4x4 operator*(const ozz::math::Float4x4& _a,
                                         const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 product;
  for (int c = 0; c < 4; ++c) {
    product.cols[c] = _a * _b.cols[c];
  }
  return product;
}

// Component-wise sum of the matrices _a and _b.
OZZ_INLINE ozz::math::Float4x4 operator+(const ozz::math::Float4x4& _a,
                                         const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 sum;
  for (int c = 0; c < 4; ++c) {
    const ozz::math::SimdFloat4& lhs = _a.cols[c];
    const ozz::math::SimdFloat4& rhs = _b.cols[c];
    sum.cols[c].x = lhs.x + rhs.x;
    sum.cols[c].y = lhs.y + rhs.y;
    sum.cols[c].z = lhs.z + rhs.z;
    sum.cols[c].w = lhs.w + rhs.w;
  }
  return sum;
}

// Component-wise difference of the matrices _a and _b (_a - _b).
OZZ_INLINE ozz::math::Float4x4 operator-(const ozz::math::Float4x4& _a,
                                         const ozz::math::Float4x4& _b) {
  ozz::math::Float4x4 diff;
  for (int c = 0; c < 4; ++c) {
    const ozz::math::SimdFloat4& lhs = _a.cols[c];
    const ozz::math::SimdFloat4& rhs = _b.cols[c];
    diff.cols[c].x = lhs.x - rhs.x;
    diff.cols[c].y = lhs.y - rhs.y;
    diff.cols[c].z = lhs.z - rhs.z;
    diff.cols[c].w = lhs.w - rhs.w;
  }
  return diff;
}
}  // namespace math
}  // namespace ozz

// Per-lane sum of _a and _b.
OZZ_INLINE ozz::math::SimdFloat4 operator+(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  ozz::math::SimdFloat4 out;
  out.x = _a.x + _b.x;
  out.y = _a.y + _b.y;
  out.z = _a.z + _b.z;
  out.w = _a.w + _b.w;
  return out;
}

// Per-lane difference of _a and _b (_a - _b).
OZZ_INLINE ozz::math::SimdFloat4 operator-(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  ozz::math::SimdFloat4 out;
  out.x = _a.x - _b.x;
  out.y = _a.y - _b.y;
  out.z = _a.z - _b.z;
  out.w = _a.w - _b.w;
  return out;
}

// Per-lane negation of _v.
OZZ_INLINE ozz::math::SimdFloat4 operator-(ozz::math::_SimdFloat4 _v) {
  ozz::math::SimdFloat4 out;
  out.x = -_v.x;
  out.y = -_v.y;
  out.z = -_v.z;
  out.w = -_v.w;
  return out;
}

// Per-lane product of _a and _b.
OZZ_INLINE ozz::math::SimdFloat4 operator*(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  ozz::math::SimdFloat4 out;
  out.x = _a.x * _b.x;
  out.y = _a.y * _b.y;
  out.z = _a.z * _b.z;
  out.w = _a.w * _b.w;
  return out;
}

// Per-lane quotient of _a by _b (_a / _b).
OZZ_INLINE ozz::math::SimdFloat4 operator/(ozz::math::_SimdFloat4 _a,
                                           ozz::math::_SimdFloat4 _b) {
  ozz::math::SimdFloat4 out;
  out.x = _a.x / _b.x;
  out.y = _a.y / _b.y;
  out.z = _a.z / _b.z;
  out.w = _a.w / _b.w;
  return out;
}

namespace ozz {
namespace math {
// Half <-> Float implementation is based on:
// http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/.
// Converts a 32 bits float to its 16 bits half-float bit pattern.
// Inf maps to half Inf (0x7c00), any NaN maps to a quiet NaN (0x7e00), and
// values too large for a half are clamped to infinity. The sign bit of _f is
// carried over to bit 15 of the result in all cases.
OZZ_INLINE uint16_t FloatToHalf(float _f) {
  // Gives access to the raw bits of a float.
  union FloatBits {
    uint32_t u;
    float f;
  };

  const uint32_t sign_bit = 0x80000000u;
  const uint32_t mantissa_keep_mask = ~0x00000fffu;
  const uint32_t float_infinity = 255 << 23;
  const uint32_t half_infinity = 31 << 23;
  const FloatBits exponent_scale = {15 << 23};

  FloatBits input;
  input.f = _f;

  // Sign already moved to its final position in the 16 bits output.
  const uint32_t half_sign = (input.u & sign_bit) >> 16;
  const uint32_t magnitude = input.u & ~sign_bit;

  // All exponent bits set: the input is either Inf or NaN.
  if (magnitude >= float_infinity) {
    // NaN->qNaN and Inf->Inf
    const uint32_t special = (magnitude > float_infinity) ? 0x7e00 : 0x7c00;
    return static_cast<uint16_t>(special | half_sign);
  }

  // (De)normalized number or zero.
  // Drops the low 12 mantissa bits, then rescales the exponent with a float
  // multiplication.
  FloatBits scaled = {magnitude & mantissa_keep_mask};
  scaled.f *= exponent_scale.f;

  // Unsigned wrapping subtraction of 0xfffff000, which is equivalent to adding
  // 0x1000 to the bit pattern.
  uint32_t half_bits = scaled.u - mantissa_keep_mask;

  // Clamp to signed infinity if overflowed
  if (half_bits > half_infinity) {
    half_bits = half_infinity;
  }
  return static_cast<uint16_t>((half_bits >> 13) | half_sign);
}

// Converts a 16 bits half-float bit pattern to a 32 bits float.
// Half Inf/NaN inputs produce float Inf/NaN, and the sign bit (bit 15 of _h)
// is carried over to bit 31 of the result.
OZZ_INLINE float HalfToFloat(uint16_t _h) {
  // Gives access to the raw bits of a float.
  union FloatBits {
    uint32_t u;
    float f;
  };

  const FloatBits exponent_scale = {(254 - 15) << 23};
  const FloatBits infnan_threshold = {(127 + 16) << 23};

  // Exponent and mantissa bits, shifted to their position in a 32 bits float.
  FloatBits value = {static_cast<uint32_t>(_h & 0x7fff) << 13};

  // Exponent adjustment, done with a float multiplication.
  value.f *= exponent_scale.f;

  // Make sure Inf/NaN survive: force all exponent bits when the scaled value
  // reaches the threshold.
  if (value.f >= infnan_threshold.f) {
    value.u |= 255 << 23;
  }

  // Restores the sign bit.
  value.u |= static_cast<uint32_t>(_h & 0x8000) << 16;
  return value.f;
}

// Converts the x, y, z and w components of _f to half-floats using the scalar
// FloatToHalf. Each 16 bits result is stored in the matching integer component
// of the returned value.
OZZ_INLINE SimdInt4 FloatToHalf(_SimdFloat4 _f) {
  const uint16_t half_x = FloatToHalf(_f.x);
  const uint16_t half_y = FloatToHalf(_f.y);
  const uint16_t half_z = FloatToHalf(_f.z);
  const uint16_t half_w = FloatToHalf(_f.w);
  const SimdInt4 halves = {half_x, half_y, half_z, half_w};
  return halves;
}

// Converts the half-floats stored in the low 16 bits of the x, y, z and w
// components of _h to floats using the scalar HalfToFloat. The upper 16 bits
// of each component are masked out and ignored.
OZZ_INLINE SimdFloat4 HalfToFloat(_SimdInt4 _h) {
  const int low_16_bits = 0x0000ffff;
  const float float_x = HalfToFloat(static_cast<uint16_t>(_h.x & low_16_bits));
  const float float_y = HalfToFloat(static_cast<uint16_t>(_h.y & low_16_bits));
  const float float_z = HalfToFloat(static_cast<uint16_t>(_h.z & low_16_bits));
  const float float_w = HalfToFloat(static_cast<uint16_t>(_h.w & low_16_bits));
  const SimdFloat4 floats = {float_x, float_y, float_z, float_w};
  return floats;
}
}  // namespace math
}  // namespace ozz

#undef OZZ_RCP_EST
#undef OZZ_RSQRT_EST
#endif  // OZZ_OZZ_BASE_MATHS_INTERNAL_SIMD_MATH_REF_INL_H_