rbdlsim/3rdparty/vectorial/include/vectorial/simd4f_sse.h

237 lines
6.2 KiB
C

/*
Vectorial
Copyright (c) 2010 Mikko Lehtonen
Copyright (c) 2014 Google, Inc.
Licensed under the terms of the two-clause BSD License (see LICENSE)
*/
#ifndef VECTORIAL_SIMD4F_SSE_H
#define VECTORIAL_SIMD4F_SSE_H
// Select the SSE4.1 code path only when the compiler is actually allowed to
// emit SSE4.1 instructions; otherwise fall back to plain SSE.
//  - GCC and Clang (including clang-cl) define __SSE4_1__ when SSE4.1 is enabled.
//  - MSVC has no SSE4.1 macro. _M_IX86_FP >= 2 only means /arch:SSE2 or later,
//    so it must not be used as an SSE4.1 test. __AVX__ (set by /arch:AVX and
//    above) is used instead, because AVX-capable targets also support SSE4.1.
// A definition already supplied by the build system is left untouched.
#if !defined(VECTORIAL_USE_SSE4_1)
#if defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__))
#define VECTORIAL_USE_SSE4_1
#endif
#endif
#include <xmmintrin.h>
#if defined(VECTORIAL_USE_SSE4_1)
#include <smmintrin.h>
#endif
#include <string.h> // memcpy
#ifdef __cplusplus
extern "C" {
#endif
// Vector type used throughout this backend: an alias for the SSE __m128 type.
typedef __m128 simd4f;
// Overlay that exposes the lanes of a simd4f as individual floats or as
// unsigned integers. `s` must stay the first member: the lane getters in this
// file initialise the union with `{s}`, which sets the first member.
typedef union {
simd4f s ;
float f[4];
unsigned int ui[4];
} _simd4f_union;
// creating
// Builds a vector whose lanes, from lowest to highest, are x, y, z, w.
// Uses _mm_setr_ps instead of brace-initialising __m128, because the brace
// form depends on how each compiler happens to declare that type.
vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
    return _mm_setr_ps(x, y, z, w);
}
// Returns a vector with all four lanes set to zero.
// Declared with (void) so that C treats this as a real prototype; an empty
// parameter list in C leaves the arguments unchecked.
vectorial_inline simd4f simd4f_zero(void) {
    return _mm_setzero_ps();
}
// Loads four consecutive floats starting at `ary` using the unaligned load
// intrinsic. ary[0] becomes the x lane and ary[3] the w lane.
vectorial_inline simd4f simd4f_uload4(const float *ary) {
    return _mm_loadu_ps(ary);
}
// Reads ary[0..2] into the x, y, z lanes and sets the w lane to zero.
// Only three floats are read from `ary`.
vectorial_inline simd4f simd4f_uload3(const float *ary) {
    const float x = ary[0];
    const float y = ary[1];
    const float z = ary[2];
    return simd4f_create(x, y, z, 0.0f);
}
// Reads ary[0..1] into the x and y lanes and sets the z and w lanes to zero.
// Only two floats are read from `ary`.
vectorial_inline simd4f simd4f_uload2(const float *ary) {
    const float x = ary[0];
    const float y = ary[1];
    return simd4f_create(x, y, 0.0f, 0.0f);
}
// Writes all four lanes of `val` to ary[0..3] using the unaligned store
// intrinsic.
vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
    float *const dest = ary;
    _mm_storeu_ps(dest, val);
}
// Copies the first three floats' worth of bytes of `val` to ary[0..2].
// ary[3] is not written.
vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
    const size_t byte_count = 3 * sizeof(float);
    memcpy(ary, &val, byte_count);
}
// Copies the first two floats' worth of bytes of `val` to ary[0..1].
// ary[2] and ary[3] are not written.
vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
    const size_t byte_count = 2 * sizeof(float);
    memcpy(ary, &val, byte_count);
}
// utilities
// Returns a vector with the scalar `v` replicated into all four lanes.
vectorial_inline simd4f simd4f_splat(float v) {
    return _mm_set1_ps(v);
}
// Replicates lane 0 (x) of `v` into all four lanes.
vectorial_inline simd4f simd4f_splat_x(simd4f v) {
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
}
// Replicates lane 1 (y) of `v` into all four lanes.
vectorial_inline simd4f simd4f_splat_y(simd4f v) {
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
}
// Replicates lane 2 (z) of `v` into all four lanes.
vectorial_inline simd4f simd4f_splat_z(simd4f v) {
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
}
// Replicates lane 3 (w) of `v` into all four lanes.
vectorial_inline simd4f simd4f_splat_w(simd4f v) {
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
}
// arithmetic
// Lane-wise sum: result[i] = lhs[i] + rhs[i].
vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
    return _mm_add_ps(lhs, rhs);
}
// Lane-wise difference: result[i] = lhs[i] - rhs[i].
vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
    return _mm_sub_ps(lhs, rhs);
}
// Lane-wise product: result[i] = lhs[i] * rhs[i].
vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
    return _mm_mul_ps(lhs, rhs);
}
// Lane-wise quotient: result[i] = lhs[i] / rhs[i].
vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
    return _mm_div_ps(lhs, rhs);
}
// Multiply-add: returns m1 * m2 + a per lane, computed as a simd4f_mul
// followed by a simd4f_add.
vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
    const simd4f product = simd4f_mul(m1, m2);
    return simd4f_add(product, a);
}
// Per-lane reciprocal of `v`: starts from the _mm_rcp_ps estimate `e` and
// applies one refinement step, e * (2 - v * e).
// NOTE(review): the refinement multiplies v by the estimate, so lanes where
// the estimate is infinite (e.g. v == 0) presumably end up NaN - confirm
// whether callers care.
vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
    const simd4f estimate = _mm_rcp_ps(v);
    const simd4f two = _mm_set1_ps(2.0f);
    const simd4f correction = simd4f_sub(two, simd4f_mul(v, estimate));
    return simd4f_mul(estimate, correction);
}
// Per-lane square root of `v`.
vectorial_inline simd4f simd4f_sqrt(simd4f v) {
    return _mm_sqrt_ps(v);
}
// Per-lane reciprocal square root of `v`: starts from the _mm_rsqrt_ps
// estimate `e` and applies one refinement step, (e * 0.5) * (3 - e * (v * e)).
vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
    const simd4f estimate = _mm_rsqrt_ps(v);
    const simd4f half = _mm_set1_ps(0.5f);
    const simd4f three = _mm_set1_ps(3.0f);
    const simd4f half_estimate = simd4f_mul(estimate, half);
    const simd4f v_times_est_sq = simd4f_mul(estimate, simd4f_mul(v, estimate));
    return simd4f_mul(half_estimate, simd4f_sub(three, v_times_est_sq));
}
// Extracts lane 0 (x) of `s` as a scalar float.
vectorial_inline float simd4f_get_x(simd4f s) {
    _simd4f_union lanes;
    lanes.s = s;
    return lanes.f[0];
}
// Extracts lane 1 (y) of `s` as a scalar float.
vectorial_inline float simd4f_get_y(simd4f s) {
    _simd4f_union lanes;
    lanes.s = s;
    return lanes.f[1];
}
// Extracts lane 2 (z) of `s` as a scalar float.
vectorial_inline float simd4f_get_z(simd4f s) {
    _simd4f_union lanes;
    lanes.s = s;
    return lanes.f[2];
}
// Extracts lane 3 (w) of `s` as a scalar float.
vectorial_inline float simd4f_get_w(simd4f s) {
    _simd4f_union lanes;
    lanes.s = s;
    return lanes.f[3];
}
// Three-component dot product of `lhs` and `rhs`: only the x, y and z lanes
// contribute. The scalar result is replicated into every lane of the
// returned vector.
vectorial_inline simd4f simd4f_dot3(simd4f lhs,simd4f rhs) {
#if defined(VECTORIAL_USE_SSE4_1)
    // 0x7f: multiply lanes 0-2 (high bits 0x7) and write the sum to all four
    // lanes (low bits 0xf).
    return _mm_dp_ps(lhs, rhs, 0x7f);
#else
    // All-ones bit pattern in x, y, z and zero in w; AND-ing with it discards
    // the w product.
    simd4f_aligned16 const unsigned int xyz_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
    const simd4f xyz_only = _mm_load_ps((const float*)xyz_bits);
    const simd4f products = _mm_and_ps(_mm_mul_ps(lhs, rhs), xyz_only);
    // Fold the upper pair onto the lower pair: lane 0 = x + z, lane 1 = y + 0.
    const simd4f pair_sums = _mm_add_ps(products, _mm_movehl_ps(products, products));
    // Bring lane 1 down to lane 0 and add it there: lane 0 = (x + z) + y.
    const simd4f lane1_in_lane0 = _mm_shuffle_ps(pair_sums, pair_sums, _MM_SHUFFLE(0, 0, 0, 1));
    const simd4f total = _mm_add_ss(pair_sums, lane1_in_lane0);
    // Broadcast lane 0 to all lanes.
    return _mm_shuffle_ps(total, total, _MM_SHUFFLE(0, 0, 0, 0));
#endif
}
// Same as simd4f_dot3, but returns the result as a scalar float (the x lane
// of the vector that simd4f_dot3 produces).
vectorial_inline float simd4f_dot3_scalar(simd4f lhs,simd4f rhs) {
    const simd4f dot = simd4f_dot3(lhs, rhs);
    return simd4f_get_x(dot);
}
// Three-component cross product lhs x rhs in the x, y, z lanes, computed as
// lhs.yzx * rhs.zxy - lhs.zxy * rhs.yzx. The w lane is left in place by every
// shuffle, so it holds lhs.w * rhs.w - lhs.w * rhs.w.
vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
    // First term: (ly*rz, lz*rx, lx*ry, lw*rw).
    const simd4f l_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 0, 2, 1));
    const simd4f r_zxy = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 1, 0, 2));
    const simd4f first_term = _mm_mul_ps(l_yzx, r_zxy);

    // Second term: (lz*ry, lx*rz, ly*rx, lw*rw).
    const simd4f l_zxy = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3, 1, 0, 2));
    const simd4f r_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 0, 2, 1));
    const simd4f second_term = _mm_mul_ps(l_zxy, r_yzx);

    return _mm_sub_ps(first_term, second_term);
}
// Rotates the lanes of `s` so that (x, y, z, w) becomes (w, x, y, z).
vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
    const simd4f rotated = _mm_shuffle_ps(s, s, _MM_SHUFFLE(2, 1, 0, 3));
    return rotated;
}
// Swaps the low and high lane pairs of `s`: (x, y, z, w) becomes (z, w, x, y).
vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
    const simd4f swapped = _mm_shuffle_ps(s, s, _MM_SHUFFLE(1, 0, 3, 2));
    return swapped;
}
// Rotates the lanes of `s` so that (x, y, z, w) becomes (y, z, w, x).
vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
    const simd4f rotated = _mm_shuffle_ps(s, s, _MM_SHUFFLE(0, 3, 2, 1));
    return rotated;
}
// Returns `s` with the w lane replaced by zero: (x, y, z, 0).
vectorial_inline simd4f simd4f_zero_w(simd4f s) {
    const simd4f zeros = _mm_setzero_ps();
    // Interleave the high half of s with zeros: (z, 0, w, 0).
    const simd4f z_and_zero = _mm_unpackhi_ps(s, zeros);
    // Low half of s followed by the low half of the interleave: (x, y, z, 0).
    return _mm_movelh_ps(s, z_and_zero);
}
// Returns `s` with the z and w lanes replaced by zero: (x, y, 0, 0).
vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
    const simd4f zeros = _mm_setzero_ps();
    return _mm_movelh_ps(s, zeros);
}
// Combines the high halves of the two inputs: given (x, y, z, w) and
// (a, b, c, d), returns (z, w, c, d).
vectorial_inline simd4f simd4f_merge_high(simd4f xyzw, simd4f abcd) {
    const simd4f merged = _mm_movehl_ps(abcd, xyzw);
    return merged;
}
// Four 32-bit lanes viewable either as unsigned integers or as floats; used
// below to spell sign-bit masks as integers and load them as floats.
// `ui` must stay the first member: the masks are brace-initialised with
// integer bit patterns, which sets the first member.
// simd4f_aligned16 is defined outside this file; presumably it requests the
// 16-byte alignment that _mm_load_ps needs - TODO confirm.
typedef simd4f_aligned16 union {
unsigned int ui[4];
float f[4];
} _simd4f_uif;
// Negates the y and w lanes of `s` and leaves x and z unchanged, by XOR-ing
// the IEEE-754 sign bit (0x80000000) into lanes 1 and 3.
vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
    const _simd4f_uif sign_bits = { { 0x00000000, 0x80000000, 0x00000000, 0x80000000 } };
    const simd4f sign_mask = _mm_load_ps(sign_bits.f);
    return _mm_xor_ps(s, sign_mask);
}
// Negates the x and z lanes of `s` and leaves y and w unchanged, by XOR-ing
// the IEEE-754 sign bit (0x80000000) into lanes 0 and 2.
vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
    const _simd4f_uif sign_bits = { { 0x80000000, 0x00000000, 0x80000000, 0x00000000 } };
    const simd4f sign_mask = _mm_load_ps(sign_bits.f);
    return _mm_xor_ps(s, sign_mask);
}
// Lane-wise minimum of `a` and `b`, as computed by _mm_min_ps(a, b).
vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
    const simd4f smaller = _mm_min_ps(a, b);
    return smaller;
}
// Lane-wise maximum of `a` and `b`, as computed by _mm_max_ps(a, b).
vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
    const simd4f larger = _mm_max_ps(a, b);
    return larger;
}
#ifdef __cplusplus
}
#endif
#endif