413 lines
12 KiB
C++
413 lines
12 KiB
C++
/*
|
|
Vectorial
|
|
Copyright (c) 2010 Mikko Lehtonen
|
|
Copyright (c) 2014 Google, Inc.
|
|
Licensed under the terms of the two-clause BSD License (see LICENSE)
|
|
*/
|
|
#ifndef VECTORIAL_SIMD4X4F_H
|
|
#define VECTORIAL_SIMD4X4F_H
|
|
|
|
|
|
#include "simd4f.h"
|
|
|
|
#include <math.h>
|
|
|
|
/*
|
|
Note, x,y,z,w are conceptually columns with matrix math.
|
|
*/
|
|
|
|
typedef struct {
|
|
simd4f x,y,z,w;
|
|
} simd4x4f;
|
|
|
|
|
|
|
|
vectorial_inline simd4x4f simd4x4f_create(simd4f x, simd4f y, simd4f z, SIMD_PARAM(simd4f, w)) {
|
|
simd4x4f s = { x, y, z, w };
|
|
return s;
|
|
}
|
|
|
|
|
|
vectorial_inline void simd4x4f_identity(simd4x4f* m) {
|
|
*m = simd4x4f_create( simd4f_create(1.0f, 0.0f, 0.0f, 0.0f),
|
|
simd4f_create(0.0f, 1.0f, 0.0f, 0.0f),
|
|
simd4f_create(0.0f, 0.0f, 1.0f, 0.0f),
|
|
simd4f_create(0.0f, 0.0f, 0.0f, 1.0f));
|
|
}
|
|
|
|
|
|
|
|
vectorial_inline void simd4x4f_uload(simd4x4f* m, const float *f) {
|
|
|
|
m->x = simd4f_uload4(f + 0);
|
|
m->y = simd4f_uload4(f + 4);
|
|
m->z = simd4f_uload4(f + 8);
|
|
m->w = simd4f_uload4(f + 12);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
#ifdef VECTORIAL_SCALAR
|
|
#include "simd4x4f_scalar.h"
|
|
#elif defined(VECTORIAL_SSE)
|
|
#include "simd4x4f_sse.h"
|
|
#elif defined(VECTORIAL_GNU)
|
|
#include "simd4x4f_gnu.h"
|
|
#elif defined(VECTORIAL_NEON)
|
|
#include "simd4x4f_neon.h"
|
|
#else
|
|
#error No implementation defined
|
|
#endif
|
|
|
|
vectorial_inline void simd4x4f_sum(const simd4x4f* a, simd4f* out) {
|
|
simd4f t;
|
|
t = simd4f_add(a->x, a->y);
|
|
t = simd4f_add(t, a->z);
|
|
t = simd4f_add(t, a->w);
|
|
*out = t;
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_matrix_vector_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
|
|
|
|
const simd4f x = a->x;
|
|
const simd4f y = a->y;
|
|
const simd4f z = a->z;
|
|
const simd4f w = a->w;
|
|
const simd4f v = *b;
|
|
const simd4f vx = simd4f_splat_x(v);
|
|
const simd4f vy = simd4f_splat_y(v);
|
|
const simd4f vz = simd4f_splat_z(v);
|
|
const simd4f vw = simd4f_splat_w(v);
|
|
|
|
#if 0
|
|
// In a hasty benchmark, this actually performed worse on neon
|
|
// TODO: revisit and conditionalize accordingly
|
|
|
|
*out = simd4f_madd(x, vx,
|
|
simd4f_madd(y, vy,
|
|
simd4f_madd(z, vz,
|
|
simd4f_mul(w, vw) ) ) );
|
|
|
|
#else
|
|
|
|
*out = simd4f_add(simd4f_mul(x, vx),
|
|
simd4f_add(simd4f_mul(y, vy),
|
|
simd4f_add(simd4f_mul(z, vz),
|
|
simd4f_mul(w, vw) ) ) );
|
|
|
|
#endif
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
|
|
|
|
#if 0
|
|
*out = simd4f_madd( a->x, simd4f_splat_x(*b),
|
|
simd4f_madd( a->y, simd4f_splat_y(*b),
|
|
simd4f_mul(a->z, simd4f_splat_z(*b)) ) );
|
|
#else
|
|
*out = simd4f_add( simd4f_mul(a->x, simd4f_splat_x(*b)),
|
|
simd4f_add( simd4f_mul(a->y, simd4f_splat_y(*b)),
|
|
simd4f_mul(a->z, simd4f_splat_z(*b)) ) );
|
|
#endif
|
|
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
|
|
|
|
#if 0
|
|
*out = simd4f_madd( a->x, simd4f_splat_x(*b),
|
|
simd4f_madd( a->y, simd4f_splat_y(*b),
|
|
simd4f_madd( a->z, simd4f_splat_z(*b),
|
|
a->w ) ) );
|
|
#else
|
|
*out = simd4f_add( simd4f_mul(a->x, simd4f_splat_x(*b)),
|
|
simd4f_add( simd4f_mul(a->y, simd4f_splat_y(*b)),
|
|
simd4f_add( simd4f_mul(a->z, simd4f_splat_z(*b)),
|
|
a->w ) ) );
|
|
#endif
|
|
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_inv_ortho_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
|
|
simd4f translation = simd4f_sub(*b, a->w);
|
|
|
|
simd4x4f transpose = *a;
|
|
|
|
transpose.w = simd4f_create(0,0,0,0);
|
|
simd4x4f_transpose_inplace(&transpose);
|
|
|
|
simd4x4f_matrix_point3_mul(&transpose, &translation, out);
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_inv_ortho_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
|
|
simd4f translation = *b;
|
|
|
|
simd4x4f transpose = *a;
|
|
|
|
transpose.w = simd4f_create(0,0,0,0);
|
|
simd4x4f_transpose_inplace(&transpose);
|
|
|
|
simd4x4f_matrix_vector3_mul(&transpose, &translation, out);
|
|
}
|
|
|
|
|
|
vectorial_inline void simd4x4f_matrix_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
|
|
|
|
simd4x4f_matrix_vector_mul(a, &b->x, &out->x);
|
|
simd4x4f_matrix_vector_mul(a, &b->y, &out->y);
|
|
simd4x4f_matrix_vector_mul(a, &b->z, &out->z);
|
|
simd4x4f_matrix_vector_mul(a, &b->w, &out->w);
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
vectorial_inline void simd4x4f_perspective(simd4x4f *m, float fovy_radians, float aspect, float znear, float zfar) {
|
|
|
|
float deltaz = zfar - znear;
|
|
float cotangent = tanf( VECTORIAL_HALFPI - fovy_radians * 0.5f );
|
|
|
|
float a = cotangent / aspect;
|
|
float b = cotangent;
|
|
float c = -(zfar + znear) / deltaz;
|
|
float d = -2 * znear * zfar / deltaz;
|
|
|
|
m->x = simd4f_create( a, 0, 0, 0);
|
|
m->y = simd4f_create( 0, b, 0, 0);
|
|
m->z = simd4f_create( 0, 0, c, -1);
|
|
m->w = simd4f_create( 0, 0, d, 0);
|
|
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_ortho(simd4x4f *m, float left, float right, float bottom, float top, float znear, float zfar) {
|
|
|
|
float deltax = right - left;
|
|
float deltay = top - bottom;
|
|
float deltaz = zfar - znear;
|
|
|
|
float a = 2.0f / deltax;
|
|
float b = -(right + left) / deltax;
|
|
float c = 2.0f / deltay;
|
|
float d = -(top + bottom) / deltay;
|
|
float e = -2.0f / deltaz;
|
|
float f = -(zfar + znear) / deltaz;
|
|
|
|
m->x = simd4f_create( a, 0, 0, 0);
|
|
m->y = simd4f_create( 0, c, 0, 0);
|
|
m->z = simd4f_create( 0, 0, e, 0);
|
|
m->w = simd4f_create( b, d, f, 1);
|
|
|
|
}
|
|
|
|
|
|
vectorial_inline void simd4x4f_lookat(simd4x4f *m, simd4f eye, simd4f center, simd4f up) {
|
|
|
|
simd4f zaxis = simd4f_normalize3( simd4f_sub(center, eye) );
|
|
simd4f xaxis = simd4f_normalize3( simd4f_cross3( zaxis, up ) );
|
|
simd4f yaxis = simd4f_cross3(xaxis, zaxis);
|
|
|
|
zaxis = simd4f_sub( simd4f_zero(), zaxis);
|
|
|
|
float x = -simd4f_dot3_scalar(xaxis, eye);
|
|
float y = -simd4f_dot3_scalar(yaxis, eye);
|
|
float z = -simd4f_dot3_scalar(zaxis, eye);
|
|
|
|
m->x = xaxis;
|
|
m->y = yaxis;
|
|
m->z = zaxis;
|
|
|
|
m->w = simd4f_create( 0,0,0, 1);
|
|
simd4x4f_transpose_inplace(m);
|
|
m->w = simd4f_create( x,y,z,1);
|
|
|
|
}
|
|
|
|
|
|
vectorial_inline void simd4x4f_translation(simd4x4f* m, float x, float y, float z) {
|
|
*m = simd4x4f_create( simd4f_create(1.0f, 0.0f, 0.0f, 0.0f),
|
|
simd4f_create(0.0f, 1.0f, 0.0f, 0.0f),
|
|
simd4f_create(0.0f, 0.0f, 1.0f, 0.0f),
|
|
simd4f_create( x, y, z, 1.0f));
|
|
}
|
|
|
|
|
|
vectorial_inline void simd4x4f_axis_rotation(simd4x4f* m, float radians, simd4f axis) {
|
|
|
|
radians = -radians;
|
|
|
|
axis = simd4f_normalize3(axis);
|
|
|
|
const float sine = sinf(radians);
|
|
const float cosine = cosf(radians);
|
|
|
|
const float x = simd4f_get_x(axis);
|
|
const float y = simd4f_get_y(axis);
|
|
const float z = simd4f_get_z(axis);
|
|
|
|
const float ab = x * y * (1 - cosine);
|
|
const float bc = y * z * (1 - cosine);
|
|
const float ca = z * x * (1 - cosine);
|
|
|
|
const float tx = x * x;
|
|
const float ty = y * y;
|
|
const float tz = z * z;
|
|
|
|
const simd4f i = simd4f_create( tx + cosine * (1 - tx), ab - z * sine, ca + y * sine, 0);
|
|
const simd4f j = simd4f_create( ab + z * sine, ty + cosine * (1 - ty), bc - x * sine, 0);
|
|
const simd4f k = simd4f_create( ca - y * sine, bc + x * sine, tz + cosine * (1 - tz), 0);
|
|
|
|
*m = simd4x4f_create( i,j,k, simd4f_create(0, 0, 0, 1) );
|
|
|
|
}
|
|
|
|
|
|
|
|
vectorial_inline void simd4x4f_add(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
|
|
|
|
out->x = simd4f_add(a->x, b->x);
|
|
out->y = simd4f_add(a->y, b->y);
|
|
out->z = simd4f_add(a->z, b->z);
|
|
out->w = simd4f_add(a->w, b->w);
|
|
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_sub(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
|
|
|
|
out->x = simd4f_sub(a->x, b->x);
|
|
out->y = simd4f_sub(a->y, b->y);
|
|
out->z = simd4f_sub(a->z, b->z);
|
|
out->w = simd4f_sub(a->w, b->w);
|
|
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
|
|
|
|
out->x = simd4f_mul(a->x, b->x);
|
|
out->y = simd4f_mul(a->y, b->y);
|
|
out->z = simd4f_mul(a->z, b->z);
|
|
out->w = simd4f_mul(a->w, b->w);
|
|
|
|
}
|
|
|
|
vectorial_inline void simd4x4f_div(simd4x4f* a, simd4x4f* b, simd4x4f* out) {
|
|
|
|
out->x = simd4f_div(a->x, b->x);
|
|
out->y = simd4f_div(a->y, b->y);
|
|
out->z = simd4f_div(a->z, b->z);
|
|
out->w = simd4f_div(a->w, b->w);
|
|
|
|
}
|
|
|
|
vectorial_inline simd4f simd4x4f_inverse(const simd4x4f* a, simd4x4f* out) {
|
|
|
|
const simd4f c0 = a->x;
|
|
const simd4f c1 = a->y;
|
|
const simd4f c2 = a->z;
|
|
const simd4f c3 = a->w;
|
|
|
|
const simd4f c0_wxyz = simd4f_shuffle_wxyz(c0);
|
|
const simd4f c0_zwxy = simd4f_shuffle_zwxy(c0);
|
|
const simd4f c0_yzwx = simd4f_shuffle_yzwx(c0);
|
|
|
|
const simd4f c1_wxyz = simd4f_shuffle_wxyz(c1);
|
|
const simd4f c1_zwxy = simd4f_shuffle_zwxy(c1);
|
|
const simd4f c1_yzwx = simd4f_shuffle_yzwx(c1);
|
|
|
|
const simd4f c2_wxyz = simd4f_shuffle_wxyz(c2);
|
|
const simd4f c2_zwxy = simd4f_shuffle_zwxy(c2);
|
|
const simd4f c2_yzwx = simd4f_shuffle_yzwx(c2);
|
|
|
|
const simd4f c3_wxyz = simd4f_shuffle_wxyz(c3);
|
|
const simd4f c3_zwxy = simd4f_shuffle_zwxy(c3);
|
|
const simd4f c3_yzwx = simd4f_shuffle_yzwx(c3);
|
|
|
|
const simd4f c0_wxyz_x_c1 = simd4f_mul(c0_wxyz, c1);
|
|
const simd4f c0_wxyz_x_c1_yzwx = simd4f_mul(c0_wxyz, c1_yzwx);
|
|
const simd4f c0_wxyz_x_c1_zwxy = simd4f_mul(c0_wxyz, c1_zwxy);
|
|
|
|
const simd4f c2_wxyz_x_c3 = simd4f_mul(c2_wxyz, c3);
|
|
const simd4f c2_wxyz_x_c3_yzwx = simd4f_mul(c2_wxyz, c3_yzwx);
|
|
const simd4f c2_wxyz_x_c3_zwxy = simd4f_mul(c2_wxyz, c3_zwxy);
|
|
|
|
const simd4f ar1 = simd4f_sub( simd4f_shuffle_wxyz(c2_wxyz_x_c3_zwxy), simd4f_shuffle_zwxy(c2_wxyz_x_c3) );
|
|
const simd4f ar2 = simd4f_sub( simd4f_shuffle_zwxy(c2_wxyz_x_c3_yzwx), c2_wxyz_x_c3_yzwx );
|
|
const simd4f ar3 = simd4f_sub( c2_wxyz_x_c3_zwxy, simd4f_shuffle_wxyz(c2_wxyz_x_c3) );
|
|
|
|
const simd4f br1 = simd4f_sub( simd4f_shuffle_wxyz(c0_wxyz_x_c1_zwxy), simd4f_shuffle_zwxy(c0_wxyz_x_c1) );
|
|
const simd4f br2 = simd4f_sub( simd4f_shuffle_zwxy(c0_wxyz_x_c1_yzwx), c0_wxyz_x_c1_yzwx );
|
|
const simd4f br3 = simd4f_sub( c0_wxyz_x_c1_zwxy, simd4f_shuffle_wxyz(c0_wxyz_x_c1) );
|
|
|
|
|
|
const simd4f c0_sum = simd4f_madd(c0_yzwx, ar3,
|
|
simd4f_madd(c0_zwxy, ar2,
|
|
simd4f_mul(c0_wxyz, ar1)));
|
|
|
|
const simd4f c1_sum = simd4f_madd(c1_wxyz, ar1,
|
|
simd4f_madd(c1_zwxy, ar2,
|
|
simd4f_mul(c1_yzwx, ar3)));
|
|
|
|
const simd4f c2_sum = simd4f_madd(c2_yzwx, br3,
|
|
simd4f_madd(c2_zwxy, br2,
|
|
simd4f_mul(c2_wxyz, br1)));
|
|
|
|
const simd4f c3_sum = simd4f_madd(c3_yzwx, br3,
|
|
simd4f_madd(c3_zwxy, br2,
|
|
simd4f_mul(c3_wxyz, br1)));
|
|
|
|
|
|
const simd4f d0 = simd4f_mul(c1_sum, c0);
|
|
const simd4f d1 = simd4f_add(d0, simd4f_merge_high(d0, d0));
|
|
const simd4f det = simd4f_sub(d1, simd4f_splat_y(d1));
|
|
|
|
const simd4f invdet = simd4f_splat_x( simd4f_div(simd4f_splat(1.0f), det) );
|
|
|
|
const simd4f o0 = simd4f_mul( simd4f_flip_sign_0101(c1_sum), invdet );
|
|
const simd4f o1 = simd4f_mul( simd4f_flip_sign_1010(c0_sum), invdet );
|
|
const simd4f o2 = simd4f_mul( simd4f_flip_sign_0101(c3_sum), invdet );
|
|
const simd4f o3 = simd4f_mul( simd4f_flip_sign_1010(c2_sum), invdet );
|
|
|
|
const simd4x4f mt = simd4x4f_create(o0, o1, o2, o3);
|
|
|
|
simd4x4f_transpose( &mt, out);
|
|
|
|
return det;
|
|
}
|
|
|
|
#ifdef __cplusplus
|
|
|
|
#ifdef VECTORIAL_OSTREAM
|
|
#include <ostream>
|
|
|
|
vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4x4f& v) {
|
|
os << "simd4x4f(simd4f(" << simd4f_get_x(v.x) << ", "
|
|
<< simd4f_get_y(v.x) << ", "
|
|
<< simd4f_get_z(v.x) << ", "
|
|
<< simd4f_get_w(v.x) << "),\n"
|
|
<< " simd4f(" << simd4f_get_x(v.y) << ", "
|
|
<< simd4f_get_y(v.y) << ", "
|
|
<< simd4f_get_z(v.y) << ", "
|
|
<< simd4f_get_w(v.y) << "),\n"
|
|
<< " simd4f(" << simd4f_get_x(v.z) << ", "
|
|
<< simd4f_get_y(v.z) << ", "
|
|
<< simd4f_get_z(v.z) << ", "
|
|
<< simd4f_get_w(v.z) << "),\n"
|
|
<< " simd4f(" << simd4f_get_x(v.w) << ", "
|
|
<< simd4f_get_y(v.w) << ", "
|
|
<< simd4f_get_z(v.w) << ", "
|
|
<< simd4f_get_w(v.w) << "))";
|
|
return os;
|
|
}
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
#endif
|