Added own camera object and its vectorial dependency.

2021-11-12 11:00:39 +01:00 · 2021-11-12 11:00:39 +01:00 · 94cc4aeb1d
commit 94cc4aeb1d
parent b78045ffe7
53 changed files with 7413 additions and 50 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,7 +1,10 @@
 .*.swp
 *CMakeFiles/**
 /.idea
 /scratch*
 /build*
 /cmake-build*
 /3rdparty/ozz-animation/media
 **/CMakeFiles
--- a/3rdparty/vectorial/.gitignore
+++ b/3rdparty/vectorial/.gitignore
@ -0,0 +1,3 @@
 *.o
 *.orig
 specsuite-*
--- a/3rdparty/vectorial/.travis.yml
+++ b/3rdparty/vectorial/.travis.yml
@ -0,0 +1,6 @@
 language: cpp
 compiler:
  - gcc
  - clang
 script: make
--- a/3rdparty/vectorial/LICENSE
+++ b/3rdparty/vectorial/LICENSE
@ -0,0 +1,22 @@
 Copyright 2010 Mikko Lehtonen. All rights reserved.
 Redistribution and use in source and binary forms, with or without modification, are
 permitted provided that the following conditions are met:
   1. Redistributions of source code must retain the above copyright notice, this list of
      conditions and the following disclaimer.
   2. Redistributions in binary form must reproduce the above copyright notice, this list
      of conditions and the following disclaimer in the documentation and/or other materials
      provided with the distribution.
 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 POSSIBILITY OF SUCH DAMAGE.
--- a/3rdparty/vectorial/Makefile
+++ b/3rdparty/vectorial/Makefile
@ -0,0 +1,294 @@
 # Toolchain selection. CXX is only assigned here if it is not already set;
 # the clang pair is switched in further down when CLANG=1.
 CXX?=g++
 CLANG_CC=clang
 CLANG_CXX=clang++
 # iPhone cross toolchain, used further down when ARM=1 on a Darwin host.
 IPHONE_PLATFORM_PATH = /Developer/Platforms/iPhoneOS.platform/Developer
 IPHONE_ISYSROOT_PATH = $(IPHONE_PLATFORM_PATH)/SDKs/iPhoneOS4.2.sdk/
 IPHONE_CC = $(IPHONE_PLATFORM_PATH)/usr/bin/g++ -isysroot $(IPHONE_ISYSROOT_PATH)   -arch armv7
 # -mfloat-abi=softfp -mfpu=neon  
 #CXXFLAGS += -Iinclude -O0
 #CXXFLAGS += -g -Iinclude -Wall -Wextra -pedantic -Wno-unused -O3 -fstrict-aliasing -Wstrict-aliasing=2 -ffast-math 
 # Active flag set. Appended with += so anything already in CXXFLAGS is kept.
 CXXFLAGS += -Iinclude -Wall -Wextra -pedantic -Wno-unused -O3 -fstrict-aliasing -Wstrict-aliasing=2 -ffast-math  -D__extern_always_inline=inline
 # Source and object lists. They are expanded once, here (:=); a recursive
 # `=` would re-run the $(wildcard) glob every time a list is referenced.
 SPEC_SRC := $(wildcard spec/*.cpp)
 SPEC_OBJ := $(SPEC_SRC:.cpp=.o)
 BENCH_SRC := $(wildcard bench/*.cpp)
 BENCH_OBJ := $(BENCH_SRC:.cpp=.o)
 # Variant suffix: empty for the auto-detected build, replaced by the
 # FORCE_* conditionals below (-scalar, -sse, -gnu, -neon).
 SUFFIX=
 # Deliberately recursive (=): SUFFIX gets its final value only later.
 BENCH_ASM = $(patsubst %.cpp,asm$(SUFFIX)/%.S,$(BENCH_SRC))
 DEFAULT_CC=1
 # Build-variant switches. Each FORCE_* knob pins one SIMD back-end through
 # -DVECTORIAL_FORCED plus a back-end macro, and selects the matching SUFFIX.
 #
 # The assignments are indented with spaces on purpose: a tab-indented line
 # is only read as an assignment while no rule precedes it, and would turn
 # into recipe text if a rule were ever moved above these conditionals.
 ifeq ($(FORCE_SCALAR),1)
   CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_SCALAR
   SUFFIX=-scalar
 endif
 ifeq ($(FORCE_SSE),1)
   CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_SSE -msse -msse2 -mfpmath=sse
   SUFFIX=-sse
 endif
 ifeq ($(FORCE_GNU),1)
   CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_GNU
   #-msse -msse2 -mfpmath=sse
   SUFFIX=-gnu
 endif
 ifeq ($(FORCE_NEON),1)
   CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_NEON
   SUFFIX=-neon
   ARM=1
 endif
 # ARM builds: on a Darwin host switch to the iPhone cross compiler.
 ifeq ($(ARM),1)
 ifeq ($(shell uname -s),Darwin)
   CC=$(IPHONE_CC)
   CXX=$(IPHONE_CC)
 endif
 #  CXXFLAGS+= -mcpu=cortex-a8
   CXXFLAGS+= -mno-thumb -mfloat-abi=softfp -mfpu=neon
   DEFAULT_CC=0
 endif
 ifeq ($(CLANG),1)
   CC=$(CLANG_CC)
   CXX=$(CLANG_CXX)
   DEFAULT_CC=0
 endif
 ifeq ($(DEFAULT_CC),1)
 #  CXXFLAGS += -msse -msse2 -mfpmath=sse
 endif
 # ASM=1 makes every compile emit assembly instead of object code.
 ifeq ($(ASM),1)
   CC+= -S
   CXX+= -S
 endif
 # Per-variant object directory (build, build-sse, ...); the object lists
 # are re-rooted underneath it.
 BUILDDIR=build$(SUFFIX)
 SPEC_OBJ := $(addprefix $(BUILDDIR)/,$(SPEC_OBJ))
 BENCH_OBJ := $(addprefix $(BUILDDIR)/,$(BENCH_OBJ))
 # SILENT prefixes the recipe lines below with @ so they are not echoed.
 SILENT=@
 MKDIR=mkdir -p
 PATH_SEPARATOR=/
 # Compile one .cpp into the variant build directory, creating the object's
 # directory first. COMPILE.cc is GNU make's built-in C++ compile command
 # (compiler, CXXFLAGS, CPPFLAGS, TARGET_ARCH and -c).
 $(BUILDDIR)/%.o: %.cpp
 	@echo CXX $<
 	$(SILENT) $(MKDIR) $(subst /,$(PATH_SEPARATOR),$(dir $@))
 	$(SILENT) $(COMPILE.cc) -o $@ $<
 # Default goal: build the spec suite for the current variant and run it.
 .PHONY: all
 all: specsuite$(SUFFIX)
 	./specsuite$(SUFFIX)
 # Build the scalar, gnu and sse spec suites through recursive make (the
 # FORCE_* knob is passed in the environment), then run all three.
 # The neon variant is commented out.
 .PHONY: full
 full:
 	@clear
 	@echo FULL COMPILE at `date +%H:%M:%S`
 #	FORCE_SCALAR=1 $(MAKE) clean 
 	@FORCE_SCALAR=1 $(MAKE)  specsuite-scalar
 #	FORCE_GNU=1 $(MAKE) clean 
 	@FORCE_GNU=1 $(MAKE)  specsuite-gnu
 #	FORCE_SSE=1 $(MAKE) clean 
 	@FORCE_SSE=1 $(MAKE)  specsuite-sse
 #	FORCE_NEON=1 $(MAKE) clean 
 #	FORCE_NEON=1 $(MAKE) specsuite-neon
 	@./specsuite-scalar
 	@./specsuite-sse
 	@./specsuite-gnu
 # Link the spec objects of the current variant into its test binary.
 specsuite$(SUFFIX): $(SPEC_OBJ)
 	@echo LINK $@
 	@$(CXX) $(LDFLAGS) $^ -o $@
 # Run makedepend over the spec and bench sources. Its output, including any
 # error messages, is discarded; the Makefile.bak it leaves is then removed.
 .PHONY: depend
 depend:
 	@echo DEP
 	@makedepend -Y -- $(CXXFLAGS) -- $(SPEC_SRC) $(BENCH_SRC) -p$(BUILDDIR)/ > /dev/null 2>&1 
 	@$(RM) Makefile.bak
 # Shell command that writes an assembly listing of source file $(1) below
 # asm$(SUFFIX)/. It is a single line without a leading @ so that several
 # expansions can be chained with && in bench-asm.
 define asm-command
 mkdir -p $(dir asm$(SUFFIX)/$(1)) && $(CXX) $(CXXFLAGS) -S $(1) -o asm$(SUFFIX)/$(1).S
 endef
 # $(foreach) joins its expansions with one space. The per-file commands are
 # therefore chained explicitly with &&, ending in `true`; otherwise the next
 # file's mkdir would be appended to the previous compiler command line.
 # bench-asm names no file, so it is declared phony.
 .PHONY: bench-asm
 bench-asm: $(BENCH_SRC)
 	$(foreach p,$(BENCH_SRC),$(call asm-command,$(p)) &&) true
 # Link the benchmark binary; LDFLAGS is honoured like in the specsuite link.
 benchmark$(SUFFIX): $(BENCH_OBJ) bench-asm
 	$(CXX) $(LDFLAGS) $(BENCH_OBJ) -o $@
 # Build the scalar, gnu and sse benchmark binaries through recursive make,
 # then run all three. The neon variant is commented out.
 .PHONY: bench-full
 bench-full:
 	FORCE_SCALAR=1 $(MAKE) benchmark-scalar
 	FORCE_GNU=1 $(MAKE) benchmark-gnu
 	FORCE_SSE=1 $(MAKE) benchmark-sse
 #	FORCE_NEON=1 $(MAKE) clean 
 #	FORCE_NEON=1 $(MAKE) benchmark-neon
 	./benchmark-scalar
 	./benchmark-sse
 	./benchmark-gnu
 # Remove the build products of the CURRENT variant only (see SUFFIX).
 .PHONY: clean
 clean:
 	$(RM) $(SPEC_OBJ) $(BENCH_OBJ) benchmark$(SUFFIX) specsuite$(SUFFIX)
 	$(RM) -r asm$(SUFFIX)
 # Remove the products of every variant. This includes the benchmark-*
 # binaries and asm-* listing directories that bench-full creates, which
 # were previously left behind.
 .PHONY: realclean
 realclean: clean
 	$(RM) specsuite* benchmark*
 	$(RM) -r build* asm-*
 # Run the tools/update_spec.rb helper script over every spec source file.
 .PHONY: update_spec
 update_spec:
 	./tools/update_spec.rb spec/spec_*.cpp
 # `make export to=/some/dir` unpacks the master branch into an existing
 # directory. The guard fires whenever export is among the requested goals
 # and `to` is unset OR empty, so tar is never run with a missing -C argument.
 ifneq ($(filter export,$(MAKECMDGOALS)),)
 ifeq ($(strip $(to)),)
 $(error to not set, like  make export to=/foo/bar)
 endif
 endif
 .PHONY: export
 export:
 	$(SILENT) git archive --format tar master | tar x -C "$(to)"
 # Hand-maintained header dependencies. None of these rules has a recipe;
 # they only record which file includes which.
 # simd4f_scalar.h is listed twice for simd4f.h, which is harmless.
 include/vectorial/vec2f.h include/vectorial/vec3f.h include/vectorial/vec4f.h: include/vectorial/simd4f.h
 include/vectorial/simd4f.h: include/vectorial/simd4f_scalar.h
 include/vectorial/simd4f.h: include/vectorial/simd4f_neon.h
 include/vectorial/simd4f.h: include/vectorial/simd4f_gnu.h
 include/vectorial/simd4f.h: include/vectorial/simd4f_sse.h
 include/vectorial/simd4f.h: include/vectorial/simd4f_scalar.h
 include/vectorial/simd4f.h: include/vectorial/config.h
 include/vectorial/simd4x4f.h: include/vectorial/simd4f.h
 include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_scalar.h
 include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_neon.h
 include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_gnu.h
 include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_sse.h
 include/vectorial/simd4x4f.h: include/vectorial/config.h
 spec/spec_helper.h: include/vectorial/simd4x4f.h include/vectorial/simd4f.h include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h
 spec/spec.cpp: spec/spec.h
 spec/spec_main.cpp: spec/spec.h
 spec/spec_simd4f.cpp: spec/spec_helper.h
 spec/spec_simd4x4f.cpp: spec/spec_helper.h
 spec/spec_vec2f.cpp: spec/spec_helper.h
 spec/spec_vec3f.cpp: spec/spec_helper.h
 spec/spec_vec4f.cpp: spec/spec_helper.h
 # Per-object header lists for the spec suite, also written by hand.
 $(BUILDDIR)/spec/spec_simd4f.o: \
  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
  include/vectorial/config.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: \
  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
  include/vectorial/simd4x4f_scalar.h include/vectorial/simd4x4f_neon.h \
  include/vectorial/simd4x4f_gnu.h include/vectorial/simd4x4f_sse.h include/vectorial/config.h
 $(BUILDDIR)/spec/spec_vec2f.o $(BUILDDIR)/spec/spec_vec3f.o $(BUILDDIR)/spec/spec_vec4f.o: \
  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
  include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h \
  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
  include/vectorial/simd4x4f_scalar.h include/vectorial/simd4x4f_neon.h \
  include/vectorial/simd4x4f_gnu.h include/vectorial/simd4x4f_sse.h include/vectorial/config.h
 # DO NOT DELETE
 $(BUILDDIR)/spec/spec.o: spec/spec.h
 $(BUILDDIR)/spec/spec_main.o: spec/spec.h
 $(BUILDDIR)/spec/spec_mat4f.o: spec/spec_helper.h spec/spec.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/config.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f_gnu.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f_common.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/vec2f.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4x4f.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4x4f_gnu.h
 $(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/mat4f.h
 $(BUILDDIR)/spec/spec_simd4f.o: spec/spec_helper.h spec/spec.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/config.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f_gnu.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f_common.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/vec2f.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4x4f.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4x4f_gnu.h
 $(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/mat4f.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: spec/spec_helper.h spec/spec.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/config.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f_gnu.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f_common.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec4f.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec3f.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec2f.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4x4f.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4x4f_gnu.h
 $(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/mat4f.h
 $(BUILDDIR)/spec/spec_vec2f.o: spec/spec_helper.h spec/spec.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/config.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f_gnu.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f_common.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/vec2f.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4x4f.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4x4f_gnu.h
 $(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/mat4f.h
 $(BUILDDIR)/spec/spec_vec3f.o: spec/spec_helper.h spec/spec.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/config.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f_gnu.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f_common.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/vec2f.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4x4f.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4x4f_gnu.h
 $(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/mat4f.h
 $(BUILDDIR)/spec/spec_vec4f.o: spec/spec_helper.h spec/spec.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/config.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f_gnu.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f_common.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/vec2f.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4x4f.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4x4f_gnu.h
 $(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/mat4f.h
 $(BUILDDIR)/bench/add_bench.o: bench/bench.h include/vectorial/vec4f.h
 $(BUILDDIR)/bench/bench.o: bench/bench.h include/vectorial/config.h
 $(BUILDDIR)/bench/dot_bench.o: bench/bench.h include/vectorial/vec4f.h
 $(BUILDDIR)/bench/matrix_bench.o: bench/bench.h include/vectorial/simd4x4f.h
 $(BUILDDIR)/bench/matrix_bench.o: include/vectorial/simd4f.h
 $(BUILDDIR)/bench/matrix_bench.o: include/vectorial/simd4x4f_gnu.h
 $(BUILDDIR)/bench/quad_bench.o: bench/bench.h include/vectorial/simd4x4f.h
 $(BUILDDIR)/bench/quad_bench.o: include/vectorial/simd4f.h
 $(BUILDDIR)/bench/quad_bench.o: include/vectorial/simd4x4f_gnu.h
--- a/3rdparty/vectorial/README
+++ b/3rdparty/vectorial/README
@ -0,0 +1,60 @@
    Vectorial - vector math library
  Motivation
    I couldn't find an open source math library that was usable and
    supported simd - especially the ARM NEON variant.
  Features
    Supports NEON, SSE, scalar and generic gcc vector extension.
    Most basic vector and matrix math is available, but not quite
    yet full featured.
  Design
    Vectorial consists of two main parts, pure-C wrapper around
    platform-specific vector instructions in the simd*.h files
    and C++ classes for common uses, the vec*.h and mat*.h
    The config.h autodetects appropriate vector instructions to use.
    The platform-specific support is done with intrinsics only,
    allowing the compiler to have a full view of the code, hopefully
    resulting in better optimizations especially with reordering etc.
  Installation / Usage
    Add vectorial/include to your include path
    #include "vectorial/simd4f.h"  
    for C-only simd wrapper, using it looks like this:
      simd4f v = simd4f_normalize( simd4f_add( simd4f_create(1,2,3,4), y) );
      float z = simd4f_get_z(v);
    #include "vectorial/vectorial.h"
    for C++ classes. They reside in vectorial namespace, you might
    want to alias them to your own namespace
      namespace myproject {
        using namespace ::vectorial;
        // if you like different name: typedef vec3f Vector3;
      }
      using myproject::vec4f;
      vec4f v = normalize( vec4f(1,2,3,4) + y );
      float z = v.z();
  License
    2-clause BSD. See LICENSE
--- a/3rdparty/vectorial/bench/add_bench.cpp
+++ b/3rdparty/vectorial/bench/add_bench.cpp
@ -0,0 +1,60 @@
 #include "bench.h"
 #include <stdlib.h>
 #include <iostream>
 #include "vectorial/vec4f.h"
 #define NUM (81920)
 #define ITER 100
 using namespace vectorial;
 namespace {
    // Returns uninitialised storage for n vec4f values, requested from
    // memalign() with 16-byte alignment.
    vec4f* alloc_vec4f(size_t n) {
        const size_t bytes = n * sizeof(vec4f);
        return static_cast<vec4f*>(memalign(bytes, 16));
    }
 }
 static vec4f * a;
 static vec4f * b;
 static vec4f * c;
 void add_func() {
    vec4f* vectorial_restrict aa = a;
    vec4f* vectorial_restrict bb = b;
    vec4f* vectorial_restrict cc = c;
    for(size_t i = 0; i < NUM; ++i)
    {
        cc[i] = aa[i] + bb[i];
    }    
 }
 // Allocates the three work buffers, fills a with rising and b with falling
 // values, times add_func over ITER runs, then releases the buffers.
 void add_bench() {
    a = alloc_vec4f(NUM);
    b = alloc_vec4f(NUM);
    c = alloc_vec4f(NUM);

    for (size_t i = 0; i < NUM; ++i) {
        a[i] = vec4f(i, i, i, i);
        b[i] = vec4f(NUM-i, NUM-i, NUM-i, NUM-i);
    }

    profile("add", add_func, ITER, NUM);

    memfree(a);
    memfree(b);
    memfree(c);
 }
--- a/3rdparty/vectorial/bench/bench.cpp
+++ b/3rdparty/vectorial/bench/bench.cpp
@ -0,0 +1,117 @@
 #include "bench.h"
 #include <sstream>
 #include <iostream>
 #include "vectorial/config.h"
 // Platform timing back-ends. bench.h defines exactly one of BENCH_MACH,
 // BENCH_GTOD or BENCH_QPC, which selects the code compiled below.
 namespace profiler {
    #ifdef BENCH_MACH
    // Timebase ratio fetched by init() and used by diffTime().
    mach_timebase_info_data_t info;
    void init() {
        mach_timebase_info(&info);
    }
    #endif
    #ifdef BENCH_GTOD
    // gettimeofday needs no set-up.
    void init() {
    }
    #endif
    #ifdef BENCH_QPC
    // Performance-counter frequency, fetched by init() and used by diffTime().
    double frequency;
    void init() {
        LARGE_INTEGER freq;
        QueryPerformanceFrequency(&freq);
        frequency = (double)freq.QuadPart;
    }
    #endif
    // Returns the current timestamp in the platform's raw representation.
    time_t now() {
        #ifdef BENCH_MACH
        return mach_absolute_time();
        #endif
        #ifdef BENCH_GTOD
        time_t v;
        gettimeofday(&v, NULL);
        return v;
        #endif
        #ifdef BENCH_QPC
        LARGE_INTEGER v;
        QueryPerformanceCounter(&v);
        return v;
        #endif
    }
    // Returns end - start converted to seconds.
    double diffTime(time_t start, time_t end) {
        #ifdef BENCH_GTOD
        return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1000000.0;
        #endif
        #ifdef BENCH_MACH        
        // The tick difference is scaled by numer/denom in integer arithmetic
        // before the division by 1e9.
        return ((end-start) * info.numer / info.denom) / 1000000000.0;
        #endif
        #ifdef BENCH_QPC
        return (end.QuadPart - start.QuadPart) / frequency;
        #endif
    }
 }
 // Renders the duration d (in seconds) followed by a unit suffix. The unit
 // is chosen from `relative`: "s" from 1, "ms" from 1e-3, "us" from 1e-6,
 // otherwise "ns". A negative `relative` means "choose the unit from d".
 std::string formatTime(double d, double relative ) {
    struct Unit { double scale; const char* suffix; };
    static const Unit units[] = {
        { 1.0,      "s"  },
        { 0.001,    "ms" },
        { 0.000001, "us" }
    };
    const double reference = (relative < 0.0) ? d : relative;
    std::stringstream text;
    for (unsigned u = 0; u < sizeof(units) / sizeof(units[0]); ++u) {
        if (reference >= units[u].scale) {
            text << d / units[u].scale << units[u].suffix;
            return text.str();
        }
    }
    // Smaller than a microsecond (or not comparable): nanoseconds.
    text << d / 0.000000001 << "ns";
    return text.str();
 }
 // Calls func `iterations` times between two timestamps and prints the SIMD
 // back-end, the test name, and the total, per-iteration and per-element
 // durations to std::cout.
 void profile(const char* name, void (*func)(), int iterations, int elements) {
    profiler::init();

    profiler::time_t start = profiler::now();
    for (int left = iterations; left > 0; --left) {
        func();
    }
    profiler::time_t end = profiler::now();

    const double total = profiler::diffTime(start, end);
    const double perIteration = total / iterations;

    std::cout << "Using simd: " << VECTORIAL_SIMD_TYPE << std::endl;
    std::cout << "Testing: " << name << std::endl;
    std::cout << "Duration " << formatTime(total) << std::endl;
    std::cout << "Per iter " << formatTime(perIteration) << std::endl;
    std::cout << "Per item " << formatTime(perIteration / elements) << std::endl;
 }
 // Benchmark entry points, defined in the sibling *_bench.cpp files.
 void add_bench();
 void dot_bench();
 void quad_bench();
 void matrix_bench();
 // Runs the selected benchmarks; only the matrix benchmark is enabled.
 int main() {
 //    add_bench();
 //    dot_bench();
 //    quad_bench();
    matrix_bench();
    return 0;
 }
--- a/3rdparty/vectorial/bench/bench.h
+++ b/3rdparty/vectorial/bench/bench.h
@ -0,0 +1,65 @@
 #ifndef BENCH_H
 #define BENCH_H
 #include <string>
 #include <stdlib.h>
 #ifdef __APPLE__
    #define BENCH_MACH
    #include <mach/mach_time.h>
    #include <stdint.h>
 #elif defined(_WIN32)
    #define BENCH_QPC
    #define WIN32_LEAN_AND_MEAN
    #include <windows.h>
    #include <malloc.h>
 #else
    #define BENCH_GTOD
    #include <sys/time.h>
 #endif
 // Allocates `count` bytes aligned to `align` bytes; release with memfree().
 // Returns NULL when the allocation fails or the allocator rejects `align`,
 // instead of handing back an indeterminate pointer.
 static void* memalign(size_t count, size_t align) {
    #ifdef _WIN32
    return _aligned_malloc(count,align);
    #else
    void *ptr = NULL;
    // posix_memalign reports errors through its return value and leaves
    // *ptr unspecified on failure, so failure is mapped to NULL explicitly.
    if (posix_memalign(&ptr, align, count) != 0) {
        return NULL;
    }
    return ptr;
    #endif
 }
 // Releases a block obtained from memalign() with the matching deallocator.
 static void memfree(void* ptr) {
    #ifndef _WIN32
    free(ptr);
    #else
    _aligned_free(ptr);
    #endif
 }
 // Timing interface. time_t is the platform's raw timestamp type, selected
 // by the BENCH_* macro defined at the top of this header.
 namespace profiler {
    #ifdef BENCH_GTOD
        typedef struct timeval time_t;
    #endif
    #ifdef BENCH_MACH
        typedef const uint64_t time_t;
    #endif
    #ifdef BENCH_QPC
        typedef LARGE_INTEGER time_t;
    #endif
    // One-time set-up of the timer back-end.
    void init();
    // Current timestamp.
    time_t now();
    // Seconds elapsed between two timestamps.
    double diffTime(time_t start, time_t end);
 }
 std::string formatTime(double d, double relative=-1);
 void profile(const char* name, void (*func)(), int iterations, int elements);
 #endif
--- a/3rdparty/vectorial/bench/dot_bench.cpp
+++ b/3rdparty/vectorial/bench/dot_bench.cpp
@ -0,0 +1,60 @@
 #include "bench.h"
 #include <stdlib.h>
 #include <iostream>
 #include "vectorial/vec4f.h"
 #define NUM (81920)
 #define ITER 100
 using namespace vectorial;
 namespace {
    // Returns uninitialised storage for n vec4f values, requested from
    // memalign() with 16-byte alignment.
    vec4f* alloc_vec4f(size_t n) {
        const size_t bytes = n * sizeof(vec4f);
        return static_cast<vec4f*>(memalign(bytes, 16));
    }
 }
 static vec4f * a;
 static vec4f * b;
 static float * c;
 void dot_func() {
    vec4f* vectorial_restrict aa = a;
    vec4f* vectorial_restrict bb = b;
    float* vectorial_restrict cc = c;
    for(size_t i = 0; i < NUM; ++i)
    {
        cc[i] = dot(aa[i], bb[i]);
    }    
 }
 // Allocates the inputs and the result buffer, fills a with rising and b
 // with falling values, times dot_func over ITER runs, then releases
 // everything.
 void dot_bench() {
    a = alloc_vec4f(NUM);
    b = alloc_vec4f(NUM);
    c = static_cast<float*>(malloc(NUM * sizeof(float)));
    for(size_t i = 0; i < NUM; ++i)
    {
        a[i]=vec4f(i,i,i,i);
        b[i]=vec4f(NUM-i, NUM-i, NUM-i, NUM-i);
    }
    profile("dot", dot_func, ITER, NUM);
    memfree(a);
    memfree(b);
    // c came from plain malloc(), so it has to go back through free():
    // memfree() maps to _aligned_free() on Windows, which must not be given
    // a malloc() pointer.
    free(c);
 }
--- a/3rdparty/vectorial/bench/matrix_bench.cpp
+++ b/3rdparty/vectorial/bench/matrix_bench.cpp
@ -0,0 +1,62 @@
 #include "bench.h"
 #include <stdlib.h>
 #include <iostream>
 #include "vectorial/simd4x4f.h"
 #define NUM (819200)
 #define ITER 100
 //using namespace vectorial;
 namespace {
    // Returns uninitialised storage for n simd4x4f matrices, requested from
    // memalign() with 16-byte alignment.
    simd4x4f* alloc_vec4x4f(size_t n) {
        const size_t bytes = n * sizeof(simd4x4f);
        return static_cast<simd4x4f*>(memalign(bytes, 16));
    }
 }
 static simd4x4f * a;
 static simd4x4f * b;
 static simd4x4f * c;
 void matrix_func() {
    simd4x4f* vectorial_restrict aa = a;
    simd4x4f* vectorial_restrict bb = b;
    simd4x4f* vectorial_restrict cc = c;
    for(size_t i = 0; i < NUM; ++i)
    {
        simd4x4f_matrix_mul(&aa[i], &bb[i], &bb[i]);
    }    
 }
 void matrix_bench() {
    a = alloc_vec4x4f(NUM);
    b = alloc_vec4x4f(NUM);
    c = alloc_vec4x4f(NUM);
    for(size_t i = 0; i < NUM; ++i)
    {
        simd4f v = simd4f_create(i,i,i,i);
        simd4f vi = simd4f_create(NUM-i,NUM-i,NUM-i,NUM-i);
        a[i]=simd4x4f_create(v,v,v,v);
        b[i]=simd4x4f_create(vi,vi,vi,vi);
    }
    profile("matrix mul", matrix_func, ITER, NUM);
    memfree(a);
    memfree(b);
    memfree(c);
 }
--- a/3rdparty/vectorial/bench/quad_bench.cpp
+++ b/3rdparty/vectorial/bench/quad_bench.cpp
@ -0,0 +1,123 @@
 #include "bench.h"
 #include <stdlib.h>
 #include <iostream>
 #include "vectorial/simd4x4f.h"
 #define NUM (81920)
 #define ITER 100
 //using namespace vectorial;
 namespace {
    // Returns uninitialised storage for n simd4x4f matrices, requested from
    // memalign() with 16-byte alignment.
    simd4x4f* alloc_simd4x4f(size_t n) {
        const size_t bytes = n * sizeof(simd4x4f);
        return static_cast<simd4x4f*>(memalign(bytes, 16));
    }
 }
 static simd4x4f * a;
 static simd4x4f * b;
 static simd4x4f * c;
 // Component-wise sum of two matrices, operands passed via SIMD_PARAM and
 // the result returned by value.
 static simd4x4f add_4x4(SIMD_PARAM(simd4x4f, a), SIMD_PARAM(simd4x4f, b)) {
    const simd4f sx = simd4f_add(a.x, b.x);
    const simd4f sy = simd4f_add(a.y, b.y);
    const simd4f sz = simd4f_add(a.z, b.z);
    const simd4f sw = simd4f_add(a.w, b.w);
    return simd4x4f_create(sx, sy, sz, sw);
 }
 // Component-wise sum of two matrices, operands passed by pointer and the
 // result returned by value.
 static simd4x4f add_4x4_rp(simd4x4f *a, simd4x4f *b) {
    const simd4f sx = simd4f_add(a->x, b->x);
    const simd4f sy = simd4f_add(a->y, b->y);
    const simd4f sz = simd4f_add(a->z, b->z);
    const simd4f sw = simd4f_add(a->w, b->w);
    return simd4x4f_create(sx, sy, sz, sw);
 }
 // Component-wise sum of two matrices, stored through `out`. Each component
 // is read and written on its own, in x, y, z, w order.
 static void add_4x4_p(simd4x4f *a, simd4x4f *b, simd4x4f *out) {
    simd4x4f& dst = *out;
    dst.x = simd4f_add(a->x, b->x);
    dst.y = simd4f_add(a->y, b->y);
    dst.z = simd4f_add(a->z, b->z);
    dst.w = simd4f_add(a->w, b->w);
 }
 void quad_return_func() {
    simd4x4f* aa = a;
    simd4x4f* bb = b;
    simd4x4f* cc = c;
    for(size_t i = 0; i < NUM; ++i)
    {
        bb[i] = add_4x4(aa[i], bb[i]);
    }    
 }
 void quad_pointer_func() {
    simd4x4f* aa = a;
    simd4x4f* bb = b;
    simd4x4f* cc = c;
    for(size_t i = 0; i < NUM; ++i)
    {
        add_4x4_p(&aa[i], &bb[i], &bb[i]);
    }
 }
 void quad_pointer_return_func() {
    simd4x4f* aa = a;
    simd4x4f* bb = b;
    simd4x4f* cc = c;
    for(size_t i = 0; i < NUM; ++i)
    {
        bb[i] = add_4x4_rp(&aa[i], &bb[i]);
    }    
 }
 void quad_bench() {
    a = alloc_simd4x4f(NUM);
    b = alloc_simd4x4f(NUM);
    c = alloc_simd4x4f(NUM);
    for(size_t i = 0; i < NUM; ++i)
    {
        simd4f t = simd4f_create(i,i,i,i); 
        simd4f t2 = simd4f_create(NUM-i,NUM-i,NUM-i,NUM-i); 
        a[i]=simd4x4f_create(t,t,t,t);
        b[i]=simd4x4f_create(t2,t2,t2,t2);
    }
    profile("quad return-value", quad_return_func, ITER, NUM);
    profile("quad pass-by-pointer", quad_pointer_func, ITER, NUM);
    profile("quad pass-by-pointer return-value", quad_pointer_return_func, ITER, NUM);
    memfree(a);
    memfree(b);
    memfree(c);
 }
--- a/3rdparty/vectorial/include/vectorial/config.h
+++ b/3rdparty/vectorial/include/vectorial/config.h
@ -0,0 +1,101 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_CONFIG_H
 #define VECTORIAL_CONFIG_H
 #ifndef VECTORIAL_FORCED
    #if defined(__SSE__) || (_M_IX86_FP > 0) || (_M_X64 > 0)
        #define VECTORIAL_SSE
    // __ARM_NEON is used instead of __ARM_NEON__ on armv8.
    #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
        #define VECTORIAL_NEON
    // Don't use gnu extension for arm, buggy with some gccs with armv6 and -Os,
    // Also doesn't seem perform as well
    #elif defined(__GNUC__) && !defined(__arm__)
        #define VECTORIAL_GNU
    #else
        #define VECTORIAL_SCALAR
    #endif
 #endif
 #ifdef VECTORIAL_SCALAR
    #define VECTORIAL_SIMD_TYPE "scalar"
 #endif
 #ifdef VECTORIAL_SSE
    #define VECTORIAL_SIMD_TYPE "sse"
 #endif
 #ifdef VECTORIAL_NEON
    #define VECTORIAL_SIMD_TYPE "neon"
    #define VECTORIAL_HAVE_SIMD2F
 #endif
 #ifdef VECTORIAL_GNU
    #define VECTORIAL_SIMD_TYPE "gnu"
 #endif
 #if defined(VECTORIAL_FORCED) && !defined(VECTORIAL_SIMD_TYPE)
    #error VECTORIAL_FORCED set but no simd-type found, try f.ex. VECTORIAL_SCALAR
 #endif
 #define vectorial_inline    static inline
 // Compiler-specific spellings of `restrict` and of 16-byte alignment.
 #if defined(__GNUC__)
  // GCC-compatible compilers accept __restrict in C as well as in C++, so it
  // is defined unconditionally here. It used to be defined for C++ only,
  // which left vectorial_restrict undefined in C translation units.
  #define vectorial_restrict  __restrict
  #define simd4f_aligned16  __attribute__ ((aligned (16)))
 #elif defined(_WIN32)
  #define vectorial_restrict  
  #define simd4f_aligned16   __declspec(align(16))
 #else
  #define vectorial_restrict  restrict
  #define simd4f_aligned16   
 #endif
 // #define vectorial_restrict
 #ifdef __GNUC__
    #define vectorial_pure __attribute__((pure))
 #else
    #define vectorial_pure
 #endif
 #ifdef _WIN32
  #if defined(min) || defined(max)
 #pragma message ( "set NOMINMAX as preprocessor macro, undefining min/max " )
 #undef min
 #undef max
  #endif
 #endif
 #ifdef __cplusplus
    // Hack around msvc badness
    #define SIMD_PARAM(t, p) const t& p
 #else
    #define SIMD_PARAM(t, p) t p
 #endif
 #define VECTORIAL_PI      3.14159265f
 #define VECTORIAL_HALFPI  1.57079633f
 #endif
--- a/3rdparty/vectorial/include/vectorial/mat4f.h
+++ b/3rdparty/vectorial/include/vectorial/mat4f.h
@ -0,0 +1,197 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_MAT4F_H
 #define VECTORIAL_MAT4F_H
 #ifndef VECTORIAL_SIMD4X4F_H
  #include "vectorial/simd4x4f.h"
 #endif
 #ifndef VECTORIAL_VEC4F_H
  #include "vectorial/vec4f.h"
 #endif
 namespace vectorial {
    // C++ wrapper around a simd4x4f; the wrapped value is public as `value`.
    class mat4f {
    public:
        simd4x4f value;
        // Leaves `value` uninitialised.
        inline mat4f() {}
        inline mat4f(const mat4f& m) : value(m.value) {}
        inline mat4f(const simd4x4f& v) : value(v) {}
        // Builds the matrix from four vec4f values via simd4x4f_create.
        inline mat4f(const vec4f& v0, const vec4f& v1, const vec4f& v2, const vec4f& v3) : value(simd4x4f_create(v0.value, v1.value, v2.value, v3.value)) {}
        // Loads from a float array via simd4x4f_uload.
        explicit inline mat4f(const float *ary) { simd4x4f_uload(&value, ary); }
        // Loads 16 floats: x from ary[0..3], y from [4..7], z from [8..11],
        // w from [12..15].
        inline void load(const float *ary) { 
            value.x = simd4f_uload4(ary);
            value.y = simd4f_uload4(ary+4); 
            value.z = simd4f_uload4(ary+8); 
            value.w = simd4f_uload4(ary+12); 
        }
        // Stores 16 floats in the same layout that load() reads.
        inline void store(float *ary) const { 
            simd4f_ustore4(value.x, ary);
            simd4f_ustore4(value.y, ary+4);
            simd4f_ustore4(value.z, ary+8);
            simd4f_ustore4(value.w, ary+12);
        }
        // The factories below fill a simd4x4f with the simd4x4f_* routine of
        // the same purpose and return it wrapped in a mat4f.
        static mat4f identity() { mat4f m; simd4x4f_identity(&m.value); return m; }
        static mat4f perspective(float fovy, float aspect, float znear, float zfar) {
            simd4x4f m;
            simd4x4f_perspective(&m, fovy, aspect, znear, zfar);
            return m;
        }
        static mat4f ortho(float left, float right, float bottom, float top, float znear, float zfar) {
            simd4x4f m;
            simd4x4f_ortho(&m, left, right, bottom, top, znear, zfar);
            return m;
        }
        static mat4f lookAt(const vec3f& eye, const vec3f& center, const vec3f& up) {
            simd4x4f m;
            simd4x4f_lookat(&m, eye.value, center.value, up.value);
            return m;            
        }
        static mat4f translation(const vec3f& pos) {
            simd4x4f m;
            simd4x4f_translation(&m, pos.x(), pos.y(), pos.z());
            return m;            
        }
        static mat4f axisRotation(float angle, const vec3f& axis) {
            simd4x4f m;
            simd4x4f_axis_rotation(&m, angle, axis.value);
            return m;            
        }
        // Uniform scale: `scale` on the first three diagonal entries, 1 on the last.
        static mat4f scale(float scale) {
            return simd4x4f_create( simd4f_create(scale,0,0,0),
                                    simd4f_create(0,scale,0,0),
                                    simd4f_create(0,0,scale,0),
                                    simd4f_create(0,0,0,1) );
        }
        // Per-axis scale: scale.x(), scale.y(), scale.z() on the diagonal, then 1.
        static mat4f scale(const vec3f& scale) {
            return simd4x4f_create( simd4f_create(scale.x(),0,0,0),
                                   simd4f_create(0,scale.y(),0,0),
                                   simd4f_create(0,0,scale.z(),0),
                                   simd4f_create(0,0,0,1) );
        }
    };
    // Matrix product via simd4x4f_matrix_mul.
    vectorial_inline mat4f operator*(const mat4f& lhs, const mat4f& rhs) {
        mat4f ret;
        simd4x4f_matrix_mul(&lhs.value, &rhs.value, &ret.value);
        return ret;
    }
    // In-place product. lhs is copied first, so the routine never reads the
    // matrix it is writing to. Note that the result is returned by value.
    vectorial_inline mat4f operator*=(mat4f& lhs, const mat4f& rhs) {
        const simd4x4f tmp = lhs.value;
        simd4x4f_matrix_mul(&tmp, &rhs.value, &lhs.value);
        return lhs;
    }
    // Matrix * 4-component vector via simd4x4f_matrix_vector_mul.
    vectorial_inline vec4f operator*(const mat4f& lhs, const vec4f& rhs) {
        vec4f ret;
        simd4x4f_matrix_vector_mul(&lhs.value, &rhs.value, &ret.value);
        return ret;
    }
    // The helpers below each forward to the simd4x4f_* routine named in
    // their body and return its output.
    vectorial_inline vec3f transformVector(const mat4f& lhs, const vec3f& rhs) {
        vec3f ret;
        simd4x4f_matrix_vector3_mul(&lhs.value, &rhs.value, &ret.value);
        return ret;
    }
    // Same call as operator*(mat4f, vec4f).
    vectorial_inline vec4f transformVector(const mat4f& lhs, const vec4f& rhs) {
        vec4f ret;
        simd4x4f_matrix_vector_mul(&lhs.value, &rhs.value, &ret.value);
        return ret;
    }
    vectorial_inline vec3f transformPoint(const mat4f& lhs, const vec3f& rhs) {
        vec3f ret;
        simd4x4f_matrix_point3_mul(&lhs.value, &rhs.value, &ret.value);
        return ret;
    }
    vectorial_inline vec3f orthoInverseTransformPoint(const mat4f& lhs, const vec3f& rhs) {
        vec3f ret;
        simd4x4f_inv_ortho_matrix_point3_mul(&lhs.value, &rhs.value, &ret.value);
        return ret;
    }
    vectorial_inline vec3f orthoInverseTransformVector(const mat4f& lhs, const vec3f& rhs) {
        vec3f ret;
        simd4x4f_inv_ortho_matrix_vector3_mul(&lhs.value, &rhs.value, &ret.value);
        return ret;
    }
    vectorial_inline mat4f transpose(const mat4f& m) {
        mat4f ret;
        simd4x4f_transpose(&m.value, &ret.value);
        return ret;
    }
    vectorial_inline mat4f inverse(const mat4f& m) {
        mat4f ret;
        simd4x4f_inverse(&m.value, &ret.value);
        return ret;
    }
 }
 #ifdef VECTORIAL_OSTREAM
 //#include <ostream>
 // Streams the matrix as "[ ... ; ... ; ... ; ... ]". Each ';'-separated
 // group prints one component (x, then y, z, w) taken from value.x, value.y,
 // value.z and value.w in turn.
 // NOTE(review): the <ostream> include above is commented out, so this
 // presumably relies on the including file providing std::ostream - confirm.
 vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::mat4f& v) {
    os << "[ ";
    os << simd4f_get_x(v.value.x) << ", ";
    os << simd4f_get_x(v.value.y) << ", ";
    os << simd4f_get_x(v.value.z) << ", ";
    os << simd4f_get_x(v.value.w) << " ; ";
    os << simd4f_get_y(v.value.x) << ", ";
    os << simd4f_get_y(v.value.y) << ", ";
    os << simd4f_get_y(v.value.z) << ", ";
    os << simd4f_get_y(v.value.w) << " ; ";
    os << simd4f_get_z(v.value.x) << ", ";
    os << simd4f_get_z(v.value.y) << ", ";
    os << simd4f_get_z(v.value.z) << ", ";
    os << simd4f_get_z(v.value.w) << " ; ";
    os << simd4f_get_w(v.value.x) << ", ";
    os << simd4f_get_w(v.value.y) << ", ";
    os << simd4f_get_w(v.value.z) << ", ";
    os << simd4f_get_w(v.value.w) << " ]";
    return os;
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd2f.h
+++ b/3rdparty/vectorial/include/vectorial/simd2f.h
@ -0,0 +1,38 @@
 /*
  Vectorial
  Copyright (c) 2014 Google, Inc.
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD2F_H
 #define VECTORIAL_SIMD2F_H
 #include "vectorial/config.h"
 #if defined(VECTORIAL_NEON)
    #include "simd2f_neon.h"
 #else
    #error No implementation defined
 #endif
 #include "simd2f_common.h"
 #ifdef __cplusplus
    #ifdef VECTORIAL_OSTREAM
        #include <ostream>
        // Streams v as "simd2f(x, y)".
        vectorial_inline std::ostream& operator<<(std::ostream& os, const simd2f& v) {
            const float x = simd2f_get_x(v);
            const float y = simd2f_get_y(v);
            os << "simd2f(" << x << ", " << y << ")";
            return os;
        }
    #endif
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd2f_common.h
+++ b/3rdparty/vectorial/include/vectorial/simd2f_common.h
@ -0,0 +1,22 @@
 /*
  Vectorial
  Copyright (c) 2014 Google
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD2F_COMMON_H
 #define VECTORIAL_SIMD2F_COMMON_H
 // Length of the 2D vector v: simd2f_sqrt applied to simd2f_dot2(v, v).
 vectorial_inline simd2f simd2f_length2(simd2f v) {
    const simd2f len_sq = simd2f_dot2(v, v);
    return simd2f_sqrt(len_sq);
 }
 // Squared length of v, i.e. simd2f_dot2(v, v) with no square root taken.
 vectorial_inline simd2f simd2f_length2_squared(simd2f v) {
    const simd2f len_sq = simd2f_dot2(v, v);
    return len_sq;
 }
 // Scales a by simd2f_rsqrt of its squared length.
 // NOTE(review): no guard for a zero-length input - result depends on simd2f_rsqrt(0).
 vectorial_inline simd2f simd2f_normalize2(simd2f a) {
    const simd2f len_sq = simd2f_dot2(a, a);
    return simd2f_mul(a, simd2f_rsqrt(len_sq));
 }
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd2f_neon.h
+++ b/3rdparty/vectorial/include/vectorial/simd2f_neon.h
@ -0,0 +1,159 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Copyright (c) 2014 Google, Inc.
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD2F_NEON_H
 #define VECTORIAL_SIMD2F_NEON_H
 #include <arm_neon.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Two-lane float vector backed by the NEON 64-bit register type.
 typedef float32x2_t simd2f;
 // Overlay giving array-style access to the two lanes of a simd2f.
 typedef union {
    simd2f s ;
    float f[2];
 } _simd2f_union;
 // Builds { x, y } by loading from a temporary two-element array.
 vectorial_inline simd2f simd2f_create(float x, float y) {
    const float32_t lanes[2] = { x, y };
    return vld1_f32(lanes);
 }
 // Returns a vector with both lanes set to 0.0f.
 vectorial_inline simd2f simd2f_zero() { return vdup_n_f32(0.0f); }
 // Loads ary[0] and ary[1] into a simd2f.
 vectorial_inline simd2f simd2f_uload2(const float *ary) {
    return vld1_f32((const float32_t*)ary);
 }
 // Writes both lanes of val to ary[0] and ary[1].
 vectorial_inline void simd2f_ustore2(const simd2f val, float *ary) {
    float32_t* dst = (float32_t*)ary;
    vst1_f32(dst, val);
 }
 // Returns a vector with v in both lanes.
 vectorial_inline simd2f simd2f_splat(float v) {
    return vdup_n_f32(v);
 }
 // Returns a vector with lane 0 of v in both lanes.
 vectorial_inline simd2f simd2f_splat_x(simd2f v) {
    return vdup_lane_f32(v, 0);
 }
 // Returns a vector with lane 1 of v in both lanes.
 vectorial_inline simd2f simd2f_splat_y(simd2f v) {
    return vdup_lane_f32(v, 1);
 }
 // Approximates 1/v per lane: starts from the vrecpe_f32 estimate and applies
 // two refinement steps, each multiplying by vrecps_f32(estimate, v).
 vectorial_inline simd2f simd2f_reciprocal(simd2f v) {
    simd2f approx = vrecpe_f32(v);
    approx = vmul_f32(vrecps_f32(approx, v), approx);
    approx = vmul_f32(vrecps_f32(approx, v), approx);
    return approx;
 }
 // One refinement step for a reciprocal-square-root estimate of v:
 // returns estimate * vrsqrts_f32(estimate * v, estimate).
 // Parameters and result are passed by value so this header also compiles as C;
 // the reference-taking form below is C++ only.
 vectorial_inline simd2f simd2f_rsqrt_refine(simd2f v, simd2f estimate) {
    const simd2f estimate_times_v = vmul_f32(estimate, v);
    return vmul_f32(estimate, vrsqrts_f32(estimate_times_v, estimate));
 }
 #ifdef __cplusplus
 // In-place form kept for existing C++ callers: refines estimate by one step.
 // References are not valid C, so this overload is only declared for C++.
 vectorial_inline void simd2f_rsqrt_1iteration(const simd2f& v, simd2f& estimate) {
    estimate = simd2f_rsqrt_refine(v, estimate);
 }
 #endif
 // Reciprocal square root of v: vrsqrte_f32 estimate plus one refinement step.
 vectorial_inline simd2f simd2f_rsqrt1(simd2f v) {
    simd2f estimate = vrsqrte_f32(v);
    estimate = simd2f_rsqrt_refine(v, estimate);
    return estimate;
 }
 // Reciprocal square root of v: vrsqrte_f32 estimate plus two refinement steps.
 vectorial_inline simd2f simd2f_rsqrt2(simd2f v) {
    simd2f estimate = vrsqrte_f32(v);
    estimate = simd2f_rsqrt_refine(v, estimate);
    estimate = simd2f_rsqrt_refine(v, estimate);
    return estimate;
 }
 // Reciprocal square root of v: vrsqrte_f32 estimate plus three refinement steps.
 vectorial_inline simd2f simd2f_rsqrt3(simd2f v) {
    simd2f estimate = vrsqrte_f32(v);
    estimate = simd2f_rsqrt_refine(v, estimate);
    estimate = simd2f_rsqrt_refine(v, estimate);
    estimate = simd2f_rsqrt_refine(v, estimate);
    return estimate;
 }
 // http://en.wikipedia.org/wiki/Fast_inverse_square_root makes the argument for
 // one iteration but two gives a significant accuracy improvement.
 // Default reciprocal square root: forwards to the two-iteration variant.
 vectorial_inline simd2f simd2f_rsqrt(simd2f v) {
    return simd2f_rsqrt2(v);
 }
 // Square root computed as simd2f_reciprocal(simd2f_rsqrt(v)).
 // vtst_u32(bits, bits) yields an all-ones lane wherever the raw bits of v are
 // non-zero, so the AND forces the result to 0 for lanes holding +0.0f
 // (presumably to hide the inf/NaN the reciprocal path would give there).
 // NOTE(review): -0.0f has a non-zero bit pattern and is therefore not masked.
 vectorial_inline simd2f simd2f_sqrt(simd2f v) {
    return vreinterpret_f32_u32(vand_u32( vtst_u32(vreinterpret_u32_f32(v),
                                                      vreinterpret_u32_f32(v)),
                                            vreinterpret_u32_f32(
                                              simd2f_reciprocal(simd2f_rsqrt(v)))
                                          )
                                );
 }
 // arithmetics
 // Per-lane lhs + rhs.
 vectorial_inline simd2f simd2f_add(simd2f lhs, simd2f rhs) {
    return vadd_f32(lhs, rhs);
 }
 // Per-lane lhs - rhs.
 vectorial_inline simd2f simd2f_sub(simd2f lhs, simd2f rhs) {
    return vsub_f32(lhs, rhs);
 }
 // Per-lane lhs * rhs.
 vectorial_inline simd2f simd2f_mul(simd2f lhs, simd2f rhs) {
    return vmul_f32(lhs, rhs);
 }
 // Per-lane lhs / rhs, implemented as lhs * simd2f_reciprocal(rhs).
 vectorial_inline simd2f simd2f_div(simd2f lhs, simd2f rhs) {
    const simd2f inv_rhs = simd2f_reciprocal( rhs );
    return vmul_f32(lhs, inv_rhs);
 }
 // Multiply-accumulate via vmla_f32: a + m1 * m2 per lane.
 vectorial_inline simd2f simd2f_madd(simd2f m1, simd2f m2, simd2f a) {
    return vmla_f32( a, m1, m2 );
 }
 // Lane 0 of s.
 vectorial_inline float simd2f_get_x(simd2f s) {
    return vget_lane_f32(s, 0);
 }
 // Lane 1 of s.
 vectorial_inline float simd2f_get_y(simd2f s) {
    return vget_lane_f32(s, 1);
 }
 // 2D dot product; the pairwise add leaves the sum in both lanes.
 vectorial_inline simd2f simd2f_dot2(simd2f lhs, simd2f rhs) {
    const simd2f products = simd2f_mul(lhs, rhs);
    return vpadd_f32(products, products);
 }
 // Per-lane minimum via vmin_f32.
 vectorial_inline simd2f simd2f_min(simd2f a, simd2f b) {
    return vmin_f32( a, b );
 }
 // Per-lane maximum via vmax_f32.
 vectorial_inline simd2f simd2f_max(simd2f a, simd2f b) {
    return vmax_f32( a, b );
 }
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4f.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f.h
@ -0,0 +1,51 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4F_H
 #define VECTORIAL_SIMD4F_H
 #ifndef VECTORIAL_CONFIG_H
  #include "vectorial/config.h"
 #endif
 #ifdef VECTORIAL_SCALAR
    #include "simd4f_scalar.h"
 #elif defined(VECTORIAL_SSE)
    #include "simd4f_sse.h"
 #elif defined(VECTORIAL_GNU)
    #include "simd4f_gnu.h"
 #elif defined(VECTORIAL_NEON)
    #include "simd4f_neon.h"
 #else
    #error No implementation defined
 #endif
 #include "simd4f_common.h"
 #ifdef __cplusplus
    #ifdef VECTORIAL_OSTREAM
        #include <ostream>
        // Streams v as "simd4f(x, y, z, w)".
        vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4f& v) {
            const float x = simd4f_get_x(v);
            const float y = simd4f_get_y(v);
            const float z = simd4f_get_z(v);
            const float w = simd4f_get_w(v);
            os << "simd4f(" << x << ", " << y << ", " << z << ", " << w << ")";
            return os;
        }
    #endif
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4f_common.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_common.h
@ -0,0 +1,74 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Copyright (c) 2014 Google, Inc.
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4F_COMMON_H
 #define VECTORIAL_SIMD4F_COMMON_H
 // Horizontal sum: every lane of the result holds ((x + y) + z) + w of v.
 vectorial_inline simd4f simd4f_sum(simd4f v) {
    simd4f acc = simd4f_add(simd4f_splat_x(v), simd4f_splat_y(v));
    acc = simd4f_add(acc, simd4f_splat_z(v));
    acc = simd4f_add(acc, simd4f_splat_w(v));
    return acc;
 }
 // 4D dot product: simd4f_sum of the per-lane products.
 vectorial_inline simd4f simd4f_dot4(simd4f lhs, simd4f rhs) {
    const simd4f products = simd4f_mul(lhs, rhs);
    return simd4f_sum(products);
 }
 // 2D dot product: x and y products added, present in every lane of the result.
 vectorial_inline simd4f simd4f_dot2(simd4f lhs, simd4f rhs) {
    const simd4f products = simd4f_mul(lhs, rhs);
    return simd4f_add(simd4f_splat_x(products), simd4f_splat_y(products));
 }
 // Length using all four components: simd4f_sqrt of simd4f_dot4(v, v).
 vectorial_inline simd4f simd4f_length4(simd4f v) {
    const simd4f len_sq = simd4f_dot4(v, v);
    return simd4f_sqrt(len_sq);
 }
 // Length using x, y, z: simd4f_sqrt of simd4f_dot3(v, v).
 vectorial_inline simd4f simd4f_length3(simd4f v) {
    const simd4f len_sq = simd4f_dot3(v, v);
    return simd4f_sqrt(len_sq);
 }
 // Length using x, y: simd4f_sqrt of simd4f_dot2(v, v).
 vectorial_inline simd4f simd4f_length2(simd4f v) {
    const simd4f len_sq = simd4f_dot2(v, v);
    return simd4f_sqrt(len_sq);
 }
 // Squared 4-component length, i.e. simd4f_dot4(v, v).
 vectorial_inline simd4f simd4f_length4_squared(simd4f v) {
    const simd4f len_sq = simd4f_dot4(v, v);
    return len_sq;
 }
 // Squared 3-component length, i.e. simd4f_dot3(v, v).
 vectorial_inline simd4f simd4f_length3_squared(simd4f v) {
    const simd4f len_sq = simd4f_dot3(v, v);
    return len_sq;
 }
 // Squared 3-component length as a plain float, via simd4f_dot3_scalar(v, v).
 vectorial_inline float simd4f_length3_squared_scalar(simd4f v) {
    const float len_sq = simd4f_dot3_scalar(v, v);
    return len_sq;
 }
 // Squared 2-component length, i.e. simd4f_dot2(v, v).
 vectorial_inline simd4f simd4f_length2_squared(simd4f v) {
    const simd4f len_sq = simd4f_dot2(v, v);
    return len_sq;
 }
 // Scales a by simd4f_rsqrt of its 4-component squared length.
 // NOTE(review): no guard for zero-length input in any of the normalize functions.
 vectorial_inline simd4f simd4f_normalize4(simd4f a) {
    const simd4f len_sq = simd4f_dot4(a, a);
    return simd4f_mul(a, simd4f_rsqrt(len_sq));
 }
 // Scales a by simd4f_rsqrt of its 3-component squared length.
 vectorial_inline simd4f simd4f_normalize3(simd4f a) {
    const simd4f len_sq = simd4f_dot3(a, a);
    return simd4f_mul(a, simd4f_rsqrt(len_sq));
 }
 // Scales a by simd4f_rsqrt of its 2-component squared length.
 vectorial_inline simd4f simd4f_normalize2(simd4f a) {
    const simd4f len_sq = simd4f_dot2(a, a);
    return simd4f_mul(a, simd4f_rsqrt(len_sq));
 }
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4f_gnu.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_gnu.h
@ -0,0 +1,225 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4F_GNU_H
 #define VECTORIAL_SIMD4F_GNU_H
 #include <math.h>
 #include <string.h>  // memcpy
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Four floats in a 16-byte GCC vector-extension type.
 typedef float simd4f __attribute__ ((vector_size (16)));
 // Overlay giving array-style access to the four lanes of a simd4f.
 typedef union {
    simd4f s ;
    float f[4];
 } _simd4f_union;
 // Lane 0 of s, read through the union overlay.
 vectorial_inline float simd4f_get_x(simd4f s) {
    _simd4f_union lanes = { s };
    return lanes.f[0];
 }
 // Lane 1 of s.
 vectorial_inline float simd4f_get_y(simd4f s) {
    _simd4f_union lanes = { s };
    return lanes.f[1];
 }
 // Lane 2 of s.
 vectorial_inline float simd4f_get_z(simd4f s) {
    _simd4f_union lanes = { s };
    return lanes.f[2];
 }
 // Lane 3 of s.
 vectorial_inline float simd4f_get_w(simd4f s) {
    _simd4f_union lanes = { s };
    return lanes.f[3];
 }
 // Builds the vector { x, y, z, w }.
 vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
    simd4f packed = { x, y, z, w };
    return packed;
 }
 // Returns a vector with all four lanes set to 0.0f.
 vectorial_inline simd4f simd4f_zero() {
    return simd4f_create(0.0f, 0.0f, 0.0f, 0.0f);
 }
 // Loads ary[0..3] into a vector.
 vectorial_inline simd4f simd4f_uload4(const float *ary) {
    return simd4f_create(ary[0], ary[1], ary[2], ary[3]);
 }
 // Loads ary[0..2]; the w lane is set to 0.
 vectorial_inline simd4f simd4f_uload3(const float *ary) {
    return simd4f_create(ary[0], ary[1], ary[2], 0.0f);
 }
 // Loads ary[0..1]; the z and w lanes are set to 0.
 vectorial_inline simd4f simd4f_uload2(const float *ary) {
    return simd4f_create(ary[0], ary[1], 0.0f, 0.0f);
 }
 // Copies all four lanes of val to ary.
 vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
    memcpy(ary, &val, 4 * sizeof(float));
 }
 // Copies the first three lanes of val to ary.
 vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
    memcpy(ary, &val, 3 * sizeof(float));
 }
 // Copies the first two lanes of val to ary.
 vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
    memcpy(ary, &val, 2 * sizeof(float));
 }
 // Returns a vector with v in all four lanes.
 vectorial_inline simd4f simd4f_splat(float v) {
    simd4f filled = { v, v, v, v };
    return filled;
 }
 // Broadcasts lane x of v to all four lanes.
 vectorial_inline simd4f simd4f_splat_x(simd4f v) {
    return simd4f_splat(simd4f_get_x(v));
 }
 // Broadcasts lane y of v to all four lanes.
 vectorial_inline simd4f simd4f_splat_y(simd4f v) {
    return simd4f_splat(simd4f_get_y(v));
 }
 // Broadcasts lane z of v to all four lanes.
 vectorial_inline simd4f simd4f_splat_z(simd4f v) {
    return simd4f_splat(simd4f_get_z(v));
 }
 // Broadcasts lane w of v to all four lanes.
 vectorial_inline simd4f simd4f_splat_w(simd4f v) {
    return simd4f_splat(simd4f_get_w(v));
 }
 // Per-lane 1.0f / v using the vector division operator.
 vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
    const simd4f ones = simd4f_splat(1.0f);
    return ones / v;
 }
 // Per-lane sqrtf.
 vectorial_inline simd4f simd4f_sqrt(simd4f v) {
    const float rx = sqrtf(simd4f_get_x(v));
    const float ry = sqrtf(simd4f_get_y(v));
    const float rz = sqrtf(simd4f_get_z(v));
    const float rw = sqrtf(simd4f_get_w(v));
    simd4f roots = { rx, ry, rz, rw };
    return roots;
 }
 // Per-lane 1.0f / sqrtf(v).
 vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
    const simd4f roots = simd4f_sqrt(v);
    return simd4f_splat(1.0f) / roots;
 }
 // Per-lane lhs + rhs.
 vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
    return lhs + rhs;
 }
 // Per-lane lhs - rhs.
 vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
    return lhs - rhs;
 }
 // Per-lane lhs * rhs.
 vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
    return lhs * rhs;
 }
 // Per-lane lhs / rhs.
 vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
    return lhs / rhs;
 }
 // Per-lane m1 * m2 + a, computed as a separate multiply and add.
 vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
    const simd4f product = simd4f_mul(m1, m2);
    return simd4f_add( product, a );
 }
 // Dot product of the x, y, z lanes as a float; the w lanes are not read.
 vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
    _simd4f_union a = {lhs};
    _simd4f_union b = {rhs};
    return a.f[0] * b.f[0] + a.f[1] * b.f[1] + a.f[2] * b.f[2];
 }
 // Dot product of the x, y, z lanes, broadcast to all four lanes.
 vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
    const float dot = simd4f_dot3_scalar(lhs, rhs);
    return simd4f_splat( dot );
 }
 // Cross product of the x, y, z lanes; the w lane of the result is 0.
 vectorial_inline simd4f simd4f_cross3(simd4f l, simd4f r) {
    _simd4f_union lhs = {l};
    _simd4f_union rhs = {r};
    const float cx = lhs.f[1] * rhs.f[2] - lhs.f[2] * rhs.f[1];
    const float cy = lhs.f[2] * rhs.f[0] - lhs.f[0] * rhs.f[2];
    const float cz = lhs.f[0] * rhs.f[1] - lhs.f[1] * rhs.f[0];
    return simd4f_create( cx, cy, cz, 0.0f );
 }
 // Returns { w, x, y, z } of s.
 vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[3], lanes.f[0], lanes.f[1], lanes.f[2]);
 }
 // Returns { z, w, x, y } of s.
 vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[2], lanes.f[3], lanes.f[0], lanes.f[1]);
 }
 // Returns { y, z, w, x } of s.
 vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[1], lanes.f[2], lanes.f[3], lanes.f[0]);
 }
 // Returns s with the w lane replaced by 0.0f.
 vectorial_inline simd4f simd4f_zero_w(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[0], lanes.f[1], lanes.f[2], 0.0f);
 }
 // Returns s with the z and w lanes replaced by 0.0f.
 vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[0], lanes.f[1], 0.0f, 0.0f);
 }
 // Returns { abcd.z, abcd.w, xyzw.z, xyzw.w }: the high halves of both inputs.
 vectorial_inline simd4f simd4f_merge_high(simd4f abcd, simd4f xyzw) {
    _simd4f_union first = {abcd};
    _simd4f_union second = {xyzw};
    return simd4f_create(first.f[2], first.f[3], second.f[2], second.f[3]);
 }
 // Negates the y and w lanes of s.
 vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[0], -lanes.f[1], lanes.f[2], -lanes.f[3]);
 }
 // Negates the x and z lanes of s.
 vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(-lanes.f[0], lanes.f[1], -lanes.f[2], lanes.f[3]);
 }
 // Per-lane minimum; picks the lane of a only when a < b.
 vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
    _simd4f_union ua = {a};
    _simd4f_union ub = {b};
    const float x = ua.f[0] < ub.f[0] ? ua.f[0] : ub.f[0];
    const float y = ua.f[1] < ub.f[1] ? ua.f[1] : ub.f[1];
    const float z = ua.f[2] < ub.f[2] ? ua.f[2] : ub.f[2];
    const float w = ua.f[3] < ub.f[3] ? ua.f[3] : ub.f[3];
    return simd4f_create( x, y, z, w );
 }
 // Per-lane maximum; picks the lane of a only when a > b.
 vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
    _simd4f_union ua = {a};
    _simd4f_union ub = {b};
    const float x = ua.f[0] > ub.f[0] ? ua.f[0] : ub.f[0];
    const float y = ua.f[1] > ub.f[1] ? ua.f[1] : ub.f[1];
    const float z = ua.f[2] > ub.f[2] ? ua.f[2] : ub.f[2];
    const float w = ua.f[3] > ub.f[3] ? ua.f[3] : ub.f[3];
    return simd4f_create( x, y, z, w );
 }
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4f_neon.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_neon.h
@ -0,0 +1,280 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Copyright (c) 2014 Google, Inc.
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4F_NEON_H
 #define VECTORIAL_SIMD4F_NEON_H
 #include <arm_neon.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Four-lane and two-lane float vectors backed by NEON register types.
 typedef float32x4_t simd4f;
 typedef float32x2_t simd2f;
 // Overlay giving array-style access to the four lanes of a simd4f.
 typedef union {
    simd4f s ;
    float f[4];
 } _simd4f_union;
 // Builds { x, y, z, w } by loading from a temporary four-element array.
 vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
    const float32_t lanes[4] = { x, y, z, w };
    return vld1q_f32(lanes);
 }
 // Returns a vector with all four lanes set to 0.0f.
 vectorial_inline simd4f simd4f_zero() { return vdupq_n_f32(0.0f); }
 // Loads ary[0..3] into a vector.
 vectorial_inline simd4f simd4f_uload4(const float *ary) {
    return vld1q_f32((const float32_t*)ary);
 }
 // Loads ary[0..2]; the w lane is set to 0.
 vectorial_inline simd4f simd4f_uload3(const float *ary) {
    return simd4f_create(ary[0], ary[1], ary[2], 0);
 }
 // Loads ary[0..1] into the low half; the high half (z, w) is zero.
 vectorial_inline simd4f simd4f_uload2(const float *ary) {
    const float32_t zero = 0;
    const float32x2_t xy = vld1_f32((const float32_t*)ary);
    // { 0, 0 } built with a duplicating load; the original author notes this
    // form avoids warnings from llvm-gcc.
    const float32x2_t zeros = vld1_dup_f32(&zero);
    return vcombine_f32(xy, zeros);
 }
 // Writes all four lanes of val to ary[0..3].
 vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
    float32_t* dst = (float32_t*)ary;
    vst1q_f32(dst, val);
 }
 // Writes lanes 0..2 of val to ary[0..2], one lane at a time.
 vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
    vst1q_lane_f32(ary + 0, val, 0);
    vst1q_lane_f32(ary + 1, val, 1);
    vst1q_lane_f32(ary + 2, val, 2);
 }
 // Writes the low half (lanes 0..1) of val to ary[0..1].
 vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
    vst1_f32( (float32_t*)ary, vget_low_f32(val) );
 }
 // Returns a vector with v in all four lanes.
 vectorial_inline simd4f simd4f_splat(float v) {
    return vdupq_n_f32(v);
 }
 // todo: or is simd4f_splat(simd4f_get_x(v))  better?
 // Broadcasts lane x (lane 0 of the low half) to all four lanes.
 vectorial_inline simd4f simd4f_splat_x(simd4f v) {
    const float32x2_t xy = vget_low_f32(v);
    return vdupq_lane_f32(xy, 0);
 }
 // Broadcasts lane y (lane 1 of the low half) to all four lanes.
 vectorial_inline simd4f simd4f_splat_y(simd4f v) {
    const float32x2_t xy = vget_low_f32(v);
    return vdupq_lane_f32(xy, 1);
 }
 // Broadcasts lane z (lane 0 of the high half) to all four lanes.
 vectorial_inline simd4f simd4f_splat_z(simd4f v) {
    const float32x2_t zw = vget_high_f32(v);
    return vdupq_lane_f32(zw, 0);
 }
 // Broadcasts lane w (lane 1 of the high half) to all four lanes.
 vectorial_inline simd4f simd4f_splat_w(simd4f v) {
    const float32x2_t zw = vget_high_f32(v);
    return vdupq_lane_f32(zw, 1);
 }
 // Approximates 1/v per lane: starts from the vrecpeq_f32 estimate and applies
 // two refinement steps, each multiplying by vrecpsq_f32(estimate, v).
 vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
    simd4f approx = vrecpeq_f32(v);
    approx = vmulq_f32(vrecpsq_f32(approx, v), approx);
    approx = vmulq_f32(vrecpsq_f32(approx, v), approx);
    return approx;
 }
 // One refinement step for a reciprocal-square-root estimate of v:
 // returns estimate * vrsqrtsq_f32(estimate * v, estimate).
 // Parameters and result are passed by value so this header also compiles as C;
 // the reference-taking form below is C++ only.
 vectorial_inline simd4f simd4f_rsqrt_refine(simd4f v, simd4f estimate) {
    const simd4f estimate_times_v = vmulq_f32(estimate, v);
    return vmulq_f32(estimate, vrsqrtsq_f32(estimate_times_v, estimate));
 }
 #ifdef __cplusplus
 // In-place form kept for existing C++ callers: refines estimate by one step.
 // References are not valid C, so this overload is only declared for C++.
 vectorial_inline void simd4f_rsqrt_1iteration(const simd4f& v, simd4f& estimate) {
    estimate = simd4f_rsqrt_refine(v, estimate);
 }
 #endif
 // Reciprocal square root of v: vrsqrteq_f32 estimate plus one refinement step.
 vectorial_inline simd4f simd4f_rsqrt1(simd4f v) {
    simd4f estimate = vrsqrteq_f32(v);
    estimate = simd4f_rsqrt_refine(v, estimate);
    return estimate;
 }
 // Reciprocal square root of v: vrsqrteq_f32 estimate plus two refinement steps.
 vectorial_inline simd4f simd4f_rsqrt2(simd4f v) {
    simd4f estimate = vrsqrteq_f32(v);
    estimate = simd4f_rsqrt_refine(v, estimate);
    estimate = simd4f_rsqrt_refine(v, estimate);
    return estimate;
 }
 // Reciprocal square root of v: vrsqrteq_f32 estimate plus three refinement steps.
 vectorial_inline simd4f simd4f_rsqrt3(simd4f v) {
    simd4f estimate = vrsqrteq_f32(v);
    estimate = simd4f_rsqrt_refine(v, estimate);
    estimate = simd4f_rsqrt_refine(v, estimate);
    estimate = simd4f_rsqrt_refine(v, estimate);
    return estimate;
 }
 // http://en.wikipedia.org/wiki/Fast_inverse_square_root makes the argument for
 // one iteration but two gives a significant accuracy improvement.
 // Default reciprocal square root: forwards to the two-iteration variant.
 vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
    return simd4f_rsqrt2(v);
 }
 // Square root computed as simd4f_reciprocal(simd4f_rsqrt(v)).
 // vtstq_u32(bits, bits) yields an all-ones lane wherever the raw bits of v are
 // non-zero, so the AND forces the result to 0 for lanes holding +0.0f
 // (presumably to hide the inf/NaN the reciprocal path would give there).
 // NOTE(review): -0.0f has a non-zero bit pattern and is therefore not masked.
 vectorial_inline simd4f simd4f_sqrt(simd4f v) { 
    return vreinterpretq_f32_u32(vandq_u32( vtstq_u32(vreinterpretq_u32_f32(v),  
                                                      vreinterpretq_u32_f32(v)), 
                                            vreinterpretq_u32_f32(
                                              simd4f_reciprocal(simd4f_rsqrt(v)))
                                          )
                                );
 }
 // arithmetic
 // Per-lane lhs + rhs.
 vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
    return vaddq_f32(lhs, rhs);
 }
 // Per-lane lhs - rhs.
 vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
    return vsubq_f32(lhs, rhs);
 }
 // Per-lane lhs * rhs.
 vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
    return vmulq_f32(lhs, rhs);
 }
 // Per-lane lhs / rhs, implemented as lhs * simd4f_reciprocal(rhs).
 vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
    const simd4f inv_rhs = simd4f_reciprocal( rhs );
    return vmulq_f32(lhs, inv_rhs);
 }
 // Multiply-accumulate via vmlaq_f32: a + m1 * m2 per lane.
 vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
    return vmlaq_f32( a, m1, m2 );
 }
 // Lane 0 of s.
 vectorial_inline float simd4f_get_x(simd4f s) {
    return vgetq_lane_f32(s, 0);
 }
 // Lane 1 of s.
 vectorial_inline float simd4f_get_y(simd4f s) {
    return vgetq_lane_f32(s, 1);
 }
 // Lane 2 of s.
 vectorial_inline float simd4f_get_z(simd4f s) {
    return vgetq_lane_f32(s, 2);
 }
 // Lane 3 of s.
 vectorial_inline float simd4f_get_w(simd4f s) {
    return vgetq_lane_f32(s, 3);
 }
 // Returns lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z; the w products are ignored.
 vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
    const simd4f products = simd4f_mul(lhs, rhs);
    const simd2f xy = vget_low_f32(products);
    const simd2f zw = vget_high_f32(products);
    // Pairwise add puts x+y in both lanes; adding zw makes lane 0 = x+y+z.
    const simd2f sums = vadd_f32(vpadd_f32(xy, xy), zw);
    return vget_lane_f32(sums, 0);
 }
 // Dot product of the x, y, z lanes, broadcast to all four lanes.
 vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
    const float dot = simd4f_dot3_scalar(lhs, rhs);
    return simd4f_splat(dot);
 }
 // Cross product of the x, y, z lanes of lhs and rhs; the w lane of the result is 0.
 // The w lane is cleared with vsetq_lane_f32 instead of a function-local
 // `static const` mask: that static needed a runtime initializer (not valid C,
 // and a guarded static initialization in C++) and relied on C-style vector casts.
 vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
    // Compute lhs and rhs in order yzx
    const simd2f lhs_low = vget_low_f32(lhs);
    const simd2f rhs_low = vget_low_f32(rhs);
    const simd4f lhs_yzx = vcombine_f32(vext_f32(lhs_low, vget_high_f32(lhs), 1), lhs_low);
    const simd4f rhs_yzx = vcombine_f32(vext_f32(rhs_low, vget_high_f32(rhs), 1), rhs_low);
    // Compute cross in order zxy
    const simd4f cross_zxy = simd4f_sub(simd4f_mul(rhs_yzx, lhs), simd4f_mul(lhs_yzx, rhs));
    // Permute cross to order xyz and zero out the fourth value
    const simd2f low = vget_low_f32(cross_zxy);
    const simd4f cross_xyz = vcombine_f32(vext_f32(low, vget_high_f32(cross_zxy), 1), low);
    return vsetq_lane_f32(0.0f, cross_xyz, 3);
 }
 // Returns { w, x, y, z } of s.
 vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[3], lanes.f[0], lanes.f[1], lanes.f[2]);
 }
 // Returns { z, w, x, y } of s.
 vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[2], lanes.f[3], lanes.f[0], lanes.f[1]);
 }
 // Returns { y, z, w, x } of s.
 vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[1], lanes.f[2], lanes.f[3], lanes.f[0]);
 }
 // Returns s with the w lane replaced by 0.0f.
 vectorial_inline simd4f simd4f_zero_w(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[0], lanes.f[1], lanes.f[2], 0.0f);
 }
 // Returns s with the z and w lanes replaced by 0.0f.
 vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
    _simd4f_union lanes = {s};
    return simd4f_create(lanes.f[0], lanes.f[1], 0.0f, 0.0f);
 }
 // Returns { xyzw.z, xyzw.w, abcd.z, abcd.w }: the high halves of both inputs.
 vectorial_inline simd4f simd4f_merge_high(simd4f xyzw, simd4f abcd) {
    _simd4f_union first = {xyzw};
    _simd4f_union second = {abcd};
    return simd4f_create(first.f[2], first.f[3], second.f[2], second.f[3]);
 }
 // Flips the sign bit of the y and w lanes by XOR with 0x80000000.
 vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
    const unsigned int sign_bits[4] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
    const uint32x4_t flip = vld1q_u32( sign_bits );
    const uint32x4_t raw = vreinterpretq_u32_f32(s);
    return vreinterpretq_f32_u32( veorq_u32( raw, flip ) );
 }
 // Flips the sign bit of the x and z lanes by XOR with 0x80000000.
 vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
    const unsigned int sign_bits[4] = { 0x80000000, 0x00000000, 0x80000000, 0x00000000 };
    const uint32x4_t flip = vld1q_u32( sign_bits );
    const uint32x4_t raw = vreinterpretq_u32_f32(s);
    return vreinterpretq_f32_u32( veorq_u32( raw, flip ) );
 }
 // Per-lane minimum via vminq_f32.
 vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
    const simd4f smallest = vminq_f32( a, b );
    return smallest;
 }
 // Per-lane maximum via vmaxq_f32.
 vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
    const simd4f largest = vmaxq_f32( a, b );
    return largest;
 }
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4f_scalar.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_scalar.h
@ -0,0 +1,199 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4F_SCALAR_H
 #define VECTORIAL_SIMD4F_SCALAR_H
 #include <math.h>
 #include <string.h>  // memcpy
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Plain-C fallback vector: four floats stored as named members.
 typedef struct {
    float x;
    float y;
    float z;
    float w;
 } simd4f;
 // Builds the vector { x, y, z, w }.
 vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
    simd4f packed = { x, y, z, w };
    return packed;
 }
 // Returns a vector with all four members set to 0.0f.
 vectorial_inline simd4f simd4f_zero() {
    return simd4f_create(0.0f, 0.0f, 0.0f, 0.0f);
 }
 // Loads ary[0..3] into a vector.
 vectorial_inline simd4f simd4f_uload4(const float *ary) {
    return simd4f_create(ary[0], ary[1], ary[2], ary[3]);
 }
 // Loads ary[0..2]; w is set to 0.
 vectorial_inline simd4f simd4f_uload3(const float *ary) {
    return simd4f_create(ary[0], ary[1], ary[2], 0.0f);
 }
 // Loads ary[0..1]; z and w are set to 0.
 vectorial_inline simd4f simd4f_uload2(const float *ary) {
    return simd4f_create(ary[0], ary[1], 0.0f, 0.0f);
 }
 // Copies the first four floats of val to ary.
 vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
    memcpy(ary, &val, 4 * sizeof(float));
 }
 // Copies the first three floats of val to ary.
 vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
    memcpy(ary, &val, 3 * sizeof(float));
 }
 // Copies the first two floats of val to ary.
 vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
    memcpy(ary, &val, 2 * sizeof(float));
 }
 // utilities
 // Returns a vector with v in all four members.
 vectorial_inline simd4f simd4f_splat(float v) {
    simd4f filled = { v, v, v, v };
    return filled;
 }
 // Broadcasts v.x to all four members.
 vectorial_inline simd4f simd4f_splat_x(simd4f v) {
    return simd4f_splat(v.x);
 }
 // Broadcasts v.y to all four members.
 vectorial_inline simd4f simd4f_splat_y(simd4f v) {
    return simd4f_splat(v.y);
 }
 // Broadcasts v.z to all four members.
 vectorial_inline simd4f simd4f_splat_z(simd4f v) {
    return simd4f_splat(v.z);
 }
 // Broadcasts v.w to all four members.
 vectorial_inline simd4f simd4f_splat_w(simd4f v) {
    return simd4f_splat(v.w);
 }
 // Per-member 1.0f / v.
 vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
    return simd4f_create(1.0f/v.x, 1.0f/v.y, 1.0f/v.z, 1.0f/v.w);
 }
 // Per-member sqrtf.
 vectorial_inline simd4f simd4f_sqrt(simd4f v) {
    return simd4f_create(sqrtf(v.x), sqrtf(v.y), sqrtf(v.z), sqrtf(v.w));
 }
 // Per-member 1.0f / sqrtf(v).
 vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
    return simd4f_create(1.0f/sqrtf(v.x), 1.0f/sqrtf(v.y), 1.0f/sqrtf(v.z), 1.0f/sqrtf(v.w));
 }
 // arithmetic
 // Per-member lhs + rhs.
 vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
    return simd4f_create(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w);
 }
 // Per-member lhs - rhs.
 vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
    return simd4f_create(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z, lhs.w - rhs.w);
 }
 // Per-member lhs * rhs.
 vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
    return simd4f_create(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z, lhs.w * rhs.w);
 }
 // Per-member lhs / rhs.
 vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
    return simd4f_create(lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z, lhs.w / rhs.w);
 }
 // Per-member m1 * m2 + a, computed as a separate multiply and add.
 vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
    const simd4f product = simd4f_mul(m1, m2);
    return simd4f_add( product, a );
 }
 // Dot product of x, y, z as a float; w is not read.
 vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
    const float dot = lhs.x * rhs.x + lhs.y * rhs.y + lhs.z * rhs.z;
    return dot;
 }
 // Dot product of x, y, z, broadcast to all four members.
 vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
    const float dot = simd4f_dot3_scalar(lhs, rhs);
    return simd4f_splat( dot );
 }
 // Cross product of the x, y, z members; w of the result is 0.
 vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
    const float cx = lhs.y * rhs.z - lhs.z * rhs.y;
    const float cy = lhs.z * rhs.x - lhs.x * rhs.z;
    const float cz = lhs.x * rhs.y - lhs.y * rhs.x;
    return simd4f_create( cx, cy, cz, 0.0f );
 }
 // Member accessors.
 vectorial_inline float simd4f_get_x(simd4f s) {
    return s.x;
 }
 vectorial_inline float simd4f_get_y(simd4f s) {
    return s.y;
 }
 vectorial_inline float simd4f_get_z(simd4f s) {
    return s.z;
 }
 vectorial_inline float simd4f_get_w(simd4f s) {
    return s.w;
 }
 // Returns { w, x, y, z } of s.
 vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
    return simd4f_create(s.w, s.x, s.y, s.z);
 }
 // Returns { z, w, x, y } of s.
 vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
    return simd4f_create(s.z, s.w, s.x, s.y);
 }
 // Returns { y, z, w, x } of s.
 vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
    return simd4f_create(s.y, s.z, s.w, s.x);
 }
 // Returns s with w replaced by 0.0f.
 vectorial_inline simd4f simd4f_zero_w(simd4f s) {
    s.w = 0.0f;
    return s;
 }
 // Returns s with z and w replaced by 0.0f.
 vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
    s.z = 0.0f;
    s.w = 0.0f;
    return s;
 }
 // Returns { abcd.z, abcd.w, xyzw.z, xyzw.w }: the high halves of both inputs.
 vectorial_inline simd4f simd4f_merge_high(simd4f abcd, simd4f xyzw) {
    return simd4f_create(abcd.z, abcd.w, xyzw.z, xyzw.w);
 }
 // Negates y and w of s.
 vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
    return simd4f_create(s.x, -s.y, s.z, -s.w);
 }
 // Negates x and z of s.
 vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
    return simd4f_create(-s.x, s.y, -s.z, s.w);
 }
 // Per-member minimum; picks the member of a only when a < b.
 vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
    const float x = a.x < b.x ? a.x : b.x;
    const float y = a.y < b.y ? a.y : b.y;
    const float z = a.z < b.z ? a.z : b.z;
    const float w = a.w < b.w ? a.w : b.w;
    return simd4f_create( x, y, z, w );
 }
 // Per-member maximum; picks the member of a only when a > b.
 vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
    const float x = a.x > b.x ? a.x : b.x;
    const float y = a.y > b.y ? a.y : b.y;
    const float z = a.z > b.z ? a.z : b.z;
    const float w = a.w > b.w ? a.w : b.w;
    return simd4f_create( x, y, z, w );
 }
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4f_sse.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_sse.h
@ -0,0 +1,236 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Copyright (c) 2014 Google, Inc.
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4F_SSE_H
 #define VECTORIAL_SIMD4F_SSE_H
 // Conditionally enable SSE4.1, otherwise fall back to plain SSE.
 // GCC and Clang define __SSE4_1__ when SSE4.1 code generation is enabled.
 // MSVC has no SSE4.1-specific macro, so __AVX__ (/arch:AVX or higher) is used
 // as the signal there; AVX-capable targets also support SSE4.1.
 // _M_IX86_FP >= 2 only means SSE2 and must not enable SSE4.1 intrinsics such
 // as _mm_dp_ps. Builds may still opt in by predefining VECTORIAL_USE_SSE4_1.
 #if !defined(VECTORIAL_USE_SSE4_1) && (defined(__SSE4_1__) || defined(__AVX__))
    #define VECTORIAL_USE_SSE4_1
 #endif
 #include <xmmintrin.h>
 #if defined(VECTORIAL_USE_SSE4_1)
    #include <smmintrin.h>
 #endif
 #include <string.h>  // memcpy
 #ifdef __cplusplus
 extern "C" {
 #endif
 // Four-lane float vector backed by the SSE register type.
 typedef __m128 simd4f;
 // Overlay giving float and unsigned-int array access to the four lanes.
 typedef union {
    simd4f s ;
    float f[4];
    unsigned int ui[4];
 } _simd4f_union;
 // creating
 // Builds { x, y, z, w } with x in the lowest lane.
 vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
    return _mm_setr_ps(x, y, z, w);
 }
 // Returns a vector with all four lanes set to zero.
 vectorial_inline simd4f simd4f_zero() { return _mm_setzero_ps(); }
 // Loads ary[0..3] with _mm_loadu_ps.
 vectorial_inline simd4f simd4f_uload4(const float *ary) {
    return _mm_loadu_ps(ary);
 }
 // Loads ary[0..2]; the w lane is set to 0.
 vectorial_inline simd4f simd4f_uload3(const float *ary) {
    return simd4f_create(ary[0], ary[1], ary[2], 0);
 }
 // Loads ary[0..1]; the z and w lanes are set to 0.
 vectorial_inline simd4f simd4f_uload2(const float *ary) {
    return simd4f_create(ary[0], ary[1], 0, 0);
 }
 // Stores all four lanes of val to ary with _mm_storeu_ps.
 vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
    _mm_storeu_ps(ary, val);
 }
 // Copies the first three lanes of val to ary.
 vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
    memcpy(ary, &val, 3 * sizeof(float));
 }
 // Copies the first two lanes of val to ary.
 vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
    memcpy(ary, &val, 2 * sizeof(float));
 }
 // utilities
 // Returns a vector with v in all four lanes.
 vectorial_inline simd4f simd4f_splat(float v) {
    return _mm_set1_ps(v);
 }
 // Broadcasts lane x of v to all four lanes.
 vectorial_inline simd4f simd4f_splat_x(simd4f v) {
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0));
 }
 // Broadcasts lane y of v to all four lanes.
 vectorial_inline simd4f simd4f_splat_y(simd4f v) {
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,1,1,1));
 }
 // Broadcasts lane z of v to all four lanes.
 vectorial_inline simd4f simd4f_splat_z(simd4f v) {
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,2,2,2));
 }
 // Broadcasts lane w of v to all four lanes.
 vectorial_inline simd4f simd4f_splat_w(simd4f v) {
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3,3,3,3));
 }
 // arithmetic
 // Per-lane lhs + rhs.
 vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
    return _mm_add_ps(lhs, rhs);
 }
 // Per-lane lhs - rhs.
 vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
    return _mm_sub_ps(lhs, rhs);
 }
 // Per-lane lhs * rhs.
 vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
    return _mm_mul_ps(lhs, rhs);
 }
 // Per-lane lhs / rhs.
 vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
    return _mm_div_ps(lhs, rhs);
 }
 // Per-lane m1 * m2 + a, computed as a separate multiply and add.
 vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
    const simd4f product = simd4f_mul(m1, m2);
    return simd4f_add( product, a );
 }
 // Approximates 1/v per lane: _mm_rcp_ps estimate refined once with
 // estimate * (2 - v * estimate).
 vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
    const simd4f estimate = _mm_rcp_ps(v);
    const simd4f two = simd4f_splat(2.0f);
    const simd4f correction = simd4f_sub(two, simd4f_mul(v, estimate));
    return simd4f_mul(estimate, correction);
 }
 // Per-lane square root via _mm_sqrt_ps.
 vectorial_inline simd4f simd4f_sqrt(simd4f v) {
    return _mm_sqrt_ps(v);
 }
 // Approximates 1/sqrt(v) per lane: _mm_rsqrt_ps estimate refined once with
 // (estimate * 0.5) * (3 - estimate * (v * estimate)).
 vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
    const simd4f estimate = _mm_rsqrt_ps(v);
    const simd4f half = simd4f_splat(0.5f);
    const simd4f three = simd4f_splat(3.0f);
    const simd4f half_estimate = simd4f_mul(estimate, half);
    const simd4f correction = simd4f_sub(three, simd4f_mul(estimate, simd4f_mul(v, estimate)));
    return simd4f_mul(half_estimate, correction);
 }
 // Lane 0 of s, read through the union overlay.
 vectorial_inline float simd4f_get_x(simd4f s) {
    _simd4f_union lanes = { s };
    return lanes.f[0];
 }
 // Lane 1 of s.
 vectorial_inline float simd4f_get_y(simd4f s) {
    _simd4f_union lanes = { s };
    return lanes.f[1];
 }
 // Lane 2 of s.
 vectorial_inline float simd4f_get_z(simd4f s) {
    _simd4f_union lanes = { s };
    return lanes.f[2];
 }
 // Lane 3 of s.
 vectorial_inline float simd4f_get_w(simd4f s) {
    _simd4f_union lanes = { s };
    return lanes.f[3];
 }
 // Dot product of the x, y, z lanes, returned in all four lanes.
 vectorial_inline simd4f simd4f_dot3(simd4f lhs,simd4f rhs) {
 #if defined(VECTORIAL_USE_SSE4_1)
    // Mask 0x7f: the high nibble 0x7 multiplies only lanes x, y, z; the low
    // nibble 0xf writes the sum to every lane of the result.
    return _mm_dp_ps(lhs, rhs, 0x7f);
 #else
    // Plain SSE path: zero the w product with a mask, then add horizontally.
    // simd4f_aligned16 comes from outside this header (presumably a 16-byte
    // alignment attribute, as required by _mm_load_ps - confirm in config.h).
    simd4f_aligned16 const unsigned int mask_array[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
    const simd4f mask = _mm_load_ps((const float*)mask_array);
    const simd4f m = _mm_mul_ps(lhs, rhs);
    const simd4f s0 = _mm_and_ps(m, mask);
    // s1 lane 0 = x*x' + z*z', lane 1 = y*y' + 0
    const simd4f s1 = _mm_add_ps(s0, _mm_movehl_ps(s0, s0));
    // s2 lane 0 = lane 0 + lane 1 of s1, i.e. the full three-term sum
    const simd4f s2 = _mm_add_ss(s1, _mm_shuffle_ps(s1, s1, 1));
    // broadcast lane 0 to all lanes
    return _mm_shuffle_ps(s2,s2, 0);
 #endif
 }
 // Dot product of the x, y, z lanes as a float: lane x of simd4f_dot3.
 vectorial_inline float simd4f_dot3_scalar(simd4f lhs,simd4f rhs) {
    return simd4f_get_x(simd4f_dot3(lhs, rhs));
 }
 // Cross product of the x, y, z lanes: l.yzx * r.zxy - l.zxy * r.yzx.
 // The w lane is computed as l.w * r.w - l.w * r.w.
 vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
    const simd4f l_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3,0,2,1));
    const simd4f r_zxy = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3,1,0,2));
    const simd4f l_zxy = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3,1,0,2));
    const simd4f r_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3,0,2,1));
    const simd4f positive = _mm_mul_ps(l_yzx, r_zxy);
    const simd4f negative = _mm_mul_ps(l_zxy, r_yzx);
    return _mm_sub_ps(positive, negative);
 }
 // Returns { w, x, y, z } of s.
 vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
    return _mm_shuffle_ps(s, s, _MM_SHUFFLE(2,1,0,3));
 }
 // Returns { z, w, x, y } of s.
 vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
    return _mm_shuffle_ps(s, s, _MM_SHUFFLE(1,0,3,2));
 }
 // Returns { y, z, w, x } of s.
 vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
    return _mm_shuffle_ps(s, s, _MM_SHUFFLE(0,3,2,1));
 }
 // Returns s with the w lane replaced by zero.
 vectorial_inline simd4f simd4f_zero_w(simd4f s) {
    // z_0_w_0 = { s.z, 0, s.w, 0 }; its low half supplies { z, 0 } for the result.
    const simd4f z_0_w_0 = _mm_unpackhi_ps(s, _mm_setzero_ps());
    return _mm_movelh_ps(s, z_0_w_0);
 }
 // Returns s with the z and w lanes replaced by zero.
 vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
    const simd4f zeros = _mm_setzero_ps();
    return _mm_movelh_ps(s, zeros);
 }
 // Returns { xyzw.z, xyzw.w, abcd.z, abcd.w }: the high halves of both inputs.
 vectorial_inline simd4f simd4f_merge_high(simd4f xyzw, simd4f abcd) {
    const simd4f merged = _mm_movehl_ps(abcd, xyzw);
    return merged;
 }
 // Overlay used to build float bit patterns from unsigned ints; declared with
 // simd4f_aligned16 so its float view can be passed to _mm_load_ps.
 typedef simd4f_aligned16 union {
    unsigned int ui[4];
    float f[4];
 } _simd4f_uif;
 // Flips the sign bit of the y and w lanes by XOR with 0x80000000.
 vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
    const _simd4f_uif sign_bits = { { 0x00000000, 0x80000000, 0x00000000, 0x80000000 } };
    const simd4f flip = _mm_load_ps(sign_bits.f);
    return _mm_xor_ps( s, flip );
 }
 // Flips the sign bit of the x and z lanes by XOR with 0x80000000.
 vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
    const _simd4f_uif sign_bits = { { 0x80000000, 0x00000000, 0x80000000, 0x00000000 } };
    const simd4f flip = _mm_load_ps(sign_bits.f);
    return _mm_xor_ps( s, flip );
 }
 // Per-lane minimum via _mm_min_ps.
 vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
    const simd4f smallest = _mm_min_ps( a, b );
    return smallest;
 }
 // Per-lane maximum via _mm_max_ps.
 vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
    const simd4f largest = _mm_max_ps( a, b );
    return largest;
 }
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4x4f.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f.h
@ -0,0 +1,412 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Copyright (c) 2014 Google, Inc.
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4X4F_H
 #define VECTORIAL_SIMD4X4F_H
 #include "simd4f.h"
 #include <math.h>
 /*
  Note, x,y,z,w are conceptually columns with matrix math.
 */
 /* 4x4 float matrix stored as four simd4f values, one per column. */
 typedef struct {
    simd4f x; /* first column */
    simd4f y; /* second column */
    simd4f z; /* third column */
    simd4f w; /* fourth column */
 } simd4x4f;
 /* Builds a matrix whose columns are x, y, z and w, in that order. */
 vectorial_inline simd4x4f simd4x4f_create(simd4f x, simd4f y, simd4f z, SIMD_PARAM(simd4f, w)) {
    simd4x4f result;
    result.x = x;
    result.y = y;
    result.z = z;
    result.w = w;
    return result;
 }
 /* Overwrites *m with the 4x4 identity matrix. */
 vectorial_inline void simd4x4f_identity(simd4x4f* m) {
    m->x = simd4f_create(1.0f, 0.0f, 0.0f, 0.0f);
    m->y = simd4f_create(0.0f, 1.0f, 0.0f, 0.0f);
    m->z = simd4f_create(0.0f, 0.0f, 1.0f, 0.0f);
    m->w = simd4f_create(0.0f, 0.0f, 0.0f, 1.0f);
 }
 /*
  Loads 16 consecutive floats from f into *m using simd4f_uload4: f[0..3]
  become column x, f[4..7] column y, f[8..11] column z, f[12..15] column w.
 */
 vectorial_inline void simd4x4f_uload(simd4x4f* m, const float *f) {
    const float *column = f;
    m->x = simd4f_uload4(column);
    column += 4;
    m->y = simd4f_uload4(column);
    column += 4;
    m->z = simd4f_uload4(column);
    column += 4;
    m->w = simd4f_uload4(column);
 }
 #ifdef VECTORIAL_SCALAR
    #include "simd4x4f_scalar.h"
 #elif defined(VECTORIAL_SSE)
    #include "simd4x4f_sse.h"
 #elif defined(VECTORIAL_GNU)
    #include "simd4x4f_gnu.h"
 #elif defined(VECTORIAL_NEON)
    #include "simd4x4f_neon.h"
 #else
    #error No implementation defined
 #endif
 /* Stores in *out the sum of the four columns of a: ((x + y) + z) + w. */
 vectorial_inline void simd4x4f_sum(const simd4x4f* a, simd4f* out) {
    const simd4f xy = simd4f_add(a->x, a->y);
    const simd4f xyz = simd4f_add(xy, a->z);
    *out = simd4f_add(xyz, a->w);
 }
 /*
  Matrix * 4-component column vector:
  *out = a.x*b.x + (a.y*b.y + (a.z*b.z + a.w*b.w)).
 */
 vectorial_inline void simd4x4f_matrix_vector_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
    const simd4f vec = *b;
    /* Broadcast each vector component so it can scale a whole column. */
    const simd4f bx = simd4f_splat_x(vec);
    const simd4f by = simd4f_splat_y(vec);
    const simd4f bz = simd4f_splat_z(vec);
    const simd4f bw = simd4f_splat_w(vec);
    const simd4f col_x = a->x;
    const simd4f col_y = a->y;
    const simd4f col_z = a->z;
    const simd4f col_w = a->w;
    #if 0
    // In a hasty benchmark, this actually performed worse on neon
    // TODO: revisit and conditionalize accordingly
    *out = simd4f_madd(col_x, bx,
             simd4f_madd(col_y, by,
               simd4f_madd(col_z, bz,
                 simd4f_mul(col_w, bw) ) ) );
    #else
    {
        /* Accumulate from the innermost term outwards. */
        const simd4f zw = simd4f_add(simd4f_mul(col_z, bz), simd4f_mul(col_w, bw));
        const simd4f yzw = simd4f_add(simd4f_mul(col_y, by), zw);
        *out = simd4f_add(simd4f_mul(col_x, bx), yzw);
    }
    #endif
 }
 /*
  Matrix * 3-component vector: *out = a.x*b.x + (a.y*b.y + a.z*b.z).
  The w column of a and the w lane of *b are not read.
 */
 vectorial_inline void simd4x4f_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
    const simd4f bx = simd4f_splat_x(*b);
    const simd4f by = simd4f_splat_y(*b);
    const simd4f bz = simd4f_splat_z(*b);
    #if 0
    *out = simd4f_madd( a->x, bx,
             simd4f_madd( a->y, by,
               simd4f_mul(a->z, bz) ) );
    #else
    {
        const simd4f yz = simd4f_add( simd4f_mul(a->y, by), simd4f_mul(a->z, bz) );
        *out = simd4f_add( simd4f_mul(a->x, bx), yz );
    }
    #endif
 }
 /*
  Matrix * 3-component point: like the vector3 variant, but the w column of a
  is added as well: *out = a.x*b.x + (a.y*b.y + (a.z*b.z + a.w)).
  The w lane of *b is not read.
 */
 vectorial_inline void simd4x4f_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
    const simd4f bx = simd4f_splat_x(*b);
    const simd4f by = simd4f_splat_y(*b);
    const simd4f bz = simd4f_splat_z(*b);
    #if 0
    *out = simd4f_madd( a->x, bx,
             simd4f_madd( a->y, by,
               simd4f_madd( a->z, bz,
                 a->w ) ) );
    #else
    {
        const simd4f z_plus_w = simd4f_add( simd4f_mul(a->z, bz), a->w );
        const simd4f yzw = simd4f_add( simd4f_mul(a->y, by), z_plus_w );
        *out = simd4f_add( simd4f_mul(a->x, bx), yzw );
    }
    #endif
 }
 /*
  Applies the inverse of a to point *b without a general matrix inversion:
  the w column of a is subtracted from the point, then the transposed
  x/y/z columns are applied. The transpose only equals the inverse when the
  upper-left 3x3 of a is orthonormal, which (per the name) the caller is
  expected to guarantee.
 */
 vectorial_inline void simd4x4f_inv_ortho_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
    simd4x4f rotation_t = *a;
    simd4f local = simd4f_sub(*b, a->w);
    /* Zero w before transposing so the fourth lane of each resulting column is 0. */
    rotation_t.w = simd4f_create(0,0,0,0);
    simd4x4f_transpose_inplace(&rotation_t);
    simd4x4f_matrix_point3_mul(&rotation_t, &local, out);
 }
 /*
  Applies the transposed x/y/z columns of a to the direction vector *b.
  No translation is involved. The transpose only equals the inverse when the
  upper-left 3x3 of a is orthonormal, which (per the name) the caller is
  expected to guarantee.
 */
 vectorial_inline void simd4x4f_inv_ortho_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
    simd4x4f rotation_t = *a;
    simd4f direction = *b;
    /* Zero w before transposing so the fourth lane of each resulting column is 0. */
    rotation_t.w = simd4f_create(0,0,0,0);
    simd4x4f_transpose_inplace(&rotation_t);
    simd4x4f_matrix_vector3_mul(&rotation_t, &direction, out);
 }
 /*
  Matrix product: *out = a * b, computed column by column as a * b.x,
  a * b.y, a * b.z and a * b.w.

  The product is built in a local and stored at the end, so out may alias
  a or b. Writing the columns straight into *out would overwrite columns of
  a that the remaining products still need when out == a.
 */
 vectorial_inline void simd4x4f_matrix_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
    simd4x4f product;
    simd4x4f_matrix_vector_mul(a, &b->x, &product.x);
    simd4x4f_matrix_vector_mul(a, &b->y, &product.y);
    simd4x4f_matrix_vector_mul(a, &b->z, &product.z);
    simd4x4f_matrix_vector_mul(a, &b->w, &product.w);
    *out = product;
 }
 /*
  Fills *m with a perspective projection matrix.
    fovy_radians  vertical field of view, in radians
    aspect        divides the x scale (presumably width / height -- TODO confirm)
    znear, zfar   clip distances; zfar - znear is used as a divisor
  No argument is validated.
 */
 vectorial_inline void simd4x4f_perspective(simd4x4f *m, float fovy_radians, float aspect, float znear, float zfar) {
    const float depth = zfar - znear;
    /* tan(pi/2 - t) is cot(t), with t = half the field of view. */
    const float cot_half_fovy = tanf( VECTORIAL_HALFPI - fovy_radians * 0.5f );
    const float x_scale = cot_half_fovy / aspect;
    const float y_scale = cot_half_fovy;
    const float z_scale = -(zfar + znear) / depth;
    const float z_shift = -2 * znear * zfar / depth;
    m->x = simd4f_create( x_scale, 0, 0,  0);
    m->y = simd4f_create( 0, y_scale, 0,  0);
    m->z = simd4f_create( 0, 0, z_scale, -1);
    m->w = simd4f_create( 0, 0, z_shift,  0);
 }
 /*
  Fills *m with an orthographic projection matrix for the box
  [left,right] x [bottom,top] x [znear,zfar]. Each extent is used as a
  divisor and none is checked for zero.
 */
 vectorial_inline void simd4x4f_ortho(simd4x4f *m, float left, float right, float bottom, float top, float znear, float zfar) {
    const float width = right - left;
    const float height = top - bottom;
    const float depth = zfar - znear;
    /* Diagonal scale factors. */
    const float sx = 2.0f / width;
    const float sy = 2.0f / height;
    const float sz = -2.0f / depth;
    /* Offsets stored in the w column. */
    const float tx = -(right + left) / width;
    const float ty = -(top + bottom) / height;
    const float tz = -(zfar + znear) / depth;
    m->x = simd4f_create( sx, 0, 0, 0);
    m->y = simd4f_create( 0, sy, 0, 0);
    m->z = simd4f_create( 0, 0, sz, 0);
    m->w = simd4f_create( tx, ty, tz, 1);
 }
 /*
  Fills *m with a view matrix for a camera at eye looking towards center,
  with up as the approximate up direction. The three basis axes are written
  as columns and transposed so they become rows; the w column then receives
  the negated dot products of each axis with eye.
 */
 vectorial_inline void simd4x4f_lookat(simd4x4f *m, simd4f eye, simd4f center, simd4f up) {
    const simd4f forward = simd4f_normalize3( simd4f_sub(center, eye) );
    const simd4f side = simd4f_normalize3( simd4f_cross3( forward, up ) );
    const simd4f upward = simd4f_cross3(side, forward);
    /* The z axis of the basis is the negated viewing direction. */
    const simd4f backward = simd4f_sub( simd4f_zero(), forward);
    const float tx = -simd4f_dot3_scalar(side, eye);
    const float ty = -simd4f_dot3_scalar(upward, eye);
    const float tz = -simd4f_dot3_scalar(backward, eye);
    m->x = side;
    m->y = upward;
    m->z = backward;
    /* This w column supplies the fourth lane (0,0,0) of the x/y/z columns after the transpose. */
    m->w = simd4f_create( 0,0,0, 1);
    simd4x4f_transpose_inplace(m);
    /* Replace whatever the transpose left in w with the translation. */
    m->w = simd4f_create( tx,ty,tz,1);
 }
 /* Overwrites *m with an identity matrix whose w column is (x, y, z, 1). */
 vectorial_inline void simd4x4f_translation(simd4x4f* m, float x, float y, float z) {
    m->x = simd4f_create(1.0f, 0.0f, 0.0f, 0.0f);
    m->y = simd4f_create(0.0f, 1.0f, 0.0f, 0.0f);
    m->z = simd4f_create(0.0f, 0.0f, 1.0f, 0.0f);
    m->w = simd4f_create(   x,    y,    z, 1.0f);
 }
 /*
  Overwrites *m with a rotation about axis. The axis is normalised here, so
  it need not be unit length. The angle is negated before the sine and cosine
  are taken; the w column is (0,0,0,1).
 */
 vectorial_inline void simd4x4f_axis_rotation(simd4x4f* m, float radians, simd4f axis) {
    const simd4f unit = simd4f_normalize3(axis);
    const float angle = -radians;
    const float s = sinf(angle);
    const float c = cosf(angle);
    const float one_minus_c = 1 - c;
    const float x = simd4f_get_x(unit);
    const float y = simd4f_get_y(unit);
    const float z = simd4f_get_z(unit);
    /* Off-diagonal products shared by two matrix entries each. */
    const float xy = x * y * one_minus_c;
    const float yz = y * z * one_minus_c;
    const float zx = z * x * one_minus_c;
    const float xx = x * x;
    const float yy = y * y;
    const float zz = z * z;
    m->x = simd4f_create( xx + c * (1 - xx), xy - z * s,        zx + y * s,        0);
    m->y = simd4f_create( xy + z * s,        yy + c * (1 - yy), yz - x * s,        0);
    m->z = simd4f_create( zx - y * s,        yz + x * s,        zz + c * (1 - zz), 0);
    m->w = simd4f_create(0, 0, 0, 1);
 }
 /* Column-by-column simd4f_add: *out = a + b. */
 vectorial_inline void simd4x4f_add(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
    const simd4f x = simd4f_add(a->x, b->x);
    const simd4f y = simd4f_add(a->y, b->y);
    const simd4f z = simd4f_add(a->z, b->z);
    const simd4f w = simd4f_add(a->w, b->w);
    out->x = x;
    out->y = y;
    out->z = z;
    out->w = w;
 }
 /* Column-by-column simd4f_sub: *out = a - b. */
 vectorial_inline void simd4x4f_sub(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
    const simd4f x = simd4f_sub(a->x, b->x);
    const simd4f y = simd4f_sub(a->y, b->y);
    const simd4f z = simd4f_sub(a->z, b->z);
    const simd4f w = simd4f_sub(a->w, b->w);
    out->x = x;
    out->y = y;
    out->z = z;
    out->w = w;
 }
 /*
  Column-by-column simd4f_mul of a and b (element-wise product).
  This is not the matrix product; see simd4x4f_matrix_mul for that.
 */
 vectorial_inline void simd4x4f_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
    const simd4f x = simd4f_mul(a->x, b->x);
    const simd4f y = simd4f_mul(a->y, b->y);
    const simd4f z = simd4f_mul(a->z, b->z);
    const simd4f w = simd4f_mul(a->w, b->w);
    out->x = x;
    out->y = y;
    out->z = z;
    out->w = w;
 }
 /*
  Column-by-column simd4f_div: each column of a is divided by the matching
  column of b and stored in *out.

  a and b are only read, so they are const-qualified like the inputs of
  simd4x4f_add, simd4x4f_sub and simd4x4f_mul. Existing callers that pass
  non-const pointers are unaffected, and const matrices can now be passed.
 */
 vectorial_inline void simd4x4f_div(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
    out->x = simd4f_div(a->x, b->x);
    out->y = simd4f_div(a->y, b->y);
    out->z = simd4f_div(a->z, b->z);
    out->w = simd4f_div(a->w, b->w);
 }
 /*
  Writes a 1/det-scaled matrix derived from a into *out and returns det.
  NOTE(review): by its name and structure this is the matrix inverse computed
  through a cofactor-style expansion; verify against a reference
  implementation before changing any shuffle or sign.

  All four columns of a are copied into locals before *out is written, so
  out may point to the same matrix as a.

  det is not checked for zero; a singular input is divided by zero.
  Only the x lane of the returned value is used here as the determinant
  (invdet is built with simd4f_splat_x), so callers should read it with
  simd4f_get_x.
 */
 vectorial_inline simd4f simd4x4f_inverse(const simd4x4f* a, simd4x4f* out) {
    const simd4f c0 = a->x;
    const simd4f c1 = a->y;
    const simd4f c2 = a->z;
    const simd4f c3 = a->w;
    /* Lane-rotated copies of every column. */
    const simd4f c0_wxyz = simd4f_shuffle_wxyz(c0);
    const simd4f c0_zwxy = simd4f_shuffle_zwxy(c0);
    const simd4f c0_yzwx = simd4f_shuffle_yzwx(c0);
    const simd4f c1_wxyz = simd4f_shuffle_wxyz(c1);
    const simd4f c1_zwxy = simd4f_shuffle_zwxy(c1);
    const simd4f c1_yzwx = simd4f_shuffle_yzwx(c1);
    const simd4f c2_wxyz = simd4f_shuffle_wxyz(c2);
    const simd4f c2_zwxy = simd4f_shuffle_zwxy(c2);
    const simd4f c2_yzwx = simd4f_shuffle_yzwx(c2);
    const simd4f c3_wxyz = simd4f_shuffle_wxyz(c3);
    const simd4f c3_zwxy = simd4f_shuffle_zwxy(c3);
    const simd4f c3_yzwx = simd4f_shuffle_yzwx(c3);
    /* Pairwise products within the column pairs (c0,c1) and (c2,c3). */
    const simd4f c0_wxyz_x_c1 = simd4f_mul(c0_wxyz, c1);
    const simd4f c0_wxyz_x_c1_yzwx = simd4f_mul(c0_wxyz, c1_yzwx);
    const simd4f c0_wxyz_x_c1_zwxy = simd4f_mul(c0_wxyz, c1_zwxy);
    const simd4f c2_wxyz_x_c3 = simd4f_mul(c2_wxyz, c3);
    const simd4f c2_wxyz_x_c3_yzwx = simd4f_mul(c2_wxyz, c3_yzwx);
    const simd4f c2_wxyz_x_c3_zwxy = simd4f_mul(c2_wxyz, c3_zwxy);
    /* Differences of those products: ar* come from (c2,c3), br* from (c0,c1). */
    const simd4f ar1 = simd4f_sub( simd4f_shuffle_wxyz(c2_wxyz_x_c3_zwxy), simd4f_shuffle_zwxy(c2_wxyz_x_c3) );
    const simd4f ar2 = simd4f_sub( simd4f_shuffle_zwxy(c2_wxyz_x_c3_yzwx), c2_wxyz_x_c3_yzwx );
    const simd4f ar3 = simd4f_sub( c2_wxyz_x_c3_zwxy, simd4f_shuffle_wxyz(c2_wxyz_x_c3) );
    const simd4f br1 = simd4f_sub( simd4f_shuffle_wxyz(c0_wxyz_x_c1_zwxy), simd4f_shuffle_zwxy(c0_wxyz_x_c1) );
    const simd4f br2 = simd4f_sub( simd4f_shuffle_zwxy(c0_wxyz_x_c1_yzwx), c0_wxyz_x_c1_yzwx );
    const simd4f br3 = simd4f_sub( c0_wxyz_x_c1_zwxy, simd4f_shuffle_wxyz(c0_wxyz_x_c1) );
    /* Each column is combined with the differences built from the other column pair. */
    const simd4f c0_sum = simd4f_madd(c0_yzwx, ar3,
                            simd4f_madd(c0_zwxy, ar2,
                              simd4f_mul(c0_wxyz, ar1)));
    const simd4f c1_sum = simd4f_madd(c1_wxyz,  ar1, 
                            simd4f_madd(c1_zwxy,  ar2, 
                              simd4f_mul(c1_yzwx, ar3)));
    const simd4f c2_sum = simd4f_madd(c2_yzwx, br3,
                            simd4f_madd(c2_zwxy, br2,
                              simd4f_mul(c2_wxyz, br1)));
    const simd4f c3_sum = simd4f_madd(c3_yzwx, br3,
                            simd4f_madd(c3_zwxy, br2,
                              simd4f_mul(c3_wxyz, br1)));
    /* Determinant: reduce c1_sum * c0 so that the x lane holds d0.x + d0.z - d0.y - d0.w. */
    const simd4f d0 = simd4f_mul(c1_sum, c0);
    const simd4f d1 = simd4f_add(d0, simd4f_merge_high(d0, d0));
    const simd4f det = simd4f_sub(d1, simd4f_splat_y(d1));
    /* Broadcast 1/det (x lane) to all lanes. */
    const simd4f invdet = simd4f_splat_x( simd4f_div(simd4f_splat(1.0f), det) );
    /* Apply alternating signs and scale. Note that o0 uses c1_sum and o1 uses c0_sum (likewise o2/o3). */
    const simd4f o0 = simd4f_mul( simd4f_flip_sign_0101(c1_sum), invdet );
    const simd4f o1 = simd4f_mul( simd4f_flip_sign_1010(c0_sum), invdet );
    const simd4f o2 = simd4f_mul( simd4f_flip_sign_0101(c3_sum), invdet );
    const simd4f o3 = simd4f_mul( simd4f_flip_sign_1010(c2_sum), invdet );
    /* The result is assembled in a local and written to *out transposed. */
    const simd4x4f mt = simd4x4f_create(o0, o1, o2, o3);
    simd4x4f_transpose( &mt, out);
    return det;
 }
 #ifdef __cplusplus
    #ifdef VECTORIAL_OSTREAM
        #include <ostream>
        vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4x4f& v) {
            os << "simd4x4f(simd4f(" << simd4f_get_x(v.x) << ", "
                       << simd4f_get_y(v.x) << ", "
                       << simd4f_get_z(v.x) << ", "
                       << simd4f_get_w(v.x) << "),\n"
                       << "         simd4f(" << simd4f_get_x(v.y) << ", "
                       << simd4f_get_y(v.y) << ", "
                       << simd4f_get_z(v.y) << ", "
                       << simd4f_get_w(v.y) << "),\n"
                       << "         simd4f(" << simd4f_get_x(v.z) << ", "
                       << simd4f_get_y(v.z) << ", "
                       << simd4f_get_z(v.z) << ", "
                       << simd4f_get_w(v.z) << "),\n"
                       << "         simd4f(" << simd4f_get_x(v.w) << ", "
                       << simd4f_get_y(v.w) << ", "
                       << simd4f_get_z(v.w) << ", "
                       << simd4f_get_w(v.w) << "))";
            return os;
        }
    #endif
 #endif
 #endif 
--- a/3rdparty/vectorial/include/vectorial/simd4x4f_gnu.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f_gnu.h
@ -0,0 +1,36 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4X4F_GNU_H
 #define VECTORIAL_SIMD4X4F_GNU_H
 /* Transposes *s in place: lane i of column j becomes lane j of column i. */
 vectorial_inline void simd4x4f_transpose_inplace(simd4x4f* s) {
    /* Wrap each column in _simd4f_union so single lanes can be read via f[]. */
    const _simd4f_union col0 = { s->x };
    const _simd4f_union col1 = { s->y };
    const _simd4f_union col2 = { s->z };
    const _simd4f_union col3 = { s->w };
    /* Row k of the input gathers lane k of every column. */
    const simd4f row0 = { col0.f[0], col1.f[0], col2.f[0], col3.f[0] };
    const simd4f row1 = { col0.f[1], col1.f[1], col2.f[1], col3.f[1] };
    const simd4f row2 = { col0.f[2], col1.f[2], col2.f[2], col3.f[2] };
    const simd4f row3 = { col0.f[3], col1.f[3], col2.f[3], col3.f[3] };
    s->x = row0;
    s->y = row1;
    s->z = row2;
    s->w = row3;
 }
 /* Stores the transpose of *s in *out; *s itself is left untouched. */
 vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
    simd4x4f transposed = *s;
    simd4x4f_transpose_inplace(&transposed);
    *out = transposed;
 }
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4x4f_neon.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f_neon.h
@ -0,0 +1,35 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4X4F_NEON_H
 #define VECTORIAL_SIMD4X4F_NEON_H
 /* Transposes *s in place: lane i of column j becomes lane j of column i. */
 vectorial_inline void simd4x4f_transpose_inplace(simd4x4f* s) {
    /* Wrap each column in _simd4f_union so single lanes can be read via f[]. */
    const _simd4f_union col0 = { s->x };
    const _simd4f_union col1 = { s->y };
    const _simd4f_union col2 = { s->z };
    const _simd4f_union col3 = { s->w };
    /* Row k of the input gathers lane k of every column. */
    const simd4f row0 = simd4f_create( col0.f[0], col1.f[0], col2.f[0], col3.f[0] );
    const simd4f row1 = simd4f_create( col0.f[1], col1.f[1], col2.f[1], col3.f[1] );
    const simd4f row2 = simd4f_create( col0.f[2], col1.f[2], col2.f[2], col3.f[2] );
    const simd4f row3 = simd4f_create( col0.f[3], col1.f[3], col2.f[3], col3.f[3] );
    s->x = row0;
    s->y = row1;
    s->z = row2;
    s->w = row3;
 }
 /* Stores the transpose of *s in *out; *s itself is left untouched. */
 vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
    simd4x4f transposed = *s;
    simd4x4f_transpose_inplace(&transposed);
    *out = transposed;
 }
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4x4f_scalar.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f_scalar.h
@ -0,0 +1,41 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4X4F_SCALAR_H
 #define VECTORIAL_SIMD4X4F_SCALAR_H
 /*
  Transposes *s in place. A snapshot of the matrix is taken first so every
  element is read from the original values. The diagonal elements do not
  move and are therefore not rewritten.
 */
 vectorial_inline void simd4x4f_transpose_inplace(simd4x4f *s) {
    const simd4x4f src = *s;
    /* Column x receives the first row. */
    s->x.y = src.y.x;
    s->x.z = src.z.x;
    s->x.w = src.w.x;
    /* Column y receives the second row. */
    s->y.x = src.x.y;
    s->y.z = src.z.y;
    s->y.w = src.w.y;
    /* Column z receives the third row. */
    s->z.x = src.x.z;
    s->z.y = src.y.z;
    s->z.w = src.w.z;
    /* Column w receives the fourth row. */
    s->w.x = src.x.w;
    s->w.y = src.y.w;
    s->w.z = src.z.w;
 }
 /* Stores the transpose of *s in *out; *s itself is left untouched. */
 vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
    simd4x4f transposed = *s;
    simd4x4f_transpose_inplace(&transposed);
    *out = transposed;
 }
 #endif
--- a/3rdparty/vectorial/include/vectorial/simd4x4f_sse.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f_sse.h
@ -0,0 +1,23 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_SIMD4X4F_SSE_H
 #define VECTORIAL_SIMD4X4F_SSE_H
 /* Transposes *s in place using the SSE _MM_TRANSPOSE4_PS macro. */
 vectorial_inline void simd4x4f_transpose_inplace(simd4x4f *s) {
    /* The macro rewrites its four arguments, so work on local copies. */
    simd4f c0 = s->x;
    simd4f c1 = s->y;
    simd4f c2 = s->z;
    simd4f c3 = s->w;
    _MM_TRANSPOSE4_PS(c0, c1, c2, c3);
    s->x = c0;
    s->y = c1;
    s->z = c2;
    s->w = c3;
 }
 /* Stores the transpose of *s in *out; *s itself is left untouched. */
 vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
    simd4x4f transposed = *s;
    simd4x4f_transpose_inplace(&transposed);
    *out = transposed;
 }
 #endif
--- a/3rdparty/vectorial/include/vectorial/vec2f.h
+++ b/3rdparty/vectorial/include/vectorial/vec2f.h
@ -0,0 +1,191 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_VEC2F_H
 /* Define the guard macro so a second inclusion of this header is skipped
    instead of redefining vectorial::vec2f. */
 #define VECTORIAL_VEC2F_H
 #ifndef VECTORIAL_SIMD4F_H
  #include "vectorial/simd4f.h"
 #endif
 namespace vectorial {
    class vec4f;
    class vec3f;
    /* Two-component float vector wrapping a simd4f; x and y live in its first two lanes. */
    class vec2f {
    public:
        /* Underlying SIMD value, public so the free operators can use it directly. */
        simd4f value;
        /* Default construction leaves value uninitialised. */
        vec2f() {}
        vec2f(const vec2f& v) : value(v.value) {}
        /* Implicit conversion from a raw simd4f; no lane is modified. */
        vec2f(const simd4f& v) : value(v) {}
        /* Splats one scalar through simd4f_splat. */
        explicit vec2f(float xy) : value( simd4f_splat(xy) ) {}
        /* Builds (x, y, 0, 0). */
        vec2f(float x, float y) : value( simd4f_create(x,y,0,0) ) {}
        /* Reads the components from ary with simd4f_uload2. */
        explicit vec2f(const float *ary) : value( simd4f_uload2(ary) ) { }
        /* Component accessors. */
        float x() const { return simd4f_get_x(value); }
        float y() const { return simd4f_get_y(value); }
        /* Load from / store to a float array via simd4f_uload2 / simd4f_ustore2. */
        void load(const float *ary) { value = simd4f_uload2(ary); }
        void store(float *ary) const { simd4f_ustore2(value, ary); }
        /* Number of components exposed by this type. */
        enum { elements = 2 };
        /* Common constants. */
        static vec2f zero() { return vec2f(simd4f_zero()); }
        static vec2f one() { return vec2f(1.0f); }
        static vec2f xAxis() { return vec2f(1.0f, 0.0f); }
        static vec2f yAxis() { return vec2f(0.0f, 1.0f); }
        /* Conversions to other vector types; only declared here, the
           definitions must be provided elsewhere. */
        inline vec4f xyzw(float z, float w) const;
        inline vec4f xy00() const;
        inline vec4f xy01() const;
        inline vec3f xyz(float z) const;
        inline vec3f xy0() const;
        inline vec2f xy() const;
    };
    /* Unary minus, computed as zero - lhs. */
    vectorial_inline vec2f operator-(const vec2f& lhs) {
        const simd4f negated = simd4f_sub(simd4f_zero(), lhs.value);
        return vec2f(negated);
    }
    /* Vector (op) vector arithmetic, forwarded to the matching simd4f function. */
    vectorial_inline vec2f operator+(const vec2f& lhs, const vec2f& rhs) {
        const simd4f sum = simd4f_add(lhs.value, rhs.value);
        return vec2f(sum);
    }
    vectorial_inline vec2f operator-(const vec2f& lhs, const vec2f& rhs) {
        const simd4f difference = simd4f_sub(lhs.value, rhs.value);
        return vec2f(difference);
    }
    vectorial_inline vec2f operator*(const vec2f& lhs, const vec2f& rhs) {
        const simd4f product = simd4f_mul(lhs.value, rhs.value);
        return vec2f(product);
    }
    vectorial_inline vec2f operator/(const vec2f& lhs, const vec2f& rhs) {
        const simd4f quotient = simd4f_div(lhs.value, rhs.value);
        return vec2f(quotient);
    }
    /*
     Compound assignment with another vector. Each operator stores the result
     of the matching simd4f function in lhs and returns a reference to lhs,
     following the usual C++ convention, rather than a copy of it.
    */
    vectorial_inline vec2f& operator+=(vec2f& lhs, const vec2f& rhs) {
        lhs.value = simd4f_add(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec2f& operator-=(vec2f& lhs, const vec2f& rhs) {
        lhs.value = simd4f_sub(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec2f& operator*=(vec2f& lhs, const vec2f& rhs) {
        lhs.value = simd4f_mul(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec2f& operator/=(vec2f& lhs, const vec2f& rhs) {
        lhs.value = simd4f_div(lhs.value, rhs.value);
        return lhs;
    }
    /* Vector (op) scalar: the scalar is splatted with simd4f_splat first. */
    vectorial_inline vec2f operator+(const vec2f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec2f( simd4f_add(lhs.value, scalar) );
    }
    vectorial_inline vec2f operator-(const vec2f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec2f( simd4f_sub(lhs.value, scalar) );
    }
    vectorial_inline vec2f operator*(const vec2f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec2f( simd4f_mul(lhs.value, scalar) );
    }
    vectorial_inline vec2f operator/(const vec2f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec2f( simd4f_div(lhs.value, scalar) );
    }
    /* Scalar (op) vector: same as above with the operands swapped. */
    vectorial_inline vec2f operator+(float lhs, const vec2f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec2f( simd4f_add(scalar, rhs.value) );
    }
    vectorial_inline vec2f operator-(float lhs, const vec2f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec2f( simd4f_sub(scalar, rhs.value) );
    }
    vectorial_inline vec2f operator*(float lhs, const vec2f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec2f( simd4f_mul(scalar, rhs.value) );
    }
    vectorial_inline vec2f operator/(float lhs, const vec2f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec2f( simd4f_div(scalar, rhs.value) );
    }
    /*
     Compound assignment with a scalar. The scalar is splatted with
     simd4f_splat, the result is stored in lhs, and a reference to lhs is
     returned, following the usual C++ convention, rather than a copy of it.
    */
    vectorial_inline vec2f& operator+=(vec2f& lhs, float rhs) {
        lhs.value = simd4f_add(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec2f& operator-=(vec2f& lhs, float rhs) {
        lhs.value = simd4f_sub(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec2f& operator*=(vec2f& lhs, float rhs) {
        lhs.value = simd4f_mul(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec2f& operator/=(vec2f& lhs, float rhs) {
        lhs.value = simd4f_div(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    /* Scalar results are read from the x lane of the simd4f helper's result. */
    vectorial_inline float dot(const vec2f& lhs, const vec2f& rhs) {
        const simd4f d = simd4f_dot2(lhs.value, rhs.value);
        return simd4f_get_x(d);
    }
    vectorial_inline float length(const vec2f& v) {
        const simd4f len = simd4f_length2(v.value);
        return simd4f_get_x(len);
    }
    vectorial_inline float length_squared(const vec2f& v) {
        const simd4f len_sq = simd4f_length2_squared(v.value);
        return simd4f_get_x(len_sq);
    }
    /* Wraps simd4f_normalize2; nothing here guards against a zero-length v. */
    vectorial_inline vec2f normalize(const vec2f& v) {
        const simd4f unit = simd4f_normalize2(v.value);
        return vec2f(unit);
    }
    /* Minimum / maximum via simd4f_min / simd4f_max. */
    vectorial_inline vec2f min(const vec2f& a, const vec2f& b) {
        const simd4f lowest = simd4f_min(a.value, b.value);
        return vec2f(lowest);
    }
    vectorial_inline vec2f max(const vec2f& a, const vec2f& b) {
        const simd4f highest = simd4f_max(a.value, b.value);
        return vec2f(highest);
    }
 }
 /*
  Lets std::min / std::max be called with vec2f arguments by forwarding to
  the vectorial overloads.
  NOTE(review): the C++ standard does not permit adding function overloads
  to namespace std; kept because existing callers may rely on it.
 */
 namespace std {
    inline ::vectorial::vec2f min(const ::vectorial::vec2f& a, const ::vectorial::vec2f& b) {
        return ::vectorial::min(a, b);
    }
    inline ::vectorial::vec2f max(const ::vectorial::vec2f& a, const ::vectorial::vec2f& b) {
        return ::vectorial::max(a, b);
    }
 }
 #ifdef VECTORIAL_OSTREAM
 #include <ostream>
 /* Streams v as "[ x, y ]" and returns the stream. */
 vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec2f& v) {
    os << "[ " << v.x();
    os << ", " << v.y() << " ]";
    return os;
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/vec3f.h
+++ b/3rdparty/vectorial/include/vectorial/vec3f.h
@ -0,0 +1,197 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Copyright (c) 2014 Google, Inc.
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_VEC3F_H
 /* Define the guard macro so a second inclusion of this header is skipped
    instead of redefining vectorial::vec3f. */
 #define VECTORIAL_VEC3F_H
 #ifndef VECTORIAL_SIMD4F_H
  #include "vectorial/simd4f.h"
 #endif
 namespace vectorial {
    class vec4f;
    class vec2f;
    /* Three-component float vector wrapping a simd4f; x, y, z live in its first three lanes. */
    class vec3f {
    public:
        /* Underlying SIMD value, public so the free operators can use it directly. */
        simd4f value;
        /* Default construction leaves value uninitialised. */
        vec3f() {}
        vec3f(const vec3f& v) : value(v.value) {}
        /* Implicit conversion from a raw simd4f; no lane is modified. */
        vec3f(const simd4f& v) : value(v) {}
        /* Splats one scalar through simd4f_splat. */
        explicit vec3f(float xyz) : value( simd4f_splat(xyz) ) {}
        /* Builds (x, y, z, 0). */
        vec3f(float x, float y, float z) : value( simd4f_create(x,y,z,0) ) {}
        /* Reads the components from ary with simd4f_uload3. */
        explicit vec3f(const float *ary) : value( simd4f_uload3(ary) ) { }
        /* Component accessors. */
        float x() const { return simd4f_get_x(value); }
        float y() const { return simd4f_get_y(value); }
        float z() const { return simd4f_get_z(value); }
        /* Load from / store to a float array via simd4f_uload3 / simd4f_ustore3. */
        void load(const float *ary) { value = simd4f_uload3(ary); }
        void store(float *ary) const { simd4f_ustore3(value, ary); }
        /* Number of components exposed by this type. */
        enum { elements = 3 };
        /* Common constants. */
        static vec3f zero() { return vec3f(simd4f_zero()); }
        static vec3f one() { return vec3f(1.0f); }
        static vec3f xAxis() { return vec3f(1.0f, 0.0f, 0.0f); }
        static vec3f yAxis() { return vec3f(0.0f, 1.0f, 0.0f); }
        static vec3f zAxis() { return vec3f(0.0f, 0.0f, 1.0f); }
        /* Conversions to other vector types; only declared here, the
           definitions must be provided elsewhere. */
        inline vec4f xyz0() const;
        inline vec4f xyz1() const;
        inline vec4f xyzw(float w) const;
        inline vec3f xyz() const;
        inline vec3f xy0() const;
        inline vec2f xy() const;
    };
    /* Unary minus, computed as zero - lhs. */
    vectorial_inline vec3f operator-(const vec3f& lhs) {
        const simd4f negated = simd4f_sub(simd4f_zero(), lhs.value);
        return vec3f(negated);
    }
    /* Vector (op) vector arithmetic, forwarded to the matching simd4f function. */
    vectorial_inline vec3f operator+(const vec3f& lhs, const vec3f& rhs) {
        const simd4f sum = simd4f_add(lhs.value, rhs.value);
        return vec3f(sum);
    }
    vectorial_inline vec3f operator-(const vec3f& lhs, const vec3f& rhs) {
        const simd4f difference = simd4f_sub(lhs.value, rhs.value);
        return vec3f(difference);
    }
    vectorial_inline vec3f operator*(const vec3f& lhs, const vec3f& rhs) {
        const simd4f product = simd4f_mul(lhs.value, rhs.value);
        return vec3f(product);
    }
    vectorial_inline vec3f operator/(const vec3f& lhs, const vec3f& rhs) {
        const simd4f quotient = simd4f_div(lhs.value, rhs.value);
        return vec3f(quotient);
    }
    /*
     Compound assignment with another vector. Each operator stores the result
     of the matching simd4f function in lhs and returns a reference to lhs,
     following the usual C++ convention, rather than a copy of it.
    */
    vectorial_inline vec3f& operator+=(vec3f& lhs, const vec3f& rhs) {
        lhs.value = simd4f_add(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec3f& operator-=(vec3f& lhs, const vec3f& rhs) {
        lhs.value = simd4f_sub(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec3f& operator*=(vec3f& lhs, const vec3f& rhs) {
        lhs.value = simd4f_mul(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec3f& operator/=(vec3f& lhs, const vec3f& rhs) {
        lhs.value = simd4f_div(lhs.value, rhs.value);
        return lhs;
    }
    /* Vector (op) scalar: the scalar is splatted with simd4f_splat first. */
    vectorial_inline vec3f operator+(const vec3f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec3f( simd4f_add(lhs.value, scalar) );
    }
    vectorial_inline vec3f operator-(const vec3f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec3f( simd4f_sub(lhs.value, scalar) );
    }
    vectorial_inline vec3f operator*(const vec3f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec3f( simd4f_mul(lhs.value, scalar) );
    }
    vectorial_inline vec3f operator/(const vec3f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec3f( simd4f_div(lhs.value, scalar) );
    }
    /* Scalar (op) vector: same as above with the operands swapped. */
    vectorial_inline vec3f operator+(float lhs, const vec3f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec3f( simd4f_add(scalar, rhs.value) );
    }
    vectorial_inline vec3f operator-(float lhs, const vec3f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec3f( simd4f_sub(scalar, rhs.value) );
    }
    vectorial_inline vec3f operator*(float lhs, const vec3f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec3f( simd4f_mul(scalar, rhs.value) );
    }
    vectorial_inline vec3f operator/(float lhs, const vec3f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec3f( simd4f_div(scalar, rhs.value) );
    }
    /*
     Compound assignment with a scalar. The scalar is splatted with
     simd4f_splat, the result is stored in lhs, and a reference to lhs is
     returned, following the usual C++ convention, rather than a copy of it.
    */
    vectorial_inline vec3f& operator+=(vec3f& lhs, float rhs) {
        lhs.value = simd4f_add(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec3f& operator-=(vec3f& lhs, float rhs) {
        lhs.value = simd4f_sub(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec3f& operator*=(vec3f& lhs, float rhs) {
        lhs.value = simd4f_mul(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec3f& operator/=(vec3f& lhs, float rhs) {
        lhs.value = simd4f_div(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    /* Dot product as a scalar, via simd4f_dot3_scalar. */
    vectorial_inline float dot(const vec3f& lhs, const vec3f& rhs) {
        const float result = simd4f_dot3_scalar(lhs.value, rhs.value);
        return result;
    }
    /* Cross product, via simd4f_cross3. */
    vectorial_inline vec3f cross(const vec3f& lhs, const vec3f& rhs) {
        const simd4f product = simd4f_cross3(lhs.value, rhs.value);
        return vec3f(product);
    }
    /* Scalar results are read from the x lane of the simd4f helper's result. */
    vectorial_inline float length(const vec3f& v) {
        const simd4f len = simd4f_length3(v.value);
        return simd4f_get_x(len);
    }
    vectorial_inline float length_squared(const vec3f& v) {
        const simd4f len_sq = simd4f_length3_squared(v.value);
        return simd4f_get_x(len_sq);
    }
    /* Wraps simd4f_normalize3; nothing here guards against a zero-length v. */
    vectorial_inline vec3f normalize(const vec3f& v) {
        const simd4f unit = simd4f_normalize3(v.value);
        return vec3f(unit);
    }
    /* Minimum / maximum via simd4f_min / simd4f_max. */
    vectorial_inline vec3f min(const vec3f& a, const vec3f& b) {
        const simd4f lowest = simd4f_min(a.value, b.value);
        return vec3f(lowest);
    }
    vectorial_inline vec3f max(const vec3f& a, const vec3f& b) {
        const simd4f highest = simd4f_max(a.value, b.value);
        return vec3f(highest);
    }
 }
 /*
  Lets std::min / std::max be called with vec3f arguments by forwarding to
  the vectorial overloads.
  NOTE(review): the C++ standard does not permit adding function overloads
  to namespace std; kept because existing callers may rely on it.
 */
 namespace std {
    inline ::vectorial::vec3f min(const ::vectorial::vec3f& a, const ::vectorial::vec3f& b) {
        return ::vectorial::min(a, b);
    }
    inline ::vectorial::vec3f max(const ::vectorial::vec3f& a, const ::vectorial::vec3f& b) {
        return ::vectorial::max(a, b);
    }
 }
 #ifdef VECTORIAL_OSTREAM
 #include <ostream>
 /* Streams v as "[ x, y, z ]" and returns the stream. */
 vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec3f& v) {
    os << "[ " << v.x();
    os << ", " << v.y();
    os << ", " << v.z() << " ]";
    return os;
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/vec4f.h
+++ b/3rdparty/vectorial/include/vectorial/vec4f.h
@ -0,0 +1,195 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_VEC4F_H
 #define VECTORIAL_VEC4F_H
 #ifndef VECTORIAL_SIMD4F_H
  #include "vectorial/simd4f.h"
 #endif
 namespace vectorial {
    class vec3f;
    class vec2f;
    /* Four-component float vector wrapping a simd4f. */
    class vec4f {
    public:
        /* Underlying SIMD value, public so the free operators can use it directly. */
        simd4f value;
        /* Default construction leaves value uninitialised. */
        vec4f() {}
        vec4f(const vec4f& v) : value(v.value) {}
        /* Implicit conversion from a raw simd4f. */
        vec4f(const simd4f& v) : value(v) {}
        /* Splats one scalar through simd4f_splat. */
        explicit vec4f(float xyzw) : value( simd4f_splat(xyzw) ) {}
        /* Builds (x, y, z, w). */
        vec4f(float x, float y, float z, float w) : value( simd4f_create(x,y,z,w) ) {}
        /* Reads the components from ary with simd4f_uload4. */
        explicit vec4f(const float *ary) : value( simd4f_uload4(ary) ) { }
        /* Component accessors. */
        float x() const { return simd4f_get_x(value); }
        float y() const { return simd4f_get_y(value); }
        float z() const { return simd4f_get_z(value); }
        float w() const { return simd4f_get_w(value); }
        /* Load from / store to a float array via simd4f_uload4 / simd4f_ustore4. */
        void load(const float *ary) { value = simd4f_uload4(ary); }
        void store(float *ary) const { simd4f_ustore4(value, ary); }
        /* Number of components exposed by this type. */
        enum { elements = 4 };
        /* Common constants. */
        static vec4f zero() { return vec4f(simd4f_zero()); }
        static vec4f one() { return vec4f(1.0f); }
        static vec4f xAxis() { return vec4f(1.0f, 0.0f, 0.0f, 0.0f); }
        static vec4f yAxis() { return vec4f(0.0f, 1.0f, 0.0f, 0.0f); }
        static vec4f zAxis() { return vec4f(0.0f, 0.0f, 1.0f, 0.0f); }
        static vec4f wAxis() { return vec4f(0.0f, 0.0f, 0.0f, 1.0f); }
        /* Conversions to narrower vector types; only declared here, the
           definitions must be provided elsewhere. */
        inline vec3f xyz() const;
        inline vec2f xy() const;
    };
    /* Unary minus, computed as zero - lhs. */
    vectorial_inline vec4f operator-(const vec4f& lhs) {
        const simd4f negated = simd4f_sub(simd4f_zero(), lhs.value);
        return vec4f(negated);
    }
    /* Vector (op) vector arithmetic, forwarded to the matching simd4f function. */
    vectorial_inline vec4f operator+(const vec4f& lhs, const vec4f& rhs) {
        const simd4f sum = simd4f_add(lhs.value, rhs.value);
        return vec4f(sum);
    }
    vectorial_inline vec4f operator-(const vec4f& lhs, const vec4f& rhs) {
        const simd4f difference = simd4f_sub(lhs.value, rhs.value);
        return vec4f(difference);
    }
    vectorial_inline vec4f operator*(const vec4f& lhs, const vec4f& rhs) {
        const simd4f product = simd4f_mul(lhs.value, rhs.value);
        return vec4f(product);
    }
    vectorial_inline vec4f operator/(const vec4f& lhs, const vec4f& rhs) {
        const simd4f quotient = simd4f_div(lhs.value, rhs.value);
        return vec4f(quotient);
    }
    /*
     Compound assignment with another vector. Each operator stores the result
     of the matching simd4f function in lhs and returns a reference to lhs,
     following the usual C++ convention, rather than a copy of it.
    */
    vectorial_inline vec4f& operator+=(vec4f& lhs, const vec4f& rhs) {
        lhs.value = simd4f_add(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec4f& operator-=(vec4f& lhs, const vec4f& rhs) {
        lhs.value = simd4f_sub(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec4f& operator*=(vec4f& lhs, const vec4f& rhs) {
        lhs.value = simd4f_mul(lhs.value, rhs.value);
        return lhs;
    }
    vectorial_inline vec4f& operator/=(vec4f& lhs, const vec4f& rhs) {
        lhs.value = simd4f_div(lhs.value, rhs.value);
        return lhs;
    }
    /* Vector (op) scalar: the scalar is splatted with simd4f_splat first. */
    vectorial_inline vec4f operator+(const vec4f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec4f( simd4f_add(lhs.value, scalar) );
    }
    vectorial_inline vec4f operator-(const vec4f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec4f( simd4f_sub(lhs.value, scalar) );
    }
    vectorial_inline vec4f operator*(const vec4f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec4f( simd4f_mul(lhs.value, scalar) );
    }
    vectorial_inline vec4f operator/(const vec4f& lhs, float rhs) {
        const simd4f scalar = simd4f_splat(rhs);
        return vec4f( simd4f_div(lhs.value, scalar) );
    }
    /* Scalar (op) vector: same as above with the operands swapped. */
    vectorial_inline vec4f operator+(float lhs, const vec4f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec4f( simd4f_add(scalar, rhs.value) );
    }
    vectorial_inline vec4f operator-(float lhs, const vec4f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec4f( simd4f_sub(scalar, rhs.value) );
    }
    vectorial_inline vec4f operator*(float lhs, const vec4f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec4f( simd4f_mul(scalar, rhs.value) );
    }
    vectorial_inline vec4f operator/(float lhs, const vec4f& rhs) {
        const simd4f scalar = simd4f_splat(lhs);
        return vec4f( simd4f_div(scalar, rhs.value) );
    }
    /*
     Compound assignment with a scalar. The scalar is splatted with
     simd4f_splat, the result is stored in lhs, and a reference to lhs is
     returned, following the usual C++ convention, rather than a copy of it.
    */
    vectorial_inline vec4f& operator+=(vec4f& lhs, float rhs) {
        lhs.value = simd4f_add(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec4f& operator-=(vec4f& lhs, float rhs) {
        lhs.value = simd4f_sub(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec4f& operator*=(vec4f& lhs, float rhs) {
        lhs.value = simd4f_mul(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    vectorial_inline vec4f& operator/=(vec4f& lhs, float rhs) {
        lhs.value = simd4f_div(lhs.value, simd4f_splat(rhs));
        return lhs;
    }
    /* Scalar results are read from the x lane of the simd4f helper's result. */
    vectorial_inline float dot(const vec4f& lhs, const vec4f& rhs) {
        const simd4f d = simd4f_dot4(lhs.value, rhs.value);
        return simd4f_get_x(d);
    }
    vectorial_inline float length(const vec4f& v) {
        const simd4f len = simd4f_length4(v.value);
        return simd4f_get_x(len);
    }
    vectorial_inline float length_squared(const vec4f& v) {
        const simd4f len_sq = simd4f_length4_squared(v.value);
        return simd4f_get_x(len_sq);
    }
    /* Wraps simd4f_normalize4; nothing here guards against a zero-length v. */
    vectorial_inline vec4f normalize(const vec4f& v) {
        const simd4f unit = simd4f_normalize4(v.value);
        return vec4f(unit);
    }
    /* Minimum / maximum via simd4f_min / simd4f_max. */
    vectorial_inline vec4f min(const vec4f& a, const vec4f& b) {
        const simd4f lowest = simd4f_min(a.value, b.value);
        return vec4f(lowest);
    }
    vectorial_inline vec4f max(const vec4f& a, const vec4f& b) {
        const simd4f highest = simd4f_max(a.value, b.value);
        return vec4f(highest);
    }
 }
 /*
  Lets std::min / std::max be called with vec4f arguments by forwarding to
  the vectorial overloads.
  NOTE(review): the C++ standard does not permit adding function overloads
  to namespace std; kept because existing callers may rely on it.
 */
 namespace std {
    inline ::vectorial::vec4f min(const ::vectorial::vec4f& a, const ::vectorial::vec4f& b) {
        return ::vectorial::min(a, b);
    }
    inline ::vectorial::vec4f max(const ::vectorial::vec4f& a, const ::vectorial::vec4f& b) {
        return ::vectorial::max(a, b);
    }
 }
 #ifdef VECTORIAL_OSTREAM
 #include <ostream>
 /* Streams v as "[ x, y, z, w ]" and returns the stream. */
 vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec4f& v) {
    os << "[ " << v.x();
    os << ", " << v.y();
    os << ", " << v.z();
    os << ", " << v.w() << " ]";
    return os;
 }
 #endif
 #endif
--- a/3rdparty/vectorial/include/vectorial/vec_convert.h
+++ b/3rdparty/vectorial/include/vectorial/vec_convert.h
@ -0,0 +1,31 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_VEC_CONVERT_H
 #define VECTORIAL_VEC_CONVERT_H
 namespace vectorial {
    // --- vec4f -> narrower vectors (constructed from the same underlying value) ---
    inline vec3f vec4f::xyz() const {
        return vec3f(value);
    }
    inline vec2f vec4f::xy() const {
        return vec2f(value);
    }

    // --- vec3f conversions ---
    // xyz0: w lane cleared via simd4f_zero_w.
    inline vec4f vec3f::xyz0() const {
        return vec4f(simd4f_zero_w(value));
    }
    // xyz1: xyz0 plus (0, 0, 0, 1).
    inline vec4f vec3f::xyz1() const {
        const vec4f wOne(0.0f, 0.0f, 0.0f, 1.0f);
        return xyz0() + wOne;
    }
    // xyzw: xyz0 plus (0, 0, 0, w).
    inline vec4f vec3f::xyzw(float w) const {
        const vec4f wOnly(0.0f, 0.0f, 0.0f, w);
        return xyz0() + wOnly;
    }
    inline vec3f vec3f::xyz() const {
        return vec3f(value);
    }
    // xy0: z is removed by multiplying with (1, 1, 0).
    inline vec3f vec3f::xy0() const {
        const vec3f keepXY(1.0f, 1.0f, 0.0f);
        return vec3f(value) * keepXY;
    }
    inline vec2f vec3f::xy() const {
        return vec2f(value);
    }

    // --- vec2f conversions ---
    // xy00: z and w lanes cleared via simd4f_zero_zw.
    inline vec4f vec2f::xy00() const {
        return vec4f(simd4f_zero_zw(value));
    }
    // xy01: xy00 plus (0, 0, 0, 1).
    inline vec4f vec2f::xy01() const {
        const vec4f wOne(0.0f, 0.0f, 0.0f, 1.0f);
        return xy00() + wOne;
    }
    // xyzw: xy00 plus (0, 0, z, w).
    inline vec4f vec2f::xyzw(float z, float w) const {
        const vec4f tail(0.0f, 0.0f, z, w);
        return xy00() + tail;
    }
    inline vec3f vec2f::xy0() const {
        return vec3f(simd4f_zero_zw(value));
    }
    inline vec2f vec2f::xy() const {
        return vec2f(value);
    }
 }
 #endif
--- a/3rdparty/vectorial/include/vectorial/vectorial.h
+++ b/3rdparty/vectorial/include/vectorial/vectorial.h
@ -0,0 +1,19 @@
 /*
  Vectorial
  Copyright (c) 2010 Mikko Lehtonen
  Licensed under the terms of the two-clause BSD License (see LICENSE)
 */
 #ifndef VECTORIAL_VECTORIAL_H
 #define VECTORIAL_VECTORIAL_H
 #include "vectorial/vec2f.h"
 #include "vectorial/vec3f.h"
 #include "vectorial/vec4f.h"
 #include "vectorial/vec_convert.h"
 #include "vectorial/mat4f.h"
 #endif
--- a/3rdparty/vectorial/spec/spec.cpp
+++ b/3rdparty/vectorial/spec/spec.cpp
@ -0,0 +1,229 @@
 /* Specific - Minimal C++ spec framework.
 The zlib/libpng License
 Copyright (c) 2008 Mikko Lehtonen
 This software is provided 'as-is', without any express or implied
 warranty. In no event will the authors be held liable for any damages
 arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it
 freely, subject to the following restrictions:
    1. The origin of this software must not be misrepresented; you must not
    claim that you wrote the original software. If you use this software
    in a product, an acknowledgment in the product documentation would be
    appreciated but is not required.
    2. Altered source versions must be plainly marked as such, and must not be
    misrepresented as being the original software.
    3. This notice may not be removed or altered from any source
    distribution.
 */
 #include "spec.h"
 #include <iostream>
 namespace specific {
    void SpecWriter::startGroup(std::string /*group*/, std::string /*description*/) {}
    void SpecWriter::addFailedAssertation(std::string msg, const char *file, int line) {
        mFailures.push_back( SpecFailure(msg,file,line) );
    }
    void SpecWriter::addSpecResult(SpecResult r) {
        mResults.push_back( r );
    }
    void SpecWriter::start() {}
    void SpecWriter::stop() {
        std::cout << std::endl;
        size_t nth = 0;
        for(std::vector<SpecFailure>::iterator i=mFailures.begin(); i != mFailures.end(); ++i, ++nth)
        {
            std::cout << std::endl;
            std::cout << (nth+1) << ") Failed assertation at " << i->file << ":"
            << i->line << ":" << std::endl << "  " << i->msg << std::endl;
        }
        std::cout << std::endl << mResults.size() << " examples, " << mFailures.size() << " failures" << std::endl;
    }
    // Records the result, then prints one progress character for it:
    // "." passed, "F" failed, "E" errored. Output is flushed immediately.
    void ProgressWriter::addSpecResult(SpecResult r) {
        SpecWriter::addSpecResult(r);
        if (r.type == SpecResult::PASSED) {
            std::cout << ".";
        } else if (r.type == SpecResult::FAILED) {
            std::cout << "F";
        } else if (r.type == SpecResult::ERRORED) {
            std::cout << "E";
        }
        std::cout << std::flush;
    }
    // Prints a "<group>: <description>" heading for the group being started.
    void SpecdocWriter::startGroup(std::string group, std::string description) {
        std::cout << group << ": " << description << std::endl;
    }
    // Records the result, then prints "- <test>" with a status suffix.
    // For failures and errors the suffix carries the current number of
    // recorded failures, which matches the numbering printed by stop().
    void SpecdocWriter::addSpecResult(SpecResult r) {
        SpecWriter::addSpecResult(r);
        const size_t failureCount = mFailures.size();
        std::cout << "- " << r.test;
        if (r.type == SpecResult::PASSED) {
            std::cout << " [OK]";
        } else if (r.type == SpecResult::FAILED) {
            std::cout << " [FAILED - " << failureCount << "]";
        } else if (r.type == SpecResult::ERRORED) {
            std::cout << " [ERROR - "<< failureCount <<"]";
        }
        std::cout << std::endl;
    }
    // Empty tag type thrown by SpecBase::should_test when an assertion fails;
    // SpecRunner::run catches it to abort the current spec.
    class spec_failure {};
    // Initializes every scalar member and registers this spec with the
    // global SpecRunner. mFile and mLine were previously left uninitialized;
    // they are now zeroed so that any later read is well defined.
    // (mErrorMessage is a std::string and default-constructs to empty.)
    SpecBase::SpecBase() : mWriter(NULL), mName(NULL),
        mFailed(false), mLastFailed(false), mError(false), mExecutionPoint(0), mContinuePoint(0),
        mFile(NULL), mLine(0)
    {
        SpecRunner::getInstance().add(this);
    }
    // Nothing to release: the spec does not own its writer.
    SpecBase::~SpecBase() {
    }
    bool SpecBase::startSpec(const char* name) 
    {
        endSpec();
        mExecutionPoint++;
        if(mExecutionPoint <= mContinuePoint) return false;
        mContinuePoint++;
        mName = name;
        return true;
    }
    void SpecBase::endSpec() 
    {
        if(!mName) return;
        SpecResult r;
        r.group = getGroup();
        r.description = getDescription();
        r.type = SpecResult::PASSED;
        if(mLastFailed) r.type = SpecResult::FAILED;
        if(mError) r.type = SpecResult::ERRORED;
        r.test = mName;
        mWriter->addSpecResult( r );
        mName = NULL; 
    }
    // Core assertion. When `value` is true only the last-failed flag is
    // cleared. When it is false the failure is recorded with the writer,
    // the spec is marked failed and spec_failure is thrown to abort it.
    void SpecBase::should_test(bool value, const char* message, const char* file, int line) {
        mLastFailed = false;
        if (value) {
            return;
        }
        mWriter->addFailedAssertation(message, file, line);
        mFailed = true;
        mLastFailed = true;
        throw spec_failure();
    }
    // Records `msg` as a failure located at "exception":0 and raises the
    // last-failed, failed and error flags.
    void SpecBase::error(std::string msg) {
        mWriter->addFailedAssertation(msg, "exception", 0);
        mLastFailed = mFailed = mError = true;
    }
    bool SpecBase::done() {
        if( mError ) {
            mError = false;
            return false;
        }
        return true;
    }
    SpecRunner::SpecRunner() {
    }
    SpecRunner::~SpecRunner() {
    }
    // Returns the process-wide runner. It is heap-allocated on the first
    // call and never deleted.
    SpecRunner& SpecRunner::getInstance() {
        static SpecRunner* const runner = new SpecRunner;
        return *runner;
    }
    // Runs every registered spec whose group name contains `subset`
    // (an empty subset matches all groups) and reports through `writer`.
    // Returns true when no selected spec recorded a failure.
    //
    // specify() is re-entered after every aborted spec: startSpec() skips
    // the specs that already ran, so each pass continues with the next one.
    bool SpecRunner::run(SpecWriter& writer, const std::string subset) {
        bool success = true;
        writer.start();
        for(std::vector<SpecBase*>::iterator i = mSpecs.begin(); i != mSpecs.end(); ++i) {
            SpecBase *b = *i;
            if( b->getGroup().find(subset, 0) == std::string::npos ) continue;
            b->mContinuePoint = 0;
            b->setWriter(&writer);
            writer.startGroup( b->getGroup(), b->getDescription() );
            bool rerun;
            do {
                // A failed assertion must trigger another pass but must NOT
                // set mError: endSpec() reports ERRORED whenever mError is
                // set, which previously turned every assertion failure into
                // an "error" and made the FAILED result unreachable.
                bool assertionFailed = false;
                b->mExecutionPoint = 0;
                try {
                    b->specify();
                } catch(const spec_failure&) {
                    assertionFailed = true;
                } catch(const std::exception& e) {
                    b->error(e.what());
                } catch( ... ) {
                    b->error("unknown exception");
                }
                b->endSpec();
                // done() also clears the error flag, so call it every pass.
                const bool errored = !b->done();
                rerun = assertionFailed || errored;
            } while( rerun );
            success = success && b->isSuccessful();
        }
        writer.stop();
        return success;
    }
 }
--- a/3rdparty/vectorial/spec/spec.h
+++ b/3rdparty/vectorial/spec/spec.h
@ -0,0 +1,217 @@
 /* Specific - Minimal C++ spec framework.
 The zlib/libpng License
 Copyright (c) 2008 Mikko Lehtonen
 This software is provided 'as-is', without any express or implied
 warranty. In no event will the authors be held liable for any damages
 arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it
 freely, subject to the following restrictions:
    1. The origin of this software must not be misrepresented; you must not
    claim that you wrote the original software. If you use this software
    in a product, an acknowledgment in the product documentation would be
    appreciated but is not required.
    2. Altered source versions must be plainly marked as such, and must not be
    misrepresented as being the original software.
    3. This notice may not be removed or altered from any source
    distribution.
 */
 #ifndef SPECIFIC_SPEC_H
 #define SPECIFIC_SPEC_H
 #include <string>
 #include <vector>
 #include <stdexcept>
 #include <sstream>
 namespace specific {
    // Outcome record for one spec, passed from SpecBase::endSpec to the writer.
    class SpecResult {
    public:
        // Possible outcomes of a spec.
        typedef enum {
            PASSED,
            FAILED,
            ERRORED
        } Type;
        Type type;
        std::string group;        // value of SpecBase::getGroup()
        std::string description;  // value of SpecBase::getDescription()
        std::string test;         // name given to it(...)
    };
    // One failed assertion: its message and the source location it was
    // reported from. `file` is stored as a raw pointer and is not copied.
    class SpecFailure {
    public:
        SpecFailure(std::string amsg, const char* afile, int aline)
            : msg(amsg), file(afile), line(aline) { }
        std::string msg;
        const char* file;
        int line;
    };
    // Base class for result reporters. Collects results and failures;
    // subclasses override the virtual hooks to print progress.
    class SpecWriter {
    public:
        std::vector<SpecResult> mResults;    // one entry per finished spec
        std::vector<SpecFailure> mFailures;  // one entry per failed assertion or error
        SpecWriter() {}
        virtual ~SpecWriter() {}
        // Called before the specs of a group are run.
        virtual void startGroup(std::string group, std::string description);
        // Records a failed assertion with its source location.
        virtual void addFailedAssertation(std::string msg, const char *file, int line);
        // Records the result of one finished spec.
        virtual void addSpecResult(SpecResult r);
        // Called once before any spec runs.
        virtual void start();
        // Called once after all specs ran; prints failures and a summary.
        virtual void stop();
    };
    // Writer that prints one character per spec result.
    class ProgressWriter : public SpecWriter {
    public:
        void addSpecResult(SpecResult r);
    };
    // Writer that prints a heading per group and one line per spec result.
    class SpecdocWriter : public SpecWriter {
    public:
        void startGroup(std::string group, std::string description);
        void addSpecResult(SpecResult r);
    };
    // Returns the text produced by streaming `value` with operator<<.
    template<class T> std::string inspect(const T& value) {
        std::ostringstream formatted;
        formatted << value;
        return formatted.str();
    }
    // Base class of every spec group generated by the describe(...) macro.
    // The constructor registers the instance with SpecRunner; specify()
    // holds the body written by the user.
    class SpecBase {
    public:
        SpecBase();
        virtual ~SpecBase();
        // Body of the describe(...) block.
        virtual void specify() = 0;
        void setWriter(SpecWriter* w) { mWriter = w; }
        // Begins the spec `name`; returns false when it must be skipped
        // because it already ran in an earlier pass over specify().
        bool startSpec(const char* name);
        // Reports the currently open spec, if any, to the writer.
        void endSpec();
        // Core assertion: records a failure and throws when `value` is false.
        void should_test(bool value, const char* message, const char* file, int line);
        // Asserts a == b; the failure message shows both values via inspect().
        template<typename T1, typename T2> void should_equal_template(const T1& a, const T2& b, const char* file, int line) {
            std::stringstream ss;
            ss << "`" << ::specific::inspect(a) << "'" << " == " << "`" << ::specific::inspect(b) << "'";
            should_test( a == b, ss.str().c_str(), file, line);
        }
        // Asserts a != b; the failure message shows both values via inspect().
        template<typename T1, typename T2> void should_not_equal_template(const T1& a, const T2& b, const char* file, int line) {
            std::stringstream ss;
            ss << "`" << ::specific::inspect(a) << "'" << " != " << "`" << ::specific::inspect(b) << "'";
            should_test( a != b, ss.str().c_str(), file, line);
        }
        virtual std::string getGroup() = 0;
        virtual std::string getDescription() = 0;
        // True while no assertion failed and no error was recorded.
        bool isSuccessful() { return !mFailed; }
        // False once after an error was flagged (clears the flag), else true.
        bool done();
        // Records `msg` as a failure and flags the spec as errored.
        void error(std::string msg);
        SpecWriter* mWriter;     // not owned; set through setWriter()
        const char* mName;       // name of the open spec, NULL when none is open
        bool mFailed;            // sticky: set by any failure in this group
        bool mLastFailed;        // outcome of the most recent should_test/error
        bool mError;             // set by error(); cleared by done()
        int mExecutionPoint;     // count of startSpec() calls in the current pass
        int mContinuePoint;      // count of specs started so far across passes
        // NOTE(review): mFile, mErrorMessage and mLine are not referenced by
        // spec.cpp as far as visible here - confirm before relying on them.
        char *mFile;
        std::string mErrorMessage;
        int mLine;
    };
    // Registry of all spec groups; accessed through getInstance() only
    // (constructor and destructor are private).
    class SpecRunner {
    public:
        static SpecRunner& getInstance();
        // Registers a spec group; the pointer is stored, not owned.
        void add(SpecBase* spec) { mSpecs.push_back( spec ); }
        // Runs all groups whose name contains `subset`; returns true when
        // every selected group was successful.
        bool run(SpecWriter& writer, const std::string subset = "");
    private:
        std::vector<SpecBase*> mSpecs;
        SpecRunner();
        ~SpecRunner();
    };
    // Token-pasting helpers. The extra indirection level lets __LINE__ be
    // expanded before it is pasted into the generated identifier.
    #define SPEC_UNIQUE_NAME3(x,y) x##y
    #define SPEC_UNIQUE_NAME2(x,y) SPEC_UNIQUE_NAME3(x,y)
    // Class name for a describe block: SPEC_<group>_startingOnLine<line>.
    #define SPEC_NAME(x) SPEC_UNIQUE_NAME2(SPEC_##x, SPEC_UNIQUE_NAME2(_startingOnLine, __LINE__) )
    // describe(group, description) { ... } declares a SpecBase subclass,
    // creates one static instance of it (whose constructor registers it with
    // SpecRunner) and opens the definition of its specify() method; the
    // braces that follow the macro become the method body.
    #define describe(group, description)                                    \
    class SPEC_NAME(group) : public specific::SpecBase                         \
    {                                                                       \
    public:                                                                 \
        void specify();                                                     \
        std::string getGroup() { return #group; }                           \
        std::string getDescription() { return description; }                \
    };                                                                      \
    static SPEC_NAME(group) SPEC_UNIQUE_NAME2(SPEC_NAME(group), _instance); \
    void SPEC_NAME(group)::specify()
    // it("...") { ... } runs its body only when startSpec() selects the spec.
    #define it(description) if(startSpec(description))
    // Matchers
    // should_be_true(a): passes when `a` is true; the failure message is the
    // source text of the expression.
    #define should_be_true(a) should_test(a, #a, __FILE__, __LINE__)
    // should_be_false(a): passes when `a` is false. The argument is
    // parenthesized before negation so that compound expressions such as
    // should_be_false(x == y) negate the whole expression instead of
    // expanding to (!x) == y.
    #define should_be_false(a) should_be_true( !(a) )
    // should_equal / should_not_equal: with stream support the failure
    // message shows both values; with SPECIFIC_NO_OSTREAM defined they fall
    // back to a plain boolean check.
    #ifndef SPECIFIC_NO_OSTREAM
        #define should_equal(a, b) should_equal_template( a,b, __FILE__, __LINE__ )
        #define should_not_equal(a, b) should_not_equal_template( a,b, __FILE__, __LINE__ )
    #else
        #define should_equal(a, b) should_be_true( (a) == (b) )
        #define should_not_equal(a, b) should_be_true( (a) != (b) )
    #endif
    // should_throw(code, what): runs `code` and passes only when it throws
    // an exception that can be caught as `what&`. Any other exception type
    // propagates out of the macro.
    #define should_throw(code, what) \
    do {                             \
        bool _thrown = false;        \
        try {                        \
          code ;                     \
        } catch(what& e) {           \
            _thrown = true;          \
        }                            \
        should_test(_thrown, "should throw exception " #what, __FILE__, __LINE__); \
    } while(0)
 }
 #endif /* Include guard */
--- a/3rdparty/vectorial/spec/spec_helper.h
+++ b/3rdparty/vectorial/spec/spec_helper.h
@ -0,0 +1,215 @@
 #ifndef VECTORIAL_SPEC_HELPER_H
 #define VECTORIAL_SPEC_HELPER_H
 #define VECTORIAL_OSTREAM
 #include "spec.h"
 #include "vectorial/vectorial.h"
 #ifdef VECTORIAL_HAVE_SIMD2F
 #include "vectorial/simd2f.h"
 #endif
 #include <cmath>
 #include <cstdlib>
 #include <iostream>
 // Matcher front-ends for use inside describe/it blocks. Each forwards to
 // the helper function of the same name with a trailing underscore, adding
 // the current spec (`this`) and the call site's file and line.
 #define should_be_close_to(a,b,tolerance) should_be_close_to_(this, a,b,tolerance,__FILE__,__LINE__)
 #define should_be_equal_simd4f( a, b, tolerance) should_be_equal_simd4f_(this, a,b,tolerance,__FILE__,__LINE__)
 #define should_be_equal_simd2f( a, b, tolerance) should_be_equal_simd2f_(this, a,b,tolerance,__FILE__,__LINE__)
 #define should_be_equal_vec4f( a, b, tolerance) should_be_equal_vec4f_(this, a,b,tolerance,__FILE__,__LINE__)
 #define should_be_equal_vec3f( a, b, tolerance) should_be_equal_vec3f_(this, a,b,tolerance,__FILE__,__LINE__)
 #define should_be_equal_vec2f( a, b, tolerance) should_be_equal_vec2f_(this, a,b,tolerance,__FILE__,__LINE__)
 #define should_be_equal_simd4x4f( a, b, tolerance) should_be_equal_simd4x4f_(this, a,b,tolerance,__FILE__,__LINE__)
 #define should_be_equal_mat4f( a, b, tolerance) should_be_equal_mat4f_(this, a,b,tolerance,__FILE__,__LINE__)
 // Based on:
 // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
 //
 // Returns true when A and B are at most maxUlps representable floats apart.
 // The bit patterns are remapped so that integer order matches float order
 // (negative floats map to negative integers, -0.0f and +0.0f both map to 0),
 // then the distance between the two integers is compared with maxUlps.
 // A negative maxUlps never matches. Assumes a 32-bit int and a 32-bit float.
 static inline bool compare_floats(float A, float B, int maxUlps)
 {
    // Make sure maxUlps is non-negative and small enough that the
    // default NAN won't compare as equal to anything.
    // assert(maxUlps > 0 && maxUlps < 4 * 1024 * 1024);
    union {
        float f;
        int i;
    } f2iA, f2iB;
    f2iA.f = A;
    f2iB.f = B;
    // Smallest 32-bit int, spelled without <climits>.
    const int kIntMin = -2147483647 - 1;
    // Make aInt lexicographically ordered as a twos-complement int.
    // For negative bit patterns kIntMin - aInt lies in [kIntMin + 1, 0],
    // so the subtraction cannot overflow.
    int aInt = f2iA.i;
    if (aInt < 0)
        aInt = kIntMin - aInt;
    // Make bInt lexicographically ordered as a twos-complement int
    int bInt = f2iB.i;
    if (bInt < 0)
        bInt = kIntMin - bInt;
    // The distance between two remapped values can exceed INT_MAX (for
    // example a large positive against a large negative float), so computing
    // abs(aInt - bInt) in int overflows. Subtract in unsigned arithmetic
    // instead, where the true distance (< 2^32) is always representable.
    const unsigned int distance = (aInt >= bInt)
        ? static_cast<unsigned int>(aInt) - static_cast<unsigned int>(bInt)
        : static_cast<unsigned int>(bInt) - static_cast<unsigned int>(aInt);
    if (maxUlps < 0)
        return false;
    return distance <= static_cast<unsigned int>(maxUlps);
 }
 // Asserts on `spec` that a and b pass compare_floats with the given
 // tolerance; the failure message shows both values and the tolerance.
 static inline void should_be_close_to_(specific::SpecBase *spec, float a, float b, int tolerance, const char *file, int line) {
    const bool withinTolerance = compare_floats(a, b, tolerance);
    std::stringstream description;
    description << a << " == " << b << " (with tolerance of " << tolerance << ")";
    const std::string text = description.str();
    spec->should_test(withinTolerance, text.c_str(), file, line);
 }
 #ifdef VECTORIAL_HAVE_SIMD2F
 // Asserts on `spec` that both lanes of a and b pass compare_floats.
 static inline void should_be_equal_simd2f_(specific::SpecBase *spec, const simd2f& a, const simd2f& b, int tolerance, const char *file, int line) {
    bool matches = true;
    matches &= compare_floats( simd2f_get_x(a), simd2f_get_x(b), tolerance);
    matches &= compare_floats( simd2f_get_y(a), simd2f_get_y(b), tolerance);
    std::stringstream description;
    description << a << " == " << b << " (with tolerance of " << tolerance << ")";
    const std::string text = description.str();
    spec->should_test(matches, text.c_str(), file, line);
 }
 #endif
 // Asserts on `spec` that all four lanes of a and b pass compare_floats.
 static inline void should_be_equal_simd4f_(specific::SpecBase *spec, const simd4f& a, const simd4f& b, int tolerance, const char *file, int line) {
    bool matches = true;
    matches &= compare_floats( simd4f_get_x(a), simd4f_get_x(b), tolerance);
    matches &= compare_floats( simd4f_get_y(a), simd4f_get_y(b), tolerance);
    matches &= compare_floats( simd4f_get_z(a), simd4f_get_z(b), tolerance);
    matches &= compare_floats( simd4f_get_w(a), simd4f_get_w(b), tolerance);
    std::stringstream description;
    description << a << " == " << b << " (with tolerance of " << tolerance << ")";
    const std::string text = description.str();
    spec->should_test(matches, text.c_str(), file, line);
 }
 // Asserts on `spec` that x, y, z and w of a and b pass compare_floats.
 static inline void should_be_equal_vec4f_(specific::SpecBase *spec, const vectorial::vec4f& a, const vectorial::vec4f& b, int tolerance, const char *file, int line) {
    bool matches = true;
    matches &= compare_floats( a.x(), b.x(), tolerance);
    matches &= compare_floats( a.y(), b.y(), tolerance);
    matches &= compare_floats( a.z(), b.z(), tolerance);
    matches &= compare_floats( a.w(), b.w(), tolerance);
    std::stringstream description;
    description << a << " == " << b << " (with tolerance of " << tolerance << ")";
    const std::string text = description.str();
    spec->should_test(matches, text.c_str(), file, line);
 }
 // Asserts on `spec` that x, y and z of a and b pass compare_floats.
 static inline void should_be_equal_vec3f_(specific::SpecBase *spec, const vectorial::vec3f& a, const vectorial::vec3f& b, int tolerance, const char *file, int line) {
    bool matches = true;
    matches &= compare_floats( a.x(), b.x(), tolerance);
    matches &= compare_floats( a.y(), b.y(), tolerance);
    matches &= compare_floats( a.z(), b.z(), tolerance);
    std::stringstream description;
    description << a << " == " << b << " (with tolerance of " << tolerance << ")";
    const std::string text = description.str();
    spec->should_test(matches, text.c_str(), file, line);
 }
 // Asserts on `spec` that x and y of a and b pass compare_floats.
 static inline void should_be_equal_vec2f_(specific::SpecBase *spec, const vectorial::vec2f& a, const vectorial::vec2f& b, int tolerance, const char *file, int line) {
    bool matches = true;
    matches &= compare_floats( a.x(), b.x(), tolerance);
    matches &= compare_floats( a.y(), b.y(), tolerance);
    std::stringstream description;
    description << a << " == " << b << " (with tolerance of " << tolerance << ")";
    const std::string text = description.str();
    spec->should_test(matches, text.c_str(), file, line);
 }
 // Asserts on `spec` that all 16 elements of a and b pass compare_floats.
 // The four simd4f members x, y, z, w are visited in that order and each
 // one has its four lanes compared.
 static inline void should_be_equal_simd4x4f_(specific::SpecBase *spec, const simd4x4f& a, const simd4x4f& b, int tolerance, const char *file, int line) {
    const simd4f* const lhs[4] = { &a.x, &a.y, &a.z, &a.w };
    const simd4f* const rhs[4] = { &b.x, &b.y, &b.z, &b.w };
    bool matches = true;
    for (int col = 0; col < 4; ++col) {
        matches &= compare_floats( simd4f_get_x(*lhs[col]), simd4f_get_x(*rhs[col]), tolerance);
        matches &= compare_floats( simd4f_get_y(*lhs[col]), simd4f_get_y(*rhs[col]), tolerance);
        matches &= compare_floats( simd4f_get_z(*lhs[col]), simd4f_get_z(*rhs[col]), tolerance);
        matches &= compare_floats( simd4f_get_w(*lhs[col]), simd4f_get_w(*rhs[col]), tolerance);
    }
    std::stringstream description;
    description << a << " == " << b << " (with tolerance of " << tolerance << ")";
    const std::string text = description.str();
    spec->should_test(matches, text.c_str(), file, line);
 }
 // Asserts on `spec` that all 16 elements of the two matrices pass
 // compare_floats. The members value.x, value.y, value.z, value.w are
 // visited in that order and each one has its four lanes compared.
 static inline void should_be_equal_mat4f_(specific::SpecBase *spec, const vectorial::mat4f& a, const vectorial::mat4f& b, int tolerance, const char *file, int line) {
    const simd4f* const lhs[4] = { &a.value.x, &a.value.y, &a.value.z, &a.value.w };
    const simd4f* const rhs[4] = { &b.value.x, &b.value.y, &b.value.z, &b.value.w };
    bool matches = true;
    for (int col = 0; col < 4; ++col) {
        matches &= compare_floats( simd4f_get_x(*lhs[col]), simd4f_get_x(*rhs[col]), tolerance);
        matches &= compare_floats( simd4f_get_y(*lhs[col]), simd4f_get_y(*rhs[col]), tolerance);
        matches &= compare_floats( simd4f_get_z(*lhs[col]), simd4f_get_z(*rhs[col]), tolerance);
        matches &= compare_floats( simd4f_get_w(*lhs[col]), simd4f_get_w(*rhs[col]), tolerance);
    }
    std::stringstream description;
    description << a << " == " << b << " (with tolerance of " << tolerance << " ulps)";
    const std::string text = description.str();
    spec->should_test(matches, text.c_str(), file, line);
 }
 #endif
--- a/3rdparty/vectorial/spec/spec_main.cpp
+++ b/3rdparty/vectorial/spec/spec_main.cpp
@ -0,0 +1,55 @@
 /* Specific - Minimal C++ spec framework.
 The zlib/libpng License
 Copyright (c) 2008 Mikko Lehtonen
 This software is provided 'as-is', without any express or implied
 warranty. In no event will the authors be held liable for any damages
 arising from the use of this software.
 Permission is granted to anyone to use this software for any purpose,
 including commercial applications, and to alter it and redistribute it
 freely, subject to the following restrictions:
    1. The origin of this software must not be misrepresented; you must not
    claim that you wrote the original software. If you use this software
    in a product, an acknowledgment in the product documentation would be
    appreciated but is not required.
    2. Altered source versions must be plainly marked as such, and must not be
    misrepresented as being the original software.
    3. This notice may not be removed or altered from any source
    distribution.
 */
 #include "spec.h"
 #include <cstdlib>
 // Spec runner entry point.
 //   -s           use the specdoc writer instead of the progress writer
 //   <anything>   treated as the group subset filter (the last one wins)
 // Exits with EXIT_SUCCESS only when every selected spec group succeeded.
 int main(int argc, char *argv[])
 {
    std::string subset;
    specific::ProgressWriter progressWriter;
    specific::SpecdocWriter specdocWriter;
    specific::SpecWriter* writer = &progressWriter;
    for (int i = 1; i < argc; ++i) {
        const std::string arg(argv[i]);
        if (arg == "-s") {
            writer = &specdocWriter;
        } else {
            subset = arg;
        }
    }
    if (specific::SpecRunner::getInstance().run(*writer, subset)) {
        return EXIT_SUCCESS;
    }
    return EXIT_FAILURE;
 }
--- a/3rdparty/vectorial/spec/spec_mat4f.cpp
+++ b/3rdparty/vectorial/spec/spec_mat4f.cpp
@ -0,0 +1,29 @@
 #include "spec_helper.h"
 #include <iostream>
 using vectorial::vec4f;
 using vectorial::mat4f;
 const int epsilon = 1;
 // Construction specs for mat4f. Expected values are written as
 // simd4x4f_create(...) of four simd4f_create(...) calls; the "octave"
 // comments show the same matrix in Octave notation.
 describe(mat4f, "constructing") {
    it("should have default constructor that does nothing..") {
        // Only checks that default construction compiles and runs.
        mat4f x;
    }
    it("should have constructor that constructs from four vec4") {
        mat4f x( vec4f(1,2,3,4), vec4f(5,6,7,8), vec4f(9,10,11,12), vec4f(13,14,15,16) );
        // octave mat4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ]
        should_be_equal_mat4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), simd4f_create(5.000000000000000f, 6.000000000000000f, 7.000000000000000f, 8.000000000000000f), simd4f_create(9.000000000000000f, 10.000000000000000f, 11.000000000000000f, 12.000000000000000f), simd4f_create(13.000000000000000f, 14.000000000000000f, 15.000000000000000f, 16.000000000000000f)), epsilon );
    }
    it("should have static function to create identity matrix") {
        mat4f x = mat4f::identity();
        // octave mat4f: [1,0,0,0;0,1,0,0;0,0,1,0;0,0,0,1]
        should_be_equal_mat4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 1.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 1.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 1.000000000000000f)), epsilon );
    }
 }
--- a/3rdparty/vectorial/spec/spec_simd2f.cpp
+++ b/3rdparty/vectorial/spec/spec_simd2f.cpp
@ -0,0 +1,242 @@
 #include "spec_helper.h"
 const int epsilon = 1;
 #ifdef VECTORIAL_HAVE_SIMD2F
 // Prints the active SIMD backend name; contains no assertions.
 describe(simd2f, "sanity") {
    it("VECTORIAL_SIMD_TYPE should be defined to a string") {
        std::cout << "Simd type: " << VECTORIAL_SIMD_TYPE << std::endl;
    }
 }
 // Specs for constructing simd2f values.
 describe(simd2f, "creating") {
    it("should be possible to create with simd2f_create") {
        simd2f x = simd2f_create(1, 2);
        // Check each lane through its getter, then the vector as a whole.
        should_be_close_to( simd2f_get_x(x), 1, epsilon);
        should_be_close_to( simd2f_get_y(x), 2, epsilon);
        // octave simd2f: [1,2]
        should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 2.000000000000000f), epsilon );
    }
    it("should have simd2f_zero for zero vector") {
        simd2f x = simd2f_zero();
        // octave simd2f: [0,0]
        should_be_equal_simd2f(x, simd2f_create(0.000000000000000f, 0.000000000000000f), epsilon );
    }
 }
 #ifdef _MSC_VER
 #include <malloc.h>
 #else
 #include <alloca.h>
 #endif
 // Stack-allocates room for n floats plus 4 bytes and returns a float*
 // offset 4 bytes into the allocation. The argument is parenthesized so
 // that expressions such as unaligned_mem(a + b) reserve space for all
 // a + b floats instead of expanding to sizeof(float)*a + b + 4 bytes.
 #define unaligned_mem(n) ((float*)((unsigned char*)alloca(sizeof(float)*(n)+4)+4))
 // Specs for simd2f load/store, splat, reciprocal and square-root helpers.
 // Buffers come from unaligned_mem() so the unaligned load/store variants
 // are exercised on memory offset from what alloca returned.
 describe(simd2f, "utilities") {
    it("should have simd2f_uload2 for loading two float values from float an unaligned array into simd2f") {
        float *f = unaligned_mem(2);
        f[0] = 1;
        f[1] = 2;
        simd2f x = simd2f_uload2(f);
        // octave simd2f: [1,2]
        should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 2.000000000000000f), epsilon );
    }
    it("should have simd2f_ustore2 for storing two float values from simd2f to an unaligned array") {
        float *f = unaligned_mem(2);
        // Pre-fill with sentinels so an unwritten slot is detected.
        f[0] = -1;
        f[1] = -1;
        simd2f a = simd2f_create(1,2);
        simd2f_ustore2(a, f);
        should_be_close_to(f[0], 1, epsilon);
        should_be_close_to(f[1], 2, epsilon);
    }
    it("should have simd2f_splat that expands a single scalar to all elements") {
        simd2f x = simd2f_splat(42);
        // octave simd2f: [42,42]
        should_be_equal_simd2f(x, simd2f_create(42.000000000000000f, 42.000000000000000f), epsilon );
    }
    it("should have simd2f_splat_x,y splatting of an element") {
        simd2f a = simd2f_create(1,2);
        simd2f x;
        x = simd2f_splat_x(a);
        // octave simd2f: [1,1]
        should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 1.000000000000000f), epsilon );
        x = simd2f_splat_y(a);
        // octave simd2f: [2,2]
        should_be_equal_simd2f(x, simd2f_create(2.000000000000000f, 2.000000000000000f), epsilon );
    }
 // Disabled spec: compiled out with #if 0.
 #if 0
    it("should have simd2f_sum that adds elements") {
        simd2f a = simd2f_create(1,2);
        simd2f x = simd2f_sum(a);
        // octave simd2f: [sum([1,2]), sum([1,2,3,4])]
        should_be_equal_simd2f(x, simd2f_create(3.000000000000000f, 10.000000000000000f), epsilon );
    }
 #endif
    it("should have simd2f_reciprocal") {
        simd2f a = simd2f_create(0.00001f, 2.00001f);
        simd2f x = simd2f_reciprocal(a);
        // octave simd2f: 1 ./ [0.00001, 2.00001]
        should_be_equal_simd2f(x, simd2f_create(99999.999999999985448f, 0.499997500012500f), epsilon );
    }
    it("should have simd2f_sqrt") {
        simd2f a = simd2f_create(0.00001f, 2.00001f);
        simd2f x = simd2f_sqrt(a);
        // octave simd2f:  sqrt([0.00001, 2.00001])
        should_be_equal_simd2f(x, simd2f_create(0.003162277660168f, 1.414217097902582f), epsilon );
        // Square root of zero must be exactly representable as zero.
        x = simd2f_sqrt( simd2f_create(0.0f, 0.0f) );
        // octave simd2f:  sqrt([0, 0])
        should_be_equal_simd2f(x, simd2f_create(0.000000000000000f, 0.000000000000000f), epsilon );
    }
    it("should have simd2f_rsqrt for reciprocal of square-root") {
        simd2f a = simd2f_create(0.00001f, 2.00001f);
        simd2f x = simd2f_rsqrt(a);
        // Local epsilon shadows the file-level one for this spec only.
        const int epsilon = 4; // Grant larger error
        // octave simd2f:  1 ./ sqrt([0.00001, 2.00001])
        should_be_equal_simd2f(x, simd2f_create(316.227766016837904f, 0.707105013426224f), epsilon );
    }
 }
 // Specs for component-wise simd2f arithmetic. The "octave" comments give
 // the expression the expected values were generated from.
 describe(simd2f, "arithmetic with another simd2f") {
    it("should have simd2f_add for component-wise addition") {
        simd2f a = simd2f_create(1,2);
        simd2f b = simd2f_create(10,20);
        simd2f x = simd2f_add(a,b);
        // octave simd2f: [1,2] + [10,20]
        should_be_equal_simd2f(x, simd2f_create(11.000000000000000f, 22.000000000000000f), epsilon );
    }
    it("should have simd2f_sub for component-wise subtraction") {
        simd2f a = simd2f_create(1,2);
        simd2f b = simd2f_create(10,20);
        simd2f x = simd2f_sub(b,a);
        // octave simd2f: [10,20] - [1,2]
        should_be_equal_simd2f(x, simd2f_create(9.000000000000000f, 18.000000000000000f), epsilon );
    }
    it("should have simd2f_mul for component-wise multiply") {
        simd2f a = simd2f_create(1,2);
        simd2f b = simd2f_create(10,20);
        simd2f x = simd2f_mul(a,b);
        // octave simd2f: [1,2] .* [10,20]
        should_be_equal_simd2f(x, simd2f_create(10.000000000000000f, 40.000000000000000f), epsilon );
    }
    it("should have simd2f_div for component-wise division") {
        simd2f a = simd2f_create(1,2);
        simd2f b = simd2f_create(10,20);
        simd2f x = simd2f_div(b,a);
        // octave simd2f: [10,20] ./ [1,2]
        should_be_equal_simd2f(x, simd2f_create(10.000000000000000f, 10.000000000000000f), epsilon );
    }
    it("should have simd2f_madd for multiply-add") {
        simd2f a = simd2f_create(1,2);
        simd2f b = simd2f_create(100,100);
        simd2f c = simd2f_create(6,7);
        // madd(a, b, c) is expected to equal a .* b + c (see expected value).
        simd2f x = simd2f_madd(a,b,c);
        // octave simd2f: [1,2] .* [100,100] .+ [6,7]
        should_be_equal_simd2f(x, simd2f_create(106.000000000000000f, 207.000000000000000f), epsilon );
    }
 }
 // Specs for simd2f dot product, length and normalization. The scalar
 // results are expected in both lanes of the returned simd2f.
 describe(simd2f, "vector math") {
    it("should have simd2f_dot2 for two component dot product") {
        simd2f a = simd2f_create(1,2);
        simd2f b = simd2f_create(10,20);
        simd2f x = simd2f_dot2(a,b);
        // octave simd2f: [dot([1, 2], [10, 20]),dot([1, 2], [10, 20])]
        should_be_equal_simd2f(x, simd2f_create(50.000000000000000f, 50.000000000000000f), epsilon );
    }
    it("should have simd2f_length2 for two component vector length") {
        simd2f a = simd2f_create(1,2);
        simd2f x = simd2f_length2(a);
        // octave simd2f: [norm([1,2]),norm([1,2])]
        should_be_equal_simd2f(x, simd2f_create(2.236067977499790f, 2.236067977499790f), epsilon );
    }
    it("should have simd2f_length2_squared for two component squared vector length") {
        simd2f a = simd2f_create(1,2);
        simd2f x = simd2f_length2_squared(a);
        // octave simd2f: ([dot([1,2], [1,2]), dot([1,2], [1,2])])
        should_be_equal_simd2f(x, simd2f_create(5.000000000000000f, 5.000000000000000f), epsilon );
    }
    it("should have simd2f_normalize2 for normalizing two component vector to unit length") {
        simd2f a = simd2f_create(1,2);
        simd2f x = simd2f_normalize2(a);
        // octave simd2f: [1,2] / norm([1,2])
        should_be_equal_simd2f(x, simd2f_create(0.447213595499958f, 0.894427190999916f), epsilon );
    }
 }
 // Specs for lane-wise minimum and maximum. The inputs are chosen so that
 // each operand wins in one lane.
 describe(simd2f, "min-max") {
    it("should have simd2f_min for choosing minimum elements") {
        simd2f a = simd2f_create(1.0f,  2.0f);
        simd2f b = simd2f_create(2.0f, -2.0f);
        simd2f x = simd2f_min(a,b);
        should_be_equal_simd2f(x, simd2f_create(1.0f, -2.0f), epsilon);
    }
    it("should have simd2f_max for choosing maximum elements") {
        simd2f a = simd2f_create(1.0f,  2.0f);
        simd2f b = simd2f_create(2.0f, -2.0f);
        simd2f x = simd2f_max(a,b);
        should_be_equal_simd2f(x, simd2f_create(2.0f, 2.0f), epsilon);
    }
 }
 #endif
--- a/3rdparty/vectorial/spec/spec_simd4f.cpp
+++ b/3rdparty/vectorial/spec/spec_simd4f.cpp
@ -0,0 +1,457 @@
 #include "spec_helper.h"
 const int epsilon = 1;
 // Prints which SIMD backend the library was compiled for.
 describe(simd4f, "sanity") {
    it("VECTORIAL_SIMD_TYPE should be defined to a string") {
        // No assertion: the case only needs the macro to exist and be streamable.
        std::cout << "Simd type: ";
        std::cout << VECTORIAL_SIMD_TYPE << std::endl;
    }
 }
 // Construction of simd4f values and per-lane read-back.
 describe(simd4f, "creating") {
    it("should be possible to create with simd4f_create") {
        simd4f v = simd4f_create(1, 2, 3, 4);
        // Check every lane through its individual getter first ...
        should_be_close_to( simd4f_get_x(v), 1, epsilon);
        should_be_close_to( simd4f_get_y(v), 2, epsilon);
        should_be_close_to( simd4f_get_z(v), 3, epsilon);
        should_be_close_to( simd4f_get_w(v), 4, epsilon);
        // ... then the vector as a whole.
        // octave simd4f: [1,2,3,4]
        should_be_equal_simd4f(v, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
    }
    it("should have simd4f_zero for zero vector") {
        simd4f zeros = simd4f_zero();
        // octave simd4f: [0,0,0,0]
        should_be_equal_simd4f(zeros, simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), epsilon );
    }
 }
 // alloca() is declared in <malloc.h> when building with MSVC and in
 // <alloca.h> otherwise.
 #ifdef _MSC_VER
 #include <malloc.h>
 #else
 #include <alloca.h>
 #endif
 // unaligned_mem(n): returns a float* to room for n floats, taken from an
 // alloca() block that is 4 bytes larger than the payload and then skipping
 // its first 4 bytes. Presumably this yields a pointer that is float-aligned
 // but not SIMD-aligned -- that depends on the alignment alloca() returns;
 // TODO confirm per platform.
 // The storage comes from alloca(), so it must not be used after the calling
 // function returns.
 // The argument is parenthesized so that expression arguments such as
 // unaligned_mem(a + b) size the buffer correctly.
 #define unaligned_mem(n) ((float*)((unsigned char*)alloca(sizeof(float)*(n)+4)+4))
 // Unaligned load/store, splat, horizontal sum and reciprocal / square-root
 // helpers of simd4f. The "// octave ..." comments record the expression the
 // expected literal on the following line was computed from.
 describe(simd4f, "utilities") {
    it("should have simd4f_uload4 for loading four float values from an unaligned float array into simd4f") {
        float *f = unaligned_mem(4);
        f[0] = 1;
        f[1] = 2;
        f[2] = 3;
        f[3] = 4;
        simd4f x = simd4f_uload4(f);
        // octave simd4f: [1,2,3,4]
        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
    }
    it("should have simd4f_uload3 for loading three float values from an unaligned float array into simd4f") {
        // Only three floats are provided; the fourth lane is expected to be 0.
        float *f = unaligned_mem(3);
        f[0] = 1;
        f[1] = 2;
        f[2] = 3;
        simd4f x = simd4f_uload3(f);
        // octave simd4f: [1,2,3]
        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
    }
    it("should have simd4f_uload2 for loading two float values from an unaligned float array into simd4f") {
        // Only two floats are provided; the last two lanes are expected to be 0.
        float *f = unaligned_mem(2);
        f[0] = 1;
        f[1] = 2;
        simd4f x = simd4f_uload2(f);
        // octave simd4f: [1,2]
        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have simd4f_ustore4 for storing four float values from simd4f to an unaligned array") {
        // Pre-fill with -1 so a store that did not happen is detected.
        float *f = unaligned_mem(4);
        f[0] = -1;
        f[1] = -1;
        f[2] = -1;
        f[3] = -1;
        simd4f a = simd4f_create(1,2,3,4);
        simd4f_ustore4(a, f);
        should_be_close_to(f[0], 1, epsilon);
        should_be_close_to(f[1], 2, epsilon);
        should_be_close_to(f[2], 3, epsilon);
        should_be_close_to(f[3], 4, epsilon);
    }
    it("should have simd4f_ustore3 for storing three float values from simd4f to an unaligned array") {
        // Pre-fill with -1 so a store that did not happen is detected.
        float *f = unaligned_mem(3);
        f[0] = -1;
        f[1] = -1;
        f[2] = -1;
        simd4f a = simd4f_create(1,2,3,4);
        simd4f_ustore3(a, f);
        should_be_close_to(f[0], 1, epsilon);
        should_be_close_to(f[1], 2, epsilon);
        should_be_close_to(f[2], 3, epsilon);
    }
    it("should have simd4f_ustore2 for storing two float values from simd4f to an unaligned array") {
        // Pre-fill with -1 so a store that did not happen is detected.
        float *f = unaligned_mem(2);
        f[0] = -1;
        f[1] = -1;
        simd4f a = simd4f_create(1,2,3,4);
        simd4f_ustore2(a, f);
        should_be_close_to(f[0], 1, epsilon);
        should_be_close_to(f[1], 2, epsilon);
    }
    it("should have simd4f_splat that expands a single scalar to all elements") {
        simd4f x = simd4f_splat(42);
        // octave simd4f: [42,42,42,42]
        should_be_equal_simd4f(x, simd4f_create(42.000000000000000f, 42.000000000000000f, 42.000000000000000f, 42.000000000000000f), epsilon );
    }
    it("should have simd4f_splat_x,y,z,w splatting of an element") {
        // The same input is splatted four times, once per source lane.
        simd4f a = simd4f_create(1,2,3,4);
        simd4f x;
        x = simd4f_splat_x(a);
        // octave simd4f: [1,1,1,1]
        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 1.000000000000000f, 1.000000000000000f, 1.000000000000000f), epsilon );
        x = simd4f_splat_y(a);
        // octave simd4f: [2,2,2,2]
        should_be_equal_simd4f(x, simd4f_create(2.000000000000000f, 2.000000000000000f, 2.000000000000000f, 2.000000000000000f), epsilon );
        x = simd4f_splat_z(a);
        // octave simd4f: [3,3,3,3]
        should_be_equal_simd4f(x, simd4f_create(3.000000000000000f, 3.000000000000000f, 3.000000000000000f, 3.000000000000000f), epsilon );
        x = simd4f_splat_w(a);
        // octave simd4f: [4,4,4,4]
        should_be_equal_simd4f(x, simd4f_create(4.000000000000000f, 4.000000000000000f, 4.000000000000000f, 4.000000000000000f), epsilon );
    }
    it("should have simd4f_sum that adds elements") {
        // The horizontal sum is expected in every lane of the result.
        simd4f a = simd4f_create(1,2,3,4);
        simd4f x = simd4f_sum(a);
        // octave simd4f: [sum([1,2,3,4]), sum([1,2,3,4]), sum([1,2,3,4]), sum([1,2,3,4])]
        should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
    }
    it("should have simd4f_reciprocal") {
        // Inputs span very small to very large magnitudes.
        simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f);
        simd4f x = simd4f_reciprocal(a);
        // octave simd4f: 1 ./ [0.00001, 2.00001, 3.0, 99999999.0]
        should_be_equal_simd4f(x, simd4f_create(99999.999999999985448f, 0.499997500012500f, 0.333333333333333f, 0.000000010000000f), epsilon );
    }
    it("should have simd4f_sqrt") {
        simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f);
        simd4f x = simd4f_sqrt(a);
        // octave simd4f:  sqrt([0.00001, 2.00001, 3.0, 99999999.0])
        should_be_equal_simd4f(x, simd4f_create(0.003162277660168f, 1.414217097902582f, 1.732050807568877f, 9999.999949999999444f), epsilon );
        // sqrt(0) must come out as exactly-comparable zeros as well.
        x = simd4f_sqrt( simd4f_create(0.0f, 0.0f, 0.0f, 0.0f) );
        // octave simd4f:  sqrt([0, 0, 0, 0])
        should_be_equal_simd4f(x, simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), epsilon );
    }
    it("should have simd4f_rsqrt for reciprocal of square-root") {
        simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f);
        simd4f x = simd4f_rsqrt(a);
        // Shadows the file-level epsilon for the comparison below only.
        const int epsilon = 4; // Grant larger error
        // octave simd4f:  1 ./ sqrt([0.00001, 2.00001, 3.0, 99999999.0])
        should_be_equal_simd4f(x, simd4f_create(316.227766016837904f, 0.707105013426224f, 0.577350269189626f, 0.000100000000500f), epsilon );
    }
 }
 // Component-wise arithmetic between two (or three) simd4f operands.
 describe(simd4f, "arithmetic with another simd4f") {
    it("should have simd4f_add for component-wise addition") {
        simd4f lhs = simd4f_create(1,2,3,4);
        simd4f rhs = simd4f_create(10,20,30,40);
        simd4f sum = simd4f_add(lhs,rhs);
        // octave simd4f: [1,2,3,4] + [10,20,30,40]
        should_be_equal_simd4f(sum, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon );
    }
    it("should have simd4f_sub for component-wise subtraction") {
        simd4f small = simd4f_create(1,2,3,4);
        simd4f large = simd4f_create(10,20,30,40);
        // Operand order matters: large - small.
        simd4f diff = simd4f_sub(large,small);
        // octave simd4f: [10,20,30,40] - [1,2,3,4] 
        should_be_equal_simd4f(diff, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
    }
    it("should have simd4f_mul for component-wise multiply") {
        simd4f lhs = simd4f_create(1,2,3,4);
        simd4f rhs = simd4f_create(10,20,30,40);
        simd4f product = simd4f_mul(lhs,rhs);
        // octave simd4f: [1,2,3,4] .* [10,20,30,40]
        should_be_equal_simd4f(product, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
    }
    it("should have simd4f_div for component-wise division") {
        simd4f divisor = simd4f_create(1,2,3,4);
        simd4f dividend = simd4f_create(10,20,30,40);
        // Operand order matters: dividend ./ divisor.
        simd4f quotient = simd4f_div(dividend,divisor);
        // octave simd4f: [10,20,30,40] ./ [1,2,3,4] 
        should_be_equal_simd4f(quotient, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
    }
    it("should have simd4f_madd for multiply-add") {
        // Expected value is factor1 .* factor2 .+ addend, per the octave line.
        simd4f factor1 = simd4f_create(1,2,3,4);
        simd4f factor2 = simd4f_create(100,100,100,100);
        simd4f addend = simd4f_create(6,7,8,9);
        simd4f fused = simd4f_madd(factor1,factor2,addend);
        // octave simd4f: [1,2,3,4] .* [100,100,100,100] .+ [6,7,8,9]
        should_be_equal_simd4f(fused, simd4f_create(106.000000000000000f, 207.000000000000000f, 308.000000000000000f, 409.000000000000000f), epsilon );
    }
 }
 // Dot products, lengths, cross product and normalization on simd4f.
 // In the 2- and 3-component cases the unused trailing lanes of the inputs are
 // filled with large values (9999 / -9990) so that a result which wrongly
 // includes them would not match the expected literal.
 describe(simd4f, "vector math") {
    it("should have simd4f_dot4 for four component dot product") {
        simd4f a = simd4f_create(1,2,3,4);
        simd4f b = simd4f_create(10,20,30,40);
        simd4f x = simd4f_dot4(a,b);
        // octave simd4f: [dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40])]
        should_be_equal_simd4f(x, simd4f_create(300.000000000000000f, 300.000000000000000f, 300.000000000000000f, 300.000000000000000f), epsilon );
    }
    it("should have simd4f_dot3_scalar for three component dot product returning float") {
        simd4f a = simd4f_create(1,2,3,9999);
        simd4f b = simd4f_create(10,20,30,-9990);
        float x = simd4f_dot3_scalar(a,b);
        // octave float: dot([1, 2, 3], [10, 20, 30])
        should_be_close_to(x, 140.000000000000000f, epsilon );
    }
    it("should have simd4f_dot3 for three component dot product returning simd4f") {
        simd4f a = simd4f_create(1,2,3,9999);
        simd4f b = simd4f_create(10,20,30,-9990);
        simd4f x = simd4f_dot3(a,b);
        // octave simd4f: [dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30])]
        should_be_equal_simd4f(x, simd4f_create(140.000000000000000f, 140.000000000000000f, 140.000000000000000f, 140.000000000000000f), epsilon );
    }
    it("should have simd4f_dot2 for two component dot product") {
        simd4f a = simd4f_create(1,2,3,9999);
        simd4f b = simd4f_create(10,20,30,-9990);
        simd4f x = simd4f_dot2(a,b);
        // octave simd4f: [dot([1, 2], [10, 20]),dot([1, 2], [10, 20]),dot([1, 2], [10, 20]),dot([1, 2], [10, 20])]
        should_be_equal_simd4f(x, simd4f_create(50.000000000000000f, 50.000000000000000f, 50.000000000000000f, 50.000000000000000f), epsilon );
    }
    it("should have simd4f_length4 for four component vector length") {
        simd4f a = simd4f_create(1,2,-3,9999);
        simd4f x = simd4f_length4(a);
        // octave simd4f: [norm([1,2,-3,9999]), norm([1,2,-3,9999]), norm([1,2,-3,9999]), norm([1,2,-3,9999])]
        should_be_equal_simd4f(x, simd4f_create(9999.000700069982486f, 9999.000700069982486f, 9999.000700069982486f, 9999.000700069982486f), epsilon );
    }
    it("should have simd4f_length3 for three component vector length") {
        simd4f a = simd4f_create(1,2,-3,9999);
        simd4f x = simd4f_length3(a);
        // octave simd4f: [norm([1,2,-3]), norm([1,2,-3]), norm([1,2,-3]), norm([1,2,-3])]
        should_be_equal_simd4f(x, simd4f_create(3.741657386773941f, 3.741657386773941f, 3.741657386773941f, 3.741657386773941f), epsilon );
    }
    it("should have simd4f_length2 for two component vector length") {
        simd4f a = simd4f_create(1,2,-3,9999);
        simd4f x = simd4f_length2(a);
        // octave simd4f: [norm([1,2]),norm([1,2]),norm([1,2]),norm([1,2])]
        should_be_equal_simd4f(x, simd4f_create(2.236067977499790f, 2.236067977499790f, 2.236067977499790f, 2.236067977499790f), epsilon );
    }
    it("should have simd4f_length4_squared for four component squared vector length") {
        simd4f a = simd4f_create(1,2,-3,9999);
        simd4f x = simd4f_length4_squared(a);
        // octave simd4f: ([(dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999]))])
        should_be_equal_simd4f(x, simd4f_create(99980015.000000000000000f, 99980015.000000000000000f, 99980015.000000000000000f, 99980015.000000000000000f), epsilon );
    }
    it("should have simd4f_length3_squared for three component squared vector length") {
        simd4f a = simd4f_create(1,2,-3,9999);
        simd4f x = simd4f_length3_squared(a);
        // octave simd4f: ([dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3])])
        should_be_equal_simd4f(x, simd4f_create(14.000000000000000f, 14.000000000000000f, 14.000000000000000f, 14.000000000000000f), epsilon );
    }
    it("should have simd4f_length2_squared for two component squared vector length") {
        simd4f a = simd4f_create(1,2,-3,9999);
        simd4f x = simd4f_length2_squared(a);
        // octave simd4f: ([dot([1,2], [1,2]), dot([1,2], [1,2]), dot([1,2], [1,2]), dot([1,2], [1,2])])
        should_be_equal_simd4f(x, simd4f_create(5.000000000000000f, 5.000000000000000f, 5.000000000000000f, 5.000000000000000f), epsilon );
    }
    it("should have simd4f_cross3 for cross product") {
        // The fourth lane of the result is expected to be 0 regardless of the
        // fourth lanes of the inputs.
        simd4f a = simd4f_create(1,12,3,-9999);
        simd4f b = simd4f_create(5,6,-17, 9999);
        simd4f x = simd4f_cross3(a,b);
        // octave simd4f: horzcat(  cross( [1,12,3], [5,6,-17] )  , [0] )
        should_be_equal_simd4f(x, simd4f_create(-222.000000000000000f, 32.000000000000000f, -54.000000000000000f, 0.000000000000000f), epsilon );
    }
    it("should have simd4f_normalize4 for normalizing four component vector to unit length") {
        simd4f a = simd4f_create(1,2,3,4);
        simd4f x = simd4f_normalize4(a);
        // octave simd4f: [1,2,3,4] / norm([1,2,3,4])
        should_be_equal_simd4f(x, simd4f_create(0.182574185835055f, 0.365148371670111f, 0.547722557505166f, 0.730296743340221f), epsilon );
    }
    it("should have simd4f_normalize3 for normalizing three component vector to unit length") {
        simd4f a = simd4f_create(1,2,3,0);
        simd4f x = simd4f_normalize3(a);
        // octave simd4f: [1,2,3,0] / norm([1,2,3])
        should_be_equal_simd4f(x, simd4f_create(0.267261241912424f, 0.534522483824849f, 0.801783725737273f, 0.000000000000000f), epsilon );
    }
    it("should have simd4f_normalize2 for normalizing two component vector to unit length") {
        simd4f a = simd4f_create(1,2,0,0);
        simd4f x = simd4f_normalize2(a);
        // octave simd4f: [1,2,0,0] / norm([1,2])
        should_be_equal_simd4f(x, simd4f_create(0.447213595499958f, 0.894427190999916f, 0.000000000000000f, 0.000000000000000f), epsilon );
    }
 }
 // Lane permutations of a single vector and merging of two vectors.
 // Each shuffle suffix names the source lane for result lanes x,y,z,w in order.
 describe(simd4f, "shuffles and merges") {
    it("should have simd4f_shuffle_wxyz") {
        simd4f src = simd4f_create(1,2,3,4);
        simd4f shuffled = simd4f_shuffle_wxyz(src);
        should_be_equal_simd4f(shuffled, simd4f_create(4,1,2,3), epsilon );
    }
    it("should have simd4f_shuffle_zwxy") {
        simd4f src = simd4f_create(1,2,3,4);
        simd4f shuffled = simd4f_shuffle_zwxy(src);
        should_be_equal_simd4f(shuffled, simd4f_create(3,4,1,2), epsilon );
    }
    it("should have simd4f_shuffle_yzwx") {
        simd4f src = simd4f_create(1,2,3,4);
        simd4f shuffled = simd4f_shuffle_yzwx(src);
        should_be_equal_simd4f(shuffled, simd4f_create(2,3,4,1), epsilon );
    }
    it("should have simd4f_merge_high") {
        // Result is the upper two lanes of the first operand followed by the
        // upper two lanes of the second.
        simd4f first = simd4f_create(1,2,3,4);
        simd4f second = simd4f_create(5,6,7,8);
        simd4f merged = simd4f_merge_high(first,second);
        should_be_equal_simd4f(merged, simd4f_create(3,4,7,8), epsilon );
    }
 }
 // Selective sign flipping. Judging by the expected values, a 1 in the
 // function-name suffix marks a lane (in x,y,z,w order) whose sign is negated.
 describe(simd4f, "signs") {
    it("should have simd4f_flip_sign_0101 for flipping even elements sign") {
        // Negates the 2nd and 4th elements (y and w).
        simd4f a = simd4f_create(1,2,3,4);
        simd4f x = simd4f_flip_sign_0101(a);
        should_be_equal_simd4f(x, simd4f_create(1,-2,3,-4), epsilon );
    }
    it("should have simd4f_flip_sign_1010 for flipping odd elements sign") {
        // Negates the 1st and 3rd elements (x and z).
        simd4f a = simd4f_create(1,2,3,4);
        simd4f x = simd4f_flip_sign_1010(a);
        should_be_equal_simd4f(x, simd4f_create(-1,2,-3,4), epsilon );
    }
 }
 // Lane-wise minimum / maximum, including very large and very small magnitudes.
 describe(simd4f, "min-max") {
    it("should have simd4f_min for choosing minimum elements") {
        simd4f lhs = simd4f_create(1.0f,  2.0f, -300000000.0f, -0.000002f);
        simd4f rhs = simd4f_create(2.0f, -2.0f,  300000000.0f,  0.000001f);
        simd4f smallest = simd4f_min(lhs,rhs);
        should_be_equal_simd4f(smallest, simd4f_create(1.0f, -2.0f, -300000000.0f, -0.000002f), epsilon);
    }
    it("should have simd4f_max for choosing maximum elements") {
        simd4f lhs = simd4f_create(1.0f,  2.0f, -300000000.0f, -0.000002f);
        simd4f rhs = simd4f_create(2.0f, -2.0f,  300000000.0f,  0.000001f);
        simd4f largest = simd4f_max(lhs,rhs);
        should_be_equal_simd4f(largest, simd4f_create(2.0f, 2.0f, 300000000.0f, 0.000001f), epsilon);
    }
 }
 // Zeroing of trailing lanes. Each case runs twice: once with ordinary values
 // and once with NaN in the lanes to be cleared, so an implementation that
 // clears by multiplying with 0 (NaN * 0 is NaN) would be caught.
 describe(simd4f, "zeroing")
 {
    it("should have simd4f_zero_w that zeros the last element")
    {
        const float nan = sqrtf(-1.0f);
        simd4f a = simd4f_create(1.0f, 2.0f, 3.0f, 4.0f);
        simd4f b = simd4f_create(1.0f, 2.0f, 3.0f, nan);
        simd4f x = simd4f_zero_w(a);
        should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 3.0f, 0.0f), epsilon);
        x = simd4f_zero_w(b);
        should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 3.0f, 0.0f), epsilon);
    }
    it("should have simd4f_zero_zw that zeros the last two elements")
    {
        const float nan = sqrtf(-1.0f);
        simd4f a = simd4f_create(1.0f, 2.0f, 3.0f, 4.0f);
        simd4f b = simd4f_create(1.0f, 2.0f, nan, nan);
        simd4f x = simd4f_zero_zw(a);
        should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 0.0f, 0.0f), epsilon);
        x = simd4f_zero_zw(b);
        should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 0.0f, 0.0f), epsilon);
    }
 }
--- a/3rdparty/vectorial/spec/spec_simd4x4f.cpp
+++ b/3rdparty/vectorial/spec/spec_simd4x4f.cpp
@ -0,0 +1,381 @@
 #include "spec_helper.h"
 const int epsilon = 1;
 // Fallback for toolchains whose math header does not provide M_PI.
 // Written with enough digits that the float literal rounds to the float
 // nearest to pi; the shorter 3.141592f rounds to a different float.
 #ifndef M_PI
 #define M_PI 3.14159265358979323846f
 #endif
 // Construction of simd4x4f values. The four simd4f arguments of
 // simd4x4f_create end up in the .x, .y, .z and .w members, in that order.
 describe(simd4x4f, "creating") {
    it("should be possible to create with params") {
        simd4x4f m = simd4x4f_create(
            simd4f_create(1,  2,  3,  4 ),
            simd4f_create(5,  6,  7,  8 ),
            simd4f_create(9,  10, 11, 12 ),
            simd4f_create(13, 14, 15, 16 ));
        // Member-by-member check first ...
        should_be_equal_simd4f( m.x, simd4f_create(1,  2,  3,  4 ) , epsilon);
        should_be_equal_simd4f( m.y, simd4f_create(5,  6,  7,  8 ) , epsilon);
        should_be_equal_simd4f( m.z, simd4f_create(9,  10, 11, 12 ), epsilon);
        should_be_equal_simd4f( m.w, simd4f_create(13, 14, 15, 16 ), epsilon);
        // ... then the matrix as a whole.
        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ]
        should_be_equal_simd4x4f(m, simd4x4f_create(simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), simd4f_create(5.000000000000000f, 6.000000000000000f, 7.000000000000000f, 8.000000000000000f), simd4f_create(9.000000000000000f, 10.000000000000000f, 11.000000000000000f, 12.000000000000000f), simd4f_create(13.000000000000000f, 14.000000000000000f, 15.000000000000000f, 16.000000000000000f)), epsilon );
    }
    it("should be possible to set to identity") {
        simd4x4f m;
        simd4x4f_identity(&m);
        // octave simd4x4f: [1,0,0,0; 0,1,0,0; 0,0,1,0; 0,0,0,1]
        should_be_equal_simd4x4f(m, simd4x4f_create(simd4f_create(1.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 1.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 1.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 1.000000000000000f)), epsilon );
    }
 }
 // Loading a matrix from a flat float array.
 describe(simd4x4f, "loading and storing") {
    it("should be possible to load from array of 16 floats with simd4x4f_uload") {
        // Consecutive groups of four floats are expected to become the
        // x, y, z and w members of the matrix.
        float values[16] = {1,2,3,4, 5,6,7,8, 9,10,11,12, 13,14,15,16 };
        simd4x4f loaded;
        simd4x4f_uload(&loaded, values);
        should_be_equal_simd4x4f(loaded,
                                 simd4x4f_create( simd4f_create(1,2,3,4),
                                                  simd4f_create(5,6,7,8),
                                                  simd4f_create(9,10,11,12),
                                                  simd4f_create(13,14,15,16) ),
                                 epsilon);
    }
 }
 // Transpose, matrix-vector and matrix-matrix products, and inversion.
 // Note on notation: in the "// octave" comments the matrices are written
 // row by row, and each simd4f passed to simd4x4f_create appears there as a
 // column (compare the create arguments with the octave literal in any case).
 describe(simd4x4f, "matrix utility") {
    it("should have simd4x4f_transpose_inplace for transpose") {
        simd4x4f x = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
                                   simd4f_create(5,  6,  7,  8 ),
                                   simd4f_create(9,  10, 11, 12 ),
                                   simd4f_create(13, 14, 15, 16 ));
        simd4x4f_transpose_inplace(&x);
        // octave simd4x4f: transpose([1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ])
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 5.000000000000000f, 9.000000000000000f, 13.000000000000000f), simd4f_create(2.000000000000000f, 6.000000000000000f, 10.000000000000000f, 14.000000000000000f), simd4f_create(3.000000000000000f, 7.000000000000000f, 11.000000000000000f, 15.000000000000000f), simd4f_create(4.000000000000000f, 8.000000000000000f, 12.000000000000000f, 16.000000000000000f)), epsilon );
    }
    it("should have simd4x4f_transpose for transpose") {
        // Out-of-place variant: result is written to a separate matrix.
        simd4x4f in = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
                                   simd4f_create(5,  6,  7,  8 ),
                                   simd4f_create(9,  10, 11, 12 ),
                                   simd4f_create(13, 14, 15, 16 ));
        simd4x4f x;
        simd4x4f_transpose(&in, &x);
        // octave simd4x4f: transpose([1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ])
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 5.000000000000000f, 9.000000000000000f, 13.000000000000000f), simd4f_create(2.000000000000000f, 6.000000000000000f, 10.000000000000000f, 14.000000000000000f), simd4f_create(3.000000000000000f, 7.000000000000000f, 11.000000000000000f, 15.000000000000000f), simd4f_create(4.000000000000000f, 8.000000000000000f, 12.000000000000000f, 16.000000000000000f)), epsilon );
    }
    it("should have simd4x4f_matrix_vector_mul for matrix-vector multiply") {
        simd4x4f a = simd4x4f_create(simd4f_create( 1,    9,   17,   25 ),
                                     simd4f_create( 3,   11,   19,   27 ),
                                     simd4f_create( 5,   13,   21,   29 ),
                                     simd4f_create( 7,   15,   23,   31 ));
        simd4f b = simd4f_create( 26,  -28,   30,  -32 );
        simd4f x;
        simd4x4f_matrix_vector_mul(&a, &b, &x);
        // octave simd4f: [1,3,5,7;9,11,13,15;17,19,21,23;25,27,29,31] * [26;-28;30;-32]
        should_be_equal_simd4f(x, simd4f_create(-132.000000000000000f, -164.000000000000000f, -196.000000000000000f, -228.000000000000000f), epsilon );
    }
    it("should have simd4x4f_matrix_vector3_mul for matrix-vector3 multiply") {
        // NOTE(review): this case only calls the function; it asserts nothing
        // about the result (expected value still TODO).
        simd4x4f a = simd4x4f_create(simd4f_create( 1,    9,   17,   25 ),
                                     simd4f_create( 3,   11,   19,   27 ),
                                     simd4f_create( 5,   13,   21,   29 ),
                                     simd4f_create( 7,   15,   23,   31 ));
        simd4f b = simd4f_create( 26,  -28,   30,  -32 );
        simd4f x;
        simd4x4f_matrix_vector3_mul(&a, &b, &x);
        // TODO octave simd4f: 
    }
    it("should have simd4x4f_matrix_vector3_mul for matrix-vector3 multiply") {
        // NOTE(review): verbatim duplicate of the previous case (same
        // description, same body, no assertion) -- confirm whether one of the
        // two was meant to cover a different function.
        simd4x4f a = simd4x4f_create(simd4f_create( 1,    9,   17,   25 ),
                                     simd4f_create( 3,   11,   19,   27 ),
                                     simd4f_create( 5,   13,   21,   29 ),
                                     simd4f_create( 7,   15,   23,   31 ));
        simd4f b = simd4f_create( 26,  -28,   30,  -32 );
        simd4f x;
        simd4x4f_matrix_vector3_mul(&a, &b, &x);
        // TODO octave simd4f: 
    }
    // Placeholder: no coverage for simd4x4f_matrix_point3_mul yet.
    it("should have simd4x4f_matrix_point3_mul") { /* TODO */ }
    it("should have simd4x4f_inv_ortho_matrix_point3_mul for transforming point with inverse of a orhtonormal matrix") {
        // The octave reference inverts the full matrix, applies it to the
        // point with w = 1, and then masks the w component of the result to 0.
        simd4x4f a = simd4x4f_create(simd4f_create( 0,  -1,   0,   0 ),
                                     simd4f_create( 1,   0,   0,   0 ),
                                     simd4f_create( 0,   0,   1,   0 ),
                                     simd4f_create( 1,   2,   3,   1 ));
        simd4f b = simd4f_create(5,6,7,0);
        simd4f x;
        simd4x4f_inv_ortho_matrix_point3_mul(&a, &b, &x);
        // octave simd4f: inverse([0,1,0,1; -1,0,0,2; 0,0,1,3; 0,0,0,1]) * [5;6;7;1] .* [1;1;1;0]
        should_be_equal_simd4f(x, simd4f_create(-4.000000000000000f, 4.000000000000000f, 4.000000000000000f, 0.000000000000000f), epsilon );
    }
    it("should have simd4x4f_matrix_mul for matrix multiply") {
        simd4x4f a = simd4x4f_create(simd4f_create( 1,    9,   17,   25 ),
                                     simd4f_create( 3,   11,   19,   27 ),
                                     simd4f_create( 5,   13,   21,   29 ),
                                     simd4f_create( 7,   15,   23,   31 ));
        simd4x4f b = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
                                     simd4f_create( -4,   12,  -20,   28 ),
                                     simd4f_create(  6,  -14,   22,  -30 ),
                                     simd4f_create( -8,   16,  -24,   32 ));
        simd4x4f x;
        simd4x4f_matrix_mul(&a, &b, &x);
        // octave simd4x4f: [1,3,5,7;9,11,13,15;17,19,21,23;25,27,29,31] * [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(-120.000000000000000f, -248.000000000000000f, -376.000000000000000f, -504.000000000000000f), simd4f_create(128.000000000000000f, 256.000000000000000f, 384.000000000000000f, 512.000000000000000f), simd4f_create(-136.000000000000000f, -264.000000000000000f, -392.000000000000000f, -520.000000000000000f), simd4f_create(144.000000000000000f, 272.000000000000000f, 400.000000000000000f, 528.000000000000000f)), epsilon );
    }
    it("should have simd4x4f_inverse for calculating inverse matrix") {
        // Two checks: the inverse against a reference literal (file-level
        // epsilon), then inverse * original against the identity matrix with a
        // much looser local epsilon.
        simd4x4f a = simd4x4f_create(simd4f_create(7,  2,  87,  5 ),
                                   simd4f_create(5,  24,  6,  3 ),
                                   simd4f_create(4,  6, 5, 6 ),
                                   simd4f_create(5, 7, 4, 6 ));
        simd4x4f x;
        simd4x4f_inverse(&a, &x);
        // octave simd4x4f: inverse( [7,5,4,5 ; 2,24,6,7 ; 87,6,5,4 ; 5,3,6,6] )
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(0.015309310560300f, -0.049885440533222f, -1.081337221412206f, 1.093522182878568f), simd4f_create(-0.004061653822120f, 0.054051239325141f, 0.123620079150177f, -0.147260987294314f), simd4f_create(0.011247656738180f, 0.004165798791918f, 0.042282857737971f, -0.053738804415747f), simd4f_create(-0.015517600499896f, -0.024265777962924f, 0.728702353676318f, -0.536971464278276f)), epsilon );
        simd4x4f x2;
        simd4x4f_matrix_mul(&x, &a, &x2);
        simd4x4f identity;
        simd4x4f_identity(&identity);
        // Allow larger error for M * M' = I
        // This declaration shadows the file-level epsilon from here on, so it
        // must stay after the first comparison above.
        // NOTE(review): the value is written as a hex bit pattern; how the
        // comparison helper interprets it is defined in spec_helper.h, which
        // is not visible here -- confirm the intended tolerance.
        const int epsilon = 0x35100000; 
        should_be_equal_simd4x4f(x2, identity, epsilon);
    }
 }
 // Element-wise add / sub / mul / div between two matrices. Every case uses
 // the same pair of operands; only the operation and expected result differ.
 describe(simd4x4f, "math on elements") {
    it("should have simd4x4f_add for element-wise addition") {
        simd4x4f lhs = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
                                       simd4f_create(5,  6,  7,  8 ),
                                       simd4f_create(9,  10, 11, 12 ),
                                       simd4f_create(13, 14, 15, 16 ));
        simd4x4f rhs = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
                                       simd4f_create( -4,   12,  -20,   28 ),
                                       simd4f_create(  6,  -14,   22,  -30 ),
                                       simd4f_create( -8,   16,  -24,   32 ));
        simd4x4f out;
        simd4x4f_add(&lhs, &rhs, &out);
        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] + [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
        should_be_equal_simd4x4f(out, simd4x4f_create(simd4f_create(3.000000000000000f, -8.000000000000000f, 21.000000000000000f, -22.000000000000000f), simd4f_create(1.000000000000000f, 18.000000000000000f, -13.000000000000000f, 36.000000000000000f), simd4f_create(15.000000000000000f, -4.000000000000000f, 33.000000000000000f, -18.000000000000000f), simd4f_create(5.000000000000000f, 30.000000000000000f, -9.000000000000000f, 48.000000000000000f)), epsilon );
    }
    it("should have simd4x4f_sub for element-wise substraction") {
        simd4x4f lhs = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
                                       simd4f_create(5,  6,  7,  8 ),
                                       simd4f_create(9,  10, 11, 12 ),
                                       simd4f_create(13, 14, 15, 16 ));
        simd4x4f rhs = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
                                       simd4f_create( -4,   12,  -20,   28 ),
                                       simd4f_create(  6,  -14,   22,  -30 ),
                                       simd4f_create( -8,   16,  -24,   32 ));
        simd4x4f out;
        simd4x4f_sub(&lhs, &rhs, &out);
        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] - [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
        should_be_equal_simd4x4f(out, simd4x4f_create(simd4f_create(-1.000000000000000f, 12.000000000000000f, -15.000000000000000f, 30.000000000000000f), simd4f_create(9.000000000000000f, -6.000000000000000f, 27.000000000000000f, -20.000000000000000f), simd4f_create(3.000000000000000f, 24.000000000000000f, -11.000000000000000f, 42.000000000000000f), simd4f_create(21.000000000000000f, -2.000000000000000f, 39.000000000000000f, -16.000000000000000f)), epsilon );
    }
    it("should have simd4x4f_mul for element-wise multiplication") {
        simd4x4f lhs = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
                                       simd4f_create(5,  6,  7,  8 ),
                                       simd4f_create(9,  10, 11, 12 ),
                                       simd4f_create(13, 14, 15, 16 ));
        simd4x4f rhs = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
                                       simd4f_create( -4,   12,  -20,   28 ),
                                       simd4f_create(  6,  -14,   22,  -30 ),
                                       simd4f_create( -8,   16,  -24,   32 ));
        simd4x4f out;
        simd4x4f_mul(&lhs, &rhs, &out);
        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] .* [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
        should_be_equal_simd4x4f(out, simd4x4f_create(simd4f_create(2.000000000000000f, -20.000000000000000f, 54.000000000000000f, -104.000000000000000f), simd4f_create(-20.000000000000000f, 72.000000000000000f, -140.000000000000000f, 224.000000000000000f), simd4f_create(54.000000000000000f, -140.000000000000000f, 242.000000000000000f, -360.000000000000000f), simd4f_create(-104.000000000000000f, 224.000000000000000f, -360.000000000000000f, 512.000000000000000f)), epsilon );
    }
    it("should have simd4x4f_div for element-wise division") {
        simd4x4f lhs = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
                                       simd4f_create(5,  6,  7,  8 ),
                                       simd4f_create(9,  10, 11, 12 ),
                                       simd4f_create(13, 14, 15, 16 ));
        simd4x4f rhs = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
                                       simd4f_create( -4,   12,  -20,   28 ),
                                       simd4f_create(  6,  -14,   22,  -30 ),
                                       simd4f_create( -8,   16,  -24,   32 ));
        simd4x4f out;
        simd4x4f_div(&lhs, &rhs, &out);
        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] ./ [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
        should_be_equal_simd4x4f(out, simd4x4f_create(simd4f_create(0.500000000000000f, -0.200000000000000f, 0.166666666666667f, -0.153846153846154f), simd4f_create(-1.250000000000000f, 0.500000000000000f, -0.350000000000000f, 0.285714285714286f), simd4f_create(1.500000000000000f, -0.714285714285714f, 0.500000000000000f, -0.400000000000000f), simd4f_create(-1.625000000000000f, 0.875000000000000f, -0.625000000000000f, 0.500000000000000f)), epsilon );
    }
 }
 // Specs for the simd4x4f matrix builders: perspective, ortho, lookat, translation
 // and axis_rotation. Each spec builds a matrix into `x` and compares it against a
 // literal expected matrix. Most specs declare their own local `epsilon`, which
 // shadows the one declared outside this block; the translation spec uses the outer one.
 // NOTE(review): the unit of epsilon is defined by the should_be_* helpers, not shown here.
 describe(simd4x4f, "creating projection and view matrices") {
    it("should have simd4x4f_perspective for creating perspective projection matrix") {
        // 10 degrees converted to radians.
        const float fov = 10.0f * M_PI / 180.0f;
        const float aspect = 1.6f;
        const float znear = 2.0f;
        const float zfar = 50.0f;
        const int epsilon = 50;
        simd4x4f x;
        simd4x4f_perspective(&x, fov, aspect, znear, zfar);
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(7.14378, 0, 0, 0),
                                                    simd4f_create(0, 11.4301, 0, 0),
                                                    simd4f_create(0, 0, -1.08333, -1),
                                                    simd4f_create(-0, -0, -4.16667, -0)), epsilon);
    }
    it("should have simd4x4f_ortho for creating orthogonal projection matrix") {
        simd4x4f x;
        // NOTE(review): argument order presumably left, right, bottom, top, near, far - confirm against the header.
        simd4x4f_ortho(&x, -10, 20, -30, 40, -50, 60);
        const int epsilon = 20;        
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(0.0666667, 0, 0, 0),
                                                    simd4f_create(0, 0.0285714, 0, 0),
                                                    simd4f_create(-0, -0, -0.0181818, -0),
                                                    simd4f_create(-0.333333, -0.142857, -0.0909091, 1)), epsilon);
    }
    it("should have simd4x4f_lookat for creating look-at matrix") {
        simd4f eye = simd4f_create(1,2,3,0);
        simd4f center = simd4f_create(3,4,5,0);
        simd4f up = simd4f_create(0,1,0,0);
        simd4x4f x;
        simd4x4f_lookat(&x, eye, center, up);
        const int epsilon = 40;
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(-0.707107, -0.408248, -0.57735, 0),
                                                    simd4f_create(0, 0.816497, -0.57735, 0),
                                                    simd4f_create(0.707107, -0.408248, -0.57735, 0),
                                                    simd4f_create(-1.41421, 0, 3.4641, 1)), epsilon);
    }
    it("should have simd4x4f_translation for creating translation matrix") {
        simd4x4f x;
        simd4x4f_translation(&x, 1,2,3);
        // The expected matrix carries the translation (1,2,3) in its fourth simd4f.
        // NOTE: tools/update_spec.rb rewrites the line directly after the `// octave` comment.
        // octave simd4x4f: [1,0,0,1; 0,1,0,2; 0,0,1,3; 0,0,0,1]
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 1.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 1.000000000000000f, 0.000000000000000f), simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 1.000000000000000f)), epsilon );
    }
    it("should have simd4x4f_axis_rotation for creating a rotation matrix along a axis") {
        simd4x4f x;
        // 45 degrees converted to radians, with the axis given as (1,2,3).
        simd4x4f_axis_rotation(&x, 45 * M_PI / 180.0f, simd4f_create(1,2,3,0));
        const int epsilon = 20;
        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(0.728028, 0.608789, -0.315202, 0),
                                                   simd4f_create(-0.525105, 0.790791, 0.314508, 0),
                                                   simd4f_create(0.440727, -0.0634566, 0.895395, 0),
                                                   simd4f_create(0, 0, 0, 1)), epsilon);
    }
 }
--- a/3rdparty/vectorial/spec/spec_vec2f.cpp
+++ b/3rdparty/vectorial/spec/spec_vec2f.cpp
@ -0,0 +1,255 @@
 #include "spec_helper.h"
 #include <iostream>
 using vectorial::vec2f;
 // Tolerance passed to the should_be_* helpers by every spec in this file.
 // Assertion lines generated by tools/spechelper.m refer to this name literally.
 // NOTE(review): the unit of the tolerance is defined in spec_helper.h (not shown here).
 const int epsilon = 1;
 // Specs for the vec2f constructors: default, per-element, and from a float array.
 // Expected values are written as a simd4f whose last two lanes are 0.0f.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec2f, "constructing") {
    // Only checks that default construction compiles and runs; nothing is asserted.
    it("should have default constructor that does nothing..") {
        vec2f x;
    }
    it("should have constructor with element values") {
        vec2f x(10,20);
        // octave vec2f: [10,20]
        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have constructor that loads from a float array") {
        float ary[2] = { 1,2 };
        vec2f x(ary);
        // octave vec2f: [1,2]
        should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
    }
 }
 // Specs for vec2f::load and vec2f::store with a two-element float array.
 // The destination (vector or array) is first filled with -1 so that an operation
 // that leaves it untouched fails the comparison.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave` comment.
 describe(vec2f, "loads and stores") {
    it("should have method for loading from a float array") {
        float ary[2] = { 1, 2 };
        vec2f x(-1, -1 );
        x.load(ary);
        // octave vec2f: [1,2]
        should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have method for storing to a float array") {
        float ary[2] = { -1, -1 };
        vec2f x(1, 2);
        x.store(ary);
        should_be_close_to(ary[0], 1, epsilon);
        should_be_close_to(ary[1], 2, epsilon);
    }
 }
 // Specs for component-wise vec2f arithmetic against another vec2f: the binary
 // operators + - * / and their compound-assignment forms += -= *= /=.
 // The result is always held in `x`, the name the generated assertions refer to.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec2f, "arithmetic with another vec2f") {
    it("should have operator+ for component-wise addition") {
        vec2f a(1,2);
        vec2f b(10,20);
        vec2f x = a + b;
        // octave vec2f: [1,2] + [10,20]
        should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator- for component-wise subtraction") {
        vec2f a(1,2);
        vec2f b(10,20);
        vec2f x = b - a;
        // octave vec2f:  [10,20] - [1,2]
        should_be_equal_vec2f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator* for component-wise multiplication") {
        vec2f a(1,2);
        vec2f b(10,20);
        vec2f x = a * b;
        // octave vec2f: [1,2] .* [10,20]
        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator/ for component-wise division") {
        vec2f a(1,2);
        vec2f b(10,20);
        vec2f x = b / a;
        // octave vec2f:  [10,20] ./ [1,2]
        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator+= for component-wise addition") {
        vec2f x(1,2);
        vec2f b(10,20);
        x += b;
        // octave vec2f: [1,2] + [10,20]
        should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator-= for component-wise subtraction") {
        vec2f a(1,2);
        vec2f x(10,20);
        x -= a;
        // octave vec2f:  [10,20] - [1,2]
        should_be_equal_vec2f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator*= for component-wise multiplication") {
        vec2f x(1,2);
        vec2f b(10,20);
        x *= b;
        // octave vec2f: [1,2] .* [10,20]
        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator/= for component-wise division") {
        vec2f a(1,2);
        vec2f x(10,20);
        x /= a;
        // octave vec2f:  [10,20] ./ [1,2]
        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
    }
 }
 // Specs for component-wise vec2f arithmetic against a float scalar. The first four
 // specs put the vector on the left-hand side, the last four put the float there.
 // The result is always held in `x`, the name the generated assertions refer to.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec2f, "arithmetic with scalar") {
    it("should have operator+ for component-wise addition") {
        vec2f a(1,2);
        float b=10;
        vec2f x = a + b;
        // octave vec2f: [1,2] + 10
        should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator- for component-wise subtraction") {
        float a=10;
        vec2f b(10,20);
        vec2f x = b - a;
        // octave vec2f:  [10,20] - 10
        should_be_equal_vec2f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator* for component-wise multiplication") {
        vec2f a(1,2);
        float b=10;
        vec2f x = a * b;
        // octave vec2f: [1,2] .* 10
        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator/ for component-wise division") {
        vec2f a(10,20);
        float b=10;
        vec2f x = a / b;
        // octave vec2f: [10,20] ./ 10
        should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator+ for component-wise addition (float as lhs)") {
        vec2f b(1,2);
        float a=10;
        vec2f x = a + b;
        // octave vec2f: 10 + [1,2]
        should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator- for component-wise subtraction (float as lhs)") {
        float b=50;
        vec2f a(10,20);
        vec2f x = b - a;
        // octave vec2f:  50 - [10,20]
        should_be_equal_vec2f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have operator* for component-wise multiplication (float as lhs)") {
        vec2f b(1,2);
        float a=10;
        vec2f x = a * b;
        // octave vec2f: 10 .* [1,2] 
        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    // This spec exercises float / vec2f; its description names operator/ accordingly
    // so it is distinguishable from the multiplication spec above.
    it("should have operator/ for component-wise division (float as lhs)") {
        vec2f b(10,20);
        float a=40;
        vec2f x = a / b;
        // octave vec2f: 40 ./ [10,20] 
        should_be_equal_vec2f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
    }
 }
 // Specs for vec2f vector math: unary minus plus the free functions
 // vectorial::dot, length_squared, length and normalize.
 // Scalar results are checked with should_be_close_to, vector results with
 // should_be_equal_vec2f.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec2f, "vector math") {
    it("should have unary minus operator") {
        vec2f a(1,2);
        vec2f x = -a;
        // octave vec2f: -[1,2]
        should_be_equal_vec2f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, 0.0f, 0.0f), epsilon );
    }
    it("should have dot function") {
        vec2f a(1,2);
        vec2f b(6,7);
        float x = vectorial::dot(a,b);
        // octave vec2f: dot([1,2],[6,7])
        should_be_close_to(x, 20.000000000000000f, epsilon );
    }
    it("should have length_squared function") {
        vec2f a(1,2);
        float x = vectorial::length_squared(a);
        // octave vec2f: dot([1,2],[1,2])
        should_be_close_to(x, 5.000000000000000f, epsilon );
    }
    it("should have length function") {
        vec2f a(1,2);
        float x = vectorial::length(a);
        // octave vec2f: norm([1,2])
        should_be_close_to(x, 2.236067977499790f, epsilon );
    }
    it("should have normalize function") {
        vec2f a(1,2);
        vec2f x = vectorial::normalize(a);
        // octave vec2f: [1,2] / norm([1,2])
        should_be_equal_vec2f(x, simd4f_create(0.447213595499958f, 0.894427190999916f, 0.0f, 0.0f), epsilon );
    }
 }
--- a/3rdparty/vectorial/spec/spec_vec3f.cpp
+++ b/3rdparty/vectorial/spec/spec_vec3f.cpp
@ -0,0 +1,263 @@
 #include "spec_helper.h"
 #include <iostream>
 using vectorial::vec3f;
 // Tolerance passed to the should_be_* helpers by every spec in this file.
 // Assertion lines generated by tools/spechelper.m refer to this name literally.
 // NOTE(review): the unit of the tolerance is defined in spec_helper.h (not shown here).
 const int epsilon = 1;
 // Specs for the vec3f constructors: default, per-element, and from a float array.
 // Expected values are written as a simd4f whose last lane is 0.0f.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec3f, "constructing") {
    // Only checks that default construction compiles and runs; nothing is asserted.
    it("should have default constructor that does nothing..") {
        vec3f x;
    }
    it("should have constructor with element values") {
        vec3f x(10,20,30);
        // octave vec3f: [10,20,30]
        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
    }
    it("should have constructor that loads from a float array") {
        float ary[3] = { 1,2,3 };
        vec3f x(ary);
        // octave vec3f: [1,2,3]
        should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
    }
 }
 // Specs for vec3f::load and vec3f::store with a three-element float array.
 // The destination (vector or array) is first filled with -1 so that an operation
 // that leaves it untouched fails the comparison.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave` comment.
 describe(vec3f, "loads and stores") {
    it("should have method for loading from a float array") {
        float ary[3] = { 1,2,3 };
        vec3f x(-1, -1, -1 );
        x.load(ary);
        // octave vec3f: [1,2,3]
        should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
    }
    it("should have method for storing to a float array") {
        float ary[3] = { -1, -1, -1 };
        vec3f x(1, 2, 3);
        x.store(ary);
        should_be_close_to(ary[0], 1, epsilon);
        should_be_close_to(ary[1], 2, epsilon);
        should_be_close_to(ary[2], 3, epsilon);
    }
 }
 // Specs for component-wise vec3f arithmetic against another vec3f: the binary
 // operators + - * / and their compound-assignment forms += -= *= /=.
 // The result is always held in `x`, the name the generated assertions refer to.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec3f, "arithmetic with another vec3f") {
    it("should have operator+ for component-wise addition") {
        vec3f a(1,2,3);
        vec3f b(10,20,30);
        vec3f x = a + b;
        // octave vec3f: [1,2,3] + [10,20,30]
        should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator- for component-wise subtraction") {
        vec3f a(1,2,3);
        vec3f b(10,20,30);
        vec3f x = b - a;
        // octave vec3f:  [10,20,30] - [1,2,3]
        should_be_equal_vec3f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator* for component-wise multiplication") {
        vec3f a(1,2,3);
        vec3f b(10,20,30);
        vec3f x = a * b;
        // octave vec3f: [1,2,3] .* [10,20,30]
        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator/ for component-wise division") {
        vec3f a(1,2,3);
        vec3f b(10,20,30);
        vec3f x = b / a;
        // octave vec3f:  [10,20,30] ./ [1,2,3]
        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator+= for component-wise addition") {
        vec3f x(1,2,3);
        vec3f b(10,20,30);
        x += b;
        // octave vec3f: [1,2,3] + [10,20,30]
        should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator-= for component-wise subtraction") {
        vec3f a(1,2,3);
        vec3f x(10,20,30);
        x -= a;
        // octave vec3f:  [10,20,30] - [1,2,3]
        should_be_equal_vec3f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator*= for component-wise multiplication") {
        vec3f x(1,2,3);
        vec3f b(10,20,30);
        x *= b;
        // octave vec3f: [1,2,3] .* [10,20,30]
        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator/= for component-wise division") {
        vec3f a(1,2,3);
        vec3f x(10,20,30);
        x /= a;
        // octave vec3f:  [10,20,30] ./ [1,2,3]
        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 0.0f), epsilon );
    }
 }
 // Specs for component-wise vec3f arithmetic against a float scalar. The first four
 // specs put the vector on the left-hand side, the last four put the float there.
 // The result is always held in `x`, the name the generated assertions refer to.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec3f, "arithmetic with scalar") {
    it("should have operator+ for component-wise addition") {
        vec3f a(1,2,3);
        float b=10;
        vec3f x = a + b;
        // octave vec3f: [1,2,3] + 10
        should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator- for component-wise subtraction") {
        float a=10;
        vec3f b(10,20,30);
        vec3f x = b - a;
        // octave vec3f:  [10,20,30] - 10
        should_be_equal_vec3f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 20.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator* for component-wise multiplication") {
        vec3f a(1,2,3);
        float b=10;
        vec3f x = a * b;
        // octave vec3f: [1,2,3] .* 10
        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator/ for component-wise division") {
        vec3f a(10,20,30);
        float b=10;
        vec3f x = a / b;
        // octave vec3f: [10,20,30] ./ 10
        should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator+ for component-wise addition (float as lhs)") {
        vec3f b(1,2,3);
        float a=10;
        vec3f x = a + b;
        // octave vec3f: 10 + [1,2,3]
        should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator- for component-wise subtraction (float as lhs)") {
        float b=50;
        vec3f a(10,20,30);
        vec3f x = b - a;
        // octave vec3f:  50 - [10,20,30]
        should_be_equal_vec3f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 20.000000000000000f, 0.0f), epsilon );
    }
    it("should have operator* for component-wise multiplication (float as lhs)") {
        vec3f b(1,2,3);
        float a=10;
        vec3f x = a * b;
        // octave vec3f: 10 .* [1,2,3] 
        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
    }
    // This spec exercises float / vec3f; its description names operator/ accordingly
    // so it is distinguishable from the multiplication spec above.
    it("should have operator/ for component-wise division (float as lhs)") {
        vec3f b(10,20,30);
        float a=40;
        vec3f x = a / b;
        // octave vec3f: 40 ./ [10,20,30] 
        should_be_equal_vec3f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 1.333333333333333f, 0.0f), epsilon );
    }
 }
 // Specs for vec3f vector math: unary minus plus the free functions
 // vectorial::dot, cross, length_squared, length and normalize.
 // Scalar results are checked with should_be_close_to, vector results with
 // should_be_equal_vec3f.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec3f, "vector math") {
    it("should have unary minus operator") {
        vec3f a(1,2,3);
        vec3f x = -a;
        // octave vec3f: -[1,2,3]
        should_be_equal_vec3f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, -3.000000000000000f, 0.0f), epsilon );
    }
    it("should have dot function") {
        vec3f a(1,2,3);
        vec3f b(6,7,8);
        float x = vectorial::dot(a,b);
        // octave vec3f: dot([1,2,3],[6,7,8])
        should_be_close_to(x, 44.000000000000000f, epsilon );
    }
    it("should have cross function") {
        vec3f a(1,2,3);
        vec3f b(6,7,8);
        vec3f x = vectorial::cross(a,b);
        // octave vec3f: cross([1,2,3],[6,7,8])
        should_be_equal_vec3f(x, simd4f_create(-5.000000000000000f, 10.000000000000000f, -5.000000000000000f, 0.0f), epsilon );
    }
    it("should have length_squared function") {
        vec3f a(1,2,3);
        float x = vectorial::length_squared(a);
        // octave vec3f: dot([1,2,3],[1,2,3])
        should_be_close_to(x, 14.000000000000000f, epsilon );
    }
    it("should have length function") {
        vec3f a(1,2,3);
        float x = vectorial::length(a);
        // octave vec3f: norm([1,2,3])
        should_be_close_to(x, 3.741657386773941f, epsilon );
    }
    it("should have normalize function") {
        vec3f a(1,2,3);
        vec3f x = vectorial::normalize(a);
        // octave vec3f: [1,2,3] / norm([1,2,3])
        should_be_equal_vec3f(x, simd4f_create(0.267261241912424f, 0.534522483824849f, 0.801783725737273f, 0.0f), epsilon );
    }
 }
--- a/3rdparty/vectorial/spec/spec_vec4f.cpp
+++ b/3rdparty/vectorial/spec/spec_vec4f.cpp
@ -0,0 +1,258 @@
 #include "spec_helper.h"
 #include <iostream>
 using vectorial::vec4f;
 // Tolerance passed to the should_be_* helpers by every spec in this file.
 // Assertion lines generated by tools/spechelper.m refer to this name literally.
 // NOTE(review): the unit of the tolerance is defined in spec_helper.h (not shown here).
 const int epsilon = 1;
 // Specs for the vec4f constructors: default, per-element, and from a float array.
 // Expected values are written as a simd4f with all four lanes populated.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec4f, "constructing") {
    // Only checks that default construction compiles and runs; nothing is asserted.
    it("should have default constructor that does nothing..") {
        vec4f x;
    }
    it("should have constructor with element values") {
        vec4f x(10,20,30,40);
        // octave vec4f: [10,20,30,40]
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
    }
    it("should have constructor that loads from a float array") {
        float ary[4] = { 1,2,3,4 };
        vec4f x(ary);
        // octave vec4f: [1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
    }
 }
 // Specs for vec4f::load and vec4f::store with a four-element float array.
 // The destination (vector or array) is first filled with -1 so that an operation
 // that leaves it untouched fails the comparison.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave` comment.
 describe(vec4f, "loads and stores") {
    it("should have method for loading from a float array") {
        float ary[4] = { 1,2,3,4 };
        vec4f x(-1, -1, -1, -1);
        x.load(ary);
        // octave vec4f: [1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
    }
    it("should have method for storing to a float array") {
        float ary[4] = { -1, -1, -1, -1 };
        vec4f x(1, 2, 3, 4);
        x.store(ary);
        should_be_close_to(ary[0], 1, epsilon);
        should_be_close_to(ary[1], 2, epsilon);
        should_be_close_to(ary[2], 3, epsilon);
        should_be_close_to(ary[3], 4, epsilon);
    }
 }
 // Specs for component-wise vec4f arithmetic against another vec4f: the binary
 // operators + - * / and their compound-assignment forms += -= *= /=.
 // The result is always held in `x`, the name the generated assertions refer to.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec4f, "arithmetic with another vec4f") {
    it("should have operator+ for component-wise addition") {
        vec4f a(1,2,3,4);
        vec4f b(10,20,30,40);
        vec4f x = a + b;
        // octave vec4f: [1,2,3,4] + [10,20,30,40]
        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon );
    }
    it("should have operator- for component-wise subtraction") {
        vec4f a(1,2,3,4);
        vec4f b(10,20,30,40);
        vec4f x = b - a;
        // octave vec4f:  [10,20,30,40] - [1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
    }
    it("should have operator* for component-wise multiplication") {
        vec4f a(1,2,3,4);
        vec4f b(10,20,30,40);
        vec4f x = a * b;
        // octave vec4f: [1,2,3,4] .* [10,20,30,40]
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
    }
    it("should have operator/ for component-wise division") {
        vec4f a(1,2,3,4);
        vec4f b(10,20,30,40);
        vec4f x = b / a;
        // octave vec4f:  [10,20,30,40] ./ [1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
    }
    it("should have operator+= for component-wise addition") {
        vec4f x(1,2,3,4);
        vec4f b(10,20,30,40);
        x += b;
        // octave vec4f: [1,2,3,4] + [10,20,30,40]
        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon );
    }
    it("should have operator-= for component-wise subtraction") {
        vec4f a(1,2,3,4);
        vec4f x(10,20,30,40);
        x -= a;
        // octave vec4f:  [10,20,30,40] - [1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
    }
    it("should have operator*= for component-wise multiplication") {
        vec4f x(1,2,3,4);
        vec4f b(10,20,30,40);
        x *= b;
        // octave vec4f: [1,2,3,4] .* [10,20,30,40]
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
    }
    it("should have operator/= for component-wise division") {
        vec4f a(1,2,3,4);
        vec4f x(10,20,30,40);
        x /= a;
        // octave vec4f:  [10,20,30,40] ./ [1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
    }
 }
 // Specs for component-wise vec4f arithmetic against a float scalar. The first four
 // specs put the vector on the left-hand side, the last four put the float there.
 // The result is always held in `x`, the name the generated assertions refer to.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec4f, "arithmetic with scalar") {
    it("should have operator+ for component-wise addition") {
        vec4f a(1,2,3,4);
        float b=10;
        vec4f x = a + b;
        // octave vec4f: [1,2,3,4] + 10
        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 14.000000000000000f), epsilon );
    }
    it("should have operator- for component-wise subtraction") {
        float a=10;
        vec4f b(10,20,30,40);
        vec4f x = b - a;
        // octave vec4f:  [10,20,30,40] - 10
        should_be_equal_vec4f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 20.000000000000000f, 30.000000000000000f), epsilon );
    }
    it("should have operator* for component-wise multiplication") {
        vec4f a(1,2,3,4);
        float b=10;
        vec4f x = a * b;
        // octave vec4f: [1,2,3,4] .* 10
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
    }
    it("should have operator/ for component-wise division") {
        vec4f a(10,20,30,40);
        float b=10;
        vec4f x = a / b;
        // octave vec4f: [10,20,30,40] ./ 10
        should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
    }
    it("should have operator+ for component-wise addition (float as lhs)") {
        vec4f b(1,2,3,4);
        float a=10;
        vec4f x = a + b;
        // octave vec4f: 10 + [1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 14.000000000000000f), epsilon );
    }
    it("should have operator- for component-wise subtraction (float as lhs)") {
        float b=50;
        vec4f a(10,20,30,40);
        vec4f x = b - a;
        // octave vec4f:  50 - [10,20,30,40]
        should_be_equal_vec4f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 20.000000000000000f, 10.000000000000000f), epsilon );
    }
    it("should have operator* for component-wise multiplication (float as lhs)") {
        vec4f b(1,2,3,4);
        float a=10;
        vec4f x = a * b;
        // octave vec4f: 10 .* [1,2,3,4] 
        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
    }
    // This spec exercises float / vec4f; its description names operator/ accordingly
    // so it is distinguishable from the multiplication spec above.
    it("should have operator/ for component-wise division (float as lhs)") {
        vec4f b(10,20,30,40);
        float a=40;
        vec4f x = a / b;
        // octave vec4f: 40 ./ [10,20,30,40] 
        should_be_equal_vec4f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 1.333333333333333f, 1.000000000000000f), epsilon );
    }
 }
 // Specs for vec4f vector math: unary minus plus the free functions
 // vectorial::dot, length_squared, length and normalize.
 // Scalar results are checked with should_be_close_to, vector results with
 // should_be_equal_vec4f.
 // NOTE: tools/update_spec.rb rewrites the line directly after each `// octave`
 // comment, so every assertion must stay immediately below its comment.
 describe(vec4f, "vector math") {
    it("should have unary minus operator") {
        vec4f a(1,2,3,4);
        vec4f x = -a;
        // octave vec4f: -[1,2,3,4]
        should_be_equal_vec4f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, -3.000000000000000f, -4.000000000000000f), epsilon );
    }
    it("should have dot function") {
        vec4f a(1,2,3,4);
        vec4f b(6,7,8,9);
        float x = vectorial::dot(a,b);
        // octave vec4f: dot([1,2,3,4],[6,7,8,9])
        should_be_close_to(x, 80.000000000000000f, epsilon );
    }
    it("should have length_squared function") {
        vec4f a(1,2,3,4);
        float x = vectorial::length_squared(a);
        // octave vec4f: dot([1,2,3,4],[1,2,3,4])
        should_be_close_to(x, 30.000000000000000f, epsilon );
    }
    it("should have length function") {
        vec4f a(1,2,3,4);
        float x = vectorial::length(a);
        // octave vec4f: norm([1,2,3,4])
        should_be_close_to(x, 5.477225575051661f, epsilon );
    }
    it("should have normalize function") {
        vec4f a(1,2,3,4);
        vec4f x = vectorial::normalize(a);
        // octave vec4f: [1,2,3,4] / norm([1,2,3,4])
        should_be_equal_vec4f(x, simd4f_create(0.182574185835055f, 0.365148371670111f, 0.547722557505166f, 0.730296743340221f), epsilon );
    }
 }
--- a/3rdparty/vectorial/tools/spechelper.m
+++ b/3rdparty/vectorial/tools/spechelper.m
@ -0,0 +1,45 @@
 #!/usr/bin/env octave
 1;
 # spec_formatter (val, type)
 #
 # Prints (without a trailing newline) the C++ assertion line that checks the
 # variable `x` against VAL, using the tolerance variable `epsilon`.
 #
 #   val  - evaluated Octave value: a scalar, a 1x2, 1x3, 1x4 or 4x1 vector,
 #          or a 4x4 matrix. Any other shape prints nothing.
 #   type - type tag spliced into the helper name should_be_equal_<type>;
 #          "simd2f" additionally selects simd2f_create for 1x2 values.
 #
 # 4x4 values are emitted through linear indices val(1)..val(16), so each
 # simd4f_create(...) in the output holds one column of VAL.
 function spec_formatter (val,type)
    dims = size(val);
    if( isscalar(val) == 1 )
        printf("        should_be_close_to(x, %15.15ff, epsilon );", val);
    elseif( all(dims == [1,2]) )
        # Two-element values: simd2f has its own constructor, every other
        # type is padded to a simd4f with two zero lanes.
        if( strcmp(type,"simd2f") == 1 )
            printf("        should_be_equal_%s(x, simd2f_create(%15.15ff, %15.15ff), epsilon );",type, val(1), val(2));
        else
            printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, 0.0f, 0.0f), epsilon );",type, val(1), val(2));
        endif
    elseif( all(dims == [1,3]) )
        printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, 0.0f), epsilon );",type, val(1), val(2), val(3));
    elseif( all(dims == [1,4]) || all(dims == [4,1]) )
        # Row and column four-vectors produce the same output.
        printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), epsilon );",type, val(1), val(2), val(3), val(4));
    elseif( all(dims == [4,4]) )
        printf("        should_be_equal_%s(x, simd4x4f_create(simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff)), epsilon );",type,
        val(1),  val(2),  val(3),  val(4),
        val(5),  val(6),  val(7),  val(8),
        val(9),  val(10), val(11), val(12),
        val(13), val(14), val(15), val(16)
        );
    endif
 endfunction
--- a/3rdparty/vectorial/tools/update_spec.rb
+++ b/3rdparty/vectorial/tools/update_spec.rb
@ -0,0 +1,24 @@
 #!/usr/bin/env ruby
 # Path to spechelper.m, resolved relative to the directory containing this script.
 SPECHELPER = File.join(File.dirname(__FILE__), "spechelper.m")
 # Evaluates an Octave expression and returns the assertion text generated for it.
 #
 # str  - Octave expression text, spliced verbatim into the evaluated snippet.
 # type - type tag, handed to spec_formatter as a string argument.
 #
 # Runs `octave --quiet --eval` on a snippet that sources SPECHELPER and calls
 # spec_formatter(str, type). Echoes the expression and the stripped result to
 # stdout and returns octave's raw standard output.
 #
 # The command is given to IO.popen as an argument array, so no shell is
 # involved. An expression containing a single quote (Octave's transpose
 # operator) or any shell metacharacter therefore reaches octave unchanged
 # instead of breaking, or injecting into, a shell command line.
 def octave_eval(str, type)
  puts "evalling (#{type}): #{str}"
  code = %Q{source("#{SPECHELPER}"); spec_formatter(#{str}, "#{type}")}
  ret = IO.popen(["octave", "--quiet", "--eval", code], &:read)
  puts "    = #{ret.strip}"
  ret
 end
 # Rewrites every file named on the command line in place. For each
 # `// octave <type>:<expr>` comment, the line that follows it is replaced with
 # the text octave_eval returns for that expression and type. The comment itself
 # is written back unchanged.
 ARGV.each do |path|
  contents = File.read(path)
  rewritten = contents.gsub(%r{(// octave (\w+):)(.*?)\n(.*?\n)}) do
    # Copy the captures into locals before calling out to octave_eval.
    marker, type, expr = $1, $2, $3
    generated = octave_eval(expr, type)
    "#{marker}#{expr}\n#{generated}\n"
  end
  File.write(path, rewritten)
 end
--- a/3rdparty/vectorial/vectorial.sln
+++ b/3rdparty/vectorial/vectorial.sln
@ -0,0 +1,31 @@
 Microsoft Visual Studio Solution File, Format Version 10.00
 # Visual C++ Express 2008
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorial specsuite", "vectorial.vcproj", "{9450BCE8-02CB-4169-8471-2DFF764817F4}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorial benchmark", "vectorialbenchmark.vcproj", "{1E78F64D-C404-4048-8AE6-217089480E8A}"
 EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Win32 = Debug|Win32
 		Release Scalar|Win32 = Release Scalar|Win32
 		Release SSE|Win32 = Release SSE|Win32
 	EndGlobalSection
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Debug|Win32.ActiveCfg = Debug|Win32
 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Debug|Win32.Build.0 = Debug|Win32
 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release Scalar|Win32.ActiveCfg = Release Scalar|Win32
 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release Scalar|Win32.Build.0 = Release Scalar|Win32
 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release SSE|Win32.ActiveCfg = Release|Win32
 		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release SSE|Win32.Build.0 = Release|Win32
 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Debug|Win32.ActiveCfg = Debug|Win32
 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Debug|Win32.Build.0 = Debug|Win32
 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release Scalar|Win32.ActiveCfg = Release Scalar|Win32
 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release Scalar|Win32.Build.0 = Release Scalar|Win32
 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release SSE|Win32.ActiveCfg = Release|Win32
 		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release SSE|Win32.Build.0 = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
 	EndGlobalSection
 EndGlobal
--- a/3rdparty/vectorial/vectorial.vcproj
+++ b/3rdparty/vectorial/vectorial.vcproj
@ -0,0 +1,350 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <VisualStudioProject
 	ProjectType="Visual C++"
 	Version="9,00"
 	Name="vectorial specsuite"
 	ProjectGUID="{9450BCE8-02CB-4169-8471-2DFF764817F4}"
 	RootNamespace="vectorial specsuite"
 	Keyword="Win32Proj"
 	TargetFrameworkVersion="0"
 	>
 	<Platforms>
 		<Platform
 			Name="Win32"
 		/>
 	</Platforms>
 	<ToolFiles>
 	</ToolFiles>
 	<Configurations>
 		<Configuration
 			Name="Debug|Win32"
 			OutputDirectory="Debug"
 			IntermediateDirectory="Debug"
 			ConfigurationType="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="include"
 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
 				FloatingPointModel="0"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				Detect64BitPortabilityProblems="false"
 				DebugInformationFormat="4"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLinkerTool"
 				LinkIncremental="2"
 				GenerateDebugInformation="true"
 				SubSystem="1"
 				TargetMachine="1"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCManifestTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCAppVerifierTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 		<Configuration
 			Name="Release|Win32"
 			OutputDirectory="Release"
 			IntermediateDirectory="Release"
 			ConfigurationType="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				AdditionalIncludeDirectories="include"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;NOMINMAX"
 				RuntimeLibrary="2"
 				EnableEnhancedInstructionSet="2"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				Detect64BitPortabilityProblems="false"
 				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLinkerTool"
 				LinkIncremental="0"
 				GenerateDebugInformation="true"
 				SubSystem="1"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"
 				TargetMachine="1"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCManifestTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCAppVerifierTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 		<Configuration
 			Name="Release Scalar|Win32"
 			OutputDirectory="$(ConfigurationName)"
 			IntermediateDirectory="$(ConfigurationName)"
 			ConfigurationType="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				AdditionalIncludeDirectories="include"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;"
 				RuntimeLibrary="2"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				Detect64BitPortabilityProblems="false"
 				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLinkerTool"
 				LinkIncremental="0"
 				GenerateDebugInformation="true"
 				SubSystem="1"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"
 				TargetMachine="1"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCManifestTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCAppVerifierTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 	</Configurations>
 	<References>
 	</References>
 	<Files>
 		<Filter
 			Name="vectorial"
 			>
 			<File
 				RelativePath=".\include\vectorial\config.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_common.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_gnu.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_neon.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_scalar.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_sse.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f_gnu.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f_neon.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f_scalar.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f_sse.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\vec2f.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\vec3f.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\vec4f.h"
 				>
 			</File>
 		</Filter>
 		<Filter
 			Name="spec"
 			>
 			<File
 				RelativePath=".\spec\spec.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec.h"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec_helper.h"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec_main.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec_mat4f.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec_simd4f.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec_simd4x4f.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec_vec2f.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec_vec3f.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\spec\spec_vec4f.cpp"
 				>
 			</File>
 		</Filter>
 	</Files>
 	<Globals>
 	</Globals>
 </VisualStudioProject>
--- a/3rdparty/vectorial/vectorialbenchmark.vcproj
+++ b/3rdparty/vectorial/vectorialbenchmark.vcproj
@ -0,0 +1,340 @@
 <?xml version="1.0" encoding="Windows-1252"?>
 <VisualStudioProject
 	ProjectType="Visual C++"
 	Version="9,00"
 	Name="vectorial benchmark"
 	ProjectGUID="{1E78F64D-C404-4048-8AE6-217089480E8A}"
 	RootNamespace="vectorialbenchmark"
 	Keyword="Win32Proj"
 	TargetFrameworkVersion="196613"
 	>
 	<Platforms>
 		<Platform
 			Name="Win32"
 		/>
 	</Platforms>
 	<ToolFiles>
 	</ToolFiles>
 	<Configurations>
 		<Configuration
 			Name="Debug|Win32"
 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 			IntermediateDirectory="$(ConfigurationName)"
 			ConfigurationType="1"
 			CharacterSet="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="0"
 				AdditionalIncludeDirectories="include"
 				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
 				MinimalRebuild="true"
 				BasicRuntimeChecks="3"
 				RuntimeLibrary="3"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="4"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLinkerTool"
 				LinkIncremental="2"
 				GenerateDebugInformation="true"
 				SubSystem="1"
 				TargetMachine="1"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCManifestTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCAppVerifierTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 		<Configuration
 			Name="Release|Win32"
 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 			IntermediateDirectory="$(ConfigurationName)"
 			ConfigurationType="1"
 			CharacterSet="1"
 			WholeProgramOptimization="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				EnableIntrinsicFunctions="true"
 				AdditionalIncludeDirectories="include"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;NOMINMAX"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="false"
 				EnableEnhancedInstructionSet="2"
 				FloatingPointModel="2"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLinkerTool"
 				LinkIncremental="1"
 				GenerateDebugInformation="true"
 				SubSystem="1"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"
 				TargetMachine="1"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCManifestTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCAppVerifierTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 		<Configuration
 			Name="Release Scalar|Win32"
 			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
 			IntermediateDirectory="$(ConfigurationName)"
 			ConfigurationType="1"
 			CharacterSet="1"
 			WholeProgramOptimization="1"
 			>
 			<Tool
 				Name="VCPreBuildEventTool"
 			/>
 			<Tool
 				Name="VCCustomBuildTool"
 			/>
 			<Tool
 				Name="VCXMLDataGeneratorTool"
 			/>
 			<Tool
 				Name="VCWebServiceProxyGeneratorTool"
 			/>
 			<Tool
 				Name="VCMIDLTool"
 			/>
 			<Tool
 				Name="VCCLCompilerTool"
 				Optimization="2"
 				EnableIntrinsicFunctions="true"
 				AdditionalIncludeDirectories="include"
 				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
 				RuntimeLibrary="2"
 				EnableFunctionLevelLinking="false"
 				EnableEnhancedInstructionSet="0"
 				FloatingPointModel="2"
 				UsePrecompiledHeader="0"
 				WarningLevel="3"
 				DebugInformationFormat="3"
 			/>
 			<Tool
 				Name="VCManagedResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCResourceCompilerTool"
 			/>
 			<Tool
 				Name="VCPreLinkEventTool"
 			/>
 			<Tool
 				Name="VCLinkerTool"
 				LinkIncremental="1"
 				GenerateDebugInformation="true"
 				SubSystem="1"
 				OptimizeReferences="2"
 				EnableCOMDATFolding="2"
 				TargetMachine="1"
 			/>
 			<Tool
 				Name="VCALinkTool"
 			/>
 			<Tool
 				Name="VCManifestTool"
 			/>
 			<Tool
 				Name="VCXDCMakeTool"
 			/>
 			<Tool
 				Name="VCBscMakeTool"
 			/>
 			<Tool
 				Name="VCFxCopTool"
 			/>
 			<Tool
 				Name="VCAppVerifierTool"
 			/>
 			<Tool
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
 	</Configurations>
 	<References>
 	</References>
 	<Files>
 		<Filter
 			Name="vectorial"
 			>
 			<File
 				RelativePath=".\include\vectorial\config.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_common.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_gnu.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_neon.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_scalar.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4f_sse.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f_gnu.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f_neon.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f_scalar.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\simd4x4f_sse.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\vec2f.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\vec3f.h"
 				>
 			</File>
 			<File
 				RelativePath=".\include\vectorial\vec4f.h"
 				>
 			</File>
 		</Filter>
 		<Filter
 			Name="bench"
 			>
 			<File
 				RelativePath=".\bench\add_bench.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\bench\bench.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\bench\bench.h"
 				>
 			</File>
 			<File
 				RelativePath=".\bench\dot_bench.cpp"
 				>
 			</File>
 			<File
 				RelativePath=".\bench\quad_bench.cpp"
 				>
 			</File>
 		</Filter>
 	</Files>
 	<Globals>
 	</Globals>
 </VisualStudioProject>
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -34,14 +34,16 @@ target_include_directories(
        AnimTestbed
        PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
        PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/glfw/deps>
        PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/Handmade-Math>
        PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/imgui>
        PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/sokol>
-		PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/Handmade-Math>
+        PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vectorial/include>
 )
 target_link_libraries(AnimTestbed glfw ozz_base ozz_geometry ozz_animation ${OPENGL_LIBRARIES})
 target_sources(AnimTestbed PRIVATE
        src/main.cc
        src/Camera.c
        3rdparty/glfw/deps/glad_gl.c
        3rdparty/imgui/imgui.cpp
        3rdparty/imgui/imgui_draw.cpp
--- a/src/Camera.c
+++ b/src/Camera.c
@ -0,0 +1,158 @@
 #include "Camera.h"
 #include <assert.h>
 #include "string.h"
 #include "vectorial/simd4x4f.h"
 // Resets `camera` to its default state.
 //
 // Sets the clip planes (near 0.01, far 1000), a 90 degree field of view
 // (stored in radians), the start position (2, 1.2, 2), zero velocity, a
 // heading of -45 degrees and a pitch of 10 degrees (both stored in radians).
 // The view matrix and the forward/right axes are then derived from those
 // values, so the axis values assigned below only act as placeholders.
 void Camera_Init(Camera* camera) {
  // clang-format off
  static const float mtx_identity[16] = {
      1.f, 0.f, 0.f, 0.f,
      0.f, 1.f, 0.f, 0.f,
      0.f, 0.f, 1.f, 0.f,
      0.f, 0.f, 0.f, 1.f
  };
  // clang-format on
  camera->near = 0.01;
  camera->far = 1000.0;
  camera->fov = 90 * M_PI / 180.f;
  camera->forward[0] = -1.f;
  camera->forward[1] = 0.f;
  camera->forward[2] = -1.f;
  camera->right[0] = 1.f;
  camera->right[1] = 0.f;
  camera->right[2] = 0.f;
  camera->up[0] = 0.f;
  camera->up[1] = 1.f;
  camera->up[2] = 0.f;
  camera->pos[0] = 2.f;
  camera->pos[1] = 1.2f;
  camera->pos[2] = 2.f;
  camera->vel[0] = 0.f;
  camera->vel[1] = 0.f;
  camera->vel[2] = 0.f;
  camera->heading = -45.0 * M_PI / 180.0f;
  camera->pitch = 10 * M_PI / 180.0f;
  // Pass the array itself (it decays to float*). `&camera->mtxView` has type
  // float (*)[16], which is not compatible with the float* parameters.
  memcpy(camera->mtxView, mtx_identity, sizeof(camera->mtxView));
  Camera_CalcToMatrix(camera, camera->mtxView);
  Camera_CalcFromMatrix(camera, camera->mtxView);
 }
 // Derives the camera's axes, angles and position from the view matrix `mat`
 // (16 floats, loaded as four consecutive simd4f columns x, y, z, w).
 //
 // Writes camera->right, camera->forward, camera->heading, camera->pitch and
 // camera->pos. camera->up and camera->vel are left untouched.
 void Camera_CalcFromMatrix(Camera* camera, float* mat) {
  simd4x4f view;
  simd4x4f_uload(&view, mat);

  // Element 0 of the x/y/z columns forms the right axis, element 2 the
  // forward axis.
  camera->right[0] = view.x[0];
  camera->right[1] = view.y[0];
  camera->right[2] = view.z[0];
  camera->forward[0] = view.x[2];
  camera->forward[1] = view.y[2];
  camera->forward[2] = view.z[2];

  // Recover the angles from the forward axis.
  camera->pitch = asin(camera->forward[1]);
  camera->heading = atan2(-camera->forward[2], camera->forward[0]);

  // Position: strip the translation column from a copy of the matrix,
  // transpose what remains, apply it to the original translation column and
  // negate the result.
  simd4x4f rotation = view;
  rotation.w = simd4f_create(0.f, 0.f, 0.f, 1.f);
  simd4x4f_transpose_inplace(&rotation);

  simd4f rotated_translation;
  simd4x4f_matrix_point3_mul(&rotation, &view.w, &rotated_translation);

  camera->pos[0] = -simd4f_get_x(rotated_translation);
  camera->pos[1] = -simd4f_get_y(rotated_translation);
  camera->pos[2] = -simd4f_get_z(rotated_translation);
 }
 // Builds a look-at view matrix from camera->pos, camera->heading and
 // camera->pitch (both angles in radians) and stores it in `mat` (16 floats).
 //
 // Also refreshes camera->forward (the negated viewing direction) and
 // camera->right. camera->up is not modified.
 void Camera_CalcToMatrix(Camera* camera, float* mat) {
  float sp = sin(camera->pitch);
  float cp = cos(camera->pitch);
  float ch = cos(camera->heading);
  float sh = sin(camera->heading);
  // Distance from the eye to the look-at target. Only the direction matters
  // for the resulting matrix.
  const float d = 10.0f;
  simd4f eye = simd4f_create (camera->pos[0], camera->pos[1], camera->pos[2], 1.f);
  simd4f forward = simd4f_create (-cp * ch, -sp, cp * sh, 0.f);
  // forward x world-up has length |cos(pitch)|. Normalise it so the stored
  // right axis is unit length regardless of pitch (otherwise strafing slows
  // down as the camera looks up or down), matching the unit-length axis that
  // Camera_CalcFromMatrix reads back from the view matrix.
  simd4f right = simd4f_normalize3(
      simd4f_cross3 (forward, simd4f_create (0.f, 1.f, 0.f, 1.f)));
  simd4f up = simd4f_cross3(right, forward);
  simd4f center = simd4f_add(simd4f_mul(forward, simd4f_splat(d)), eye);
  camera->forward[0] = -simd4f_get_x(forward);
  camera->forward[1] = -simd4f_get_y(forward);
  camera->forward[2] = -simd4f_get_z(forward);
  camera->right[0] = simd4f_get_x(right);
  camera->right[1] = simd4f_get_y(right);
  camera->right[2] = simd4f_get_z(right);
  simd4x4f mtx;
  simd4x4f_lookat(&mtx, eye, center, up);
  simd4f_ustore4(mtx.x, mat);
  simd4f_ustore4(mtx.y, mat +4);
  simd4f_ustore4(mtx.z, mat +8);
  simd4f_ustore4(mtx.w, mat +12);
 }
 // Advances the camera by one frame.
 //
 // Always recomputes camera->mtxProj from the field of view, the aspect ratio
 // width/height and the near/far planes. If there is any input (a non-zero
 // mouse delta or a non-NULL `accel`), it also applies the mouse deltas to
 // heading/pitch, integrates the velocity and position, and rebuilds
 // camera->mtxView.
 //
 //   camera    camera to update; must not be NULL.
 //   width     viewport width, must be > 0.
 //   height    viewport height, must be > 0.
 //   dt        time step used to scale rotation and movement.
 //   mouse_dx  horizontal mouse delta; changes the heading.
 //   mouse_dy  vertical mouse delta; changes the pitch.
 //   accel     acceleration along {forward, up, right}, or NULL for none.
 void Camera_Update(
    Camera* camera,
    int width,
    int height,
    float dt,
    float mouse_dx,
    float mouse_dy,
    float accel[3]) {
  // Stand-in used when the caller passes no acceleration. Without it a call
  // with accel == NULL and a non-zero mouse delta would dereference NULL.
  static const float no_accel[3] = {0.f, 0.f, 0.f};

  assert(camera);
  assert((width > 0) && (height > 0));
  const float w = (float) width;
  const float h = (float) height;
  simd4x4f proj;
  simd4x4f_perspective(&proj, camera->fov, w/h, camera->near, camera->far);
  simd4f_ustore4(proj.x, camera->mtxProj);
  simd4f_ustore4(proj.y, camera->mtxProj +4);
  simd4f_ustore4(proj.z, camera->mtxProj +8);
  simd4f_ustore4(proj.w, camera->mtxProj +12);
  if (mouse_dx != 0.f || mouse_dy != 0.f || accel != NULL) {
    const float* applied_accel = (accel != NULL) ? accel : no_accel;
    const float mouse_sensitivity = 20.0f;
    camera->heading -= dt * mouse_dx * mouse_sensitivity * M_PI / 180.f;
    // Keep the heading wrapped into [-pi, pi].
    if (camera->heading < -M_PI) {
      camera->heading += M_PI * 2.f;
    } else if (camera->heading > M_PI) {
      camera->heading -= M_PI * 2.f;
    }
    camera->pitch += dt * mouse_dy * mouse_sensitivity * M_PI / 180.f;
    // Clamp the pitch just short of +/- 90 degrees.
    if (camera->pitch < -M_PI * 0.49) {
      camera->pitch = -M_PI * 0.49;
    } else if (camera->pitch > M_PI * 0.49) {
      camera->pitch = M_PI * 0.49;
    }
    for (int i = 0; i < 3; i++) {
      camera->vel[i] += dt * applied_accel[0] * camera->forward[i]
                        + dt * applied_accel[2] * camera->right[i]
                        + dt * applied_accel[1] * camera->up[i];
      camera->pos[i] += dt * camera->vel[i];
      // Strong damping: only 10% of the velocity survives each update.
      camera->vel[i] = camera->vel[i] * 0.1;
    }
    // Pass the array itself (decays to float*); `&camera->mtxView` would be a
    // float (*)[16], which is incompatible with the float* parameter.
    Camera_CalcToMatrix(camera, camera->mtxView);
  }
 }
--- a/src/Camera.h
+++ b/src/Camera.h
@ -0,0 +1,43 @@
 //
 // Created by martin on 19.10.21.
 //
 #ifndef RBDLSIM_RENDER_UTILS_H
 #define RBDLSIM_RENDER_UTILS_H
 #ifdef __cplusplus
 extern "C" {
 #endif
 // State of a free-moving camera driven by a heading and a pitch angle.
 typedef struct {
  // Projection matrix (16 floats); rewritten by every Camera_Update call.
  float mtxProj[16];
  // View matrix (16 floats); written by Camera_CalcToMatrix.
  float mtxView[16];
  // Near and far clip distances passed to the perspective projection.
  float near;
  float far;
  // Field of view in radians (Camera_Init stores 90 degrees converted to radians).
  float fov;
  // Heading and pitch angles in radians.
  float heading;
  float pitch;
  // Current velocity; integrated and damped in Camera_Update.
  float vel[3];
  // Camera (eye) position.
  float pos[3];
  // Negated viewing direction, as written by Camera_CalcToMatrix.
  float forward[3];
  // Right axis, as written by Camera_CalcToMatrix / Camera_CalcFromMatrix.
  float right[3];
  // Up axis used for vertical movement; only set by Camera_Init, to (0, 1, 0).
  float up[3];
 } Camera;
 // Resets the camera to its default position, angles and clip planes and
 // computes the matching view matrix.
 void Camera_Init(Camera* camera);
 // Recomputes forward/right, heading, pitch and pos of `camera` from the view
 // matrix `mtx` (16 floats).
 void Camera_CalcFromMatrix(Camera* camera, float* mtx);
 // Writes the view matrix for the camera's pos, heading and pitch into `mtx`
 // (16 floats) and refreshes the camera's forward/right axes.
 void Camera_CalcToMatrix(Camera* camera, float* mtx);
 // Recomputes the projection matrix for a width x height viewport and, if
 // there is any input, applies the mouse deltas and acceleration over the
 // time step `dt` and rebuilds the view matrix.
 // `accel` holds the acceleration along {forward, up, right}; pass NULL only
 // when both mouse deltas are zero.
 void Camera_Update(
    Camera* camera,
    int width,
    int height,
    float dt,
    float mouse_dx,
    float mouse_dy,
    float accel[3]);
 #ifdef __cplusplus
 }
 #endif
 #endif  //RBDLSIM_RENDER_UTILS_H
--- a/src/main.cc
+++ b/src/main.cc
@ -14,6 +14,7 @@
 #define GLFW_INCLUDE_NONE
 #include <iostream>
 #include "Camera.h"
 #include "GLFW/glfw3.h"
 const int Width = 1024;
@ -68,7 +69,7 @@ typedef struct {
 static struct {
  std::unique_ptr<ozz_t> ozz;
  sg_pass_action pass_action;
-  camera_t camera;
+  Camera camera;
  struct {
    bool skeleton;
    bool animation;
@ -85,6 +86,25 @@ static struct {
  } time;
 } state;
 // Mouse/keyboard state sampled once per frame; filled in by handle_mouse().
 typedef struct {
  // Cursor movement since the previous handle_mouse() call; zero unless a
  // mouse button was recorded as pressed by that previous call.
  int32_t mousedX;
  int32_t mousedY;
  // Last sampled cursor position, truncated to integers.
  int32_t mouseX;
  int32_t mouseY;
  // Bitmask of mouse buttons 0, 1 and 2 (bit n is set while button n is pressed).
  uint8_t mouseButton;
  // NOTE(review): mouseScroll and key are not written by the code visible here.
  int32_t mouseScroll;
  char key;
 } GuiInputState;
 GuiInputState gGuiInputState = {0, 0, 0, 0, 0, 0, 0};
 // How input currently drives the camera.
 enum class ControlMode {
  // No camera control; the cursor is shown normally.
  ControlModeNone,
  // First-person control, active while the right mouse button is held: the
  // cursor is disabled and mouse/keyboard input moves the camera.
  ControlModeFPS
 };
 ControlMode gControlMode = ControlMode::ControlModeNone;
 // io buffers for skeleton and animation data files, we know the max file size upfront
 static uint8_t skel_data_buffer[4 * 1024];
 static uint8_t anim_data_buffer[32 * 1024];
@ -93,11 +113,35 @@ static void load_skeleton(void);
 static void load_animation(void);
 static void eval_animation(void);
 static void draw_skeleton(void);
 static void draw_grid(void);
 static void draw_ui(void);
 // static void skeleton_data_loaded(const sfetch_response_t* response);
 // static void animation_data_loaded(const sfetch_response_t* response);
 static void frame(void);
 // Samples the cursor position and mouse buttons of window `w` into
 // `io_input_state`. Does nothing while the window is not focused.
 //
 // The movement delta is only reported when a button was already recorded as
 // pressed by the previous call; otherwise it is reset to zero.
 void handle_mouse(GLFWwindow* w, GuiInputState* io_input_state) {
  const bool focused = glfwGetWindowAttrib(w, GLFW_FOCUSED) != 0;
  if (!focused) {
    return;
  }

  double cursor_x = 0.0;
  double cursor_y = 0.0;
  glfwGetCursorPos(w, &cursor_x, &cursor_y);
  const int32_t pos_x = static_cast<int32_t>(cursor_x);
  const int32_t pos_y = static_cast<int32_t>(cursor_y);

  // mouseButton still holds the state recorded by the previous call here.
  const bool was_pressed = io_input_state->mouseButton != 0;
  io_input_state->mousedX = was_pressed ? pos_x - io_input_state->mouseX : 0;
  io_input_state->mousedY = was_pressed ? pos_y - io_input_state->mouseY : 0;

  io_input_state->mouseX = pos_x;
  io_input_state->mouseY = pos_y;

  // Pack the state of buttons 0..2 into the three lowest bits.
  const int button0 = glfwGetMouseButton(w, 0);
  const int button1 = glfwGetMouseButton(w, 1);
  const int button2 = glfwGetMouseButton(w, 2);
  io_input_state->mouseButton = button0 + (button1 << 1) + (button2 << 2);
 }
 int main() {
  // window and GL context via GLFW and flextGL
  glfwInit();
@ -106,7 +150,7 @@ int main() {
  glfwWindowHint(GLFW_OPENGL_FORWARD_COMPAT, GLFW_TRUE);
  glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
  glfwWindowHint(GLFW_COCOA_RETINA_FRAMEBUFFER, GLFW_FALSE);
-  GLFWwindow* w = glfwCreateWindow(Width, Height, "Sokol+ImGui+GLFW", 0, 0);
+  GLFWwindow* w = glfwCreateWindow(Width, Height, "AnimTestbed", 0, 0);
  glfwMakeContextCurrent(w);
  glfwSwapInterval(1);
@ -154,16 +198,7 @@ int main() {
  state.ozz = std::make_unique<ozz_t>();
  state.time.factor = 1.0f;
-  // initialize camera helper
+  Camera_Init(&state.camera);
  camera_desc_t camdesc = {};
  camdesc.min_dist = 1.0f;
  camdesc.max_dist = 100.0f;
 	camdesc.farz = 1000.0f;
  camdesc.center.Y = 1.0f;
  camdesc.distance = 30.0f;
  camdesc.latitude = 10.0f;
  camdesc.longitude = 20.0f;
  cam_init(&state.camera, &camdesc);
  // setup Dear Imgui
  ImGui::CreateContext();
@ -265,7 +300,7 @@ int main() {
  // initial clear color
  pass_action.colors[0].action = SG_ACTION_CLEAR;
-  pass_action.colors[0].value = {0.0f, 0.5f, 0.7f, 1.0f};
+  pass_action.colors[0].value = {0.1f, 0.1f, 0.1f, 1.0f};
  load_skeleton();
  load_animation();
@ -278,14 +313,81 @@ int main() {
    int cur_width, cur_height;
    glfwGetFramebufferSize(w, &cur_width, &cur_height);
    cam_update(&state.camera, cur_width, cur_height);
    // this is standard ImGui demo code
    ImGuiIO& io = ImGui::GetIO();
    io.DisplaySize = ImVec2(float(cur_width), float(cur_height));
    io.DeltaTime = (float)stm_sec(stm_laptime(&last_time));
    ImGui::NewFrame();
    ImGui::Begin("Camera");
    ImGui::SliderFloat3("pos", state.camera.pos, -100.f, 100.f);
    ImGui::SliderFloat("near", &state.camera.near, 0.001f, 10.f);
    ImGui::SliderFloat("far", &state.camera.far, 1.0f, 10000.f);
    ImGui::SliderFloat("heading", &state.camera.heading, -180.0f, 180.f);
    ImGui::SliderFloat("pitch", &state.camera.pitch, -179.0f, 179.f);
    ImGui::End();
    // handle input
    handle_mouse (w, &gGuiInputState);
    if (glfwGetMouseButton(w, GLFW_MOUSE_BUTTON_RIGHT)) {
      if (gControlMode == ControlMode::ControlModeNone) {
        gControlMode = ControlMode::ControlModeFPS;
        Camera_CalcFromMatrix(&state.camera, &state.camera.mtxView[0]);
        glfwSetInputMode(w, GLFW_CURSOR, GLFW_CURSOR_DISABLED);
      }
    } else {
      gControlMode = ControlMode::ControlModeNone;
      glfwSetInputMode(w, GLFW_CURSOR, GLFW_CURSOR_NORMAL);
      Camera_Update(
          &state.camera,
          cur_width,
          cur_height,
          state.time.frame,
          0,
          0,
          nullptr);
    }
    if (gControlMode == ControlMode::ControlModeFPS) {
      float camera_accel[3] = {0.f, 0.f, 0.f};
      float accel_scale = 100.0;
      if (glfwGetKey(w, GLFW_KEY_LEFT_SHIFT)) {
        accel_scale *= 3.;
      } else if (glfwGetKey(w, GLFW_KEY_LEFT_CONTROL)) {
        accel_scale /= 3.;
      }
      if (glfwGetKey(w, GLFW_KEY_W)) {
        camera_accel[0] -= accel_scale;
      }
      if (glfwGetKey(w, GLFW_KEY_S)) {
        camera_accel[0] += accel_scale;
      }
      if (glfwGetKey(w, GLFW_KEY_C)) {
        camera_accel[1] -= accel_scale;
      }
      if (glfwGetKey(w, GLFW_KEY_SPACE)) {
        camera_accel[1] += accel_scale;
      }
      if (glfwGetKey(w, GLFW_KEY_A)) {
        camera_accel[2] -= accel_scale;
      }
      if (glfwGetKey(w, GLFW_KEY_D)) {
        camera_accel[2] += accel_scale;
      }
      Camera_Update(
          &state.camera,
          cur_width,
          cur_height,
          state.time.frame,
          gGuiInputState.mousedX,
          gGuiInputState.mousedY,
          camera_accel);
    }
    if (ImGui::BeginMainMenuBar()) {
      ImGui::Text("AnimTestbed");
      ImGui::Checkbox("ImGui Demo", &show_imgui_demo_window);
@ -301,14 +403,6 @@ int main() {
      ImGui::ShowDemoWindow();
    }
    ImGui::Begin("Camera");
    ImGui::SliderFloat("min_dist", &state.camera.min_dist, 1.0f, 100.f);
    ImGui::SliderFloat("max_dist", &state.camera.max_dist, 1.0f, 100.f);
    ImGui::SliderFloat("center.Y", &state.camera.center.Y, 1.0f, 100.f);
    ImGui::SliderFloat("distance", &state.camera.distance, 1.0f, 1000.f);
    ImGui::SliderFloat("latitude", &state.camera.latitude, 1.0f, 100.f);
    ImGui::SliderFloat("longitude", &state.camera.longitude, -179.0f, 179.f);
    ImGui::End();
    // the sokol_gfx draw pass
    sg_begin_default_pass(&pass_action, cur_width, cur_height);
@ -433,6 +527,8 @@ static void draw_ui() {
 }
 static void frame() {
  draw_grid();
  if (state.loaded.animation && state.loaded.skeleton) {
    if (!state.time.paused) {
      state.time.absolute += state.time.frame * state.time.factor;
@ -516,13 +612,49 @@ static void draw_joint(int joint_index, int parent_joint_index) {
  draw_line(p5, p2);
 }
 static void draw_grid(void) {
  sgl_defaults();
  sgl_matrix_mode_projection();
  sgl_load_matrix((const float*)&state.camera.mtxProj);
  sgl_matrix_mode_modelview();
  sgl_load_matrix((const float*)&state.camera.mtxView);
  const int grid_size = 10;
  sgl_begin_lines();
  sgl_c3f(0.4f, 0.4f, 0.4f);
  for (int i = -grid_size; i <= grid_size; i++) {
    if (i == 0) {
      continue;
    }
    ozz::math::SimdFloat4 p0 = ozz::math::simd_float4::Load(i * 1.0f, 0.f, -grid_size * 1.0f, 1.f);
    ozz::math::SimdFloat4 p1 = ozz::math::simd_float4::Load(i * 1.0f, 0.f, grid_size * 1.0f, 1.f);
    draw_line(p0, p1);
    p0 = ozz::math::simd_float4::Load(-grid_size * 1.0f, 0.f, i * 1.0f, 1.f);
    p1 = ozz::math::simd_float4::Load(grid_size * 1.0f, 0.f, i * 1.0f, 1.f);
    draw_line(p0, p1);
  }
  sgl_c3f (0.7f, 0.4f, 0.2f);
  ozz::math::SimdFloat4 p0 = ozz::math::simd_float4::Load(0, 0.f, -grid_size * 1.0f, 1.f);
  ozz::math::SimdFloat4 p1 = ozz::math::simd_float4::Load(0, 0.f, grid_size * 1.0f, 1.f);
  draw_line(p0, p1);
  sgl_c3f (0.2f, 0.4f, 0.7f);
  p0 = ozz::math::simd_float4::Load(-grid_size * 1.0f, 0.f, 0.f, 1.f);
  p1 = ozz::math::simd_float4::Load(grid_size * 1.0f, 0.f, 0.f, 1.f);
  draw_line(p0, p1);
  sgl_end();
 }
 static void draw_skeleton(void) {
  sgl_defaults();
  sgl_matrix_mode_projection();
-  sgl_load_matrix((const float*)&state.camera.proj);
+  sgl_load_matrix((const float*)&state.camera.mtxProj);
  sgl_matrix_mode_modelview();
-	hmm_mat4 scale_mat = HMM_Scale (HMM_Vec3(0.1f, 0.1f, 0.1f));
+  hmm_mat4 scale_mat = HMM_Scale(HMM_Vec3(0.01f, 0.01f, 0.01f));
-  sgl_load_matrix((const float*)&state.camera.view);
+  sgl_load_matrix((const float*)&state.camera.mtxView);
  sgl_mult_matrix((const float*)&scale_mat);
  const int num_joints = state.ozz->skeleton.num_joints();