Initial works on render commands

2020-10-17 23:20:38 +02:00 · 2020-10-17 23:20:38 +02:00 · e8553fff6c
parent 99508054bf
commit e8553fff6c
51 changed files with 7164 additions and 12 deletions
--- a/3rdparty/vectorial/.gitignore
+++ b/3rdparty/vectorial/.gitignore
@ -0,0 +1,3 @@
+*.o
+*.orig
+specsuite-*
--- a/3rdparty/vectorial/.travis.yml
+++ b/3rdparty/vectorial/.travis.yml
@ -0,0 +1,6 @@
+language: cpp
+compiler:
+  - gcc
+  - clang
+
+script: make
--- a/3rdparty/vectorial/LICENSE
+++ b/3rdparty/vectorial/LICENSE
@ -0,0 +1,22 @@
+Copyright 2010 Mikko Lehtonen. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are
+permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this list of
+      conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice, this list
+      of conditions and the following disclaimer in the documentation and/or other materials
+      provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
--- a/3rdparty/vectorial/Makefile
+++ b/3rdparty/vectorial/Makefile
@ -0,0 +1,294 @@
+
+CXX?=g++
+CLANG_CC=clang
+CLANG_CXX=clang++
+
+IPHONE_PLATFORM_PATH = /Developer/Platforms/iPhoneOS.platform/Developer
+IPHONE_ISYSROOT_PATH = $(IPHONE_PLATFORM_PATH)/SDKs/iPhoneOS4.2.sdk/
+IPHONE_CC = $(IPHONE_PLATFORM_PATH)/usr/bin/g++ -isysroot $(IPHONE_ISYSROOT_PATH)   -arch armv7
+# -mfloat-abi=softfp -mfpu=neon  
+
+#CXXFLAGS += -Iinclude -O0
+#CXXFLAGS += -g -Iinclude -Wall -Wextra -pedantic -Wno-unused -O3 -fstrict-aliasing -Wstrict-aliasing=2 -ffast-math 
+CXXFLAGS += -Iinclude -Wall -Wextra -pedantic -Wno-unused -O3 -fstrict-aliasing -Wstrict-aliasing=2 -ffast-math  -D__extern_always_inline=inline
+
+SPEC_SRC = $(wildcard spec/*.cpp)
+SPEC_OBJ = $(SPEC_SRC:.cpp=.o)
+
+BENCH_SRC = $(wildcard bench/*.cpp)
+BENCH_OBJ = $(BENCH_SRC:.cpp=.o)
+BENCH_ASM = $(patsubst %.cpp,asm$(SUFFIX)/%.S,$(BENCH_SRC))
+
+SUFFIX=
+
+DEFAULT_CC=1
+
+ifeq ($(FORCE_SCALAR),1)
+	CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_SCALAR
+	SUFFIX=-scalar
+endif
+
+ifeq ($(FORCE_SSE),1)
+	CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_SSE -msse -msse2 -mfpmath=sse
+	SUFFIX=-sse
+endif
+
+ifeq ($(FORCE_GNU),1)
+	CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_GNU 
+	#-msse -msse2 -mfpmath=sse
+	SUFFIX=-gnu
+endif
+
+ifeq ($(FORCE_NEON),1)
+	CXXFLAGS+= -DVECTORIAL_FORCED -DVECTORIAL_NEON
+	SUFFIX=-neon
+	ARM=1
+endif
+
+
+ifeq ($(ARM),1)
+ifeq ($(shell uname -s),Darwin)
+	CC=$(IPHONE_CC)
+	CXX=$(IPHONE_CC)
+endif
+#	CXXFLAGS+= -mcpu=cortex-a8 
+	CXXFLAGS+= -mno-thumb -mfloat-abi=softfp -mfpu=neon
+	DEFAULT_CC=0
+endif
+
+ifeq ($(CLANG),1)
+	CC=$(CLANG_CC)
+	CXX=$(CLANG_CXX)
+	DEFAULT_CC=0
+endif
+
+ifeq ($(DEFAULT_CC),1)
+#	CXXFLAGS += -msse -msse2 -mfpmath=sse
+endif
+
+ifeq ($(ASM),1)
+	CC+= -S
+	CXX+= -S
+endif
+
+BUILDDIR=build$(SUFFIX)
+SPEC_OBJ := $(addprefix $(BUILDDIR)/,$(SPEC_OBJ))
+BENCH_OBJ := $(addprefix $(BUILDDIR)/,$(BENCH_OBJ))
+SILENT=@
+MKDIR=mkdir -p
+PATH_SEPARATOR=/
+
+$(BUILDDIR)/%.o: %.cpp
+	@echo CXX $<
+	$(SILENT) $(MKDIR) $(subst /,$(PATH_SEPARATOR),$(dir $@))
+	$(SILENT) $(COMPILE.cc) -o $@ $<
+
+
+
+.PHONY: all
+all: specsuite$(SUFFIX)
+	./specsuite$(SUFFIX)
+
+
+.PHONY: full
+full:
+	@clear
+	@echo FULL COMPILE at `date +%H:%M:%S`
+#	FORCE_SCALAR=1 $(MAKE) clean 
+	@FORCE_SCALAR=1 $(MAKE)  specsuite-scalar
+#	FORCE_GNU=1 $(MAKE) clean 
+	@FORCE_GNU=1 $(MAKE)  specsuite-gnu
+#	FORCE_SSE=1 $(MAKE) clean 
+	@FORCE_SSE=1 $(MAKE)  specsuite-sse
+#	FORCE_NEON=1 $(MAKE) clean 
+#	FORCE_NEON=1 $(MAKE) specsuite-neon
+	@./specsuite-scalar
+	@./specsuite-sse
+	@./specsuite-gnu
+
+specsuite$(SUFFIX): $(SPEC_OBJ)
+	@echo LINK $@
+	@$(CXX) $(LDFLAGS) $^ -o $@
+
+.PHONY: depend
+depend:
+	@echo DEP
+	@makedepend -Y -- $(CXXFLAGS) -- $(SPEC_SRC) $(BENCH_SRC) -p$(BUILDDIR)/ > /dev/null 2>&1 
+	@$(RM) Makefile.bak
+
+define asm-command
+@mkdir -p $(dir asm$(SUFFIX)/$(1))
+$(CXX) $(CXXFLAGS) -S $(1) -o asm$(SUFFIX)/$(1).S
+
+endef
+
+# Emit an assembly listing for every benchmark source through asm-command.
+# Declared phony, like the other non-file targets in this Makefile, so that a
+# stray file named "bench-asm" cannot suppress the rule.
+.PHONY: bench-asm
+bench-asm: $(BENCH_SRC)
+	$(foreach p,$(BENCH_SRC),$(call asm-command,$(p)))
+
+benchmark$(SUFFIX): $(BENCH_OBJ) bench-asm
+	$(CXX) $(BENCH_OBJ) -o $@
+
+.PHONY: bench-full
+bench-full:
+	FORCE_SCALAR=1 $(MAKE) benchmark-scalar
+	FORCE_GNU=1 $(MAKE) benchmark-gnu
+	FORCE_SSE=1 $(MAKE) benchmark-sse
+#	FORCE_NEON=1 $(MAKE) clean 
+#	FORCE_NEON=1 $(MAKE) benchmark-neon
+	./benchmark-scalar
+	./benchmark-sse
+	./benchmark-gnu
+
+.PHONY: clean
+clean:
+	rm -f $(SPEC_OBJ) $(BENCH_OBJ) benchmark$(SUFFIX) specsuite$(SUFFIX) 
+	rm -rf asm$(SUFFIX)
+
+.PHONY: realclean
+realclean: clean
+	rm -f specsuite*
+	rm -rf build*
+
+
+.PHONY: update_spec
+update_spec:
+	./tools/update_spec.rb spec/spec_*.cpp
+
+ifeq ($(MAKECMDGOALS),export)
+ifeq ($(origin to),undefined)
+$(error to not set, like  make export to=/foo/bar)
+endif
+endif
+
+.PHONY: export
+export:
+	$(SILENT) git archive --format tar master | tar x -C $(to)
+
+
+include/vectorial/vec2f.h include/vectorial/vec3f.h include/vectorial/vec4f.h: include/vectorial/simd4f.h
+include/vectorial/simd4f.h: include/vectorial/simd4f_scalar.h
+include/vectorial/simd4f.h: include/vectorial/simd4f_neon.h
+include/vectorial/simd4f.h: include/vectorial/simd4f_gnu.h
+include/vectorial/simd4f.h: include/vectorial/simd4f_sse.h
+include/vectorial/simd4f.h: include/vectorial/simd4f_scalar.h
+include/vectorial/simd4f.h: include/vectorial/config.h
+include/vectorial/simd4x4f.h: include/vectorial/simd4f.h
+include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_scalar.h
+include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_neon.h
+include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_gnu.h
+include/vectorial/simd4x4f.h: include/vectorial/simd4x4f_sse.h
+include/vectorial/simd4x4f.h: include/vectorial/config.h
+spec/spec_helper.h: include/vectorial/simd4x4f.h include/vectorial/simd4f.h include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h
+spec/spec.cpp: spec/spec.h
+spec/spec_main.cpp: spec/spec.h
+spec/spec_simd4f.cpp: spec/spec_helper.h
+spec/spec_simd4x4f.cpp: spec/spec_helper.h
+spec/spec_vec2f.cpp: spec/spec_helper.h
+spec/spec_vec3f.cpp: spec/spec_helper.h
+spec/spec_vec4f.cpp: spec/spec_helper.h
+
+$(BUILDDIR)/spec/spec_simd4f.o: \
+  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
+  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
+  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
+  include/vectorial/config.h
+
+$(BUILDDIR)/spec/spec_simd4x4f.o: \
+  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
+  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
+  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
+  include/vectorial/simd4x4f_scalar.h include/vectorial/simd4x4f_neon.h \
+  include/vectorial/simd4x4f_gnu.h include/vectorial/simd4x4f_sse.h include/vectorial/config.h
+  
+$(BUILDDIR)/spec/spec_vec2f.o $(BUILDDIR)/spec/spec_vec3f.o $(BUILDDIR)/spec/spec_vec4f.o: \
+  include/vectorial/simd4x4f.h include/vectorial/simd4f.h \
+  include/vectorial/vec4f.h include/vectorial/vec3f.h include/vectorial/vec2f.h \
+  include/vectorial/simd4f_scalar.h include/vectorial/simd4f_neon.h \
+  include/vectorial/simd4f_gnu.h include/vectorial/simd4f_sse.h \
+  include/vectorial/simd4x4f_scalar.h include/vectorial/simd4x4f_neon.h \
+  include/vectorial/simd4x4f_gnu.h include/vectorial/simd4x4f_sse.h include/vectorial/config.h
+
+
+
+
+
+# DO NOT DELETE
+
+$(BUILDDIR)/spec/spec.o: spec/spec.h
+$(BUILDDIR)/spec/spec_main.o: spec/spec.h
+$(BUILDDIR)/spec/spec_mat4f.o: spec/spec_helper.h spec/spec.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/config.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f_gnu.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f_common.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/vec2f.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4x4f.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/simd4x4f_gnu.h
+$(BUILDDIR)/spec/spec_mat4f.o: include/vectorial/mat4f.h
+$(BUILDDIR)/spec/spec_simd4f.o: spec/spec_helper.h spec/spec.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/config.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f_gnu.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f_common.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/vec2f.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4x4f.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/simd4x4f_gnu.h
+$(BUILDDIR)/spec/spec_simd4f.o: include/vectorial/mat4f.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: spec/spec_helper.h spec/spec.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/config.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f_gnu.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f_common.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec4f.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec3f.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/vec2f.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4x4f.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/simd4x4f_gnu.h
+$(BUILDDIR)/spec/spec_simd4x4f.o: include/vectorial/mat4f.h
+$(BUILDDIR)/spec/spec_vec2f.o: spec/spec_helper.h spec/spec.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/config.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f_gnu.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f_common.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/vec2f.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4x4f.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/simd4x4f_gnu.h
+$(BUILDDIR)/spec/spec_vec2f.o: include/vectorial/mat4f.h
+$(BUILDDIR)/spec/spec_vec3f.o: spec/spec_helper.h spec/spec.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/config.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f_gnu.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f_common.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/vec2f.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4x4f.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/simd4x4f_gnu.h
+$(BUILDDIR)/spec/spec_vec3f.o: include/vectorial/mat4f.h
+$(BUILDDIR)/spec/spec_vec4f.o: spec/spec_helper.h spec/spec.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/config.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f_gnu.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f_common.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/vec4f.h include/vectorial/vec3f.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/vec2f.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4x4f.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4f.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/simd4x4f_gnu.h
+$(BUILDDIR)/spec/spec_vec4f.o: include/vectorial/mat4f.h
+$(BUILDDIR)/bench/add_bench.o: bench/bench.h include/vectorial/vec4f.h
+$(BUILDDIR)/bench/bench.o: bench/bench.h include/vectorial/config.h
+$(BUILDDIR)/bench/dot_bench.o: bench/bench.h include/vectorial/vec4f.h
+$(BUILDDIR)/bench/matrix_bench.o: bench/bench.h include/vectorial/simd4x4f.h
+$(BUILDDIR)/bench/matrix_bench.o: include/vectorial/simd4f.h
+$(BUILDDIR)/bench/matrix_bench.o: include/vectorial/simd4x4f_gnu.h
+$(BUILDDIR)/bench/quad_bench.o: bench/bench.h include/vectorial/simd4x4f.h
+$(BUILDDIR)/bench/quad_bench.o: include/vectorial/simd4f.h
+$(BUILDDIR)/bench/quad_bench.o: include/vectorial/simd4x4f_gnu.h
--- a/3rdparty/vectorial/README
+++ b/3rdparty/vectorial/README
@ -0,0 +1,60 @@
+
+    Vectorial - vector math library
+
+
+
+  Motivation
+
+    I couldn't find an open source math library that was usable and
+    supported simd - especially the ARM NEON variant.
+
+
+  Features
+
+    Supports NEON, SSE, scalar and generic gcc vector extension.
+    Most basic vector and matrix math is available, but not quite
+    yet full featured.
+
+
+  Design
+
+    Vectorial consists of two main parts, pure-C wrapper around
+    platform-specific vector instructions in the simd*.h files
+    and C++ classes for common uses, the vec*.h and mat*.h
+
+    The config.h autodetects appropriate vector instructions to use.
+
+    The platform-specific support is done with intrinsics only,
+    allowing the compiler to have a full view of the code, hopefully
+    resulting in better optimizations especially with reordering etc.
+
+
+  Installation / Usage
+
+    Add vectorial/include to your include path
+
+    #include "vectorial/simd4f.h"  
+    for C-only simd wrapper, using it looks like this:
+      simd4f v = simd4f_normalize( simd4f_add( simd4f_create(1,2,3,4), y) );
+      float z = simd4f_get_z(v);
+
+    #include "vectorial/vectorial.h"
+    for C++ classes. They reside in vectorial namespace, you might
+    want to alias them to your own namespace
+      namespace myproject {
+        using namespace ::vectorial;
+        // if you like different name: typedef vec3f Vector3;
+      }
+      using myproject::vec4f;
+      
+      vec4f v = normalize( vec4f(1,2,3,4) + y );
+      float z = v.z();
+
+
+  License
+
+    2-clause BSD. See LICENSE
+
+
+
+
--- a/3rdparty/vectorial/bench/add_bench.cpp
+++ b/3rdparty/vectorial/bench/add_bench.cpp
@ -0,0 +1,60 @@
+
+#include "bench.h"
+#include <stdlib.h>
+
+#include <iostream>
+#include "vectorial/vec4f.h"
+
+#define NUM (81920)
+#define ITER 100
+using namespace vectorial;
+
+namespace {
+    // Obtains storage for n vec4f values on a 16-byte boundary through the
+    // memalign() helper declared in bench.h.
+    vec4f* alloc_vec4f(size_t n) {
+        return static_cast<vec4f*>( memalign(n*sizeof(vec4f), 16) );
+    }
+}
+
+
+
+static vec4f * a;
+static vec4f * b;
+static vec4f * c;
+
+
+
+
+// Benchmark kernel: stores a[i] + b[i] into c[i] for each of the NUM elements.
+void add_func() {
+    // vectorial_restrict-qualified local aliases of the file-level arrays.
+    vec4f* vectorial_restrict lhs = a;
+    vec4f* vectorial_restrict rhs = b;
+    vec4f* vectorial_restrict sum = c;
+
+    size_t idx = 0;
+    while( idx < NUM )
+    {
+        sum[idx] = lhs[idx] + rhs[idx];
+        ++idx;
+    }
+}
+
+// Allocates the three NUM-element arrays, fills a and b, times add_func()
+// through profile(), then releases the arrays again.
+void add_bench() {
+    a = alloc_vec4f(NUM);
+    b = alloc_vec4f(NUM);
+    c = alloc_vec4f(NUM);
+
+    // a[k] is built from k and b[k] from NUM-k in all four constructor arguments.
+    for(size_t k = 0; k < NUM; ++k) {
+        a[k] = vec4f(k,k,k,k);
+        b[k] = vec4f(NUM-k, NUM-k, NUM-k, NUM-k);
+    }
+
+    profile("add", add_func, ITER, NUM);
+
+    memfree(a);
+    memfree(b);
+    memfree(c);
+}
--- a/3rdparty/vectorial/bench/bench.cpp
+++ b/3rdparty/vectorial/bench/bench.cpp
@ -0,0 +1,117 @@
+#include "bench.h"
+#include <sstream>
+#include <iostream>
+#include "vectorial/config.h"
+
+
+// Timer backend for the benchmarks. Exactly one of BENCH_MACH, BENCH_GTOD or
+// BENCH_QPC is defined by bench.h, and each #ifdef section below supplies the
+// matching implementation of init(), now() and diffTime().
+namespace profiler {
+
+    #ifdef BENCH_MACH
+    // Timebase filled in by init() and read by diffTime().
+    mach_timebase_info_data_t info;
+    void init() {
+        mach_timebase_info(&info);
+    }
+    #endif
+    
+    #ifdef BENCH_GTOD
+    // Nothing to prepare for the gettimeofday() backend.
+    void init() {
+    }
+    #endif
+
+    #ifdef BENCH_QPC
+    // Counter frequency captured by init() and used as the divisor in diffTime().
+    double frequency;
+    void init() {
+        LARGE_INTEGER freq;
+        QueryPerformanceFrequency(&freq);
+        frequency = (double)freq.QuadPart;
+    }
+    #endif
+
+
+    // Returns the current timestamp in the backend's native time_t
+    // representation (see the typedefs in bench.h).
+    time_t now() {
+
+        #ifdef BENCH_MACH
+        return mach_absolute_time();
+        #endif
+
+        #ifdef BENCH_GTOD
+        time_t v;
+        gettimeofday(&v, NULL);
+        return v;
+        #endif
+        
+        #ifdef BENCH_QPC
+        LARGE_INTEGER v;
+        QueryPerformanceCounter(&v);
+        return v;
+        #endif
+
+    }
+    
+    
+    // Returns end - start as a double. The GTOD branch yields seconds
+    // (tv_sec difference plus tv_usec difference / 1e6).
+    // NOTE(review): the MACH and QPC branches presumably yield seconds as well
+    // (scaled ticks / 1e9 and ticks / frequency) - confirm against the platform docs.
+    double diffTime(time_t start, time_t end) {
+        
+        #ifdef BENCH_GTOD
+        return (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1000000.0;
+        #endif
+        
+        #ifdef BENCH_MACH        
+        return ((end-start) * info.numer / info.denom) / 1000000000.0;
+        #endif
+
+        #ifdef BENCH_QPC
+        return (end.QuadPart - start.QuadPart) / frequency;
+        #endif
+    }
+    
+}
+
+
+// Formats the duration d (in seconds) with a unit suffix of "s", "ms", "us" or "ns".
+// The unit is selected from `relative`; a negative `relative` means "use d itself".
+std::string formatTime(double d, double relative ) {
+    const double magnitude = (relative < 0.0) ? d : relative;
+
+    // Start from the smallest unit and promote while the magnitude allows it.
+    double unitSize = 0.000000001;
+    const char* suffix = "ns";
+    if( magnitude >= 1.0 ) {
+        unitSize = 1.0;
+        suffix = "s";
+    } else if( magnitude >= 0.001 ) {
+        unitSize = 0.001;
+        suffix = "ms";
+    } else if( magnitude >= 0.000001 ) {
+        unitSize = 0.000001;
+        suffix = "us";
+    }
+
+    std::stringstream out;
+    out << d/unitSize << suffix;
+    return out.str();
+}
+
+// Runs func() `iterations` times between two profiler::now() samples and prints
+// the SIMD backend name, the benchmark name, the total duration, the duration
+// per iteration and the duration per element to std::cout.
+void profile(const char* name, void (*func)(), int iterations, int elements) {
+    profiler::init();
+
+    profiler::time_t start = profiler::now();
+    for(int pass = 0; pass < iterations; ++pass)
+        func();
+    profiler::time_t end = profiler::now();
+
+    // Total elapsed time, reused for all three report lines.
+    const double elapsed = profiler::diffTime(start,end);
+    const double perIteration = elapsed / iterations;
+
+    std::cout << "Using simd: " << VECTORIAL_SIMD_TYPE << std::endl;
+    std::cout << "Testing: " << name << std::endl;
+    std::cout << "Duration " << formatTime(elapsed) << std::endl;
+    std::cout << "Per iter " << formatTime(perIteration) << std::endl;
+    std::cout << "Per item " << formatTime(perIteration / elements) << std::endl;
+}
+
+void add_bench();
+void dot_bench();
+void quad_bench();
+void matrix_bench();
+
+// Benchmark entry point. Only matrix_bench() is enabled; the other suites
+// are commented out.
+int main() {
+    
+//    add_bench();
+//    dot_bench();
+//    quad_bench();
+    matrix_bench();
+
+    return 0;
+}
+
--- a/3rdparty/vectorial/bench/bench.h
+++ b/3rdparty/vectorial/bench/bench.h
@ -0,0 +1,65 @@
+#ifndef BENCH_H
+#define BENCH_H
+
+#include <string>
+#include <stdlib.h>
+
+#ifdef __APPLE__
+    #define BENCH_MACH
+    #include <mach/mach_time.h>
+    #include <stdint.h>
+#elif defined(_WIN32)
+    #define BENCH_QPC
+    #define WIN32_LEAN_AND_MEAN
+    #include <windows.h>
+    #include <malloc.h>
+#else
+    #define BENCH_GTOD
+    #include <sys/time.h>
+#endif
+
+
+// Allocates `count` bytes aligned to `align` bytes: _aligned_malloc() on
+// Windows, posix_memalign() elsewhere. Release the result with memfree().
+// Returns NULL when posix_memalign() reports an error, instead of handing
+// back a pointer whose value was never set.
+static void* memalign(size_t count, size_t align) {
+    #ifdef _WIN32
+    return _aligned_malloc(count,align);
+    #else
+    void *ptr = NULL;
+    // A non-zero result (EINVAL / ENOMEM) means ptr is not a usable
+    // allocation, so report the failure to the caller explicitly.
+    if( posix_memalign(&ptr, align, count) != 0 ) {
+        return NULL;
+    }
+    return ptr;
+    #endif
+}
+
+// Releases ptr through _aligned_free() on Windows and free() elsewhere.
+// NOTE(review): presumably intended for pointers obtained from memalign() - confirm callers.
+static void memfree(void* ptr) {
+    #ifdef _WIN32
+    _aligned_free(ptr);
+    #else
+    free(ptr);
+    #endif
+}
+
+// Timer interface used by the benchmarks. time_t is the timestamp type of
+// whichever backend macro (BENCH_GTOD / BENCH_MACH / BENCH_QPC) was selected
+// by the platform checks earlier in this header.
+namespace profiler {
+
+    #ifdef BENCH_GTOD
+        typedef struct timeval time_t;
+    #endif
+    #ifdef BENCH_MACH
+        typedef const uint64_t time_t;
+    #endif
+    #ifdef BENCH_QPC
+        typedef LARGE_INTEGER time_t;
+    #endif
+
+    // Backend set-up; declared here, defined outside this header.
+    void init();
+    // Current timestamp in the backend's representation.
+    time_t now();
+
+    // Difference between two timestamps as a double.
+    double diffTime(time_t start, time_t end);
+
+}
+
+std::string formatTime(double d, double relative=-1);
+void profile(const char* name, void (*func)(), int iterations, int elements);
+
+
+#endif
--- a/3rdparty/vectorial/bench/dot_bench.cpp
+++ b/3rdparty/vectorial/bench/dot_bench.cpp
@ -0,0 +1,60 @@
+
+#include "bench.h"
+#include <stdlib.h>
+
+#include <iostream>
+#include "vectorial/vec4f.h"
+
+#define NUM (81920)
+#define ITER 100
+using namespace vectorial;
+
+namespace {
+    // Obtains storage for n vec4f values on a 16-byte boundary through the
+    // memalign() helper declared in bench.h.
+    vec4f* alloc_vec4f(size_t n) {
+        return static_cast<vec4f*>( memalign(n*sizeof(vec4f), 16) );
+    }
+}
+
+
+
+static vec4f * a;
+static vec4f * b;
+static float * c;
+
+
+
+
+// Benchmark kernel: stores dot(a[i], b[i]) into c[i] for each of the NUM elements.
+void dot_func() {
+    // vectorial_restrict-qualified local aliases of the file-level arrays.
+    vec4f* vectorial_restrict lhs = a;
+    vec4f* vectorial_restrict rhs = b;
+    float* vectorial_restrict result = c;
+
+    size_t idx = 0;
+    while( idx < NUM )
+    {
+        result[idx] = dot(lhs[idx], rhs[idx]);
+        ++idx;
+    }
+}
+
+// Benchmarks dot() over NUM vec4f pairs: allocates the two inputs and the float
+// result array, fills the inputs, times dot_func() through profile() and then
+// releases all three arrays.
+void dot_bench() {
+
+    a = alloc_vec4f(NUM);
+    b = alloc_vec4f(NUM);
+    // Allocate through memalign() so that memfree() below is the matching
+    // deallocator on every platform: memfree() is _aligned_free() on Windows,
+    // which must not be given a pointer that came from plain malloc().
+    c = static_cast<float*>(memalign(NUM * sizeof(float), 16));
+
+
+    for(size_t i = 0; i < NUM; ++i)
+    {
+        a[i]=vec4f(i,i,i,i);
+        b[i]=vec4f(NUM-i, NUM-i, NUM-i, NUM-i);
+    }
+        
+    profile("dot", dot_func, ITER, NUM);
+
+    memfree(a);
+    memfree(b);
+    memfree(c);
+
+
+}
--- a/3rdparty/vectorial/bench/matrix_bench.cpp
+++ b/3rdparty/vectorial/bench/matrix_bench.cpp
@ -0,0 +1,62 @@
+
+#include "bench.h"
+#include <stdlib.h>
+
+#include <iostream>
+#include "vectorial/simd4x4f.h"
+
+#define NUM (819200)
+#define ITER 100
+//using namespace vectorial;
+
+namespace {
+    // Obtains storage for n simd4x4f values on a 16-byte boundary through the
+    // memalign() helper declared in bench.h.
+    simd4x4f* alloc_vec4x4f(size_t n) {
+        return static_cast<simd4x4f*>( memalign(n*sizeof(simd4x4f), 16) );
+    }
+}
+
+
+
+static simd4x4f * a;
+static simd4x4f * b;
+static simd4x4f * c;
+
+
+
+
+// Benchmark kernel: for each of the NUM elements calls
+// simd4x4f_matrix_mul(&a[i], &b[i], &b[i]), so the output overwrites b[i].
+void matrix_func() {
+    // vectorial_restrict-qualified aliases of the two arrays this kernel
+    // touches; c is not used here, so no alias is taken for it.
+    simd4x4f* vectorial_restrict lhs = a;
+    simd4x4f* vectorial_restrict rhs = b;
+
+    size_t idx = 0;
+    while( idx < NUM )
+    {
+        simd4x4f_matrix_mul(&lhs[idx], &rhs[idx], &rhs[idx]);
+        ++idx;
+    }
+}
+
+// Allocates the three NUM-element matrix arrays, fills a and b, times
+// matrix_func() through profile(), then releases the arrays again.
+void matrix_bench() {
+    a = alloc_vec4x4f(NUM);
+    b = alloc_vec4x4f(NUM);
+    c = alloc_vec4x4f(NUM);
+
+    // a[k] is built from four copies of simd4f_create(k,k,k,k) and b[k] from
+    // four copies of simd4f_create(NUM-k,...).
+    for(size_t k = 0; k < NUM; ++k) {
+        simd4f rising = simd4f_create(k,k,k,k);
+        simd4f falling = simd4f_create(NUM-k,NUM-k,NUM-k,NUM-k);
+        a[k] = simd4x4f_create(rising,rising,rising,rising);
+        b[k] = simd4x4f_create(falling,falling,falling,falling);
+    }
+
+    profile("matrix mul", matrix_func, ITER, NUM);
+
+    memfree(a);
+    memfree(b);
+    memfree(c);
+}
--- a/3rdparty/vectorial/bench/quad_bench.cpp
+++ b/3rdparty/vectorial/bench/quad_bench.cpp
@ -0,0 +1,123 @@
+
+#include "bench.h"
+#include <stdlib.h>
+
+#include <iostream>
+#include "vectorial/simd4x4f.h"
+
+#define NUM (81920)
+#define ITER 100
+//using namespace vectorial;
+
+namespace {
+    // Obtains storage for n simd4x4f values on a 16-byte boundary through the
+    // memalign() helper declared in bench.h.
+    simd4x4f* alloc_simd4x4f(size_t n) {
+        return static_cast<simd4x4f*>( memalign(n*sizeof(simd4x4f), 16) );
+    }
+}
+
+
+
+static simd4x4f * a;
+static simd4x4f * b;
+static simd4x4f * c;
+
+
+
+// Returns a simd4x4f whose x, y, z and w members are the simd4f_add of the
+// matching members of a and b (operands are taken through SIMD_PARAM).
+static simd4x4f add_4x4(SIMD_PARAM(simd4x4f, a), SIMD_PARAM(simd4x4f, b)) {
+    const simd4f sumX = simd4f_add(a.x, b.x);
+    const simd4f sumY = simd4f_add(a.y, b.y);
+    const simd4f sumZ = simd4f_add(a.z, b.z);
+    const simd4f sumW = simd4f_add(a.w, b.w);
+    return simd4x4f_create(sumX, sumY, sumZ, sumW);
+}
+
+// Pointer-argument, return-by-value variant: returns a simd4x4f whose members
+// are the simd4f_add of the matching members of *a and *b.
+static simd4x4f add_4x4_rp(simd4x4f *a, simd4x4f *b) {
+    const simd4f sumX = simd4f_add(a->x, b->x);
+    const simd4f sumY = simd4f_add(a->y, b->y);
+    const simd4f sumZ = simd4f_add(a->z, b->z);
+    const simd4f sumW = simd4f_add(a->w, b->w);
+    return simd4x4f_create(sumX, sumY, sumZ, sumW);
+}
+
+
+// Pointer-argument, output-parameter variant: writes the simd4f_add of the
+// matching members of *a and *b into *out, one member at a time.
+// quad_pointer_func() passes the same pointer for b and out, so each member
+// is read before it is overwritten.
+static void add_4x4_p(simd4x4f *a, simd4x4f *b, simd4x4f *out) {
+    out->x = simd4f_add(a->x, b->x);
+    out->y = simd4f_add(a->y, b->y);
+    out->z = simd4f_add(a->z, b->z);
+    out->w = simd4f_add(a->w, b->w);
+}
+
+
+
+
+// Benchmark kernel (return-by-value variant): b[i] = add_4x4(a[i], b[i]) for
+// each of the NUM elements. c is not used by this kernel.
+void quad_return_func() {
+    simd4x4f* lhs = a;
+    simd4x4f* acc = b;
+
+    size_t idx = 0;
+    while( idx < NUM )
+    {
+        acc[idx] = add_4x4(lhs[idx], acc[idx]);
+        ++idx;
+    }
+}
+
+
+// Benchmark kernel (output-pointer variant): add_4x4_p(&a[i], &b[i], &b[i])
+// for each of the NUM elements, so b[i] is both input and output.
+// c is not used by this kernel.
+void quad_pointer_func() {
+    simd4x4f* lhs = a;
+    simd4x4f* acc = b;
+
+    size_t idx = 0;
+    while( idx < NUM )
+    {
+        add_4x4_p(&lhs[idx], &acc[idx], &acc[idx]);
+        ++idx;
+    }
+}
+
+// Benchmark kernel (pointer-argument, return-by-value variant):
+// b[i] = add_4x4_rp(&a[i], &b[i]) for each of the NUM elements.
+// c is not used by this kernel.
+void quad_pointer_return_func() {
+    simd4x4f* lhs = a;
+    simd4x4f* acc = b;
+
+    size_t idx = 0;
+    while( idx < NUM )
+    {
+        acc[idx] = add_4x4_rp(&lhs[idx], &acc[idx]);
+        ++idx;
+    }
+}
+
+
+// Allocates the three NUM-element arrays, fills a and b, then times the three
+// add_4x4 calling-convention variants through profile() before releasing the
+// arrays again.
+void quad_bench() {
+    a = alloc_simd4x4f(NUM);
+    b = alloc_simd4x4f(NUM);
+    c = alloc_simd4x4f(NUM);
+
+    // a[k] is built from four copies of simd4f_create(k,k,k,k) and b[k] from
+    // four copies of simd4f_create(NUM-k,...).
+    for(size_t k = 0; k < NUM; ++k) {
+        simd4f rising = simd4f_create(k,k,k,k);
+        simd4f falling = simd4f_create(NUM-k,NUM-k,NUM-k,NUM-k);
+        a[k] = simd4x4f_create(rising,rising,rising,rising);
+        b[k] = simd4x4f_create(falling,falling,falling,falling);
+    }
+
+    profile("quad return-value", quad_return_func, ITER, NUM);
+    profile("quad pass-by-pointer", quad_pointer_func, ITER, NUM);
+    profile("quad pass-by-pointer return-value", quad_pointer_return_func, ITER, NUM);
+
+    memfree(a);
+    memfree(b);
+    memfree(c);
+}
--- a/3rdparty/vectorial/include/vectorial/config.h
+++ b/3rdparty/vectorial/include/vectorial/config.h
@ -0,0 +1,101 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_CONFIG_H
+#define VECTORIAL_CONFIG_H
+
+
+#ifndef VECTORIAL_FORCED
+    #if defined(__SSE__) || (_M_IX86_FP > 0) || (_M_X64 > 0)
+
+        #define VECTORIAL_SSE
+
+    // __ARM_NEON is used instead of __ARM_NEON__ on armv8.
+    #elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+        #define VECTORIAL_NEON
+
+    // Don't use gnu extension for arm, buggy with some gccs with armv6 and -Os,
+    // Also doesn't seem perform as well
+    #elif defined(__GNUC__) && !defined(__arm__)
+
+        #define VECTORIAL_GNU
+
+    #else
+
+        #define VECTORIAL_SCALAR
+
+    #endif
+#endif
+
+
+
+#ifdef VECTORIAL_SCALAR
+    #define VECTORIAL_SIMD_TYPE "scalar"
+#endif
+
+#ifdef VECTORIAL_SSE
+    #define VECTORIAL_SIMD_TYPE "sse"
+#endif
+
+#ifdef VECTORIAL_NEON
+    #define VECTORIAL_SIMD_TYPE "neon"
+    #define VECTORIAL_HAVE_SIMD2F
+#endif
+
+#ifdef VECTORIAL_GNU
+    #define VECTORIAL_SIMD_TYPE "gnu"
+#endif
+
+
+
+#if defined(VECTORIAL_FORCED) && !defined(VECTORIAL_SIMD_TYPE)
+    #error VECTORIAL_FORCED set but no simd-type found, try f.ex. VECTORIAL_SCALAR
+#endif
+
+
+#define vectorial_inline    static inline
+
+// Compiler-specific spellings of the no-alias qualifier (vectorial_restrict)
+// and of the 16-byte alignment attribute (simd4f_aligned16).
+#if defined(__GNUC__) 
+  // GCC-compatible compilers accept __restrict in both C and C++, so the macro
+  // is defined unconditionally; guarding it with __cplusplus would leave
+  // vectorial_restrict undefined in C translation units.
+  #define vectorial_restrict  __restrict
+  #define simd4f_aligned16  __attribute__ ((aligned (16)))
+#elif defined(_WIN32)
+  #define vectorial_restrict  
+  #define simd4f_aligned16   __declspec(align(16))
+#else
+  #define vectorial_restrict  restrict
+  #define simd4f_aligned16   
+#endif
+// #define vectorial_restrict
+
+#ifdef __GNUC__
+    #define vectorial_pure __attribute__((pure))
+#else
+    #define vectorial_pure
+#endif
+
+#ifdef _WIN32
+  #if defined(min) || defined(max)
+#pragma message ( "set NOMINMAX as preprocessor macro, undefining min/max " )
+#undef min
+#undef max
+  #endif
+#endif
+
+#ifdef __cplusplus
+    // Hack around msvc badness
+    #define SIMD_PARAM(t, p) const t& p
+#else
+    #define SIMD_PARAM(t, p) t p
+#endif
+                    
+#define VECTORIAL_PI      3.14159265f
+#define VECTORIAL_HALFPI  1.57079633f
+
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/mat4f.h
+++ b/3rdparty/vectorial/include/vectorial/mat4f.h
@ -0,0 +1,197 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_MAT4F_H
+#define VECTORIAL_MAT4F_H
+
+#ifndef VECTORIAL_SIMD4X4F_H
+  #include "vectorial/simd4x4f.h"
+#endif
+
+#ifndef VECTORIAL_VEC4F_H
+  #include "vectorial/vec4f.h"
+#endif
+
+
+namespace vectorial {
+    
+
+    // C++ wrapper around a simd4x4f. Every member forwards to the simd4x4f_* /
+    // simd4f_* functions; the wrapped matrix is publicly accessible as `value`.
+    class mat4f {
+    public:
+
+        // The wrapped matrix.
+        simd4x4f value;
+    
+        // Default constructor: does not explicitly initialise `value`.
+        inline mat4f() {}
+        inline mat4f(const mat4f& m) : value(m.value) {}
+        // Implicit conversion from the underlying simd4x4f.
+        inline mat4f(const simd4x4f& v) : value(v) {}
+        // Builds the matrix from four vec4f values through simd4x4f_create.
+        inline mat4f(const vec4f& v0, const vec4f& v1, const vec4f& v2, const vec4f& v3) : value(simd4x4f_create(v0.value, v1.value, v2.value, v3.value)) {}
+        // Loads the matrix from a float array through simd4x4f_uload.
+        explicit inline mat4f(const float *ary) { simd4x4f_uload(&value, ary); }
+
+        // Reads 16 floats from ary: x from ary[0..3], y from ary[4..7],
+        // z from ary[8..11] and w from ary[12..15].
+        inline void load(const float *ary) { 
+            value.x = simd4f_uload4(ary);
+            value.y = simd4f_uload4(ary+4); 
+            value.z = simd4f_uload4(ary+8); 
+            value.w = simd4f_uload4(ary+12); 
+        }
+
+        // Writes 16 floats to ary in the same x, y, z, w order used by load().
+        inline void store(float *ary) const { 
+            simd4f_ustore4(value.x, ary);
+            simd4f_ustore4(value.y, ary+4);
+            simd4f_ustore4(value.z, ary+8);
+            simd4f_ustore4(value.w, ary+12);
+        }
+
+        // Matrix produced by simd4x4f_identity.
+        static mat4f identity() { mat4f m; simd4x4f_identity(&m.value); return m; }
+
+        // Matrix produced by simd4x4f_perspective with the given arguments.
+        static mat4f perspective(float fovy, float aspect, float znear, float zfar) {
+            simd4x4f m;
+            simd4x4f_perspective(&m, fovy, aspect, znear, zfar);
+            return m;
+        }
+        
+        // Matrix produced by simd4x4f_ortho with the given arguments.
+        static mat4f ortho(float left, float right, float bottom, float top, float znear, float zfar) {
+            simd4x4f m;
+            simd4x4f_ortho(&m, left, right, bottom, top, znear, zfar);
+            return m;
+        }
+        
+        // Matrix produced by simd4x4f_lookat from eye, center and up.
+        static mat4f lookAt(const vec3f& eye, const vec3f& center, const vec3f& up) {
+            simd4x4f m;
+            simd4x4f_lookat(&m, eye.value, center.value, up.value);
+            return m;            
+        }
+
+        // Matrix produced by simd4x4f_translation from the components of pos.
+        static mat4f translation(const vec3f& pos) {
+            simd4x4f m;
+            simd4x4f_translation(&m, pos.x(), pos.y(), pos.z());
+            return m;            
+        }
+
+        // Matrix produced by simd4x4f_axis_rotation from angle and axis.
+        static mat4f axisRotation(float angle, const vec3f& axis) {
+            simd4x4f m;
+            simd4x4f_axis_rotation(&m, angle, axis.value);
+            return m;            
+        }
+
+        // Uniform scale: `scale` in the first three diagonal entries, 1 in the last.
+        static mat4f scale(float scale) {
+            return simd4x4f_create( simd4f_create(scale,0,0,0),
+                                    simd4f_create(0,scale,0,0),
+                                    simd4f_create(0,0,scale,0),
+                                    simd4f_create(0,0,0,1) );
+        }
+
+        // Per-axis scale: scale.x(), scale.y(), scale.z() in the first three
+        // diagonal entries, 1 in the last.
+        static mat4f scale(const vec3f& scale) {
+            return simd4x4f_create( simd4f_create(scale.x(),0,0,0),
+                                   simd4f_create(0,scale.y(),0,0),
+                                   simd4f_create(0,0,scale.z(),0),
+                                   simd4f_create(0,0,0,1) );
+        }
+
+    };
+    
+    
+    // Returns the matrix computed by simd4x4f_matrix_mul(lhs, rhs).
+    vectorial_inline mat4f operator*(const mat4f& lhs, const mat4f& rhs) {
+        mat4f product;
+        simd4x4f_matrix_mul(&lhs.value, &rhs.value, &product.value);
+        return product;
+    }
+
+    // Replaces lhs with simd4x4f_matrix_mul(lhs, rhs) and returns a copy of
+    // the updated lhs.
+    vectorial_inline mat4f operator*=(mat4f& lhs, const mat4f& rhs) {
+        // Snapshot the left operand first: the call writes its output into lhs.value.
+        const simd4x4f before = lhs.value;
+        simd4x4f_matrix_mul(&before, &rhs.value, &lhs.value);
+        return lhs;
+    }
+
+
+    // Returns the vec4f computed by simd4x4f_matrix_vector_mul(lhs, rhs).
+    vectorial_inline vec4f operator*(const mat4f& lhs, const vec4f& rhs) {
+        vec4f transformed;
+        simd4x4f_matrix_vector_mul(&lhs.value, &rhs.value, &transformed.value);
+        return transformed;
+    }
+
+    // Returns the vec3f computed by simd4x4f_matrix_vector3_mul(lhs, rhs).
+    vectorial_inline vec3f transformVector(const mat4f& lhs, const vec3f& rhs) {
+        vec3f transformed;
+        simd4x4f_matrix_vector3_mul(&lhs.value, &rhs.value, &transformed.value);
+        return transformed;
+    }
+
+    // Returns the vec4f computed by simd4x4f_matrix_vector_mul(lhs, rhs).
+    vectorial_inline vec4f transformVector(const mat4f& lhs, const vec4f& rhs) {
+        vec4f transformed;
+        simd4x4f_matrix_vector_mul(&lhs.value, &rhs.value, &transformed.value);
+        return transformed;
+    }
+    
+    // Returns the vec3f computed by simd4x4f_matrix_point3_mul(lhs, rhs).
+    vectorial_inline vec3f transformPoint(const mat4f& lhs, const vec3f& rhs) {
+        vec3f transformed;
+        simd4x4f_matrix_point3_mul(&lhs.value, &rhs.value, &transformed.value);
+        return transformed;
+    }
+
+    // Returns the vec3f computed by simd4x4f_inv_ortho_matrix_point3_mul(lhs, rhs).
+    vectorial_inline vec3f orthoInverseTransformPoint(const mat4f& lhs, const vec3f& rhs) {
+        vec3f transformed;
+        simd4x4f_inv_ortho_matrix_point3_mul(&lhs.value, &rhs.value, &transformed.value);
+        return transformed;
+    }
+
+    // Returns the vec3f computed by simd4x4f_inv_ortho_matrix_vector3_mul(lhs, rhs).
+    vectorial_inline vec3f orthoInverseTransformVector(const mat4f& lhs, const vec3f& rhs) {
+        vec3f transformed;
+        simd4x4f_inv_ortho_matrix_vector3_mul(&lhs.value, &rhs.value, &transformed.value);
+        return transformed;
+    }
+
+    
+    // Returns the matrix computed by simd4x4f_transpose(m).
+    vectorial_inline mat4f transpose(const mat4f& m) {
+        mat4f flipped;
+        simd4x4f_transpose(&m.value, &flipped.value);
+        return flipped;
+    }
+
+
+    // Returns the matrix computed by simd4x4f_inverse(m).
+    vectorial_inline mat4f inverse(const mat4f& m) {
+        mat4f inverted;
+        simd4x4f_inverse(&m.value, &inverted.value);
+        return inverted;
+    }
+
+
+
+}
+
+
+
+#ifdef VECTORIAL_OSTREAM
+//#include <ostream>
+
+// Streams the matrix as "[ a, b, c, d ; ... ]": four groups separated by " ; ",
+// where each group lists one lane (x, then y, z, w) taken from value.x,
+// value.y, value.z and value.w in that order.
+vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::mat4f& v) {
+    const simd4x4f& m = v.value;
+
+    os << "[ "
+       << simd4f_get_x(m.x) << ", " << simd4f_get_x(m.y) << ", "
+       << simd4f_get_x(m.z) << ", " << simd4f_get_x(m.w) << " ; ";
+
+    os << simd4f_get_y(m.x) << ", " << simd4f_get_y(m.y) << ", "
+       << simd4f_get_y(m.z) << ", " << simd4f_get_y(m.w) << " ; ";
+
+    os << simd4f_get_z(m.x) << ", " << simd4f_get_z(m.y) << ", "
+       << simd4f_get_z(m.z) << ", " << simd4f_get_z(m.w) << " ; ";
+
+    os << simd4f_get_w(m.x) << ", " << simd4f_get_w(m.y) << ", "
+       << simd4f_get_w(m.z) << ", " << simd4f_get_w(m.w) << " ]";
+
+    return os;
+}
+#endif
+
+
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/simd2f.h
+++ b/3rdparty/vectorial/include/vectorial/simd2f.h
@ -0,0 +1,38 @@
+/*
+  Vectorial
+  Copyright (c) 2014 Google, Inc.
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+
+#ifndef VECTORIAL_SIMD2F_H
+#define VECTORIAL_SIMD2F_H
+
+#include "vectorial/config.h"
+
+#if defined(VECTORIAL_NEON)
+    #include "simd2f_neon.h"
+#else
+    #error No implementation defined
+#endif
+
+#include "simd2f_common.h"
+
+#ifdef __cplusplus
+
+    #ifdef VECTORIAL_OSTREAM
+        #include <ostream>
+
+        // Streams a simd2f as "simd2f(x, y)".
+        vectorial_inline std::ostream& operator<<(std::ostream& os, const simd2f& v) {
+            os << "simd2f(";
+            os << simd2f_get_x(v);
+            os << ", ";
+            os << simd2f_get_y(v);
+            os << ")";
+            return os;
+        }
+    #endif
+
+#endif
+
+
+
+
+#endif
+
--- a/3rdparty/vectorial/include/vectorial/simd2f_common.h
+++ b/3rdparty/vectorial/include/vectorial/simd2f_common.h
@ -0,0 +1,22 @@
+/*
+  Vectorial
+  Copyright (c) 2014 Google
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD2F_COMMON_H
+#define VECTORIAL_SIMD2F_COMMON_H
+
+// Length of a 2D vector: simd2f_sqrt of its dot product with itself.
+vectorial_inline simd2f simd2f_length2(simd2f v) {
+    const simd2f squared = simd2f_dot2(v, v);
+    return simd2f_sqrt(squared);
+}
+
+// Squared length of a 2D vector: its dot product with itself.
+vectorial_inline simd2f simd2f_length2_squared(simd2f v) {
+    const simd2f squared = simd2f_dot2(v, v);
+    return squared;
+}
+
+// Scales `a` by simd2f_rsqrt of its squared length. A zero-length input is
+// not special-cased here.
+vectorial_inline simd2f simd2f_normalize2(simd2f a) {
+    const simd2f squared = simd2f_dot2(a, a);
+    return simd2f_mul(a, simd2f_rsqrt(squared));
+}
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/simd2f_neon.h
+++ b/3rdparty/vectorial/include/vectorial/simd2f_neon.h
@ -0,0 +1,159 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Copyright (c) 2014 Google, Inc.
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD2F_NEON_H
+#define VECTORIAL_SIMD2F_NEON_H
+
+#include <arm_neon.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef float32x2_t simd2f;
+
+// Overlays a simd2f with a two-element float array so that the lanes can be
+// addressed by index.
+typedef union {
+    simd2f s ;
+    float f[2];
+} _simd2f_union;
+
+
+
+// Builds a simd2f holding { x, y } by loading from a temporary array.
+vectorial_inline simd2f simd2f_create(float x, float y) {
+    const float32_t lanes[2] = { x, y };
+    return vld1_f32(lanes);
+}
+
+// Returns a simd2f with both lanes set to 0.0f.
+vectorial_inline simd2f simd2f_zero() {
+    return vdup_n_f32(0.0f);
+}
+
+// Loads two consecutive floats starting at `ary` with vld1_f32.
+vectorial_inline simd2f simd2f_uload2(const float *ary) {
+    return vld1_f32((const float32_t*)ary);
+}
+
+// Stores both lanes of `val` to ary[0] and ary[1] with vst1_f32.
+vectorial_inline void simd2f_ustore2(const simd2f val, float *ary) {
+    float32_t* dst = (float32_t*)ary;
+    vst1_f32(dst, val);
+}
+
+// Returns a simd2f with both lanes set to `v`.
+vectorial_inline simd2f simd2f_splat(float v) {
+    return vdup_n_f32(v);
+}
+
+// Broadcasts lane 0 (x) of `v` into both lanes.
+vectorial_inline simd2f simd2f_splat_x(simd2f v) {
+    return vdup_lane_f32(v, 0);
+}
+
+// Broadcasts lane 1 (y) of `v` into both lanes.
+vectorial_inline simd2f simd2f_splat_y(simd2f v) {
+    return vdup_lane_f32(v, 1);
+}
+
+// Approximates 1/v per lane: starts from the vrecpe_f32 estimate and applies
+// two vrecps_f32 refinement steps.
+vectorial_inline simd2f simd2f_reciprocal(simd2f v) {
+    simd2f approx = vrecpe_f32(v);
+    for (int step = 0; step < 2; ++step) {
+        const simd2f correction = vrecps_f32(approx, v);
+        approx = vmul_f32(correction, approx);
+    }
+    return approx;
+}
+
+// Performs one refinement step on a reciprocal-square-root estimate of `v`,
+// updating `estimate` in place:
+//   estimate = estimate * vrsqrts_f32(estimate * v, estimate)
+// NOTE(review): the parameters are C++ references, so this header cannot be
+// compiled as C even though it is wrapped in extern "C".
+vectorial_inline void simd2f_rsqrt_1iteration(const simd2f& v, simd2f& estimate) {
+    simd2f estimate2 = vmul_f32(estimate, v);
+    estimate = vmul_f32(estimate, vrsqrts_f32(estimate2, estimate));
+}
+
+// Reciprocal square root: vrsqrte_f32 estimate plus one refinement step.
+vectorial_inline simd2f simd2f_rsqrt1(simd2f v) {
+    simd2f approx = vrsqrte_f32(v);
+    simd2f_rsqrt_1iteration(v, approx);
+    return approx;
+}
+
+// Reciprocal square root: vrsqrte_f32 estimate plus two refinement steps.
+vectorial_inline simd2f simd2f_rsqrt2(simd2f v) {
+    simd2f approx = vrsqrte_f32(v);
+    for (int step = 0; step < 2; ++step) {
+        simd2f_rsqrt_1iteration(v, approx);
+    }
+    return approx;
+}
+
+// Reciprocal square root: vrsqrte_f32 estimate plus three refinement steps.
+vectorial_inline simd2f simd2f_rsqrt3(simd2f v) {
+    simd2f approx = vrsqrte_f32(v);
+    for (int step = 0; step < 3; ++step) {
+        simd2f_rsqrt_1iteration(v, approx);
+    }
+    return approx;
+}
+
+// Default reciprocal square root. The article at
+// http://en.wikipedia.org/wiki/Fast_inverse_square_root makes the argument
+// for one iteration, but two give a significant accuracy improvement, so
+// this forwards to simd2f_rsqrt2.
+vectorial_inline simd2f simd2f_rsqrt(simd2f v) {
+    const simd2f refined = simd2f_rsqrt2(v);
+    return refined;
+}
+
+// Square root computed as simd2f_reciprocal(simd2f_rsqrt(v)). Lanes of `v`
+// whose bit pattern is all zeros are masked to 0 instead of taking that
+// result.
+vectorial_inline simd2f simd2f_sqrt(simd2f v) {
+    const uint32x2_t bits = vreinterpret_u32_f32(v);
+    // vtst sets a lane to all-ones when (bits & bits) != 0, i.e. v is not +0.0f.
+    const uint32x2_t nonzero = vtst_u32(bits, bits);
+    const simd2f root = simd2f_reciprocal(simd2f_rsqrt(v));
+    return vreinterpret_f32_u32(vand_u32(nonzero, vreinterpret_u32_f32(root)));
+}
+
+// arithmetic
+
+// Lane-wise lhs + rhs.
+vectorial_inline simd2f simd2f_add(simd2f lhs, simd2f rhs) {
+    return vadd_f32(lhs, rhs);
+}
+
+// Lane-wise lhs - rhs.
+vectorial_inline simd2f simd2f_sub(simd2f lhs, simd2f rhs) {
+    return vsub_f32(lhs, rhs);
+}
+
+// Lane-wise lhs * rhs.
+vectorial_inline simd2f simd2f_mul(simd2f lhs, simd2f rhs) {
+    return vmul_f32(lhs, rhs);
+}
+
+// Lane-wise division, implemented as lhs * simd2f_reciprocal(rhs); the
+// result therefore carries the accuracy of the reciprocal approximation.
+vectorial_inline simd2f simd2f_div(simd2f lhs, simd2f rhs) {
+    const simd2f inverse = simd2f_reciprocal(rhs);
+    return vmul_f32(lhs, inverse);
+}
+
+// Multiply-add: a + m1 * m2 per lane (vmla_f32).
+vectorial_inline simd2f simd2f_madd(simd2f m1, simd2f m2, simd2f a) {
+    const simd2f accumulated = vmla_f32(a, m1, m2);
+    return accumulated;
+}
+
+// Extracts lane 0 (x).
+vectorial_inline float simd2f_get_x(simd2f s) {
+    return vget_lane_f32(s, 0);
+}
+
+// Extracts lane 1 (y).
+vectorial_inline float simd2f_get_y(simd2f s) {
+    return vget_lane_f32(s, 1);
+}
+
+// 2D dot product; the pairwise add leaves the sum in both lanes.
+vectorial_inline simd2f simd2f_dot2(simd2f lhs, simd2f rhs) {
+    const simd2f products = simd2f_mul(lhs, rhs);
+    return vpadd_f32(products, products);
+}
+
+// Lane-wise minimum (vmin_f32).
+vectorial_inline simd2f simd2f_min(simd2f a, simd2f b) {
+    const simd2f lowest = vmin_f32(a, b);
+    return lowest;
+}
+
+// Lane-wise maximum (vmax_f32).
+vectorial_inline simd2f simd2f_max(simd2f a, simd2f b) {
+    const simd2f highest = vmax_f32(a, b);
+    return highest;
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
+
--- a/3rdparty/vectorial/include/vectorial/simd4f.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f.h
@ -0,0 +1,51 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+
+#ifndef VECTORIAL_SIMD4F_H
+#define VECTORIAL_SIMD4F_H
+
+#ifndef VECTORIAL_CONFIG_H
+  #include "vectorial/config.h"
+#endif
+
+
+#ifdef VECTORIAL_SCALAR
+    #include "simd4f_scalar.h"
+#elif defined(VECTORIAL_SSE)
+    #include "simd4f_sse.h"
+#elif defined(VECTORIAL_GNU)
+    #include "simd4f_gnu.h"
+#elif defined(VECTORIAL_NEON)
+    #include "simd4f_neon.h"
+#else
+    #error No implementation defined
+#endif
+
+#include "simd4f_common.h"
+
+
+
+#ifdef __cplusplus
+
+    #ifdef VECTORIAL_OSTREAM
+        #include <ostream>
+
+        // Streams a simd4f as "simd4f(x, y, z, w)".
+        vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4f& v) {
+            os << "simd4f(";
+            os << simd4f_get_x(v) << ", ";
+            os << simd4f_get_y(v) << ", ";
+            os << simd4f_get_z(v) << ", ";
+            os << simd4f_get_w(v) << ")";
+            return os;
+        }
+    #endif
+
+#endif
+
+
+
+
+#endif
+
--- a/3rdparty/vectorial/include/vectorial/simd4f_common.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_common.h
@ -0,0 +1,74 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Copyright (c) 2014 Google, Inc.
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4F_COMMON_H
+#define VECTORIAL_SIMD4F_COMMON_H
+
+
+// Horizontal sum: every lane of the result holds ((x + y) + z) + w of `v`.
+vectorial_inline simd4f simd4f_sum(simd4f v) {
+    simd4f total = simd4f_add(simd4f_splat_x(v), simd4f_splat_y(v));
+    total = simd4f_add(total, simd4f_splat_z(v));
+    total = simd4f_add(total, simd4f_splat_w(v));
+    return total;
+}
+
+// 4D dot product, broadcast to every lane.
+vectorial_inline simd4f simd4f_dot4(simd4f lhs, simd4f rhs) {
+    const simd4f products = simd4f_mul(lhs, rhs);
+    return simd4f_sum(products);
+}
+
+// 2D dot product (x and y lanes only), broadcast to every lane.
+vectorial_inline simd4f simd4f_dot2(simd4f lhs, simd4f rhs) {
+    const simd4f products = simd4f_mul(lhs, rhs);
+    return simd4f_add(simd4f_splat_x(products), simd4f_splat_y(products));
+}
+
+
+// Length over all four lanes: simd4f_sqrt(simd4f_dot4(v, v)).
+vectorial_inline simd4f simd4f_length4(simd4f v) {
+    const simd4f squared = simd4f_dot4(v, v);
+    return simd4f_sqrt(squared);
+}
+
+// Length over x, y, z: simd4f_sqrt(simd4f_dot3(v, v)).
+vectorial_inline simd4f simd4f_length3(simd4f v) {
+    const simd4f squared = simd4f_dot3(v, v);
+    return simd4f_sqrt(squared);
+}
+
+// Length over x, y: simd4f_sqrt(simd4f_dot2(v, v)).
+vectorial_inline simd4f simd4f_length2(simd4f v) {
+    const simd4f squared = simd4f_dot2(v, v);
+    return simd4f_sqrt(squared);
+}
+
+// Squared length over all four lanes.
+vectorial_inline simd4f simd4f_length4_squared(simd4f v) {
+    const simd4f squared = simd4f_dot4(v, v);
+    return squared;
+}
+
+// Squared length over x, y, z.
+vectorial_inline simd4f simd4f_length3_squared(simd4f v) {
+    const simd4f squared = simd4f_dot3(v, v);
+    return squared;
+}
+
+// Squared length over x, y, z, returned as a plain float.
+vectorial_inline float simd4f_length3_squared_scalar(simd4f v) {
+    const float squared = simd4f_dot3_scalar(v, v);
+    return squared;
+}
+
+// Squared length over x, y.
+vectorial_inline simd4f simd4f_length2_squared(simd4f v) {
+    const simd4f squared = simd4f_dot2(v, v);
+    return squared;
+}
+
+
+// Scales `a` by simd4f_rsqrt of its 4-lane squared length. A zero-length
+// input is not special-cased here.
+vectorial_inline simd4f simd4f_normalize4(simd4f a) {
+    const simd4f squared = simd4f_dot4(a, a);
+    return simd4f_mul(a, simd4f_rsqrt(squared));
+}
+
+// Scales `a` by simd4f_rsqrt of its xyz squared length. A zero-length input
+// is not special-cased here.
+vectorial_inline simd4f simd4f_normalize3(simd4f a) {
+    const simd4f squared = simd4f_dot3(a, a);
+    return simd4f_mul(a, simd4f_rsqrt(squared));
+}
+
+// Scales `a` by simd4f_rsqrt of its xy squared length. A zero-length input
+// is not special-cased here.
+vectorial_inline simd4f simd4f_normalize2(simd4f a) {
+    const simd4f squared = simd4f_dot2(a, a);
+    return simd4f_mul(a, simd4f_rsqrt(squared));
+}
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/simd4f_gnu.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_gnu.h
@ -0,0 +1,225 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4F_GNU_H
+#define VECTORIAL_SIMD4F_GNU_H
+
+#include <math.h>
+#include <string.h>  // memcpy
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef float simd4f __attribute__ ((vector_size (16)));
+
+// Overlays a simd4f with a four-element float array; the helpers below use
+// it to read individual lanes by index.
+typedef union {
+    simd4f s ;
+    float f[4];
+} _simd4f_union;
+
+// Lane accessors: copy the vector into _simd4f_union and read one float.
+vectorial_inline float simd4f_get_x(simd4f s) {
+    _simd4f_union lanes = { s };
+    return lanes.f[0];
+}
+vectorial_inline float simd4f_get_y(simd4f s) {
+    _simd4f_union lanes = { s };
+    return lanes.f[1];
+}
+vectorial_inline float simd4f_get_z(simd4f s) {
+    _simd4f_union lanes = { s };
+    return lanes.f[2];
+}
+vectorial_inline float simd4f_get_w(simd4f s) {
+    _simd4f_union lanes = { s };
+    return lanes.f[3];
+}
+
+
+// Builds a simd4f holding { x, y, z, w }.
+vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
+    simd4f packed = { x, y, z, w };
+    return packed;
+}
+
+// Returns a simd4f with all four lanes set to 0.0f.
+// Declared with (void) so that it is a proper prototype when this header is
+// compiled as C; in C++ this is identical to an empty parameter list.
+vectorial_inline simd4f simd4f_zero(void) { return simd4f_create(0.0f, 0.0f, 0.0f, 0.0f); }
+
+// Loads ary[0..3] into x, y, z, w.
+vectorial_inline simd4f simd4f_uload4(const float *ary) {
+    return simd4f_create(ary[0], ary[1], ary[2], ary[3]);
+}
+
+// Loads ary[0..2] into x, y, z and sets w to zero.
+vectorial_inline simd4f simd4f_uload3(const float *ary) {
+    return simd4f_create(ary[0], ary[1], ary[2], 0.0f);
+}
+
+// Loads ary[0..1] into x, y and sets z, w to zero.
+vectorial_inline simd4f simd4f_uload2(const float *ary) {
+    return simd4f_create(ary[0], ary[1], 0.0f, 0.0f);
+}
+
+
+// Copies all four lanes of `val` to ary[0..3].
+vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
+    memcpy(ary, &val, 4 * sizeof(float));
+}
+
+// Copies the first three lanes of `val` to ary[0..2].
+vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
+    memcpy(ary, &val, 3 * sizeof(float));
+}
+
+// Copies the first two lanes of `val` to ary[0..1].
+vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
+    memcpy(ary, &val, 2 * sizeof(float));
+}
+
+
+// Returns a simd4f with every lane set to `v`.
+vectorial_inline simd4f simd4f_splat(float v) {
+    simd4f filled = { v, v, v, v };
+    return filled;
+}
+
+// Broadcasts the x lane of `v` to every lane.
+vectorial_inline simd4f simd4f_splat_x(simd4f v) {
+    return simd4f_splat(simd4f_get_x(v));
+}
+
+// Broadcasts the y lane of `v` to every lane.
+vectorial_inline simd4f simd4f_splat_y(simd4f v) {
+    return simd4f_splat(simd4f_get_y(v));
+}
+
+// Broadcasts the z lane of `v` to every lane.
+vectorial_inline simd4f simd4f_splat_z(simd4f v) {
+    return simd4f_splat(simd4f_get_z(v));
+}
+
+// Broadcasts the w lane of `v` to every lane.
+vectorial_inline simd4f simd4f_splat_w(simd4f v) {
+    return simd4f_splat(simd4f_get_w(v));
+}
+
+// Lane-wise 1.0f / v using the compiler's vector division.
+vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
+    const simd4f ones = simd4f_splat(1.0f);
+    return ones / v;
+}
+
+// Lane-wise sqrtf.
+vectorial_inline simd4f simd4f_sqrt(simd4f v) {
+    return simd4f_create(sqrtf(simd4f_get_x(v)),
+                         sqrtf(simd4f_get_y(v)),
+                         sqrtf(simd4f_get_z(v)),
+                         sqrtf(simd4f_get_w(v)));
+}
+
+// Lane-wise 1.0f / sqrtf(v).
+vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
+    const simd4f ones = simd4f_splat(1.0f);
+    return ones / simd4f_sqrt(v);
+}
+
+
+
+// Lane-wise lhs + rhs.
+vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
+    return lhs + rhs;
+}
+
+// Lane-wise lhs - rhs.
+vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
+    return lhs - rhs;
+}
+
+// Lane-wise lhs * rhs.
+vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
+    return lhs * rhs;
+}
+
+// Lane-wise lhs / rhs.
+vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
+    return lhs / rhs;
+}
+
+// Multiply-add: m1 * m2 + a per lane, built from simd4f_mul and simd4f_add.
+vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
+    const simd4f product = simd4f_mul(m1, m2);
+    return simd4f_add(product, a);
+}
+
+// 3D dot product of the x, y, z lanes as a plain float; w is ignored.
+vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
+    return simd4f_get_x(lhs) * simd4f_get_x(rhs)
+         + simd4f_get_y(lhs) * simd4f_get_y(rhs)
+         + simd4f_get_z(lhs) * simd4f_get_z(rhs);
+}
+
+// 3D dot product broadcast to every lane.
+vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
+    const float dot = simd4f_dot3_scalar(lhs, rhs);
+    return simd4f_splat(dot);
+}
+
+// 3D cross product l x r; the w lane of the result is 0.
+vectorial_inline simd4f simd4f_cross3(simd4f l, simd4f r) {
+    const float lx = simd4f_get_x(l), ly = simd4f_get_y(l), lz = simd4f_get_z(l);
+    const float rx = simd4f_get_x(r), ry = simd4f_get_y(r), rz = simd4f_get_z(r);
+
+    return simd4f_create( ly * rz - lz * ry,
+                          lz * rx - lx * rz,
+                          lx * ry - ly * rx, 0);
+}
+
+
+// Returns { w, x, y, z } of `s`.
+vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
+    return simd4f_create(simd4f_get_w(s), simd4f_get_x(s), simd4f_get_y(s), simd4f_get_z(s));
+}
+
+// Returns { z, w, x, y } of `s`.
+vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
+    return simd4f_create(simd4f_get_z(s), simd4f_get_w(s), simd4f_get_x(s), simd4f_get_y(s));
+}
+
+// Returns { y, z, w, x } of `s`.
+vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
+    return simd4f_create(simd4f_get_y(s), simd4f_get_z(s), simd4f_get_w(s), simd4f_get_x(s));
+}
+
+
+// Returns `s` with the w lane replaced by 0.0f.
+vectorial_inline simd4f simd4f_zero_w(simd4f s) {
+    return simd4f_create(simd4f_get_x(s), simd4f_get_y(s), simd4f_get_z(s), 0.0f);
+}
+
+// Returns `s` with the z and w lanes replaced by 0.0f.
+vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
+    return simd4f_create(simd4f_get_x(s), simd4f_get_y(s), 0.0f, 0.0f);
+}
+
+
+// Returns { abcd.z, abcd.w, xyzw.z, xyzw.w }: the high halves of both inputs.
+vectorial_inline simd4f simd4f_merge_high(simd4f abcd, simd4f xyzw) {
+    return simd4f_create(simd4f_get_z(abcd), simd4f_get_w(abcd),
+                         simd4f_get_z(xyzw), simd4f_get_w(xyzw));
+}
+
+// Negates the y and w lanes; x and z are passed through.
+vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
+    return simd4f_create(simd4f_get_x(s), -simd4f_get_y(s), simd4f_get_z(s), -simd4f_get_w(s));
+}
+
+// Negates the x and z lanes; y and w are passed through.
+vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
+    return simd4f_create(-simd4f_get_x(s), simd4f_get_y(s), -simd4f_get_z(s), simd4f_get_w(s));
+}
+
+
+// Lane-wise minimum: picks a's lane when a < b, otherwise b's lane.
+vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
+    _simd4f_union lo = {b};
+    const _simd4f_union cand = {a};
+    if (cand.f[0] < lo.f[0]) lo.f[0] = cand.f[0];
+    if (cand.f[1] < lo.f[1]) lo.f[1] = cand.f[1];
+    if (cand.f[2] < lo.f[2]) lo.f[2] = cand.f[2];
+    if (cand.f[3] < lo.f[3]) lo.f[3] = cand.f[3];
+    return simd4f_create(lo.f[0], lo.f[1], lo.f[2], lo.f[3]);
+}
+
+// Lane-wise maximum: picks a's lane when a > b, otherwise b's lane.
+vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
+    _simd4f_union hi = {b};
+    const _simd4f_union cand = {a};
+    if (cand.f[0] > hi.f[0]) hi.f[0] = cand.f[0];
+    if (cand.f[1] > hi.f[1]) hi.f[1] = cand.f[1];
+    if (cand.f[2] > hi.f[2]) hi.f[2] = cand.f[2];
+    if (cand.f[3] > hi.f[3]) hi.f[3] = cand.f[3];
+    return simd4f_create(hi.f[0], hi.f[1], hi.f[2], hi.f[3]);
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
+
--- a/3rdparty/vectorial/include/vectorial/simd4f_neon.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_neon.h
@ -0,0 +1,280 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Copyright (c) 2014 Google, Inc.
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4F_NEON_H
+#define VECTORIAL_SIMD4F_NEON_H
+
+#include <arm_neon.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef float32x4_t simd4f;
+typedef float32x2_t simd2f;
+
+// Overlays a simd4f with a four-element float array so that the lanes can be
+// addressed by index.
+typedef union {
+    simd4f s ;
+    float f[4];
+} _simd4f_union;
+
+
+
+// Builds a simd4f holding { x, y, z, w } by loading from a temporary array.
+vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
+    const float32_t lanes[4] = { x, y, z, w };
+    return vld1q_f32(lanes);
+}
+
+// Returns a simd4f with all four lanes set to 0.0f.
+vectorial_inline simd4f simd4f_zero() {
+    return vdupq_n_f32(0.0f);
+}
+
+// Loads four consecutive floats starting at `ary` with vld1q_f32.
+vectorial_inline simd4f simd4f_uload4(const float *ary) {
+    return vld1q_f32((const float32_t*)ary);
+}
+
+// Loads ary[0..2] into x, y, z and sets w to zero.
+vectorial_inline simd4f simd4f_uload3(const float *ary) {
+    return simd4f_create(ary[0], ary[1], ary[2], 0);
+}
+
+// Loads ary[0..1] into x, y and sets z, w to zero.
+vectorial_inline simd4f simd4f_uload2(const float *ary) {
+    const float32x2_t xy = vld1_f32((const float32_t*)ary);
+    // The zero half is produced with a dup-load from a local; the original
+    // author did this to avoid warnings from llvm-gcc.
+    const float32_t zero = 0;
+    const float32x2_t zw = vld1_dup_f32(&zero);
+    return vcombine_f32(xy, zw);
+}
+
+
+// Stores all four lanes of `val` to ary[0..3].
+vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
+    float32_t* dst = (float32_t*)ary;
+    vst1q_f32(dst, val);
+}
+
+// Stores lanes 0, 1, 2 of `val` to ary[0..2], one lane at a time.
+vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
+    vst1q_lane_f32(ary + 0, val, 0);
+    vst1q_lane_f32(ary + 1, val, 1);
+    vst1q_lane_f32(ary + 2, val, 2);
+}
+
+// Stores the low half (x, y) of `val` to ary[0..1].
+vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
+    vst1_f32((float32_t*)ary, vget_low_f32(val));
+}
+
+
+
+
+// Returns a simd4f with every lane set to `v`.
+vectorial_inline simd4f simd4f_splat(float v) {
+    return vdupq_n_f32(v);
+}
+
+// todo: or is simd4f_splat(simd4f_get_x(v))  better?
+
+// Broadcasts x (lane 0 of the low half) to every lane.
+vectorial_inline simd4f simd4f_splat_x(simd4f v) {
+    return vdupq_lane_f32(vget_low_f32(v), 0);
+}
+
+// Broadcasts y (lane 1 of the low half) to every lane.
+vectorial_inline simd4f simd4f_splat_y(simd4f v) {
+    return vdupq_lane_f32(vget_low_f32(v), 1);
+}
+
+// Broadcasts z (lane 0 of the high half) to every lane.
+vectorial_inline simd4f simd4f_splat_z(simd4f v) {
+    return vdupq_lane_f32(vget_high_f32(v), 0);
+}
+
+// Broadcasts w (lane 1 of the high half) to every lane.
+vectorial_inline simd4f simd4f_splat_w(simd4f v) {
+    return vdupq_lane_f32(vget_high_f32(v), 1);
+}
+
+// Approximates 1/v per lane: starts from the vrecpeq_f32 estimate and applies
+// two vrecpsq_f32 refinement steps.
+vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
+    simd4f approx = vrecpeq_f32(v);
+    for (int step = 0; step < 2; ++step) {
+        const simd4f correction = vrecpsq_f32(approx, v);
+        approx = vmulq_f32(correction, approx);
+    }
+    return approx;
+}
+
+// Performs one refinement step on a reciprocal-square-root estimate of `v`,
+// updating `estimate` in place:
+//   estimate = estimate * vrsqrtsq_f32(estimate * v, estimate)
+// NOTE(review): the parameters are C++ references, so this header cannot be
+// compiled as C even though it is wrapped in extern "C".
+vectorial_inline void simd4f_rsqrt_1iteration(const simd4f& v, simd4f& estimate) {
+    simd4f estimate2 = vmulq_f32(estimate, v);
+    estimate = vmulq_f32(estimate, vrsqrtsq_f32(estimate2, estimate));
+}
+
+// Reciprocal square root: vrsqrteq_f32 estimate plus one refinement step.
+vectorial_inline simd4f simd4f_rsqrt1(simd4f v) {
+    simd4f approx = vrsqrteq_f32(v);
+    simd4f_rsqrt_1iteration(v, approx);
+    return approx;
+}
+
+// Reciprocal square root: vrsqrteq_f32 estimate plus two refinement steps.
+vectorial_inline simd4f simd4f_rsqrt2(simd4f v) {
+    simd4f approx = vrsqrteq_f32(v);
+    for (int step = 0; step < 2; ++step) {
+        simd4f_rsqrt_1iteration(v, approx);
+    }
+    return approx;
+}
+
+// Reciprocal square root: vrsqrteq_f32 estimate plus three refinement steps.
+vectorial_inline simd4f simd4f_rsqrt3(simd4f v) {
+    simd4f approx = vrsqrteq_f32(v);
+    for (int step = 0; step < 3; ++step) {
+        simd4f_rsqrt_1iteration(v, approx);
+    }
+    return approx;
+}
+
+// Default reciprocal square root. The article at
+// http://en.wikipedia.org/wiki/Fast_inverse_square_root makes the argument
+// for one iteration, but two give a significant accuracy improvement, so
+// this forwards to simd4f_rsqrt2.
+vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
+    const simd4f refined = simd4f_rsqrt2(v);
+    return refined;
+}
+
+// Square root computed as simd4f_reciprocal(simd4f_rsqrt(v)). Lanes of `v`
+// whose bit pattern is all zeros are masked to 0 instead of taking that
+// result.
+vectorial_inline simd4f simd4f_sqrt(simd4f v) {
+    const uint32x4_t bits = vreinterpretq_u32_f32(v);
+    // vtst sets a lane to all-ones when (bits & bits) != 0, i.e. v is not +0.0f.
+    const uint32x4_t nonzero = vtstq_u32(bits, bits);
+    const simd4f root = simd4f_reciprocal(simd4f_rsqrt(v));
+    return vreinterpretq_f32_u32(vandq_u32(nonzero, vreinterpretq_u32_f32(root)));
+}
+
+
+
+// arithmetic
+
+// Lane-wise lhs + rhs.
+vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
+    return vaddq_f32(lhs, rhs);
+}
+
+// Lane-wise lhs - rhs.
+vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
+    return vsubq_f32(lhs, rhs);
+}
+
+// Lane-wise lhs * rhs.
+vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
+    return vmulq_f32(lhs, rhs);
+}
+
+// Lane-wise division, implemented as lhs * simd4f_reciprocal(rhs); the
+// result therefore carries the accuracy of the reciprocal approximation.
+vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
+    const simd4f inverse = simd4f_reciprocal(rhs);
+    return vmulq_f32(lhs, inverse);
+}
+
+// Multiply-add: a + m1 * m2 per lane (vmlaq_f32).
+vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
+    const simd4f accumulated = vmlaq_f32(a, m1, m2);
+    return accumulated;
+}
+
+
+
+// Lane accessors: x, y, z, w are lanes 0, 1, 2, 3.
+vectorial_inline float simd4f_get_x(simd4f s) {
+    return vgetq_lane_f32(s, 0);
+}
+vectorial_inline float simd4f_get_y(simd4f s) {
+    return vgetq_lane_f32(s, 1);
+}
+vectorial_inline float simd4f_get_z(simd4f s) {
+    return vgetq_lane_f32(s, 2);
+}
+vectorial_inline float simd4f_get_w(simd4f s) {
+    return vgetq_lane_f32(s, 3);
+}
+
+// Returns lhs.x*rhs.x + lhs.y*rhs.y + lhs.z*rhs.z as a plain float; the w
+// components are ignored.
+vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
+    const simd4f products = simd4f_mul(lhs, rhs);
+    const simd2f xy = vget_low_f32(products);
+    const simd2f zw = vget_high_f32(products);
+    // The pairwise add puts x+y in both lanes; adding zw makes lane 0 x+y+z.
+    const simd2f sums = vadd_f32(vpadd_f32(xy, xy), zw);
+    return vget_lane_f32(sums, 0);
+}
+
+// 3D dot product broadcast to every lane.
+vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
+    const float dot = simd4f_dot3_scalar(lhs, rhs);
+    return simd4f_splat(dot);
+}
+
+// 3D cross product lhs x rhs; the w lane of the result is 0.
+//
+// The w lane is cleared with vsetq_lane_f32 rather than by AND-ing with a
+// function-local static mask: a static with a runtime initializer needs a
+// guarded initialization in C++ and is not valid C, and the old form also
+// relied on C-style casts between vector types.
+vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
+    // Rotate both operands into yzx order (the fourth lane is a don't-care).
+    const simd2f lhs_low = vget_low_f32(lhs);
+    const simd2f rhs_low = vget_low_f32(rhs);
+    const simd4f lhs_yzx = vcombine_f32(vext_f32(lhs_low, vget_high_f32(lhs), 1), lhs_low);
+    const simd4f rhs_yzx = vcombine_f32(vext_f32(rhs_low, vget_high_f32(rhs), 1), rhs_low);
+    // rhs_yzx * lhs - lhs_yzx * rhs yields the cross product in zxy order.
+    const simd4f zxy = simd4f_sub(simd4f_mul(rhs_yzx, lhs), simd4f_mul(lhs_yzx, rhs));
+    // Rotate back to xyz order, then force the fourth lane to zero.
+    const simd2f zxy_low = vget_low_f32(zxy);
+    const simd4f xyz = vcombine_f32(vext_f32(zxy_low, vget_high_f32(zxy), 1), zxy_low);
+    return vsetq_lane_f32(0.0f, xyz, 3);
+}
+
+// Lane rotations. vextq_f32(s, s, n) returns lanes n..3 of `s` followed by
+// lanes 0..n-1, so each shuffle is a single register operation instead of a
+// round trip through a union and an array load.
+
+// Returns { w, x, y, z } of `s`.
+vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
+    return vextq_f32(s, s, 3);
+}
+
+// Returns { z, w, x, y } of `s`.
+vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
+    return vextq_f32(s, s, 2);
+}
+
+// Returns { y, z, w, x } of `s`.
+vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
+    return vextq_f32(s, s, 1);
+}
+
+
+// Returns `s` with the w lane replaced by 0.0f.
+vectorial_inline simd4f simd4f_zero_w(simd4f s) {
+    return vsetq_lane_f32(0.0f, s, 3);
+}
+
+// Returns `s` with the z and w lanes replaced by 0.0f: keeps the low half
+// and pairs it with a zeroed high half.
+vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
+    return vcombine_f32(vget_low_f32(s), vdup_n_f32(0.0f));
+}
+
+
+// Returns { xyzw.z, xyzw.w, abcd.z, abcd.w }: the high halves of both inputs.
+vectorial_inline simd4f simd4f_merge_high(simd4f xyzw, simd4f abcd) {
+    return vcombine_f32(vget_high_f32(xyzw), vget_high_f32(abcd));
+}
+
+// Negates the y and w lanes by XOR-ing their sign bits; x and z pass through.
+// The mask is declared as uint32_t because vld1q_u32 takes const uint32_t*,
+// which is not `unsigned int` on every target.
+vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
+    const uint32_t upnpn[4] = { 0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u };
+    const uint32x4_t pnpn = vld1q_u32( upnpn );
+    return vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32(s), pnpn ) );
+}
+
+// Negates the x and z lanes by XOR-ing their sign bits; y and w pass through.
+// The mask is declared as uint32_t because vld1q_u32 takes const uint32_t*,
+// which is not `unsigned int` on every target.
+vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
+    const uint32_t unpnp[4] = { 0x80000000u, 0x00000000u, 0x80000000u, 0x00000000u };
+    const uint32x4_t npnp = vld1q_u32( unpnp );
+    return vreinterpretq_f32_u32( veorq_u32( vreinterpretq_u32_f32(s), npnp ) );
+}
+
+
+// Lane-wise minimum (vminq_f32).
+vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
+    const simd4f lowest = vminq_f32(a, b);
+    return lowest;
+}
+
+// Lane-wise maximum (vmaxq_f32).
+vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
+    const simd4f highest = vmaxq_f32(a, b);
+    return highest;
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/simd4f_scalar.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_scalar.h
@ -0,0 +1,199 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4F_SCALAR_H
+#define VECTORIAL_SIMD4F_SCALAR_H
+
+#include <math.h>
+#include <string.h>  // memcpy
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// Scalar fallback representation of a four-lane float vector: four plain
+// floats in x, y, z, w order.
+typedef struct { 
+    float x;
+    float y; 
+    float z; 
+    float w;
+} simd4f;
+
+
+
+// Builds a simd4f holding { x, y, z, w }.
+vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
+    simd4f packed = { x, y, z, w };
+    return packed;
+}
+
+// Returns a simd4f with all four components set to 0.0f.
+// Declared with (void) so that it is a proper prototype when this header is
+// compiled as C; in C++ this is identical to an empty parameter list.
+vectorial_inline simd4f simd4f_zero(void) { return simd4f_create(0.0f, 0.0f, 0.0f, 0.0f); }
+
+// Loads ary[0..3] into x, y, z, w.
+vectorial_inline simd4f simd4f_uload4(const float *ary) {
+    return simd4f_create(ary[0], ary[1], ary[2], ary[3]);
+}
+
+// Loads ary[0..2] into x, y, z and sets w to zero.
+vectorial_inline simd4f simd4f_uload3(const float *ary) {
+    return simd4f_create(ary[0], ary[1], ary[2], 0.0f);
+}
+
+// Loads ary[0..1] into x, y and sets z, w to zero.
+vectorial_inline simd4f simd4f_uload2(const float *ary) {
+    return simd4f_create(ary[0], ary[1], 0.0f, 0.0f);
+}
+
+
+// Copies x, y, z, w of `val` to ary[0..3].
+vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
+    memcpy(ary, &val, 4 * sizeof(float));
+}
+
+// Copies x, y, z of `val` to ary[0..2].
+vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
+    memcpy(ary, &val, 3 * sizeof(float));
+}
+
+// Copies x, y of `val` to ary[0..1].
+vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
+    memcpy(ary, &val, 2 * sizeof(float));
+}
+
+
+
+// utilities
+
+// Returns a simd4f with every component set to `v`.
+vectorial_inline simd4f simd4f_splat(float v) {
+    return simd4f_create(v, v, v, v);
+}
+
+// Broadcasts v.x to every component.
+vectorial_inline simd4f simd4f_splat_x(simd4f v) {
+    return simd4f_create(v.x, v.x, v.x, v.x);
+}
+
+// Broadcasts v.y to every component.
+vectorial_inline simd4f simd4f_splat_y(simd4f v) {
+    return simd4f_create(v.y, v.y, v.y, v.y);
+}
+
+// Broadcasts v.z to every component.
+vectorial_inline simd4f simd4f_splat_z(simd4f v) {
+    return simd4f_create(v.z, v.z, v.z, v.z);
+}
+
+// Broadcasts v.w to every component.
+vectorial_inline simd4f simd4f_splat_w(simd4f v) {
+    return simd4f_create(v.w, v.w, v.w, v.w);
+}
+
+// Component-wise 1.0f / v.
+vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
+    return simd4f_create(1.0f/v.x, 1.0f/v.y, 1.0f/v.z, 1.0f/v.w);
+}
+
+// Component-wise sqrtf.
+vectorial_inline simd4f simd4f_sqrt(simd4f v) {
+    return simd4f_create(sqrtf(v.x), sqrtf(v.y), sqrtf(v.z), sqrtf(v.w));
+}
+
+// Component-wise 1.0f / sqrtf(v).
+vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
+    return simd4f_create(1.0f/sqrtf(v.x), 1.0f/sqrtf(v.y), 1.0f/sqrtf(v.z), 1.0f/sqrtf(v.w));
+}
+
+
+// arithmetic
+
+// Component-wise lhs + rhs.
+vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
+    return simd4f_create(lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z, lhs.w + rhs.w);
+}
+
+// Component-wise lhs - rhs.
+vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
+    return simd4f_create(lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z, lhs.w - rhs.w);
+}
+
+// Component-wise lhs * rhs.
+vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
+    return simd4f_create(lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z, lhs.w * rhs.w);
+}
+
+// Component-wise lhs / rhs.
+vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
+    return simd4f_create(lhs.x / rhs.x, lhs.y / rhs.y, lhs.z / rhs.z, lhs.w / rhs.w);
+}
+
+// Multiply-add: m1 * m2 + a, built from simd4f_mul and simd4f_add.
+vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
+    const simd4f product = simd4f_mul(m1, m2);
+    return simd4f_add(product, a);
+}
+
+// 3D dot product of the x, y, z components as a plain float; w is ignored.
+vectorial_inline float simd4f_dot3_scalar(simd4f lhs, simd4f rhs) {
+    const float dot = lhs.x * rhs.x + lhs.y * rhs.y + lhs.z * rhs.z;
+    return dot;
+}
+
+// 3D dot product broadcast to every component.
+vectorial_inline simd4f simd4f_dot3(simd4f lhs, simd4f rhs) {
+    const float dot = simd4f_dot3_scalar(lhs, rhs);
+    return simd4f_splat(dot);
+}
+
+// 3D cross product lhs x rhs; the w component of the result is 0.
+vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
+    simd4f cross = { lhs.y * rhs.z - lhs.z * rhs.y,
+                     lhs.z * rhs.x - lhs.x * rhs.z,
+                     lhs.x * rhs.y - lhs.y * rhs.x,
+                     0 };
+    return cross;
+}
+
+
+// Component accessors.
+vectorial_inline float simd4f_get_x(simd4f s) {
+    return s.x;
+}
+vectorial_inline float simd4f_get_y(simd4f s) {
+    return s.y;
+}
+vectorial_inline float simd4f_get_z(simd4f s) {
+    return s.z;
+}
+vectorial_inline float simd4f_get_w(simd4f s) {
+    return s.w;
+}
+
+
+// Returns { w, x, y, z } of `s`.
+vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
+    simd4f rotated = { s.w, s.x, s.y, s.z };
+    return rotated;
+}
+// Returns { z, w, x, y } of `s`.
+vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
+    simd4f rotated = { s.z, s.w, s.x, s.y };
+    return rotated;
+}
+// Returns { y, z, w, x } of `s`.
+vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
+    simd4f rotated = { s.y, s.z, s.w, s.x };
+    return rotated;
+}
+
+
+// Returns `s` with w replaced by 0.0f.
+vectorial_inline simd4f simd4f_zero_w(simd4f s) {
+    simd4f masked = { s.x, s.y, s.z, 0.0f };
+    return masked;
+}
+
+// Returns `s` with z and w replaced by 0.0f.
+vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
+    simd4f masked = { s.x, s.y, 0.0f, 0.0f };
+    return masked;
+}
+
+
+// Returns { abcd.z, abcd.w, xyzw.z, xyzw.w }: the high halves of both inputs.
+vectorial_inline simd4f simd4f_merge_high(simd4f abcd, simd4f xyzw) {
+    simd4f merged = { abcd.z, abcd.w, xyzw.z, xyzw.w };
+    return merged;
+}
+
+// Negates y and w; x and z pass through.
+vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
+    simd4f flipped = { s.x, -s.y, s.z, -s.w };
+    return flipped;
+}
+
+// Negates x and z; y and w pass through.
+vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
+    simd4f flipped = { -s.x, s.y, -s.z, s.w };
+    return flipped;
+}
+
+// Component-wise minimum: takes a's component when a < b, otherwise b's.
+vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
+    simd4f lowest = b;
+    if (a.x < b.x) lowest.x = a.x;
+    if (a.y < b.y) lowest.y = a.y;
+    if (a.z < b.z) lowest.z = a.z;
+    if (a.w < b.w) lowest.w = a.w;
+    return lowest;
+}
+
+// Component-wise maximum: takes a's component when a > b, otherwise b's.
+vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
+    simd4f highest = b;
+    if (a.x > b.x) highest.x = a.x;
+    if (a.y > b.y) highest.y = a.y;
+    if (a.z > b.z) highest.z = a.z;
+    if (a.w > b.w) highest.w = a.w;
+    return highest;
+}
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
+
--- a/3rdparty/vectorial/include/vectorial/simd4f_sse.h
+++ b/3rdparty/vectorial/include/vectorial/simd4f_sse.h
@ -0,0 +1,236 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Copyright (c) 2014 Google, Inc.
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4F_SSE_H
+#define VECTORIAL_SIMD4F_SSE_H
+
+// Enable the SSE4.1 code paths only when the compiler is really targeting an
+// instruction set that contains SSE4.1; otherwise fall back to plain SSE.
+//   - GCC/Clang define __SSE4_1__ when SSE4.1 is enabled.
+//   - MSVC has no SSE4.1-specific macro, but defines __AVX__ for /arch:AVX
+//     and above, which implies SSE4.1.
+// MSVC's _M_IX86_FP >= 2 only reports SSE2 code generation, so it must not be
+// used to gate SSE4.1 intrinsics such as _mm_dp_ps.
+// A build may still define VECTORIAL_USE_SSE4_1 itself to force the fast path.
+#if !defined(VECTORIAL_USE_SSE4_1)
+    #if defined(__SSE4_1__) || defined(__AVX__)
+        #define VECTORIAL_USE_SSE4_1
+    #endif
+#endif
+
+#include <xmmintrin.h>
+#if defined(VECTORIAL_USE_SSE4_1)
+    #include <smmintrin.h>
+#endif
+#include <string.h>  // memcpy
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef __m128 simd4f; // the SSE backend's vector type: one __m128 value
+// Overlays a simd4f with float and unsigned-int arrays so single lanes can be read.
+typedef union {
+    simd4f s ;            // the vector itself; first member, so `{ v }` initializes it
+    float f[4];           // the same storage viewed as four floats
+    unsigned int ui[4];   // the same storage viewed as four unsigned ints
+} _simd4f_union;
+
+// creating
+
+// Builds a vector whose lanes are, lowest first, x, y, z, w.
+vectorial_inline simd4f simd4f_create(float x, float y, float z, float w) {
+    // _mm_setr_ps takes its arguments in memory (lowest-lane-first) order.
+    return _mm_setr_ps(x, y, z, w);
+}
+
+// Returns a vector with all four lanes set to zero.
+vectorial_inline simd4f simd4f_zero() {
+    const simd4f zeros = _mm_setzero_ps();
+    return zeros;
+}
+
+// Loads four consecutive floats from ary using the unaligned load intrinsic.
+vectorial_inline simd4f simd4f_uload4(const float *ary) {
+    return _mm_loadu_ps(ary);
+}
+
+// Loads ary[0..2] into x, y, z and sets w to zero; ary[3] is never read.
+vectorial_inline simd4f simd4f_uload3(const float *ary) {
+    return simd4f_create(ary[0], ary[1], ary[2], 0);
+}
+
+// Loads ary[0..1] into x, y and sets z and w to zero; only two floats are read.
+vectorial_inline simd4f simd4f_uload2(const float *ary) {
+    return simd4f_create(ary[0], ary[1], 0, 0);
+}
+
+
+// Stores all four lanes of val to ary using the unaligned store intrinsic.
+vectorial_inline void simd4f_ustore4(const simd4f val, float *ary) {
+    float *const dst = ary;
+    _mm_storeu_ps(dst, val);
+}
+
+// Copies the first three floats of val to ary; ary[3] is left untouched.
+vectorial_inline void simd4f_ustore3(const simd4f val, float *ary) {
+    memcpy(ary, &val, 3 * sizeof(float));
+}
+
+// Copies the first two floats of val to ary; nothing past ary[1] is written.
+vectorial_inline void simd4f_ustore2(const simd4f val, float *ary) {
+    memcpy(ary, &val, 2 * sizeof(float));
+}
+
+
+// utilities
+
+// Returns a vector with v in all four lanes.
+vectorial_inline simd4f simd4f_splat(float v) {
+    return _mm_set1_ps(v);
+}
+
+// Broadcasts lane 0 (x) of v to all four lanes.
+vectorial_inline simd4f simd4f_splat_x(simd4f v) {
+    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0,0,0,0));
+}
+
+// Broadcasts lane 1 (y) of v to all four lanes.
+vectorial_inline simd4f simd4f_splat_y(simd4f v) {
+    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(1,1,1,1));
+}
+
+// Broadcasts lane 2 (z) of v to all four lanes.
+vectorial_inline simd4f simd4f_splat_z(simd4f v) {
+    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,2,2,2));
+}
+
+// Broadcasts lane 3 (w) of v to all four lanes.
+vectorial_inline simd4f simd4f_splat_w(simd4f v) {
+    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3,3,3,3));
+}
+
+
+// arithmetic
+
+// Lane-wise lhs + rhs.
+vectorial_inline simd4f simd4f_add(simd4f lhs, simd4f rhs) {
+    return _mm_add_ps(lhs, rhs);
+}
+
+// Lane-wise lhs - rhs.
+vectorial_inline simd4f simd4f_sub(simd4f lhs, simd4f rhs) {
+    return _mm_sub_ps(lhs, rhs);
+}
+
+// Lane-wise lhs * rhs.
+vectorial_inline simd4f simd4f_mul(simd4f lhs, simd4f rhs) {
+    return _mm_mul_ps(lhs, rhs);
+}
+
+// Lane-wise lhs / rhs.
+vectorial_inline simd4f simd4f_div(simd4f lhs, simd4f rhs) {
+    return _mm_div_ps(lhs, rhs);
+}
+
+// Lane-wise m1 * m2 + a, computed as a separate multiply followed by an add.
+vectorial_inline simd4f simd4f_madd(simd4f m1, simd4f m2, simd4f a) {
+    const simd4f product = simd4f_mul(m1, m2);
+    return simd4f_add(product, a);
+}
+
+
+
+
+// Lane-wise 1/v: starts from the _mm_rcp_ps estimate e and applies one
+// refinement step, e * (2 - v * e).
+vectorial_inline simd4f simd4f_reciprocal(simd4f v) {
+    const simd4f estimate = _mm_rcp_ps(v);
+    const simd4f two = simd4f_create(2.0f, 2.0f, 2.0f, 2.0f);
+    const simd4f correction = simd4f_sub(two, simd4f_mul(v, estimate));
+    return simd4f_mul(estimate, correction);
+}
+
+// Lane-wise square root.
+vectorial_inline simd4f simd4f_sqrt(simd4f v) {
+    return _mm_sqrt_ps(v);
+}
+
+// Lane-wise 1/sqrt(v): starts from the _mm_rsqrt_ps estimate e and applies
+// one refinement step, (e * 0.5) * (3 - e * (v * e)).
+vectorial_inline simd4f simd4f_rsqrt(simd4f v) {
+    const simd4f estimate = _mm_rsqrt_ps(v);
+    const simd4f half = simd4f_create(0.5f, 0.5f, 0.5f, 0.5f);
+    const simd4f three = simd4f_create(3.0f, 3.0f, 3.0f, 3.0f);
+    const simd4f half_estimate = simd4f_mul(estimate, half);
+    const simd4f v_e = simd4f_mul(v, estimate);
+    const simd4f e_v_e = simd4f_mul(estimate, v_e);
+    return simd4f_mul(half_estimate, simd4f_sub(three, e_v_e));
+}
+
+// Lane accessors: view the vector through _simd4f_union and return one float.
+vectorial_inline float simd4f_get_x(simd4f s) {
+    _simd4f_union lanes = { s };
+    return lanes.f[0];
+}
+vectorial_inline float simd4f_get_y(simd4f s) {
+    _simd4f_union lanes = { s };
+    return lanes.f[1];
+}
+vectorial_inline float simd4f_get_z(simd4f s) {
+    _simd4f_union lanes = { s };
+    return lanes.f[2];
+}
+vectorial_inline float simd4f_get_w(simd4f s) {
+    _simd4f_union lanes = { s };
+    return lanes.f[3];
+}
+
+// Dot product of the x, y, z lanes of lhs and rhs; the w lanes are ignored.
+// The scalar result is replicated into every lane of the returned vector.
+vectorial_inline simd4f simd4f_dot3(simd4f lhs,simd4f rhs) {
+#if defined(VECTORIAL_USE_SSE4_1)
+    // 0x7f: multiply lanes 0-2 (high nibble 0111), write the sum to all lanes (low nibble 1111).
+    return _mm_dp_ps(lhs, rhs, 0x7f);
+#else
+    // All-ones bits for x, y, z and zero for w, so the AND clears the w product.
+    simd4f_aligned16 const unsigned int xyz_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
+    const simd4f xyz_mask = _mm_load_ps((const float*)xyz_bits);
+    const simd4f products = _mm_and_ps(_mm_mul_ps(lhs, rhs), xyz_mask);
+    // Lane 0 becomes x+z and lane 1 becomes y+0.
+    const simd4f pair_sums = _mm_add_ps(products, _mm_movehl_ps(products, products));
+    // Add lane 1 into lane 0, then broadcast lane 0.
+    const simd4f total = _mm_add_ss(pair_sums, _mm_shuffle_ps(pair_sums, pair_sums, 1));
+    return _mm_shuffle_ps(total, total, 0);
+#endif
+}
+
+// Same as simd4f_dot3, but returns the result as a plain float (lane x).
+vectorial_inline float simd4f_dot3_scalar(simd4f lhs,simd4f rhs) {
+    const simd4f dot = simd4f_dot3(lhs, rhs);
+    return simd4f_get_x(dot);
+}
+
+// Cross product of the x, y, z lanes, computed as
+// lhs.yzx * rhs.zxy - lhs.zxy * rhs.yzx. The w lane stays in place through
+// both shuffles, so the result's w is lhs.w*rhs.w - lhs.w*rhs.w.
+vectorial_inline simd4f simd4f_cross3(simd4f lhs, simd4f rhs) {
+    const simd4f lhs_yzx = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3,0,2,1));
+    const simd4f lhs_zxy = _mm_shuffle_ps(lhs, lhs, _MM_SHUFFLE(3,1,0,2));
+    const simd4f rhs_yzx = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3,0,2,1));
+    const simd4f rhs_zxy = _mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3,1,0,2));
+
+    const simd4f positive = _mm_mul_ps(lhs_yzx, rhs_zxy);
+    const simd4f negative = _mm_mul_ps(lhs_zxy, rhs_yzx);
+    return _mm_sub_ps(positive, negative);
+}
+
+// Lane permutations: the suffix lists which source lane lands in each output
+// slot, lowest slot first. _MM_SHUFFLE takes its selectors highest slot first.
+vectorial_inline simd4f simd4f_shuffle_wxyz(simd4f s) {
+    return _mm_shuffle_ps(s, s, _MM_SHUFFLE(2,1,0,3));
+}
+vectorial_inline simd4f simd4f_shuffle_zwxy(simd4f s) {
+    return _mm_shuffle_ps(s, s, _MM_SHUFFLE(1,0,3,2));
+}
+vectorial_inline simd4f simd4f_shuffle_yzwx(simd4f s) {
+    return _mm_shuffle_ps(s, s, _MM_SHUFFLE(0,3,2,1));
+}
+
+// Returns (s.x, s.y, s.z, 0).
+vectorial_inline simd4f simd4f_zero_w(simd4f s) {
+    const simd4f zeros = _mm_setzero_ps();
+    // Interleave the high halves: (s.z, 0, s.w, 0).
+    const simd4f z0w0 = _mm_unpackhi_ps(s, zeros);
+    // Low half of s followed by the low half of z0w0: (s.x, s.y, s.z, 0).
+    return _mm_movelh_ps(s, z0w0);
+}
+
+// Returns (s.x, s.y, 0, 0): the low half of s followed by the low half of zero.
+vectorial_inline simd4f simd4f_zero_zw(simd4f s) {
+    const simd4f zeros = _mm_setzero_ps();
+    return _mm_movelh_ps(s, zeros);
+}
+
+// Returns the high halves of both inputs: (xyzw.z, xyzw.w, abcd.z, abcd.w).
+vectorial_inline simd4f simd4f_merge_high(simd4f xyzw, simd4f abcd) {
+    // _mm_movehl_ps(a, b) yields (b2, b3, a2, a3), hence the swapped arguments.
+    const simd4f merged = _mm_movehl_ps(abcd, xyzw);
+    return merged;
+}
+
+
+typedef simd4f_aligned16 union { // unsigned-int and float views of one four-element block, declared with simd4f_aligned16
+    unsigned int ui[4]; // listed first, so brace initializers fill this member
+    float f[4];         // the same storage as floats; handed to _mm_load_ps by the sign-flip helpers
+} _simd4f_uif;
+
+// Flips the sign bit of the y and w lanes by XOR-ing with 0x80000000 there;
+// x and z are XOR-ed with zero and so stay unchanged.
+vectorial_inline simd4f simd4f_flip_sign_0101(simd4f s) {
+    const _simd4f_uif sign_bits = { { 0x00000000, 0x80000000, 0x00000000, 0x80000000 } };
+    const simd4f sign_mask = _mm_load_ps(sign_bits.f);
+    return _mm_xor_ps(s, sign_mask);
+}
+
+// Flips the sign bit of the x and z lanes by XOR-ing with 0x80000000 there;
+// y and w are XOR-ed with zero and so stay unchanged.
+vectorial_inline simd4f simd4f_flip_sign_1010(simd4f s) {
+    const _simd4f_uif sign_bits = { { 0x80000000, 0x00000000, 0x80000000, 0x00000000 } };
+    const simd4f sign_mask = _mm_load_ps(sign_bits.f);
+    return _mm_xor_ps(s, sign_mask);
+}
+
+// Lane-wise minimum of a and b via _mm_min_ps.
+vectorial_inline simd4f simd4f_min(simd4f a, simd4f b) {
+    const simd4f lower = _mm_min_ps(a, b);
+    return lower;
+}
+
+// Lane-wise maximum of a and b via _mm_max_ps.
+vectorial_inline simd4f simd4f_max(simd4f a, simd4f b) {
+    const simd4f upper = _mm_max_ps(a, b);
+    return upper;
+}
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/simd4x4f.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f.h
@ -0,0 +1,412 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Copyright (c) 2014 Google, Inc.
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4X4F_H
+#define VECTORIAL_SIMD4X4F_H
+
+
+#include "simd4f.h"
+
+#include <math.h>
+
+/*
+  Note, x,y,z,w are conceptually columns with matrix math.
+*/
+
+typedef struct { // 4x4 float matrix stored as four simd4f values
+    simd4f x,y,z,w; // conceptually the four matrix columns, in this order
+} simd4x4f;
+
+
+
+// Builds a matrix whose x, y, z, w members are the four given vectors.
+vectorial_inline simd4x4f simd4x4f_create(simd4f x, simd4f y, simd4f z, SIMD_PARAM(simd4f, w)) {
+    simd4x4f m;
+    m.x = x;
+    m.y = y;
+    m.z = z;
+    m.w = w;
+    return m;
+}
+
+
+// Overwrites *m with the identity matrix.
+vectorial_inline void simd4x4f_identity(simd4x4f* m) {
+    const simd4f col_x = simd4f_create(1.0f, 0.0f, 0.0f, 0.0f);
+    const simd4f col_y = simd4f_create(0.0f, 1.0f, 0.0f, 0.0f);
+    const simd4f col_z = simd4f_create(0.0f, 0.0f, 1.0f, 0.0f);
+    const simd4f col_w = simd4f_create(0.0f, 0.0f, 0.0f, 1.0f);
+    *m = simd4x4f_create(col_x, col_y, col_z, col_w);
+}
+
+
+
+// Loads 16 consecutive floats from f into *m with unaligned loads: f[0..3]
+// become m->x, f[4..7] m->y, f[8..11] m->z and f[12..15] m->w.
+vectorial_inline void simd4x4f_uload(simd4x4f* m, const float *f) {
+    const float *src = f;
+
+    m->x = simd4f_uload4(src);
+    src += 4;
+    m->y = simd4f_uload4(src);
+    src += 4;
+    m->z = simd4f_uload4(src);
+    src += 4;
+    m->w = simd4f_uload4(src);
+}
+
+
+
+
+
+#ifdef VECTORIAL_SCALAR
+    #include "simd4x4f_scalar.h"
+#elif defined(VECTORIAL_SSE)
+    #include "simd4x4f_sse.h"
+#elif defined(VECTORIAL_GNU)
+    #include "simd4x4f_gnu.h"
+#elif defined(VECTORIAL_NEON)
+    #include "simd4x4f_neon.h"
+#else
+    #error No implementation defined
+#endif
+
+// Writes the lane-wise sum of the four members, ((x + y) + z) + w, to *out.
+vectorial_inline void simd4x4f_sum(const simd4x4f* a, simd4f* out) {
+    const simd4f xy = simd4f_add(a->x, a->y);
+    const simd4f xyz = simd4f_add(xy, a->z);
+    *out = simd4f_add(xyz, a->w);
+}
+
+// Multiplies matrix *a by the four-component vector *b:
+//   *out = a->x * b.x + (a->y * b.y + (a->z * b.z + a->w * b.w))
+// Every input is read before *out is written.
+vectorial_inline void simd4x4f_matrix_vector_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
+
+    const simd4f vec = *b;
+
+    // One scaled column per component of the vector.
+    const simd4f term_x = simd4f_mul(a->x, simd4f_splat_x(vec));
+    const simd4f term_y = simd4f_mul(a->y, simd4f_splat_y(vec));
+    const simd4f term_z = simd4f_mul(a->z, simd4f_splat_z(vec));
+    const simd4f term_w = simd4f_mul(a->w, simd4f_splat_w(vec));
+
+    // Separate mul/add is used on purpose: a nested simd4f_madd formulation
+    // performed worse on neon in a hasty benchmark.
+    // TODO: revisit and conditionalize accordingly
+    const simd4f sum_zw = simd4f_add(term_z, term_w);
+    const simd4f sum_yzw = simd4f_add(term_y, sum_zw);
+    *out = simd4f_add(term_x, sum_yzw);
+}
+
+// Multiplies matrix *a by the x, y, z components of *b, ignoring a->w and b.w:
+//   *out = a->x * b.x + (a->y * b.y + a->z * b.z)
+vectorial_inline void simd4x4f_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
+
+    const simd4f term_x = simd4f_mul(a->x, simd4f_splat_x(*b));
+    const simd4f term_y = simd4f_mul(a->y, simd4f_splat_y(*b));
+    const simd4f term_z = simd4f_mul(a->z, simd4f_splat_z(*b));
+
+    // Plain mul/add rather than simd4f_madd, matching simd4x4f_matrix_vector_mul.
+    const simd4f sum_yz = simd4f_add(term_y, term_z);
+    *out = simd4f_add(term_x, sum_yz);
+}
+
+// Multiplies matrix *a by the x, y, z components of *b and adds a->w unscaled:
+//   *out = a->x * b.x + (a->y * b.y + (a->z * b.z + a->w))
+vectorial_inline void simd4x4f_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
+
+    const simd4f term_x = simd4f_mul(a->x, simd4f_splat_x(*b));
+    const simd4f term_y = simd4f_mul(a->y, simd4f_splat_y(*b));
+    const simd4f term_z = simd4f_mul(a->z, simd4f_splat_z(*b));
+
+    // Plain mul/add rather than simd4f_madd, matching simd4x4f_matrix_vector_mul.
+    const simd4f sum_zw = simd4f_add(term_z, a->w);
+    const simd4f sum_yzw = simd4f_add(term_y, sum_zw);
+    *out = simd4f_add(term_x, sum_yzw);
+}
+
+// Subtracts a->w from *b, then multiplies the difference by the transpose of
+// *a (taken with its w member zeroed first) via simd4x4f_matrix_point3_mul.
+vectorial_inline void simd4x4f_inv_ortho_matrix_point3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
+    const simd4f relative = simd4f_sub(*b, a->w);
+
+    simd4x4f transposed = *a;
+    transposed.w = simd4f_create(0,0,0,0);
+    simd4x4f_transpose_inplace(&transposed);
+
+    simd4x4f_matrix_point3_mul(&transposed, &relative, out);
+}
+
+// Multiplies *b by the transpose of *a (taken with its w member zeroed first)
+// via simd4x4f_matrix_vector3_mul; a->w does not affect the result.
+vectorial_inline void simd4x4f_inv_ortho_matrix_vector3_mul(const simd4x4f* a, const simd4f * b, simd4f* out) {
+    const simd4f direction = *b;
+
+    simd4x4f transposed = *a;
+    transposed.w = simd4f_create(0,0,0,0);
+    simd4x4f_transpose_inplace(&transposed);
+
+    simd4x4f_matrix_vector3_mul(&transposed, &direction, out);
+}
+
+
+// Computes the matrix product a * b into *out: each member (column) of b is
+// transformed by a with simd4x4f_matrix_vector_mul.
+// The four result columns are built in locals and stored only after all of
+// them are computed, so out may safely point at the same matrix as a or b.
+// (Writing out->x first would corrupt the remaining columns when out == a.)
+vectorial_inline void simd4x4f_matrix_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
+
+    simd4f col_x, col_y, col_z, col_w;
+
+    simd4x4f_matrix_vector_mul(a, &b->x, &col_x);
+    simd4x4f_matrix_vector_mul(a, &b->y, &col_y);
+    simd4x4f_matrix_vector_mul(a, &b->z, &col_z);
+    simd4x4f_matrix_vector_mul(a, &b->w, &col_w);
+
+    out->x = col_x;
+    out->y = col_y;
+    out->z = col_z;
+    out->w = col_w;
+
+}
+
+
+
+
+// Fills *m with a perspective projection built from a vertical field of view
+// in radians, an aspect ratio and the near/far distances.
+// No argument validation is done; zfar == znear or aspect == 0 divides by zero.
+vectorial_inline void simd4x4f_perspective(simd4x4f *m, float fovy_radians, float aspect, float znear, float zfar) {
+
+    const float depth = zfar - znear;
+    // tan(pi/2 - t) is the cotangent of t.
+    const float cot_half_fovy = tanf( VECTORIAL_HALFPI - fovy_radians * 0.5f );
+
+    const float x_scale = cot_half_fovy / aspect;
+    const float y_scale = cot_half_fovy;
+    const float z_scale = -(zfar + znear) / depth;
+    const float z_offset = -2 * znear * zfar / depth;
+
+    m->x = simd4f_create( x_scale, 0, 0,  0);
+    m->y = simd4f_create( 0, y_scale, 0,  0);
+    m->z = simd4f_create( 0, 0, z_scale, -1);
+    m->w = simd4f_create( 0, 0, z_offset,  0);
+
+}
+
+// Fills *m with an orthographic projection for the box described by
+// left/right, bottom/top and znear/zfar.
+// No argument validation is done; a zero-sized extent divides by zero.
+vectorial_inline void simd4x4f_ortho(simd4x4f *m, float left, float right, float bottom, float top, float znear, float zfar) {
+
+    const float width = right - left;
+    const float height = top - bottom;
+    const float depth = zfar - znear;
+
+    const float x_scale = 2.0f / width;
+    const float x_offset = -(right + left) / width;
+    const float y_scale = 2.0f / height;
+    const float y_offset = -(top + bottom) / height;
+    const float z_scale =  -2.0f / depth;
+    const float z_offset = -(zfar + znear) / depth;
+
+    m->x = simd4f_create( x_scale, 0, 0, 0);
+    m->y = simd4f_create( 0, y_scale, 0, 0);
+    m->z = simd4f_create( 0, 0, z_scale, 0);
+    m->w = simd4f_create( x_offset, y_offset, z_offset, 1);
+
+}
+
+
+// Fills *m with a view matrix for a camera at `eye` looking towards `center`,
+// with `up` as the approximate up direction.
+vectorial_inline void simd4x4f_lookat(simd4x4f *m, simd4f eye, simd4f center, simd4f up) {
+
+    // Basis: forward = normalized (center - eye), side = forward x up,
+    // camera_up = side x forward.
+    const simd4f forward = simd4f_normalize3( simd4f_sub(center, eye) );
+    const simd4f side = simd4f_normalize3( simd4f_cross3( forward, up ) );
+    const simd4f camera_up = simd4f_cross3(side, forward);
+
+    // The third axis stored in the matrix is the negated forward direction.
+    const simd4f backward = simd4f_sub( simd4f_zero(), forward);
+
+    const float tx = -simd4f_dot3_scalar(side, eye);
+    const float ty = -simd4f_dot3_scalar(camera_up, eye);
+    const float tz = -simd4f_dot3_scalar(backward, eye);
+
+    m->x = side;
+    m->y = camera_up;
+    m->z = backward;
+
+    // Transpose with a placeholder w member, then overwrite w with the
+    // translation (this also discards whatever the axes carried in their w lanes).
+    m->w = simd4f_create( 0,0,0, 1);
+    simd4x4f_transpose_inplace(m);
+    m->w = simd4f_create( tx,ty,tz,1);
+
+}
+
+
+// Overwrites *m with a translation matrix: identity in the x, y, z members and
+// (x, y, z, 1) in the w member.
+vectorial_inline void simd4x4f_translation(simd4x4f* m, float x, float y, float z) {
+    const simd4f col_x = simd4f_create(1.0f, 0.0f, 0.0f, 0.0f);
+    const simd4f col_y = simd4f_create(0.0f, 1.0f, 0.0f, 0.0f);
+    const simd4f col_z = simd4f_create(0.0f, 0.0f, 1.0f, 0.0f);
+    const simd4f col_w = simd4f_create(   x,    y,    z, 1.0f);
+    *m = simd4x4f_create(col_x, col_y, col_z, col_w);
+}
+
+
+vectorial_inline void simd4x4f_axis_rotation(simd4x4f* m, float radians, simd4f axis) {
+    // Fills *m with a rotation matrix about `axis`; note the angle's sign is flipped first.
+    radians = -radians;
+    // The axis is normalized here; only its x, y, z lanes are read below.
+    axis = simd4f_normalize3(axis);
+
+    const float sine = sinf(radians);
+    const float cosine = cosf(radians);
+
+    const float x = simd4f_get_x(axis);
+    const float y = simd4f_get_y(axis);
+    const float z = simd4f_get_z(axis);
+    // Products of axis-component pairs, each scaled by (1 - cosine).
+    const float ab = x * y * (1 - cosine);
+    const float bc = y * z * (1 - cosine);
+    const float ca = z * x * (1 - cosine);
+    // Squared axis components.
+    const float tx = x * x;
+    const float ty = y * y;
+    const float tz = z * z;
+    // i, j, k become the x, y, z members of *m; each has a w lane of 0.
+    const simd4f i = simd4f_create( tx + cosine * (1 - tx), ab - z * sine,          ca + y * sine,          0);
+    const simd4f j = simd4f_create( ab + z * sine,          ty + cosine * (1 - ty), bc - x * sine,          0);
+    const simd4f k = simd4f_create( ca - y * sine,          bc + x * sine,          tz + cosine * (1 - tz), 0);
+    // The fourth member is (0, 0, 0, 1).
+    *m = simd4x4f_create( i,j,k, simd4f_create(0, 0, 0, 1) );
+        
+}
+
+
+
+// Element-wise sum: every member of *out is the matching member of a plus
+// that of b.
+vectorial_inline void simd4x4f_add(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
+    const simd4f sum_x = simd4f_add(a->x, b->x);
+    const simd4f sum_y = simd4f_add(a->y, b->y);
+    const simd4f sum_z = simd4f_add(a->z, b->z);
+    const simd4f sum_w = simd4f_add(a->w, b->w);
+    *out = simd4x4f_create(sum_x, sum_y, sum_z, sum_w);
+}
+
+// Element-wise difference: every member of *out is the matching member of a
+// minus that of b.
+vectorial_inline void simd4x4f_sub(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
+    const simd4f diff_x = simd4f_sub(a->x, b->x);
+    const simd4f diff_y = simd4f_sub(a->y, b->y);
+    const simd4f diff_z = simd4f_sub(a->z, b->z);
+    const simd4f diff_w = simd4f_sub(a->w, b->w);
+    *out = simd4x4f_create(diff_x, diff_y, diff_z, diff_w);
+}
+
+// Element-wise product of a and b. This is not the matrix product; that one
+// is simd4x4f_matrix_mul.
+vectorial_inline void simd4x4f_mul(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
+    const simd4f prod_x = simd4f_mul(a->x, b->x);
+    const simd4f prod_y = simd4f_mul(a->y, b->y);
+    const simd4f prod_z = simd4f_mul(a->z, b->z);
+    const simd4f prod_w = simd4f_mul(a->w, b->w);
+    *out = simd4x4f_create(prod_x, prod_y, prod_z, prod_w);
+}
+
+// Element-wise quotient: every member of *out is the matching member of a
+// divided by that of b. Zero divisors are not checked.
+// a and b are only read, so they are taken as pointers to const, matching
+// simd4x4f_add / simd4x4f_sub / simd4x4f_mul; callers holding non-const
+// pointers are unaffected.
+vectorial_inline void simd4x4f_div(const simd4x4f* a, const simd4x4f* b, simd4x4f* out) {
+    
+    out->x = simd4f_div(a->x, b->x);
+    out->y = simd4f_div(a->y, b->y);
+    out->z = simd4f_div(a->z, b->z);
+    out->w = simd4f_div(a->w, b->w);
+    
+}
+
+vectorial_inline simd4f simd4x4f_inverse(const simd4x4f* a, simd4x4f* out) {
+    // Writes the inverse of *a (per the function name) to *out and returns `det`; *a is fully read before *out is written.
+    const simd4f c0 = a->x;
+    const simd4f c1 = a->y;
+    const simd4f c2 = a->z;
+    const simd4f c3 = a->w;
+    // Rotated copies of each member; the suffix names the lane order produced by the shuffle.
+    const simd4f c0_wxyz = simd4f_shuffle_wxyz(c0);
+    const simd4f c0_zwxy = simd4f_shuffle_zwxy(c0);
+    const simd4f c0_yzwx = simd4f_shuffle_yzwx(c0);
+
+    const simd4f c1_wxyz = simd4f_shuffle_wxyz(c1);
+    const simd4f c1_zwxy = simd4f_shuffle_zwxy(c1);
+    const simd4f c1_yzwx = simd4f_shuffle_yzwx(c1);
+
+    const simd4f c2_wxyz = simd4f_shuffle_wxyz(c2);
+    const simd4f c2_zwxy = simd4f_shuffle_zwxy(c2);
+    const simd4f c2_yzwx = simd4f_shuffle_yzwx(c2);
+
+    const simd4f c3_wxyz = simd4f_shuffle_wxyz(c3);
+    const simd4f c3_zwxy = simd4f_shuffle_zwxy(c3);
+    const simd4f c3_yzwx = simd4f_shuffle_yzwx(c3);
+    // Lane-wise products of rotated c0 with c1 and with two rotations of c1.
+    const simd4f c0_wxyz_x_c1 = simd4f_mul(c0_wxyz, c1);
+    const simd4f c0_wxyz_x_c1_yzwx = simd4f_mul(c0_wxyz, c1_yzwx);
+    const simd4f c0_wxyz_x_c1_zwxy = simd4f_mul(c0_wxyz, c1_zwxy);
+    // The same three products for c2 and c3.
+    const simd4f c2_wxyz_x_c3 = simd4f_mul(c2_wxyz, c3);
+    const simd4f c2_wxyz_x_c3_yzwx = simd4f_mul(c2_wxyz, c3_yzwx);
+    const simd4f c2_wxyz_x_c3_zwxy = simd4f_mul(c2_wxyz, c3_zwxy);
+    // ar1..ar3: differences of the c2/c3 products.
+    const simd4f ar1 = simd4f_sub( simd4f_shuffle_wxyz(c2_wxyz_x_c3_zwxy), simd4f_shuffle_zwxy(c2_wxyz_x_c3) );
+    const simd4f ar2 = simd4f_sub( simd4f_shuffle_zwxy(c2_wxyz_x_c3_yzwx), c2_wxyz_x_c3_yzwx );
+    const simd4f ar3 = simd4f_sub( c2_wxyz_x_c3_zwxy, simd4f_shuffle_wxyz(c2_wxyz_x_c3) );
+    // br1..br3: the same differences built from the c0/c1 products.
+    const simd4f br1 = simd4f_sub( simd4f_shuffle_wxyz(c0_wxyz_x_c1_zwxy), simd4f_shuffle_zwxy(c0_wxyz_x_c1) );
+    const simd4f br2 = simd4f_sub( simd4f_shuffle_zwxy(c0_wxyz_x_c1_yzwx), c0_wxyz_x_c1_yzwx );
+    const simd4f br3 = simd4f_sub( c0_wxyz_x_c1_zwxy, simd4f_shuffle_wxyz(c0_wxyz_x_c1) );
+
+    // Per-member sums: c0/c1 rotations are combined with the ar terms, c2/c3 rotations with the br terms.
+    const simd4f c0_sum = simd4f_madd(c0_yzwx, ar3,
+                            simd4f_madd(c0_zwxy, ar2,
+                              simd4f_mul(c0_wxyz, ar1)));
+
+    const simd4f c1_sum = simd4f_madd(c1_wxyz,  ar1, 
+                            simd4f_madd(c1_zwxy,  ar2, 
+                              simd4f_mul(c1_yzwx, ar3)));
+
+    const simd4f c2_sum = simd4f_madd(c2_yzwx, br3,
+                            simd4f_madd(c2_zwxy, br2,
+                              simd4f_mul(c2_wxyz, br1)));
+
+    const simd4f c3_sum = simd4f_madd(c3_yzwx, br3,
+                            simd4f_madd(c3_zwxy, br2,
+                              simd4f_mul(c3_wxyz, br1)));
+
+    // det lane x = (d0.x + d0.z) - (d0.y + d0.w); only lane x is used for the scaling below.
+    const simd4f d0 = simd4f_mul(c1_sum, c0);
+    const simd4f d1 = simd4f_add(d0, simd4f_merge_high(d0, d0));
+    const simd4f det = simd4f_sub(d1, simd4f_splat_y(d1));
+    // No zero check: a det lane x of 0 goes straight into the division.
+    const simd4f invdet = simd4f_splat_x( simd4f_div(simd4f_splat(1.0f), det) );
+    // Alternate the signs of the sums and scale them by 1/det.x.
+    const simd4f o0 = simd4f_mul( simd4f_flip_sign_0101(c1_sum), invdet );
+    const simd4f o1 = simd4f_mul( simd4f_flip_sign_1010(c0_sum), invdet );
+    const simd4f o2 = simd4f_mul( simd4f_flip_sign_0101(c3_sum), invdet );
+    const simd4f o3 = simd4f_mul( simd4f_flip_sign_1010(c2_sum), invdet );
+
+    const simd4x4f mt = simd4x4f_create(o0, o1, o2, o3);
+    // The result stored in *out is the transpose of mt.
+    simd4x4f_transpose( &mt, out);
+
+    return det;
+}
+
+#ifdef __cplusplus
+
+    #ifdef VECTORIAL_OSTREAM
+        #include <ostream>
+
+        // Streams the matrix as "simd4x4f(simd4f(x, y, z, w),\n ... )", one
+        // member per line, in x, y, z, w order.
+        vectorial_inline std::ostream& operator<<(std::ostream& os, const simd4x4f& v) {
+            os << "simd4x4f(simd4f(" << simd4f_get_x(v.x) << ", "
+               << simd4f_get_y(v.x) << ", "
+               << simd4f_get_z(v.x) << ", "
+               << simd4f_get_w(v.x) << "),\n";
+
+            os << "         simd4f(" << simd4f_get_x(v.y) << ", "
+               << simd4f_get_y(v.y) << ", "
+               << simd4f_get_z(v.y) << ", "
+               << simd4f_get_w(v.y) << "),\n";
+
+            os << "         simd4f(" << simd4f_get_x(v.z) << ", "
+               << simd4f_get_y(v.z) << ", "
+               << simd4f_get_z(v.z) << ", "
+               << simd4f_get_w(v.z) << "),\n";
+
+            os << "         simd4f(" << simd4f_get_x(v.w) << ", "
+               << simd4f_get_y(v.w) << ", "
+               << simd4f_get_z(v.w) << ", "
+               << simd4f_get_w(v.w) << "))";
+
+            return os;
+        }
+    #endif
+
+#endif
+
+
+
+
+
+#endif 
--- a/3rdparty/vectorial/include/vectorial/simd4x4f_gnu.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f_gnu.h
@ -0,0 +1,36 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4X4F_GNU_H
+#define VECTORIAL_SIMD4X4F_GNU_H
+
+
+
+// Transposes *s in place: lane i of every member is gathered into member i.
+vectorial_inline void simd4x4f_transpose_inplace(simd4x4f* s) {
+    // Snapshot each member through the union so its lanes can be indexed.
+    const _simd4f_union c0 = { s->x };
+    const _simd4f_union c1 = { s->y };
+    const _simd4f_union c2 = { s->z };
+    const _simd4f_union c3 = { s->w };
+
+    const simd4f r0 = { c0.f[0], c1.f[0], c2.f[0], c3.f[0] };
+    const simd4f r1 = { c0.f[1], c1.f[1], c2.f[1], c3.f[1] };
+    const simd4f r2 = { c0.f[2], c1.f[2], c2.f[2], c3.f[2] };
+    const simd4f r3 = { c0.f[3], c1.f[3], c2.f[3], c3.f[3] };
+
+    s->x = r0;
+    s->y = r1;
+    s->z = r2;
+    s->w = r3;
+}
+
+// Writes the transpose of *s to *out; *s itself is left unmodified unless it
+// is the same object as *out.
+vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
+    simd4x4f transposed = *s;
+    simd4x4f_transpose_inplace(&transposed);
+    *out = transposed;
+}
+
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/simd4x4f_neon.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f_neon.h
@ -0,0 +1,35 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4X4F_NEON_H
+#define VECTORIAL_SIMD4X4F_NEON_H
+
+
+// Transposes *s in place: lane i of every member is gathered into member i.
+vectorial_inline void simd4x4f_transpose_inplace(simd4x4f* s) {
+    // Snapshot each member through the union so its lanes can be indexed.
+    const _simd4f_union c0 = { s->x };
+    const _simd4f_union c1 = { s->y };
+    const _simd4f_union c2 = { s->z };
+    const _simd4f_union c3 = { s->w };
+
+    const simd4f r0 = simd4f_create( c0.f[0], c1.f[0], c2.f[0], c3.f[0] );
+    const simd4f r1 = simd4f_create( c0.f[1], c1.f[1], c2.f[1], c3.f[1] );
+    const simd4f r2 = simd4f_create( c0.f[2], c1.f[2], c2.f[2], c3.f[2] );
+    const simd4f r3 = simd4f_create( c0.f[3], c1.f[3], c2.f[3], c3.f[3] );
+
+    s->x = r0;
+    s->y = r1;
+    s->z = r2;
+    s->w = r3;
+}
+
+// Writes the transpose of *s to *out; *s itself is left unmodified unless it
+// is the same object as *out.
+vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
+    simd4x4f transposed = *s;
+    simd4x4f_transpose_inplace(&transposed);
+    *out = transposed;
+}
+
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/simd4x4f_scalar.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f_scalar.h
@ -0,0 +1,41 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4X4F_SCALAR_H
+#define VECTORIAL_SIMD4X4F_SCALAR_H
+
+
+// Transposes *s in place by exchanging element (member i, lane j) with
+// element (member j, lane i).
+vectorial_inline void simd4x4f_transpose_inplace(simd4x4f *s) {
+    // Read from a snapshot so every assignment sees the untransposed values.
+    const simd4x4f src = *s;
+
+    // Diagonal elements map onto themselves.
+    s->x.x = src.x.x;
+    s->y.y = src.y.y;
+    s->z.z = src.z.z;
+    s->w.w = src.w.w;
+
+    // Off-diagonal elements, written as mirrored pairs.
+    s->x.y = src.y.x;
+    s->y.x = src.x.y;
+
+    s->x.z = src.z.x;
+    s->z.x = src.x.z;
+
+    s->x.w = src.w.x;
+    s->w.x = src.x.w;
+
+    s->y.z = src.z.y;
+    s->z.y = src.y.z;
+
+    s->y.w = src.w.y;
+    s->w.y = src.y.w;
+
+    s->z.w = src.w.z;
+    s->w.z = src.z.w;
+}
+
+// Writes the transpose of *s to *out; *s itself is left unmodified unless it
+// is the same object as *out.
+vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
+    simd4x4f transposed = *s;
+    simd4x4f_transpose_inplace(&transposed);
+    *out = transposed;
+}
+
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/simd4x4f_sse.h
+++ b/3rdparty/vectorial/include/vectorial/simd4x4f_sse.h
@ -0,0 +1,23 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_SIMD4X4F_SSE_H
+#define VECTORIAL_SIMD4X4F_SSE_H
+
+
+
+// Transposes *s in place using the _MM_TRANSPOSE4_PS macro.
+vectorial_inline void simd4x4f_transpose_inplace(simd4x4f *s) {
+    simd4f r0 = s->x;
+    simd4f r1 = s->y;
+    simd4f r2 = s->z;
+    simd4f r3 = s->w;
+
+    // The macro rewrites its four arguments with the transposed rows.
+    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
+
+    s->x = r0;
+    s->y = r1;
+    s->z = r2;
+    s->w = r3;
+}
+
+// Writes the transpose of *s to *out; *s itself is left unmodified unless it
+// is the same object as *out.
+vectorial_inline void simd4x4f_transpose(const simd4x4f *s, simd4x4f *out) {
+    simd4x4f transposed = *s;
+    simd4x4f_transpose_inplace(&transposed);
+    *out = transposed;
+}
+
+
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/vec2f.h
+++ b/3rdparty/vectorial/include/vectorial/vec2f.h
@ -0,0 +1,191 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_VEC2F_H
+// Define the guard macro so that a second inclusion of this header is skipped
+// instead of redefining vectorial::vec2f and its operators.
+#define VECTORIAL_VEC2F_H
+
+#ifndef VECTORIAL_SIMD4F_H
+  #include "vectorial/simd4f.h"
+#endif
+
+
+
+namespace vectorial {
+
+    class vec4f;
+    class vec3f;
+
+    // Two-component float vector stored in a simd4f. Only the x and y lanes
+    // are exposed through accessors, load and store.
+    class vec2f {
+    public:
+
+        simd4f value;
+
+        // The default constructor leaves `value` uninitialized.
+        inline vec2f() {}
+        inline vec2f(const vec2f& v) : value(v.value) {}
+        inline vec2f(const simd4f& v) : value(v) {}
+        // Splats xy into every lane of `value`.
+        explicit inline vec2f(float xy) : value( simd4f_splat(xy) ) {}
+        // Stores (x, y, 0, 0).
+        inline vec2f(float x, float y) : value( simd4f_create(x, y, 0, 0) ) {}
+        // Reads ary[0] and ary[1].
+        explicit inline vec2f(const float *ary) : value( simd4f_uload2(ary) ) {}
+
+        inline float x() const {
+            return simd4f_get_x(value);
+        }
+        inline float y() const {
+            return simd4f_get_y(value);
+        }
+
+        // load reads two floats from ary; store writes two floats to ary.
+        inline void load(const float *ary) {
+            value = simd4f_uload2(ary);
+        }
+        inline void store(float *ary) const {
+            simd4f_ustore2(value, ary);
+        }
+
+        enum { elements = 2 };
+
+        // Common constants.
+        static vec2f zero() {
+            return vec2f(simd4f_zero());
+        }
+        static vec2f one() {
+            return vec2f(1.0f);
+        }
+        static vec2f xAxis() {
+            return vec2f(1.0f, 0.0f);
+        }
+        static vec2f yAxis() {
+            return vec2f(0.0f, 1.0f);
+        }
+
+        // Conversions to other vector widths; declared here, defined elsewhere.
+        inline vec4f xyzw(float z, float w) const;
+        inline vec4f xy00() const;
+        inline vec4f xy01() const;
+        inline vec3f xyz(float z) const;
+        inline vec3f xy0() const;
+        inline vec2f xy() const;
+
+    };
+
+    // Unary minus: subtracts the vector from zero, lane by lane.
+    vectorial_inline vec2f operator-(const vec2f& lhs) {
+        const simd4f negated = simd4f_sub(simd4f_zero(), lhs.value);
+        return vec2f(negated);
+    }
+
+
+    // Lane-wise vector (op) vector arithmetic.
+    vectorial_inline vec2f operator+(const vec2f& lhs, const vec2f& rhs) {
+        const simd4f sum = simd4f_add(lhs.value, rhs.value);
+        return vec2f(sum);
+    }
+
+    vectorial_inline vec2f operator-(const vec2f& lhs, const vec2f& rhs) {
+        const simd4f difference = simd4f_sub(lhs.value, rhs.value);
+        return vec2f(difference);
+    }
+
+    vectorial_inline vec2f operator*(const vec2f& lhs, const vec2f& rhs) {
+        const simd4f product = simd4f_mul(lhs.value, rhs.value);
+        return vec2f(product);
+    }
+
+    vectorial_inline vec2f operator/(const vec2f& lhs, const vec2f& rhs) {
+        const simd4f quotient = simd4f_div(lhs.value, rhs.value);
+        return vec2f(quotient);
+    }
+
+
+    // Compound vector assignment. Each updates lhs and returns a copy of the
+    // updated value (by value, not by reference).
+    vectorial_inline vec2f operator+=(vec2f& lhs, const vec2f& rhs) {
+        lhs = lhs + rhs;
+        return lhs;
+    }
+
+    vectorial_inline vec2f operator-=(vec2f& lhs, const vec2f& rhs) {
+        lhs = lhs - rhs;
+        return lhs;
+    }
+
+    vectorial_inline vec2f operator*=(vec2f& lhs, const vec2f& rhs) {
+        lhs = lhs * rhs;
+        return lhs;
+    }
+
+    vectorial_inline vec2f operator/=(vec2f& lhs, const vec2f& rhs) {
+        lhs = lhs / rhs;
+        return lhs;
+    }
+
+
+
+    // Vector (op) scalar: the scalar is splatted to every lane first.
+    vectorial_inline vec2f operator+(const vec2f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec2f( simd4f_add(lhs.value, scalar) );
+    }
+
+    vectorial_inline vec2f operator-(const vec2f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec2f( simd4f_sub(lhs.value, scalar) );
+    }
+
+    vectorial_inline vec2f operator*(const vec2f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec2f( simd4f_mul(lhs.value, scalar) );
+    }
+
+    vectorial_inline vec2f operator/(const vec2f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec2f( simd4f_div(lhs.value, scalar) );
+    }
+
+    // Scalar (op) vector: the scalar is splatted and used as the left operand.
+    vectorial_inline vec2f operator+(float lhs, const vec2f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec2f( simd4f_add(scalar, rhs.value) );
+    }
+
+    vectorial_inline vec2f operator-(float lhs, const vec2f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec2f( simd4f_sub(scalar, rhs.value) );
+    }
+
+    vectorial_inline vec2f operator*(float lhs, const vec2f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec2f( simd4f_mul(scalar, rhs.value) );
+    }
+
+    vectorial_inline vec2f operator/(float lhs, const vec2f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec2f( simd4f_div(scalar, rhs.value) );
+    }
+
+
+    // Compound scalar assignment. Each updates lhs and returns a copy of the
+    // updated value.
+    vectorial_inline vec2f operator+=(vec2f& lhs, float rhs) {
+        lhs = lhs + rhs;
+        return lhs;
+    }
+
+    vectorial_inline vec2f operator-=(vec2f& lhs, float rhs) {
+        lhs = lhs - rhs;
+        return lhs;
+    }
+
+    vectorial_inline vec2f operator*=(vec2f& lhs, float rhs) {
+        lhs = lhs * rhs;
+        return lhs;
+    }
+
+    vectorial_inline vec2f operator/=(vec2f& lhs, float rhs) {
+        lhs = lhs / rhs;
+        return lhs;
+    }
+
+
+    // Reductions: computed with the two-lane simd4f helpers, result taken
+    // from lane x.
+    vectorial_inline float dot(const vec2f& lhs, const vec2f& rhs) {
+        const simd4f result = simd4f_dot2(lhs.value, rhs.value);
+        return simd4f_get_x(result);
+    }
+
+
+    vectorial_inline float length(const vec2f& v) {
+        const simd4f result = simd4f_length2(v.value);
+        return simd4f_get_x(result);
+    }
+
+    vectorial_inline float length_squared(const vec2f& v) {
+        const simd4f result = simd4f_length2_squared(v.value);
+        return simd4f_get_x(result);
+    }
+
+    vectorial_inline vec2f normalize(const vec2f& v) {
+        const simd4f unit = simd4f_normalize2(v.value);
+        return vec2f(unit);
+    }
+
+    // Lane-wise minimum / maximum.
+    vectorial_inline vec2f min(const vec2f& a, const vec2f& b) {
+        const simd4f lower = simd4f_min(a.value, b.value);
+        return vec2f(lower);
+    }
+
+    vectorial_inline vec2f max(const vec2f& a, const vec2f& b) {
+        const simd4f upper = simd4f_max(a.value, b.value);
+        return vec2f(upper);
+    }
+
+
+}
+
+
+// Lets std::min / std::max be called with vec2f arguments by forwarding to the
+// lane-wise vectorial::min / vectorial::max.
+// NOTE(review): adding function overloads to namespace std is not permitted by
+// the C++ standard; kept as-is because callers may rely on it.
+namespace std {
+    inline ::vectorial::vec2f min(const ::vectorial::vec2f& a, const ::vectorial::vec2f& b) {
+        return ::vectorial::min(a, b);
+    }
+    inline ::vectorial::vec2f max(const ::vectorial::vec2f& a, const ::vectorial::vec2f& b) {
+        return ::vectorial::max(a, b);
+    }
+}
+
+
+#ifdef VECTORIAL_OSTREAM
+#include <ostream>
+
+// Streams the vector as "[ x, y ]".
+vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec2f& v) {
+    os << "[ " << v.x() << ", ";
+    os << v.y() << " ]";
+    return os;
+}
+#endif
+
+
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/vec3f.h
+++ b/3rdparty/vectorial/include/vectorial/vec3f.h
@ -0,0 +1,197 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Copyright (c) 2014 Google, Inc.
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_VEC3F_H
+#define VECTORIAL_VEC3F_H
+
+#ifndef VECTORIAL_SIMD4F_H
+  #include "vectorial/simd4f.h"
+#endif
+
+
+
+namespace vectorial {
+    
+    class vec4f;
+    class vec2f;
+
+    // Three-component float vector stored in a single simd4f.
+    class vec3f {
+    public:
+
+        // Backing SIMD value; x, y and z are read from it through simd4f_get_x/y/z.
+        simd4f value;
+
+        // Default constructor: does not explicitly initialise `value`.
+        vec3f() {}
+        vec3f(const vec3f& v) : value(v.value) {}
+        // Non-explicit, so a raw simd4f converts to vec3f implicitly.
+        vec3f(const simd4f& v) : value(v) {}
+        // Builds the value with simd4f_splat(xyz).
+        explicit vec3f(float xyz) : value( simd4f_splat(xyz) ) {}
+        // Builds the value with simd4f_create(x, y, z, 0).
+        vec3f(float x, float y, float z) : value( simd4f_create(x,y,z,0) ) {}
+        // Reads the value from `ary` with simd4f_uload3.
+        explicit vec3f(const float *ary) : value( simd4f_uload3(ary) ) { }
+
+        // Component accessors.
+        float x() const { return simd4f_get_x(value); }
+        float y() const { return simd4f_get_y(value); }
+        float z() const { return simd4f_get_z(value); }
+
+        // Load from / store to a float array via simd4f_uload3 / simd4f_ustore3.
+        void load(const float *ary) { value = simd4f_uload3(ary); }
+        void store(float *ary) const { simd4f_ustore3(value, ary); }
+
+        enum { elements = 3 };
+
+        // Named constants.
+        static vec3f zero()  { return vec3f(simd4f_zero()); }
+        static vec3f one()   { return vec3f(1.0f); }
+        static vec3f xAxis() { return vec3f(1.0f, 0.0f, 0.0f); }
+        static vec3f yAxis() { return vec3f(0.0f, 1.0f, 0.0f); }
+        static vec3f zAxis() { return vec3f(0.0f, 0.0f, 1.0f); }
+
+        // Conversions to the other vector types; declared here, defined out of class.
+        inline vec4f xyz0() const;
+        inline vec4f xyz1() const;
+        inline vec4f xyzw(float w) const;
+        inline vec3f xyz() const;
+        inline vec3f xy0() const;
+        inline vec2f xy() const;
+    };
+
+    // Unary minus, computed as simd4f_sub(simd4f_zero(), lhs).
+    vectorial_inline vec3f operator-(const vec3f& lhs) {
+        const simd4f zero = simd4f_zero();
+        return vec3f( simd4f_sub(zero, lhs.value) );
+    }
+    
+
+    // Vector + vector via simd4f_add.
+    vectorial_inline vec3f operator+(const vec3f& lhs, const vec3f& rhs) {
+        const simd4f result = simd4f_add(lhs.value, rhs.value);
+        return vec3f(result);
+    }
+
+    // Vector - vector via simd4f_sub.
+    vectorial_inline vec3f operator-(const vec3f& lhs, const vec3f& rhs) {
+        const simd4f result = simd4f_sub(lhs.value, rhs.value);
+        return vec3f(result);
+    }
+
+    // Vector * vector via simd4f_mul.
+    vectorial_inline vec3f operator*(const vec3f& lhs, const vec3f& rhs) {
+        const simd4f result = simd4f_mul(lhs.value, rhs.value);
+        return vec3f(result);
+    }
+
+    // Vector / vector via simd4f_div.
+    vectorial_inline vec3f operator/(const vec3f& lhs, const vec3f& rhs) {
+        const simd4f result = simd4f_div(lhs.value, rhs.value);
+        return vec3f(result);
+    }
+
+
+    // Stores simd4f_add(lhs, rhs) back into lhs; the updated vector is returned by value.
+    vectorial_inline vec3f operator+=(vec3f& lhs, const vec3f& rhs) {
+        lhs.value = simd4f_add(lhs.value, rhs.value);
+        return lhs;
+    }
+
+    // Stores simd4f_sub(lhs, rhs) back into lhs; the updated vector is returned by value.
+    vectorial_inline vec3f operator-=(vec3f& lhs, const vec3f& rhs) {
+        lhs.value = simd4f_sub(lhs.value, rhs.value);
+        return lhs;
+    }
+
+    // Stores simd4f_mul(lhs, rhs) back into lhs; the updated vector is returned by value.
+    vectorial_inline vec3f operator*=(vec3f& lhs, const vec3f& rhs) {
+        lhs.value = simd4f_mul(lhs.value, rhs.value);
+        return lhs;
+    }
+
+    // Stores simd4f_div(lhs, rhs) back into lhs; the updated vector is returned by value.
+    vectorial_inline vec3f operator/=(vec3f& lhs, const vec3f& rhs) {
+        lhs.value = simd4f_div(lhs.value, rhs.value);
+        return lhs;
+    }
+
+
+
+    // Vector + scalar: rhs is expanded with simd4f_splat before simd4f_add.
+    vectorial_inline vec3f operator+(const vec3f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec3f( simd4f_add(lhs.value, scalar) );
+    }
+
+    // Vector - scalar: rhs is expanded with simd4f_splat before simd4f_sub.
+    vectorial_inline vec3f operator-(const vec3f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec3f( simd4f_sub(lhs.value, scalar) );
+    }
+
+    // Vector * scalar: rhs is expanded with simd4f_splat before simd4f_mul.
+    vectorial_inline vec3f operator*(const vec3f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec3f( simd4f_mul(lhs.value, scalar) );
+    }
+
+    // Vector / scalar: rhs is expanded with simd4f_splat before simd4f_div.
+    vectorial_inline vec3f operator/(const vec3f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec3f( simd4f_div(lhs.value, scalar) );
+    }
+
+    // Scalar + vector: lhs is expanded with simd4f_splat and used as the left operand.
+    vectorial_inline vec3f operator+(float lhs, const vec3f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec3f( simd4f_add(scalar, rhs.value) );
+    }
+
+    // Scalar - vector: simd4f_sub(splat(lhs), rhs).
+    vectorial_inline vec3f operator-(float lhs, const vec3f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec3f( simd4f_sub(scalar, rhs.value) );
+    }
+
+    // Scalar * vector: simd4f_mul(splat(lhs), rhs).
+    vectorial_inline vec3f operator*(float lhs, const vec3f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec3f( simd4f_mul(scalar, rhs.value) );
+    }
+
+    // Scalar / vector: simd4f_div(splat(lhs), rhs).
+    vectorial_inline vec3f operator/(float lhs, const vec3f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec3f( simd4f_div(scalar, rhs.value) );
+    }
+
+
+    // In-place scalar add: lhs becomes simd4f_add(lhs, splat(rhs)); returned by value.
+    vectorial_inline vec3f operator+=(vec3f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        lhs.value = simd4f_add(lhs.value, scalar);
+        return lhs;
+    }
+
+    // In-place scalar subtract: lhs becomes simd4f_sub(lhs, splat(rhs)); returned by value.
+    vectorial_inline vec3f operator-=(vec3f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        lhs.value = simd4f_sub(lhs.value, scalar);
+        return lhs;
+    }
+
+    // In-place scalar multiply: lhs becomes simd4f_mul(lhs, splat(rhs)); returned by value.
+    vectorial_inline vec3f operator*=(vec3f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        lhs.value = simd4f_mul(lhs.value, scalar);
+        return lhs;
+    }
+
+    // In-place scalar divide: lhs becomes simd4f_div(lhs, splat(rhs)); returned by value.
+    vectorial_inline vec3f operator/=(vec3f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        lhs.value = simd4f_div(lhs.value, scalar);
+        return lhs;
+    }
+
+
+    // Dot product, delegated to simd4f_dot3_scalar.
+    vectorial_inline float dot(const vec3f& lhs, const vec3f& rhs) {
+        const float product = simd4f_dot3_scalar(lhs.value, rhs.value);
+        return product;
+    }
+
+    // Cross product via simd4f_cross3; the simd4f result is wrapped explicitly as a vec3f.
+    vectorial_inline vec3f cross(const vec3f& lhs, const vec3f& rhs) {
+        const simd4f product = simd4f_cross3(lhs.value, rhs.value);
+        return vec3f(product);
+    }
+    
+    
+    // Length of v: the x lane of simd4f_length3(v.value).
+    vectorial_inline float length(const vec3f& v) {
+        const simd4f len = simd4f_length3(v.value);
+        return simd4f_get_x(len);
+    }
+
+    // Squared length of v: the x lane of simd4f_length3_squared(v.value).
+    vectorial_inline float length_squared(const vec3f& v) {
+        const simd4f lenSq = simd4f_length3_squared(v.value);
+        return simd4f_get_x(lenSq);
+    }
+
+    // Returns simd4f_normalize3(v.value) wrapped as a vec3f; v itself is not modified.
+    vectorial_inline vec3f normalize(const vec3f& v) {
+        const simd4f unit = simd4f_normalize3(v.value);
+        return vec3f(unit);
+    }
+
+    // Returns simd4f_min of the two operands, wrapped as a vec3f.
+    vectorial_inline vec3f min(const vec3f& a, const vec3f& b) {
+        const simd4f smaller = simd4f_min(a.value, b.value);
+        return vec3f(smaller);
+    }
+
+    // Returns simd4f_max of the two operands, wrapped as a vec3f.
+    vectorial_inline vec3f max(const vec3f& a, const vec3f& b) {
+        const simd4f larger = simd4f_max(a.value, b.value);
+        return vec3f(larger);
+    }
+
+}
+
+
+// std::min / std::max overloads for vec3f that forward to vectorial::min / vectorial::max.
+// NOTE(review): the C++ standard does not permit adding function overloads to namespace
+// std; kept because callers may rely on std::min(vec3f, vec3f) resolving here.
+namespace std {
+    inline ::vectorial::vec3f min(const ::vectorial::vec3f& a, const ::vectorial::vec3f& b)
+    {
+        return ::vectorial::min(a,b);
+    }
+    inline ::vectorial::vec3f max(const ::vectorial::vec3f& a, const ::vectorial::vec3f& b)
+    {
+        return ::vectorial::max(a,b);
+    }
+}
+
+
+#ifdef VECTORIAL_OSTREAM
+#include <ostream>
+
+// Streams v as "[ x, y, z ]" and returns the stream for chaining.
+vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec3f& v) {
+    os << "[ ";
+    os << v.x();
+    os << ", ";
+    os << v.y();
+    os << ", ";
+    os << v.z();
+    os << " ]";
+    return os;
+}
+#endif
+
+
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/vec4f.h
+++ b/3rdparty/vectorial/include/vectorial/vec4f.h
@ -0,0 +1,195 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_VEC4F_H
+#define VECTORIAL_VEC4F_H
+
+#ifndef VECTORIAL_SIMD4F_H
+  #include "vectorial/simd4f.h"
+#endif
+
+
+
+namespace vectorial {
+    
+    class vec3f;
+    class vec2f;
+
+    // Four-component float vector stored in a single simd4f.
+    class vec4f {
+    public:
+
+        // Backing SIMD value; x, y, z and w are read from it through simd4f_get_x/y/z/w.
+        simd4f value;
+
+        // Default constructor: does not explicitly initialise `value`.
+        vec4f() {}
+        vec4f(const vec4f& v) : value(v.value) {}
+        // Non-explicit, so a raw simd4f converts to vec4f implicitly.
+        vec4f(const simd4f& v) : value(v) {}
+        // Builds the value with simd4f_splat(xyzw).
+        explicit vec4f(float xyzw) : value( simd4f_splat(xyzw) ) {}
+        // Builds the value with simd4f_create(x, y, z, w).
+        vec4f(float x, float y, float z, float w) : value( simd4f_create(x,y,z,w) ) {}
+        // Reads the value from `ary` with simd4f_uload4.
+        explicit vec4f(const float *ary) : value( simd4f_uload4(ary) ) { }
+
+        // Component accessors.
+        float x() const { return simd4f_get_x(value); }
+        float y() const { return simd4f_get_y(value); }
+        float z() const { return simd4f_get_z(value); }
+        float w() const { return simd4f_get_w(value); }
+
+        // Load from / store to a float array via simd4f_uload4 / simd4f_ustore4.
+        void load(const float *ary) { value = simd4f_uload4(ary); }
+        void store(float *ary) const { simd4f_ustore4(value, ary); }
+
+        enum { elements = 4 };
+
+
+        // Named constants.
+        static vec4f zero()  { return vec4f(simd4f_zero()); }
+        static vec4f one()   { return vec4f(1.0f); }
+        static vec4f xAxis() { return vec4f(1.0f, 0.0f, 0.0f, 0.0f); }
+        static vec4f yAxis() { return vec4f(0.0f, 1.0f, 0.0f, 0.0f); }
+        static vec4f zAxis() { return vec4f(0.0f, 0.0f, 1.0f, 0.0f); }
+        static vec4f wAxis() { return vec4f(0.0f, 0.0f, 0.0f, 1.0f); }
+
+
+        // Conversions to the smaller vector types; declared here, defined out of class.
+        inline vec3f xyz() const;
+        inline vec2f xy() const;
+
+    };
+
+
+    // Unary minus, computed as simd4f_sub(simd4f_zero(), lhs).
+    vectorial_inline vec4f operator-(const vec4f& lhs) {
+        const simd4f zero = simd4f_zero();
+        return vec4f( simd4f_sub(zero, lhs.value) );
+    }
+
+
+    // Vector + vector via simd4f_add.
+    vectorial_inline vec4f operator+(const vec4f& lhs, const vec4f& rhs) {
+        const simd4f result = simd4f_add(lhs.value, rhs.value);
+        return vec4f(result);
+    }
+
+    // Vector - vector via simd4f_sub.
+    vectorial_inline vec4f operator-(const vec4f& lhs, const vec4f& rhs) {
+        const simd4f result = simd4f_sub(lhs.value, rhs.value);
+        return vec4f(result);
+    }
+
+    // Vector * vector via simd4f_mul.
+    vectorial_inline vec4f operator*(const vec4f& lhs, const vec4f& rhs) {
+        const simd4f result = simd4f_mul(lhs.value, rhs.value);
+        return vec4f(result);
+    }
+
+    // Vector / vector via simd4f_div.
+    vectorial_inline vec4f operator/(const vec4f& lhs, const vec4f& rhs) {
+        const simd4f result = simd4f_div(lhs.value, rhs.value);
+        return vec4f(result);
+    }
+
+
+    // Stores simd4f_add(lhs, rhs) back into lhs; the updated vector is returned by value.
+    vectorial_inline vec4f operator+=(vec4f& lhs, const vec4f& rhs) {
+        lhs.value = simd4f_add(lhs.value, rhs.value);
+        return lhs;
+    }
+
+    // Stores simd4f_sub(lhs, rhs) back into lhs; the updated vector is returned by value.
+    vectorial_inline vec4f operator-=(vec4f& lhs, const vec4f& rhs) {
+        lhs.value = simd4f_sub(lhs.value, rhs.value);
+        return lhs;
+    }
+
+    // Stores simd4f_mul(lhs, rhs) back into lhs; the updated vector is returned by value.
+    vectorial_inline vec4f operator*=(vec4f& lhs, const vec4f& rhs) {
+        lhs.value = simd4f_mul(lhs.value, rhs.value);
+        return lhs;
+    }
+
+    // Stores simd4f_div(lhs, rhs) back into lhs; the updated vector is returned by value.
+    vectorial_inline vec4f operator/=(vec4f& lhs, const vec4f& rhs) {
+        lhs.value = simd4f_div(lhs.value, rhs.value);
+        return lhs;
+    }
+
+
+
+    // Vector + scalar: rhs is expanded with simd4f_splat before simd4f_add.
+    vectorial_inline vec4f operator+(const vec4f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec4f( simd4f_add(lhs.value, scalar) );
+    }
+
+    // Vector - scalar: rhs is expanded with simd4f_splat before simd4f_sub.
+    vectorial_inline vec4f operator-(const vec4f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec4f( simd4f_sub(lhs.value, scalar) );
+    }
+
+    // Vector * scalar: rhs is expanded with simd4f_splat before simd4f_mul.
+    vectorial_inline vec4f operator*(const vec4f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec4f( simd4f_mul(lhs.value, scalar) );
+    }
+
+    // Vector / scalar: rhs is expanded with simd4f_splat before simd4f_div.
+    vectorial_inline vec4f operator/(const vec4f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        return vec4f( simd4f_div(lhs.value, scalar) );
+    }
+
+    // Scalar + vector: lhs is expanded with simd4f_splat and used as the left operand.
+    vectorial_inline vec4f operator+(float lhs, const vec4f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec4f( simd4f_add(scalar, rhs.value) );
+    }
+
+    // Scalar - vector: simd4f_sub(splat(lhs), rhs).
+    vectorial_inline vec4f operator-(float lhs, const vec4f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec4f( simd4f_sub(scalar, rhs.value) );
+    }
+
+    // Scalar * vector: simd4f_mul(splat(lhs), rhs).
+    vectorial_inline vec4f operator*(float lhs, const vec4f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec4f( simd4f_mul(scalar, rhs.value) );
+    }
+
+    // Scalar / vector: simd4f_div(splat(lhs), rhs).
+    vectorial_inline vec4f operator/(float lhs, const vec4f& rhs) {
+        const simd4f scalar = simd4f_splat(lhs);
+        return vec4f( simd4f_div(scalar, rhs.value) );
+    }
+
+
+    // In-place scalar add: lhs becomes simd4f_add(lhs, splat(rhs)); returned by value.
+    vectorial_inline vec4f operator+=(vec4f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        lhs.value = simd4f_add(lhs.value, scalar);
+        return lhs;
+    }
+
+    // In-place scalar subtract: lhs becomes simd4f_sub(lhs, splat(rhs)); returned by value.
+    vectorial_inline vec4f operator-=(vec4f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        lhs.value = simd4f_sub(lhs.value, scalar);
+        return lhs;
+    }
+
+    // In-place scalar multiply: lhs becomes simd4f_mul(lhs, splat(rhs)); returned by value.
+    vectorial_inline vec4f operator*=(vec4f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        lhs.value = simd4f_mul(lhs.value, scalar);
+        return lhs;
+    }
+
+    // In-place scalar divide: lhs becomes simd4f_div(lhs, splat(rhs)); returned by value.
+    vectorial_inline vec4f operator/=(vec4f& lhs, float rhs) {
+        const simd4f scalar = simd4f_splat(rhs);
+        lhs.value = simd4f_div(lhs.value, scalar);
+        return lhs;
+    }
+
+
+    // Dot product: the x lane of simd4f_dot4 applied to both operands.
+    vectorial_inline float dot(const vec4f& lhs, const vec4f& rhs) {
+        const simd4f product = simd4f_dot4(lhs.value, rhs.value);
+        return simd4f_get_x(product);
+    }
+    
+    
+    // Length of v: the x lane of simd4f_length4(v.value).
+    vectorial_inline float length(const vec4f& v) {
+        const simd4f len = simd4f_length4(v.value);
+        return simd4f_get_x(len);
+    }
+
+    // Squared length of v: the x lane of simd4f_length4_squared(v.value).
+    vectorial_inline float length_squared(const vec4f& v) {
+        const simd4f lenSq = simd4f_length4_squared(v.value);
+        return simd4f_get_x(lenSq);
+    }
+
+    // Returns simd4f_normalize4(v.value) wrapped as a vec4f; v itself is not modified.
+    vectorial_inline vec4f normalize(const vec4f& v) {
+        const simd4f unit = simd4f_normalize4(v.value);
+        return vec4f(unit);
+    }
+
+    // Returns simd4f_min of the two operands, wrapped as a vec4f.
+    vectorial_inline vec4f min(const vec4f& a, const vec4f& b) {
+        const simd4f smaller = simd4f_min(a.value, b.value);
+        return vec4f(smaller);
+    }
+
+    // Returns simd4f_max of the two operands, wrapped as a vec4f.
+    vectorial_inline vec4f max(const vec4f& a, const vec4f& b) {
+        const simd4f larger = simd4f_max(a.value, b.value);
+        return vec4f(larger);
+    }
+
+
+}
+
+
+// std::min / std::max overloads for vec4f that forward to vectorial::min / vectorial::max.
+// NOTE(review): the C++ standard does not permit adding function overloads to namespace
+// std; kept because callers may rely on std::min(vec4f, vec4f) resolving here.
+namespace std {
+    inline ::vectorial::vec4f min(const ::vectorial::vec4f& a, const ::vectorial::vec4f& b)
+    {
+        return ::vectorial::min(a,b);
+    }
+    inline ::vectorial::vec4f max(const ::vectorial::vec4f& a, const ::vectorial::vec4f& b)
+    {
+        return ::vectorial::max(a,b);
+    }
+}
+
+
+#ifdef VECTORIAL_OSTREAM
+#include <ostream>
+
+// Streams v as "[ x, y, z, w ]" and returns the stream for chaining.
+vectorial_inline std::ostream& operator<<(std::ostream& os, const vectorial::vec4f& v) {
+    os << "[ ";
+    os << v.x();
+    os << ", ";
+    os << v.y();
+    os << ", ";
+    os << v.z();
+    os << ", ";
+    os << v.w();
+    os << " ]";
+    return os;
+}
+#endif
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/vec_convert.h
+++ b/3rdparty/vectorial/include/vectorial/vec_convert.h
@ -0,0 +1,31 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_VEC_CONVERT_H
+#define VECTORIAL_VEC_CONVERT_H
+
+
+namespace vectorial {
+    
+    // Out-of-class definitions of the swizzle/conversion members declared on vec4f, vec3f
+    // and vec2f. Each one re-wraps the same simd4f value, optionally passing it through
+    // simd4f_zero_w / simd4f_zero_zw and/or adding a constant vector.
+
+    // vec4f -> smaller vectors: the simd4f is reused unchanged.
+    inline vec3f vec4f::xyz() const { return vec3f(value); }
+    inline vec2f vec4f::xy() const { return vec2f(value); }
+
+    // vec3f -> vec4f: xyz0() goes through simd4f_zero_w; xyz1()/xyzw() then add (0, 0, 0, w).
+    inline vec4f vec3f::xyz0() const { return vec4f(simd4f_zero_w(value)); }
+    inline vec4f vec3f::xyz1() const { return xyz0() + vec4f(0.0f, 0.0f, 0.0f, 1.0f); }
+    inline vec4f vec3f::xyzw(float w) const { return xyz0() + vec4f(0.0f, 0.0f, 0.0f, w); }
+    inline vec3f vec3f::xyz() const { return vec3f(value); }
+    // Uses simd4f_zero_zw, exactly like vec2f::xy0() below, instead of multiplying by
+    // (1, 1, 0): a multiply produces NaN when z is NaN or infinite, and -0 for negative z.
+    // NOTE(review): assumes simd4f_zero_zw keeps x/y and zeroes z/w - confirm in simd4f.h.
+    inline vec3f vec3f::xy0() const { return vec3f(simd4f_zero_zw(value)); }
+    inline vec2f vec3f::xy() const { return vec2f(value); }
+
+    // vec2f -> larger vectors: xy00() goes through simd4f_zero_zw; xy01()/xyzw() then add
+    // (0, 0, z, w).
+    inline vec4f vec2f::xy00() const { return vec4f(simd4f_zero_zw(value)); }
+    inline vec4f vec2f::xy01() const { return xy00() + vec4f(0.0f, 0.0f, 0.0f, 1.0f); }
+    inline vec4f vec2f::xyzw(float z, float w) const { return xy00() + vec4f(0.0f, 0.0f, z, w); }
+    inline vec3f vec2f::xy0() const { return vec3f(simd4f_zero_zw(value)); }
+    inline vec2f vec2f::xy() const { return vec2f(value); }
+
+}
+
+
+#endif
--- a/3rdparty/vectorial/include/vectorial/vectorial.h
+++ b/3rdparty/vectorial/include/vectorial/vectorial.h
@ -0,0 +1,19 @@
+/*
+  Vectorial
+  Copyright (c) 2010 Mikko Lehtonen
+  Licensed under the terms of the two-clause BSD License (see LICENSE)
+*/
+#ifndef VECTORIAL_VECTORIAL_H
+#define VECTORIAL_VECTORIAL_H
+
+
+#include "vectorial/vec2f.h"
+#include "vectorial/vec3f.h"
+#include "vectorial/vec4f.h"
+
+#include "vectorial/vec_convert.h"
+
+#include "vectorial/mat4f.h"
+
+
+#endif
--- a/3rdparty/vectorial/spec/spec.cpp
+++ b/3rdparty/vectorial/spec/spec.cpp
@ -0,0 +1,229 @@
+/* Specific - Minimal C++ spec framework.
+ 
+
+The zlib/libpng License
+
+
+Copyright (c) 2008 Mikko Lehtonen
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+
+#include "spec.h"
+
+#include <iostream>
+
+namespace specific {
+
+
+
+    // Default group hook: does nothing. Writers that want group headers override it.
+    void SpecWriter::startGroup(std::string /*group*/, std::string /*description*/)
+    {
+    }
+
+    // Records one failed assertion (message plus source location) in mFailures.
+    void SpecWriter::addFailedAssertation(std::string msg, const char *file, int line) {
+        const SpecFailure failure(msg, file, line);
+        mFailures.push_back(failure);
+    }
+    // Appends the outcome of one example to mResults.
+    void SpecWriter::addSpecResult(SpecResult r)
+    {
+        mResults.push_back(r);
+    }
+    // Called by SpecRunner::run before any spec executes; nothing to do by default.
+    void SpecWriter::start()
+    {
+    }
+    // Prints the final report to std::cout: every recorded failure, numbered from 1,
+    // followed by a summary line with the example and failure counts.
+    void SpecWriter::stop() {
+        std::cout << std::endl;
+        for(size_t idx = 0; idx < mFailures.size(); ++idx)
+        {
+            const SpecFailure& failure = mFailures[idx];
+            std::cout << std::endl;
+            std::cout << (idx+1) << ") Failed assertation at " << failure.file << ":"
+            << failure.line << ":" << std::endl << "  " << failure.msg << std::endl;
+        }
+        std::cout << std::endl << mResults.size() << " examples, " << mFailures.size() << " failures" << std::endl;
+ 
+    }
+ 
+
+
+    // Records the result through the base class, then prints a one-character progress
+    // marker ("." passed, "F" failed, "E" errored) and flushes std::cout.
+    void ProgressWriter::addSpecResult(SpecResult r) {
+        SpecWriter::addSpecResult(r);
+        const char* marker = "";
+        if(r.type == SpecResult::PASSED) {
+            marker = ".";
+        } else if(r.type == SpecResult::FAILED) {
+            marker = "F";
+        } else if(r.type == SpecResult::ERRORED) {
+            marker = "E";
+        }
+        std::cout << marker << std::flush;
+    }
+
+
+
+    // Prints the group header as "<group>: <description>" on its own line.
+    void SpecdocWriter::startGroup(std::string group, std::string description) {
+        std::cout << group;
+        std::cout << ": ";
+        std::cout << description << std::endl;
+    }
+
+
+    // Records the result through the base class, then prints "- <test>" followed by its
+    // status. For failures and errors the status carries the current size of mFailures,
+    // which matches the numbering used by SpecWriter::stop().
+    void SpecdocWriter::addSpecResult(SpecResult r) {
+        SpecWriter::addSpecResult(r);
+        const size_t failureCount = mFailures.size();
+        std::cout << "- " << r.test;
+        if(r.type == SpecResult::PASSED) {
+            std::cout << " [OK]";
+        } else if(r.type == SpecResult::FAILED) {
+            std::cout << " [FAILED - " << failureCount << "]";
+        } else if(r.type == SpecResult::ERRORED) {
+            std::cout << " [ERROR - "<< failureCount <<"]";
+        }
+        std::cout << std::endl;
+    }
+
+
+
+
+    // Empty tag type thrown by SpecBase::should_test to abort the current example;
+    // SpecRunner::run catches it.
+    class spec_failure
+    {
+    };
+
+
+
+    // Puts every member in a known state and registers this spec with the global
+    // SpecRunner. mFile and mLine were previously left uninitialised; they are now
+    // cleared as well (mErrorMessage default-constructs to an empty string).
+    SpecBase::SpecBase() : mWriter(NULL), mName(NULL),
+        mFailed(false), mLastFailed(false), mError(false), mExecutionPoint(0), mContinuePoint(0),
+        mFile(NULL), mLine(0)
+    {
+        SpecRunner::getInstance().add(this);
+    }
+
+
+    // Nothing to release; defined out of line as the virtual destructor of the base class.
+    SpecBase::~SpecBase()
+    {
+    }
+
+
+    // Entry check for one example (used by the `it` macro). Closes the previously open
+    // example, then decides whether the example at this position should run on this
+    // pass: positions up to mContinuePoint were handled on an earlier pass and are
+    // skipped. Returns true when the example body should execute.
+    bool SpecBase::startSpec(const char* name) 
+    {
+        endSpec();
+
+        ++mExecutionPoint;
+        const bool alreadyHandled = (mExecutionPoint <= mContinuePoint);
+        if(alreadyHandled) {
+            return false;
+        }
+        ++mContinuePoint;
+
+        mName = name;
+        return true;
+    }
+
+
+    // Reports the currently open example (if any) to the writer and closes it.
+    // The result type is ERRORED when mError is set, otherwise FAILED when the last
+    // assertion failed, otherwise PASSED.
+    void SpecBase::endSpec() 
+    {
+        if(mName == NULL) return;
+
+        SpecResult result;
+        result.group = getGroup();
+        result.description = getDescription();
+        if(mError) {
+            result.type = SpecResult::ERRORED;
+        } else if(mLastFailed) {
+            result.type = SpecResult::FAILED;
+        } else {
+            result.type = SpecResult::PASSED;
+        }
+        result.test = mName;
+        mWriter->addSpecResult( result );
+        
+        mName = NULL; 
+    }
+
+
+    // Core assertion. When `value` is false the failure is recorded with the writer,
+    // the failure flags are raised and spec_failure is thrown to abort the example.
+    void SpecBase::should_test(bool value, const char* message, const char* file, int line) {
+        mLastFailed = false;
+        if(value) {
+            return;
+        }
+        mWriter->addFailedAssertation(message, file, line);
+        mFailed = true;
+        mLastFailed = true;
+        throw spec_failure();
+    }
+
+    
+    // Records an unexpected exception as a failure located at "exception":0 and marks
+    // the spec as failed and errored.
+    void SpecBase::error(std::string msg) {
+        mWriter->addFailedAssertation(msg, "exception", 0);
+        mError = true;
+        mFailed = true;
+        mLastFailed = true;
+    }
+
+    // Returns true when no further pass over specify() is needed. A pending mError is
+    // consumed (cleared) and reported as "not done".
+    bool SpecBase::done() {
+        const bool hadError = mError;
+        mError = false;
+        return !hadError;
+    }
+
+
+    // Private constructor; instances are obtained through getInstance().
+    SpecRunner::SpecRunner()
+    {
+    }
+    // Registered specs are stored as raw pointers and are not deleted here.
+    SpecRunner::~SpecRunner()
+    {
+    }
+
+    // Returns the process-wide runner, creating it on first use. The instance is
+    // heap-allocated and never deleted.
+    SpecRunner& SpecRunner::getInstance() {
+        static SpecRunner* instance = new SpecRunner;
+        return *instance;
+    }
+
+
+    // Runs every registered spec whose group name contains `subset` (an empty subset
+    // matches all) and reports through `writer`. Returns true when no spec failed.
+    //
+    // specify() is re-entered after each aborted example: a thrown spec_failure or any
+    // other exception sets mError, done() then returns false, and the next pass skips
+    // the examples already handled (see startSpec / mContinuePoint).
+    // NOTE(review): because a spec_failure also sets mError, endSpec() reports a failed
+    // assertion as ERRORED rather than FAILED - confirm this is intended.
+    bool SpecRunner::run(SpecWriter& writer, const std::string subset) {
+        bool allPassed = true;
+        
+        writer.start();
+        for(size_t idx = 0; idx < mSpecs.size(); ++idx) {
+            SpecBase *spec = mSpecs[idx];
+            const bool selected = spec->getGroup().find(subset, 0) != std::string::npos;
+            if( !selected ) continue;
+
+            spec->mContinuePoint = 0;
+            spec->setWriter(&writer);
+            writer.startGroup( spec->getGroup(), spec->getDescription() );
+
+            bool finished = false;
+            while( !finished ) {
+                spec->mExecutionPoint = 0;
+                try {
+                    spec->specify();
+                } catch(spec_failure&) {
+                    spec->mError=true;
+                } catch( std::exception& e) {
+                    spec->error(e.what());
+                } catch( ... ) {
+                    spec->error("unknown exception");
+                }
+                spec->endSpec();
+                finished = spec->done();
+            }
+
+            if( !spec->isSuccessful() ) {
+                allPassed = false;
+            }
+        }
+        writer.stop();
+
+        return allPassed;
+    }
+
+
+}
+
+
+
+
--- a/3rdparty/vectorial/spec/spec.h
+++ b/3rdparty/vectorial/spec/spec.h
@ -0,0 +1,217 @@
+/* Specific - Minimal C++ spec framework.
+ 
+
+The zlib/libpng License
+
+
+Copyright (c) 2008 Mikko Lehtonen
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+
+#ifndef SPECIFIC_SPEC_H
+#define SPECIFIC_SPEC_H
+
+#include <string>
+#include <vector>
+#include <stdexcept>
+#include <sstream>
+
+namespace specific {
+
+
+    // Outcome of a single example: its status plus the group, group description and
+    // example name it belongs to.
+    class SpecResult {
+    public:
+        enum Type {
+            PASSED,
+            FAILED,
+            ERRORED
+        };
+
+        Type type;
+
+        std::string group;
+        std::string description;
+        std::string test;
+    };
+
+
+    // One failed assertion: the message and the source location that reported it.
+    // `file` is stored as the raw pointer passed in (it is not copied).
+    class SpecFailure {
+    public:
+        SpecFailure(std::string amsg, const char* afile, int aline)
+            : msg(amsg)
+            , file(afile)
+            , line(aline)
+        {
+        }
+
+        std::string msg;
+        const char* file;
+        int line;
+    };
+
+
+    // Base reporter. Collects example results and failed assertions, and exposes
+    // virtual hooks that concrete writers override to print progress.
+    class SpecWriter {
+    public:
+        // Every result passed to addSpecResult(), in order.
+        std::vector<SpecResult> mResults;
+        // Every failure passed to addFailedAssertation(), in order.
+        std::vector<SpecFailure> mFailures;
+        SpecWriter() {}
+        virtual ~SpecWriter() {}
+        // Called once per spec group before its examples run.
+        virtual void startGroup(std::string group, std::string description);
+        // Called for each failed assertion or caught exception.
+        virtual void addFailedAssertation(std::string msg, const char *file, int line);
+        // Called once per finished example.
+        virtual void addSpecResult(SpecResult r);
+        // Called by SpecRunner::run before the first spec and after the last one.
+        virtual void start();
+        virtual void stop();
+    };
+
+
+    // Writer that overrides only the per-example hook of SpecWriter.
+    class ProgressWriter : public SpecWriter {
+    public:
+        virtual void addSpecResult(SpecResult r);
+    };
+
+
+
+    // Writer that overrides the group and per-example hooks of SpecWriter.
+    class SpecdocWriter : public SpecWriter {
+    public:
+        virtual void startGroup(std::string group, std::string description);
+        virtual void addSpecResult(SpecResult r);
+    };
+
+
+
+    // Renders any streamable value as text using its operator<<.
+    template<class T> std::string inspect(const T& value) {
+        std::ostringstream out;
+        out << value;
+        const std::string text = out.str();
+        return text;
+    }
+
+
+    // Base class of every spec group generated by the `describe` macro. It tracks the
+    // currently running example, forwards results to a SpecWriter, and provides the
+    // assertion primitives used by the matcher macros.
+    class SpecBase {
+    public:
+        SpecBase();
+        virtual ~SpecBase();
+
+        // Body of the group; implemented by the code following `describe(...)`.
+        virtual void specify() = 0;
+
+        void setWriter(SpecWriter* w) { mWriter = w; }
+
+        // Example bookkeeping used by the `it` macro and by SpecRunner.
+        bool startSpec(const char* name);
+        void endSpec();
+
+        // Core assertion; the matcher macros all end up here.
+        void should_test(bool value, const char* message, const char* file, int line);
+
+        // Asserts a == b, with both values rendered through inspect() in the message.
+        template<typename T1, typename T2> void should_equal_template(const T1& a, const T2& b, const char* file, int line) {
+            const std::string description =
+                "`" + ::specific::inspect(a) + "'" + " == " + "`" + ::specific::inspect(b) + "'";
+            should_test( a == b, description.c_str(), file, line);
+        }
+
+        // Asserts a != b, with both values rendered through inspect() in the message.
+        template<typename T1, typename T2> void should_not_equal_template(const T1& a, const T2& b, const char* file, int line) {
+            const std::string description =
+                "`" + ::specific::inspect(a) + "'" + " != " + "`" + ::specific::inspect(b) + "'";
+            should_test( a != b, description.c_str(), file, line);
+        }
+
+
+
+        virtual std::string getGroup() = 0;
+        virtual std::string getDescription() = 0;
+
+        // True while no assertion has failed and no error has been recorded.
+        bool isSuccessful() { return !mFailed; }
+        
+        bool done();
+        
+        void error(std::string msg);
+
+        SpecWriter* mWriter;       // destination for results and failures
+        const char* mName;         // name of the open example, or NULL when none is open
+        bool mFailed;              // set once any assertion fails or an error is recorded
+        bool mLastFailed;          // outcome of the most recent assertion
+        bool mError;               // pending "example aborted" flag, consumed by done()
+        int mExecutionPoint;       // position of the example reached on the current pass
+        int mContinuePoint;        // number of examples already handled
+        char *mFile;
+        std::string mErrorMessage;
+        int mLine;
+    };
+
+
+    // Singleton registry of spec groups. SpecBase's constructor registers each instance
+    // through add(); run() then executes them against a writer.
+    class SpecRunner {
+    public:
+        static SpecRunner& getInstance();
+        // Stores the pointer only; the runner does not take ownership of the spec.
+        void add(SpecBase* spec) { mSpecs.push_back( spec ); }
+        // Runs the specs whose group name contains `subset`; returns true if all passed.
+        bool run(SpecWriter& writer, const std::string subset = "");
+    private:
+
+        std::vector<SpecBase*> mSpecs;
+
+        // Private so that instances can only be created through getInstance().
+        SpecRunner();
+        ~SpecRunner();
+    };
+
+    // Token-pasting helpers. The two-level indirection makes the arguments (notably
+    // __LINE__) expand before they are concatenated.
+    #define SPEC_UNIQUE_NAME3(x,y) x##y
+    #define SPEC_UNIQUE_NAME2(x,y) SPEC_UNIQUE_NAME3(x,y)
+
+    // Class name for a spec group: SPEC_<group>_startingOnLine<line of the describe()>.
+    #define SPEC_NAME(x) SPEC_UNIQUE_NAME2(SPEC_##x, SPEC_UNIQUE_NAME2(_startingOnLine, __LINE__) )
+
+
+    // Declares a spec group: defines a SpecBase subclass named via SPEC_NAME(group),
+    // creates one static instance of it (whose constructor registers it with the
+    // SpecRunner), and opens the definition of its specify() method. The block that
+    // follows `describe(...)` becomes the body of specify().
+    #define describe(group, description)                                    \
+    class SPEC_NAME(group) : public specific::SpecBase                         \
+    {                                                                       \
+    public:                                                                 \
+        void specify();                                                     \
+        std::string getGroup() { return #group; }                           \
+        std::string getDescription() { return description; }                \
+    };                                                                      \
+    static SPEC_NAME(group) SPEC_UNIQUE_NAME2(SPEC_NAME(group), _instance); \
+    void SPEC_NAME(group)::specify()
+    
+
+    // Declares one example inside a describe() body. The following block runs only
+    // when startSpec() returns true for it.
+    #define it(description) if(startSpec(description))
+
+
+    // Matchers
+    // should_be_true(a): asserts that `a` is true; the expression text is the message.
+    #define should_be_true(a) should_test(a, #a, __FILE__, __LINE__)
+    // should_be_false(a): asserts that `a` is false. The argument is parenthesised so
+    // that the negation applies to the whole expression (e.g. `x == y`), not just to
+    // its first operand.
+    #define should_be_false(a) should_be_true( !(a) )
+
+    // Equality matchers. By default they go through the should_*_template helpers,
+    // which print both values and therefore need operator<< for the operand types.
+    // Defining SPECIFIC_NO_OSTREAM switches to plain boolean checks instead.
+    #ifndef SPECIFIC_NO_OSTREAM
+        #define should_equal(a, b) should_equal_template( a,b, __FILE__, __LINE__ )
+        #define should_not_equal(a, b) should_not_equal_template( a,b, __FILE__, __LINE__ )
+    #else
+        #define should_equal(a, b) should_be_true( (a) == (b) )
+        #define should_not_equal(a, b) should_be_true( (a) != (b) )
+    #endif
+
+    // should_throw(code, what): runs `code` and asserts that it throws an exception
+    // catchable as `what&`. Exceptions of other types are not caught here and
+    // propagate to the caller.
+    #define should_throw(code, what) \
+    do {                             \
+        bool _thrown = false;        \
+        try {                        \
+          code ;                     \
+        } catch(what& e) {           \
+            _thrown = true;          \
+        }                            \
+        should_test(_thrown, "should throw exception " #what, __FILE__, __LINE__); \
+    } while(0)
+
+
+
+}
+
+
+
+#endif /* Include guard */
+
--- a/3rdparty/vectorial/spec/spec_helper.h
+++ b/3rdparty/vectorial/spec/spec_helper.h
@ -0,0 +1,215 @@
+#ifndef VECTORIAL_SPEC_HELPER_H
+#define VECTORIAL_SPEC_HELPER_H
+
+#define VECTORIAL_OSTREAM
+
+#include "spec.h"
+
+#include "vectorial/vectorial.h"
+
+#ifdef VECTORIAL_HAVE_SIMD2F
+#include "vectorial/simd2f.h"
+#endif
+
+#include <cmath>
+#include <cstdlib>
+#include <iostream>
+
+// Tolerance-based matchers for use inside a spec body. Each forwards `this` (the
+// running spec), the operands, the tolerance and the call site to the helper function
+// with the same name plus a trailing underscore.
+#define should_be_close_to(a,b,tolerance) should_be_close_to_(this, a,b,tolerance,__FILE__,__LINE__)
+#define should_be_equal_simd4f( a, b, tolerance) should_be_equal_simd4f_(this, a,b,tolerance,__FILE__,__LINE__)
+#define should_be_equal_simd2f( a, b, tolerance) should_be_equal_simd2f_(this, a,b,tolerance,__FILE__,__LINE__)
+#define should_be_equal_vec4f( a, b, tolerance) should_be_equal_vec4f_(this, a,b,tolerance,__FILE__,__LINE__)
+#define should_be_equal_vec3f( a, b, tolerance) should_be_equal_vec3f_(this, a,b,tolerance,__FILE__,__LINE__)
+#define should_be_equal_vec2f( a, b, tolerance) should_be_equal_vec2f_(this, a,b,tolerance,__FILE__,__LINE__)
+
+#define should_be_equal_simd4x4f( a, b, tolerance) should_be_equal_simd4x4f_(this, a,b,tolerance,__FILE__,__LINE__)
+
+#define should_be_equal_mat4f( a, b, tolerance) should_be_equal_mat4f_(this, a,b,tolerance,__FILE__,__LINE__)
+
+// Based on:
+// http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm
+// 
+// Returns true when A and B are at most maxUlps representable floats apart.
+//
+// The bit patterns of both floats are remapped so that integer order matches float
+// order, and the difference of the remapped integers is compared with maxUlps. The
+// difference is computed in 64 bits: with 32-bit ints it overflows (undefined
+// behaviour) for values far apart and could wrap around to a small distance.
+//
+// Assumes a 32-bit int holding an IEEE-754 single, as the original code did.
+static inline bool compare_floats(float A, float B, int maxUlps)
+{
+    // Make sure maxUlps is non-negative and small enough that the
+    // default NAN won't compare as equal to anything.
+    // assert(maxUlps > 0 && maxUlps < 4 * 1024 * 1024);
+    union {
+        float f;
+        int i;
+    } f2iA, f2iB;
+    f2iA.f = A;
+    f2iB.f = B;
+    
+    // Make aInt lexicographically ordered as a twos-complement int:
+    // a negative float with magnitude bits m maps to -m (so -0.0f maps to 0).
+    long long aInt = f2iA.i;
+    if (aInt < 0)
+        aInt = -0x80000000LL - aInt;
+    // Make bInt lexicographically ordered as a twos-complement int
+    long long bInt = f2iB.i;
+    if (bInt < 0)
+        bInt = -0x80000000LL - bInt;
+    // Both values now lie in [-2^31, 2^31), so the 64-bit difference cannot overflow.
+    long long intDiff = aInt - bInt;
+    if (intDiff < 0)
+        intDiff = -intDiff;
+    return intDiff <= maxUlps;
+}
+
+
+
+
+
+
+
+
+
+// Asserts on `spec` that a and b are within `tolerance` ULPs (see compare_floats).
+// The assertion message shows both values and the tolerance.
+static inline void should_be_close_to_(specific::SpecBase *spec, float a, float b, int tolerance, const char *file, int line) {
+    const bool equal = compare_floats(a, b, tolerance);
+
+    std::stringstream ss;
+    ss << a << " == " << b << " (with tolerance of " << tolerance << ")";
+    const std::string message = ss.str();
+    spec->should_test(equal, message.c_str(), file, line);
+}
+
+#ifdef VECTORIAL_HAVE_SIMD2F
+// Asserts on `spec` that the x and y lanes of a and b are each within `tolerance` ULPs.
+static inline void should_be_equal_simd2f_(specific::SpecBase *spec, const simd2f& a, const simd2f& b, int tolerance, const char *file, int line) {
+    const bool equal =
+        compare_floats( simd2f_get_x(a), simd2f_get_x(b), tolerance) &&
+        compare_floats( simd2f_get_y(a), simd2f_get_y(b), tolerance);
+
+    std::stringstream ss;
+    ss << a << " == " << b << " (with tolerance of " << tolerance << ")";
+    const std::string message = ss.str();
+    spec->should_test(equal, message.c_str(), file, line);
+}
+#endif
+
+// Asserts on `spec` that all four lanes of a and b are each within `tolerance` ULPs.
+static inline void should_be_equal_simd4f_(specific::SpecBase *spec, const simd4f& a, const simd4f& b, int tolerance, const char *file, int line) {
+    const bool equal =
+        compare_floats( simd4f_get_x(a), simd4f_get_x(b), tolerance) &&
+        compare_floats( simd4f_get_y(a), simd4f_get_y(b), tolerance) &&
+        compare_floats( simd4f_get_z(a), simd4f_get_z(b), tolerance) &&
+        compare_floats( simd4f_get_w(a), simd4f_get_w(b), tolerance);
+
+    std::stringstream ss;
+    ss << a << " == " << b << " (with tolerance of " << tolerance << ")";
+    const std::string message = ss.str();
+    spec->should_test(equal, message.c_str(), file, line);
+}
+
+// Asserts on `spec` that x, y, z and w of a and b are each within `tolerance` ULPs.
+static inline void should_be_equal_vec4f_(specific::SpecBase *spec, const vectorial::vec4f& a, const vectorial::vec4f& b, int tolerance, const char *file, int line) {
+    const bool equal =
+        compare_floats( a.x(), b.x(), tolerance) &&
+        compare_floats( a.y(), b.y(), tolerance) &&
+        compare_floats( a.z(), b.z(), tolerance) &&
+        compare_floats( a.w(), b.w(), tolerance);
+
+    std::stringstream ss;
+    ss << a << " == " << b << " (with tolerance of " << tolerance << ")";
+    const std::string message = ss.str();
+    spec->should_test(equal, message.c_str(), file, line);
+}
+
+// Asserts on `spec` that x, y and z of a and b are each within `tolerance` ULPs.
+static inline void should_be_equal_vec3f_(specific::SpecBase *spec, const vectorial::vec3f& a, const vectorial::vec3f& b, int tolerance, const char *file, int line) {
+    const bool equal =
+        compare_floats( a.x(), b.x(), tolerance) &&
+        compare_floats( a.y(), b.y(), tolerance) &&
+        compare_floats( a.z(), b.z(), tolerance);
+
+    std::stringstream ss;
+    ss << a << " == " << b << " (with tolerance of " << tolerance << ")";
+    const std::string message = ss.str();
+    spec->should_test(equal, message.c_str(), file, line);
+}
+
+// Component-wise approximate equality check for vectorial::vec2f (x, y).
+// Reports the combined result and an "a == b" description through `spec`.
+static inline void should_be_equal_vec2f_(specific::SpecBase *spec, const vectorial::vec2f& a, const vectorial::vec2f& b, int tolerance, const char *file, int line) {
+
+    // Both components are compared unconditionally before merging the results.
+    const bool x_differs = !compare_floats( a.x(), b.x(), tolerance);
+    const bool y_differs = !compare_floats( a.y(), b.y(), tolerance);
+    const bool all_close = !(x_differs || y_differs);
+
+    std::stringstream message;
+    message << a << " == " << b << " (with tolerance of " << tolerance << ")";
+    spec->should_test(all_close, message.str().c_str(), file, line);
+
+}
+
+
+
+// Approximate equality check for two simd4x4f values: all sixteen lanes are
+// compared with compare_floats and a single verdict is reported to `spec`.
+static inline void should_be_equal_simd4x4f_(specific::SpecBase *spec, const simd4x4f& a, const simd4x4f& b, int tolerance, const char *file, int line) {
+
+    // Walk the x, y, z, w members in that order, pairing each with its
+    // counterpart in the other matrix.
+    const simd4f* const lhs_columns[4] = { &a.x, &a.y, &a.z, &a.w };
+    const simd4f* const rhs_columns[4] = { &b.x, &b.y, &b.z, &b.w };
+
+    bool all_close = true;
+    for( int column = 0; column < 4; ++column ) {
+        const simd4f& lhs = *lhs_columns[column];
+        const simd4f& rhs = *rhs_columns[column];
+
+        // Every lane is compared even if an earlier one already differed.
+        if( !compare_floats( simd4f_get_x(lhs), simd4f_get_x(rhs), tolerance) ) all_close = false;
+        if( !compare_floats( simd4f_get_y(lhs), simd4f_get_y(rhs), tolerance) ) all_close = false;
+        if( !compare_floats( simd4f_get_z(lhs), simd4f_get_z(rhs), tolerance) ) all_close = false;
+        if( !compare_floats( simd4f_get_w(lhs), simd4f_get_w(rhs), tolerance) ) all_close = false;
+    }
+
+    std::stringstream message;
+    message << a << " == " << b << " (with tolerance of " << tolerance << ")";
+    spec->should_test(all_close, message.str().c_str(), file, line);
+
+}
+
+// Approximate equality check for two vectorial::mat4f values. The sixteen
+// lanes stored under `value` are compared with compare_floats and a single
+// verdict, with an "a == b" description, is reported to `spec`.
+static inline void should_be_equal_mat4f_(specific::SpecBase *spec, const vectorial::mat4f& a, const vectorial::mat4f& b, int tolerance, const char *file, int line) {
+
+    // Walk value.x, value.y, value.z, value.w in that order on both sides.
+    const simd4f* const lhs_columns[4] = { &a.value.x, &a.value.y, &a.value.z, &a.value.w };
+    const simd4f* const rhs_columns[4] = { &b.value.x, &b.value.y, &b.value.z, &b.value.w };
+
+    bool all_close = true;
+    for( int column = 0; column < 4; ++column ) {
+        const simd4f& lhs = *lhs_columns[column];
+        const simd4f& rhs = *rhs_columns[column];
+
+        // Every lane is compared even if an earlier one already differed.
+        if( !compare_floats( simd4f_get_x(lhs), simd4f_get_x(rhs), tolerance) ) all_close = false;
+        if( !compare_floats( simd4f_get_y(lhs), simd4f_get_y(rhs), tolerance) ) all_close = false;
+        if( !compare_floats( simd4f_get_z(lhs), simd4f_get_z(rhs), tolerance) ) all_close = false;
+        if( !compare_floats( simd4f_get_w(lhs), simd4f_get_w(rhs), tolerance) ) all_close = false;
+    }
+
+    std::stringstream message;
+    message << a << " == " << b << " (with tolerance of " << tolerance << " ulps)";
+    spec->should_test(all_close, message.str().c_str(), file, line);
+
+}
+
+
+
+#endif
--- a/3rdparty/vectorial/spec/spec_main.cpp
+++ b/3rdparty/vectorial/spec/spec_main.cpp
@ -0,0 +1,55 @@
+/* Specific - Minimal C++ spec framework.
+ 
+
+The zlib/libpng License
+
+
+Copyright (c) 2008 Mikko Lehtonen
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+
+#include "spec.h"
+#include <cstdlib>
+
+// Entry point of the spec runner. A "-s" argument selects the specdoc writer
+// instead of the default progress writer; any other argument is taken as the
+// subset string handed to the runner (the last such argument wins).
+// Returns EXIT_SUCCESS when the runner reports success, EXIT_FAILURE otherwise.
+int main(int argc, char *argv[])
+{
+    std::string subsetName;
+
+    specific::ProgressWriter progressOutput;
+    specific::SpecdocWriter specdocOutput;
+    specific::SpecWriter* selectedWriter = &progressOutput;
+
+    // argv[0] is skipped; everything after it is either the flag or a subset.
+    for(int argIndex = 1; argIndex < argc; ++argIndex) {
+        const std::string argument(argv[argIndex]);
+        if( argument == "-s" ) {
+            selectedWriter = &specdocOutput;
+            continue;
+        }
+        subsetName = argument;
+    }
+
+    const bool allPassed = specific::SpecRunner::getInstance().run(*selectedWriter, subsetName);
+    if( allPassed ) {
+        return EXIT_SUCCESS;
+    }
+    return EXIT_FAILURE;
+}
+
--- a/3rdparty/vectorial/spec/spec_mat4f.cpp
+++ b/3rdparty/vectorial/spec/spec_mat4f.cpp
@ -0,0 +1,29 @@
+#include "spec_helper.h"
+#include <iostream>
+using vectorial::vec4f;
+using vectorial::mat4f;
+
+const int epsilon = 1; // tolerance argument handed to the should_be_equal_* checks in this file
+
+describe(mat4f, "constructing") { // specs for the ways a mat4f can be built
+    it("should have default constructor that does nothing..") {
+        mat4f x; // no assertion: only exercises default construction
+    }
+
+    it("should have constructor that constructs from four vec4") {
+        mat4f x( vec4f(1,2,3,4), vec4f(5,6,7,8), vec4f(9,10,11,12), vec4f(13,14,15,16) ); // each vec4f is expected to match the simd4f at the same position below
+
+        // octave mat4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ]
+        should_be_equal_mat4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), simd4f_create(5.000000000000000f, 6.000000000000000f, 7.000000000000000f, 8.000000000000000f), simd4f_create(9.000000000000000f, 10.000000000000000f, 11.000000000000000f, 12.000000000000000f), simd4f_create(13.000000000000000f, 14.000000000000000f, 15.000000000000000f, 16.000000000000000f)), epsilon ); // NOTE(review): lines after an "octave" marker look tool-generated - confirm before hand-editing
+    }
+    
+    it("should have static function to create identity matrix") {
+        
+        mat4f x = mat4f::identity();
+        
+        // octave mat4f: [1,0,0,0;0,1,0,0;0,0,1,0;0,0,0,1]
+        should_be_equal_mat4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 1.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 1.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 1.000000000000000f)), epsilon ); // ones on the diagonal, zeros elsewhere
+    }
+    
+}
+
--- a/3rdparty/vectorial/spec/spec_simd2f.cpp
+++ b/3rdparty/vectorial/spec/spec_simd2f.cpp
@ -0,0 +1,242 @@
+
+#include "spec_helper.h"
+
+const int epsilon = 1; // default tolerance argument for the should_be_* checks in this file
+
+#ifdef VECTORIAL_HAVE_SIMD2F
+
+describe(simd2f, "sanity") { // smoke check of the build configuration
+    it("VECTORIAL_SIMD_TYPE should be defined to a string") {
+        std::cout << "Simd type: " << VECTORIAL_SIMD_TYPE << std::endl; // no assertion; this only compiles when the macro expands to something streamable
+    }
+}
+
+describe(simd2f, "creating") { // specs for simd2f_create and simd2f_zero
+    
+    it("should be possible to create with simd2f_create") {
+        
+        simd2f x = simd2f_create(1, 2);
+
+        should_be_close_to( simd2f_get_x(x), 1, epsilon); // lane getters are checked one by one first
+        should_be_close_to( simd2f_get_y(x), 2, epsilon);
+
+        // octave simd2f: [1,2]
+        should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 2.000000000000000f), epsilon ); // NOTE(review): lines after an "octave" marker look tool-generated - confirm before hand-editing
+        
+    }
+
+    it("should have simd2f_zero for zero vector") {
+
+        simd2f x = simd2f_zero();
+
+        // octave simd2f: [0,0]
+        should_be_equal_simd2f(x, simd2f_create(0.000000000000000f, 0.000000000000000f), epsilon );
+    }
+    
+    
+}
+#ifdef _MSC_VER
+#include <malloc.h>
+#else
+#include <alloca.h>
+#endif
+
+// Stack-allocates (via alloca) room for n floats plus 4 spare bytes and yields
+// a float pointer 4 bytes past the start of that storage, so the array handed
+// to the unaligned load/store specs does not begin where alloca's block begins
+// (presumably to break SIMD alignment - confirm).
+// n is parenthesized so expression arguments such as `count + 1` are scaled by
+// sizeof(float) as a whole.
+#define unaligned_mem(n) ((float*)((unsigned char*)alloca(sizeof(float)*(n)+4)+4))
+
+// Specs for simd2f unaligned load/store, splats and the reciprocal/sqrt family.
+// NOTE(review): the assertion after each "// octave" marker looks
+// tool-generated; confirm before editing those lines by hand.
+describe(simd2f, "utilities") {
+
+    it("should have simd2f_uload2 for loading two float values from an unaligned float array into simd2f") {
+        float *f = unaligned_mem(2);
+        f[0] = 1;
+        f[1] = 2;
+        simd2f x = simd2f_uload2(f);
+        // octave simd2f: [1,2]
+        should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 2.000000000000000f), epsilon );
+    }
+
+    it("should have simd2f_ustore2 for storing two float values from simd2f to an unaligned array") {
+        float *f = unaligned_mem(2);
+        // Pre-fill with -1 so the checks fail unless the store writes each slot.
+        f[0] = -1;
+        f[1] = -1;
+        simd2f a = simd2f_create(1,2);
+        simd2f_ustore2(a, f);
+        should_be_close_to(f[0], 1, epsilon);
+        should_be_close_to(f[1], 2, epsilon);
+    }
+
+
+    it("should have simd2f_splat that expands a single scalar to all elements") {
+        simd2f x = simd2f_splat(42);
+        // octave simd2f: [42,42]
+        should_be_equal_simd2f(x, simd2f_create(42.000000000000000f, 42.000000000000000f), epsilon );
+    }
+
+    it("should have simd2f_splat_x,y splatting of an element") {
+        simd2f a = simd2f_create(1,2);
+
+        simd2f x;
+        
+        x = simd2f_splat_x(a);
+        // octave simd2f: [1,1]
+        should_be_equal_simd2f(x, simd2f_create(1.000000000000000f, 1.000000000000000f), epsilon );
+
+        x = simd2f_splat_y(a);
+        // octave simd2f: [2,2]
+        should_be_equal_simd2f(x, simd2f_create(2.000000000000000f, 2.000000000000000f), epsilon );
+
+    }
+
+    // Disabled spec, excluded from compilation by the preprocessor.
+#if 0
+    it("should have simd2f_sum that adds elements") {
+        simd2f a = simd2f_create(1,2);
+        simd2f x = simd2f_sum(a);
+        // octave simd2f: [sum([1,2]), sum([1,2,3,4])]
+        should_be_equal_simd2f(x, simd2f_create(3.000000000000000f, 10.000000000000000f), epsilon );
+        
+    }
+#endif
+
+    it("should have simd2f_reciprocal") {
+        simd2f a = simd2f_create(0.00001f, 2.00001f);
+        simd2f x = simd2f_reciprocal(a);
+        // octave simd2f: 1 ./ [0.00001, 2.00001]
+        should_be_equal_simd2f(x, simd2f_create(99999.999999999985448f, 0.499997500012500f), epsilon );
+    }
+
+    it("should have simd2f_sqrt") {
+        simd2f a = simd2f_create(0.00001f, 2.00001f);
+        simd2f x = simd2f_sqrt(a);
+        // octave simd2f:  sqrt([0.00001, 2.00001])
+        should_be_equal_simd2f(x, simd2f_create(0.003162277660168f, 1.414217097902582f), epsilon );
+
+        // Square root of an all-zero vector is checked separately.
+        x = simd2f_sqrt( simd2f_create(0.0f, 0.0f) );
+        // octave simd2f:  sqrt([0, 0])
+        should_be_equal_simd2f(x, simd2f_create(0.000000000000000f, 0.000000000000000f), epsilon );
+    }
+
+    it("should have simd2f_rsqrt for reciprocal of square-root") {
+        simd2f a = simd2f_create(0.00001f, 2.00001f);
+        simd2f x = simd2f_rsqrt(a);
+        // This local deliberately shadows the file-level epsilon for this spec only.
+        const int epsilon = 4; // Grant larger error
+        // octave simd2f:  1 ./ sqrt([0.00001, 2.00001])
+        should_be_equal_simd2f(x, simd2f_create(316.227766016837904f, 0.707105013426224f), epsilon );
+    }
+
+}
+
+describe(simd2f, "arithmetic with another simd2f") { // lane-wise add, sub, mul, div and madd
+
+    it("should have simd2f_add for component-wise addition") {
+        simd2f a = simd2f_create(1,2);
+        simd2f b = simd2f_create(10,20);
+        
+        simd2f x = simd2f_add(a,b);
+        // octave simd2f: [1,2] + [10,20]
+        should_be_equal_simd2f(x, simd2f_create(11.000000000000000f, 22.000000000000000f), epsilon ); // NOTE(review): lines after an "octave" marker look tool-generated - confirm before hand-editing
+    }
+
+    it("should have simd2f_sub for component-wise subtraction") {
+        simd2f a = simd2f_create(1,2);
+        simd2f b = simd2f_create(10,20);
+        
+        simd2f x = simd2f_sub(b,a); // operands swapped relative to the add spec: computes b - a
+        // octave simd2f: [10,20] - [1,2] 
+        should_be_equal_simd2f(x, simd2f_create(9.000000000000000f, 18.000000000000000f), epsilon );
+    }
+
+    it("should have simd2f_mul for component-wise multiply") {
+        simd2f a = simd2f_create(1,2);
+        simd2f b = simd2f_create(10,20);
+        
+        simd2f x = simd2f_mul(a,b);
+        // octave simd2f: [1,2] .* [10,20]
+        should_be_equal_simd2f(x, simd2f_create(10.000000000000000f, 40.000000000000000f), epsilon );
+    }
+
+    it("should have simd2f_div for component-wise division") {
+        simd2f a = simd2f_create(1,2);
+        simd2f b = simd2f_create(10,20);
+        
+        simd2f x = simd2f_div(b,a); // computes b / a per lane
+        // octave simd2f: [10,20] ./ [1,2] 
+        should_be_equal_simd2f(x, simd2f_create(10.000000000000000f, 10.000000000000000f), epsilon );
+    }
+
+    it("should have simd2f_madd for multiply-add") {
+        simd2f a = simd2f_create(1,2);
+        simd2f b = simd2f_create(100,100);
+        simd2f c = simd2f_create(6,7);
+
+        simd2f x = simd2f_madd(a,b,c); // expected values below correspond to a*b + c per lane
+        // octave simd2f: [1,2] .* [100,100] .+ [6,7]
+        should_be_equal_simd2f(x, simd2f_create(106.000000000000000f, 207.000000000000000f), epsilon );
+
+    }
+
+}
+
+
+describe(simd2f, "vector math") { // dot product, length and normalization of two-component vectors
+
+    it("should have simd2f_dot2 for two component dot product") {
+        simd2f a = simd2f_create(1,2);
+        simd2f b = simd2f_create(10,20);
+        
+        simd2f x = simd2f_dot2(a,b);
+        // octave simd2f: [dot([1, 2], [10, 20]),dot([1, 2], [10, 20])]
+        should_be_equal_simd2f(x, simd2f_create(50.000000000000000f, 50.000000000000000f), epsilon ); // the scalar result is expected in both lanes
+    }
+
+    it("should have simd2f_length2 for two component vector length") {
+        simd2f a = simd2f_create(1,2);
+        simd2f x = simd2f_length2(a);
+        // octave simd2f: [norm([1,2]),norm([1,2])]
+        should_be_equal_simd2f(x, simd2f_create(2.236067977499790f, 2.236067977499790f), epsilon ); // NOTE(review): lines after an "octave" marker look tool-generated - confirm before hand-editing
+
+    }
+
+
+    it("should have simd2f_length2_squared for two component squared vector length") {
+        simd2f a = simd2f_create(1,2);
+        simd2f x = simd2f_length2_squared(a);
+        // octave simd2f: ([dot([1,2], [1,2]), dot([1,2], [1,2])])
+        should_be_equal_simd2f(x, simd2f_create(5.000000000000000f, 5.000000000000000f), epsilon );
+
+    }
+
+    it("should have simd2f_normalize2 for normalizing two component vector to unit length") {
+        simd2f a = simd2f_create(1,2);
+        simd2f x = simd2f_normalize2(a);
+        // octave simd2f: [1,2] / norm([1,2])
+        should_be_equal_simd2f(x, simd2f_create(0.447213595499958f, 0.894427190999916f), epsilon );
+    }
+
+}
+
+
+// Specs for the lane-wise minimum and maximum of two simd2f values. Each
+// operand holds the smaller value in a different lane.
+describe(simd2f, "min-max") {
+
+    it("should have simd2f_min for choosing minimum elements") {
+        const simd2f lhs = simd2f_create(1.0f,  2.0f);
+        const simd2f rhs = simd2f_create(2.0f, -2.0f);
+
+        const simd2f smallest = simd2f_min(lhs, rhs);
+        should_be_equal_simd2f(smallest, simd2f_create(1.0f, -2.0f), epsilon);
+    }
+
+    it("should have simd2f_max for choosing maximum elements") {
+        const simd2f lhs = simd2f_create(1.0f,  2.0f);
+        const simd2f rhs = simd2f_create(2.0f, -2.0f);
+
+        const simd2f largest = simd2f_max(lhs, rhs);
+        should_be_equal_simd2f(largest, simd2f_create(2.0f, 2.0f), epsilon);
+    }
+
+}
+
+
+
+#endif
+
--- a/3rdparty/vectorial/spec/spec_simd4f.cpp
+++ b/3rdparty/vectorial/spec/spec_simd4f.cpp
@ -0,0 +1,457 @@
+
+#include "spec_helper.h"
+
+const int epsilon = 1; // default tolerance argument for the should_be_* checks in this file
+
+describe(simd4f, "sanity") { // smoke check of the build configuration
+    it("VECTORIAL_SIMD_TYPE should be defined to a string") {
+        std::cout << "Simd type: " << VECTORIAL_SIMD_TYPE << std::endl; // no assertion; this only compiles when the macro expands to something streamable
+    }
+}
+
+describe(simd4f, "creating") { // specs for simd4f_create and simd4f_zero
+    
+    it("should be possible to create with simd4f_create") {
+        
+        simd4f x = simd4f_create(1, 2, 3, 4);
+
+        should_be_close_to( simd4f_get_x(x), 1, epsilon); // lane getters are checked one by one first
+        should_be_close_to( simd4f_get_y(x), 2, epsilon);
+        should_be_close_to( simd4f_get_z(x), 3, epsilon);
+        should_be_close_to( simd4f_get_w(x), 4, epsilon);
+
+        // octave simd4f: [1,2,3,4]
+        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon ); // NOTE(review): lines after an "octave" marker look tool-generated - confirm before hand-editing
+        
+    }
+
+    it("should have simd4f_zero for zero vector") {
+
+        simd4f x = simd4f_zero();
+
+        // octave simd4f: [0,0,0,0]
+        should_be_equal_simd4f(x, simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), epsilon );
+    }
+    
+    
+}
+#ifdef _MSC_VER
+#include <malloc.h>
+#else
+#include <alloca.h>
+#endif
+
+// Stack-allocates (via alloca) room for n floats plus 4 spare bytes and yields
+// a float pointer 4 bytes past the start of that storage, so the array handed
+// to the unaligned load/store specs does not begin where alloca's block begins
+// (presumably to break SIMD alignment - confirm).
+// n is parenthesized so expression arguments such as `count + 1` are scaled by
+// sizeof(float) as a whole.
+#define unaligned_mem(n) ((float*)((unsigned char*)alloca(sizeof(float)*(n)+4)+4))
+
+// Specs for simd4f unaligned load/store, splats, sum and the
+// reciprocal/sqrt family.
+// NOTE(review): the assertion after each "// octave" marker looks
+// tool-generated; confirm before editing those lines by hand.
+describe(simd4f, "utilities") {
+
+    it("should have simd4f_uload4 for loading four float values from an unaligned float array into simd4f") {
+        float *f = unaligned_mem(4);
+        f[0] = 1;
+        f[1] = 2;
+        f[2] = 3;
+        f[3] = 4;
+        simd4f x = simd4f_uload4(f);
+        // octave simd4f: [1,2,3,4]
+        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_uload3 for loading three float values from an unaligned float array into simd4f") {
+        float *f = unaligned_mem(3);
+        f[0] = 1;
+        f[1] = 2;
+        f[2] = 3;
+        simd4f x = simd4f_uload3(f);
+        // The lane not covered by the load (w) is expected to read as zero.
+        // octave simd4f: [1,2,3]
+        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
+    }
+
+    it("should have simd4f_uload2 for loading two float values from an unaligned float array into simd4f") {
+        float *f = unaligned_mem(2);
+        f[0] = 1;
+        f[1] = 2;
+        simd4f x = simd4f_uload2(f);
+        // The lanes not covered by the load (z, w) are expected to read as zero.
+        // octave simd4f: [1,2]
+        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
+    }
+
+
+    it("should have simd4f_ustore4 for storing four float values from simd4f to an unaligned array") {
+        float *f = unaligned_mem(4);
+        // Pre-fill with -1 so the checks fail unless the store writes each slot.
+        f[0] = -1;
+        f[1] = -1;
+        f[2] = -1;
+        f[3] = -1;
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f_ustore4(a, f);
+        should_be_close_to(f[0], 1, epsilon);
+        should_be_close_to(f[1], 2, epsilon);
+        should_be_close_to(f[2], 3, epsilon);
+        should_be_close_to(f[3], 4, epsilon);
+    }
+
+    it("should have simd4f_ustore3 for storing three float values from simd4f to an unaligned array") {
+        float *f = unaligned_mem(3);
+        f[0] = -1;
+        f[1] = -1;
+        f[2] = -1;
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f_ustore3(a, f);
+        should_be_close_to(f[0], 1, epsilon);
+        should_be_close_to(f[1], 2, epsilon);
+        should_be_close_to(f[2], 3, epsilon);
+    }
+
+    it("should have simd4f_ustore2 for storing two float values from simd4f to an unaligned array") {
+        float *f = unaligned_mem(2);
+        f[0] = -1;
+        f[1] = -1;
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f_ustore2(a, f);
+        should_be_close_to(f[0], 1, epsilon);
+        should_be_close_to(f[1], 2, epsilon);
+    }
+
+
+
+
+    it("should have simd4f_splat that expands a single scalar to all elements") {
+        simd4f x = simd4f_splat(42);
+        // octave simd4f: [42,42,42,42]
+        should_be_equal_simd4f(x, simd4f_create(42.000000000000000f, 42.000000000000000f, 42.000000000000000f, 42.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_splat_x,y,z,w splatting of an element") {
+        simd4f a = simd4f_create(1,2,3,4);
+
+        simd4f x;
+        
+        x = simd4f_splat_x(a);
+        // octave simd4f: [1,1,1,1]
+        should_be_equal_simd4f(x, simd4f_create(1.000000000000000f, 1.000000000000000f, 1.000000000000000f, 1.000000000000000f), epsilon );
+
+        x = simd4f_splat_y(a);
+        // octave simd4f: [2,2,2,2]
+        should_be_equal_simd4f(x, simd4f_create(2.000000000000000f, 2.000000000000000f, 2.000000000000000f, 2.000000000000000f), epsilon );
+
+        x = simd4f_splat_z(a);
+        // octave simd4f: [3,3,3,3]
+        should_be_equal_simd4f(x, simd4f_create(3.000000000000000f, 3.000000000000000f, 3.000000000000000f, 3.000000000000000f), epsilon );
+
+        x = simd4f_splat_w(a);
+        // octave simd4f: [4,4,4,4]
+        should_be_equal_simd4f(x, simd4f_create(4.000000000000000f, 4.000000000000000f, 4.000000000000000f, 4.000000000000000f), epsilon );
+    }
+    
+    it("should have simd4f_sum that adds elements") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f x = simd4f_sum(a);
+        // The total is expected in every lane of the result.
+        // octave simd4f: [sum([1,2,3,4]), sum([1,2,3,4]), sum([1,2,3,4]), sum([1,2,3,4])]
+        should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
+        
+    }
+        
+    it("should have simd4f_reciprocal") {
+        simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f);
+        simd4f x = simd4f_reciprocal(a);
+        // octave simd4f: 1 ./ [0.00001, 2.00001, 3.0, 99999999.0]
+        should_be_equal_simd4f(x, simd4f_create(99999.999999999985448f, 0.499997500012500f, 0.333333333333333f, 0.000000010000000f), epsilon );
+    }
+
+    it("should have simd4f_sqrt") {
+        simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f);
+        simd4f x = simd4f_sqrt(a);
+        // octave simd4f:  sqrt([0.00001, 2.00001, 3.0, 99999999.0])
+        should_be_equal_simd4f(x, simd4f_create(0.003162277660168f, 1.414217097902582f, 1.732050807568877f, 9999.999949999999444f), epsilon );
+
+        // Square root of an all-zero vector is checked separately.
+        x = simd4f_sqrt( simd4f_create(0.0f, 0.0f, 0.0f, 0.0f) );
+        // octave simd4f:  sqrt([0, 0, 0, 0])
+        should_be_equal_simd4f(x, simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_rsqrt for reciprocal of square-root") {
+        simd4f a = simd4f_create(0.00001f, 2.00001f, 3.0f, 99999999.0f);
+        simd4f x = simd4f_rsqrt(a);
+        // This local deliberately shadows the file-level epsilon for this spec only.
+        const int epsilon = 4; // Grant larger error
+        // octave simd4f:  1 ./ sqrt([0.00001, 2.00001, 3.0, 99999999.0])
+        should_be_equal_simd4f(x, simd4f_create(316.227766016837904f, 0.707105013426224f, 0.577350269189626f, 0.000100000000500f), epsilon );
+    }
+
+}
+
+describe(simd4f, "arithmetic with another simd4f") { // lane-wise add, sub, mul, div and madd
+
+    it("should have simd4f_add for component-wise addition") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f b = simd4f_create(10,20,30,40);
+        
+        simd4f x = simd4f_add(a,b);
+        // octave simd4f: [1,2,3,4] + [10,20,30,40]
+        should_be_equal_simd4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon ); // NOTE(review): lines after an "octave" marker look tool-generated - confirm before hand-editing
+    }
+
+    it("should have simd4f_sub for component-wise subtraction") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f b = simd4f_create(10,20,30,40);
+        
+        simd4f x = simd4f_sub(b,a); // operands swapped relative to the add spec: computes b - a
+        // octave simd4f: [10,20,30,40] - [1,2,3,4] 
+        should_be_equal_simd4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_mul for component-wise multiply") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f b = simd4f_create(10,20,30,40);
+        
+        simd4f x = simd4f_mul(a,b);
+        // octave simd4f: [1,2,3,4] .* [10,20,30,40]
+        should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_div for component-wise division") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f b = simd4f_create(10,20,30,40);
+        
+        simd4f x = simd4f_div(b,a); // computes b / a per lane
+        // octave simd4f: [10,20,30,40] ./ [1,2,3,4] 
+        should_be_equal_simd4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_madd for multiply-add") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f b = simd4f_create(100,100,100,100);
+        simd4f c = simd4f_create(6,7,8,9);
+
+        simd4f x = simd4f_madd(a,b,c); // expected values below correspond to a*b + c per lane
+        // octave simd4f: [1,2,3,4] .* [100,100,100,100] .+ [6,7,8,9]
+        should_be_equal_simd4f(x, simd4f_create(106.000000000000000f, 207.000000000000000f, 308.000000000000000f, 409.000000000000000f), epsilon );
+
+    }
+
+}
+
+
+// Specs for dot products, lengths, cross product and normalization on simd4f.
+// Several inputs carry large values in the lanes a 2- or 3-component
+// operation must ignore, so a leak into the result shows up as a failure.
+// NOTE(review): the assertion after each "// octave" marker looks
+// tool-generated; confirm before editing those lines by hand.
+describe(simd4f, "vector math") {
+    
+    it("should have simd4f_dot4 for four component dot product") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f b = simd4f_create(10,20,30,40);
+        
+        simd4f x = simd4f_dot4(a,b);
+        // octave simd4f: [dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40]),dot([1, 2, 3, 4], [10, 20, 30, 40])]
+        should_be_equal_simd4f(x, simd4f_create(300.000000000000000f, 300.000000000000000f, 300.000000000000000f, 300.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_dot3_scalar for three component dot product returning float") {
+        simd4f a = simd4f_create(1,2,3,9999);
+        simd4f b = simd4f_create(10,20,30,-9990);
+        
+        float x = simd4f_dot3_scalar(a,b);
+        // octave float: dot([1, 2, 3], [10, 20, 30])
+        should_be_close_to(x, 140.000000000000000f, epsilon );
+    }
+
+    it("should have simd4f_dot3 for three component dot product returning simd4f") {
+        simd4f a = simd4f_create(1,2,3,9999);
+        simd4f b = simd4f_create(10,20,30,-9990);
+        
+        simd4f x = simd4f_dot3(a,b);
+        // octave simd4f: [dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30]),dot([1, 2, 3], [10, 20, 30])]
+        should_be_equal_simd4f(x, simd4f_create(140.000000000000000f, 140.000000000000000f, 140.000000000000000f, 140.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_dot2 for two component dot product") {
+        simd4f a = simd4f_create(1,2,3,9999);
+        simd4f b = simd4f_create(10,20,30,-9990);
+        
+        simd4f x = simd4f_dot2(a,b);
+        // octave simd4f: [dot([1, 2], [10, 20]),dot([1, 2], [10, 20]),dot([1, 2], [10, 20]),dot([1, 2], [10, 20])]
+        should_be_equal_simd4f(x, simd4f_create(50.000000000000000f, 50.000000000000000f, 50.000000000000000f, 50.000000000000000f), epsilon );
+    }
+    
+    it("should have simd4f_length4 for four component vector length") {
+        simd4f a = simd4f_create(1,2,-3,9999);
+        simd4f x = simd4f_length4(a);
+        // octave simd4f: [norm([1,2,-3,9999]), norm([1,2,-3,9999]), norm([1,2,-3,9999]), norm([1,2,-3,9999])]
+        should_be_equal_simd4f(x, simd4f_create(9999.000700069982486f, 9999.000700069982486f, 9999.000700069982486f, 9999.000700069982486f), epsilon );
+
+    }
+
+    it("should have simd4f_length3 for three component vector length") {
+        simd4f a = simd4f_create(1,2,-3,9999);
+        simd4f x = simd4f_length3(a);
+        // octave simd4f: [norm([1,2,-3]), norm([1,2,-3]), norm([1,2,-3]), norm([1,2,-3])]
+        should_be_equal_simd4f(x, simd4f_create(3.741657386773941f, 3.741657386773941f, 3.741657386773941f, 3.741657386773941f), epsilon );
+
+    }
+
+    it("should have simd4f_length2 for two component vector length") {
+        simd4f a = simd4f_create(1,2,-3,9999);
+        simd4f x = simd4f_length2(a);
+        // octave simd4f: [norm([1,2]),norm([1,2]),norm([1,2]),norm([1,2])]
+        should_be_equal_simd4f(x, simd4f_create(2.236067977499790f, 2.236067977499790f, 2.236067977499790f, 2.236067977499790f), epsilon );
+
+    }
+
+
+    it("should have simd4f_length4_squared for four component squared vector length") {
+        simd4f a = simd4f_create(1,2,-3,9999);
+        simd4f x = simd4f_length4_squared(a);
+        // octave simd4f: ([(dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999])), (dot([1,2,-3,9999], [1,2,-3,9999]))])
+        should_be_equal_simd4f(x, simd4f_create(99980015.000000000000000f, 99980015.000000000000000f, 99980015.000000000000000f, 99980015.000000000000000f), epsilon );
+
+    }
+
+    it("should have simd4f_length3_squared for three component squared vector length") {
+        simd4f a = simd4f_create(1,2,-3,9999);
+        simd4f x = simd4f_length3_squared(a);
+        // octave simd4f: ([dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3]), dot([1,2,-3], [1,2,-3])])
+        should_be_equal_simd4f(x, simd4f_create(14.000000000000000f, 14.000000000000000f, 14.000000000000000f, 14.000000000000000f), epsilon );
+
+    }
+
+    it("should have simd4f_length2_squared for two component squared vector length") {
+        simd4f a = simd4f_create(1,2,-3,9999);
+        simd4f x = simd4f_length2_squared(a);
+        // octave simd4f: ([dot([1,2], [1,2]), dot([1,2], [1,2]), dot([1,2], [1,2]), dot([1,2], [1,2])])
+        should_be_equal_simd4f(x, simd4f_create(5.000000000000000f, 5.000000000000000f, 5.000000000000000f, 5.000000000000000f), epsilon );
+
+    }
+    
+    
+    
+    it("should have simd4f_cross3 for cross product") {
+        simd4f a = simd4f_create(1,12,3,-9999);
+        simd4f b = simd4f_create(5,6,-17, 9999);
+
+        simd4f x = simd4f_cross3(a,b);
+        // The w lane of the result is expected to be zero.
+        // octave simd4f: horzcat(  cross( [1,12,3], [5,6,-17] )  , [0] )
+        should_be_equal_simd4f(x, simd4f_create(-222.000000000000000f, 32.000000000000000f, -54.000000000000000f, 0.000000000000000f), epsilon );
+
+    }
+    
+    it("should have simd4f_normalize4 for normalizing four component vector to unit length") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f x = simd4f_normalize4(a);
+        // octave simd4f: [1,2,3,4] / norm([1,2,3,4])
+        should_be_equal_simd4f(x, simd4f_create(0.182574185835055f, 0.365148371670111f, 0.547722557505166f, 0.730296743340221f), epsilon );
+    }
+
+    it("should have simd4f_normalize3 for normalizing three component vector to unit length") {
+        simd4f a = simd4f_create(1,2,3,0);
+        simd4f x = simd4f_normalize3(a);
+        // octave simd4f: [1,2,3,0] / norm([1,2,3])
+        should_be_equal_simd4f(x, simd4f_create(0.267261241912424f, 0.534522483824849f, 0.801783725737273f, 0.000000000000000f), epsilon );
+    }
+
+    it("should have simd4f_normalize2 for normalizing two component vector to unit length") {
+        simd4f a = simd4f_create(1,2,0,0);
+        simd4f x = simd4f_normalize2(a);
+        // octave simd4f: [1,2,0,0] / norm([1,2])
+        should_be_equal_simd4f(x, simd4f_create(0.447213595499958f, 0.894427190999916f, 0.000000000000000f, 0.000000000000000f), epsilon );
+    }
+
+    
+}
+
+// Specs for the lane-reordering shuffles and for simd4f_merge_high. Inputs use
+// distinct lane values so each expected vector spells out the new lane order.
+describe(simd4f, "shuffles and merges") {
+
+    it("should have simd4f_shuffle_wxyz") {
+        const simd4f source = simd4f_create(1,2,3,4);
+        const simd4f shuffled = simd4f_shuffle_wxyz(source);
+        should_be_equal_simd4f(shuffled, simd4f_create(4,1,2,3), epsilon );
+    }
+
+    it("should have simd4f_shuffle_zwxy") {
+        const simd4f source = simd4f_create(1,2,3,4);
+        const simd4f shuffled = simd4f_shuffle_zwxy(source);
+        should_be_equal_simd4f(shuffled, simd4f_create(3,4,1,2), epsilon );
+    }
+
+    it("should have simd4f_shuffle_yzwx") {
+        const simd4f source = simd4f_create(1,2,3,4);
+        const simd4f shuffled = simd4f_shuffle_yzwx(source);
+        should_be_equal_simd4f(shuffled, simd4f_create(2,3,4,1), epsilon );
+    }
+
+    it("should have simd4f_merge_high") {
+        // Expected result takes z,w of the first operand, then z,w of the second.
+        const simd4f first = simd4f_create(1,2,3,4);
+        const simd4f second = simd4f_create(5,6,7,8);
+        const simd4f merged = simd4f_merge_high(first, second);
+        should_be_equal_simd4f(merged, simd4f_create(3,4,7,8), epsilon );
+    }
+
+}
+
+// Specs for the sign-flipping helpers. Counting lanes from one, the 0101
+// variant negates the 2nd and 4th lanes and the 1010 variant negates the
+// 1st and 3rd, as the expected vectors below show.
+describe(simd4f, "signs") {
+
+    it("should have simd4f_flip_sign_0101 for flipping even elements sign") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f x = simd4f_flip_sign_0101(a);
+        should_be_equal_simd4f(x, simd4f_create(1,-2,3,-4), epsilon );
+    }
+
+    it("should have simd4f_flip_sign_1010 for flipping odd elements sign") {
+        simd4f a = simd4f_create(1,2,3,4);
+        simd4f x = simd4f_flip_sign_1010(a);
+        should_be_equal_simd4f(x, simd4f_create(-1,2,-3,4), epsilon );
+    }
+
+}
+
+// Specs for the lane-wise minimum and maximum of two simd4f values. The
+// operands mix small, large and near-zero magnitudes of both signs.
+describe(simd4f, "min-max") {
+
+    it("should have simd4f_min for choosing minimum elements") {
+        const simd4f lhs = simd4f_create(1.0f,  2.0f, -300000000.0f, -0.000002f);
+        const simd4f rhs = simd4f_create(2.0f, -2.0f,  300000000.0f,  0.000001f);
+
+        const simd4f smallest = simd4f_min(lhs, rhs);
+        should_be_equal_simd4f(smallest, simd4f_create(1.0f, -2.0f, -300000000.0f, -0.000002f), epsilon);
+    }
+
+    it("should have simd4f_max for choosing maximum elements") {
+        const simd4f lhs = simd4f_create(1.0f,  2.0f, -300000000.0f, -0.000002f);
+        const simd4f rhs = simd4f_create(2.0f, -2.0f,  300000000.0f,  0.000001f);
+
+        const simd4f largest = simd4f_max(lhs, rhs);
+        should_be_equal_simd4f(largest, simd4f_create(2.0f, 2.0f, 300000000.0f, 0.000001f), epsilon);
+    }
+
+}
+
+
+// Specs for the lane-zeroing helpers. Each helper is run on an ordinary
+// vector and on one whose to-be-cleared lanes hold sqrtf(-1.0f), so the
+// result must be exactly zero there regardless of the previous contents.
+describe(simd4f, "zeroing")
+{
+
+    it("should have simd4f_zero_w that zeros the last element")
+    {
+        const float nan = sqrtf(-1.0f);
+        simd4f a = simd4f_create(1.0f, 2.0f, 3.0f, 4.0f);
+        simd4f b = simd4f_create(1.0f, 2.0f, 3.0f, nan);
+        simd4f x = simd4f_zero_w(a);
+        should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 3.0f, 0.0f), epsilon);
+        x = simd4f_zero_w(b);
+        should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 3.0f, 0.0f), epsilon);
+    }
+
+    it("should have simd4f_zero_zw that zeros the last two elements")
+    {
+        const float nan = sqrtf(-1.0f);
+        simd4f a = simd4f_create(1.0f, 2.0f, 3.0f, 4.0f);
+        simd4f b = simd4f_create(1.0f, 2.0f, nan, nan);
+        simd4f x = simd4f_zero_zw(a);
+        should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 0.0f, 0.0f), epsilon);
+        x = simd4f_zero_zw(b);
+        should_be_equal_simd4f(x, simd4f_create(1.0f, 2.0f, 0.0f, 0.0f), epsilon);
+    }
+
+}
+
+
+
+
+
--- a/3rdparty/vectorial/spec/spec_simd4x4f.cpp
+++ b/3rdparty/vectorial/spec/spec_simd4x4f.cpp
@ -0,0 +1,381 @@
+#include "spec_helper.h"
+
+const int epsilon = 1;
+
+#ifndef M_PI
+#define M_PI 3.141592f
+#endif
+
+describe(simd4x4f, "creating") {
+    // Covers building a matrix from four vectors and resetting one to identity.
+    it("should be possible to create with params") {
+        
+        simd4x4f x = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
+                                   simd4f_create(5,  6,  7,  8 ),
+                                   simd4f_create(9,  10, 11, 12 ),
+                                   simd4f_create(13, 14, 15, 16 ));
+        // Members x, y, z, w must hold the four vectors in the order they were passed.
+        should_be_equal_simd4f( x.x, simd4f_create(1,  2,  3,  4 ) , epsilon);
+        should_be_equal_simd4f( x.y, simd4f_create(5,  6,  7,  8 ) , epsilon);
+        should_be_equal_simd4f( x.z, simd4f_create(9,  10, 11, 12 ), epsilon);
+        should_be_equal_simd4f( x.w, simd4f_create(13, 14, 15, 16 ), epsilon);
+        // The octave expression below lists rows, so each simd4f passed to simd4x4f_create is one column of that matrix.
+        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ]
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), simd4f_create(5.000000000000000f, 6.000000000000000f, 7.000000000000000f, 8.000000000000000f), simd4f_create(9.000000000000000f, 10.000000000000000f, 11.000000000000000f, 12.000000000000000f), simd4f_create(13.000000000000000f, 14.000000000000000f, 15.000000000000000f, 16.000000000000000f)), epsilon );
+        
+    }
+    
+
+    it("should be possible to set to identity") {
+        simd4x4f x;
+        simd4x4f_identity(&x);
+        // x is left uninitialized above; simd4x4f_identity fills it through the pointer.
+        // octave simd4x4f: [1,0,0,0; 0,1,0,0; 0,0,1,0; 0,0,0,1]
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 1.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 1.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 0.000000000000000f, 1.000000000000000f)), epsilon );
+    }
+}
+
+
+describe(simd4x4f, "loading and storing") {
+    // Fills a matrix from a flat float[16] and compares it against four explicit vectors.
+    it("should be possible to load from array of 16 floats with simd4x4f_uload") {
+
+        float flat[16] = {1,2,3,4, 5,6,7,8, 9,10,11,12, 13,14,15,16 };
+        simd4x4f x;
+        simd4x4f_uload(&x, flat);
+        // Consecutive groups of four floats should end up in x, y, z and w respectively.
+        should_be_equal_simd4x4f(x, simd4x4f_create( simd4f_create(1,2,3,4),
+                                                     simd4f_create(5,6,7,8),
+                                                     simd4f_create(9,10,11,12),
+                                                     simd4f_create(13,14,15,16) ), epsilon);
+
+    }
+
+}
+
+
+describe(simd4x4f, "matrix utility") {
+    
+    it("should have simd4x4f_transpose_inplace for transpose") {
+        
+        simd4x4f x = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
+                                   simd4f_create(5,  6,  7,  8 ),
+                                   simd4f_create(9,  10, 11, 12 ),
+                                   simd4f_create(13, 14, 15, 16 ));
+        
+        simd4x4f_transpose_inplace(&x);
+        
+        // octave simd4x4f: transpose([1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ])
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 5.000000000000000f, 9.000000000000000f, 13.000000000000000f), simd4f_create(2.000000000000000f, 6.000000000000000f, 10.000000000000000f, 14.000000000000000f), simd4f_create(3.000000000000000f, 7.000000000000000f, 11.000000000000000f, 15.000000000000000f), simd4f_create(4.000000000000000f, 8.000000000000000f, 12.000000000000000f, 16.000000000000000f)), epsilon );
+    }
+
+    it("should have simd4x4f_transpose for transpose") {
+        
+        simd4x4f in = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
+                                   simd4f_create(5,  6,  7,  8 ),
+                                   simd4f_create(9,  10, 11, 12 ),
+                                   simd4f_create(13, 14, 15, 16 ));
+        
+        simd4x4f x;
+        simd4x4f_transpose(&in, &x);
+        
+        // octave simd4x4f: transpose([1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ])
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 5.000000000000000f, 9.000000000000000f, 13.000000000000000f), simd4f_create(2.000000000000000f, 6.000000000000000f, 10.000000000000000f, 14.000000000000000f), simd4f_create(3.000000000000000f, 7.000000000000000f, 11.000000000000000f, 15.000000000000000f), simd4f_create(4.000000000000000f, 8.000000000000000f, 12.000000000000000f, 16.000000000000000f)), epsilon );
+    }
+
+    it("should have simd4x4f_matrix_vector_mul for matrix-vector multiply") {
+        
+        simd4x4f a = simd4x4f_create(simd4f_create( 1,    9,   17,   25 ),
+                                     simd4f_create( 3,   11,   19,   27 ),
+                                     simd4f_create( 5,   13,   21,   29 ),
+                                     simd4f_create( 7,   15,   23,   31 ));
+
+        simd4f b = simd4f_create( 26,  -28,   30,  -32 );
+        
+        simd4f x;
+        simd4x4f_matrix_vector_mul(&a, &b, &x);
+        
+        // octave simd4f: [1,3,5,7;9,11,13,15;17,19,21,23;25,27,29,31] * [26;-28;30;-32]
+        should_be_equal_simd4f(x, simd4f_create(-132.000000000000000f, -164.000000000000000f, -196.000000000000000f, -228.000000000000000f), epsilon );
+    }
+
+    it("should have simd4x4f_matrix_vector3_mul for matrix-vector3 multiply") {
+        
+        simd4x4f a = simd4x4f_create(simd4f_create( 1,    9,   17,   25 ),
+                                     simd4f_create( 3,   11,   19,   27 ),
+                                     simd4f_create( 5,   13,   21,   29 ),
+                                     simd4f_create( 7,   15,   23,   31 ));
+
+        simd4f b = simd4f_create( 26,  -28,   30,  -32 );
+        
+        simd4f x;
+        simd4x4f_matrix_vector3_mul(&a, &b, &x);
+        
+        // TODO octave simd4f: 
+        
+    }
+
+    it("should have simd4x4f_matrix_vector3_mul for matrix-vector3 multiply") {
+        // NOTE(review): same name, inputs and call as the case directly before it -- confirm whether another function was meant to be covered here.
+        simd4x4f a = simd4x4f_create(simd4f_create( 1,    9,   17,   25 ),
+                                     simd4f_create( 3,   11,   19,   27 ),
+                                     simd4f_create( 5,   13,   21,   29 ),
+                                     simd4f_create( 7,   15,   23,   31 ));
+
+        simd4f b = simd4f_create( 26,  -28,   30,  -32 );
+        
+        simd4f x;
+        simd4x4f_matrix_vector3_mul(&a, &b, &x);
+        // The result x is computed but never compared: no expected value has been filled in yet.
+        // TODO octave simd4f: 
+        
+    }
+
+    it("should have simd4x4f_matrix_point3_mul") { /* TODO */ }
+
+    it("should have simd4x4f_inv_ortho_matrix_point3_mul for transforming point with inverse of an orthonormal matrix") {
+        // x/y/z hold an orthonormal basis (a quarter turn about z); w holds (1,2,3,1), the last column of the octave matrix below.
+        simd4x4f a = simd4x4f_create(simd4f_create( 0,  -1,   0,   0 ),
+                                     simd4f_create( 1,   0,   0,   0 ),
+                                     simd4f_create( 0,   0,   1,   0 ),
+                                     simd4f_create( 1,   2,   3,   1 ));
+        // b is passed with w = 0, while the octave reference uses [5;6;7;1] and then masks w out of the result.
+        simd4f b = simd4f_create(5,6,7,0);
+
+        simd4f x;
+        simd4x4f_inv_ortho_matrix_point3_mul(&a, &b, &x);
+
+        // octave simd4f: inverse([0,1,0,1; -1,0,0,2; 0,0,1,3; 0,0,0,1]) * [5;6;7;1] .* [1;1;1;0]
+        should_be_equal_simd4f(x, simd4f_create(-4.000000000000000f, 4.000000000000000f, 4.000000000000000f, 0.000000000000000f), epsilon );
+    }
+
+
+    it("should have simd4x4f_matrix_mul for matrix multiply") {
+        
+        simd4x4f a = simd4x4f_create(simd4f_create( 1,    9,   17,   25 ),
+                                     simd4f_create( 3,   11,   19,   27 ),
+                                     simd4f_create( 5,   13,   21,   29 ),
+                                     simd4f_create( 7,   15,   23,   31 ));
+
+        simd4x4f b = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
+                                     simd4f_create( -4,   12,  -20,   28 ),
+                                     simd4f_create(  6,  -14,   22,  -30 ),
+                                     simd4f_create( -8,   16,  -24,   32 ));
+        
+        simd4x4f x;
+        simd4x4f_matrix_mul(&a, &b, &x);
+        
+        // octave simd4x4f: [1,3,5,7;9,11,13,15;17,19,21,23;25,27,29,31] * [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(-120.000000000000000f, -248.000000000000000f, -376.000000000000000f, -504.000000000000000f), simd4f_create(128.000000000000000f, 256.000000000000000f, 384.000000000000000f, 512.000000000000000f), simd4f_create(-136.000000000000000f, -264.000000000000000f, -392.000000000000000f, -520.000000000000000f), simd4f_create(144.000000000000000f, 272.000000000000000f, 400.000000000000000f, 528.000000000000000f)), epsilon );
+    }
+    
+    
+    
+    
+    it("should have simd4x4f_inverse for calculating inverse matrix") {
+        // Two checks: the inverse against octave's values, then inverse * original against identity.
+        simd4x4f a = simd4x4f_create(simd4f_create(7,  2,  87,  5 ),
+                                   simd4f_create(5,  24,  6,  3 ),
+                                   simd4f_create(4,  6, 5, 6 ),
+                                   simd4f_create(5, 7, 4, 6 ));
+        
+        simd4x4f x;
+        simd4x4f_inverse(&a, &x);
+        
+        // octave simd4x4f: inverse( [7,5,4,5 ; 2,24,6,7 ; 87,6,5,4 ; 5,3,6,6] )
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(0.015309310560300f, -0.049885440533222f, -1.081337221412206f, 1.093522182878568f), simd4f_create(-0.004061653822120f, 0.054051239325141f, 0.123620079150177f, -0.147260987294314f), simd4f_create(0.011247656738180f, 0.004165798791918f, 0.042282857737971f, -0.053738804415747f), simd4f_create(-0.015517600499896f, -0.024265777962924f, 0.728702353676318f, -0.536971464278276f)), epsilon );
+
+        simd4x4f x2;
+        simd4x4f_matrix_mul(&x, &a, &x2);
+        simd4x4f identity;
+        simd4x4f_identity(&identity);
+        // Allow a larger error when checking that the inverse times the original matrix gives identity.
+        const int epsilon = 0x35100000; // shadows the file-level epsilon; 0x35100000 is the binary32 bit pattern of about 5.4e-7f
+        should_be_equal_simd4x4f(x2, identity, epsilon);
+        // NOTE(review): presumably the helper compares by integer (ULP) distance, so entries expected to be 0 may be off by about 5.4e-7 -- confirm in spec_helper.h.
+    }
+    
+    
+    
+}
+
+
+describe(simd4x4f, "math on elements") {
+    
+    it("should have simd4x4f_add for element-wise addition") {
+        
+        simd4x4f a = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
+                                     simd4f_create(5,  6,  7,  8 ),
+                                     simd4f_create(9,  10, 11, 12 ),
+                                     simd4f_create(13, 14, 15, 16 ));
+        
+        simd4x4f b = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
+                                     simd4f_create( -4,   12,  -20,   28 ),
+                                     simd4f_create(  6,  -14,   22,  -30 ),
+                                     simd4f_create( -8,   16,  -24,   32 ));
+
+        simd4x4f x;
+        
+        simd4x4f_add(&a, &b, &x);
+                                 
+
+        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] + [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(3.000000000000000f, -8.000000000000000f, 21.000000000000000f, -22.000000000000000f), simd4f_create(1.000000000000000f, 18.000000000000000f, -13.000000000000000f, 36.000000000000000f), simd4f_create(15.000000000000000f, -4.000000000000000f, 33.000000000000000f, -18.000000000000000f), simd4f_create(5.000000000000000f, 30.000000000000000f, -9.000000000000000f, 48.000000000000000f)), epsilon );
+        
+    }
+
+    it("should have simd4x4f_sub for element-wise subtraction") {
+        // a and b use the same values as the neighbouring add/mul/div cases.
+        simd4x4f a = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
+                                     simd4f_create(5,  6,  7,  8 ),
+                                     simd4f_create(9,  10, 11, 12 ),
+                                     simd4f_create(13, 14, 15, 16 ));
+        
+        simd4x4f b = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
+                                     simd4f_create( -4,   12,  -20,   28 ),
+                                     simd4f_create(  6,  -14,   22,  -30 ),
+                                     simd4f_create( -8,   16,  -24,   32 ));
+
+        simd4x4f x;
+        // Operand order matters: x receives a - b, e.g. the first vector is (1-2, 2+10, 3-18, 4+26).
+        simd4x4f_sub(&a, &b, &x);
+                                 
+
+        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] - [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(-1.000000000000000f, 12.000000000000000f, -15.000000000000000f, 30.000000000000000f), simd4f_create(9.000000000000000f, -6.000000000000000f, 27.000000000000000f, -20.000000000000000f), simd4f_create(3.000000000000000f, 24.000000000000000f, -11.000000000000000f, 42.000000000000000f), simd4f_create(21.000000000000000f, -2.000000000000000f, 39.000000000000000f, -16.000000000000000f)), epsilon );
+        
+    }
+
+    it("should have simd4x4f_mul for element-wise multiplication") {
+        
+        simd4x4f a = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
+                                     simd4f_create(5,  6,  7,  8 ),
+                                     simd4f_create(9,  10, 11, 12 ),
+                                     simd4f_create(13, 14, 15, 16 ));
+        
+        simd4x4f b = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
+                                     simd4f_create( -4,   12,  -20,   28 ),
+                                     simd4f_create(  6,  -14,   22,  -30 ),
+                                     simd4f_create( -8,   16,  -24,   32 ));
+
+        simd4x4f x;
+        
+        simd4x4f_mul(&a, &b, &x);
+                                 
+
+        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] .* [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(2.000000000000000f, -20.000000000000000f, 54.000000000000000f, -104.000000000000000f), simd4f_create(-20.000000000000000f, 72.000000000000000f, -140.000000000000000f, 224.000000000000000f), simd4f_create(54.000000000000000f, -140.000000000000000f, 242.000000000000000f, -360.000000000000000f), simd4f_create(-104.000000000000000f, 224.000000000000000f, -360.000000000000000f, 512.000000000000000f)), epsilon );
+        
+    }
+
+    it("should have simd4x4f_div for element-wise division") {
+        
+        simd4x4f a = simd4x4f_create(simd4f_create(1,  2,  3,  4 ),
+                                     simd4f_create(5,  6,  7,  8 ),
+                                     simd4f_create(9,  10, 11, 12 ),
+                                     simd4f_create(13, 14, 15, 16 ));
+        
+        simd4x4f b = simd4x4f_create(simd4f_create(  2 , -10,   18 , -26 ),
+                                     simd4f_create( -4,   12,  -20,   28 ),
+                                     simd4f_create(  6,  -14,   22,  -30 ),
+                                     simd4f_create( -8,   16,  -24,   32 ));
+
+        simd4x4f x;
+        
+        simd4x4f_div(&a, &b, &x);
+                                 
+
+        // octave simd4x4f: [1,5,9,13 ; 2,6,10,14 ; 3,7,11,15 ; 4,8,12,16 ] ./ [2,-4,6,-8;-10,12,-14,16;18,-20,22,-24;-26,28,-30,32]
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(0.500000000000000f, -0.200000000000000f, 0.166666666666667f, -0.153846153846154f), simd4f_create(-1.250000000000000f, 0.500000000000000f, -0.350000000000000f, 0.285714285714286f), simd4f_create(1.500000000000000f, -0.714285714285714f, 0.500000000000000f, -0.400000000000000f), simd4f_create(-1.625000000000000f, 0.875000000000000f, -0.625000000000000f, 0.500000000000000f)), epsilon );
+        
+    }
+    
+    
+}
+
+
+describe(simd4x4f, "creating projection and view matrices") {
+
+    it("should have simd4x4f_perspective for creating perspective projection matrix") {
+        
+        const float fov = 10.0f * M_PI / 180.0f;
+        const float aspect = 1.6f;
+        const float znear = 2.0f;
+        const float zfar = 50.0f;
+
+        const int epsilon = 50;
+        
+        simd4x4f x;
+        simd4x4f_perspective(&x, fov, aspect, znear, zfar);
+        
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(7.14378, 0, 0, 0),
+                                                    simd4f_create(0, 11.4301, 0, 0),
+                                                    simd4f_create(0, 0, -1.08333, -1),
+                                                    simd4f_create(-0, -0, -4.16667, -0)), epsilon);
+        
+        
+    }
+
+    it("should have simd4x4f_ortho for creating orthogonal projection matrix") {
+        
+
+        simd4x4f x;
+        simd4x4f_ortho(&x, -10, 20, -30, 40, -50, 60);
+        const int epsilon = 20;        
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(0.0666667, 0, 0, 0),
+                                                    simd4f_create(0, 0.0285714, 0, 0),
+                                                    simd4f_create(-0, -0, -0.0181818, -0),
+                                                    simd4f_create(-0.333333, -0.142857, -0.0909091, 1)), epsilon);
+        
+        
+    }
+    
+    it("should have simd4x4f_lookat for creating look-at matrix") {
+        
+        simd4f eye = simd4f_create(1,2,3,0);
+        simd4f center = simd4f_create(3,4,5,0);
+        simd4f up = simd4f_create(0,1,0,0);
+
+        simd4x4f x;
+        simd4x4f_lookat(&x, eye, center, up);
+
+        const int epsilon = 40;
+        
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(-0.707107, -0.408248, -0.57735, 0),
+                                                    simd4f_create(0, 0.816497, -0.57735, 0),
+                                                    simd4f_create(0.707107, -0.408248, -0.57735, 0),
+                                                    simd4f_create(-1.41421, 0, 3.4641, 1)), epsilon);
+
+        
+    }
+    
+    
+    it("should have simd4x4f_translation for creating translation matrix") {
+        
+        simd4x4f x;
+        simd4x4f_translation(&x, 1,2,3);
+
+        // octave simd4x4f: [1,0,0,1; 0,1,0,2; 0,0,1,3; 0,0,0,1]
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(1.000000000000000f, 0.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 1.000000000000000f, 0.000000000000000f, 0.000000000000000f), simd4f_create(0.000000000000000f, 0.000000000000000f, 1.000000000000000f, 0.000000000000000f), simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 1.000000000000000f)), epsilon );
+    }
+
+    it("should have simd4x4f_axis_rotation for creating a rotation matrix along a axis") {
+        
+        simd4x4f x;
+
+        simd4x4f_axis_rotation(&x, 45 * M_PI / 180.0f, simd4f_create(1,2,3,0));
+
+        const int epsilon = 20;
+
+        should_be_equal_simd4x4f(x, simd4x4f_create(simd4f_create(0.728028, 0.608789, -0.315202, 0),
+                                                   simd4f_create(-0.525105, 0.790791, 0.314508, 0),
+                                                   simd4f_create(0.440727, -0.0634566, 0.895395, 0),
+                                                   simd4f_create(0, 0, 0, 1)), epsilon);
+
+
+    }
+    
+    
+}
+
+
--- a/3rdparty/vectorial/spec/spec_vec2f.cpp
+++ b/3rdparty/vectorial/spec/spec_vec2f.cpp
@ -0,0 +1,255 @@
+#include "spec_helper.h"
+#include <iostream>
+using vectorial::vec2f;
+
+const int epsilon = 1;
+
+describe(vec2f, "constructing") { // the three ways a vec2f is built in these specs
+    it("should have default constructor that does nothing..") {
+        vec2f x; // compile-only check: nothing is asserted about the contents
+    }
+
+    it("should have constructor with element values") {
+        vec2f x(10,20);
+        // octave vec2f: [10,20]
+        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
+        // The expected value is written as a four-lane simd4f with z and w given as 0.0f.
+    }
+
+    it("should have constructor that loads from a float array") {
+        float ary[2] = { 1,2 };
+        vec2f x(ary); // built directly from the two-element array
+        // octave vec2f: [1,2]
+        should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
+    }
+
+}
+
+describe(vec2f, "loads and stores") { // round trip between vec2f and a plain float[2]
+
+    it("should have method for loading from a float array") {
+        float ary[2] = { 1, 2 };
+        vec2f x(-1, -1 ); // starts at -1 so the check fails if load() leaves a lane untouched
+        x.load(ary);
+        // octave vec2f: [1,2]
+        should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
+    }
+
+    it("should have method for storing to a float array") {
+        float ary[2] = { -1, -1 }; // starts at -1 so the check fails if store() leaves an element untouched
+        vec2f x(1, 2);
+        x.store(ary);
+        should_be_close_to(ary[0], 1, epsilon);
+        should_be_close_to(ary[1], 2, epsilon);
+    }
+
+}
+
+
+describe(vec2f, "arithmetic with another vec2f") {
+    
+    it("should have operator+ for component-wise addition") {
+        vec2f a(1,2);
+        vec2f b(10,20);
+        vec2f x = a + b;
+        // octave vec2f: [1,2] + [10,20]
+        should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction") {
+        vec2f a(1,2);
+        vec2f b(10,20);
+        vec2f x = b - a;
+        // octave vec2f:  [10,20] - [1,2]
+        should_be_equal_vec2f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication") {
+        vec2f a(1,2);
+        vec2f b(10,20);
+        vec2f x = a * b;
+        // octave vec2f: [1,2] .* [10,20]
+        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator/ for component-wise division") {
+        vec2f a(1,2);
+        vec2f b(10,20);
+        vec2f x = b / a;
+        // octave vec2f:  [10,20] ./ [1,2]
+        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+   
+
+
+    it("should have operator+= for component-wise addition") {
+        vec2f x(1,2);
+        vec2f b(10,20);
+        x += b;
+        // octave vec2f: [1,2] + [10,20]
+        should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator-= for component-wise subtraction") {
+        vec2f a(1,2);
+        vec2f x(10,20);
+        x -= a;
+        // octave vec2f:  [10,20] - [1,2]
+        should_be_equal_vec2f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator*= for component-wise multiplication") {
+        vec2f x(1,2);
+        vec2f b(10,20);
+        x *= b;
+        // octave vec2f: [1,2] .* [10,20]
+        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator/= for component-wise division") {
+        vec2f a(1,2);
+        vec2f x(10,20);
+        x /= a;
+        // octave vec2f:  [10,20] ./ [1,2]
+        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+
+}
+
+
+describe(vec2f, "arithmetic with scalar") {
+    
+    it("should have operator+ for component-wise addition") {
+        vec2f a(1,2);
+        float b=10;
+        vec2f x = a + b;
+        // octave vec2f: [1,2] + 10
+        should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction") {
+        float a=10;
+        vec2f b(10,20);
+        vec2f x = b - a;
+        // octave vec2f:  [10,20] - 10
+        should_be_equal_vec2f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication") {
+        vec2f a(1,2);
+        float b=10;
+        vec2f x = a * b;
+        // octave vec2f: [1,2] .* 10
+        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator/ for component-wise division") {
+        vec2f a(10,20);
+        float b=10;
+        vec2f x = a / b;
+        // octave vec2f: [10,20] ./ 10
+        should_be_equal_vec2f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+
+
+    it("should have operator+ for component-wise addition (float as lhs)") {
+        vec2f b(1,2);
+        float a=10;
+        vec2f x = a + b;
+        // octave vec2f: 10 + [1,2]
+        should_be_equal_vec2f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction (float as lhs)") {
+        float b=50;
+        vec2f a(10,20);
+        vec2f x = b - a;
+        // octave vec2f:  50 - [10,20]
+        should_be_equal_vec2f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication (float as lhs)") {
+        vec2f b(1,2);
+        float a=10;
+        vec2f x = a * b;
+        // octave vec2f: 10 .* [1,2] 
+        should_be_equal_vec2f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator/ for component-wise division (float as lhs)") {
+        vec2f b(10,20);
+        float a=40;
+        vec2f x = a / b; // scalar on the left: each lane is 40 divided by the matching lane of b
+        // octave vec2f: 40 ./ [10,20] 
+        should_be_equal_vec2f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 0.0f, 0.0f), epsilon );
+
+    }
+
+    
+}
+
+
+
+describe(vec2f, "vector math") { // negation plus the free functions in namespace vectorial
+
+    it("should have unary minus operator") {
+        vec2f a(1,2);
+        vec2f x = -a;
+        // octave vec2f: -[1,2]
+        should_be_equal_vec2f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, 0.0f, 0.0f), epsilon );
+    }
+
+
+    it("should have dot function") {
+        vec2f a(1,2);
+        vec2f b(6,7);
+        float x = vectorial::dot(a,b);
+        // 1*6 + 2*7 = 20
+        // octave vec2f: dot([1,2],[6,7])
+        should_be_close_to(x, 20.000000000000000f, epsilon );
+    }
+
+    it("should have length_squared function") {
+        vec2f a(1,2);
+        float x = vectorial::length_squared(a);
+        // The reference value is the dot product of a with itself: 1 + 4 = 5.
+        // octave vec2f: dot([1,2],[1,2])
+        should_be_close_to(x, 5.000000000000000f, epsilon );
+    }
+
+    it("should have length function") {
+        vec2f a(1,2);
+        float x = vectorial::length(a);
+        // The reference value is the Euclidean norm, sqrt(5).
+        // octave vec2f: norm([1,2])
+        should_be_close_to(x, 2.236067977499790f, epsilon );
+    }
+    
+    
+    it("should have normalize function") {
+        vec2f a(1,2);
+        vec2f x = vectorial::normalize(a); // reference: a divided by its norm
+        // octave vec2f: [1,2] / norm([1,2])
+        should_be_equal_vec2f(x, simd4f_create(0.447213595499958f, 0.894427190999916f, 0.0f, 0.0f), epsilon );
+    }
+
+}
+
+
--- a/3rdparty/vectorial/spec/spec_vec3f.cpp
+++ b/3rdparty/vectorial/spec/spec_vec3f.cpp
@ -0,0 +1,263 @@
+#include "spec_helper.h"
+#include <iostream>
+using vectorial::vec3f;
+
+const int epsilon = 1;
+
+describe(vec3f, "constructing") { // the three ways a vec3f is built in these specs
+    it("should have default constructor that does nothing..") {
+        vec3f x; // compile-only check: nothing is asserted about the contents
+    }
+
+    it("should have constructor with element values") {
+        vec3f x(10,20,30);
+        // octave vec3f: [10,20,30]
+        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
+        // The expected value is written as a four-lane simd4f with w given as 0.0f.
+    }
+
+    it("should have constructor that loads from a float array") {
+        float ary[3] = { 1,2,3 };
+        vec3f x(ary); // built directly from the three-element array
+        // octave vec3f: [1,2,3]
+        should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
+    }
+
+}
+
+describe(vec3f, "loads and stores") { // round trip between vec3f and a plain float[3]
+
+    it("should have method for loading from a float array") {
+        float ary[3] = { 1,2,3 };
+        vec3f x(-1, -1, -1 ); // starts at -1 so the check fails if load() leaves a lane untouched
+        x.load(ary);
+        // octave vec3f: [1,2,3]
+        should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
+    }
+
+    it("should have method for storing to a float array") {
+        float ary[3] = { -1, -1, -1 }; // starts at -1 so the check fails if store() leaves an element untouched
+        vec3f x(1, 2, 3);
+        x.store(ary);
+        should_be_close_to(ary[0], 1, epsilon);
+        should_be_close_to(ary[1], 2, epsilon);
+        should_be_close_to(ary[2], 3, epsilon);
+    }
+
+}
+
+describe(vec3f, "arithmetic with another vec3f") {
+    
+    it("should have operator+ for component-wise addition") {
+        vec3f a(1,2,3);
+        vec3f b(10,20,30);
+        vec3f x = a + b;
+        // octave vec3f: [1,2,3] + [10,20,30]
+        should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction") {
+        vec3f a(1,2,3);
+        vec3f b(10,20,30);
+        vec3f x = b - a;
+        // octave vec3f:  [10,20,30] - [1,2,3]
+        should_be_equal_vec3f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication") {
+        vec3f a(1,2,3);
+        vec3f b(10,20,30);
+        vec3f x = a * b;
+        // octave vec3f: [1,2,3] .* [10,20,30]
+        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator/ for component-wise division") {
+        vec3f a(1,2,3);
+        vec3f b(10,20,30);
+        vec3f x = b / a;
+        // octave vec3f:  [10,20,30] ./ [1,2,3]
+        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 0.0f), epsilon );
+
+    }
+
+
+
+    it("should have operator+= for component-wise addition") {
+        vec3f x(1,2,3);
+        vec3f b(10,20,30);
+        x += b;
+        // octave vec3f: [1,2,3] + [10,20,30]
+        should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator-= for component-wise subtraction") {
+        vec3f a(1,2,3);
+        vec3f x(10,20,30);
+        x -= a;
+        // octave vec3f:  [10,20,30] - [1,2,3]
+        should_be_equal_vec3f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator*= for component-wise multiplication") {
+        vec3f x(1,2,3);
+        vec3f b(10,20,30);
+        x *= b;
+        // octave vec3f: [1,2,3] .* [10,20,30]
+        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator/= for component-wise division") {
+        vec3f a(1,2,3);
+        vec3f x(10,20,30);
+        x /= a;
+        // octave vec3f:  [10,20,30] ./ [1,2,3]
+        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 0.0f), epsilon );
+
+    }
+    
+}
+
+
+describe(vec3f, "arithmetic with scalar") {
+    
+    it("should have operator+ for component-wise addition") {
+        vec3f a(1,2,3);
+        float b=10;
+        vec3f x = a + b;
+        // octave vec3f: [1,2,3] + 10
+        should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction") {
+        float a=10;
+        vec3f b(10,20,30);
+        vec3f x = b - a;
+        // octave vec3f:  [10,20,30] - 10
+        should_be_equal_vec3f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 20.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication") {
+        vec3f a(1,2,3);
+        float b=10;
+        vec3f x = a * b;
+        // octave vec3f: [1,2,3] .* 10
+        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator/ for component-wise division") {
+        vec3f a(10,20,30);
+        float b=10;
+        vec3f x = a / b;
+        // octave vec3f: [10,20,30] ./ 10
+        should_be_equal_vec3f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 0.0f), epsilon );
+
+    }
+
+
+
+    it("should have operator+ for component-wise addition (float as lhs)") {
+        vec3f b(1,2,3);
+        float a=10;
+        vec3f x = a + b;
+        // octave vec3f: 10 + [1,2,3]
+        should_be_equal_vec3f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction (float as lhs)") {
+        float b=50;
+        vec3f a(10,20,30);
+        vec3f x = b - a;
+        // octave vec3f:  50 - [10,20,30]
+        should_be_equal_vec3f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 20.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication (float as lhs)") {
+        vec3f b(1,2,3);
+        float a=10;
+        vec3f x = a * b;
+        // octave vec3f: 10 .* [1,2,3] 
+        should_be_equal_vec3f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 0.0f), epsilon );
+
+    }
+
+    it("should have operator/ for component-wise division (float as lhs)") {
+        vec3f b(10,20,30);
+        float a=40;
+        vec3f x = a / b; // scalar on the left: each lane is 40 divided by the matching lane of b
+        // octave vec3f: 40 ./ [10,20,30] 
+        should_be_equal_vec3f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 1.333333333333333f, 0.0f), epsilon );
+
+    }
+
+    
+}
+
+
+
+describe(vec3f, "vector math") { // negation plus the free functions in namespace vectorial
+
+    it("should have unary minus operator") {
+        vec3f a(1,2,3);
+        vec3f x = -a;
+        // octave vec3f: -[1,2,3]
+        should_be_equal_vec3f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, -3.000000000000000f, 0.0f), epsilon );
+    }
+
+
+    it("should have dot function") {
+        vec3f a(1,2,3);
+        vec3f b(6,7,8);
+        float x = vectorial::dot(a,b);
+        // 1*6 + 2*7 + 3*8 = 44
+        // octave vec3f: dot([1,2,3],[6,7,8])
+        should_be_close_to(x, 44.000000000000000f, epsilon );
+    }
+
+    it("should have cross function") {
+        vec3f a(1,2,3);
+        vec3f b(6,7,8);
+        vec3f x = vectorial::cross(a,b);
+        // (2*8 - 3*7, 3*6 - 1*8, 1*7 - 2*6) = (-5, 10, -5)
+        // octave vec3f: cross([1,2,3],[6,7,8])
+        should_be_equal_vec3f(x, simd4f_create(-5.000000000000000f, 10.000000000000000f, -5.000000000000000f, 0.0f), epsilon );
+    }
+
+    it("should have length_squared function") {
+        vec3f a(1,2,3);
+        float x = vectorial::length_squared(a);
+        // The reference value is the dot product of a with itself: 1 + 4 + 9 = 14.
+        // octave vec3f: dot([1,2,3],[1,2,3])
+        should_be_close_to(x, 14.000000000000000f, epsilon );
+    }
+
+    it("should have length function") {
+        vec3f a(1,2,3);
+        float x = vectorial::length(a);
+        // The reference value is the Euclidean norm, sqrt(14).
+        // octave vec3f: norm([1,2,3])
+        should_be_close_to(x, 3.741657386773941f, epsilon );
+    }
+    
+    
+    it("should have normalize function") {
+        vec3f a(1,2,3);
+        vec3f x = vectorial::normalize(a); // reference: a divided by its norm
+        // octave vec3f: [1,2,3] / norm([1,2,3])
+        should_be_equal_vec3f(x, simd4f_create(0.267261241912424f, 0.534522483824849f, 0.801783725737273f, 0.0f), epsilon );
+    }
+
+}
+
+
--- a/3rdparty/vectorial/spec/spec_vec4f.cpp
+++ b/3rdparty/vectorial/spec/spec_vec4f.cpp
@ -0,0 +1,258 @@
+#include "spec_helper.h"
+#include <iostream>
+using vectorial::vec4f;
+
+const int epsilon = 1;
+
+// Specs for the vec4f constructors: default, four element values, and a
+// float[4] array. The assertion under each "// octave vec4f:" comment is
+// regenerated by tools/update_spec.rb, so keep the comment directly above it
+// and keep the checked variable named `x`.
+describe(vec4f, "constructing") {
+    it("should have default constructor that does nothing..") {
+        // No assertion here: the case only default-constructs a vec4f.
+        vec4f x;
+    }
+
+    it("should have constructor with element values") {
+        vec4f x(10,20,30,40);
+        // octave vec4f: [10,20,30,40]
+        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
+        
+    }
+
+    it("should have constructor that loads from a float array") {
+        float ary[4] = { 1,2,3,4 };
+        vec4f x(ary);
+        // octave vec4f: [1,2,3,4]
+        should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
+    }
+
+}
+
+// Specs for vec4f::load and vec4f::store with a float[4] array. In both cases
+// the destination is pre-filled with -1 so the check only passes if the call
+// really overwrote all four elements.
+describe(vec4f, "loads and stores") {
+
+
+    it("should have method for loading from a float array") {
+        float ary[4] = { 1,2,3,4 };
+        vec4f x(-1, -1, -1, -1);
+        x.load(ary);
+        // octave vec4f: [1,2,3,4]
+        should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
+    }
+
+    it("should have method for storing to a float array") {
+        float ary[4] = { -1, -1, -1, -1 };
+        vec4f x(1, 2, 3, 4);
+        x.store(ary);
+        // Hand-written checks: there is no "// octave" comment here, so these
+        // lines are not touched by tools/update_spec.rb.
+        should_be_close_to(ary[0], 1, epsilon);
+        should_be_close_to(ary[1], 2, epsilon);
+        should_be_close_to(ary[2], 3, epsilon);
+        should_be_close_to(ary[3], 4, epsilon);
+    }
+
+}
+
+// Specs for component-wise vec4f (op) vec4f arithmetic: the binary operators
+// + - * / first, then the compound assignments += -= *= /= with the same
+// operands and expected values.
+//
+// The assertion under each "// octave vec4f:" comment is regenerated by
+// tools/update_spec.rb from the Octave expression in the comment; keep the
+// comment directly above it and keep the checked variable named `x`.
+describe(vec4f, "arithmetic with another vec4f") {
+    
+    it("should have operator+ for component-wise addition") {
+        vec4f a(1,2,3,4);
+        vec4f b(10,20,30,40);
+        vec4f x = a + b;
+        // octave vec4f: [1,2,3,4] + [10,20,30,40]
+        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction") {
+        vec4f a(1,2,3,4);
+        vec4f b(10,20,30,40);
+        vec4f x = b - a;
+        // octave vec4f:  [10,20,30,40] - [1,2,3,4]
+        should_be_equal_vec4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication") {
+        vec4f a(1,2,3,4);
+        vec4f b(10,20,30,40);
+        vec4f x = a * b;
+        // octave vec4f: [1,2,3,4] .* [10,20,30,40]
+        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator/ for component-wise division") {
+        vec4f a(1,2,3,4);
+        vec4f b(10,20,30,40);
+        vec4f x = b / a;
+        // octave vec4f:  [10,20,30,40] ./ [1,2,3,4]
+        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
+
+    }
+
+
+
+	
+    // Compound-assignment variants: `x` is the left operand and is modified in place.
+    it("should have operator+= for component-wise addition") {
+        vec4f x(1,2,3,4);
+        vec4f b(10,20,30,40);
+        x += b;
+        // octave vec4f: [1,2,3,4] + [10,20,30,40]
+        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 22.000000000000000f, 33.000000000000000f, 44.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator-= for component-wise subtraction") {
+        vec4f a(1,2,3,4);
+        vec4f x(10,20,30,40);
+        x -= a;
+        // octave vec4f:  [10,20,30,40] - [1,2,3,4]
+        should_be_equal_vec4f(x, simd4f_create(9.000000000000000f, 18.000000000000000f, 27.000000000000000f, 36.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator*= for component-wise multiplication") {
+        vec4f x(1,2,3,4);
+        vec4f b(10,20,30,40);
+        x *= b;
+        // octave vec4f: [1,2,3,4] .* [10,20,30,40]
+        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 40.000000000000000f, 90.000000000000000f, 160.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator/= for component-wise division") {
+        vec4f a(1,2,3,4);
+        vec4f x(10,20,30,40);
+        x /= a;
+        // octave vec4f:  [10,20,30,40] ./ [1,2,3,4]
+        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 10.000000000000000f, 10.000000000000000f, 10.000000000000000f), epsilon );
+
+    }
+
+
+    
+}
+
+
+// Specs for component-wise arithmetic between a vec4f and a float: first with
+// the vector on the left (v op s), then with the float on the left (s op v).
+//
+// The assertion under each "// octave vec4f:" comment is regenerated by
+// tools/update_spec.rb from the Octave expression in the comment; keep the
+// comment directly above it and keep the checked variable named `x`.
+describe(vec4f, "arithmetic with scalar") {
+    
+    it("should have operator+ for component-wise addition") {
+        vec4f a(1,2,3,4);
+        float b=10;
+        vec4f x = a + b;
+        // octave vec4f: [1,2,3,4] + 10
+        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 14.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction") {
+        float a=10;
+        vec4f b(10,20,30,40);
+        vec4f x = b - a;
+        // octave vec4f:  [10,20,30,40] - 10
+        should_be_equal_vec4f(x, simd4f_create(0.000000000000000f, 10.000000000000000f, 20.000000000000000f, 30.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication") {
+        vec4f a(1,2,3,4);
+        float b=10;
+        vec4f x = a * b;
+        // octave vec4f: [1,2,3,4] .* 10
+        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator/ for component-wise division") {
+        vec4f a(10,20,30,40);
+        float b=10;
+        vec4f x = a / b;
+        // octave vec4f: [10,20,30,40] ./ 10
+        should_be_equal_vec4f(x, simd4f_create(1.000000000000000f, 2.000000000000000f, 3.000000000000000f, 4.000000000000000f), epsilon );
+
+    }
+
+
+
+    it("should have operator+ for component-wise addition (float as lhs)") {
+        vec4f b(1,2,3,4);
+        float a=10;
+        vec4f x = a + b;
+        // octave vec4f: 10 + [1,2,3,4]
+        should_be_equal_vec4f(x, simd4f_create(11.000000000000000f, 12.000000000000000f, 13.000000000000000f, 14.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator- for component-wise subtraction (float as lhs)") {
+        float b=50;
+        vec4f a(10,20,30,40);
+        vec4f x = b - a;
+        // octave vec4f:  50 - [10,20,30,40]
+        should_be_equal_vec4f(x, simd4f_create(40.000000000000000f, 30.000000000000000f, 20.000000000000000f, 10.000000000000000f), epsilon );
+
+    }
+
+    it("should have operator* for component-wise multiplication (float as lhs)") {
+        vec4f b(1,2,3,4);
+        float a=10;
+        vec4f x = a * b;
+        // octave vec4f: 10 .* [1,2,3,4] 
+        should_be_equal_vec4f(x, simd4f_create(10.000000000000000f, 20.000000000000000f, 30.000000000000000f, 40.000000000000000f), epsilon );
+
+    }
+
+    // This case exercises float / vec4f; its label used to be a copy of the
+    // multiplication case above, which made failure reports ambiguous.
+    it("should have operator/ for component-wise division (float as lhs)") {
+        vec4f b(10,20,30,40);
+        float a=40;
+        vec4f x = a / b;
+        // octave vec4f: 40 ./ [10,20,30,40] 
+        should_be_equal_vec4f(x, simd4f_create(4.000000000000000f, 2.000000000000000f, 1.333333333333333f, 1.000000000000000f), epsilon );
+
+    }
+
+    
+}
+
+
+
+// Specs for vec4f unary minus and the free functions vectorial::dot,
+// length_squared, length and normalize.
+//
+// The assertion under each "// octave vec4f:" comment is regenerated by
+// tools/update_spec.rb from the Octave expression in the comment; keep the
+// comment directly above it and keep the checked variable named `x`.
+describe(vec4f, "vector math") {
+
+    it("should have unary minus operator") {
+        vec4f a(1,2,3,4);
+        vec4f x = -a;
+        // octave vec4f: -[1,2,3,4]
+        should_be_equal_vec4f(x, simd4f_create(-1.000000000000000f, -2.000000000000000f, -3.000000000000000f, -4.000000000000000f), epsilon );
+    }
+
+    it("should have dot function") {
+        vec4f a(1,2,3,4);
+        vec4f b(6,7,8,9);
+        float x = vectorial::dot(a,b);
+        
+        // octave vec4f: dot([1,2,3,4],[6,7,8,9])
+        should_be_close_to(x, 80.000000000000000f, epsilon );
+    }
+
+    it("should have length_squared function") {
+        vec4f a(1,2,3,4);
+        float x = vectorial::length_squared(a);
+        
+        // octave vec4f: dot([1,2,3,4],[1,2,3,4])
+        should_be_close_to(x, 30.000000000000000f, epsilon );
+    }
+
+    it("should have length function") {
+        vec4f a(1,2,3,4);
+        float x = vectorial::length(a);
+        
+        // octave vec4f: norm([1,2,3,4])
+        should_be_close_to(x, 5.477225575051661f, epsilon );
+    }
+    
+    
+    it("should have normalize function") {
+        vec4f a(1,2,3,4);
+        vec4f x = vectorial::normalize(a);
+        // octave vec4f: [1,2,3,4] / norm([1,2,3,4])
+        should_be_equal_vec4f(x, simd4f_create(0.182574185835055f, 0.365148371670111f, 0.547722557505166f, 0.730296743340221f), epsilon );
+    }
+
+}
+
+
--- a/3rdparty/vectorial/tools/spechelper.m
+++ b/3rdparty/vectorial/tools/spechelper.m
@ -0,0 +1,45 @@
+#!/usr/bin/env octave
+
+1;
+
+function spec_formatter (val,type)
+    # Print (without a trailing newline) the C++ assertion line that checks a
+    # spec result named `x` against the Octave value `val`.
+    #
+    #   val   scalar, 1x2, 1x3, 1x4, 4x1 or 4x4 numeric value
+    #   type  suffix of the should_be_equal_<type> helper to emit; "simd2f"
+    #         additionally selects simd2f_create for 1x2 values
+    #
+    # Scalars are emitted as should_be_close_to. Two- and three-element values
+    # are padded with 0.0f up to the four lanes of simd4f_create. A 4x4 value
+    # is emitted through linear indexing val(1)..val(16), i.e. in Octave's
+    # column-major order, four elements per simd4f_create.
+    # Any other shape raises an error instead of silently printing nothing.
+
+    if( isscalar(val) )
+        printf("        should_be_close_to(x, %15.15ff, epsilon );", val);
+        return;
+    endif
+
+    # isequal() compares whole size vectors and never raises for N-d input,
+    # unlike an element-wise `size(val) == [r,c]`.
+    dims = size(val);
+
+    if( isequal(dims, [1,2]) )
+        if( strcmp(type,"simd2f") )
+            printf("        should_be_equal_%s(x, simd2f_create(%15.15ff, %15.15ff), epsilon );",type, val(1), val(2));
+        else
+            printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, 0.0f, 0.0f), epsilon );",type, val(1), val(2));
+        endif
+        return;
+    endif
+
+    if( isequal(dims, [1,3]) )
+        printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, 0.0f), epsilon );",type, val(1), val(2), val(3));
+        return;
+    endif
+
+    # Row and column four-vectors produce the same line.
+    if( isequal(dims, [1,4]) || isequal(dims, [4,1]) )
+        printf("        should_be_equal_%s(x, simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), epsilon );",type, val(1), val(2), val(3), val(4));
+        return;
+    endif
+
+    if( isequal(dims, [4,4]) )
+        printf("        should_be_equal_%s(x, simd4x4f_create(simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff), simd4f_create(%15.15ff, %15.15ff, %15.15ff, %15.15ff)), epsilon );",type, 
+        val(1), val(2), val(3), val(4), val(5), val(6), val(7), val(8), val(9), val(10), val(11), val(12), val(13), val(14), val(15), val(16)
+        );
+        return;
+    endif
+
+    error("spec_formatter: unsupported value size [%s]", num2str(dims));
+
+endfunction
+
--- a/3rdparty/vectorial/tools/update_spec.rb
+++ b/3rdparty/vectorial/tools/update_spec.rb
@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+
+SPECHELPER = File.join(File.dirname(__FILE__), "spechelper.m")
+# Evaluates the Octave expression +str+ and returns the C++ assertion line that
+# spec_formatter (tools/spechelper.m) prints for it; +type+ is passed through
+# to spec_formatter. Returns an empty string when Octave prints nothing.
+#
+# Octave is started directly with an argument array instead of through a
+# shell command string, so expressions containing a single quote (Octave's
+# transpose operator) or other shell metacharacters reach Octave unchanged.
+def octave_eval(str, type)
+  puts "evalling (#{type}): #{str}"
+  code = "source(\"#{SPECHELPER}\"); spec_formatter(#{str}, \"#{type}\")"
+  ret = IO.popen(["octave", "--quiet", "--eval", code]) { |io| io.read }
+  puts "    = #{ret.strip}"
+  ret
+end
+
+
+# For every file named on the command line, find each "// octave <type>: <expr>"
+# comment and replace the line that follows it with the assertion generated
+# from <expr>. The file is rewritten in place.
+ARGV.each do |fn|
+  str = File.read(fn)
+  str.gsub!(%r{(// octave (\w+):)(.*?)\n(.*?\n)}) do
+    # Copy the match groups before calling anything else.
+    marker, type, expr, old_line = $1, $2, $3, $4
+    generated = octave_eval(expr, type)
+
+    if generated.strip.empty?
+      # Octave failed or the value had an unsupported shape: keep the existing
+      # assertion rather than replacing it with a blank line.
+      warn "#{fn}: no output for '#{expr.strip}', keeping existing line"
+      [marker, expr, "\n", old_line].join
+    else
+      [marker, expr, "\n", generated, "\n"].join
+    end
+  end
+  File.open(fn, "w") do |f|
+    f.write str
+  end
+
+end
+
--- a/3rdparty/vectorial/vectorial.sln
+++ b/3rdparty/vectorial/vectorial.sln
@ -0,0 +1,31 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual C++ Express 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorial specsuite", "vectorial.vcproj", "{9450BCE8-02CB-4169-8471-2DFF764817F4}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "vectorial benchmark", "vectorialbenchmark.vcproj", "{1E78F64D-C404-4048-8AE6-217089480E8A}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Win32 = Debug|Win32
+		Release Scalar|Win32 = Release Scalar|Win32
+		Release SSE|Win32 = Release SSE|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Debug|Win32.ActiveCfg = Debug|Win32
+		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Debug|Win32.Build.0 = Debug|Win32
+		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release Scalar|Win32.ActiveCfg = Release Scalar|Win32
+		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release Scalar|Win32.Build.0 = Release Scalar|Win32
+		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release SSE|Win32.ActiveCfg = Release|Win32
+		{9450BCE8-02CB-4169-8471-2DFF764817F4}.Release SSE|Win32.Build.0 = Release|Win32
+		{1E78F64D-C404-4048-8AE6-217089480E8A}.Debug|Win32.ActiveCfg = Debug|Win32
+		{1E78F64D-C404-4048-8AE6-217089480E8A}.Debug|Win32.Build.0 = Debug|Win32
+		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release Scalar|Win32.ActiveCfg = Release Scalar|Win32
+		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release Scalar|Win32.Build.0 = Release Scalar|Win32
+		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release SSE|Win32.ActiveCfg = Release|Win32
+		{1E78F64D-C404-4048-8AE6-217089480E8A}.Release SSE|Win32.Build.0 = Release|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
--- a/3rdparty/vectorial/vectorial.vcproj
+++ b/3rdparty/vectorial/vectorial.vcproj
@ -0,0 +1,350 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9,00"
+	Name="vectorial specsuite"
+	ProjectGUID="{9450BCE8-02CB-4169-8471-2DFF764817F4}"
+	RootNamespace="vectorial specsuite"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="0"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="Debug"
+			IntermediateDirectory="Debug"
+			ConfigurationType="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="include"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE;"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				FloatingPointModel="0"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				Detect64BitPortabilityProblems="false"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="2"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="Release"
+			IntermediateDirectory="Release"
+			ConfigurationType="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				AdditionalIncludeDirectories="include"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;NOMINMAX"
+				RuntimeLibrary="2"
+				EnableEnhancedInstructionSet="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				Detect64BitPortabilityProblems="false"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="0"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release Scalar|Win32"
+			OutputDirectory="$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				AdditionalIncludeDirectories="include"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;"
+				RuntimeLibrary="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				Detect64BitPortabilityProblems="false"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="0"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="vectorial"
+			>
+			<File
+				RelativePath=".\include\vectorial\config.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_common.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_gnu.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_neon.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_scalar.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_sse.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f_gnu.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f_neon.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f_scalar.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f_sse.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\vec2f.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\vec3f.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\vec4f.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="spec"
+			>
+			<File
+				RelativePath=".\spec\spec.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec.h"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec_helper.h"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec_main.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec_mat4f.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec_simd4f.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec_simd4x4f.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec_vec2f.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec_vec3f.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\spec\spec_vec4f.cpp"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- a/3rdparty/vectorial/vectorialbenchmark.vcproj
+++ b/3rdparty/vectorial/vectorialbenchmark.vcproj
@ -0,0 +1,340 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9,00"
+	Name="vectorial benchmark"
+	ProjectGUID="{1E78F64D-C404-4048-8AE6-217089480E8A}"
+	RootNamespace="vectorialbenchmark"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform
+			Name="Win32"
+		/>
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="include"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="2"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="include"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE;NOMINMAX"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="false"
+				EnableEnhancedInstructionSet="2"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="1"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release Scalar|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="include"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="false"
+				EnableEnhancedInstructionSet="0"
+				FloatingPointModel="2"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="1"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<Filter
+			Name="vectorial"
+			>
+			<File
+				RelativePath=".\include\vectorial\config.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_common.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_gnu.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_neon.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_scalar.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4f_sse.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f_gnu.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f_neon.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f_scalar.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\simd4x4f_sse.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\vec2f.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\vec3f.h"
+				>
+			</File>
+			<File
+				RelativePath=".\include\vectorial\vec4f.h"
+				>
+			</File>
+		</Filter>
+		<Filter
+			Name="bench"
+			>
+			<File
+				RelativePath=".\bench\add_bench.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\bench\bench.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\bench\bench.h"
+				>
+			</File>
+			<File
+				RelativePath=".\bench\dot_bench.cpp"
+				>
+			</File>
+			<File
+				RelativePath=".\bench\quad_bench.cpp"
+				>
+			</File>
+		</Filter>
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
--- a/src/srender.c
+++ b/src/srender.c
@ -5,6 +5,8 @@
 #include "srender.h"

 #include "utils.h"
+
+#include <string.h>
 #include <assert.h>

 typedef struct srndr {
@ -17,23 +19,104 @@ typedef struct srview {
  GLuint mFrameBufferId;
  GLuint mColorTexture;
  GLuint mDepthTexture;
-} srveiw;
+
+	simd4x4f proj;
+	simd4x4f view;
+} srview;

 typedef struct srcmdbuf {
+	// Fixed-capacity array of render commands that is handed out slot by slot.
+	int ncmds;	// capacity: number of srcmd entries allocated in cmds
+	int idx;	// index of the next slot srcmd_create will hand out
+	srcmd* cmds;	// command storage, allocated by srcmdbuf_create
 } srcmdbuf;

+//
+// Vertex Data
+//
+// One vertex, readable either through named fields or through arrays. Both
+// anonymous structs must have the same layout so that pos / n / uv / color
+// alias x..w / nx..nz / s,t / r..a respectively.
+typedef union srvrtxdata {
+	struct {
+		float x, y, z, w;
+		float nx, ny, nz;
+		float s, t;
+		GLubyte r,g,b,a;
+	};
+	struct {
+		float pos[4];
+		float n[3];	// three floats to match nx, ny, nz; a fourth would shift uv and color
+		float uv[2];
+		GLubyte color[4];
+	};
+} srvrtxdata;
+
+//
+// Simple Mesh Data
+//
+// Vertex and index arrays of one mesh together with their element counts.
+typedef struct srmeshdata {
+	int nvertices;	// number of entries in vertices
+	int nindices;	// presumably the number of entries in indices -- TODO confirm
+
+	srvrtxdata *vertices;	// vertex array (heap-allocated by init_debug_meshes for the debug mesh)
+	int *indices;	// index array (heap-allocated by init_debug_meshes for the debug mesh)
+} srmeshdata;
+
+static srmeshdata gCoordFrameMesh = {0};
+
+// Fills gCoordFrameMesh with the coordinate-frame debug mesh: three unit
+// segments from the origin along +X (red), +Y (green) and +Z (blue), stored as
+// six vertices and three index pairs (one pair per segment).
+//
+// Must run at most once: the assert fires if the mesh is already populated.
+// If an allocation fails the mesh is left empty and an error is logged.
+void init_debug_meshes () {
+	assert (gCoordFrameMesh.nvertices == 0);
+
+	srvrtxdata coord_frame_vertices[] = {
+		{0.0f, 0.0f, 0.0f, 1.0f,	 0.0f, 0.0f, 0.0f,		0.0f, 0.0f,	255, 0, 0, 255  },
+		{1.0f, 0.0f, 0.0f, 1.0f,	 0.0f, 0.0f, 0.0f,		0.0f, 0.0f,	255, 0, 0, 255  },
+
+		{0.0f, 0.0f, 0.0f, 1.0f,	 0.0f, 0.0f, 0.0f,		0.0f, 0.0f,	  0, 255, 0, 255},
+		{0.0f, 1.0f, 0.0f, 1.0f,	 0.0f, 0.0f, 0.0f,		0.0f, 0.0f,	  0, 255, 0, 255},
+
+		{0.0f, 0.0f, 0.0f, 1.0f,	 0.0f, 0.0f, 0.0f,		0.0f, 0.0f,	  0, 0, 255, 255},
+		{0.0f, 0.0f, 1.0f, 1.0f,	 0.0f, 0.0f, 0.0f,		0.0f, 0.0f,	  0, 0, 255, 255}
+	};
+	// Each pair joins the two vertices of one axis. Declared as int so the
+	// element type matches srmeshdata.indices exactly.
+	int coord_frame_indices[] = {
+		0, 1,
+		2, 3,
+		4, 5
+	};
+
+	srvrtxdata* vertices = malloc (sizeof(coord_frame_vertices));
+	int* indices = malloc (sizeof(coord_frame_indices));
+	if (vertices == NULL || indices == NULL) {
+		gLog ("Error: could not allocate the coordinate frame debug mesh!");
+		free (vertices);
+		free (indices);
+		return;
+	}
+
+	memcpy (vertices, coord_frame_vertices, sizeof(coord_frame_vertices));
+	memcpy (indices, coord_frame_indices, sizeof (coord_frame_indices));
+
+	// Publish the mesh only once both arrays are ready; counts are derived
+	// from the tables so they cannot drift from the data.
+	gCoordFrameMesh.vertices = vertices;
+	gCoordFrameMesh.indices = indices;
+	gCoordFrameMesh.nvertices = sizeof(coord_frame_vertices) / sizeof(coord_frame_vertices[0]);
+	gCoordFrameMesh.nindices = sizeof(coord_frame_indices) / sizeof(coord_frame_indices[0]);
+}
+
+//
+// Renderer
+//
+// srndr_create: allocates a zero-initialized renderer with calloc and sets up
+// the file-level debug meshes. The calloc result is returned unchecked, so the
+// caller receives NULL if the allocation failed. init_debug_meshes asserts
+// that the debug mesh is still empty, so with assertions enabled a second call
+// to srndr_create aborts.
 srndr* srndr_create() {
  srndr* result = calloc(1, sizeof(srndr));

+	init_debug_meshes();
+
  return result;
 }

 void srndr_destroy(srndr* sr) { free(sr); }

+//
+// View
+//
 void srview_get_output_texture(srview* sv, GLuint* texture) {
  *texture = sv->mColorTexture;
 }

+// Stores a copy of the projection matrix `proj` in the view `sv`.
+// `sv` must not be NULL.
+void srview_set_proj(srview* sv, simd4x4f proj) {
+	sv->proj = proj;
+}
+
+// Stores a copy of the view matrix `view` in the view `sv`.
+// `sv` must not be NULL.
+void srview_set_view(srview* sv, simd4x4f view) {
+	sv->view = view;
+}
+
 void srview_update_framebuffer(srview* sv) {
  glGenFramebuffers(1, &sv->mFrameBufferId);
  glBindFramebuffer(GL_FRAMEBUFFER, sv->mFrameBufferId);
@ -151,3 +234,27 @@ void srndr_render(srndr* srndr, srview* sview, srcmdbuf* scmdbuf) {

  glBindFramebuffer(GL_DRAW_FRAMEBUFFER, 0);
 }
+
+//
+// Render Commands
+//
+// Allocates a command buffer with room for `size_max` render commands. The
+// command storage is zero-initialized and the buffer starts out empty.
+// Returns NULL if either allocation fails.
+srcmdbuf* srcmdbuf_create (unsigned int size_max) {
+	srcmdbuf* result = malloc (sizeof (srcmdbuf));
+	if (result == NULL) {
+		return NULL;
+	}
+
+	// calloc takes (element count, element size). A NULL result is only a
+	// failure when storage was actually requested.
+	result->cmds = calloc (size_max, sizeof (srcmd));
+	if (result->cmds == NULL && size_max > 0) {
+		free (result);
+		return NULL;
+	}
+
+	result->ncmds = (int) size_max;
+	result->idx = 0;
+
+	return result;
+}
+
+// Empties the buffer by resetting the next-slot index to 0. The stored
+// commands themselves are not zeroed, so a slot handed out again by
+// srcmd_create still holds its previous contents until the caller overwrites
+// them.
+void srcmdbuf_clear (srcmdbuf* cmdbuf) {
+	cmdbuf->idx = 0;
+}
+
+// Hands out the next unused command slot of `cmdbuf` and advances the buffer's
+// index. When every slot is already in use a warning is logged and NULL is
+// returned. The slot is not cleared before it is returned.
+srcmd* srcmd_create (srcmdbuf* cmdbuf) {
+	if (cmdbuf->idx != cmdbuf->ncmds) {
+		srcmd* slot = cmdbuf->cmds + cmdbuf->idx;
+		cmdbuf->idx += 1;
+		return slot;
+	}
+
+	gLog ("Warning: number of render commands maxed out!");
+	return NULL;
+}
--- a/src/srender.h
+++ b/src/srender.h
@ -5,21 +5,24 @@
 extern "C" {
 #endif

+#include "vectorial/simd4x4f.h"
+
 typedef struct srndr srndr;
 typedef struct srview srview;
 typedef struct srcmdbuf srcmdbuf;

 typedef enum {
-  SRndrCmdTypeGrid = 0,
+  SRndrCmdTypeFrame = 0,
+  SRndrCmdTypeGrid,
  SRndrCmdTypeBox,
  SRndrCmdTypeLight,
  SRndrCmdTypeSphere
 } SRndrCmdType;

 typedef struct srcmd {
-  float mat[16];
-  float color[4];
-  SRndrCmdType cmd_type;
+	simd4x4f mat;
+	simd4f color;
+  SRndrCmdType type;
 } srcmd;

 //
@ -34,8 +37,8 @@ void srndr_destroy(srndr* sr);
 srview* srview_create();
 void srview_destroy(srview* sv);

-void srview_set_proj(srview* sv, float* proj);
-void srview_set_view(srview* sv, float* view);
+void srview_set_proj(srview* sv, simd4x4f proj);
+void srview_set_view(srview* sv, simd4x4f view);
 void srview_set_size(srview* sv, int width, int height);
 void srview_get_output_texture(srview* sv, GLuint* texture);

@ -43,6 +46,7 @@ void srview_get_output_texture(srview* sv, GLuint* texture);
 // Command Buffer and Commands
 //
 srcmdbuf* srcmdbuf_create(unsigned int size_max);
+void srcmdbuf_clear (srcmdbuf* cmdbuf);
 srcmd* srcmd_create(srcmdbuf* cmdbuf);

 void srndr_render(srndr* srndr, srview* sview, srcmdbuf* scmdbuf);
--- a/src/vissim.cc
+++ b/src/vissim.cc
@ -56,9 +56,9 @@ static void opengl_error_callback(
    GLsizei length,
    const GLchar* message,
    const void* userParam) {
-  //	gLog ("OpenGL Error: %s type %0x%x, severity = 0x%x, message = %s",
-  //           ( type == GL_DEBUG_TYPE_ERROR ? "** GL ERROR **" : "" ),
-  //            type, severity, message );
+  	// Exactly four conversions for the four arguments: tag, type, severity, message.
+  	gLog ("OpenGL Error: %s type = 0x%x, severity = 0x%x, message = %s",
+             ( type == GL_DEBUG_TYPE_ERROR ? "** GL ERROR **" : "" ),
+              type, severity, message );
 }

 static void
@ -155,11 +155,27 @@ void ShowDockspace(bool open) {
 void DoRender() {
  // Render Output
  ImGui::Begin("Render Output");
-  const ImVec2 content_avail = ImGui::GetContentRegionAvail();
-  GLuint view_texture;

+	// Update the view
+  const ImVec2 content_avail = ImGui::GetContentRegionAvail();
  srview_set_size(gView, content_avail.x, content_avail.y);
+	// View and projection are both plain identity matrices for now.
+	simd4x4f view;
+	simd4x4f proj;
+	simd4x4f_identity (&view);
+	simd4x4f_identity (&proj);
+	srview_set_view(gView, view);
+	// The projection goes through its own setter; passing it to
+	// srview_set_view would overwrite the view and leave proj unset.
+	srview_set_proj(gView, proj);
+
+	// Populate render commands
+	srcmdbuf_clear(gRndrCmds);
+	srcmd* cmd = srcmd_create (gRndrCmds);
+	// srcmd_create returns NULL when the buffer has no free slot; in that
+	// case no command is queued for this frame.
+	if (cmd) {
+		cmd->type = SRndrCmdTypeFrame;
+		simd4x4f_identity(&cmd->mat);
+		cmd->color = simd4f_create (1.f, 1.f, 1.f, 1.f);
+	}
+
+	// Perform the actual render
  srndr_render(gRndr, gView, gRndrCmds);
+ 	GLuint view_texture;
  srview_get_output_texture(gView, &view_texture);

  ImGui::Image(