From eb7f958bb302a3135aa77ff6a2188dd3f27b7a3f Mon Sep 17 00:00:00 2001
From: Martin Felis <martin@fysx.org>
Date: Tue, 11 Apr 2017 09:15:27 +0200
Subject: [PATCH] another update of bgfx and added its bimg dependency

---
 .../3rdparty/glslang/SPIRV/GlslangToSpv.cpp   |    97 +-
 .../bgfx/3rdparty/glslang/SPIRV/SpvBuilder.h  |    10 +
 3rdparty/bgfx/3rdparty/glslang/SPIRV/doc.cpp  |     4 +-
 .../bgfx/3rdparty/glslang/SPIRV/spirv.hpp     |     6 +
 .../glslang/StandAlone/StandAlone.cpp         |   109 +-
 .../3rdparty/glslang/StandAlone/Worklist.h    |    13 +-
 3rdparty/bgfx/3rdparty/glslang/Test/450.geom  |     3 +
 .../glslang/Test/baseResults/450.geom.out     |   109 +-
 .../Test/baseResults/hlsl.amend.frag.out      |     5 +-
 .../baseResults/hlsl.array.flatten.frag.out   |     5 +-
 .../Test/baseResults/hlsl.array.frag.out      |     5 +-
 .../hlsl.array.implicit-size.frag.out         |     5 +-
 .../baseResults/hlsl.array.multidim.frag.out  |     5 +-
 .../Test/baseResults/hlsl.assoc.frag.out      |     5 +-
 .../hlsl.attribute.expression.comp.out        |     5 +-
 .../Test/baseResults/hlsl.attribute.frag.out  |    13 +-
 .../Test/baseResults/hlsl.basic.comp.out      |     5 +-
 .../Test/baseResults/hlsl.basic.geom.out      |    87 +-
 .../Test/baseResults/hlsl.buffer.frag.out     |     5 +-
 .../hlsl.calculatelod.dx10.frag.out           |     5 +-
 .../hlsl.calculatelodunclamped.dx10.frag.out  |     4 +-
 .../Test/baseResults/hlsl.cast.frag.out       |     5 +-
 .../baseResults/hlsl.comparison.vec.frag.out  |     5 +-
 .../baseResults/hlsl.conditional.frag.out     |   195 +-
 .../baseResults/hlsl.constructexpr.frag.out   |     5 +-
 .../hlsl.deadFunctionMissingBody.vert.out     |     1 +
 .../baseResults/hlsl.depthGreater.frag.out    |     5 +-
 .../Test/baseResults/hlsl.depthLess.frag.out  |     5 +-
 .../Test/baseResults/hlsl.discard.frag.out    |    71 +-
 .../Test/baseResults/hlsl.doLoop.frag.out     |     5 +-
 .../hlsl.emptystructreturn.frag.out           |     5 +-
 .../hlsl.emptystructreturn.vert.out           |     5 +-
 .../Test/baseResults/hlsl.entry-in.frag.out   |     5 +-
 .../Test/baseResults/hlsl.entry-out.frag.out  |     5 +-
 .../baseResults/hlsl.entry.rename.frag.out    |     5 +-
 .../baseResults/hlsl.flatten.return.frag.out  |     5 +-
 .../Test/baseResults/hlsl.float1.frag.out     |     5 +-
 .../Test/baseResults/hlsl.float4.frag.out     |     5 +-
 .../Test/baseResults/hlsl.forLoop.frag.out    |     5 +-
 .../hlsl.gather.array.dx10.frag.out           |     5 +-
 .../hlsl.gather.basic.dx10.frag.out           |     5 +-
 .../hlsl.gather.basic.dx10.vert.out           |     5 +-
 .../hlsl.gather.offset.dx10.frag.out          |     5 +-
 .../hlsl.gather.offsetarray.dx10.frag.out     |     5 +-
 .../hlsl.gatherRGBA.array.dx10.frag.out       |     5 +-
 .../hlsl.gatherRGBA.basic.dx10.frag.out       |     5 +-
 .../hlsl.gatherRGBA.offset.dx10.frag.out      |     5 +-
 .../hlsl.gatherRGBA.offsetarray.dx10.frag.out |     5 +-
 .../hlsl.gathercmpRGBA.offset.dx10.frag.out   |     5 +-
 .../hlsl.getdimensions.dx10.frag.out          |     5 +-
 .../hlsl.getdimensions.dx10.vert.out          |     5 +-
 .../hlsl.getdimensions.rw.dx10.frag.out       |     5 +-
 .../hlsl.getsampleposition.dx10.frag.out      |     4 +-
 .../Test/baseResults/hlsl.hull.1.tesc.out     |   178 +-
 .../Test/baseResults/hlsl.hull.2.tesc.out     |   202 +-
 .../Test/baseResults/hlsl.hull.void.tesc.out  |    89 +-
 .../hlsl.identifier.sample.frag.out           |     5 +-
 .../glslang/Test/baseResults/hlsl.if.frag.out |   131 +-
 .../Test/baseResults/hlsl.init.frag.out       |     5 +-
 .../Test/baseResults/hlsl.init2.frag.out      |     5 +-
 .../Test/baseResults/hlsl.inoutquals.frag.out |     5 +-
 .../hlsl.intrinsics.barriers.comp.out         |     5 +-
 .../Test/baseResults/hlsl.intrinsics.comp.out |     5 +-
 .../hlsl.intrinsics.d3dcolortoubyte4.frag.out |     5 +-
 .../hlsl.intrinsics.double.frag.out           |     5 +-
 .../hlsl.intrinsics.evalfns.frag.out          |     5 +-
 .../hlsl.intrinsics.f1632.frag.out            |     5 +-
 .../hlsl.intrinsics.f3216.frag.out            |     5 +-
 .../Test/baseResults/hlsl.intrinsics.frag.out | 12579 ++++++++--------
 .../baseResults/hlsl.intrinsics.lit.frag.out  |     5 +-
 .../hlsl.intrinsics.negative.comp.out         |     5 +-
 .../hlsl.intrinsics.negative.frag.out         |    13 +-
 .../hlsl.intrinsics.negative.vert.out         |     5 +-
 .../hlsl.intrinsics.promote.down.frag.out     |     5 +-
 .../hlsl.intrinsics.promote.frag.out          |    32 +-
 .../hlsl.intrinsics.promote.outputs.frag.out  |    12 +-
 .../Test/baseResults/hlsl.intrinsics.vert.out |  6422 ++++----
 .../Test/baseResults/hlsl.layout.frag.out     |     5 +-
 .../baseResults/hlsl.load.2dms.dx10.frag.out  |     5 +-
 .../baseResults/hlsl.load.array.dx10.frag.out |     5 +-
 .../baseResults/hlsl.load.basic.dx10.frag.out |     5 +-
 .../baseResults/hlsl.load.basic.dx10.vert.out |     5 +-
 .../hlsl.load.buffer.dx10.frag.out            |    61 +-
 .../hlsl.load.buffer.float.dx10.frag.out      |    61 +-
 .../hlsl.load.offset.dx10.frag.out            |     5 +-
 .../hlsl.load.offsetarray.dx10.frag.out       |     5 +-
 .../hlsl.load.rwbuffer.dx10.frag.out          |     5 +-
 .../hlsl.load.rwtexture.array.dx10.frag.out   |     5 +-
 .../hlsl.load.rwtexture.dx10.frag.out         |     5 +-
 .../baseResults/hlsl.logical.binary.frag.out  |     5 +-
 .../hlsl.logical.binary.vec.frag.out          |     5 +-
 .../baseResults/hlsl.logical.unary.frag.out   |   141 +-
 .../Test/baseResults/hlsl.matNx1.frag.out     |     5 +-
 .../baseResults/hlsl.matType.bool.frag.out    |     5 +-
 .../Test/baseResults/hlsl.matType.frag.out    |     5 +-
 .../baseResults/hlsl.matType.int.frag.out     |     5 +-
 .../baseResults/hlsl.matrixSwizzle.vert.out   |     5 +-
 .../baseResults/hlsl.matrixindex.frag.out     |     5 +-
 .../Test/baseResults/hlsl.max.frag.out        |     5 +-
 .../Test/baseResults/hlsl.mintypes.frag.out   |     5 +-
 .../Test/baseResults/hlsl.multiEntry.vert.out |    22 +-
 .../baseResults/hlsl.multiReturn.frag.out     |     5 +-
 .../hlsl.nonstaticMemberFunction.frag.out     |     5 +-
 .../baseResults/hlsl.numericsuffixes.frag.out |     5 +-
 .../Test/baseResults/hlsl.numthreads.comp.out |     5 +-
 .../Test/baseResults/hlsl.overload.frag.out   |     5 +-
 .../baseResults/hlsl.params.default.frag.out  |     5 +-
 .../hlsl.params.default.negative.frag.out     |     4 +-
 .../baseResults/hlsl.partialInit.frag.out     |     5 +-
 .../Test/baseResults/hlsl.pp.line.frag.out    |     5 +-
 .../Test/baseResults/hlsl.precedence.frag.out |     5 +-
 .../baseResults/hlsl.precedence2.frag.out     |     5 +-
 .../Test/baseResults/hlsl.precise.frag.out    |     5 +-
 .../baseResults/hlsl.promote.atomic.frag.out  |     5 +-
 .../baseResults/hlsl.promote.binary.frag.out  |     5 +-
 .../baseResults/hlsl.promote.vec1.frag.out    |     5 +-
 .../Test/baseResults/hlsl.promotions.frag.out |     5 +-
 .../Test/baseResults/hlsl.rw.atomics.frag.out |     5 +-
 .../Test/baseResults/hlsl.rw.bracket.frag.out |     5 +-
 .../baseResults/hlsl.rw.register.frag.out     |     5 +-
 .../hlsl.rw.scalar.bracket.frag.out           |     5 +-
 .../Test/baseResults/hlsl.rw.swizzle.frag.out |     5 +-
 .../baseResults/hlsl.rw.vec2.bracket.frag.out |     5 +-
 .../hlsl.sample.array.dx10.frag.out           |     5 +-
 .../hlsl.sample.basic.dx10.frag.out           |     5 +-
 .../hlsl.sample.offset.dx10.frag.out          |     5 +-
 .../hlsl.sample.offsetarray.dx10.frag.out     |     5 +-
 .../hlsl.sample.sub-vec4.dx10.frag.out        |     5 +-
 .../hlsl.samplebias.array.dx10.frag.out       |     5 +-
 .../hlsl.samplebias.basic.dx10.frag.out       |     5 +-
 .../hlsl.samplebias.offset.dx10.frag.out      |     5 +-
 .../hlsl.samplebias.offsetarray.dx10.frag.out |     5 +-
 .../hlsl.samplecmp.array.dx10.frag.out        |     5 +-
 .../hlsl.samplecmp.basic.dx10.frag.out        |     5 +-
 .../hlsl.samplecmp.offset.dx10.frag.out       |     5 +-
 .../hlsl.samplecmp.offsetarray.dx10.frag.out  |     5 +-
 ...lsl.samplecmplevelzero.array.dx10.frag.out |     5 +-
 ...lsl.samplecmplevelzero.basic.dx10.frag.out |     5 +-
 ...sl.samplecmplevelzero.offset.dx10.frag.out |     5 +-
 ...mplecmplevelzero.offsetarray.dx10.frag.out |     5 +-
 .../hlsl.samplegrad.array.dx10.frag.out       |     5 +-
 .../hlsl.samplegrad.basic.dx10.frag.out       |     5 +-
 .../hlsl.samplegrad.basic.dx10.vert.out       |     5 +-
 .../hlsl.samplegrad.offset.dx10.frag.out      |     5 +-
 .../hlsl.samplegrad.offsetarray.dx10.frag.out |     5 +-
 .../hlsl.samplelevel.array.dx10.frag.out      |     5 +-
 .../hlsl.samplelevel.basic.dx10.frag.out      |     5 +-
 .../hlsl.samplelevel.basic.dx10.vert.out      |     5 +-
 .../hlsl.samplelevel.offset.dx10.frag.out     |     5 +-
 ...hlsl.samplelevel.offsetarray.dx10.frag.out |     5 +-
 .../Test/baseResults/hlsl.scope.frag.out      |     5 +-
 .../Test/baseResults/hlsl.semantic.geom.out   |   132 +-
 .../Test/baseResults/hlsl.semantic.vert.out   |     5 +-
 .../Test/baseResults/hlsl.semicolons.frag.out |     5 +-
 .../Test/baseResults/hlsl.shapeConv.frag.out  |     5 +-
 .../baseResults/hlsl.shapeConvRet.frag.out    |     5 +-
 .../Test/baseResults/hlsl.sin.frag.out        |     5 +-
 .../hlsl.staticMemberFunction.frag.out        |     5 +-
 .../Test/baseResults/hlsl.string.frag.out     |     5 +-
 .../baseResults/hlsl.stringtoken.frag.out     |     5 +-
 .../Test/baseResults/hlsl.struct.frag.out     |     5 +-
 .../baseResults/hlsl.struct.split-1.vert.out  |     5 +-
 .../hlsl.struct.split.array.geom.out          |    22 +-
 .../hlsl.struct.split.assign.frag.out         |     5 +-
 .../hlsl.struct.split.call.vert.out           |     5 +-
 .../hlsl.struct.split.nested.geom.out         |   231 +-
 .../hlsl.struct.split.trivial.geom.out        |   126 +-
 .../hlsl.struct.split.trivial.vert.out        |     5 +-
 .../baseResults/hlsl.structIoFourWay.frag.out |     5 +-
 .../hlsl.structStructName.frag.out            |     5 +-
 .../hlsl.structarray.flatten.frag.out         |     5 +-
 .../hlsl.structarray.flatten.geom.out         |   129 +-
 .../hlsl.structbuffer.atomics.frag.out        |     5 +-
 .../hlsl.structbuffer.byte.frag.out           |     5 +-
 .../hlsl.structbuffer.coherent.frag.out       |     5 +-
 .../baseResults/hlsl.structbuffer.fn.frag.out |     5 +-
 .../baseResults/hlsl.structbuffer.frag.out    |     5 +-
 .../baseResults/hlsl.structbuffer.rw.frag.out |     5 +-
 .../hlsl.structbuffer.rwbyte.frag.out         |     5 +-
 .../Test/baseResults/hlsl.structin.vert.out   |     5 +-
 .../Test/baseResults/hlsl.switch.frag.out     |     5 +-
 .../Test/baseResults/hlsl.swizzle.frag.out    |     5 +-
 .../baseResults/hlsl.templatetypes.frag.out   |     5 +-
 .../Test/baseResults/hlsl.this.frag.out       |     5 +-
 .../Test/baseResults/hlsl.tx.bracket.frag.out |     5 +-
 .../Test/baseResults/hlsl.type.half.frag.out  |     5 +-
 .../baseResults/hlsl.type.identifier.frag.out |     5 +-
 .../baseResults/hlsl.typeGraphCopy.vert.out   |     5 +-
 .../Test/baseResults/hlsl.typedef.frag.out    |     5 +-
 .../Test/baseResults/hlsl.void.frag.out       |     5 +-
 .../Test/baseResults/hlsl.whileLoop.frag.out  |     5 +-
 .../remap.hlsl.sample.basic.none.frag.out     |     1 +
 .../remap.hlsl.templatetypes.none.frag.out    |     1 +
 .../Test/baseResults/spv.bool.vert.out        |    12 +-
 .../spv.buffer.autoassign.frag.out            |     1 +
 .../Test/baseResults/spv.deepRvalue.frag.out  |    78 +-
 .../Test/baseResults/spv.image.frag.out       |    20 +-
 .../spv.register.autoassign-2.frag.out        |     1 +
 .../spv.register.autoassign.frag.out          |     1 +
 .../spv.register.noautoassign.frag.out        |     1 +
 .../baseResults/spv.rw.autoassign.frag.out    |     1 +
 .../spv.specConstantOperations.vert.out       |     6 +-
 .../baseResults/spv.ssbo.autoassign.frag.out  |     1 +
 .../Test/baseResults/tokenPaste.vert.out      |     3 +-
 .../glslang/Test/hlsl.intrinsics.frag         |     5 -
 .../glslang/Test/hlsl.intrinsics.vert         |     5 -
 3rdparty/bgfx/3rdparty/glslang/Test/runtests  |    14 +
 .../3rdparty/glslang/Test/tokenPaste.vert     |     6 +-
 .../glslang/glslang/Include/BaseTypes.h       |     3 +
 .../3rdparty/glslang/glslang/Include/Types.h  |   119 +-
 .../glslang/glslang/Include/revision.h        |     4 +-
 .../MachineIndependent/Intermediate.cpp       |     8 +-
 .../MachineIndependent/ParseContextBase.cpp   |     2 +-
 .../MachineIndependent/ParseHelper.cpp        |    20 +-
 .../glslang/MachineIndependent/ParseHelper.h  |     8 +-
 .../glslang/MachineIndependent/Scan.cpp       |     2 +-
 .../glslang/MachineIndependent/ShaderLang.cpp |    14 +-
 .../glslang/MachineIndependent/intermOut.cpp  |     7 +
 .../glslang/MachineIndependent/iomapper.cpp   |     8 +-
 .../MachineIndependent/linkValidate.cpp       |    24 +-
 .../MachineIndependent/localintermediate.h    |     9 +-
 .../preprocessor/PpScanner.cpp                |     7 +
 .../glslang/OSDependent/Unix/ossource.cpp     |    14 -
 .../glslang/OSDependent/Windows/ossource.cpp  |    15 -
 .../glslang/glslang/OSDependent/osinclude.h   |     3 -
 .../glslang/glslang/Public/ShaderLang.h       |    13 +-
 .../3rdparty/glslang/gtests/Hlsl.FromFile.cpp |    17 +-
 .../3rdparty/glslang/hlsl/hlslGrammar.cpp     |   240 +-
 .../bgfx/3rdparty/glslang/hlsl/hlslGrammar.h  |     9 +-
 .../3rdparty/glslang/hlsl/hlslParseHelper.cpp |   801 +-
 .../3rdparty/glslang/hlsl/hlslParseHelper.h   |    40 +-
 .../3rdparty/glslang/hlsl/hlslParseables.cpp  |     4 +-
 .../3rdparty/glslang/hlsl/hlslScanContext.cpp |    26 +-
 .../bgfx/3rdparty/glslang/hlsl/hlslTokens.h   |     1 +
 .../bgfx/3rdparty/ocornut-imgui/imgui.cpp     |     8 +-
 3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.h  |     9 +-
 .../3rdparty/ocornut-imgui/imgui_demo.cpp     |     3 +-
 .../3rdparty/ocornut-imgui/imgui_draw.cpp     |    17 +-
 .../3rdparty/ocornut-imgui/widgets/gizmo.h    |     5 +-
 .../3rdparty/ocornut-imgui/widgets/gizmo.inl  |    50 +-
 3rdparty/bgfx/CMakeLists.txt                  |     1 +
 3rdparty/bgfx/LICENSE                         |     4 -
 3rdparty/bgfx/README.md                       |     5 +
 3rdparty/bgfx/examples/common/bgfx_utils.cpp  |    18 +-
 3rdparty/bgfx/examples/common/bgfx_utils.h    |    20 +-
 3rdparty/bgfx/examples/common/imgui/imgui.cpp |     2 +
 .../bgfx/examples/common/nanovg/nanovg.cpp    |    41 +-
 3rdparty/bgfx/examples/common/shaderlib.sh    |     9 +
 3rdparty/bgfx/examples/makefile               |     1 +
 3rdparty/bgfx/include/bgfx/bgfx.h             |    52 +-
 3rdparty/bgfx/include/bgfx/bgfxdefines.h      |   444 -
 3rdparty/bgfx/include/bgfx/bgfxplatform.h     |   148 -
 3rdparty/bgfx/include/bgfx/c99/bgfx.h         |     8 +-
 3rdparty/bgfx/include/bgfx/c99/bgfxplatform.h |   208 -
 3rdparty/bgfx/include/bgfx/c99/platform.h     |     4 +-
 3rdparty/bgfx/include/bgfx/defines.h          |     2 +-
 3rdparty/bgfx/include/bgfx/platform.h         |     2 +-
 3rdparty/bgfx/scripts/bgfx.lua                |     2 +
 3rdparty/bgfx/scripts/build.ninja             |     9 +
 3rdparty/bgfx/scripts/example-common.lua      |     3 +-
 3rdparty/bgfx/scripts/genie.lua               |    10 +-
 3rdparty/bgfx/scripts/shader.mk               |     4 +
 3rdparty/bgfx/scripts/shaderc.lua             |     3 +-
 3rdparty/bgfx/scripts/texturec.lua            |    30 +-
 3rdparty/bgfx/scripts/texturev.lua            |     3 +
 3rdparty/bgfx/src/amalgamated.cpp             |     1 -
 3rdparty/bgfx/src/bgfx.cpp                    |   166 +-
 3rdparty/bgfx/src/bgfx_compute.sh             |    46 +-
 3rdparty/bgfx/src/bgfx_p.h                    |   173 +-
 3rdparty/bgfx/src/config.h                    |    19 +-
 3rdparty/bgfx/src/glcontext_eagl.mm           |    85 +-
 3rdparty/bgfx/src/image.h                     |   251 -
 3rdparty/bgfx/src/renderer_d3d11.cpp          |   138 +-
 3rdparty/bgfx/src/renderer_d3d12.cpp          |   186 +-
 3rdparty/bgfx/src/renderer_d3d9.cpp           |    74 +-
 3rdparty/bgfx/src/renderer_gl.cpp             |   113 +-
 3rdparty/bgfx/src/renderer_mtl.mm             |    85 +-
 3rdparty/bgfx/src/renderer_noop.cpp           |    59 +-
 3rdparty/bgfx/src/renderer_null.cpp           |   215 -
 3rdparty/bgfx/src/renderer_vk.cpp             |    19 +-
 3rdparty/bgfx/src/shader_dxbc.cpp             |     2 +-
 3rdparty/bgfx/tools/geometryc/geometryc.cpp   |    14 +-
 3rdparty/bgfx/tools/shaderc/shaderc.cpp       |    89 +-
 3rdparty/bgfx/tools/shaderc/shaderc_glsl.cpp  |    54 +-
 3rdparty/bgfx/tools/shaderc/shaderc_hlsl.cpp  |     2 +-
 3rdparty/bgfx/tools/shaderc/shaderc_spirv.cpp |     4 +-
 3rdparty/bgfx/tools/texturec/texturec.cpp     |   510 +-
 3rdparty/bgfx/tools/texturev/texturev.cpp     |    25 +-
 3rdparty/bimg/3rdparty/edtaa3/LICENSE.md      |    34 +
 3rdparty/bimg/3rdparty/edtaa3/edtaa3func.cpp  |   580 +
 3rdparty/bimg/3rdparty/edtaa3/edtaa3func.h    |     7 +
 3rdparty/bimg/3rdparty/etc1/LICENSE           |   161 +
 3rdparty/bimg/3rdparty/etc1/etc1.cpp          |   686 +
 3rdparty/bimg/3rdparty/etc1/etc1.h            |   114 +
 3rdparty/bimg/3rdparty/etc2/LICENSE.txt       |    24 +
 3rdparty/bimg/3rdparty/etc2/Math.hpp          |    90 +
 3rdparty/bimg/3rdparty/etc2/ProcessCommon.hpp |    51 +
 3rdparty/bimg/3rdparty/etc2/ProcessRGB.cpp    |   719 +
 3rdparty/bimg/3rdparty/etc2/ProcessRGB.hpp    |     9 +
 3rdparty/bimg/3rdparty/etc2/Tables.cpp        |   109 +
 3rdparty/bimg/3rdparty/etc2/Tables.hpp        |    25 +
 3rdparty/bimg/3rdparty/etc2/Types.hpp         |    17 +
 3rdparty/bimg/3rdparty/etc2/Vector.hpp        |   222 +
 3rdparty/bimg/3rdparty/iqa/LICENSE            |    32 +
 3rdparty/bimg/3rdparty/iqa/README.txt         |    36 +
 3rdparty/bimg/3rdparty/iqa/include/convolve.h |   111 +
 3rdparty/bimg/3rdparty/iqa/include/decimate.h |    55 +
 3rdparty/bimg/3rdparty/iqa/include/iqa.h      |   134 +
 3rdparty/bimg/3rdparty/iqa/include/iqa_os.h   |    68 +
 .../bimg/3rdparty/iqa/include/math_utils.h    |    64 +
 3rdparty/bimg/3rdparty/iqa/include/ssim.h     |   117 +
 3rdparty/bimg/3rdparty/iqa/source/convolve.c  |   195 +
 3rdparty/bimg/3rdparty/iqa/source/decimate.c  |    59 +
 .../bimg/3rdparty/iqa/source/math_utils.c     |    82 +
 3rdparty/bimg/3rdparty/iqa/source/ms_ssim.c   |   277 +
 3rdparty/bimg/3rdparty/iqa/source/mse.c       |    50 +
 3rdparty/bimg/3rdparty/iqa/source/psnr.c      |    42 +
 3rdparty/bimg/3rdparty/iqa/source/ssim.c      |   322 +
 3rdparty/bimg/3rdparty/libsquish/LICENSE      |    20 +
 3rdparty/bimg/3rdparty/libsquish/README       |    35 +
 3rdparty/bimg/3rdparty/libsquish/alpha.cpp    |   350 +
 3rdparty/bimg/3rdparty/libsquish/alpha.h      |    41 +
 .../bimg/3rdparty/libsquish/clusterfit.cpp    |   392 +
 3rdparty/bimg/3rdparty/libsquish/clusterfit.h |    61 +
 .../bimg/3rdparty/libsquish/colourblock.cpp   |   214 +
 .../bimg/3rdparty/libsquish/colourblock.h     |    41 +
 .../bimg/3rdparty/libsquish/colourfit.cpp     |    54 +
 3rdparty/bimg/3rdparty/libsquish/colourfit.h  |    56 +
 .../bimg/3rdparty/libsquish/colourset.cpp     |   121 +
 3rdparty/bimg/3rdparty/libsquish/colourset.h  |    58 +
 3rdparty/bimg/3rdparty/libsquish/config.h     |    49 +
 3rdparty/bimg/3rdparty/libsquish/maths.cpp    |   259 +
 3rdparty/bimg/3rdparty/libsquish/maths.h      |   233 +
 3rdparty/bimg/3rdparty/libsquish/rangefit.cpp |   201 +
 3rdparty/bimg/3rdparty/libsquish/rangefit.h   |    54 +
 3rdparty/bimg/3rdparty/libsquish/simd.h       |    32 +
 3rdparty/bimg/3rdparty/libsquish/simd_float.h |   183 +
 .../3rdparty/libsquish/singlecolourfit.cpp    |   172 +
 .../bimg/3rdparty/libsquish/singlecolourfit.h |    58 +
 .../3rdparty/libsquish/singlecolourlookup.inl |  1064 ++
 3rdparty/bimg/3rdparty/libsquish/squish.cpp   |   260 +
 3rdparty/bimg/3rdparty/libsquish/squish.h     |   269 +
 3rdparty/bimg/3rdparty/lodepng/README.md      |    10 +
 3rdparty/bimg/3rdparty/lodepng/lodepng.cpp    |  6224 ++++++++
 3rdparty/bimg/3rdparty/lodepng/lodepng.h      |  1759 +++
 3rdparty/bimg/3rdparty/maratis-tcl/LICENSE    |    22 +
 3rdparty/bimg/3rdparty/maratis-tcl/m_image.h  |  2340 +++
 .../nvtt/NVIDIA_Texture_Tools_LICENSE.txt     |    24 +
 3rdparty/bimg/3rdparty/nvtt/bc6h/bits.h       |    75 +
 3rdparty/bimg/3rdparty/nvtt/bc6h/shapes_two.h |   133 +
 3rdparty/bimg/3rdparty/nvtt/bc6h/tile.h       |    82 +
 3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.cpp      |   197 +
 3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.h        |    65 +
 .../bimg/3rdparty/nvtt/bc6h/zoh_utils.cpp     |   324 +
 3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.h  |    72 +
 3rdparty/bimg/3rdparty/nvtt/bc6h/zohone.cpp   |   799 +
 3rdparty/bimg/3rdparty/nvtt/bc6h/zohtwo.cpp   |   883 ++
 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.cpp     |   264 +
 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.h       |    99 +
 .../bimg/3rdparty/nvtt/bc7/avpcl_mode0.cpp    |  1066 ++
 .../bimg/3rdparty/nvtt/bc7/avpcl_mode1.cpp    |  1047 ++
 .../bimg/3rdparty/nvtt/bc7/avpcl_mode2.cpp    |  1004 ++
 .../bimg/3rdparty/nvtt/bc7/avpcl_mode3.cpp    |  1059 ++
 .../bimg/3rdparty/nvtt/bc7/avpcl_mode4.cpp    |  1214 ++
 .../bimg/3rdparty/nvtt/bc7/avpcl_mode5.cpp    |  1216 ++
 .../bimg/3rdparty/nvtt/bc7/avpcl_mode6.cpp    |  1055 ++
 .../bimg/3rdparty/nvtt/bc7/avpcl_mode7.cpp    |  1094 ++
 .../bimg/3rdparty/nvtt/bc7/avpcl_utils.cpp    |   389 +
 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.h |    61 +
 3rdparty/bimg/3rdparty/nvtt/bc7/bits.h        |    76 +
 3rdparty/bimg/3rdparty/nvtt/bc7/endpts.h      |    81 +
 .../bimg/3rdparty/nvtt/bc7/shapes_three.h     |   132 +
 3rdparty/bimg/3rdparty/nvtt/bc7/shapes_two.h  |   133 +
 3rdparty/bimg/3rdparty/nvtt/bc7/tile.h        |    41 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/array.h    |   181 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/array.inl  |   437 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/debug.h    |   216 +
 .../3rdparty/nvtt/nvcore/defsgnucdarwin.h     |    57 +
 .../bimg/3rdparty/nvtt/nvcore/defsgnuclinux.h |    63 +
 .../bimg/3rdparty/nvtt/nvcore/defsgnucwin32.h |    65 +
 .../bimg/3rdparty/nvtt/nvcore/defsvcwin32.h   |    94 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/foreach.h  |    68 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/hash.h     |    83 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/memory.h   |    30 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/nvcore.h   |   363 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/posh.h     |  1030 ++
 .../bimg/3rdparty/nvtt/nvcore/stdstream.h     |   459 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/stream.h   |   163 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/strlib.h   |   429 +
 3rdparty/bimg/3rdparty/nvtt/nvcore/utils.h    |   281 +
 .../bimg/3rdparty/nvtt/nvmath/fitting.cpp     |  1200 ++
 3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.h  |    49 +
 3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.h   |   112 +
 3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.inl |  1274 ++
 3rdparty/bimg/3rdparty/nvtt/nvmath/nvmath.h   |    61 +
 3rdparty/bimg/3rdparty/nvtt/nvmath/plane.h    |    40 +
 3rdparty/bimg/3rdparty/nvtt/nvmath/plane.inl  |    49 +
 3rdparty/bimg/3rdparty/nvtt/nvmath/vector.h   |   148 +
 3rdparty/bimg/3rdparty/nvtt/nvmath/vector.inl |   921 ++
 3rdparty/bimg/3rdparty/nvtt/nvtt.cpp          |    95 +
 3rdparty/bimg/3rdparty/nvtt/nvtt.h            |    13 +
 3rdparty/bimg/3rdparty/pvrtc/AlphaBitmap.h    |    20 +
 3rdparty/bimg/3rdparty/pvrtc/BitScale.cpp     |   183 +
 3rdparty/bimg/3rdparty/pvrtc/BitScale.h       |    28 +
 3rdparty/bimg/3rdparty/pvrtc/BitUtility.h     |    19 +
 3rdparty/bimg/3rdparty/pvrtc/Bitmap.h         |    36 +
 3rdparty/bimg/3rdparty/pvrtc/ColorRgba.h      |   152 +
 3rdparty/bimg/3rdparty/pvrtc/Interval.h       |    21 +
 3rdparty/bimg/3rdparty/pvrtc/LICENSE.TXT      |    25 +
 3rdparty/bimg/3rdparty/pvrtc/MortonTable.cpp  |    43 +
 3rdparty/bimg/3rdparty/pvrtc/MortonTable.h    |    18 +
 3rdparty/bimg/3rdparty/pvrtc/Point2.h         |    17 +
 3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.cpp |   144 +
 3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.h   |    25 +
 3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.cpp |   464 +
 3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.h   |    43 +
 3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.cpp  |   209 +
 3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.h    |    65 +
 3rdparty/bimg/3rdparty/pvrtc/README.md        |    17 +
 3rdparty/bimg/3rdparty/pvrtc/RgbBitmap.h      |    25 +
 3rdparty/bimg/3rdparty/pvrtc/RgbaBitmap.h     |    24 +
 3rdparty/bimg/3rdparty/stb/stb_image.h        |  7183 +++++++++
 3rdparty/bimg/3rdparty/stb/stb_image_resize.h |  2624 ++++
 3rdparty/bimg/3rdparty/stb/stb_image_write.h  |  1092 ++
 3rdparty/bimg/3rdparty/tinyexr/README.md      |   274 +
 3rdparty/bimg/3rdparty/tinyexr/tinyexr.h      | 12356 +++++++++++++++
 3rdparty/bimg/CMakeLists.txt                  |    26 +
 3rdparty/bimg/LICENSE                         |    22 +
 3rdparty/bimg/README.md                       |    45 +
 3rdparty/bimg/include/bimg/bimg.h             |   519 +
 3rdparty/bimg/include/bimg/decode.h           |    23 +
 3rdparty/bimg/include/bimg/encode.h           |    27 +
 3rdparty/bimg/makefile                        |   376 +
 3rdparty/bimg/scripts/bimg.lua                |    23 +
 3rdparty/bimg/scripts/bimg_decode.lua         |    26 +
 3rdparty/bimg/scripts/bimg_encode.lua         |    41 +
 3rdparty/bimg/scripts/genie.lua               |    76 +
 3rdparty/bimg/scripts/texturec.lua            |    57 +
 3rdparty/bimg/src/bimg_p.h                    |    72 +
 3rdparty/{bgfx => bimg}/src/image.cpp         |    53 +-
 3rdparty/bimg/src/image_decode.cpp            |   393 +
 3rdparty/bimg/src/image_encode.cpp            |   271 +
 3rdparty/bimg/tools/texturec/texturec.cpp     |   427 +
 3rdparty/bx/LICENSE                           |     4 -
 3rdparty/bx/include/bx/platform.h             |     2 +-
 3rdparty/bx/include/bx/readerwriter.h         |     1 +
 3rdparty/bx/scripts/toolchain.lua             |     6 +
 3rdparty/bx/tools/bin/darwin/genie            |   Bin 496056 -> 500152 bytes
 3rdparty/bx/tools/bin/linux/genie             |   Bin 478872 -> 478872 bytes
 3rdparty/bx/tools/bin/windows/genie.exe       |   Bin 485888 -> 486400 bytes
 CMakeLists.txt                                |     2 +
 src/main.cc                                   |     2 +-
 src/modules/RenderModule.cc                   |     2 +-
 453 files changed, 80688 insertions(+), 13833 deletions(-)
 delete mode 100644 3rdparty/bgfx/include/bgfx/bgfxdefines.h
 delete mode 100644 3rdparty/bgfx/include/bgfx/bgfxplatform.h
 delete mode 100644 3rdparty/bgfx/include/bgfx/c99/bgfxplatform.h
 delete mode 100644 3rdparty/bgfx/src/image.h
 delete mode 100644 3rdparty/bgfx/src/renderer_null.cpp
 create mode 100644 3rdparty/bimg/3rdparty/edtaa3/LICENSE.md
 create mode 100644 3rdparty/bimg/3rdparty/edtaa3/edtaa3func.cpp
 create mode 100644 3rdparty/bimg/3rdparty/edtaa3/edtaa3func.h
 create mode 100644 3rdparty/bimg/3rdparty/etc1/LICENSE
 create mode 100644 3rdparty/bimg/3rdparty/etc1/etc1.cpp
 create mode 100644 3rdparty/bimg/3rdparty/etc1/etc1.h
 create mode 100644 3rdparty/bimg/3rdparty/etc2/LICENSE.txt
 create mode 100644 3rdparty/bimg/3rdparty/etc2/Math.hpp
 create mode 100644 3rdparty/bimg/3rdparty/etc2/ProcessCommon.hpp
 create mode 100644 3rdparty/bimg/3rdparty/etc2/ProcessRGB.cpp
 create mode 100644 3rdparty/bimg/3rdparty/etc2/ProcessRGB.hpp
 create mode 100644 3rdparty/bimg/3rdparty/etc2/Tables.cpp
 create mode 100644 3rdparty/bimg/3rdparty/etc2/Tables.hpp
 create mode 100644 3rdparty/bimg/3rdparty/etc2/Types.hpp
 create mode 100644 3rdparty/bimg/3rdparty/etc2/Vector.hpp
 create mode 100644 3rdparty/bimg/3rdparty/iqa/LICENSE
 create mode 100644 3rdparty/bimg/3rdparty/iqa/README.txt
 create mode 100644 3rdparty/bimg/3rdparty/iqa/include/convolve.h
 create mode 100644 3rdparty/bimg/3rdparty/iqa/include/decimate.h
 create mode 100644 3rdparty/bimg/3rdparty/iqa/include/iqa.h
 create mode 100644 3rdparty/bimg/3rdparty/iqa/include/iqa_os.h
 create mode 100644 3rdparty/bimg/3rdparty/iqa/include/math_utils.h
 create mode 100644 3rdparty/bimg/3rdparty/iqa/include/ssim.h
 create mode 100644 3rdparty/bimg/3rdparty/iqa/source/convolve.c
 create mode 100644 3rdparty/bimg/3rdparty/iqa/source/decimate.c
 create mode 100644 3rdparty/bimg/3rdparty/iqa/source/math_utils.c
 create mode 100644 3rdparty/bimg/3rdparty/iqa/source/ms_ssim.c
 create mode 100644 3rdparty/bimg/3rdparty/iqa/source/mse.c
 create mode 100644 3rdparty/bimg/3rdparty/iqa/source/psnr.c
 create mode 100644 3rdparty/bimg/3rdparty/iqa/source/ssim.c
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/LICENSE
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/README
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/alpha.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/alpha.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/clusterfit.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/clusterfit.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/colourblock.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/colourblock.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/colourfit.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/colourfit.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/colourset.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/colourset.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/config.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/maths.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/maths.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/rangefit.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/rangefit.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/simd.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/simd_float.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/singlecolourfit.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/singlecolourfit.h
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/singlecolourlookup.inl
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/squish.cpp
 create mode 100644 3rdparty/bimg/3rdparty/libsquish/squish.h
 create mode 100644 3rdparty/bimg/3rdparty/lodepng/README.md
 create mode 100644 3rdparty/bimg/3rdparty/lodepng/lodepng.cpp
 create mode 100644 3rdparty/bimg/3rdparty/lodepng/lodepng.h
 create mode 100644 3rdparty/bimg/3rdparty/maratis-tcl/LICENSE
 create mode 100644 3rdparty/bimg/3rdparty/maratis-tcl/m_image.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/bits.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/shapes_two.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/tile.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/zohone.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc6h/zohtwo.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode0.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode1.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode2.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode3.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode4.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode5.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode6.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode7.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/bits.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/endpts.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/shapes_three.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/shapes_two.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/bc7/tile.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/array.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/array.inl
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/debug.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnucdarwin.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnuclinux.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnucwin32.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/defsvcwin32.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/foreach.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/hash.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/memory.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/nvcore.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/posh.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/stdstream.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/stream.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/strlib.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvcore/utils.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.inl
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/nvmath.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/plane.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/plane.inl
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/vector.h
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvmath/vector.inl
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvtt.cpp
 create mode 100644 3rdparty/bimg/3rdparty/nvtt/nvtt.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/AlphaBitmap.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/BitScale.cpp
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/BitScale.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/BitUtility.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/Bitmap.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/ColorRgba.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/Interval.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/LICENSE.TXT
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/MortonTable.cpp
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/MortonTable.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/Point2.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.cpp
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.cpp
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.cpp
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/README.md
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/RgbBitmap.h
 create mode 100644 3rdparty/bimg/3rdparty/pvrtc/RgbaBitmap.h
 create mode 100644 3rdparty/bimg/3rdparty/stb/stb_image.h
 create mode 100644 3rdparty/bimg/3rdparty/stb/stb_image_resize.h
 create mode 100644 3rdparty/bimg/3rdparty/stb/stb_image_write.h
 create mode 100644 3rdparty/bimg/3rdparty/tinyexr/README.md
 create mode 100644 3rdparty/bimg/3rdparty/tinyexr/tinyexr.h
 create mode 100644 3rdparty/bimg/CMakeLists.txt
 create mode 100644 3rdparty/bimg/LICENSE
 create mode 100644 3rdparty/bimg/README.md
 create mode 100644 3rdparty/bimg/include/bimg/bimg.h
 create mode 100644 3rdparty/bimg/include/bimg/decode.h
 create mode 100644 3rdparty/bimg/include/bimg/encode.h
 create mode 100644 3rdparty/bimg/makefile
 create mode 100644 3rdparty/bimg/scripts/bimg.lua
 create mode 100644 3rdparty/bimg/scripts/bimg_decode.lua
 create mode 100644 3rdparty/bimg/scripts/bimg_encode.lua
 create mode 100644 3rdparty/bimg/scripts/genie.lua
 create mode 100644 3rdparty/bimg/scripts/texturec.lua
 create mode 100644 3rdparty/bimg/src/bimg_p.h
 rename 3rdparty/{bgfx => bimg}/src/image.cpp (98%)
 create mode 100644 3rdparty/bimg/src/image_decode.cpp
 create mode 100644 3rdparty/bimg/src/image_encode.cpp
 create mode 100644 3rdparty/bimg/tools/texturec/texturec.cpp

diff --git a/3rdparty/bgfx/3rdparty/glslang/SPIRV/GlslangToSpv.cpp b/3rdparty/bgfx/3rdparty/glslang/SPIRV/GlslangToSpv.cpp
index cdfb309..ebb7230 100755
--- a/3rdparty/bgfx/3rdparty/glslang/SPIRV/GlslangToSpv.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/SPIRV/GlslangToSpv.cpp
@@ -222,8 +222,7 @@ spv::SourceLanguage TranslateSourceLanguage(glslang::EShSource source, EProfile
             return spv::SourceLanguageUnknown;
         }
     case glslang::EShSourceHlsl:
-        // Use SourceLanguageUnknown instead of SourceLanguageHLSL for now, until Vulkan knows what HLSL is
-        return spv::SourceLanguageUnknown;
+        return spv::SourceLanguageHLSL;
     default:
         return spv::SourceLanguageUnknown;
     }
@@ -869,14 +868,20 @@ TGlslangToSpvTraverser::TGlslangToSpvTraverser(const glslang::TIntermediate* gls
         builder.addCapability(spv::CapabilityShader);
         break;
 
+    case EShLangTessEvaluation:
     case EShLangTessControl:
         builder.addCapability(spv::CapabilityTessellation);
-        builder.addExecutionMode(shaderEntry, spv::ExecutionModeOutputVertices, glslangIntermediate->getVertices());
-        break;
 
-    case EShLangTessEvaluation:
-        builder.addCapability(spv::CapabilityTessellation);
-        switch (glslangIntermediate->getInputPrimitive()) {
+        glslang::TLayoutGeometry primitive;
+
+        if (glslangIntermediate->getStage() == EShLangTessControl) {
+            builder.addExecutionMode(shaderEntry, spv::ExecutionModeOutputVertices, glslangIntermediate->getVertices());
+            primitive = glslangIntermediate->getOutputPrimitive();
+        } else {
+            primitive = glslangIntermediate->getInputPrimitive();
+        }
+
+        switch (primitive) {
         case glslang::ElgTriangles:           mode = spv::ExecutionModeTriangles;     break;
         case glslang::ElgQuads:               mode = spv::ExecutionModeQuads;         break;
         case glslang::ElgIsolines:            mode = spv::ExecutionModeIsolines;      break;
@@ -1823,7 +1828,15 @@ bool TGlslangToSpvTraverser::visitSelection(glslang::TVisit /* visit */, glslang
         node->getFalseBlock()->traverse(this);
         spv::Id falseValue = accessChainLoad(node->getTrueBlock()->getAsTyped()->getType());
 
-        spv::Id select = builder.createTriOp(spv::OpSelect, convertGlslangToSpvType(node->getType()), condition, trueValue, falseValue);
+        // smear condition to vector, if necessary (AST is always scalar)
+        if (builder.isVector(trueValue))
+            condition = builder.smearScalar(spv::NoPrecision, condition, 
+                                            builder.makeVectorType(builder.makeBoolType(),
+                                                                   builder.getNumComponents(trueValue)));
+
+        spv::Id select = builder.createTriOp(spv::OpSelect,
+                                             convertGlslangToSpvType(node->getType()), condition,
+                                                                     trueValue, falseValue);
         builder.clearAccessChain();
         builder.setAccessChainRValue(select);
     };
@@ -2272,12 +2285,21 @@ bool TGlslangToSpvTraverser::filterMember(const glslang::TType& member)
 {
     auto& extensions = glslangIntermediate->getRequestedExtensions();
 
+    if (member.getFieldName() == "gl_ViewportMask" &&
+        extensions.find("GL_NV_viewport_array2") == extensions.end())
+        return true;
+    if (member.getFieldName() == "gl_SecondaryViewportMaskNV" &&
+        extensions.find("GL_NV_stereo_view_rendering") == extensions.end())
+        return true;
     if (member.getFieldName() == "gl_SecondaryPositionNV" &&
         extensions.find("GL_NV_stereo_view_rendering") == extensions.end())
         return true;
     if (member.getFieldName() == "gl_PositionPerViewNV" &&
         extensions.find("GL_NVX_multiview_per_view_attributes") == extensions.end())
         return true;
+    if (member.getFieldName() == "gl_ViewportMaskPerViewNV" &&
+        extensions.find("GL_NVX_multiview_per_view_attributes") == extensions.end())
+        return true;
 
     return false;
 };
@@ -2705,7 +2727,23 @@ void TGlslangToSpvTraverser::updateMemberOffset(const glslang::TType& /*structTy
     int memberSize;
     int dummyStride;
     int memberAlignment = glslangIntermediate->getBaseAlignment(memberType, memberSize, dummyStride, explicitLayout == glslang::ElpStd140, matrixLayout == glslang::ElmRowMajor);
+
+    // Adjust alignment for HLSL rules
+    if (glslangIntermediate->usingHlslOFfsets() &&
+        ! memberType.isArray() && memberType.isVector()) {
+        int dummySize;
+        int componentAlignment = glslangIntermediate->getBaseAlignmentScalar(memberType, dummySize);
+        if (componentAlignment <= 4)
+            memberAlignment = componentAlignment;
+    }
+
+    // Bump up to member alignment
     glslang::RoundToPow2(currentOffset, memberAlignment);
+
+    // Bump up to vec4 if there is a bad straddle
+    if (glslangIntermediate->improperStraddle(memberType, memberSize, currentOffset))
+        glslang::RoundToPow2(currentOffset, 16);
+
     nextOffset = currentOffset + memberSize;
 }
 
@@ -2777,7 +2815,7 @@ void TGlslangToSpvTraverser::makeFunctions(const glslang::TIntermSequence& glslF
             if (paramType.containsOpaque() ||                                // sampler, etc.
                 (paramType.getBasicType() == glslang::EbtBlock &&
                  paramType.getQualifier().storage == glslang::EvqBuffer) ||  // SSBO
-                 p == 0 && implicitThis)                                     // implicit 'this'
+                (p == 0 && implicitThis))                                    // implicit 'this'
                 typeId = builder.makePointer(TranslateStorageClass(paramType), typeId);
             else if (paramType.getQualifier().storage != glslang::EvqConstReadOnly)
                 typeId = builder.makePointer(spv::StorageClassFunction, typeId);
@@ -4609,6 +4647,9 @@ spv::Id TGlslangToSpvTraverser::createMiscOperation(glslang::TOperator op, spv::
     spv::Id typeId0 = 0;
     if (consumedOperands > 0)
         typeId0 = builder.getTypeId(operands[0]);
+    spv::Id typeId1 = 0;
+    if (consumedOperands > 1)
+        typeId1 = builder.getTypeId(operands[1]);
     spv::Id frexpIntType = 0;
 
     switch (op) {
@@ -4730,13 +4771,22 @@ spv::Id TGlslangToSpvTraverser::createMiscOperation(glslang::TOperator op, spv::
         libCall = spv::GLSLstd450Fma;
         break;
     case glslang::EOpFrexp:
-        libCall = spv::GLSLstd450FrexpStruct;
-        if (builder.getNumComponents(operands[0]) == 1)
-            frexpIntType = builder.makeIntegerType(32, true);
-        else
-            frexpIntType = builder.makeVectorType(builder.makeIntegerType(32, true), builder.getNumComponents(operands[0]));
-        typeId = builder.makeStructResultType(typeId0, frexpIntType);
-        consumedOperands = 1;
+        {
+            libCall = spv::GLSLstd450FrexpStruct;
+            assert(builder.isPointerType(typeId1));
+            typeId1 = builder.getContainedTypeId(typeId1);
+#ifdef AMD_EXTENSIONS
+            int width = builder.getScalarTypeWidth(typeId1);
+#else
+            int width = 32;
+#endif
+            if (builder.getNumComponents(operands[0]) == 1)
+                frexpIntType = builder.makeIntegerType(width, true);
+            else
+                frexpIntType = builder.makeVectorType(builder.makeIntegerType(width, true), builder.getNumComponents(operands[0]));
+            typeId = builder.makeStructResultType(typeId0, frexpIntType);
+            consumedOperands = 1;
+        }
         break;
     case glslang::EOpLdexp:
         libCall = spv::GLSLstd450Ldexp;
@@ -4844,9 +4894,18 @@ spv::Id TGlslangToSpvTraverser::createMiscOperation(glslang::TOperator op, spv::
         builder.createStore(builder.createCompositeExtract(id, typeId0, 1), operands[2]);
         break;
     case glslang::EOpFrexp:
-        assert(operands.size() == 2);
-        builder.createStore(builder.createCompositeExtract(id, frexpIntType, 1), operands[1]);
-        id = builder.createCompositeExtract(id, typeId0, 0);
+        {
+            assert(operands.size() == 2);
+            if (builder.isFloatType(builder.getScalarTypeId(typeId1))) {
+                // "exp" is floating-point type (from HLSL intrinsic)
+                spv::Id member1 = builder.createCompositeExtract(id, frexpIntType, 1);
+                member1 = builder.createUnaryOp(spv::OpConvertSToF, typeId1, member1);
+                builder.createStore(member1, operands[1]);
+            } else
+                // "exp" is integer type (from GLSL built-in function)
+                builder.createStore(builder.createCompositeExtract(id, frexpIntType, 1), operands[1]);
+            id = builder.createCompositeExtract(id, typeId0, 0);
+        }
         break;
     default:
         break;
diff --git a/3rdparty/bgfx/3rdparty/glslang/SPIRV/SpvBuilder.h b/3rdparty/bgfx/3rdparty/glslang/SPIRV/SpvBuilder.h
index 204d3e7..d93174e 100755
--- a/3rdparty/bgfx/3rdparty/glslang/SPIRV/SpvBuilder.h
+++ b/3rdparty/bgfx/3rdparty/glslang/SPIRV/SpvBuilder.h
@@ -134,6 +134,9 @@ public:
     bool isSampledImage(Id resultId) const { return isSampledImageType(getTypeId(resultId)); }
 
     bool isBoolType(Id typeId)         const { return groupedTypes[OpTypeBool].size() > 0 && typeId == groupedTypes[OpTypeBool].back()->getResultId(); }
+    bool isIntType(Id typeId)          const { return getTypeClass(typeId) == OpTypeInt && module.getInstruction(typeId)->getImmediateOperand(1) != 0; }
+    bool isUintType(Id typeId)         const { return getTypeClass(typeId) == OpTypeInt && module.getInstruction(typeId)->getImmediateOperand(1) == 0; }
+    bool isFloatType(Id typeId)        const { return getTypeClass(typeId) == OpTypeFloat; }
     bool isPointerType(Id typeId)      const { return getTypeClass(typeId) == OpTypePointer; }
     bool isScalarType(Id typeId)       const { return getTypeClass(typeId) == OpTypeFloat  || getTypeClass(typeId) == OpTypeInt || getTypeClass(typeId) == OpTypeBool; }
     bool isVectorType(Id typeId)       const { return getTypeClass(typeId) == OpTypeVector; }
@@ -153,6 +156,13 @@ public:
     unsigned int getConstantScalar(Id resultId) const { return module.getInstruction(resultId)->getImmediateOperand(0); }
     StorageClass getStorageClass(Id resultId) const { return getTypeStorageClass(getTypeId(resultId)); }
 
+    int getScalarTypeWidth(Id typeId) const
+    {
+        Id scalarTypeId = getScalarTypeId(typeId);
+        assert(getTypeClass(scalarTypeId) == OpTypeInt || getTypeClass(scalarTypeId) == OpTypeFloat);
+        return module.getInstruction(scalarTypeId)->getImmediateOperand(0);
+    }
+
     int getTypeNumColumns(Id typeId) const
     {
         assert(isMatrixType(typeId));
diff --git a/3rdparty/bgfx/3rdparty/glslang/SPIRV/doc.cpp b/3rdparty/bgfx/3rdparty/glslang/SPIRV/doc.cpp
index a99522b..903421c 100755
--- a/3rdparty/bgfx/3rdparty/glslang/SPIRV/doc.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/SPIRV/doc.cpp
@@ -68,9 +68,9 @@ namespace spv {
 // Also, the ceilings are declared next to these, to help keep them in sync.
 // Ceilings should be
 //  - one more than the maximum value an enumerant takes on, for non-mask enumerants
-//    (for non-sparse enums, this is the number of enumurants)
+//    (for non-sparse enums, this is the number of enumerants)
 //  - the number of bits consumed by the set of masks
-//    (for non-sparse mask enums, this is the number of enumurants)
+//    (for non-sparse mask enums, this is the number of enumerants)
 //
 
 const int SourceLanguageCeiling = 6; // HLSL todo: need official enumerant
diff --git a/3rdparty/bgfx/3rdparty/glslang/SPIRV/spirv.hpp b/3rdparty/bgfx/3rdparty/glslang/SPIRV/spirv.hpp
index 5580c40..91cb59e 100755
--- a/3rdparty/bgfx/3rdparty/glslang/SPIRV/spirv.hpp
+++ b/3rdparty/bgfx/3rdparty/glslang/SPIRV/spirv.hpp
@@ -61,6 +61,7 @@ enum SourceLanguage {
     SourceLanguageGLSL = 2,
     SourceLanguageOpenCL_C = 3,
     SourceLanguageOpenCL_CPP = 4,
+    SourceLanguageHLSL = 5,
     SourceLanguageMax = 0x7fffffff,
 };
 
@@ -137,6 +138,7 @@ enum StorageClass {
     StorageClassPushConstant = 9,
     StorageClassAtomicCounter = 10,
     StorageClassImage = 11,
+    StorageClassStorageBuffer = 12,
     StorageClassMax = 0x7fffffff,
 };
 
@@ -616,12 +618,16 @@ enum Capability {
     CapabilitySubgroupBallotKHR = 4423,
     CapabilityDrawParameters = 4427,
     CapabilitySubgroupVoteKHR = 4431,
+    CapabilityStorageBuffer16BitAccess = 4433,
     CapabilityStorageUniformBufferBlock16 = 4433,
     CapabilityStorageUniform16 = 4434,
+    CapabilityUniformAndStorageBuffer16BitAccess = 4434,
     CapabilityStoragePushConstant16 = 4435,
     CapabilityStorageInputOutput16 = 4436,
     CapabilityDeviceGroup = 4437,
     CapabilityMultiView = 4439,
+    CapabilityVariablePointersStorageBuffer = 4441,
+    CapabilityVariablePointers = 4442,
     CapabilitySampleMaskOverrideCoverageNV = 5249,
     CapabilityGeometryShaderPassthroughNV = 5251,
     CapabilityShaderViewportIndexLayerNV = 5254,
diff --git a/3rdparty/bgfx/3rdparty/glslang/StandAlone/StandAlone.cpp b/3rdparty/bgfx/3rdparty/glslang/StandAlone/StandAlone.cpp
index 0da690e..60dbc4d 100644
--- a/3rdparty/bgfx/3rdparty/glslang/StandAlone/StandAlone.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/StandAlone/StandAlone.cpp
@@ -51,6 +51,8 @@
 #include <cctype>
 #include <cmath>
 #include <array>
+#include <memory>
+#include <thread>
 
 #include "../glslang/OSDependent/osinclude.h"
 
@@ -84,6 +86,7 @@ enum TOptions {
     EOptionFlattenUniformArrays = (1 << 20),
     EOptionNoStorageFormat      = (1 << 21),
     EOptionKeepUncalled         = (1 << 22),
+    EOptionHlslOffsets          = (1 << 23),
 };
 
 //
@@ -150,13 +153,6 @@ void ProcessConfigFile()
         delete[] config;
 }
 
-// thread-safe list of shaders to asynchronously grab and compile
-glslang::TWorklist Worklist;
-
-// array of unique places to leave the shader names and infologs for the asynchronous compiles
-glslang::TWorkItem** Work = 0;
-int NumWorkItems = 0;
-
 int Options = 0;
 const char* ExecutableName = nullptr;
 const char* binaryFileName = nullptr;
@@ -253,7 +249,7 @@ void ProcessBindingBase(int& argc, char**& argv, std::array<unsigned int, EShLan
 //
 // Does not return (it exits) if command-line is fatally flawed.
 //
-void ProcessArguments(int argc, char* argv[])
+void ProcessArguments(std::vector<std::unique_ptr<glslang::TWorkItem>>& workItems, int argc, char* argv[])
 {
     baseSamplerBinding.fill(0);
     baseTextureBinding.fill(0);
@@ -262,10 +258,7 @@ void ProcessArguments(int argc, char* argv[])
     baseSsboBinding.fill(0);
 
     ExecutableName = argv[0];
-    NumWorkItems = argc;  // will include some empties where the '-' options were, but it doesn't matter, they'll be 0
-    Work = new glslang::TWorkItem*[NumWorkItems];
-    for (int w = 0; w < NumWorkItems; ++w)
-        Work[w] = 0;
+    workItems.reserve(argc);
 
     argc--;
     argv++;
@@ -319,8 +312,7 @@ void ProcessArguments(int argc, char* argv[])
                         } else
                             Error("no <C-variable-name> provided for --variable-name");
                         break;
-                    }
-                    else if (lowerword == "source-entrypoint" || // synonyms
+                    } else if (lowerword == "source-entrypoint" || // synonyms
                                lowerword == "sep") {
                         sourceEntryPointName = argv[1];
                         if (argc > 0) {
@@ -332,6 +324,8 @@ void ProcessArguments(int argc, char* argv[])
                     } else if (lowerword == "keep-uncalled" || // synonyms
                                lowerword == "ku") {
                         Options |= EOptionKeepUncalled;
+                    } else if (lowerword == "hlsl-offsets") {
+                        Options |= EOptionHlslOffsets;
                     } else {
                         usage();
                     }
@@ -420,9 +414,7 @@ void ProcessArguments(int argc, char* argv[])
                 Options |= EOptionSuppressInfolog;
                 break;
             case 't':
-                #ifdef _WIN32
-                    Options |= EOptionMultiThreaded;
-                #endif
+                Options |= EOptionMultiThreaded;
                 break;
             case 'v':
                 Options |= EOptionDumpVersions;
@@ -440,8 +432,7 @@ void ProcessArguments(int argc, char* argv[])
         } else {
             std::string name(argv[0]);
             if (! SetConfigFile(name)) {
-                Work[argc] = new glslang::TWorkItem(name);
-                Worklist.add(Work[argc]);
+                workItems.push_back(std::unique_ptr<glslang::TWorkItem>(new glslang::TWorkItem(name)));
             }
         }
     }
@@ -482,20 +473,20 @@ void SetMessageOptions(EShMessages& messages)
         messages = (EShMessages)(messages | EShMsgCascadingErrors);
     if (Options & EOptionKeepUncalled)
         messages = (EShMessages)(messages | EShMsgKeepUncalled);
+    if (Options & EOptionHlslOffsets)
+        messages = (EShMessages)(messages | EShMsgHlslOffsets);
 }
 
 //
 // Thread entry point, for non-linking asynchronous mode.
 //
-// Return 0 for failure, 1 for success.
-//
-unsigned int CompileShaders(void*)
+void CompileShaders(glslang::TWorklist& worklist)
 {
     glslang::TWorkItem* workItem;
-    while (Worklist.remove(workItem)) {
+    while (worklist.remove(workItem)) {
         ShHandle compiler = ShConstructCompiler(FindLanguage(workItem->name), Options);
         if (compiler == 0)
-            return 0;
+            return;
 
         CompileFile(workItem->name.c_str(), compiler);
 
@@ -504,8 +495,6 @@ unsigned int CompileShaders(void*)
 
         ShDestruct(compiler);
     }
-
-    return 0;
 }
 
 // Outputs the given string, but only if it is non-null and non-empty.
@@ -705,7 +694,7 @@ void CompileAndLinkShaderUnits(std::vector<ShaderCompUnit> compUnits)
 // performance and memory testing, the actual compile/link can be put in
 // a loop, independent of processing the work items and file IO.
 //
-void CompileAndLinkShaderFiles()
+void CompileAndLinkShaderFiles(glslang::TWorklist& Worklist)
 {
     std::vector<ShaderCompUnit> compUnits;
 
@@ -747,11 +736,19 @@ void CompileAndLinkShaderFiles()
 
 int C_DECL main(int argc, char* argv[])
 {
-    ProcessArguments(argc, argv);
+    // array of unique places to leave the shader names and infologs for the asynchronous compiles
+    std::vector<std::unique_ptr<glslang::TWorkItem>> workItems;
+    ProcessArguments(workItems, argc, argv);
+
+    glslang::TWorklist workList;
+    std::for_each(workItems.begin(), workItems.end(), [&workList](std::unique_ptr<glslang::TWorkItem>& item) {
+        assert(item);
+        workList.add(item.get());
+    });
 
     if (Options & EOptionDumpConfig) {
         printf("%s", glslang::GetDefaultTBuiltInResourceString().c_str());
-        if (Worklist.empty())
+        if (workList.empty())
             return ESuccess;
     }
 
@@ -766,11 +763,11 @@ int C_DECL main(int argc, char* argv[])
         printf("Khronos Tool ID %d\n", glslang::GetKhronosToolId());
         printf("GL_KHR_vulkan_glsl version %d\n", 100);
         printf("ARB_GL_gl_spirv version %d\n", 100);
-        if (Worklist.empty())
+        if (workList.empty())
             return ESuccess;
     }
 
-    if (Worklist.empty()) {
+    if (workList.empty()) {
         usage();
     }
 
@@ -784,47 +781,42 @@ int C_DECL main(int argc, char* argv[])
     if (Options & EOptionLinkProgram ||
         Options & EOptionOutputPreprocessed) {
         glslang::InitializeProcess();
-        CompileAndLinkShaderFiles();
+        CompileAndLinkShaderFiles(workList);
         glslang::FinalizeProcess();
-        for (int w = 0; w < NumWorkItems; ++w) {
-          if (Work[w]) {
-            delete Work[w];
-          }
-        }
     } else {
         ShInitialize();
 
-        bool printShaderNames = Worklist.size() > 1;
+        bool printShaderNames = workList.size() > 1;
 
-        if (Options & EOptionMultiThreaded) {
-            const int NumThreads = 16;
-            void* threads[NumThreads];
-            for (int t = 0; t < NumThreads; ++t) {
-                threads[t] = glslang::OS_CreateThread(&CompileShaders);
-                if (! threads[t]) {
+        if (Options & EOptionMultiThreaded)
+        {
+            std::array<std::thread, 16> threads;
+            for (unsigned int t = 0; t < threads.size(); ++t)
+            {
+                threads[t] = std::thread(CompileShaders, std::ref(workList));
+                if (threads[t].get_id() == std::thread::id())
+                {
                     printf("Failed to create thread\n");
                     return EFailThreadCreate;
                 }
             }
-            glslang::OS_WaitForAllThreads(threads, NumThreads);
+
+            std::for_each(threads.begin(), threads.end(), [](std::thread& t) { t.join(); });
         } else
-            CompileShaders(0);
+            CompileShaders(workList);
 
         // Print out all the resulting infologs
-        for (int w = 0; w < NumWorkItems; ++w) {
-            if (Work[w]) {
-                if (printShaderNames || Work[w]->results.size() > 0)
-                    PutsIfNonEmpty(Work[w]->name.c_str());
-                PutsIfNonEmpty(Work[w]->results.c_str());
-                delete Work[w];
+        for (size_t w = 0; w < workItems.size(); ++w) {
+            if (workItems[w]) {
+                if (printShaderNames || workItems[w]->results.size() > 0)
+                    PutsIfNonEmpty(workItems[w]->name.c_str());
+                PutsIfNonEmpty(workItems[w]->results.c_str());
             }
         }
 
         ShFinalize();
     }
 
-    delete[] Work;
-
     if (CompileFailed)
         return EFailCompile;
     if (LinkFailed)
@@ -1010,8 +1002,13 @@ void usage()
            "\n"
            "  --keep-uncalled                         don't eliminate uncalled functions when linking\n"
            "  --ku                                    synonym for --keep-uncalled\n"
-           "  --variable-name <name>                  Creates a C header file that contains a uint32_t array named <name> initialized with the shader binary code.\n"
-           "  --vn <name>                             synonym for --variable-name <name>.\n"
+           "\n"
+           "  --variable-name <name>                  Creates a C header file that contains a uint32_t array named <name>\n"
+           "                                          initialized with the shader binary code.\n"
+           "  --vn <name>                             synonym for --variable-name <name>\n"
+           "\n"
+           "  --hlsl-offsets                          Allow block offsets to follow HLSL rules instead of GLSL rules.\n"
+           "                                          Works independently of source language.\n"
            );
 
     exit(EFailUsage);
diff --git a/3rdparty/bgfx/3rdparty/glslang/StandAlone/Worklist.h b/3rdparty/bgfx/3rdparty/glslang/StandAlone/Worklist.h
index 2a14294..91b6f51 100644
--- a/3rdparty/bgfx/3rdparty/glslang/StandAlone/Worklist.h
+++ b/3rdparty/bgfx/3rdparty/glslang/StandAlone/Worklist.h
@@ -36,8 +36,9 @@
 #define WORKLIST_H_INCLUDED
 
 #include "../glslang/OSDependent/osinclude.h"
-#include <string>
 #include <list>
+#include <mutex>
+#include <string>
 
 namespace glslang {
 
@@ -58,24 +59,19 @@ namespace glslang {
 
         void add(TWorkItem* item)
         {
-            GetGlobalLock();
-
+            std::lock_guard<std::mutex> guard(mutex);
             worklist.push_back(item);
-
-            ReleaseGlobalLock();
         }
 
         bool remove(TWorkItem*& item)
         {
-            GetGlobalLock();
+            std::lock_guard<std::mutex> guard(mutex);
 
             if (worklist.empty())
                 return false;
             item = worklist.front();
             worklist.pop_front();
 
-            ReleaseGlobalLock();
-
             return true;
         }
 
@@ -90,6 +86,7 @@ namespace glslang {
         }
 
     protected:
+        std::mutex mutex;
         std::list<TWorkItem*> worklist;
     };
 
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/450.geom b/3rdparty/bgfx/3rdparty/glslang/Test/450.geom
index af67681..83d99aa 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/450.geom
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/450.geom
@@ -8,7 +8,10 @@ out gl_PerVertex {
     float gl_CullDistance[3];
 };
 
+layout(triangles) in;
+
 void main()
 {
+    gl_in[3].gl_Position; // ERROR, out of range
     gl_CullDistance[2] = gl_in[1].gl_CullDistance[2];
 }
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/450.geom.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/450.geom.out
index 5e6c88f..8d313c8 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/450.geom.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/450.geom.out
@@ -1,72 +1,83 @@
 450.geom
 Warning, version 450 is not yet complete; most version-specific features are present, but some are missing.
+ERROR: 0:15: '[' :  array index out of range '3'
+ERROR: 0:15: 'gl_Position' : no such field in structure 
+ERROR: 2 compilation errors.  No code generated.
+
 
 Shader version: 450
 invocations = -1
 max_vertices = -1
-input primitive = none
+input primitive = triangles
 output primitive = none
-0:? Sequence
-0:11  Function Definition: main( ( global void)
-0:11    Function Parameters: 
-0:13    Sequence
-0:13      move second child to first child ( temp float)
-0:13        direct index (layout( stream=0) temp float CullDistance)
-0:13          gl_CullDistance: direct index for structure (layout( stream=0) out 3-element array of float CullDistance)
-0:13            'anon@0' (layout( stream=0) out block{layout( stream=0) out 3-element array of float CullDistance gl_CullDistance})
-0:13            Constant:
-0:13              3 (const uint)
-0:13          Constant:
-0:13            2 (const int)
-0:13        direct index ( temp float CullDistance)
-0:13          gl_CullDistance: direct index for structure ( in 3-element array of float CullDistance)
-0:13            direct index ( temp block{ in 3-element array of float CullDistance gl_CullDistance})
-0:13              'gl_in' ( in implicitly-sized array of block{ in 3-element array of float CullDistance gl_CullDistance})
-0:13              Constant:
-0:13                1 (const int)
-0:13            Constant:
-0:13              0 (const int)
-0:13          Constant:
-0:13            2 (const int)
+ERROR: node is still EOpNull!
+0:13  Function Definition: main( ( global void)
+0:13    Function Parameters: 
+0:15    Sequence
+0:15      direct index ( temp block{ in 3-element array of float CullDistance gl_CullDistance})
+0:15        'gl_in' ( in 3-element array of block{ in 3-element array of float CullDistance gl_CullDistance})
+0:15        Constant:
+0:15          3 (const int)
+0:16      move second child to first child ( temp float)
+0:16        direct index (layout( stream=0) temp float CullDistance)
+0:16          gl_CullDistance: direct index for structure (layout( stream=0) out 3-element array of float CullDistance)
+0:16            'anon@0' (layout( stream=0) out block{layout( stream=0) out 3-element array of float CullDistance gl_CullDistance})
+0:16            Constant:
+0:16              3 (const uint)
+0:16          Constant:
+0:16            2 (const int)
+0:16        direct index ( temp float CullDistance)
+0:16          gl_CullDistance: direct index for structure ( in 3-element array of float CullDistance)
+0:16            direct index ( temp block{ in 3-element array of float CullDistance gl_CullDistance})
+0:16              'gl_in' ( in 3-element array of block{ in 3-element array of float CullDistance gl_CullDistance})
+0:16              Constant:
+0:16                1 (const int)
+0:16            Constant:
+0:16              0 (const int)
+0:16          Constant:
+0:16            2 (const int)
 0:?   Linker Objects
-0:?     'gl_in' ( in implicitly-sized array of block{ in 3-element array of float CullDistance gl_CullDistance})
+0:?     'gl_in' ( in 3-element array of block{ in 3-element array of float CullDistance gl_CullDistance})
 0:?     'anon@0' (layout( stream=0) out block{layout( stream=0) out 3-element array of float CullDistance gl_CullDistance})
 
 
 Linked geometry stage:
 
-ERROR: Linking geometry stage: At least one shader must specify an input layout primitive
 ERROR: Linking geometry stage: At least one shader must specify an output layout primitive
 ERROR: Linking geometry stage: At least one shader must specify a layout(max_vertices = value)
 
 Shader version: 450
 invocations = 1
 max_vertices = -1
-input primitive = none
+input primitive = triangles
 output primitive = none
-0:? Sequence
-0:11  Function Definition: main( ( global void)
-0:11    Function Parameters: 
-0:13    Sequence
-0:13      move second child to first child ( temp float)
-0:13        direct index (layout( stream=0) temp float CullDistance)
-0:13          gl_CullDistance: direct index for structure (layout( stream=0) out 3-element array of float CullDistance)
-0:13            'anon@0' (layout( stream=0) out block{layout( stream=0) out 3-element array of float CullDistance gl_CullDistance})
-0:13            Constant:
-0:13              3 (const uint)
-0:13          Constant:
-0:13            2 (const int)
-0:13        direct index ( temp float CullDistance)
-0:13          gl_CullDistance: direct index for structure ( in 3-element array of float CullDistance)
-0:13            direct index ( temp block{ in 3-element array of float CullDistance gl_CullDistance})
-0:13              'gl_in' ( in 2-element array of block{ in 3-element array of float CullDistance gl_CullDistance})
-0:13              Constant:
-0:13                1 (const int)
-0:13            Constant:
-0:13              0 (const int)
-0:13          Constant:
-0:13            2 (const int)
+ERROR: node is still EOpNull!
+0:13  Function Definition: main( ( global void)
+0:13    Function Parameters: 
+0:15    Sequence
+0:15      direct index ( temp block{ in 3-element array of float CullDistance gl_CullDistance})
+0:15        'gl_in' ( in 3-element array of block{ in 3-element array of float CullDistance gl_CullDistance})
+0:15        Constant:
+0:15          3 (const int)
+0:16      move second child to first child ( temp float)
+0:16        direct index (layout( stream=0) temp float CullDistance)
+0:16          gl_CullDistance: direct index for structure (layout( stream=0) out 3-element array of float CullDistance)
+0:16            'anon@0' (layout( stream=0) out block{layout( stream=0) out 3-element array of float CullDistance gl_CullDistance})
+0:16            Constant:
+0:16              3 (const uint)
+0:16          Constant:
+0:16            2 (const int)
+0:16        direct index ( temp float CullDistance)
+0:16          gl_CullDistance: direct index for structure ( in 3-element array of float CullDistance)
+0:16            direct index ( temp block{ in 3-element array of float CullDistance gl_CullDistance})
+0:16              'gl_in' ( in 3-element array of block{ in 3-element array of float CullDistance gl_CullDistance})
+0:16              Constant:
+0:16                1 (const int)
+0:16            Constant:
+0:16              0 (const int)
+0:16          Constant:
+0:16            2 (const int)
 0:?   Linker Objects
-0:?     'gl_in' ( in 2-element array of block{ in 3-element array of float CullDistance gl_CullDistance})
+0:?     'gl_in' ( in 3-element array of block{ in 3-element array of float CullDistance gl_CullDistance})
 0:?     'anon@0' (layout( stream=0) out block{layout( stream=0) out 3-element array of float CullDistance gl_CullDistance})
 
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.amend.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.amend.frag.out
index a93643e..388d346 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.amend.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.amend.frag.out
@@ -1,5 +1,5 @@
 hlsl.amend.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Sequence
@@ -81,7 +81,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Sequence
@@ -168,6 +168,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "f1"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "f1"
                               Name 6  "@f1("
                               Name 8  "f2("
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.flatten.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.flatten.frag.out
index 1beb71f..7385cea 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.flatten.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.flatten.frag.out
@@ -1,5 +1,5 @@
 hlsl.array.flatten.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:17  Function Definition: TestFn1( ( temp 4-component vector of float)
@@ -173,7 +173,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:17  Function Definition: TestFn1( ( temp 4-component vector of float)
@@ -353,6 +353,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 128
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "TestFn1("
                               Name 22  "TestFn2(t11[3];p1[3];"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.frag.out
index 58fa077..c0c5a13 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.frag.out
@@ -1,5 +1,5 @@
 hlsl.array.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Function Definition: @PixelShaderFunction(i1;vf4[3]; ( temp 4-component vector of float)
@@ -76,7 +76,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Function Definition: @PixelShaderFunction(i1;vf4[3]; ( temp 4-component vector of float)
@@ -158,6 +158,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 68 72 75
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 17  "@PixelShaderFunction(i1;vf4[3];"
                               Name 15  "i"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.implicit-size.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.implicit-size.frag.out
index efe1240..e4854db 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.implicit-size.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.implicit-size.frag.out
@@ -1,5 +1,5 @@
 hlsl.array.implicit-size.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Sequence
@@ -83,7 +83,7 @@ Linked fragment stage:
 
 WARNING: Linking fragment stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Sequence
@@ -171,6 +171,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.multidim.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.multidim.frag.out
index e13399e..72fa2ec 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.multidim.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.array.multidim.frag.out
@@ -1,5 +1,5 @@
 hlsl.array.multidim.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -68,7 +68,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -142,6 +142,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 54
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.assoc.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.assoc.frag.out
index 0a48528..dcefa9c 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.assoc.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.assoc.frag.out
@@ -1,5 +1,5 @@
 hlsl.assoc.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Function Definition: @PixelShaderFunction(vf4;vf4;vf4;vf4;vf4; ( temp 4-component vector of float)
@@ -67,7 +67,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Function Definition: @PixelShaderFunction(vf4;vf4;vf4;vf4;vf4; ( temp 4-component vector of float)
@@ -140,6 +140,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 31 34 37 40 43 46
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 15  "@PixelShaderFunction(vf4;vf4;vf4;vf4;vf4;"
                               Name 10  "a1"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.attribute.expression.comp.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.attribute.expression.comp.out
index bd4e96b..ed50b9c 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.attribute.expression.comp.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.attribute.expression.comp.out
@@ -1,5 +1,5 @@
 hlsl.attribute.expression.comp
-Shader version: 450
+Shader version: 500
 local_size = (4, 6, 8)
 0:? Sequence
 0:9  Function Definition: @main( ( temp 4-component vector of float)
@@ -42,7 +42,7 @@ local_size = (4, 6, 8)
 Linked compute stage:
 
 
-Shader version: 450
+Shader version: 500
 local_size = (4, 6, 8)
 0:? Sequence
 0:9  Function Definition: @main( ( temp 4-component vector of float)
@@ -90,6 +90,7 @@ local_size = (4, 6, 8)
                               MemoryModel Logical GLSL450
                               EntryPoint GLCompute 4  "main" 37
                               ExecutionMode 4 LocalSize 4 6 8
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "@main("
                               Name 13  "x"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.attribute.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.attribute.frag.out
index 7a26416..ccd7693 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.attribute.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.attribute.frag.out
@@ -1,5 +1,5 @@
 hlsl.attribute.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp void)
@@ -9,7 +9,7 @@ gl_FragCoord origin is upper left
 0:11      Test condition and select ( temp void)
 0:11        Condition
 0:11        Constant:
-0:11          0 (const int)
+0:11          false (const bool)
 0:11        true case is null
 0:2  Function Definition: PixelShaderFunction( ( temp void)
 0:2    Function Parameters: 
@@ -26,7 +26,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp void)
@@ -36,7 +36,7 @@ gl_FragCoord origin is upper left
 0:11      Test condition and select ( temp void)
 0:11        Condition
 0:11        Constant:
-0:11          0 (const int)
+0:11          false (const bool)
 0:11        true case is null
 0:2  Function Definition: PixelShaderFunction( ( temp void)
 0:2    Function Parameters: 
@@ -58,6 +58,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 19
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
@@ -71,8 +72,8 @@ gl_FragCoord origin is upper left
                7:             TypeVector 6(float) 4
                8:             TypePointer Function 7(fvec4)
                9:             TypeFunction 2 8(ptr)
-              13:             TypeInt 32 1
-              14:     13(int) Constant 0
+              13:             TypeBool
+              14:    13(bool) ConstantFalse
               18:             TypePointer Input 7(fvec4)
        19(input):     18(ptr) Variable Input
 4(PixelShaderFunction):           2 Function None 3
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.basic.comp.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.basic.comp.out
index 172dfef..9842506 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.basic.comp.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.basic.comp.out
@@ -1,5 +1,5 @@
 hlsl.basic.comp
-Shader version: 450
+Shader version: 500
 local_size = (1, 1, 1)
 0:? Sequence
 0:4  Function Definition: @main(i1;i1; ( temp void)
@@ -31,7 +31,7 @@ local_size = (1, 1, 1)
 Linked compute stage:
 
 
-Shader version: 450
+Shader version: 500
 local_size = (1, 1, 1)
 0:? Sequence
 0:4  Function Definition: @main(i1;i1; ( temp void)
@@ -68,6 +68,7 @@ local_size = (1, 1, 1)
                               MemoryModel Logical GLSL450
                               EntryPoint GLCompute 4  "main" 18 21
                               ExecutionMode 4 LocalSize 1 1 1
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "@main(i1;i1;"
                               Name 9  "dti"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.basic.geom.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.basic.geom.out
index 24250b3..64239c5 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.basic.geom.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.basic.geom.out
@@ -1,5 +1,5 @@
 hlsl.basic.geom
-Shader version: 450
+Shader version: 500
 invocations = -1
 max_vertices = 4
 input primitive = triangles
@@ -43,12 +43,12 @@ output primitive = line_strip
 0:20              0 (const int)
 0:22      Sequence
 0:22        move second child to first child ( temp structure{ temp float myfloat,  temp int something})
-0:22          'OutputStream' ( out structure{ temp float myfloat,  temp int something})
+0:22          'OutputStream' (layout( location=0) out structure{ temp float myfloat,  temp int something})
 0:22          'Vert' ( temp structure{ temp float myfloat,  temp int something})
 0:22        EmitVertex ( temp void)
 0:23      Sequence
 0:23        move second child to first child ( temp structure{ temp float myfloat,  temp int something})
-0:23          'OutputStream' ( out structure{ temp float myfloat,  temp int something})
+0:23          'OutputStream' (layout( location=0) out structure{ temp float myfloat,  temp int something})
 0:23          'Vert' ( temp structure{ temp float myfloat,  temp int something})
 0:23        EmitVertex ( temp void)
 0:24      EndPrimitive ( temp void)
@@ -60,20 +60,21 @@ output primitive = line_strip
 0:?         'VertexID' (layout( location=0) in 3-element array of uint)
 0:16      move second child to first child ( temp 3-element array of uint)
 0:?         'test' ( temp 3-element array of uint)
-0:?         'test' (layout( location=3) in 3-element array of uint)
+0:?         'test' (layout( location=1) in 3-element array of uint)
 0:16      Function Call: @main(u1[3];u1[3];struct-PSInput-f1-i11; ( temp void)
 0:?         'VertexID' ( temp 3-element array of uint)
 0:?         'test' ( temp 3-element array of uint)
 0:?         'OutputStream' ( temp structure{ temp float myfloat,  temp int something})
 0:?   Linker Objects
 0:?     'VertexID' (layout( location=0) in 3-element array of uint)
-0:?     'test' (layout( location=3) in 3-element array of uint)
+0:?     'test' (layout( location=1) in 3-element array of uint)
+0:?     'OutputStream' (layout( location=0) out structure{ temp float myfloat,  temp int something})
 
 
 Linked geometry stage:
 
 
-Shader version: 450
+Shader version: 500
 invocations = 1
 max_vertices = 4
 input primitive = triangles
@@ -117,12 +118,12 @@ output primitive = line_strip
 0:20              0 (const int)
 0:22      Sequence
 0:22        move second child to first child ( temp structure{ temp float myfloat,  temp int something})
-0:22          'OutputStream' ( out structure{ temp float myfloat,  temp int something})
+0:22          'OutputStream' (layout( location=0) out structure{ temp float myfloat,  temp int something})
 0:22          'Vert' ( temp structure{ temp float myfloat,  temp int something})
 0:22        EmitVertex ( temp void)
 0:23      Sequence
 0:23        move second child to first child ( temp structure{ temp float myfloat,  temp int something})
-0:23          'OutputStream' ( out structure{ temp float myfloat,  temp int something})
+0:23          'OutputStream' (layout( location=0) out structure{ temp float myfloat,  temp int something})
 0:23          'Vert' ( temp structure{ temp float myfloat,  temp int something})
 0:23        EmitVertex ( temp void)
 0:24      EndPrimitive ( temp void)
@@ -134,27 +135,29 @@ output primitive = line_strip
 0:?         'VertexID' (layout( location=0) in 3-element array of uint)
 0:16      move second child to first child ( temp 3-element array of uint)
 0:?         'test' ( temp 3-element array of uint)
-0:?         'test' (layout( location=3) in 3-element array of uint)
+0:?         'test' (layout( location=1) in 3-element array of uint)
 0:16      Function Call: @main(u1[3];u1[3];struct-PSInput-f1-i11; ( temp void)
 0:?         'VertexID' ( temp 3-element array of uint)
 0:?         'test' ( temp 3-element array of uint)
 0:?         'OutputStream' ( temp structure{ temp float myfloat,  temp int something})
 0:?   Linker Objects
 0:?     'VertexID' (layout( location=0) in 3-element array of uint)
-0:?     'test' (layout( location=3) in 3-element array of uint)
+0:?     'test' (layout( location=1) in 3-element array of uint)
+0:?     'OutputStream' (layout( location=0) out structure{ temp float myfloat,  temp int something})
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 57
+// Id's are bound by 60
 
                               Capability Geometry
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Geometry 4  "main" 45 48
+                              EntryPoint Geometry 4  "main" 42 47 50
                               ExecutionMode 4 Triangles
                               ExecutionMode 4 Invocations 1
                               ExecutionMode 4 OutputLineStrip
                               ExecutionMode 4 OutputVertices 4
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "PSInput"
                               MemberName 12(PSInput) 0  "myfloat"
@@ -164,16 +167,18 @@ output primitive = line_strip
                               Name 16  "test"
                               Name 17  "OutputStream"
                               Name 20  "Vert"
-                              Name 43  "VertexID"
+                              Name 42  "OutputStream"
                               Name 45  "VertexID"
-                              Name 47  "test"
-                              Name 48  "test"
-                              Name 50  "OutputStream"
-                              Name 51  "param"
+                              Name 47  "VertexID"
+                              Name 49  "test"
+                              Name 50  "test"
+                              Name 52  "OutputStream"
                               Name 53  "param"
                               Name 55  "param"
-                              Decorate 45(VertexID) Location 0
-                              Decorate 48(test) Location 3
+                              Name 57  "param"
+                              Decorate 42(OutputStream) Location 0
+                              Decorate 47(VertexID) Location 0
+                              Decorate 50(test) Location 1
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeInt 32 0
@@ -191,26 +196,30 @@ output primitive = line_strip
               29:     11(int) Constant 2
               34:             TypePointer Function 10(float)
               39:             TypePointer Function 11(int)
-              44:             TypePointer Input 8
-    45(VertexID):     44(ptr) Variable Input
-        48(test):     44(ptr) Variable Input
+              41:             TypePointer Output 12(PSInput)
+42(OutputStream):     41(ptr) Variable Output
+              46:             TypePointer Input 8
+    47(VertexID):     46(ptr) Variable Input
+        50(test):     46(ptr) Variable Input
          4(main):           2 Function None 3
                5:             Label
-    43(VertexID):      9(ptr) Variable Function
-        47(test):      9(ptr) Variable Function
-50(OutputStream):     13(ptr) Variable Function
-       51(param):      9(ptr) Variable Function
+    45(VertexID):      9(ptr) Variable Function
+        49(test):      9(ptr) Variable Function
+52(OutputStream):     13(ptr) Variable Function
        53(param):      9(ptr) Variable Function
-       55(param):     13(ptr) Variable Function
-              46:           8 Load 45(VertexID)
-                              Store 43(VertexID) 46
-              49:           8 Load 48(test)
-                              Store 47(test) 49
-              52:           8 Load 43(VertexID)
-                              Store 51(param) 52
-              54:           8 Load 47(test)
+       55(param):      9(ptr) Variable Function
+       57(param):     13(ptr) Variable Function
+              48:           8 Load 47(VertexID)
+                              Store 45(VertexID) 48
+              51:           8 Load 50(test)
+                              Store 49(test) 51
+              54:           8 Load 45(VertexID)
                               Store 53(param) 54
-              56:           2 FunctionCall 18(@main(u1[3];u1[3];struct-PSInput-f1-i11;) 51(param) 53(param) 55(param)
+              56:           8 Load 49(test)
+                              Store 55(param) 56
+              58:           2 FunctionCall 18(@main(u1[3];u1[3];struct-PSInput-f1-i11;) 53(param) 55(param) 57(param)
+              59: 12(PSInput) Load 57(param)
+                              Store 52(OutputStream) 59
                               Return
                               FunctionEnd
 18(@main(u1[3];u1[3];struct-PSInput-f1-i11;):           2 Function None 14
@@ -235,11 +244,11 @@ output primitive = line_strip
               38:     11(int) Bitcast 37
               40:     39(ptr) AccessChain 20(Vert) 25
                               Store 40 38
-              41: 12(PSInput) Load 20(Vert)
-                              Store 17(OutputStream) 41
+              43: 12(PSInput) Load 20(Vert)
+                              Store 42(OutputStream) 43
                               EmitVertex
-              42: 12(PSInput) Load 20(Vert)
-                              Store 17(OutputStream) 42
+              44: 12(PSInput) Load 20(Vert)
+                              Store 42(OutputStream) 44
                               EmitVertex
                               EndPrimitive
                               Return
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.buffer.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.buffer.frag.out
index c406f24..623480c 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.buffer.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.buffer.frag.out
@@ -1,5 +1,5 @@
 hlsl.buffer.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:30  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -50,7 +50,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:30  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -106,6 +106,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 46 49
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.calculatelod.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.calculatelod.dx10.frag.out
index f018984..535905c 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.calculatelod.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.calculatelod.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.calculatelod.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -179,7 +179,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -367,6 +367,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 140 144
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.calculatelodunclamped.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.calculatelodunclamped.dx10.frag.out
index 4e2e886..f19f5d0 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.calculatelodunclamped.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.calculatelodunclamped.dx10.frag.out
@@ -11,7 +11,7 @@ ERROR: 0:38: '' : unimplemented: CalculateLevelOfDetailUnclamped
 ERROR: 9 compilation errors.  No code generated.
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 ERROR: node is still EOpNull!
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -191,7 +191,7 @@ ERROR: node is still EOpNull!
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 ERROR: node is still EOpNull!
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.cast.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.cast.frag.out
index d17251d..854c94e 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.cast.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.cast.frag.out
@@ -1,5 +1,5 @@
 hlsl.cast.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -37,7 +37,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -80,6 +80,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 32 35
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.comparison.vec.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.comparison.vec.frag.out
index dacdb77..dd363c6 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.comparison.vec.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.comparison.vec.frag.out
@@ -1,5 +1,5 @@
 hlsl.comparison.vec.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: Bug1(vf4; ( temp void)
@@ -132,7 +132,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: Bug1(vf4; ( temp void)
@@ -270,6 +270,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 90
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "Bug1(vf4;"
                               Name 10  "a"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.conditional.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.conditional.frag.out
index cd0abf8..4028cad 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.conditional.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.conditional.frag.out
@@ -1,5 +1,5 @@
 hlsl.conditional.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -55,7 +55,8 @@ gl_FragCoord origin is upper left
 0:12            'a' ( temp int)
 0:12            Test condition and select ( temp int)
 0:12              Condition
-0:12              'b' ( temp int)
+0:12              Convert int to bool ( temp bool)
+0:12                'b' ( temp int)
 0:12              true case
 0:12              move second child to first child ( temp int)
 0:12                'c' ( temp int)
@@ -67,7 +68,8 @@ gl_FragCoord origin is upper left
 0:12          'b' ( temp int)
 0:12          Test condition and select ( temp int)
 0:12            Condition
-0:12            'a' ( temp int)
+0:12            Convert int to bool ( temp bool)
+0:12              'a' ( temp int)
 0:12            true case
 0:12            move second child to first child ( temp int)
 0:12              'd' ( temp int)
@@ -123,7 +125,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -179,7 +181,8 @@ gl_FragCoord origin is upper left
 0:12            'a' ( temp int)
 0:12            Test condition and select ( temp int)
 0:12              Condition
-0:12              'b' ( temp int)
+0:12              Convert int to bool ( temp bool)
+0:12                'b' ( temp int)
 0:12              true case
 0:12              move second child to first child ( temp int)
 0:12                'c' ( temp int)
@@ -191,7 +194,8 @@ gl_FragCoord origin is upper left
 0:12          'b' ( temp int)
 0:12          Test condition and select ( temp int)
 0:12            Condition
-0:12            'a' ( temp int)
+0:12            Convert int to bool ( temp bool)
+0:12              'a' ( temp int)
 0:12            true case
 0:12            move second child to first child ( temp int)
 0:12              'd' ( temp int)
@@ -245,13 +249,14 @@ gl_FragCoord origin is upper left
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 100
+// Id's are bound by 102
 
                               Capability Shader
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Fragment 4  "PixelShaderFunction" 93 96
+                              EntryPoint Fragment 4  "PixelShaderFunction" 95 98
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
@@ -261,13 +266,13 @@ gl_FragCoord origin is upper left
                               Name 21  "d"
                               Name 22  "ret"
                               Name 42  "e"
-                              Name 59  "f"
-                              Name 91  "input"
+                              Name 64  "f"
                               Name 93  "input"
-                              Name 96  "@entryPointOutput"
-                              Name 97  "param"
-                              Decorate 93(input) Location 0
-                              Decorate 96(@entryPointOutput) Location 0
+                              Name 95  "input"
+                              Name 98  "@entryPointOutput"
+                              Name 99  "param"
+                              Decorate 95(input) Location 0
+                              Decorate 98(@entryPointOutput) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -279,27 +284,27 @@ gl_FragCoord origin is upper left
               16:     13(int) Constant 5
               18:     13(int) Constant 6
               20:     13(int) Constant 7
-              49:     13(int) Constant 10
-              57:     13(int) Constant 11
-              61:             TypeInt 32 0
-              62:     61(int) Constant 0
-              63:             TypePointer Function 6(float)
-              66:     61(int) Constant 1
-              69:             TypeBool
-              92:             TypePointer Input 7(fvec4)
-       93(input):     92(ptr) Variable Input
-              95:             TypePointer Output 7(fvec4)
-96(@entryPointOutput):     95(ptr) Variable Output
+              45:             TypeBool
+              46:             TypeInt 32 0
+              47:     46(int) Constant 0
+              53:     13(int) Constant 10
+              62:     13(int) Constant 11
+              66:             TypePointer Function 6(float)
+              69:     46(int) Constant 1
+              94:             TypePointer Input 7(fvec4)
+       95(input):     94(ptr) Variable Input
+              97:             TypePointer Output 7(fvec4)
+98(@entryPointOutput):     97(ptr) Variable Output
 4(PixelShaderFunction):           2 Function None 3
                5:             Label
-       91(input):      8(ptr) Variable Function
-       97(param):      8(ptr) Variable Function
-              94:    7(fvec4) Load 93(input)
-                              Store 91(input) 94
-              98:    7(fvec4) Load 91(input)
-                              Store 97(param) 98
-              99:    7(fvec4) FunctionCall 11(@PixelShaderFunction(vf4;) 97(param)
-                              Store 96(@entryPointOutput) 99
+       93(input):      8(ptr) Variable Function
+       99(param):      8(ptr) Variable Function
+              96:    7(fvec4) Load 95(input)
+                              Store 93(input) 96
+             100:    7(fvec4) Load 93(input)
+                              Store 99(param) 100
+             101:    7(fvec4) FunctionCall 11(@PixelShaderFunction(vf4;) 99(param)
+                              Store 98(@entryPointOutput) 101
                               Return
                               FunctionEnd
 11(@PixelShaderFunction(vf4;):    7(fvec4) Function None 9
@@ -312,9 +317,9 @@ gl_FragCoord origin is upper left
          22(ret):      8(ptr) Variable Function
            42(e):     14(ptr) Variable Function
               43:     14(ptr) Variable Function
-              51:     14(ptr) Variable Function
-           59(f):      8(ptr) Variable Function
-              60:      8(ptr) Variable Function
+              55:     14(ptr) Variable Function
+           64(f):      8(ptr) Variable Function
+              65:      8(ptr) Variable Function
                               Store 15(a) 16
                               Store 17(b) 18
                               Store 19(c) 20
@@ -340,63 +345,65 @@ gl_FragCoord origin is upper left
               41:    7(fvec4) FAdd 36 40
                               Store 22(ret) 41
               44:     13(int) Load 17(b)
-                              SelectionMerge 46 None
-                              BranchConditional 44 45 48
-              45:               Label
-              47:     13(int)   Load 21(d)
-                                Store 19(c) 47
-                                Store 43 47
-                                Branch 46
-              48:               Label
-                                Store 43 49
-                                Branch 46
-              46:             Label
-              50:     13(int) Load 43
-                              Store 15(a) 50
-                              Store 42(e) 50
-              52:     13(int) Load 15(a)
-                              SelectionMerge 54 None
-                              BranchConditional 52 53 56
-              53:               Label
-              55:     13(int)   Load 19(c)
-                                Store 21(d) 55
-                                Store 51 55
-                                Branch 54
-              56:               Label
-                                Store 51 57
-                                Branch 54
-              54:             Label
-              58:     13(int) Load 51
-                              Store 17(b) 58
-              64:     63(ptr) AccessChain 22(ret) 62
-              65:    6(float) Load 64
-              67:     63(ptr) AccessChain 10(input) 66
+              48:    45(bool) INotEqual 44 47
+                              SelectionMerge 50 None
+                              BranchConditional 48 49 52
+              49:               Label
+              51:     13(int)   Load 21(d)
+                                Store 19(c) 51
+                                Store 43 51
+                                Branch 50
+              52:               Label
+                                Store 43 53
+                                Branch 50
+              50:             Label
+              54:     13(int) Load 43
+                              Store 15(a) 54
+                              Store 42(e) 54
+              56:     13(int) Load 15(a)
+              57:    45(bool) INotEqual 56 47
+                              SelectionMerge 59 None
+                              BranchConditional 57 58 61
+              58:               Label
+              60:     13(int)   Load 19(c)
+                                Store 21(d) 60
+                                Store 55 60
+                                Branch 59
+              61:               Label
+                                Store 55 62
+                                Branch 59
+              59:             Label
+              63:     13(int) Load 55
+                              Store 17(b) 63
+              67:     66(ptr) AccessChain 22(ret) 47
               68:    6(float) Load 67
-              70:    69(bool) FOrdLessThan 65 68
-                              SelectionMerge 72 None
-                              BranchConditional 70 71 77
-              71:               Label
-              73:     13(int)   Load 19(c)
-              74:    6(float)   ConvertSToF 73
-              75:    7(fvec4)   Load 10(input)
-              76:    7(fvec4)   VectorTimesScalar 75 74
-                                Store 60 76
-                                Branch 72
-              77:               Label
-              78:     13(int)   Load 21(d)
-              79:    6(float)   ConvertSToF 78
-              80:    7(fvec4)   Load 10(input)
-              81:    7(fvec4)   VectorTimesScalar 80 79
-                                Store 60 81
-                                Branch 72
-              72:             Label
-              82:    7(fvec4) Load 60
-                              Store 59(f) 82
-              83:     13(int) Load 42(e)
-              84:    6(float) ConvertSToF 83
-              85:    7(fvec4) Load 22(ret)
-              86:    7(fvec4) VectorTimesScalar 85 84
-              87:    7(fvec4) Load 59(f)
-              88:    7(fvec4) FAdd 86 87
-                              ReturnValue 88
+              70:     66(ptr) AccessChain 10(input) 69
+              71:    6(float) Load 70
+              72:    45(bool) FOrdLessThan 68 71
+                              SelectionMerge 74 None
+                              BranchConditional 72 73 79
+              73:               Label
+              75:     13(int)   Load 19(c)
+              76:    6(float)   ConvertSToF 75
+              77:    7(fvec4)   Load 10(input)
+              78:    7(fvec4)   VectorTimesScalar 77 76
+                                Store 65 78
+                                Branch 74
+              79:               Label
+              80:     13(int)   Load 21(d)
+              81:    6(float)   ConvertSToF 80
+              82:    7(fvec4)   Load 10(input)
+              83:    7(fvec4)   VectorTimesScalar 82 81
+                                Store 65 83
+                                Branch 74
+              74:             Label
+              84:    7(fvec4) Load 65
+                              Store 64(f) 84
+              85:     13(int) Load 42(e)
+              86:    6(float) ConvertSToF 85
+              87:    7(fvec4) Load 22(ret)
+              88:    7(fvec4) VectorTimesScalar 87 86
+              89:    7(fvec4) Load 64(f)
+              90:    7(fvec4) FAdd 88 89
+                              ReturnValue 90
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.constructexpr.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.constructexpr.frag.out
index eed1694..90c667a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.constructexpr.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.constructexpr.frag.out
@@ -1,5 +1,5 @@
 hlsl.constructexpr.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -53,7 +53,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -112,6 +112,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 37
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.deadFunctionMissingBody.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.deadFunctionMissingBody.vert.out
index 24e6982..3d493f7 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.deadFunctionMissingBody.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.deadFunctionMissingBody.vert.out
@@ -7,6 +7,7 @@ hlsl.deadFunctionMissingBody.vert
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 16
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "@main("
                               Name 16  "@entryPointOutput"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.depthGreater.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.depthGreater.frag.out
index 8ff7e73..f1bbb5b 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.depthGreater.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.depthGreater.frag.out
@@ -1,5 +1,5 @@
 hlsl.depthGreater.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 using depth_greater
 0:? Sequence
@@ -26,7 +26,7 @@ using depth_greater
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 using depth_greater
 0:? Sequence
@@ -59,6 +59,7 @@ using depth_greater
                               EntryPoint Fragment 4  "PixelShaderFunction" 18
                               ExecutionMode 4 OriginUpperLeft
                               ExecutionMode 4 DepthGreater
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 10  "@PixelShaderFunction(f1;"
                               Name 9  "depth"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.depthLess.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.depthLess.frag.out
index ef81b4a..7a3e926 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.depthLess.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.depthLess.frag.out
@@ -1,5 +1,5 @@
 hlsl.depthLess.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 using depth_less
 0:? Sequence
@@ -22,7 +22,7 @@ using depth_less
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 using depth_less
 0:? Sequence
@@ -51,6 +51,7 @@ using depth_less
                               EntryPoint Fragment 4  "PixelShaderFunction" 14
                               ExecutionMode 4 OriginUpperLeft
                               ExecutionMode 4 DepthLess
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 8  "@PixelShaderFunction("
                               Name 14  "@entryPointOutput"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.discard.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.discard.frag.out
index 6baea90..508ac5a 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.discard.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.discard.frag.out
@@ -1,5 +1,5 @@
 hlsl.discard.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: foo(f1; ( temp void)
@@ -25,10 +25,11 @@ gl_FragCoord origin is upper left
 0:9            2 (const int)
 0:10      Test condition and select ( temp void)
 0:10        Condition
-0:10        direct index ( temp float)
-0:10          'input' ( in 4-component vector of float)
-0:10          Constant:
-0:10            0 (const int)
+0:10        Convert float to bool ( temp bool)
+0:10          direct index ( temp float)
+0:10            'input' ( in 4-component vector of float)
+0:10            Constant:
+0:10              0 (const int)
 0:10        true case
 0:11        Branch: Kill
 0:12      Sequence
@@ -54,7 +55,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: foo(f1; ( temp void)
@@ -80,10 +81,11 @@ gl_FragCoord origin is upper left
 0:9            2 (const int)
 0:10      Test condition and select ( temp void)
 0:10        Condition
-0:10        direct index ( temp float)
-0:10          'input' ( in 4-component vector of float)
-0:10          Constant:
-0:10            0 (const int)
+0:10        Convert float to bool ( temp bool)
+0:10          direct index ( temp float)
+0:10            'input' ( in 4-component vector of float)
+0:10            Constant:
+0:10              0 (const int)
 0:10        true case
 0:11        Branch: Kill
 0:12      Sequence
@@ -107,24 +109,25 @@ gl_FragCoord origin is upper left
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 48
+// Id's are bound by 50
 
                               Capability Shader
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Fragment 4  "PixelShaderFunction" 43
+                              EntryPoint Fragment 4  "PixelShaderFunction" 45
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 10  "foo(f1;"
                               Name 9  "f"
                               Name 16  "@PixelShaderFunction(vf4;"
                               Name 15  "input"
                               Name 25  "param"
-                              Name 37  "f"
-                              Name 41  "input"
+                              Name 39  "f"
                               Name 43  "input"
-                              Name 45  "param"
-                              Decorate 43(input) Location 0
+                              Name 45  "input"
+                              Name 47  "param"
+                              Decorate 45(input) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -138,17 +141,18 @@ gl_FragCoord origin is upper left
               26:             TypeInt 32 0
               27:     26(int) Constant 2
               31:     26(int) Constant 0
-              42:             TypePointer Input 12(fvec4)
-       43(input):     42(ptr) Variable Input
+              34:    6(float) Constant 0
+              44:             TypePointer Input 12(fvec4)
+       45(input):     44(ptr) Variable Input
 4(PixelShaderFunction):           2 Function None 3
                5:             Label
-       41(input):     13(ptr) Variable Function
-       45(param):     13(ptr) Variable Function
-              44:   12(fvec4) Load 43(input)
-                              Store 41(input) 44
-              46:   12(fvec4) Load 41(input)
-                              Store 45(param) 46
-              47:           2 FunctionCall 16(@PixelShaderFunction(vf4;) 45(param)
+       43(input):     13(ptr) Variable Function
+       47(param):     13(ptr) Variable Function
+              46:   12(fvec4) Load 45(input)
+                              Store 43(input) 46
+              48:   12(fvec4) Load 43(input)
+                              Store 47(param) 48
+              49:           2 FunctionCall 16(@PixelShaderFunction(vf4;) 47(param)
                               Return
                               FunctionEnd
      10(foo(f1;):           2 Function None 8
@@ -167,20 +171,21 @@ gl_FragCoord origin is upper left
        15(input):     13(ptr) FunctionParameter
               17:             Label
        25(param):      7(ptr) Variable Function
-           37(f):      7(ptr) Variable Function
+           39(f):      7(ptr) Variable Function
               28:      7(ptr) AccessChain 15(input) 27
               29:    6(float) Load 28
                               Store 25(param) 29
               30:           2 FunctionCall 10(foo(f1;) 25(param)
               32:      7(ptr) AccessChain 15(input) 31
               33:    6(float) Load 32
-                              SelectionMerge 35 None
-                              BranchConditional 33 34 35
-              34:               Label
+              35:    20(bool) FOrdNotEqual 33 34
+                              SelectionMerge 37 None
+                              BranchConditional 35 36 37
+              36:               Label
                                 Kill
-              35:             Label
-              38:      7(ptr) AccessChain 15(input) 31
-              39:    6(float) Load 38
-                              Store 37(f) 39
+              37:             Label
+              40:      7(ptr) AccessChain 15(input) 31
+              41:    6(float) Load 40
+                              Store 39(f) 41
                               Kill
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.doLoop.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.doLoop.frag.out
index 35cf748..b427965 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.doLoop.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.doLoop.frag.out
@@ -1,5 +1,5 @@
 hlsl.doLoop.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -43,7 +43,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -92,6 +92,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 37 40
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.emptystructreturn.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.emptystructreturn.frag.out
index e0da985..7d11393 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.emptystructreturn.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.emptystructreturn.frag.out
@@ -1,5 +1,5 @@
 hlsl.emptystructreturn.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main(struct-ps_in1; ( temp structure{})
@@ -26,7 +26,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main(struct-ps_in1; ( temp structure{})
@@ -58,6 +58,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 20 23
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 6  "ps_in"
                               Name 8  "ps_out"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.emptystructreturn.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.emptystructreturn.vert.out
index ff89352..64446bd 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.emptystructreturn.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.emptystructreturn.vert.out
@@ -1,5 +1,5 @@
 hlsl.emptystructreturn.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:10  Function Definition: @main(struct-vs_in1; ( temp structure{})
 0:10    Function Parameters: 
@@ -25,7 +25,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:10  Function Definition: @main(struct-vs_in1; ( temp structure{})
 0:10    Function Parameters: 
@@ -55,6 +55,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 20 23
+                              Source HLSL 500
                               Name 4  "main"
                               Name 6  "vs_in"
                               Name 8  "vs_out"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry-in.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry-in.frag.out
index 7c15616..81441cb 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry-in.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry-in.frag.out
@@ -1,5 +1,5 @@
 hlsl.entry-in.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Function Definition: fun(struct-InParam-vf2-vf4-vi21; ( temp float)
@@ -89,7 +89,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Function Definition: fun(struct-InParam-vf2-vf4-vi21; ( temp float)
@@ -184,6 +184,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 56 63 73
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "InParam"
                               MemberName 11(InParam) 0  "v"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry-out.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry-out.frag.out
index f92605f..1324112 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry-out.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry-out.frag.out
@@ -1,5 +1,5 @@
 hlsl.entry-out.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: fun(struct-OutParam-vf2-vi21; ( temp void)
@@ -123,7 +123,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: fun(struct-OutParam-vf2-vi21; ( temp void)
@@ -252,6 +252,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 57 60 73 76 80 83 86
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 10  "OutParam"
                               MemberName 10(OutParam) 0  "v"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry.rename.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry.rename.frag.out
index 293592f..c6a4335 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry.rename.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.entry.rename.frag.out
@@ -1,5 +1,5 @@
 hlsl.entry.rename.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: not_the_entry_point( ( temp void)
@@ -37,7 +37,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: not_the_entry_point( ( temp void)
@@ -80,6 +80,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main_in_spv" 26
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main_in_spv"
                               Name 6  "not_the_entry_point("
                               Name 10  "PS_OUTPUT"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.flatten.return.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.flatten.return.frag.out
index 9b28695..ee2c3f0 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.flatten.return.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.flatten.return.frag.out
@@ -1,5 +1,5 @@
 hlsl.flatten.return.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:11  Function Definition: Func1( ( temp structure{ temp 4-component vector of float color,  temp float other_struct_member1,  temp float other_struct_member2,  temp float other_struct_member3})
@@ -60,7 +60,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:11  Function Definition: Func1( ( temp structure{ temp 4-component vector of float color,  temp float other_struct_member1,  temp float other_struct_member2,  temp float other_struct_member3})
@@ -126,6 +126,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 29 36 41 45
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.float1.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.float1.frag.out
index ad29daa..21b1c19 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.float1.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.float1.frag.out
@@ -1,5 +1,5 @@
 hlsl.float1.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -34,7 +34,7 @@ Linked fragment stage:
 
 WARNING: Linking fragment stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -73,6 +73,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "ShaderFunction(vf1;f1;"
                               Name 9  "inFloat1"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.float4.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.float4.frag.out
index d14de7b..4da2a16 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.float4.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.float4.frag.out
@@ -2,7 +2,7 @@ hlsl.float4.frag
 WARNING: 0:5: 'register' : ignoring shader_profile 
 WARNING: 0:6: 'register' : ignoring shader_profile 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: ShaderFunction(vf4; ( temp 4-component vector of float)
@@ -24,7 +24,7 @@ Linked fragment stage:
 
 WARNING: Linking fragment stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: ShaderFunction(vf4; ( temp 4-component vector of float)
@@ -50,6 +50,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "ShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.forLoop.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.forLoop.frag.out
index a14b9ac..de1f1c0 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.forLoop.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.forLoop.frag.out
@@ -1,5 +1,5 @@
 hlsl.forLoop.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -128,7 +128,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -262,6 +262,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 117 120
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.array.dx10.frag.out
index 69f8e3d..5f9a1ca 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gather.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -131,7 +131,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -270,6 +270,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 107 111
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.basic.dx10.frag.out
index 6d2526d..c011f56 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.basic.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gather.basic.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:29  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -129,7 +129,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:29  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -265,6 +265,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 108 112
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.basic.dx10.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.basic.dx10.vert.out
index 9eec1ca..46fb13c 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.basic.dx10.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.basic.dx10.vert.out
@@ -1,5 +1,5 @@
 hlsl.gather.basic.dx10.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:28    Function Parameters: 
@@ -111,7 +111,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:28    Function Parameters: 
@@ -228,6 +228,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 103 128
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_OUTPUT"
                               MemberName 8(VS_OUTPUT) 0  "Pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.offset.dx10.frag.out
index c921313..8ed1c07 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gather.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -104,7 +104,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -215,6 +215,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 79 83
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.offsetarray.dx10.frag.out
index 6b1cac3..73ecae0 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gather.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gather.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -101,7 +101,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -209,6 +209,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 80 84
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.array.dx10.frag.out
index c918d99..675e178 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gatherRGBA.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -375,7 +375,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -758,6 +758,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 238 242
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.basic.dx10.frag.out
index 7bd005a..4d5da52 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.basic.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gatherRGBA.basic.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:34  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -379,7 +379,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:34  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -765,6 +765,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 238 242
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.offset.dx10.frag.out
index 1d3fe46..a4fd588 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gatherRGBA.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:39  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -631,7 +631,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:39  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -1270,6 +1270,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 363 367
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.offsetarray.dx10.frag.out
index 86e0a99..23d26cc 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gatherRGBA.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gatherRGBA.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:33  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -627,7 +627,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:33  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -1263,6 +1263,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 363 367
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gathercmpRGBA.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gathercmpRGBA.offset.dx10.frag.out
index 1c7487d..52fcdb2 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gathercmpRGBA.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.gathercmpRGBA.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.gathercmpRGBA.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -180,7 +180,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -367,6 +367,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 111 115
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.dx10.frag.out
index 30483db..c3986d7 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.getdimensions.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:46  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -1159,7 +1159,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:46  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -2328,6 +2328,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 540 544
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.dx10.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.dx10.vert.out
index 2faa744..cb1eae6 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.dx10.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.dx10.vert.out
@@ -1,5 +1,5 @@
 hlsl.getdimensions.dx10.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:11  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:11    Function Parameters: 
@@ -59,7 +59,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:11  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:11    Function Parameters: 
@@ -125,6 +125,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 42 50
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_OUTPUT"
                               MemberName 8(VS_OUTPUT) 0  "Pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.rw.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.rw.dx10.frag.out
index 781ed90..67b2f8f 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.rw.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getdimensions.rw.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.getdimensions.rw.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:44  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -359,7 +359,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:44  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -727,6 +727,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 216 220
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getsampleposition.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getsampleposition.dx10.frag.out
index 57097a4..4c86b75 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getsampleposition.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.getsampleposition.dx10.frag.out
@@ -4,7 +4,7 @@ ERROR: 0:17: '' : unimplemented: GetSamplePosition
 ERROR: 2 compilation errors.  No code generated.
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 ERROR: node is still EOpNull!
 0:13  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -75,7 +75,7 @@ ERROR: node is still EOpNull!
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 ERROR: node is still EOpNull!
 0:13  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.1.tesc.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.1.tesc.out
index 89ab4e6..3e1ad58 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.1.tesc.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.1.tesc.out
@@ -1,6 +1,7 @@
 hlsl.hull.1.tesc
-Shader version: 450
+Shader version: 500
 vertices = 4
+vertex spacing = equal_spacing
 0:? Sequence
 0:26  Function Definition: @main(struct-VS_OUT-vf31[4];u1; ( temp structure{ temp 3-component vector of float cpoint})
 0:26    Function Parameters: 
@@ -31,7 +32,9 @@ vertices = 4
 0:?         'm_cpid' ( temp uint)
 0:?         'm_cpid' ( in uint InvocationID)
 0:26      move second child to first child ( temp structure{ temp 3-component vector of float cpoint})
-0:?         '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:26        indirect index ( temp structure{ temp 3-component vector of float cpoint})
+0:?           '@entryPointOutput' (layout( location=0) out 4-element array of structure{ temp 3-component vector of float cpoint})
+0:?           'm_cpid' ( in uint InvocationID)
 0:26        Function Call: @main(struct-VS_OUT-vf31[4];u1; ( temp structure{ temp 3-component vector of float cpoint})
 0:?           'ip' ( temp 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?           'm_cpid' ( temp uint)
@@ -50,8 +53,8 @@ vertices = 4
 0:?               'pid' ( in uint PrimitiveID)
 0:?           Sequence
 0:?             move second child to first child ( temp float)
-0:?               direct index ( out float TessLevelOuter)
-0:?                 '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?               direct index ( patch out float TessLevelOuter)
+0:?                 '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 0:?                 Constant:
 0:?                   0 (const int)
 0:?               direct index ( temp float)
@@ -62,8 +65,8 @@ vertices = 4
 0:?                 Constant:
 0:?                   0 (const int)
 0:?             move second child to first child ( temp float)
-0:?               direct index ( out float TessLevelOuter)
-0:?                 '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?               direct index ( patch out float TessLevelOuter)
+0:?                 '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 0:?                 Constant:
 0:?                   1 (const int)
 0:?               direct index ( temp float)
@@ -100,18 +103,20 @@ vertices = 4
 0:38      Branch: Return with expression
 0:38        'output' ( temp structure{ temp 2-element array of float edges})
 0:?   Linker Objects
-0:?     '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:?     '@entryPointOutput' (layout( location=0) out 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'ip' (layout( location=0) in 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'm_cpid' ( in uint InvocationID)
 0:?     'pid' ( in uint PrimitiveID)
-0:?     '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?     '@patchConstantOutput' (layout( location=1) patch out structure{})
+0:?     '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 
 
 Linked tessellation control stage:
 
 
-Shader version: 450
+Shader version: 500
 vertices = 4
+vertex spacing = equal_spacing
 0:? Sequence
 0:26  Function Definition: @main(struct-VS_OUT-vf31[4];u1; ( temp structure{ temp 3-component vector of float cpoint})
 0:26    Function Parameters: 
@@ -142,7 +147,9 @@ vertices = 4
 0:?         'm_cpid' ( temp uint)
 0:?         'm_cpid' ( in uint InvocationID)
 0:26      move second child to first child ( temp structure{ temp 3-component vector of float cpoint})
-0:?         '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:26        indirect index ( temp structure{ temp 3-component vector of float cpoint})
+0:?           '@entryPointOutput' (layout( location=0) out 4-element array of structure{ temp 3-component vector of float cpoint})
+0:?           'm_cpid' ( in uint InvocationID)
 0:26        Function Call: @main(struct-VS_OUT-vf31[4];u1; ( temp structure{ temp 3-component vector of float cpoint})
 0:?           'ip' ( temp 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?           'm_cpid' ( temp uint)
@@ -161,8 +168,8 @@ vertices = 4
 0:?               'pid' ( in uint PrimitiveID)
 0:?           Sequence
 0:?             move second child to first child ( temp float)
-0:?               direct index ( out float TessLevelOuter)
-0:?                 '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?               direct index ( patch out float TessLevelOuter)
+0:?                 '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 0:?                 Constant:
 0:?                   0 (const int)
 0:?               direct index ( temp float)
@@ -173,8 +180,8 @@ vertices = 4
 0:?                 Constant:
 0:?                   0 (const int)
 0:?             move second child to first child ( temp float)
-0:?               direct index ( out float TessLevelOuter)
-0:?                 '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?               direct index ( patch out float TessLevelOuter)
+0:?                 '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 0:?                 Constant:
 0:?                   1 (const int)
 0:?               direct index ( temp float)
@@ -211,21 +218,25 @@ vertices = 4
 0:38      Branch: Return with expression
 0:38        'output' ( temp structure{ temp 2-element array of float edges})
 0:?   Linker Objects
-0:?     '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:?     '@entryPointOutput' (layout( location=0) out 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'ip' (layout( location=0) in 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'm_cpid' ( in uint InvocationID)
 0:?     'pid' ( in uint PrimitiveID)
-0:?     '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?     '@patchConstantOutput' (layout( location=1) patch out structure{})
+0:?     '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 85
+// Id's are bound by 93
 
                               Capability Tessellation
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint TessellationControl 4  "main" 40 44 47 62 67
+                              EntryPoint TessellationControl 4  "main" 40 44 48 66 72 92
                               ExecutionMode 4 OutputVertices 4
+                              ExecutionMode 4 Isolines
+                              ExecutionMode 4 SpacingEqual
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_OUT"
                               MemberName 8(VS_OUT) 0  "cpoint"
@@ -243,19 +254,24 @@ vertices = 4
                               Name 40  "ip"
                               Name 42  "m_cpid"
                               Name 44  "m_cpid"
-                              Name 47  "@entryPointOutput"
-                              Name 48  "param"
+                              Name 48  "@entryPointOutput"
                               Name 50  "param"
-                              Name 61  "@patchConstantResult"
-                              Name 62  "pid"
-                              Name 63  "param"
-                              Name 67  "@patchConstantOutput_edges"
-                              Name 77  "output"
+                              Name 52  "param"
+                              Name 65  "@patchConstantResult"
+                              Name 66  "pid"
+                              Name 67  "param"
+                              Name 72  "@patchConstantOutput_edges"
+                              Name 82  "output"
+                              Name 90  "HS_CONSTANT_OUT"
+                              Name 92  "@patchConstantOutput"
                               Decorate 40(ip) Location 0
                               Decorate 44(m_cpid) BuiltIn InvocationId
-                              Decorate 47(@entryPointOutput) Location 0
-                              Decorate 62(pid) BuiltIn PrimitiveId
-                              Decorate 67(@patchConstantOutput_edges) BuiltIn TessLevelOuter
+                              Decorate 48(@entryPointOutput) Location 0
+                              Decorate 66(pid) BuiltIn PrimitiveId
+                              Decorate 72(@patchConstantOutput_edges) Patch
+                              Decorate 72(@patchConstantOutput_edges) BuiltIn TessLevelOuter
+                              Decorate 92(@patchConstantOutput) Patch
+                              Decorate 92(@patchConstantOutput) Location 1
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -280,58 +296,66 @@ vertices = 4
           40(ip):     39(ptr) Variable Input
               43:             TypePointer Input 9(int)
       44(m_cpid):     43(ptr) Variable Input
-              46:             TypePointer Output 14(HS_OUT)
-47(@entryPointOutput):     46(ptr) Variable Output
-              53:      9(int) Constant 1
-              54:      9(int) Constant 0
-              56:             TypeBool
-              60:             TypePointer Function 22(HS_CONSTANT_OUT)
-         62(pid):     43(ptr) Variable Input
-              66:             TypePointer Output 21
-67(@patchConstantOutput_edges):     66(ptr) Variable Output
-              68:             TypePointer Function 6(float)
-              71:             TypePointer Output 6(float)
-              73:     29(int) Constant 1
-              78:    6(float) Constant 1073741824
-              80:    6(float) Constant 1090519040
+              46:             TypeArray 14(HS_OUT) 10
+              47:             TypePointer Output 46
+48(@entryPointOutput):     47(ptr) Variable Output
+              55:             TypePointer Output 14(HS_OUT)
+              57:      9(int) Constant 1
+              58:      9(int) Constant 0
+              60:             TypeBool
+              64:             TypePointer Function 22(HS_CONSTANT_OUT)
+         66(pid):     43(ptr) Variable Input
+              70:             TypeArray 6(float) 10
+              71:             TypePointer Output 70
+72(@patchConstantOutput_edges):     71(ptr) Variable Output
+              73:             TypePointer Function 6(float)
+              76:             TypePointer Output 6(float)
+              78:     29(int) Constant 1
+              83:    6(float) Constant 1073741824
+              85:    6(float) Constant 1090519040
+90(HS_CONSTANT_OUT):             TypeStruct
+              91:             TypePointer Output 90(HS_CONSTANT_OUT)
+92(@patchConstantOutput):     91(ptr) Variable Output
          4(main):           2 Function None 3
                5:             Label
           38(ip):     12(ptr) Variable Function
       42(m_cpid):     13(ptr) Variable Function
-       48(param):     12(ptr) Variable Function
-       50(param):     13(ptr) Variable Function
-61(@patchConstantResult):     60(ptr) Variable Function
-       63(param):     13(ptr) Variable Function
+       50(param):     12(ptr) Variable Function
+       52(param):     13(ptr) Variable Function
+65(@patchConstantResult):     64(ptr) Variable Function
+       67(param):     13(ptr) Variable Function
               41:          11 Load 40(ip)
                               Store 38(ip) 41
               45:      9(int) Load 44(m_cpid)
                               Store 42(m_cpid) 45
-              49:          11 Load 38(ip)
-                              Store 48(param) 49
-              51:      9(int) Load 42(m_cpid)
+              49:      9(int) Load 44(m_cpid)
+              51:          11 Load 38(ip)
                               Store 50(param) 51
-              52:  14(HS_OUT) FunctionCall 18(@main(struct-VS_OUT-vf31[4];u1;) 48(param) 50(param)
-                              Store 47(@entryPointOutput) 52
-                              ControlBarrier 20 53 54
-              55:      9(int) Load 44(m_cpid)
-              57:    56(bool) IEqual 55 30
-                              SelectionMerge 59 None
-                              BranchConditional 57 58 59
-              58:               Label
-              64:      9(int)   Load 62(pid)
-                                Store 63(param) 64
-              65:22(HS_CONSTANT_OUT)   FunctionCall 25(PCF(u1;) 63(param)
-                                Store 61(@patchConstantResult) 65
-              69:     68(ptr)   AccessChain 61(@patchConstantResult) 30 30
-              70:    6(float)   Load 69
-              72:     71(ptr)   AccessChain 67(@patchConstantOutput_edges) 30
-                                Store 72 70
-              74:     68(ptr)   AccessChain 61(@patchConstantResult) 30 73
+              53:      9(int) Load 42(m_cpid)
+                              Store 52(param) 53
+              54:  14(HS_OUT) FunctionCall 18(@main(struct-VS_OUT-vf31[4];u1;) 50(param) 52(param)
+              56:     55(ptr) AccessChain 48(@entryPointOutput) 49
+                              Store 56 54
+                              ControlBarrier 20 57 58
+              59:      9(int) Load 44(m_cpid)
+              61:    60(bool) IEqual 59 30
+                              SelectionMerge 63 None
+                              BranchConditional 61 62 63
+              62:               Label
+              68:      9(int)   Load 66(pid)
+                                Store 67(param) 68
+              69:22(HS_CONSTANT_OUT)   FunctionCall 25(PCF(u1;) 67(param)
+                                Store 65(@patchConstantResult) 69
+              74:     73(ptr)   AccessChain 65(@patchConstantResult) 30 30
               75:    6(float)   Load 74
-              76:     71(ptr)   AccessChain 67(@patchConstantOutput_edges) 73
-                                Store 76 75
-                                Branch 59
-              59:             Label
+              77:     76(ptr)   AccessChain 72(@patchConstantOutput_edges) 30
+                                Store 77 75
+              79:     73(ptr)   AccessChain 65(@patchConstantResult) 30 78
+              80:    6(float)   Load 79
+              81:     76(ptr)   AccessChain 72(@patchConstantOutput_edges) 78
+                                Store 81 80
+                                Branch 63
+              63:             Label
                               Return
                               FunctionEnd
 18(@main(struct-VS_OUT-vf31[4];u1;):  14(HS_OUT) Function None 15
@@ -349,11 +373,11 @@ vertices = 4
      25(PCF(u1;):22(HS_CONSTANT_OUT) Function None 23
          24(pid):     13(ptr) FunctionParameter
               26:             Label
-      77(output):     60(ptr) Variable Function
-              79:     68(ptr) AccessChain 77(output) 30 30
-                              Store 79 78
-              81:     68(ptr) AccessChain 77(output) 30 73
-                              Store 81 80
-              82:22(HS_CONSTANT_OUT) Load 77(output)
-                              ReturnValue 82
+      82(output):     64(ptr) Variable Function
+              84:     73(ptr) AccessChain 82(output) 30 30
+                              Store 84 83
+              86:     73(ptr) AccessChain 82(output) 30 78
+                              Store 86 85
+              87:22(HS_CONSTANT_OUT) Load 82(output)
+                              ReturnValue 87
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.2.tesc.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.2.tesc.out
index 9d848c6..d32da52 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.2.tesc.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.2.tesc.out
@@ -1,6 +1,7 @@
 hlsl.hull.2.tesc
-Shader version: 450
+Shader version: 500
 vertices = 4
+vertex spacing = equal_spacing
 0:? Sequence
 0:26  Function Definition: @main(struct-VS_OUT-vf31[4]; ( temp structure{ temp 3-component vector of float cpoint})
 0:26    Function Parameters: 
@@ -27,7 +28,9 @@ vertices = 4
 0:?         'ip' ( temp 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?         'ip' (layout( location=0) in 4-element array of structure{ temp 3-component vector of float cpoint})
 0:26      move second child to first child ( temp structure{ temp 3-component vector of float cpoint})
-0:?         '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:26        indirect index ( temp structure{ temp 3-component vector of float cpoint})
+0:?           '@entryPointOutput' (layout( location=0) out 4-element array of structure{ temp 3-component vector of float cpoint})
+0:?           'InvocationId' ( in uint InvocationID)
 0:26        Function Call: @main(struct-VS_OUT-vf31[4]; ( temp structure{ temp 3-component vector of float cpoint})
 0:?           'ip' ( temp 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?       Barrier ( temp void)
@@ -46,8 +49,8 @@ vertices = 4
 0:?               'pos' ( in 4-component vector of float Position)
 0:?           Sequence
 0:?             move second child to first child ( temp float)
-0:?               direct index ( out float TessLevelOuter)
-0:?                 '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?               direct index ( patch out float TessLevelOuter)
+0:?                 '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 0:?                 Constant:
 0:?                   0 (const int)
 0:?               direct index ( temp float)
@@ -58,8 +61,8 @@ vertices = 4
 0:?                 Constant:
 0:?                   0 (const int)
 0:?             move second child to first child ( temp float)
-0:?               direct index ( out float TessLevelOuter)
-0:?                 '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?               direct index ( patch out float TessLevelOuter)
+0:?                 '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 0:?                 Constant:
 0:?                   1 (const int)
 0:?               direct index ( temp float)
@@ -97,19 +100,21 @@ vertices = 4
 0:38      Branch: Return with expression
 0:38        'output' ( temp structure{ temp 2-element array of float edges})
 0:?   Linker Objects
-0:?     '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:?     '@entryPointOutput' (layout( location=0) out 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'ip' (layout( location=0) in 4-element array of structure{ temp 3-component vector of float cpoint})
+0:?     'InvocationId' ( in uint InvocationID)
 0:?     'pid' ( in uint PrimitiveID)
 0:?     'pos' ( in 4-component vector of float Position)
-0:?     'InvocationId' ( in uint InvocationID)
-0:?     '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?     '@patchConstantOutput' (layout( location=1) patch out structure{})
+0:?     '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 
 
 Linked tessellation control stage:
 
 
-Shader version: 450
+Shader version: 500
 vertices = 4
+vertex spacing = equal_spacing
 0:? Sequence
 0:26  Function Definition: @main(struct-VS_OUT-vf31[4]; ( temp structure{ temp 3-component vector of float cpoint})
 0:26    Function Parameters: 
@@ -136,7 +141,9 @@ vertices = 4
 0:?         'ip' ( temp 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?         'ip' (layout( location=0) in 4-element array of structure{ temp 3-component vector of float cpoint})
 0:26      move second child to first child ( temp structure{ temp 3-component vector of float cpoint})
-0:?         '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:26        indirect index ( temp structure{ temp 3-component vector of float cpoint})
+0:?           '@entryPointOutput' (layout( location=0) out 4-element array of structure{ temp 3-component vector of float cpoint})
+0:?           'InvocationId' ( in uint InvocationID)
 0:26        Function Call: @main(struct-VS_OUT-vf31[4]; ( temp structure{ temp 3-component vector of float cpoint})
 0:?           'ip' ( temp 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?       Barrier ( temp void)
@@ -155,8 +162,8 @@ vertices = 4
 0:?               'pos' ( in 4-component vector of float Position)
 0:?           Sequence
 0:?             move second child to first child ( temp float)
-0:?               direct index ( out float TessLevelOuter)
-0:?                 '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?               direct index ( patch out float TessLevelOuter)
+0:?                 '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 0:?                 Constant:
 0:?                   0 (const int)
 0:?               direct index ( temp float)
@@ -167,8 +174,8 @@ vertices = 4
 0:?                 Constant:
 0:?                   0 (const int)
 0:?             move second child to first child ( temp float)
-0:?               direct index ( out float TessLevelOuter)
-0:?                 '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?               direct index ( patch out float TessLevelOuter)
+0:?                 '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 0:?                 Constant:
 0:?                   1 (const int)
 0:?               direct index ( temp float)
@@ -206,22 +213,26 @@ vertices = 4
 0:38      Branch: Return with expression
 0:38        'output' ( temp structure{ temp 2-element array of float edges})
 0:?   Linker Objects
-0:?     '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:?     '@entryPointOutput' (layout( location=0) out 4-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'ip' (layout( location=0) in 4-element array of structure{ temp 3-component vector of float cpoint})
+0:?     'InvocationId' ( in uint InvocationID)
 0:?     'pid' ( in uint PrimitiveID)
 0:?     'pos' ( in 4-component vector of float Position)
-0:?     'InvocationId' ( in uint InvocationID)
-0:?     '@patchConstantOutput_edges' ( out 2-element array of float TessLevelOuter)
+0:?     '@patchConstantOutput' (layout( location=1) patch out structure{})
+0:?     '@patchConstantOutput_edges' ( patch out 4-element array of float TessLevelOuter)
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 87
+// Id's are bound by 95
 
                               Capability Tessellation
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint TessellationControl 4  "main" 42 45 52 60 62 69
+                              EntryPoint TessellationControl 4  "main" 42 46 48 64 66 74 94
                               ExecutionMode 4 OutputVertices 4
+                              ExecutionMode 4 Isolines
+                              ExecutionMode 4 SpacingEqual
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_OUT"
                               MemberName 8(VS_OUT) 0  "cpoint"
@@ -237,22 +248,27 @@ vertices = 4
                               Name 30  "output"
                               Name 40  "ip"
                               Name 42  "ip"
-                              Name 45  "@entryPointOutput"
-                              Name 46  "param"
-                              Name 52  "InvocationId"
-                              Name 59  "@patchConstantResult"
-                              Name 60  "pid"
-                              Name 62  "pos"
-                              Name 63  "param"
-                              Name 65  "param"
-                              Name 69  "@patchConstantOutput_edges"
-                              Name 79  "output"
+                              Name 46  "@entryPointOutput"
+                              Name 48  "InvocationId"
+                              Name 50  "param"
+                              Name 63  "@patchConstantResult"
+                              Name 64  "pid"
+                              Name 66  "pos"
+                              Name 67  "param"
+                              Name 69  "param"
+                              Name 74  "@patchConstantOutput_edges"
+                              Name 84  "output"
+                              Name 92  "HS_CONSTANT_OUT"
+                              Name 94  "@patchConstantOutput"
                               Decorate 42(ip) Location 0
-                              Decorate 45(@entryPointOutput) Location 0
-                              Decorate 52(InvocationId) BuiltIn InvocationId
-                              Decorate 60(pid) BuiltIn PrimitiveId
-                              Decorate 62(pos) BuiltIn Position
-                              Decorate 69(@patchConstantOutput_edges) BuiltIn TessLevelOuter
+                              Decorate 46(@entryPointOutput) Location 0
+                              Decorate 48(InvocationId) BuiltIn InvocationId
+                              Decorate 64(pid) BuiltIn PrimitiveId
+                              Decorate 66(pos) BuiltIn Position
+                              Decorate 74(@patchConstantOutput_edges) Patch
+                              Decorate 74(@patchConstantOutput_edges) BuiltIn TessLevelOuter
+                              Decorate 94(@patchConstantOutput) Patch
+                              Decorate 94(@patchConstantOutput) Location 1
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -277,59 +293,67 @@ vertices = 4
               33:             TypePointer Function 7(fvec3)
               41:             TypePointer Input 11
           42(ip):     41(ptr) Variable Input
-              44:             TypePointer Output 13(HS_OUT)
-45(@entryPointOutput):     44(ptr) Variable Output
-              49:      9(int) Constant 1
-              50:      9(int) Constant 0
-              51:             TypePointer Input 9(int)
-52(InvocationId):     51(ptr) Variable Input
-              54:             TypeBool
-              58:             TypePointer Function 23(HS_CONSTANT_OUT)
-         60(pid):     51(ptr) Variable Input
-              61:             TypePointer Input 19(fvec4)
-         62(pos):     61(ptr) Variable Input
-              68:             TypePointer Output 22
-69(@patchConstantOutput_edges):     68(ptr) Variable Output
-              70:             TypePointer Function 6(float)
-              73:             TypePointer Output 6(float)
-              75:     31(int) Constant 1
-              80:    6(float) Constant 1073741824
-              82:    6(float) Constant 1090519040
+              44:             TypeArray 13(HS_OUT) 10
+              45:             TypePointer Output 44
+46(@entryPointOutput):     45(ptr) Variable Output
+              47:             TypePointer Input 9(int)
+48(InvocationId):     47(ptr) Variable Input
+              53:             TypePointer Output 13(HS_OUT)
+              55:      9(int) Constant 1
+              56:      9(int) Constant 0
+              58:             TypeBool
+              62:             TypePointer Function 23(HS_CONSTANT_OUT)
+         64(pid):     47(ptr) Variable Input
+              65:             TypePointer Input 19(fvec4)
+         66(pos):     65(ptr) Variable Input
+              72:             TypeArray 6(float) 10
+              73:             TypePointer Output 72
+74(@patchConstantOutput_edges):     73(ptr) Variable Output
+              75:             TypePointer Function 6(float)
+              78:             TypePointer Output 6(float)
+              80:     31(int) Constant 1
+              85:    6(float) Constant 1073741824
+              87:    6(float) Constant 1090519040
+92(HS_CONSTANT_OUT):             TypeStruct
+              93:             TypePointer Output 92(HS_CONSTANT_OUT)
+94(@patchConstantOutput):     93(ptr) Variable Output
          4(main):           2 Function None 3
                5:             Label
           40(ip):     12(ptr) Variable Function
-       46(param):     12(ptr) Variable Function
-59(@patchConstantResult):     58(ptr) Variable Function
-       63(param):     18(ptr) Variable Function
-       65(param):     20(ptr) Variable Function
+       50(param):     12(ptr) Variable Function
+63(@patchConstantResult):     62(ptr) Variable Function
+       67(param):     18(ptr) Variable Function
+       69(param):     20(ptr) Variable Function
               43:          11 Load 42(ip)
                               Store 40(ip) 43
-              47:          11 Load 40(ip)
-                              Store 46(param) 47
-              48:  13(HS_OUT) FunctionCall 16(@main(struct-VS_OUT-vf31[4];) 46(param)
-                              Store 45(@entryPointOutput) 48
-                              ControlBarrier 21 49 50
-              53:      9(int) Load 52(InvocationId)
-              55:    54(bool) IEqual 53 32
-                              SelectionMerge 57 None
-                              BranchConditional 55 56 57
-              56:               Label
-              64:      9(int)   Load 60(pid)
-                                Store 63(param) 64
-              66:   19(fvec4)   Load 62(pos)
-                                Store 65(param) 66
-              67:23(HS_CONSTANT_OUT)   FunctionCall 27(PCF(u1;vf4;) 63(param) 65(param)
-                                Store 59(@patchConstantResult) 67
-              71:     70(ptr)   AccessChain 59(@patchConstantResult) 32 32
-              72:    6(float)   Load 71
-              74:     73(ptr)   AccessChain 69(@patchConstantOutput_edges) 32
-                                Store 74 72
-              76:     70(ptr)   AccessChain 59(@patchConstantResult) 32 75
+              49:      9(int) Load 48(InvocationId)
+              51:          11 Load 40(ip)
+                              Store 50(param) 51
+              52:  13(HS_OUT) FunctionCall 16(@main(struct-VS_OUT-vf31[4];) 50(param)
+              54:     53(ptr) AccessChain 46(@entryPointOutput) 49
+                              Store 54 52
+                              ControlBarrier 21 55 56
+              57:      9(int) Load 48(InvocationId)
+              59:    58(bool) IEqual 57 32
+                              SelectionMerge 61 None
+                              BranchConditional 59 60 61
+              60:               Label
+              68:      9(int)   Load 64(pid)
+                                Store 67(param) 68
+              70:   19(fvec4)   Load 66(pos)
+                                Store 69(param) 70
+              71:23(HS_CONSTANT_OUT)   FunctionCall 27(PCF(u1;vf4;) 67(param) 69(param)
+                                Store 63(@patchConstantResult) 71
+              76:     75(ptr)   AccessChain 63(@patchConstantResult) 32 32
               77:    6(float)   Load 76
-              78:     73(ptr)   AccessChain 69(@patchConstantOutput_edges) 75
-                                Store 78 77
-                                Branch 57
-              57:             Label
+              79:     78(ptr)   AccessChain 74(@patchConstantOutput_edges) 32
+                                Store 79 77
+              81:     75(ptr)   AccessChain 63(@patchConstantResult) 32 80
+              82:    6(float)   Load 81
+              83:     78(ptr)   AccessChain 74(@patchConstantOutput_edges) 80
+                                Store 83 82
+                                Branch 61
+              61:             Label
                               Return
                               FunctionEnd
 16(@main(struct-VS_OUT-vf31[4];):  13(HS_OUT) Function None 14
@@ -347,11 +371,11 @@ vertices = 4
          25(pid):     18(ptr) FunctionParameter
          26(pos):     20(ptr) FunctionParameter
               28:             Label
-      79(output):     58(ptr) Variable Function
-              81:     70(ptr) AccessChain 79(output) 32 32
-                              Store 81 80
-              83:     70(ptr) AccessChain 79(output) 32 75
-                              Store 83 82
-              84:23(HS_CONSTANT_OUT) Load 79(output)
-                              ReturnValue 84
+      84(output):     62(ptr) Variable Function
+              86:     75(ptr) AccessChain 84(output) 32 32
+                              Store 86 85
+              88:     75(ptr) AccessChain 84(output) 32 80
+                              Store 88 87
+              89:23(HS_CONSTANT_OUT) Load 84(output)
+                              ReturnValue 89
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.void.tesc.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.void.tesc.out
index a2d0a1c..b6e417a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.void.tesc.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.hull.void.tesc.out
@@ -1,6 +1,7 @@
 hlsl.hull.void.tesc
-Shader version: 450
+Shader version: 500
 vertices = 3
+vertex spacing = fractional_even_spacing
 0:? Sequence
 0:26  Function Definition: @main(struct-VS_OUT-vf31[3]; ( temp structure{ temp 3-component vector of float cpoint})
 0:26    Function Parameters: 
@@ -27,7 +28,9 @@ vertices = 3
 0:?         'ip' ( temp 3-element array of structure{ temp 3-component vector of float cpoint})
 0:?         'ip' (layout( location=0) in 3-element array of structure{ temp 3-component vector of float cpoint})
 0:26      move second child to first child ( temp structure{ temp 3-component vector of float cpoint})
-0:?         '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:26        indirect index ( temp structure{ temp 3-component vector of float cpoint})
+0:?           '@entryPointOutput' (layout( location=0) out 3-element array of structure{ temp 3-component vector of float cpoint})
+0:?           'InvocationId' ( in uint InvocationID)
 0:26        Function Call: @main(struct-VS_OUT-vf31[3]; ( temp structure{ temp 3-component vector of float cpoint})
 0:?           'ip' ( temp 3-element array of structure{ temp 3-component vector of float cpoint})
 0:?       Barrier ( temp void)
@@ -38,11 +41,12 @@ vertices = 3
 0:?           Constant:
 0:?             0 (const int)
 0:?         true case
-0:?         Function Call: PCF( ( temp void)
+0:?         Sequence
+0:?           Function Call: PCF( ( temp void)
 0:33  Function Definition: PCF( ( temp void)
 0:33    Function Parameters: 
 0:?   Linker Objects
-0:?     '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:?     '@entryPointOutput' (layout( location=0) out 3-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'ip' (layout( location=0) in 3-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'InvocationId' ( in uint InvocationID)
 
@@ -50,8 +54,9 @@ vertices = 3
 Linked tessellation control stage:
 
 
-Shader version: 450
+Shader version: 500
 vertices = 3
+vertex spacing = fractional_even_spacing
 0:? Sequence
 0:26  Function Definition: @main(struct-VS_OUT-vf31[3]; ( temp structure{ temp 3-component vector of float cpoint})
 0:26    Function Parameters: 
@@ -78,7 +83,9 @@ vertices = 3
 0:?         'ip' ( temp 3-element array of structure{ temp 3-component vector of float cpoint})
 0:?         'ip' (layout( location=0) in 3-element array of structure{ temp 3-component vector of float cpoint})
 0:26      move second child to first child ( temp structure{ temp 3-component vector of float cpoint})
-0:?         '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:26        indirect index ( temp structure{ temp 3-component vector of float cpoint})
+0:?           '@entryPointOutput' (layout( location=0) out 3-element array of structure{ temp 3-component vector of float cpoint})
+0:?           'InvocationId' ( in uint InvocationID)
 0:26        Function Call: @main(struct-VS_OUT-vf31[3]; ( temp structure{ temp 3-component vector of float cpoint})
 0:?           'ip' ( temp 3-element array of structure{ temp 3-component vector of float cpoint})
 0:?       Barrier ( temp void)
@@ -89,23 +96,27 @@ vertices = 3
 0:?           Constant:
 0:?             0 (const int)
 0:?         true case
-0:?         Function Call: PCF( ( temp void)
+0:?         Sequence
+0:?           Function Call: PCF( ( temp void)
 0:33  Function Definition: PCF( ( temp void)
 0:33    Function Parameters: 
 0:?   Linker Objects
-0:?     '@entryPointOutput' (layout( location=0) out structure{ temp 3-component vector of float cpoint})
+0:?     '@entryPointOutput' (layout( location=0) out 3-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'ip' (layout( location=0) in 3-element array of structure{ temp 3-component vector of float cpoint})
 0:?     'InvocationId' ( in uint InvocationID)
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 51
+// Id's are bound by 55
 
                               Capability Tessellation
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint TessellationControl 4  "main" 33 36 44
+                              EntryPoint TessellationControl 4  "main" 33 37 39
                               ExecutionMode 4 OutputVertices 3
+                              ExecutionMode 4 Triangles
+                              ExecutionMode 4 SpacingFractionalEven
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_OUT"
                               MemberName 8(VS_OUT) 0  "cpoint"
@@ -117,12 +128,12 @@ vertices = 3
                               Name 21  "output"
                               Name 31  "ip"
                               Name 33  "ip"
-                              Name 36  "@entryPointOutput"
-                              Name 37  "param"
-                              Name 44  "InvocationId"
+                              Name 37  "@entryPointOutput"
+                              Name 39  "InvocationId"
+                              Name 41  "param"
                               Decorate 33(ip) Location 0
-                              Decorate 36(@entryPointOutput) Location 0
-                              Decorate 44(InvocationId) BuiltIn InvocationId
+                              Decorate 37(@entryPointOutput) Location 0
+                              Decorate 39(InvocationId) BuiltIn InvocationId
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -140,33 +151,37 @@ vertices = 3
               24:             TypePointer Function 7(fvec3)
               32:             TypePointer Input 11
           33(ip):     32(ptr) Variable Input
-              35:             TypePointer Output 13(HS_OUT)
-36(@entryPointOutput):     35(ptr) Variable Output
-              40:      9(int) Constant 2
-              41:      9(int) Constant 1
-              42:      9(int) Constant 0
-              43:             TypePointer Input 9(int)
-44(InvocationId):     43(ptr) Variable Input
-              46:             TypeBool
+              35:             TypeArray 13(HS_OUT) 10
+              36:             TypePointer Output 35
+37(@entryPointOutput):     36(ptr) Variable Output
+              38:             TypePointer Input 9(int)
+39(InvocationId):     38(ptr) Variable Input
+              44:             TypePointer Output 13(HS_OUT)
+              46:      9(int) Constant 2
+              47:      9(int) Constant 1
+              48:      9(int) Constant 0
+              50:             TypeBool
          4(main):           2 Function None 3
                5:             Label
           31(ip):     12(ptr) Variable Function
-       37(param):     12(ptr) Variable Function
+       41(param):     12(ptr) Variable Function
               34:          11 Load 33(ip)
                               Store 31(ip) 34
-              38:          11 Load 31(ip)
-                              Store 37(param) 38
-              39:  13(HS_OUT) FunctionCall 16(@main(struct-VS_OUT-vf31[3];) 37(param)
-                              Store 36(@entryPointOutput) 39
-                              ControlBarrier 40 41 42
-              45:      9(int) Load 44(InvocationId)
-              47:    46(bool) IEqual 45 23
-                              SelectionMerge 49 None
-                              BranchConditional 47 48 49
-              48:               Label
-              50:           2   FunctionCall 18(PCF()
-                                Branch 49
-              49:             Label
+              40:      9(int) Load 39(InvocationId)
+              42:          11 Load 31(ip)
+                              Store 41(param) 42
+              43:  13(HS_OUT) FunctionCall 16(@main(struct-VS_OUT-vf31[3];) 41(param)
+              45:     44(ptr) AccessChain 37(@entryPointOutput) 40
+                              Store 45 43
+                              ControlBarrier 46 47 48
+              49:      9(int) Load 39(InvocationId)
+              51:    50(bool) IEqual 49 23
+                              SelectionMerge 53 None
+                              BranchConditional 51 52 53
+              52:               Label
+              54:           2   FunctionCall 18(PCF()
+                                Branch 53
+              53:             Label
                               Return
                               FunctionEnd
 16(@main(struct-VS_OUT-vf31[3];):  13(HS_OUT) Function None 14
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.identifier.sample.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.identifier.sample.frag.out
index 3583141..a8870b1 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.identifier.sample.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.identifier.sample.frag.out
@@ -1,5 +1,5 @@
 hlsl.identifier.sample.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: sample(i1; ( temp int)
@@ -44,7 +44,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: sample(i1; ( temp int)
@@ -94,6 +94,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 31
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 10  "sample(i1;"
                               Name 9  "x"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.if.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.if.frag.out
index 6f887ca..89e0bb1 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.if.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.if.frag.out
@@ -1,5 +1,5 @@
 hlsl.if.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -69,12 +69,13 @@ gl_FragCoord origin is upper left
 0:26              'input' ( in 4-component vector of float)
 0:30      Test condition and select ( temp void)
 0:30        Condition
-0:30        move second child to first child ( temp float)
-0:30          'ii' ( temp float)
-0:30          direct index ( temp float)
-0:30            'input' ( in 4-component vector of float)
-0:30            Constant:
-0:30              2 (const int)
+0:30        Convert float to bool ( temp bool)
+0:30          move second child to first child ( temp float)
+0:30            'ii' ( temp float)
+0:30            direct index ( temp float)
+0:30              'input' ( in 4-component vector of float)
+0:30              Constant:
+0:30                2 (const int)
 0:30        true case
 0:31        Pre-Increment ( temp float)
 0:31          'ii' ( temp float)
@@ -108,7 +109,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -178,12 +179,13 @@ gl_FragCoord origin is upper left
 0:26              'input' ( in 4-component vector of float)
 0:30      Test condition and select ( temp void)
 0:30        Condition
-0:30        move second child to first child ( temp float)
-0:30          'ii' ( temp float)
-0:30          direct index ( temp float)
-0:30            'input' ( in 4-component vector of float)
-0:30            Constant:
-0:30              2 (const int)
+0:30        Convert float to bool ( temp bool)
+0:30          move second child to first child ( temp float)
+0:30            'ii' ( temp float)
+0:30            direct index ( temp float)
+0:30              'input' ( in 4-component vector of float)
+0:30              Constant:
+0:30                2 (const int)
 0:30        true case
 0:31        Pre-Increment ( temp float)
 0:31          'ii' ( temp float)
@@ -215,24 +217,25 @@ gl_FragCoord origin is upper left
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 101
+// Id's are bound by 103
 
                               Capability Shader
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Fragment 4  "PixelShaderFunction" 94 97
+                              EntryPoint Fragment 4  "PixelShaderFunction" 96 99
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
                               Name 68  "ii"
-                              Name 80  "ii"
-                              Name 92  "input"
+                              Name 82  "ii"
                               Name 94  "input"
-                              Name 97  "@entryPointOutput"
-                              Name 98  "param"
-                              Decorate 94(input) Location 0
-                              Decorate 97(@entryPointOutput) Location 0
+                              Name 96  "input"
+                              Name 99  "@entryPointOutput"
+                              Name 100  "param"
+                              Decorate 96(input) Location 0
+                              Decorate 99(@entryPointOutput) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -244,31 +247,32 @@ gl_FragCoord origin is upper left
               67:             TypePointer Function 6(float)
               69:             TypeInt 32 0
               70:     69(int) Constant 2
-              76:    6(float) Constant 1065353216
-              78:             TypeInt 32 1
-              79:             TypePointer Function 78(int)
-              82:     78(int) Constant 1
-              93:             TypePointer Input 7(fvec4)
-       94(input):     93(ptr) Variable Input
-              96:             TypePointer Output 7(fvec4)
-97(@entryPointOutput):     96(ptr) Variable Output
+              73:    6(float) Constant 0
+              78:    6(float) Constant 1065353216
+              80:             TypeInt 32 1
+              81:             TypePointer Function 80(int)
+              84:     80(int) Constant 1
+              95:             TypePointer Input 7(fvec4)
+       96(input):     95(ptr) Variable Input
+              98:             TypePointer Output 7(fvec4)
+99(@entryPointOutput):     98(ptr) Variable Output
 4(PixelShaderFunction):           2 Function None 3
                5:             Label
-       92(input):      8(ptr) Variable Function
-       98(param):      8(ptr) Variable Function
-              95:    7(fvec4) Load 94(input)
-                              Store 92(input) 95
-              99:    7(fvec4) Load 92(input)
-                              Store 98(param) 99
-             100:    7(fvec4) FunctionCall 11(@PixelShaderFunction(vf4;) 98(param)
-                              Store 97(@entryPointOutput) 100
+       94(input):      8(ptr) Variable Function
+      100(param):      8(ptr) Variable Function
+              97:    7(fvec4) Load 96(input)
+                              Store 94(input) 97
+             101:    7(fvec4) Load 94(input)
+                              Store 100(param) 101
+             102:    7(fvec4) FunctionCall 11(@PixelShaderFunction(vf4;) 100(param)
+                              Store 99(@entryPointOutput) 102
                               Return
                               FunctionEnd
 11(@PixelShaderFunction(vf4;):    7(fvec4) Function None 9
        10(input):      8(ptr) FunctionParameter
               12:             Label
           68(ii):     67(ptr) Variable Function
-          80(ii):     79(ptr) Variable Function
+          82(ii):     81(ptr) Variable Function
               13:    7(fvec4) Load 10(input)
               14:    7(fvec4) Load 10(input)
               17:   16(bvec4) FOrdEqual 13 14
@@ -338,28 +342,29 @@ gl_FragCoord origin is upper left
               71:     67(ptr) AccessChain 10(input) 70
               72:    6(float) Load 71
                               Store 68(ii) 72
-                              SelectionMerge 74 None
-                              BranchConditional 72 73 74
-              73:               Label
-              75:    6(float)   Load 68(ii)
-              77:    6(float)   FAdd 75 76
-                                Store 68(ii) 77
-                                Branch 74
-              74:             Label
-              81:     78(int) Load 80(ii)
-              83:     78(int) IAdd 81 82
-                              Store 80(ii) 83
-              84:     78(int) Load 80(ii)
-              85:    6(float) ConvertSToF 84
-              86:    15(bool) FOrdEqual 85 76
-                              SelectionMerge 88 None
-                              BranchConditional 86 87 88
-              87:               Label
-              89:     78(int)   Load 80(ii)
-              90:     78(int)   IAdd 89 82
-                                Store 80(ii) 90
-                                Branch 88
-              88:             Label
-              91:    7(fvec4) Undef
-                              ReturnValue 91
+              74:    15(bool) FOrdNotEqual 72 73
+                              SelectionMerge 76 None
+                              BranchConditional 74 75 76
+              75:               Label
+              77:    6(float)   Load 68(ii)
+              79:    6(float)   FAdd 77 78
+                                Store 68(ii) 79
+                                Branch 76
+              76:             Label
+              83:     80(int) Load 82(ii)
+              85:     80(int) IAdd 83 84
+                              Store 82(ii) 85
+              86:     80(int) Load 82(ii)
+              87:    6(float) ConvertSToF 86
+              88:    15(bool) FOrdEqual 87 78
+                              SelectionMerge 90 None
+                              BranchConditional 88 89 90
+              89:               Label
+              91:     80(int)   Load 82(ii)
+              92:     80(int)   IAdd 91 84
+                                Store 82(ii) 92
+                                Branch 90
+              90:             Label
+              93:    7(fvec4) Undef
+                              ReturnValue 93
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.init.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.init.frag.out
index 7ae9475..119d0c5 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.init.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.init.frag.out
@@ -2,7 +2,7 @@ hlsl.init.frag
 WARNING: 0:40: 'typedef' : struct-member initializers ignored 
 WARNING: 0:40: 'typedef' : struct-member initializers ignored 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -168,7 +168,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -339,6 +339,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "ShaderFunction" 98 101
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "ShaderFunction"
                               Name 11  "@ShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.init2.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.init2.frag.out
index 33714aa..45eca98 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.init2.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.init2.frag.out
@@ -1,5 +1,5 @@
 hlsl.init2.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: Test1( ( temp void)
@@ -180,7 +180,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: Test1( ( temp void)
@@ -366,6 +366,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 109
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 6  "Test1("
                               Name 10  "PS_OUTPUT"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.inoutquals.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.inoutquals.frag.out
index 6501703..d5cce21 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.inoutquals.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.inoutquals.frag.out
@@ -1,5 +1,5 @@
 hlsl.inoutquals.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Function Definition: MyFunc(f1;f1;f1; ( temp void)
@@ -95,7 +95,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Function Definition: MyFunc(f1;f1;f1; ( temp void)
@@ -197,6 +197,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 60 70 74 78
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "MyFunc(f1;f1;f1;"
                               Name 9  "x"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.barriers.comp.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.barriers.comp.out
index c82ee0f..f7e3e22 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.barriers.comp.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.barriers.comp.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.barriers.comp
-Shader version: 450
+Shader version: 500
 local_size = (1, 1, 1)
 0:? Sequence
 0:3  Function Definition: @ComputeShaderFunction( ( temp float)
@@ -27,7 +27,7 @@ local_size = (1, 1, 1)
 Linked compute stage:
 
 
-Shader version: 450
+Shader version: 500
 local_size = (1, 1, 1)
 0:? Sequence
 0:3  Function Definition: @ComputeShaderFunction( ( temp float)
@@ -60,6 +60,7 @@ local_size = (1, 1, 1)
                               MemoryModel Logical GLSL450
                               EntryPoint GLCompute 4  "ComputeShaderFunction" 20
                               ExecutionMode 4 LocalSize 1 1 1
+                              Source HLSL 500
                               Name 4  "ComputeShaderFunction"
                               Name 8  "@ComputeShaderFunction("
                               Name 20  "@entryPointOutput"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.comp.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.comp.out
index c4f7ac8..bff1886 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.comp.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.comp.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.comp
-Shader version: 450
+Shader version: 500
 local_size = (1, 1, 1)
 0:? Sequence
 0:17  Function Definition: ComputeShaderFunctionS(f1;f1;f1;u1;u1; ( temp float)
@@ -355,7 +355,7 @@ local_size = (1, 1, 1)
 Linked compute stage:
 
 
-Shader version: 450
+Shader version: 500
 local_size = (1, 1, 1)
 0:? Sequence
 0:17  Function Definition: ComputeShaderFunctionS(f1;f1;f1;u1;u1; ( temp float)
@@ -716,6 +716,7 @@ local_size = (1, 1, 1)
                               MemoryModel Logical GLSL450
                               EntryPoint GLCompute 4  "ComputeShaderFunction" 227 230 233 237 240 243
                               ExecutionMode 4 LocalSize 1 1 1
+                              Source HLSL 500
                               Name 4  "ComputeShaderFunction"
                               Name 16  "ComputeShaderFunctionS(f1;f1;f1;u1;u1;"
                               Name 11  "inF0"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.d3dcolortoubyte4.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.d3dcolortoubyte4.frag.out
index e38e621..94b4ad7 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.d3dcolortoubyte4.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.d3dcolortoubyte4.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.d3dcolortoubyte4.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main( ( temp 4-component vector of int)
@@ -38,7 +38,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main( ( temp 4-component vector of int)
@@ -82,6 +82,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 27
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "@main("
                               Name 14  "$Global"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.double.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.double.frag.out
index 1a6524f..f08775a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.double.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.double.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.double.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @PixelShaderFunction(d1;d1;d1;vd2;vd3;vd4;u1;u1; ( temp float)
@@ -83,7 +83,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @PixelShaderFunction(d1;d1;d1;vd2;vd3;vd4;u1;u1; ( temp float)
@@ -173,6 +173,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 44 47 50 54 58 62 66 69 72
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 26  "@PixelShaderFunction(d1;d1;d1;vd2;vd3;vd4;u1;u1;"
                               Name 18  "inDV1a"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.evalfns.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.evalfns.frag.out
index 6d96f16..90a4db5 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.evalfns.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.evalfns.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.evalfns.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: @main(f1;vf2;vf3;vf4;vi2; ( temp void)
@@ -78,7 +78,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: @main(f1;vf2;vf3;vf4;vi2; ( temp void)
@@ -163,6 +163,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 51 55 59 63 67
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 23  "@main(f1;vf2;vf3;vf4;vi2;"
                               Name 18  "inF1"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.f1632.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.f1632.frag.out
index 6b9a18b..e828d96 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.f1632.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.f1632.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.f1632.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: PixelShaderFunctionS(u1; ( temp float)
@@ -131,7 +131,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: PixelShaderFunctionS(u1; ( temp float)
@@ -268,6 +268,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 101
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "PixelShaderFunctionS(u1;"
                               Name 10  "inF0"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.f3216.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.f3216.frag.out
index 30edb0a..3ff9ce7 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.f3216.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.f3216.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.f3216.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: PixelShaderFunctionS(f1; ( temp uint)
@@ -136,7 +136,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: PixelShaderFunctionS(f1; ( temp uint)
@@ -278,6 +278,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 104
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "PixelShaderFunctionS(f1;"
                               Name 10  "inF0"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.frag.out
index 1dbd218..289592a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:17  Function Definition: PixelShaderFunctionS(f1;f1;f1;u1;u1; ( temp float)
@@ -174,1967 +174,1931 @@ gl_FragCoord origin is upper left
 0:54            'inF0' ( in float)
 0:55      Sequence
 0:55        move second child to first child ( temp float)
-0:55          'r035' ( temp float)
-0:55          frexp ( temp float)
+0:55          'r036' ( temp float)
+0:55          fwidth ( temp float)
 0:55            'inF0' ( in float)
-0:55            'inF1' ( in float)
 0:56      Sequence
-0:56        move second child to first child ( temp float)
-0:56          'r036' ( temp float)
-0:56          fwidth ( temp float)
+0:56        move second child to first child ( temp bool)
+0:56          'r037' ( temp bool)
+0:56          isinf ( temp bool)
 0:56            'inF0' ( in float)
 0:57      Sequence
 0:57        move second child to first child ( temp bool)
-0:57          'r037' ( temp bool)
-0:57          isinf ( temp bool)
+0:57          'r038' ( temp bool)
+0:57          isnan ( temp bool)
 0:57            'inF0' ( in float)
 0:58      Sequence
-0:58        move second child to first child ( temp bool)
-0:58          'r038' ( temp bool)
-0:58          isnan ( temp bool)
+0:58        move second child to first child ( temp float)
+0:58          'r039' ( temp float)
+0:58          ldexp ( temp float)
 0:58            'inF0' ( in float)
+0:58            'inF1' ( in float)
 0:59      Sequence
 0:59        move second child to first child ( temp float)
-0:59          'r039' ( temp float)
-0:59          ldexp ( temp float)
+0:59          'r039a' ( temp float)
+0:59          mix ( temp float)
 0:59            'inF0' ( in float)
 0:59            'inF1' ( in float)
+0:59            'inF2' ( in float)
 0:60      Sequence
 0:60        move second child to first child ( temp float)
-0:60          'r039a' ( temp float)
-0:60          mix ( temp float)
+0:60          'r040' ( temp float)
+0:60          log ( temp float)
 0:60            'inF0' ( in float)
-0:60            'inF1' ( in float)
-0:60            'inF2' ( in float)
 0:61      Sequence
 0:61        move second child to first child ( temp float)
-0:61          'r040' ( temp float)
-0:61          log ( temp float)
-0:61            'inF0' ( in float)
+0:61          'r041' ( temp float)
+0:61          component-wise multiply ( temp float)
+0:61            log2 ( temp float)
+0:61              'inF0' ( in float)
+0:61            Constant:
+0:61              0.301030
 0:62      Sequence
 0:62        move second child to first child ( temp float)
-0:62          'r041' ( temp float)
-0:62          component-wise multiply ( temp float)
-0:62            log2 ( temp float)
-0:62              'inF0' ( in float)
-0:62            Constant:
-0:62              0.301030
+0:62          'r042' ( temp float)
+0:62          log2 ( temp float)
+0:62            'inF0' ( in float)
 0:63      Sequence
 0:63        move second child to first child ( temp float)
-0:63          'r042' ( temp float)
-0:63          log2 ( temp float)
+0:63          'r043' ( temp float)
+0:63          max ( temp float)
 0:63            'inF0' ( in float)
+0:63            'inF1' ( in float)
 0:64      Sequence
 0:64        move second child to first child ( temp float)
-0:64          'r043' ( temp float)
-0:64          max ( temp float)
+0:64          'r044' ( temp float)
+0:64          min ( temp float)
 0:64            'inF0' ( in float)
 0:64            'inF1' ( in float)
 0:65      Sequence
 0:65        move second child to first child ( temp float)
-0:65          'r044' ( temp float)
-0:65          min ( temp float)
+0:65          'r045' ( temp float)
+0:65          pow ( temp float)
 0:65            'inF0' ( in float)
 0:65            'inF1' ( in float)
 0:66      Sequence
 0:66        move second child to first child ( temp float)
-0:66          'r045' ( temp float)
-0:66          pow ( temp float)
+0:66          'r046' ( temp float)
+0:66          radians ( temp float)
 0:66            'inF0' ( in float)
-0:66            'inF1' ( in float)
 0:67      Sequence
 0:67        move second child to first child ( temp float)
-0:67          'r046' ( temp float)
-0:67          radians ( temp float)
+0:67          'r047' ( temp float)
+0:67          divide ( temp float)
+0:67            Constant:
+0:67              1.000000
 0:67            'inF0' ( in float)
 0:68      Sequence
-0:68        move second child to first child ( temp float)
-0:68          'r047' ( temp float)
-0:68          divide ( temp float)
-0:68            Constant:
-0:68              1.000000
-0:68            'inF0' ( in float)
+0:68        move second child to first child ( temp uint)
+0:68          'r048' ( temp uint)
+0:68          Convert int to uint ( temp uint)
+0:68            bitFieldReverse ( temp int)
+0:68              Constant:
+0:68                2 (const int)
 0:69      Sequence
-0:69        move second child to first child ( temp uint)
-0:69          'r048' ( temp uint)
-0:69          Convert int to uint ( temp uint)
-0:69            bitFieldReverse ( temp int)
-0:69              Constant:
-0:69                2 (const int)
+0:69        move second child to first child ( temp float)
+0:69          'r049' ( temp float)
+0:69          roundEven ( temp float)
+0:69            'inF0' ( in float)
 0:70      Sequence
 0:70        move second child to first child ( temp float)
-0:70          'r049' ( temp float)
-0:70          roundEven ( temp float)
+0:70          'r050' ( temp float)
+0:70          inverse sqrt ( temp float)
 0:70            'inF0' ( in float)
 0:71      Sequence
 0:71        move second child to first child ( temp float)
-0:71          'r050' ( temp float)
-0:71          inverse sqrt ( temp float)
+0:71          'r051' ( temp float)
+0:71          clamp ( temp float)
 0:71            'inF0' ( in float)
+0:71            Constant:
+0:71              0.000000
+0:71            Constant:
+0:71              1.000000
 0:72      Sequence
 0:72        move second child to first child ( temp float)
-0:72          'r051' ( temp float)
-0:72          clamp ( temp float)
+0:72          'r052' ( temp float)
+0:72          Sign ( temp float)
 0:72            'inF0' ( in float)
-0:72            Constant:
-0:72              0.000000
-0:72            Constant:
-0:72              1.000000
 0:73      Sequence
 0:73        move second child to first child ( temp float)
-0:73          'r052' ( temp float)
-0:73          Sign ( temp float)
+0:73          'r053' ( temp float)
+0:73          sine ( temp float)
 0:73            'inF0' ( in float)
 0:74      Sequence
 0:74        move second child to first child ( temp float)
-0:74          'r053' ( temp float)
+0:74          'inF1' ( in float)
 0:74          sine ( temp float)
 0:74            'inF0' ( in float)
+0:74        move second child to first child ( temp float)
+0:74          'inF2' ( in float)
+0:74          cosine ( temp float)
+0:74            'inF0' ( in float)
 0:75      Sequence
 0:75        move second child to first child ( temp float)
-0:75          'inF1' ( in float)
-0:75          sine ( temp float)
-0:75            'inF0' ( in float)
-0:75        move second child to first child ( temp float)
-0:75          'inF2' ( in float)
-0:75          cosine ( temp float)
+0:75          'r055' ( temp float)
+0:75          hyp. sine ( temp float)
 0:75            'inF0' ( in float)
 0:76      Sequence
 0:76        move second child to first child ( temp float)
-0:76          'r055' ( temp float)
-0:76          hyp. sine ( temp float)
+0:76          'r056' ( temp float)
+0:76          smoothstep ( temp float)
 0:76            'inF0' ( in float)
+0:76            'inF1' ( in float)
+0:76            'inF2' ( in float)
 0:77      Sequence
 0:77        move second child to first child ( temp float)
-0:77          'r056' ( temp float)
-0:77          smoothstep ( temp float)
+0:77          'r057' ( temp float)
+0:77          sqrt ( temp float)
 0:77            'inF0' ( in float)
-0:77            'inF1' ( in float)
-0:77            'inF2' ( in float)
 0:78      Sequence
 0:78        move second child to first child ( temp float)
-0:78          'r057' ( temp float)
-0:78          sqrt ( temp float)
+0:78          'r058' ( temp float)
+0:78          step ( temp float)
 0:78            'inF0' ( in float)
+0:78            'inF1' ( in float)
 0:79      Sequence
 0:79        move second child to first child ( temp float)
-0:79          'r058' ( temp float)
-0:79          step ( temp float)
+0:79          'r059' ( temp float)
+0:79          tangent ( temp float)
 0:79            'inF0' ( in float)
-0:79            'inF1' ( in float)
 0:80      Sequence
 0:80        move second child to first child ( temp float)
-0:80          'r059' ( temp float)
-0:80          tangent ( temp float)
+0:80          'r060' ( temp float)
+0:80          hyp. tangent ( temp float)
 0:80            'inF0' ( in float)
-0:81      Sequence
-0:81        move second child to first child ( temp float)
-0:81          'r060' ( temp float)
-0:81          hyp. tangent ( temp float)
-0:81            'inF0' ( in float)
-0:83      Sequence
-0:83        move second child to first child ( temp float)
-0:83          'r061' ( temp float)
-0:83          trunc ( temp float)
-0:83            'inF0' ( in float)
-0:85      Branch: Return with expression
-0:85        Constant:
-0:85          0.000000
-0:89  Function Definition: PixelShaderFunction1(vf1;vf1;vf1; ( temp 1-component vector of float)
-0:89    Function Parameters: 
-0:89      'inF0' ( in 1-component vector of float)
-0:89      'inF1' ( in 1-component vector of float)
-0:89      'inF2' ( in 1-component vector of float)
+0:82      Sequence
+0:82        move second child to first child ( temp float)
+0:82          'r061' ( temp float)
+0:82          trunc ( temp float)
+0:82            'inF0' ( in float)
+0:84      Branch: Return with expression
+0:84        Constant:
+0:84          0.000000
+0:88  Function Definition: PixelShaderFunction1(vf1;vf1;vf1; ( temp 1-component vector of float)
+0:88    Function Parameters: 
+0:88      'inF0' ( in 1-component vector of float)
+0:88      'inF1' ( in 1-component vector of float)
+0:88      'inF2' ( in 1-component vector of float)
 0:?     Sequence
-0:91      Branch: Return with expression
-0:91        Constant:
-0:91          0.000000
-0:95  Function Definition: PixelShaderFunction2(vf2;vf2;vf2;vu2;vu2; ( temp 2-component vector of float)
-0:95    Function Parameters: 
-0:95      'inF0' ( in 2-component vector of float)
-0:95      'inF1' ( in 2-component vector of float)
-0:95      'inF2' ( in 2-component vector of float)
-0:95      'inU0' ( in 2-component vector of uint)
-0:95      'inU1' ( in 2-component vector of uint)
+0:90      Branch: Return with expression
+0:90        Constant:
+0:90          0.000000
+0:94  Function Definition: PixelShaderFunction2(vf2;vf2;vf2;vu2;vu2; ( temp 2-component vector of float)
+0:94    Function Parameters: 
+0:94      'inF0' ( in 2-component vector of float)
+0:94      'inF1' ( in 2-component vector of float)
+0:94      'inF2' ( in 2-component vector of float)
+0:94      'inU0' ( in 2-component vector of uint)
+0:94      'inU1' ( in 2-component vector of uint)
 0:?     Sequence
+0:97      Sequence
+0:97        move second child to first child ( temp bool)
+0:97          'r000' ( temp bool)
+0:97          all ( temp bool)
+0:97            'inF0' ( in 2-component vector of float)
 0:98      Sequence
-0:98        move second child to first child ( temp bool)
-0:98          'r000' ( temp bool)
-0:98          all ( temp bool)
+0:98        move second child to first child ( temp 2-component vector of float)
+0:98          'r001' ( temp 2-component vector of float)
+0:98          Absolute value ( temp 2-component vector of float)
 0:98            'inF0' ( in 2-component vector of float)
 0:99      Sequence
 0:99        move second child to first child ( temp 2-component vector of float)
-0:99          'r001' ( temp 2-component vector of float)
-0:99          Absolute value ( temp 2-component vector of float)
+0:99          'r002' ( temp 2-component vector of float)
+0:99          arc cosine ( temp 2-component vector of float)
 0:99            'inF0' ( in 2-component vector of float)
 0:100      Sequence
-0:100        move second child to first child ( temp 2-component vector of float)
-0:100          'r002' ( temp 2-component vector of float)
-0:100          arc cosine ( temp 2-component vector of float)
+0:100        move second child to first child ( temp bool)
+0:100          'r003' ( temp bool)
+0:100          any ( temp bool)
 0:100            'inF0' ( in 2-component vector of float)
 0:101      Sequence
-0:101        move second child to first child ( temp bool)
-0:101          'r003' ( temp bool)
-0:101          any ( temp bool)
+0:101        move second child to first child ( temp 2-component vector of float)
+0:101          'r004' ( temp 2-component vector of float)
+0:101          arc sine ( temp 2-component vector of float)
 0:101            'inF0' ( in 2-component vector of float)
 0:102      Sequence
-0:102        move second child to first child ( temp 2-component vector of float)
-0:102          'r004' ( temp 2-component vector of float)
-0:102          arc sine ( temp 2-component vector of float)
+0:102        move second child to first child ( temp 2-component vector of int)
+0:102          'r005' ( temp 2-component vector of int)
+0:102          floatBitsToInt ( temp 2-component vector of int)
 0:102            'inF0' ( in 2-component vector of float)
 0:103      Sequence
-0:103        move second child to first child ( temp 2-component vector of int)
-0:103          'r005' ( temp 2-component vector of int)
-0:103          floatBitsToInt ( temp 2-component vector of int)
+0:103        move second child to first child ( temp 2-component vector of uint)
+0:103          'r006' ( temp 2-component vector of uint)
+0:103          floatBitsToUint ( temp 2-component vector of uint)
 0:103            'inF0' ( in 2-component vector of float)
 0:104      Sequence
-0:104        move second child to first child ( temp 2-component vector of uint)
-0:104          'r006' ( temp 2-component vector of uint)
-0:104          floatBitsToUint ( temp 2-component vector of uint)
-0:104            'inF0' ( in 2-component vector of float)
-0:105      Sequence
-0:105        move second child to first child ( temp 2-component vector of float)
-0:105          'r007' ( temp 2-component vector of float)
-0:105          intBitsToFloat ( temp 2-component vector of float)
-0:105            'inU0' ( in 2-component vector of uint)
+0:104        move second child to first child ( temp 2-component vector of float)
+0:104          'r007' ( temp 2-component vector of float)
+0:104          intBitsToFloat ( temp 2-component vector of float)
+0:104            'inU0' ( in 2-component vector of uint)
+0:106      Sequence
+0:106        move second child to first child ( temp 2-component vector of float)
+0:106          'r009' ( temp 2-component vector of float)
+0:106          arc tangent ( temp 2-component vector of float)
+0:106            'inF0' ( in 2-component vector of float)
 0:107      Sequence
 0:107        move second child to first child ( temp 2-component vector of float)
-0:107          'r009' ( temp 2-component vector of float)
+0:107          'r010' ( temp 2-component vector of float)
 0:107          arc tangent ( temp 2-component vector of float)
 0:107            'inF0' ( in 2-component vector of float)
+0:107            'inF1' ( in 2-component vector of float)
 0:108      Sequence
 0:108        move second child to first child ( temp 2-component vector of float)
-0:108          'r010' ( temp 2-component vector of float)
-0:108          arc tangent ( temp 2-component vector of float)
+0:108          'r011' ( temp 2-component vector of float)
+0:108          Ceiling ( temp 2-component vector of float)
 0:108            'inF0' ( in 2-component vector of float)
-0:108            'inF1' ( in 2-component vector of float)
 0:109      Sequence
 0:109        move second child to first child ( temp 2-component vector of float)
-0:109          'r011' ( temp 2-component vector of float)
-0:109          Ceiling ( temp 2-component vector of float)
+0:109          'r012' ( temp 2-component vector of float)
+0:109          clamp ( temp 2-component vector of float)
 0:109            'inF0' ( in 2-component vector of float)
-0:110      Sequence
-0:110        move second child to first child ( temp 2-component vector of float)
-0:110          'r012' ( temp 2-component vector of float)
-0:110          clamp ( temp 2-component vector of float)
+0:109            'inF1' ( in 2-component vector of float)
+0:109            'inF2' ( in 2-component vector of float)
+0:110      Test condition and select ( temp void)
+0:110        Condition
+0:110        any ( temp bool)
+0:110          Compare Less Than ( temp 2-component vector of bool)
 0:110            'inF0' ( in 2-component vector of float)
-0:110            'inF1' ( in 2-component vector of float)
-0:110            'inF2' ( in 2-component vector of float)
-0:111      Test condition and select ( temp void)
-0:111        Condition
-0:111        any ( temp bool)
-0:111          Compare Less Than ( temp 2-component vector of bool)
+0:110            Constant:
+0:110              0.000000
+0:110              0.000000
+0:110        true case
+0:110        Branch: Kill
+0:111      Sequence
+0:111        move second child to first child ( temp 2-component vector of float)
+0:111          'r013' ( temp 2-component vector of float)
+0:111          cosine ( temp 2-component vector of float)
 0:111            'inF0' ( in 2-component vector of float)
-0:111            Constant:
-0:111              0.000000
-0:111              0.000000
-0:111        true case
-0:111        Branch: Kill
 0:112      Sequence
 0:112        move second child to first child ( temp 2-component vector of float)
-0:112          'r013' ( temp 2-component vector of float)
-0:112          cosine ( temp 2-component vector of float)
+0:112          'r015' ( temp 2-component vector of float)
+0:112          hyp. cosine ( temp 2-component vector of float)
 0:112            'inF0' ( in 2-component vector of float)
 0:113      Sequence
-0:113        move second child to first child ( temp 2-component vector of float)
-0:113          'r015' ( temp 2-component vector of float)
-0:113          hyp. cosine ( temp 2-component vector of float)
-0:113            'inF0' ( in 2-component vector of float)
-0:114      Sequence
-0:114        move second child to first child ( temp 2-component vector of int)
-0:114          'r016' ( temp 2-component vector of int)
+0:113        move second child to first child ( temp 2-component vector of int)
+0:113          'r016' ( temp 2-component vector of int)
 0:?           bitCount ( temp 2-component vector of int)
 0:?             Constant:
 0:?               7 (const int)
 0:?               3 (const int)
+0:114      Sequence
+0:114        move second child to first child ( temp 2-component vector of float)
+0:114          'r017' ( temp 2-component vector of float)
+0:114          dPdx ( temp 2-component vector of float)
+0:114            'inF0' ( in 2-component vector of float)
 0:115      Sequence
 0:115        move second child to first child ( temp 2-component vector of float)
-0:115          'r017' ( temp 2-component vector of float)
-0:115          dPdx ( temp 2-component vector of float)
+0:115          'r018' ( temp 2-component vector of float)
+0:115          dPdxCoarse ( temp 2-component vector of float)
 0:115            'inF0' ( in 2-component vector of float)
 0:116      Sequence
 0:116        move second child to first child ( temp 2-component vector of float)
-0:116          'r018' ( temp 2-component vector of float)
-0:116          dPdxCoarse ( temp 2-component vector of float)
+0:116          'r019' ( temp 2-component vector of float)
+0:116          dPdxFine ( temp 2-component vector of float)
 0:116            'inF0' ( in 2-component vector of float)
 0:117      Sequence
 0:117        move second child to first child ( temp 2-component vector of float)
-0:117          'r019' ( temp 2-component vector of float)
-0:117          dPdxFine ( temp 2-component vector of float)
+0:117          'r020' ( temp 2-component vector of float)
+0:117          dPdy ( temp 2-component vector of float)
 0:117            'inF0' ( in 2-component vector of float)
 0:118      Sequence
 0:118        move second child to first child ( temp 2-component vector of float)
-0:118          'r020' ( temp 2-component vector of float)
-0:118          dPdy ( temp 2-component vector of float)
+0:118          'r021' ( temp 2-component vector of float)
+0:118          dPdyCoarse ( temp 2-component vector of float)
 0:118            'inF0' ( in 2-component vector of float)
 0:119      Sequence
 0:119        move second child to first child ( temp 2-component vector of float)
-0:119          'r021' ( temp 2-component vector of float)
-0:119          dPdyCoarse ( temp 2-component vector of float)
+0:119          'r022' ( temp 2-component vector of float)
+0:119          dPdyFine ( temp 2-component vector of float)
 0:119            'inF0' ( in 2-component vector of float)
 0:120      Sequence
 0:120        move second child to first child ( temp 2-component vector of float)
-0:120          'r022' ( temp 2-component vector of float)
-0:120          dPdyFine ( temp 2-component vector of float)
+0:120          'r023' ( temp 2-component vector of float)
+0:120          degrees ( temp 2-component vector of float)
 0:120            'inF0' ( in 2-component vector of float)
-0:121      Sequence
-0:121        move second child to first child ( temp 2-component vector of float)
-0:121          'r023' ( temp 2-component vector of float)
-0:121          degrees ( temp 2-component vector of float)
-0:121            'inF0' ( in 2-component vector of float)
+0:124      Sequence
+0:124        move second child to first child ( temp float)
+0:124          'r026' ( temp float)
+0:124          distance ( temp float)
+0:124            'inF0' ( in 2-component vector of float)
+0:124            'inF1' ( in 2-component vector of float)
 0:125      Sequence
 0:125        move second child to first child ( temp float)
-0:125          'r026' ( temp float)
-0:125          distance ( temp float)
+0:125          'r027' ( temp float)
+0:125          dot-product ( temp float)
 0:125            'inF0' ( in 2-component vector of float)
 0:125            'inF1' ( in 2-component vector of float)
-0:126      Sequence
-0:126        move second child to first child ( temp float)
-0:126          'r027' ( temp float)
-0:126          dot-product ( temp float)
-0:126            'inF0' ( in 2-component vector of float)
-0:126            'inF1' ( in 2-component vector of float)
+0:129      Sequence
+0:129        move second child to first child ( temp 2-component vector of float)
+0:129          'r028' ( temp 2-component vector of float)
+0:129          exp ( temp 2-component vector of float)
+0:129            'inF0' ( in 2-component vector of float)
 0:130      Sequence
 0:130        move second child to first child ( temp 2-component vector of float)
-0:130          'r028' ( temp 2-component vector of float)
-0:130          exp ( temp 2-component vector of float)
+0:130          'r029' ( temp 2-component vector of float)
+0:130          exp2 ( temp 2-component vector of float)
 0:130            'inF0' ( in 2-component vector of float)
 0:131      Sequence
 0:131        move second child to first child ( temp 2-component vector of float)
-0:131          'r029' ( temp 2-component vector of float)
-0:131          exp2 ( temp 2-component vector of float)
+0:131          'r030' ( temp 2-component vector of float)
+0:131          face-forward ( temp 2-component vector of float)
 0:131            'inF0' ( in 2-component vector of float)
+0:131            'inF1' ( in 2-component vector of float)
+0:131            'inF2' ( in 2-component vector of float)
 0:132      Sequence
-0:132        move second child to first child ( temp 2-component vector of float)
-0:132          'r030' ( temp 2-component vector of float)
-0:132          face-forward ( temp 2-component vector of float)
-0:132            'inF0' ( in 2-component vector of float)
-0:132            'inF1' ( in 2-component vector of float)
-0:132            'inF2' ( in 2-component vector of float)
-0:133      Sequence
-0:133        move second child to first child ( temp 2-component vector of uint)
-0:133          'r031' ( temp 2-component vector of uint)
+0:132        move second child to first child ( temp 2-component vector of uint)
+0:132          'r031' ( temp 2-component vector of uint)
 0:?           findMSB ( temp 2-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               8 (const uint)
-0:134      Sequence
-0:134        move second child to first child ( temp 2-component vector of uint)
-0:134          'r032' ( temp 2-component vector of uint)
+0:133      Sequence
+0:133        move second child to first child ( temp 2-component vector of uint)
+0:133          'r032' ( temp 2-component vector of uint)
 0:?           findLSB ( temp 2-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               8 (const uint)
-0:135      Sequence
-0:135        move second child to first child ( temp 2-component vector of float)
-0:135          'r033' ( temp 2-component vector of float)
-0:135          Floor ( temp 2-component vector of float)
-0:135            'inF0' ( in 2-component vector of float)
+0:134      Sequence
+0:134        move second child to first child ( temp 2-component vector of float)
+0:134          'r033' ( temp 2-component vector of float)
+0:134          Floor ( temp 2-component vector of float)
+0:134            'inF0' ( in 2-component vector of float)
+0:136      Sequence
+0:136        move second child to first child ( temp 2-component vector of float)
+0:136          'r035' ( temp 2-component vector of float)
+0:136          mod ( temp 2-component vector of float)
+0:136            'inF0' ( in 2-component vector of float)
+0:136            'inF1' ( in 2-component vector of float)
 0:137      Sequence
 0:137        move second child to first child ( temp 2-component vector of float)
-0:137          'r035' ( temp 2-component vector of float)
-0:137          mod ( temp 2-component vector of float)
+0:137          'r036' ( temp 2-component vector of float)
+0:137          Fraction ( temp 2-component vector of float)
 0:137            'inF0' ( in 2-component vector of float)
-0:137            'inF1' ( in 2-component vector of float)
 0:138      Sequence
 0:138        move second child to first child ( temp 2-component vector of float)
-0:138          'r036' ( temp 2-component vector of float)
-0:138          Fraction ( temp 2-component vector of float)
+0:138          'r038' ( temp 2-component vector of float)
+0:138          fwidth ( temp 2-component vector of float)
 0:138            'inF0' ( in 2-component vector of float)
 0:139      Sequence
-0:139        move second child to first child ( temp 2-component vector of float)
-0:139          'r037' ( temp 2-component vector of float)
-0:139          frexp ( temp 2-component vector of float)
+0:139        move second child to first child ( temp 2-component vector of bool)
+0:139          'r039' ( temp 2-component vector of bool)
+0:139          isinf ( temp 2-component vector of bool)
 0:139            'inF0' ( in 2-component vector of float)
-0:139            'inF1' ( in 2-component vector of float)
 0:140      Sequence
-0:140        move second child to first child ( temp 2-component vector of float)
-0:140          'r038' ( temp 2-component vector of float)
-0:140          fwidth ( temp 2-component vector of float)
+0:140        move second child to first child ( temp 2-component vector of bool)
+0:140          'r040' ( temp 2-component vector of bool)
+0:140          isnan ( temp 2-component vector of bool)
 0:140            'inF0' ( in 2-component vector of float)
 0:141      Sequence
-0:141        move second child to first child ( temp 2-component vector of bool)
-0:141          'r039' ( temp 2-component vector of bool)
-0:141          isinf ( temp 2-component vector of bool)
+0:141        move second child to first child ( temp 2-component vector of float)
+0:141          'r041' ( temp 2-component vector of float)
+0:141          ldexp ( temp 2-component vector of float)
 0:141            'inF0' ( in 2-component vector of float)
+0:141            'inF1' ( in 2-component vector of float)
 0:142      Sequence
-0:142        move second child to first child ( temp 2-component vector of bool)
-0:142          'r040' ( temp 2-component vector of bool)
-0:142          isnan ( temp 2-component vector of bool)
+0:142        move second child to first child ( temp 2-component vector of float)
+0:142          'r039a' ( temp 2-component vector of float)
+0:142          mix ( temp 2-component vector of float)
 0:142            'inF0' ( in 2-component vector of float)
+0:142            'inF1' ( in 2-component vector of float)
+0:142            'inF2' ( in 2-component vector of float)
 0:143      Sequence
-0:143        move second child to first child ( temp 2-component vector of float)
-0:143          'r041' ( temp 2-component vector of float)
-0:143          ldexp ( temp 2-component vector of float)
+0:143        move second child to first child ( temp float)
+0:143          'r042' ( temp float)
+0:143          length ( temp float)
 0:143            'inF0' ( in 2-component vector of float)
-0:143            'inF1' ( in 2-component vector of float)
 0:144      Sequence
 0:144        move second child to first child ( temp 2-component vector of float)
-0:144          'r039a' ( temp 2-component vector of float)
-0:144          mix ( temp 2-component vector of float)
+0:144          'r043' ( temp 2-component vector of float)
+0:144          log ( temp 2-component vector of float)
 0:144            'inF0' ( in 2-component vector of float)
-0:144            'inF1' ( in 2-component vector of float)
-0:144            'inF2' ( in 2-component vector of float)
 0:145      Sequence
-0:145        move second child to first child ( temp float)
-0:145          'r042' ( temp float)
-0:145          length ( temp float)
-0:145            'inF0' ( in 2-component vector of float)
+0:145        move second child to first child ( temp 2-component vector of float)
+0:145          'r044' ( temp 2-component vector of float)
+0:145          vector-scale ( temp 2-component vector of float)
+0:145            log2 ( temp 2-component vector of float)
+0:145              'inF0' ( in 2-component vector of float)
+0:145            Constant:
+0:145              0.301030
 0:146      Sequence
 0:146        move second child to first child ( temp 2-component vector of float)
-0:146          'r043' ( temp 2-component vector of float)
-0:146          log ( temp 2-component vector of float)
+0:146          'r045' ( temp 2-component vector of float)
+0:146          log2 ( temp 2-component vector of float)
 0:146            'inF0' ( in 2-component vector of float)
 0:147      Sequence
 0:147        move second child to first child ( temp 2-component vector of float)
-0:147          'r044' ( temp 2-component vector of float)
-0:147          vector-scale ( temp 2-component vector of float)
-0:147            log2 ( temp 2-component vector of float)
-0:147              'inF0' ( in 2-component vector of float)
-0:147            Constant:
-0:147              0.301030
+0:147          'r046' ( temp 2-component vector of float)
+0:147          max ( temp 2-component vector of float)
+0:147            'inF0' ( in 2-component vector of float)
+0:147            'inF1' ( in 2-component vector of float)
 0:148      Sequence
 0:148        move second child to first child ( temp 2-component vector of float)
-0:148          'r045' ( temp 2-component vector of float)
-0:148          log2 ( temp 2-component vector of float)
+0:148          'r047' ( temp 2-component vector of float)
+0:148          min ( temp 2-component vector of float)
 0:148            'inF0' ( in 2-component vector of float)
+0:148            'inF1' ( in 2-component vector of float)
 0:149      Sequence
 0:149        move second child to first child ( temp 2-component vector of float)
-0:149          'r046' ( temp 2-component vector of float)
-0:149          max ( temp 2-component vector of float)
+0:149          'r048' ( temp 2-component vector of float)
+0:149          normalize ( temp 2-component vector of float)
 0:149            'inF0' ( in 2-component vector of float)
-0:149            'inF1' ( in 2-component vector of float)
 0:150      Sequence
 0:150        move second child to first child ( temp 2-component vector of float)
-0:150          'r047' ( temp 2-component vector of float)
-0:150          min ( temp 2-component vector of float)
+0:150          'r049' ( temp 2-component vector of float)
+0:150          pow ( temp 2-component vector of float)
 0:150            'inF0' ( in 2-component vector of float)
 0:150            'inF1' ( in 2-component vector of float)
 0:151      Sequence
 0:151        move second child to first child ( temp 2-component vector of float)
-0:151          'r048' ( temp 2-component vector of float)
-0:151          normalize ( temp 2-component vector of float)
+0:151          'r050' ( temp 2-component vector of float)
+0:151          radians ( temp 2-component vector of float)
 0:151            'inF0' ( in 2-component vector of float)
 0:152      Sequence
 0:152        move second child to first child ( temp 2-component vector of float)
-0:152          'r049' ( temp 2-component vector of float)
-0:152          pow ( temp 2-component vector of float)
+0:152          'r051' ( temp 2-component vector of float)
+0:152          divide ( temp 2-component vector of float)
+0:152            Constant:
+0:152              1.000000
 0:152            'inF0' ( in 2-component vector of float)
-0:152            'inF1' ( in 2-component vector of float)
 0:153      Sequence
 0:153        move second child to first child ( temp 2-component vector of float)
-0:153          'r050' ( temp 2-component vector of float)
-0:153          radians ( temp 2-component vector of float)
+0:153          'r052' ( temp 2-component vector of float)
+0:153          reflect ( temp 2-component vector of float)
 0:153            'inF0' ( in 2-component vector of float)
+0:153            'inF1' ( in 2-component vector of float)
 0:154      Sequence
 0:154        move second child to first child ( temp 2-component vector of float)
-0:154          'r051' ( temp 2-component vector of float)
-0:154          divide ( temp 2-component vector of float)
-0:154            Constant:
-0:154              1.000000
+0:154          'r053' ( temp 2-component vector of float)
+0:154          refract ( temp 2-component vector of float)
 0:154            'inF0' ( in 2-component vector of float)
+0:154            'inF1' ( in 2-component vector of float)
+0:154            Constant:
+0:154              2.000000
 0:155      Sequence
-0:155        move second child to first child ( temp 2-component vector of float)
-0:155          'r052' ( temp 2-component vector of float)
-0:155          reflect ( temp 2-component vector of float)
-0:155            'inF0' ( in 2-component vector of float)
-0:155            'inF1' ( in 2-component vector of float)
-0:156      Sequence
-0:156        move second child to first child ( temp 2-component vector of float)
-0:156          'r053' ( temp 2-component vector of float)
-0:156          refract ( temp 2-component vector of float)
-0:156            'inF0' ( in 2-component vector of float)
-0:156            'inF1' ( in 2-component vector of float)
-0:156            Constant:
-0:156              2.000000
-0:157      Sequence
-0:157        move second child to first child ( temp 2-component vector of uint)
-0:157          'r054' ( temp 2-component vector of uint)
+0:155        move second child to first child ( temp 2-component vector of uint)
+0:155          'r054' ( temp 2-component vector of uint)
 0:?           bitFieldReverse ( temp 2-component vector of uint)
 0:?             Constant:
 0:?               1 (const uint)
 0:?               2 (const uint)
+0:156      Sequence
+0:156        move second child to first child ( temp 2-component vector of float)
+0:156          'r055' ( temp 2-component vector of float)
+0:156          roundEven ( temp 2-component vector of float)
+0:156            'inF0' ( in 2-component vector of float)
+0:157      Sequence
+0:157        move second child to first child ( temp 2-component vector of float)
+0:157          'r056' ( temp 2-component vector of float)
+0:157          inverse sqrt ( temp 2-component vector of float)
+0:157            'inF0' ( in 2-component vector of float)
 0:158      Sequence
 0:158        move second child to first child ( temp 2-component vector of float)
-0:158          'r055' ( temp 2-component vector of float)
-0:158          roundEven ( temp 2-component vector of float)
+0:158          'r057' ( temp 2-component vector of float)
+0:158          clamp ( temp 2-component vector of float)
 0:158            'inF0' ( in 2-component vector of float)
+0:158            Constant:
+0:158              0.000000
+0:158            Constant:
+0:158              1.000000
 0:159      Sequence
 0:159        move second child to first child ( temp 2-component vector of float)
-0:159          'r056' ( temp 2-component vector of float)
-0:159          inverse sqrt ( temp 2-component vector of float)
+0:159          'r058' ( temp 2-component vector of float)
+0:159          Sign ( temp 2-component vector of float)
 0:159            'inF0' ( in 2-component vector of float)
 0:160      Sequence
 0:160        move second child to first child ( temp 2-component vector of float)
-0:160          'r057' ( temp 2-component vector of float)
-0:160          clamp ( temp 2-component vector of float)
+0:160          'r059' ( temp 2-component vector of float)
+0:160          sine ( temp 2-component vector of float)
 0:160            'inF0' ( in 2-component vector of float)
-0:160            Constant:
-0:160              0.000000
-0:160            Constant:
-0:160              1.000000
 0:161      Sequence
 0:161        move second child to first child ( temp 2-component vector of float)
-0:161          'r058' ( temp 2-component vector of float)
-0:161          Sign ( temp 2-component vector of float)
+0:161          'inF1' ( in 2-component vector of float)
+0:161          sine ( temp 2-component vector of float)
+0:161            'inF0' ( in 2-component vector of float)
+0:161        move second child to first child ( temp 2-component vector of float)
+0:161          'inF2' ( in 2-component vector of float)
+0:161          cosine ( temp 2-component vector of float)
 0:161            'inF0' ( in 2-component vector of float)
 0:162      Sequence
 0:162        move second child to first child ( temp 2-component vector of float)
-0:162          'r059' ( temp 2-component vector of float)
-0:162          sine ( temp 2-component vector of float)
+0:162          'r060' ( temp 2-component vector of float)
+0:162          hyp. sine ( temp 2-component vector of float)
 0:162            'inF0' ( in 2-component vector of float)
 0:163      Sequence
 0:163        move second child to first child ( temp 2-component vector of float)
-0:163          'inF1' ( in 2-component vector of float)
-0:163          sine ( temp 2-component vector of float)
-0:163            'inF0' ( in 2-component vector of float)
-0:163        move second child to first child ( temp 2-component vector of float)
-0:163          'inF2' ( in 2-component vector of float)
-0:163          cosine ( temp 2-component vector of float)
+0:163          'r061' ( temp 2-component vector of float)
+0:163          smoothstep ( temp 2-component vector of float)
 0:163            'inF0' ( in 2-component vector of float)
+0:163            'inF1' ( in 2-component vector of float)
+0:163            'inF2' ( in 2-component vector of float)
 0:164      Sequence
 0:164        move second child to first child ( temp 2-component vector of float)
-0:164          'r060' ( temp 2-component vector of float)
-0:164          hyp. sine ( temp 2-component vector of float)
+0:164          'r062' ( temp 2-component vector of float)
+0:164          sqrt ( temp 2-component vector of float)
 0:164            'inF0' ( in 2-component vector of float)
 0:165      Sequence
 0:165        move second child to first child ( temp 2-component vector of float)
-0:165          'r061' ( temp 2-component vector of float)
-0:165          smoothstep ( temp 2-component vector of float)
+0:165          'r063' ( temp 2-component vector of float)
+0:165          step ( temp 2-component vector of float)
 0:165            'inF0' ( in 2-component vector of float)
 0:165            'inF1' ( in 2-component vector of float)
-0:165            'inF2' ( in 2-component vector of float)
 0:166      Sequence
 0:166        move second child to first child ( temp 2-component vector of float)
-0:166          'r062' ( temp 2-component vector of float)
-0:166          sqrt ( temp 2-component vector of float)
+0:166          'r064' ( temp 2-component vector of float)
+0:166          tangent ( temp 2-component vector of float)
 0:166            'inF0' ( in 2-component vector of float)
 0:167      Sequence
 0:167        move second child to first child ( temp 2-component vector of float)
-0:167          'r063' ( temp 2-component vector of float)
-0:167          step ( temp 2-component vector of float)
+0:167          'r065' ( temp 2-component vector of float)
+0:167          hyp. tangent ( temp 2-component vector of float)
 0:167            'inF0' ( in 2-component vector of float)
-0:167            'inF1' ( in 2-component vector of float)
-0:168      Sequence
-0:168        move second child to first child ( temp 2-component vector of float)
-0:168          'r064' ( temp 2-component vector of float)
-0:168          tangent ( temp 2-component vector of float)
-0:168            'inF0' ( in 2-component vector of float)
 0:169      Sequence
 0:169        move second child to first child ( temp 2-component vector of float)
-0:169          'r065' ( temp 2-component vector of float)
-0:169          hyp. tangent ( temp 2-component vector of float)
+0:169          'r066' ( temp 2-component vector of float)
+0:169          trunc ( temp 2-component vector of float)
 0:169            'inF0' ( in 2-component vector of float)
-0:171      Sequence
-0:171        move second child to first child ( temp 2-component vector of float)
-0:171          'r066' ( temp 2-component vector of float)
-0:171          trunc ( temp 2-component vector of float)
-0:171            'inF0' ( in 2-component vector of float)
-0:174      Branch: Return with expression
+0:172      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
-0:178  Function Definition: PixelShaderFunction3(vf3;vf3;vf3;vu3;vu3; ( temp 3-component vector of float)
-0:178    Function Parameters: 
-0:178      'inF0' ( in 3-component vector of float)
-0:178      'inF1' ( in 3-component vector of float)
-0:178      'inF2' ( in 3-component vector of float)
-0:178      'inU0' ( in 3-component vector of uint)
-0:178      'inU1' ( in 3-component vector of uint)
+0:176  Function Definition: PixelShaderFunction3(vf3;vf3;vf3;vu3;vu3; ( temp 3-component vector of float)
+0:176    Function Parameters: 
+0:176      'inF0' ( in 3-component vector of float)
+0:176      'inF1' ( in 3-component vector of float)
+0:176      'inF2' ( in 3-component vector of float)
+0:176      'inU0' ( in 3-component vector of uint)
+0:176      'inU1' ( in 3-component vector of uint)
 0:?     Sequence
+0:179      Sequence
+0:179        move second child to first child ( temp bool)
+0:179          'r000' ( temp bool)
+0:179          all ( temp bool)
+0:179            'inF0' ( in 3-component vector of float)
+0:180      Sequence
+0:180        move second child to first child ( temp 3-component vector of float)
+0:180          'r001' ( temp 3-component vector of float)
+0:180          Absolute value ( temp 3-component vector of float)
+0:180            'inF0' ( in 3-component vector of float)
 0:181      Sequence
-0:181        move second child to first child ( temp bool)
-0:181          'r000' ( temp bool)
-0:181          all ( temp bool)
+0:181        move second child to first child ( temp 3-component vector of float)
+0:181          'r002' ( temp 3-component vector of float)
+0:181          arc cosine ( temp 3-component vector of float)
 0:181            'inF0' ( in 3-component vector of float)
 0:182      Sequence
-0:182        move second child to first child ( temp 3-component vector of float)
-0:182          'r001' ( temp 3-component vector of float)
-0:182          Absolute value ( temp 3-component vector of float)
+0:182        move second child to first child ( temp bool)
+0:182          'r003' ( temp bool)
+0:182          any ( temp bool)
 0:182            'inF0' ( in 3-component vector of float)
 0:183      Sequence
 0:183        move second child to first child ( temp 3-component vector of float)
-0:183          'r002' ( temp 3-component vector of float)
-0:183          arc cosine ( temp 3-component vector of float)
+0:183          'r004' ( temp 3-component vector of float)
+0:183          arc sine ( temp 3-component vector of float)
 0:183            'inF0' ( in 3-component vector of float)
 0:184      Sequence
-0:184        move second child to first child ( temp bool)
-0:184          'r003' ( temp bool)
-0:184          any ( temp bool)
+0:184        move second child to first child ( temp 3-component vector of int)
+0:184          'r005' ( temp 3-component vector of int)
+0:184          floatBitsToInt ( temp 3-component vector of int)
 0:184            'inF0' ( in 3-component vector of float)
 0:185      Sequence
-0:185        move second child to first child ( temp 3-component vector of float)
-0:185          'r004' ( temp 3-component vector of float)
-0:185          arc sine ( temp 3-component vector of float)
+0:185        move second child to first child ( temp 3-component vector of uint)
+0:185          'r006' ( temp 3-component vector of uint)
+0:185          floatBitsToUint ( temp 3-component vector of uint)
 0:185            'inF0' ( in 3-component vector of float)
 0:186      Sequence
-0:186        move second child to first child ( temp 3-component vector of int)
-0:186          'r005' ( temp 3-component vector of int)
-0:186          floatBitsToInt ( temp 3-component vector of int)
-0:186            'inF0' ( in 3-component vector of float)
-0:187      Sequence
-0:187        move second child to first child ( temp 3-component vector of uint)
-0:187          'r006' ( temp 3-component vector of uint)
-0:187          floatBitsToUint ( temp 3-component vector of uint)
-0:187            'inF0' ( in 3-component vector of float)
+0:186        move second child to first child ( temp 3-component vector of float)
+0:186          'r007' ( temp 3-component vector of float)
+0:186          intBitsToFloat ( temp 3-component vector of float)
+0:186            'inU0' ( in 3-component vector of uint)
 0:188      Sequence
 0:188        move second child to first child ( temp 3-component vector of float)
-0:188          'r007' ( temp 3-component vector of float)
-0:188          intBitsToFloat ( temp 3-component vector of float)
-0:188            'inU0' ( in 3-component vector of uint)
+0:188          'r009' ( temp 3-component vector of float)
+0:188          arc tangent ( temp 3-component vector of float)
+0:188            'inF0' ( in 3-component vector of float)
+0:189      Sequence
+0:189        move second child to first child ( temp 3-component vector of float)
+0:189          'r010' ( temp 3-component vector of float)
+0:189          arc tangent ( temp 3-component vector of float)
+0:189            'inF0' ( in 3-component vector of float)
+0:189            'inF1' ( in 3-component vector of float)
 0:190      Sequence
 0:190        move second child to first child ( temp 3-component vector of float)
-0:190          'r009' ( temp 3-component vector of float)
-0:190          arc tangent ( temp 3-component vector of float)
+0:190          'r011' ( temp 3-component vector of float)
+0:190          Ceiling ( temp 3-component vector of float)
 0:190            'inF0' ( in 3-component vector of float)
 0:191      Sequence
 0:191        move second child to first child ( temp 3-component vector of float)
-0:191          'r010' ( temp 3-component vector of float)
-0:191          arc tangent ( temp 3-component vector of float)
+0:191          'r012' ( temp 3-component vector of float)
+0:191          clamp ( temp 3-component vector of float)
 0:191            'inF0' ( in 3-component vector of float)
 0:191            'inF1' ( in 3-component vector of float)
-0:192      Sequence
-0:192        move second child to first child ( temp 3-component vector of float)
-0:192          'r011' ( temp 3-component vector of float)
-0:192          Ceiling ( temp 3-component vector of float)
+0:191            'inF2' ( in 3-component vector of float)
+0:192      Test condition and select ( temp void)
+0:192        Condition
+0:192        any ( temp bool)
+0:192          Compare Less Than ( temp 3-component vector of bool)
 0:192            'inF0' ( in 3-component vector of float)
+0:192            Constant:
+0:192              0.000000
+0:192              0.000000
+0:192              0.000000
+0:192        true case
+0:192        Branch: Kill
 0:193      Sequence
 0:193        move second child to first child ( temp 3-component vector of float)
-0:193          'r012' ( temp 3-component vector of float)
-0:193          clamp ( temp 3-component vector of float)
+0:193          'r013' ( temp 3-component vector of float)
+0:193          cosine ( temp 3-component vector of float)
 0:193            'inF0' ( in 3-component vector of float)
-0:193            'inF1' ( in 3-component vector of float)
-0:193            'inF2' ( in 3-component vector of float)
-0:194      Test condition and select ( temp void)
-0:194        Condition
-0:194        any ( temp bool)
-0:194          Compare Less Than ( temp 3-component vector of bool)
+0:194      Sequence
+0:194        move second child to first child ( temp 3-component vector of float)
+0:194          'r014' ( temp 3-component vector of float)
+0:194          hyp. cosine ( temp 3-component vector of float)
 0:194            'inF0' ( in 3-component vector of float)
-0:194            Constant:
-0:194              0.000000
-0:194              0.000000
-0:194              0.000000
-0:194        true case
-0:194        Branch: Kill
 0:195      Sequence
-0:195        move second child to first child ( temp 3-component vector of float)
-0:195          'r013' ( temp 3-component vector of float)
-0:195          cosine ( temp 3-component vector of float)
-0:195            'inF0' ( in 3-component vector of float)
-0:196      Sequence
-0:196        move second child to first child ( temp 3-component vector of float)
-0:196          'r014' ( temp 3-component vector of float)
-0:196          hyp. cosine ( temp 3-component vector of float)
-0:196            'inF0' ( in 3-component vector of float)
-0:197      Sequence
-0:197        move second child to first child ( temp 3-component vector of uint)
-0:197          'r015' ( temp 3-component vector of uint)
+0:195        move second child to first child ( temp 3-component vector of uint)
+0:195          'r015' ( temp 3-component vector of uint)
 0:?           bitCount ( temp 3-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               3 (const uint)
 0:?               5 (const uint)
+0:196      Sequence
+0:196        move second child to first child ( temp 3-component vector of float)
+0:196          'r016' ( temp 3-component vector of float)
+0:196          cross-product ( temp 3-component vector of float)
+0:196            'inF0' ( in 3-component vector of float)
+0:196            'inF1' ( in 3-component vector of float)
+0:197      Sequence
+0:197        move second child to first child ( temp 3-component vector of float)
+0:197          'r017' ( temp 3-component vector of float)
+0:197          dPdx ( temp 3-component vector of float)
+0:197            'inF0' ( in 3-component vector of float)
 0:198      Sequence
 0:198        move second child to first child ( temp 3-component vector of float)
-0:198          'r016' ( temp 3-component vector of float)
-0:198          cross-product ( temp 3-component vector of float)
+0:198          'r018' ( temp 3-component vector of float)
+0:198          dPdxCoarse ( temp 3-component vector of float)
 0:198            'inF0' ( in 3-component vector of float)
-0:198            'inF1' ( in 3-component vector of float)
 0:199      Sequence
 0:199        move second child to first child ( temp 3-component vector of float)
-0:199          'r017' ( temp 3-component vector of float)
-0:199          dPdx ( temp 3-component vector of float)
+0:199          'r019' ( temp 3-component vector of float)
+0:199          dPdxFine ( temp 3-component vector of float)
 0:199            'inF0' ( in 3-component vector of float)
 0:200      Sequence
 0:200        move second child to first child ( temp 3-component vector of float)
-0:200          'r018' ( temp 3-component vector of float)
-0:200          dPdxCoarse ( temp 3-component vector of float)
+0:200          'r020' ( temp 3-component vector of float)
+0:200          dPdy ( temp 3-component vector of float)
 0:200            'inF0' ( in 3-component vector of float)
 0:201      Sequence
 0:201        move second child to first child ( temp 3-component vector of float)
-0:201          'r019' ( temp 3-component vector of float)
-0:201          dPdxFine ( temp 3-component vector of float)
+0:201          'r021' ( temp 3-component vector of float)
+0:201          dPdyCoarse ( temp 3-component vector of float)
 0:201            'inF0' ( in 3-component vector of float)
 0:202      Sequence
 0:202        move second child to first child ( temp 3-component vector of float)
-0:202          'r020' ( temp 3-component vector of float)
-0:202          dPdy ( temp 3-component vector of float)
+0:202          'r022' ( temp 3-component vector of float)
+0:202          dPdyFine ( temp 3-component vector of float)
 0:202            'inF0' ( in 3-component vector of float)
 0:203      Sequence
 0:203        move second child to first child ( temp 3-component vector of float)
-0:203          'r021' ( temp 3-component vector of float)
-0:203          dPdyCoarse ( temp 3-component vector of float)
+0:203          'r023' ( temp 3-component vector of float)
+0:203          degrees ( temp 3-component vector of float)
 0:203            'inF0' ( in 3-component vector of float)
 0:204      Sequence
-0:204        move second child to first child ( temp 3-component vector of float)
-0:204          'r022' ( temp 3-component vector of float)
-0:204          dPdyFine ( temp 3-component vector of float)
+0:204        move second child to first child ( temp float)
+0:204          'r024' ( temp float)
+0:204          distance ( temp float)
 0:204            'inF0' ( in 3-component vector of float)
+0:204            'inF1' ( in 3-component vector of float)
 0:205      Sequence
-0:205        move second child to first child ( temp 3-component vector of float)
-0:205          'r023' ( temp 3-component vector of float)
-0:205          degrees ( temp 3-component vector of float)
+0:205        move second child to first child ( temp float)
+0:205          'r025' ( temp float)
+0:205          dot-product ( temp float)
 0:205            'inF0' ( in 3-component vector of float)
-0:206      Sequence
-0:206        move second child to first child ( temp float)
-0:206          'r024' ( temp float)
-0:206          distance ( temp float)
-0:206            'inF0' ( in 3-component vector of float)
-0:206            'inF1' ( in 3-component vector of float)
-0:207      Sequence
-0:207        move second child to first child ( temp float)
-0:207          'r025' ( temp float)
-0:207          dot-product ( temp float)
-0:207            'inF0' ( in 3-component vector of float)
-0:207            'inF1' ( in 3-component vector of float)
+0:205            'inF1' ( in 3-component vector of float)
+0:209      Sequence
+0:209        move second child to first child ( temp 3-component vector of float)
+0:209          'r029' ( temp 3-component vector of float)
+0:209          exp ( temp 3-component vector of float)
+0:209            'inF0' ( in 3-component vector of float)
+0:210      Sequence
+0:210        move second child to first child ( temp 3-component vector of float)
+0:210          'r030' ( temp 3-component vector of float)
+0:210          exp2 ( temp 3-component vector of float)
+0:210            'inF0' ( in 3-component vector of float)
 0:211      Sequence
 0:211        move second child to first child ( temp 3-component vector of float)
-0:211          'r029' ( temp 3-component vector of float)
-0:211          exp ( temp 3-component vector of float)
+0:211          'r031' ( temp 3-component vector of float)
+0:211          face-forward ( temp 3-component vector of float)
 0:211            'inF0' ( in 3-component vector of float)
+0:211            'inF1' ( in 3-component vector of float)
+0:211            'inF2' ( in 3-component vector of float)
 0:212      Sequence
-0:212        move second child to first child ( temp 3-component vector of float)
-0:212          'r030' ( temp 3-component vector of float)
-0:212          exp2 ( temp 3-component vector of float)
-0:212            'inF0' ( in 3-component vector of float)
-0:213      Sequence
-0:213        move second child to first child ( temp 3-component vector of float)
-0:213          'r031' ( temp 3-component vector of float)
-0:213          face-forward ( temp 3-component vector of float)
-0:213            'inF0' ( in 3-component vector of float)
-0:213            'inF1' ( in 3-component vector of float)
-0:213            'inF2' ( in 3-component vector of float)
-0:214      Sequence
-0:214        move second child to first child ( temp 3-component vector of uint)
-0:214          'r032' ( temp 3-component vector of uint)
+0:212        move second child to first child ( temp 3-component vector of uint)
+0:212          'r032' ( temp 3-component vector of uint)
 0:?           findMSB ( temp 3-component vector of uint)
 0:?             Constant:
 0:?               2 (const uint)
 0:?               3 (const uint)
 0:?               4 (const uint)
-0:215      Sequence
-0:215        move second child to first child ( temp 3-component vector of uint)
-0:215          'r033' ( temp 3-component vector of uint)
+0:213      Sequence
+0:213        move second child to first child ( temp 3-component vector of uint)
+0:213          'r033' ( temp 3-component vector of uint)
 0:?           findLSB ( temp 3-component vector of uint)
 0:?             Constant:
 0:?               2 (const uint)
 0:?               3 (const uint)
 0:?               4 (const uint)
+0:214      Sequence
+0:214        move second child to first child ( temp 3-component vector of float)
+0:214          'r034' ( temp 3-component vector of float)
+0:214          Floor ( temp 3-component vector of float)
+0:214            'inF0' ( in 3-component vector of float)
 0:216      Sequence
 0:216        move second child to first child ( temp 3-component vector of float)
-0:216          'r034' ( temp 3-component vector of float)
-0:216          Floor ( temp 3-component vector of float)
+0:216          'r036' ( temp 3-component vector of float)
+0:216          mod ( temp 3-component vector of float)
 0:216            'inF0' ( in 3-component vector of float)
+0:216            'inF1' ( in 3-component vector of float)
+0:217      Sequence
+0:217        move second child to first child ( temp 3-component vector of float)
+0:217          'r037' ( temp 3-component vector of float)
+0:217          Fraction ( temp 3-component vector of float)
+0:217            'inF0' ( in 3-component vector of float)
 0:218      Sequence
 0:218        move second child to first child ( temp 3-component vector of float)
-0:218          'r036' ( temp 3-component vector of float)
-0:218          mod ( temp 3-component vector of float)
+0:218          'r039' ( temp 3-component vector of float)
+0:218          fwidth ( temp 3-component vector of float)
 0:218            'inF0' ( in 3-component vector of float)
-0:218            'inF1' ( in 3-component vector of float)
 0:219      Sequence
-0:219        move second child to first child ( temp 3-component vector of float)
-0:219          'r037' ( temp 3-component vector of float)
-0:219          Fraction ( temp 3-component vector of float)
+0:219        move second child to first child ( temp 3-component vector of bool)
+0:219          'r040' ( temp 3-component vector of bool)
+0:219          isinf ( temp 3-component vector of bool)
 0:219            'inF0' ( in 3-component vector of float)
 0:220      Sequence
-0:220        move second child to first child ( temp 3-component vector of float)
-0:220          'r038' ( temp 3-component vector of float)
-0:220          frexp ( temp 3-component vector of float)
+0:220        move second child to first child ( temp 3-component vector of bool)
+0:220          'r041' ( temp 3-component vector of bool)
+0:220          isnan ( temp 3-component vector of bool)
 0:220            'inF0' ( in 3-component vector of float)
-0:220            'inF1' ( in 3-component vector of float)
 0:221      Sequence
 0:221        move second child to first child ( temp 3-component vector of float)
-0:221          'r039' ( temp 3-component vector of float)
-0:221          fwidth ( temp 3-component vector of float)
+0:221          'r042' ( temp 3-component vector of float)
+0:221          ldexp ( temp 3-component vector of float)
 0:221            'inF0' ( in 3-component vector of float)
+0:221            'inF1' ( in 3-component vector of float)
 0:222      Sequence
-0:222        move second child to first child ( temp 3-component vector of bool)
-0:222          'r040' ( temp 3-component vector of bool)
-0:222          isinf ( temp 3-component vector of bool)
+0:222        move second child to first child ( temp 3-component vector of float)
+0:222          'r039a' ( temp 3-component vector of float)
+0:222          mix ( temp 3-component vector of float)
 0:222            'inF0' ( in 3-component vector of float)
+0:222            'inF1' ( in 3-component vector of float)
+0:222            'inF2' ( in 3-component vector of float)
 0:223      Sequence
-0:223        move second child to first child ( temp 3-component vector of bool)
-0:223          'r041' ( temp 3-component vector of bool)
-0:223          isnan ( temp 3-component vector of bool)
+0:223        move second child to first child ( temp 3-component vector of float)
+0:223          'r039b' ( temp 3-component vector of float)
+0:223          mix ( temp 3-component vector of float)
 0:223            'inF0' ( in 3-component vector of float)
+0:223            'inF1' ( in 3-component vector of float)
+0:223            Constant:
+0:223              0.300000
 0:224      Sequence
-0:224        move second child to first child ( temp 3-component vector of float)
-0:224          'r042' ( temp 3-component vector of float)
-0:224          ldexp ( temp 3-component vector of float)
+0:224        move second child to first child ( temp float)
+0:224          'r043' ( temp float)
+0:224          length ( temp float)
 0:224            'inF0' ( in 3-component vector of float)
-0:224            'inF1' ( in 3-component vector of float)
 0:225      Sequence
 0:225        move second child to first child ( temp 3-component vector of float)
-0:225          'r039a' ( temp 3-component vector of float)
-0:225          mix ( temp 3-component vector of float)
+0:225          'r044' ( temp 3-component vector of float)
+0:225          log ( temp 3-component vector of float)
 0:225            'inF0' ( in 3-component vector of float)
-0:225            'inF1' ( in 3-component vector of float)
-0:225            'inF2' ( in 3-component vector of float)
 0:226      Sequence
 0:226        move second child to first child ( temp 3-component vector of float)
-0:226          'r039b' ( temp 3-component vector of float)
-0:226          mix ( temp 3-component vector of float)
-0:226            'inF0' ( in 3-component vector of float)
-0:226            'inF1' ( in 3-component vector of float)
+0:226          'r045' ( temp 3-component vector of float)
+0:226          vector-scale ( temp 3-component vector of float)
+0:226            log2 ( temp 3-component vector of float)
+0:226              'inF0' ( in 3-component vector of float)
 0:226            Constant:
-0:226              0.300000
+0:226              0.301030
 0:227      Sequence
-0:227        move second child to first child ( temp float)
-0:227          'r043' ( temp float)
-0:227          length ( temp float)
+0:227        move second child to first child ( temp 3-component vector of float)
+0:227          'r046' ( temp 3-component vector of float)
+0:227          log2 ( temp 3-component vector of float)
 0:227            'inF0' ( in 3-component vector of float)
 0:228      Sequence
 0:228        move second child to first child ( temp 3-component vector of float)
-0:228          'r044' ( temp 3-component vector of float)
-0:228          log ( temp 3-component vector of float)
+0:228          'r047' ( temp 3-component vector of float)
+0:228          max ( temp 3-component vector of float)
 0:228            'inF0' ( in 3-component vector of float)
+0:228            'inF1' ( in 3-component vector of float)
 0:229      Sequence
 0:229        move second child to first child ( temp 3-component vector of float)
-0:229          'r045' ( temp 3-component vector of float)
-0:229          vector-scale ( temp 3-component vector of float)
-0:229            log2 ( temp 3-component vector of float)
-0:229              'inF0' ( in 3-component vector of float)
-0:229            Constant:
-0:229              0.301030
+0:229          'r048' ( temp 3-component vector of float)
+0:229          min ( temp 3-component vector of float)
+0:229            'inF0' ( in 3-component vector of float)
+0:229            'inF1' ( in 3-component vector of float)
 0:230      Sequence
 0:230        move second child to first child ( temp 3-component vector of float)
-0:230          'r046' ( temp 3-component vector of float)
-0:230          log2 ( temp 3-component vector of float)
+0:230          'r049' ( temp 3-component vector of float)
+0:230          normalize ( temp 3-component vector of float)
 0:230            'inF0' ( in 3-component vector of float)
 0:231      Sequence
 0:231        move second child to first child ( temp 3-component vector of float)
-0:231          'r047' ( temp 3-component vector of float)
-0:231          max ( temp 3-component vector of float)
+0:231          'r050' ( temp 3-component vector of float)
+0:231          pow ( temp 3-component vector of float)
 0:231            'inF0' ( in 3-component vector of float)
 0:231            'inF1' ( in 3-component vector of float)
 0:232      Sequence
 0:232        move second child to first child ( temp 3-component vector of float)
-0:232          'r048' ( temp 3-component vector of float)
-0:232          min ( temp 3-component vector of float)
+0:232          'r051' ( temp 3-component vector of float)
+0:232          radians ( temp 3-component vector of float)
 0:232            'inF0' ( in 3-component vector of float)
-0:232            'inF1' ( in 3-component vector of float)
 0:233      Sequence
 0:233        move second child to first child ( temp 3-component vector of float)
-0:233          'r049' ( temp 3-component vector of float)
-0:233          normalize ( temp 3-component vector of float)
+0:233          'r052' ( temp 3-component vector of float)
+0:233          divide ( temp 3-component vector of float)
+0:233            Constant:
+0:233              1.000000
 0:233            'inF0' ( in 3-component vector of float)
 0:234      Sequence
 0:234        move second child to first child ( temp 3-component vector of float)
-0:234          'r050' ( temp 3-component vector of float)
-0:234          pow ( temp 3-component vector of float)
+0:234          'r053' ( temp 3-component vector of float)
+0:234          reflect ( temp 3-component vector of float)
 0:234            'inF0' ( in 3-component vector of float)
 0:234            'inF1' ( in 3-component vector of float)
 0:235      Sequence
 0:235        move second child to first child ( temp 3-component vector of float)
-0:235          'r051' ( temp 3-component vector of float)
-0:235          radians ( temp 3-component vector of float)
+0:235          'r054' ( temp 3-component vector of float)
+0:235          refract ( temp 3-component vector of float)
 0:235            'inF0' ( in 3-component vector of float)
+0:235            'inF1' ( in 3-component vector of float)
+0:235            Constant:
+0:235              2.000000
 0:236      Sequence
-0:236        move second child to first child ( temp 3-component vector of float)
-0:236          'r052' ( temp 3-component vector of float)
-0:236          divide ( temp 3-component vector of float)
-0:236            Constant:
-0:236              1.000000
-0:236            'inF0' ( in 3-component vector of float)
-0:237      Sequence
-0:237        move second child to first child ( temp 3-component vector of float)
-0:237          'r053' ( temp 3-component vector of float)
-0:237          reflect ( temp 3-component vector of float)
-0:237            'inF0' ( in 3-component vector of float)
-0:237            'inF1' ( in 3-component vector of float)
-0:238      Sequence
-0:238        move second child to first child ( temp 3-component vector of float)
-0:238          'r054' ( temp 3-component vector of float)
-0:238          refract ( temp 3-component vector of float)
-0:238            'inF0' ( in 3-component vector of float)
-0:238            'inF1' ( in 3-component vector of float)
-0:238            Constant:
-0:238              2.000000
-0:239      Sequence
-0:239        move second child to first child ( temp 3-component vector of uint)
-0:239          'r055' ( temp 3-component vector of uint)
+0:236        move second child to first child ( temp 3-component vector of uint)
+0:236          'r055' ( temp 3-component vector of uint)
 0:?           bitFieldReverse ( temp 3-component vector of uint)
 0:?             Constant:
 0:?               1 (const uint)
 0:?               2 (const uint)
 0:?               3 (const uint)
+0:237      Sequence
+0:237        move second child to first child ( temp 3-component vector of float)
+0:237          'r056' ( temp 3-component vector of float)
+0:237          roundEven ( temp 3-component vector of float)
+0:237            'inF0' ( in 3-component vector of float)
+0:238      Sequence
+0:238        move second child to first child ( temp 3-component vector of float)
+0:238          'r057' ( temp 3-component vector of float)
+0:238          inverse sqrt ( temp 3-component vector of float)
+0:238            'inF0' ( in 3-component vector of float)
+0:239      Sequence
+0:239        move second child to first child ( temp 3-component vector of float)
+0:239          'r058' ( temp 3-component vector of float)
+0:239          clamp ( temp 3-component vector of float)
+0:239            'inF0' ( in 3-component vector of float)
+0:239            Constant:
+0:239              0.000000
+0:239            Constant:
+0:239              1.000000
 0:240      Sequence
 0:240        move second child to first child ( temp 3-component vector of float)
-0:240          'r056' ( temp 3-component vector of float)
-0:240          roundEven ( temp 3-component vector of float)
+0:240          'r059' ( temp 3-component vector of float)
+0:240          Sign ( temp 3-component vector of float)
 0:240            'inF0' ( in 3-component vector of float)
 0:241      Sequence
 0:241        move second child to first child ( temp 3-component vector of float)
-0:241          'r057' ( temp 3-component vector of float)
-0:241          inverse sqrt ( temp 3-component vector of float)
+0:241          'r060' ( temp 3-component vector of float)
+0:241          sine ( temp 3-component vector of float)
 0:241            'inF0' ( in 3-component vector of float)
 0:242      Sequence
 0:242        move second child to first child ( temp 3-component vector of float)
-0:242          'r058' ( temp 3-component vector of float)
-0:242          clamp ( temp 3-component vector of float)
+0:242          'inF1' ( in 3-component vector of float)
+0:242          sine ( temp 3-component vector of float)
+0:242            'inF0' ( in 3-component vector of float)
+0:242        move second child to first child ( temp 3-component vector of float)
+0:242          'inF2' ( in 3-component vector of float)
+0:242          cosine ( temp 3-component vector of float)
 0:242            'inF0' ( in 3-component vector of float)
-0:242            Constant:
-0:242              0.000000
-0:242            Constant:
-0:242              1.000000
 0:243      Sequence
 0:243        move second child to first child ( temp 3-component vector of float)
-0:243          'r059' ( temp 3-component vector of float)
-0:243          Sign ( temp 3-component vector of float)
+0:243          'r061' ( temp 3-component vector of float)
+0:243          hyp. sine ( temp 3-component vector of float)
 0:243            'inF0' ( in 3-component vector of float)
 0:244      Sequence
 0:244        move second child to first child ( temp 3-component vector of float)
-0:244          'r060' ( temp 3-component vector of float)
-0:244          sine ( temp 3-component vector of float)
+0:244          'r062' ( temp 3-component vector of float)
+0:244          smoothstep ( temp 3-component vector of float)
 0:244            'inF0' ( in 3-component vector of float)
+0:244            'inF1' ( in 3-component vector of float)
+0:244            'inF2' ( in 3-component vector of float)
 0:245      Sequence
 0:245        move second child to first child ( temp 3-component vector of float)
-0:245          'inF1' ( in 3-component vector of float)
-0:245          sine ( temp 3-component vector of float)
-0:245            'inF0' ( in 3-component vector of float)
-0:245        move second child to first child ( temp 3-component vector of float)
-0:245          'inF2' ( in 3-component vector of float)
-0:245          cosine ( temp 3-component vector of float)
+0:245          'r063' ( temp 3-component vector of float)
+0:245          sqrt ( temp 3-component vector of float)
 0:245            'inF0' ( in 3-component vector of float)
 0:246      Sequence
 0:246        move second child to first child ( temp 3-component vector of float)
-0:246          'r061' ( temp 3-component vector of float)
-0:246          hyp. sine ( temp 3-component vector of float)
+0:246          'r064' ( temp 3-component vector of float)
+0:246          step ( temp 3-component vector of float)
 0:246            'inF0' ( in 3-component vector of float)
+0:246            'inF1' ( in 3-component vector of float)
 0:247      Sequence
 0:247        move second child to first child ( temp 3-component vector of float)
-0:247          'r062' ( temp 3-component vector of float)
-0:247          smoothstep ( temp 3-component vector of float)
+0:247          'r065' ( temp 3-component vector of float)
+0:247          tangent ( temp 3-component vector of float)
 0:247            'inF0' ( in 3-component vector of float)
-0:247            'inF1' ( in 3-component vector of float)
-0:247            'inF2' ( in 3-component vector of float)
 0:248      Sequence
 0:248        move second child to first child ( temp 3-component vector of float)
-0:248          'r063' ( temp 3-component vector of float)
-0:248          sqrt ( temp 3-component vector of float)
+0:248          'r066' ( temp 3-component vector of float)
+0:248          hyp. tangent ( temp 3-component vector of float)
 0:248            'inF0' ( in 3-component vector of float)
-0:249      Sequence
-0:249        move second child to first child ( temp 3-component vector of float)
-0:249          'r064' ( temp 3-component vector of float)
-0:249          step ( temp 3-component vector of float)
-0:249            'inF0' ( in 3-component vector of float)
-0:249            'inF1' ( in 3-component vector of float)
 0:250      Sequence
 0:250        move second child to first child ( temp 3-component vector of float)
-0:250          'r065' ( temp 3-component vector of float)
-0:250          tangent ( temp 3-component vector of float)
+0:250          'r067' ( temp 3-component vector of float)
+0:250          trunc ( temp 3-component vector of float)
 0:250            'inF0' ( in 3-component vector of float)
-0:251      Sequence
-0:251        move second child to first child ( temp 3-component vector of float)
-0:251          'r066' ( temp 3-component vector of float)
-0:251          hyp. tangent ( temp 3-component vector of float)
-0:251            'inF0' ( in 3-component vector of float)
-0:253      Sequence
-0:253        move second child to first child ( temp 3-component vector of float)
-0:253          'r067' ( temp 3-component vector of float)
-0:253          trunc ( temp 3-component vector of float)
-0:253            'inF0' ( in 3-component vector of float)
-0:256      Branch: Return with expression
+0:253      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
 0:?           3.000000
-0:260  Function Definition: PixelShaderFunction(vf4;vf4;vf4;vu4;vu4; ( temp 4-component vector of float)
-0:260    Function Parameters: 
-0:260      'inF0' ( in 4-component vector of float)
-0:260      'inF1' ( in 4-component vector of float)
-0:260      'inF2' ( in 4-component vector of float)
-0:260      'inU0' ( in 4-component vector of uint)
-0:260      'inU1' ( in 4-component vector of uint)
+0:257  Function Definition: PixelShaderFunction(vf4;vf4;vf4;vu4;vu4; ( temp 4-component vector of float)
+0:257    Function Parameters: 
+0:257      'inF0' ( in 4-component vector of float)
+0:257      'inF1' ( in 4-component vector of float)
+0:257      'inF2' ( in 4-component vector of float)
+0:257      'inU0' ( in 4-component vector of uint)
+0:257      'inU1' ( in 4-component vector of uint)
 0:?     Sequence
+0:260      Sequence
+0:260        move second child to first child ( temp bool)
+0:260          'r000' ( temp bool)
+0:260          all ( temp bool)
+0:260            'inF0' ( in 4-component vector of float)
+0:261      Sequence
+0:261        move second child to first child ( temp 4-component vector of float)
+0:261          'r001' ( temp 4-component vector of float)
+0:261          Absolute value ( temp 4-component vector of float)
+0:261            'inF0' ( in 4-component vector of float)
+0:262      Sequence
+0:262        move second child to first child ( temp 4-component vector of float)
+0:262          'r002' ( temp 4-component vector of float)
+0:262          arc cosine ( temp 4-component vector of float)
+0:262            'inF0' ( in 4-component vector of float)
 0:263      Sequence
 0:263        move second child to first child ( temp bool)
-0:263          'r000' ( temp bool)
-0:263          all ( temp bool)
+0:263          'r003' ( temp bool)
+0:263          any ( temp bool)
 0:263            'inF0' ( in 4-component vector of float)
 0:264      Sequence
 0:264        move second child to first child ( temp 4-component vector of float)
-0:264          'r001' ( temp 4-component vector of float)
-0:264          Absolute value ( temp 4-component vector of float)
+0:264          'r004' ( temp 4-component vector of float)
+0:264          arc sine ( temp 4-component vector of float)
 0:264            'inF0' ( in 4-component vector of float)
 0:265      Sequence
-0:265        move second child to first child ( temp 4-component vector of float)
-0:265          'r002' ( temp 4-component vector of float)
-0:265          arc cosine ( temp 4-component vector of float)
+0:265        move second child to first child ( temp 4-component vector of int)
+0:265          'r005' ( temp 4-component vector of int)
+0:265          floatBitsToInt ( temp 4-component vector of int)
 0:265            'inF0' ( in 4-component vector of float)
 0:266      Sequence
-0:266        move second child to first child ( temp bool)
-0:266          'r003' ( temp bool)
-0:266          any ( temp bool)
+0:266        move second child to first child ( temp 4-component vector of uint)
+0:266          'r006' ( temp 4-component vector of uint)
+0:266          floatBitsToUint ( temp 4-component vector of uint)
 0:266            'inF0' ( in 4-component vector of float)
 0:267      Sequence
 0:267        move second child to first child ( temp 4-component vector of float)
-0:267          'r004' ( temp 4-component vector of float)
-0:267          arc sine ( temp 4-component vector of float)
-0:267            'inF0' ( in 4-component vector of float)
-0:268      Sequence
-0:268        move second child to first child ( temp 4-component vector of int)
-0:268          'r005' ( temp 4-component vector of int)
-0:268          floatBitsToInt ( temp 4-component vector of int)
-0:268            'inF0' ( in 4-component vector of float)
+0:267          'r007' ( temp 4-component vector of float)
+0:267          intBitsToFloat ( temp 4-component vector of float)
+0:267            'inU0' ( in 4-component vector of uint)
 0:269      Sequence
-0:269        move second child to first child ( temp 4-component vector of uint)
-0:269          'r006' ( temp 4-component vector of uint)
-0:269          floatBitsToUint ( temp 4-component vector of uint)
+0:269        move second child to first child ( temp 4-component vector of float)
+0:269          'r009' ( temp 4-component vector of float)
+0:269          arc tangent ( temp 4-component vector of float)
 0:269            'inF0' ( in 4-component vector of float)
 0:270      Sequence
 0:270        move second child to first child ( temp 4-component vector of float)
-0:270          'r007' ( temp 4-component vector of float)
-0:270          intBitsToFloat ( temp 4-component vector of float)
-0:270            'inU0' ( in 4-component vector of uint)
+0:270          'r010' ( temp 4-component vector of float)
+0:270          arc tangent ( temp 4-component vector of float)
+0:270            'inF0' ( in 4-component vector of float)
+0:270            'inF1' ( in 4-component vector of float)
+0:271      Sequence
+0:271        move second child to first child ( temp 4-component vector of float)
+0:271          'r011' ( temp 4-component vector of float)
+0:271          Ceiling ( temp 4-component vector of float)
+0:271            'inF0' ( in 4-component vector of float)
 0:272      Sequence
 0:272        move second child to first child ( temp 4-component vector of float)
-0:272          'r009' ( temp 4-component vector of float)
-0:272          arc tangent ( temp 4-component vector of float)
+0:272          'r012' ( temp 4-component vector of float)
+0:272          clamp ( temp 4-component vector of float)
 0:272            'inF0' ( in 4-component vector of float)
-0:273      Sequence
-0:273        move second child to first child ( temp 4-component vector of float)
-0:273          'r010' ( temp 4-component vector of float)
-0:273          arc tangent ( temp 4-component vector of float)
+0:272            'inF1' ( in 4-component vector of float)
+0:272            'inF2' ( in 4-component vector of float)
+0:273      Test condition and select ( temp void)
+0:273        Condition
+0:273        any ( temp bool)
+0:273          Compare Less Than ( temp 4-component vector of bool)
 0:273            'inF0' ( in 4-component vector of float)
-0:273            'inF1' ( in 4-component vector of float)
+0:273            Constant:
+0:273              0.000000
+0:273              0.000000
+0:273              0.000000
+0:273              0.000000
+0:273        true case
+0:273        Branch: Kill
 0:274      Sequence
 0:274        move second child to first child ( temp 4-component vector of float)
-0:274          'r011' ( temp 4-component vector of float)
-0:274          Ceiling ( temp 4-component vector of float)
+0:274          'r013' ( temp 4-component vector of float)
+0:274          cosine ( temp 4-component vector of float)
 0:274            'inF0' ( in 4-component vector of float)
 0:275      Sequence
 0:275        move second child to first child ( temp 4-component vector of float)
-0:275          'r012' ( temp 4-component vector of float)
-0:275          clamp ( temp 4-component vector of float)
+0:275          'r014' ( temp 4-component vector of float)
+0:275          hyp. cosine ( temp 4-component vector of float)
 0:275            'inF0' ( in 4-component vector of float)
-0:275            'inF1' ( in 4-component vector of float)
-0:275            'inF2' ( in 4-component vector of float)
-0:276      Test condition and select ( temp void)
-0:276        Condition
-0:276        any ( temp bool)
-0:276          Compare Less Than ( temp 4-component vector of bool)
-0:276            'inF0' ( in 4-component vector of float)
-0:276            Constant:
-0:276              0.000000
-0:276              0.000000
-0:276              0.000000
-0:276              0.000000
-0:276        true case
-0:276        Branch: Kill
-0:277      Sequence
-0:277        move second child to first child ( temp 4-component vector of float)
-0:277          'r013' ( temp 4-component vector of float)
-0:277          cosine ( temp 4-component vector of float)
-0:277            'inF0' ( in 4-component vector of float)
-0:278      Sequence
-0:278        move second child to first child ( temp 4-component vector of float)
-0:278          'r014' ( temp 4-component vector of float)
-0:278          hyp. cosine ( temp 4-component vector of float)
-0:278            'inF0' ( in 4-component vector of float)
-0:279      Sequence
-0:279        move second child to first child ( temp 4-component vector of uint)
-0:279          'r015' ( temp 4-component vector of uint)
+0:276      Sequence
+0:276        move second child to first child ( temp 4-component vector of uint)
+0:276          'r015' ( temp 4-component vector of uint)
 0:?           bitCount ( temp 4-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               3 (const uint)
 0:?               5 (const uint)
 0:?               2 (const uint)
+0:277      Sequence
+0:277        move second child to first child ( temp 4-component vector of float)
+0:277          'r016' ( temp 4-component vector of float)
+0:277          dPdx ( temp 4-component vector of float)
+0:277            'inF0' ( in 4-component vector of float)
+0:278      Sequence
+0:278        move second child to first child ( temp 4-component vector of float)
+0:278          'r017' ( temp 4-component vector of float)
+0:278          dPdxCoarse ( temp 4-component vector of float)
+0:278            'inF0' ( in 4-component vector of float)
+0:279      Sequence
+0:279        move second child to first child ( temp 4-component vector of float)
+0:279          'r018' ( temp 4-component vector of float)
+0:279          dPdxFine ( temp 4-component vector of float)
+0:279            'inF0' ( in 4-component vector of float)
 0:280      Sequence
 0:280        move second child to first child ( temp 4-component vector of float)
-0:280          'r016' ( temp 4-component vector of float)
-0:280          dPdx ( temp 4-component vector of float)
+0:280          'r019' ( temp 4-component vector of float)
+0:280          dPdy ( temp 4-component vector of float)
 0:280            'inF0' ( in 4-component vector of float)
 0:281      Sequence
 0:281        move second child to first child ( temp 4-component vector of float)
-0:281          'r017' ( temp 4-component vector of float)
-0:281          dPdxCoarse ( temp 4-component vector of float)
+0:281          'r020' ( temp 4-component vector of float)
+0:281          dPdyCoarse ( temp 4-component vector of float)
 0:281            'inF0' ( in 4-component vector of float)
 0:282      Sequence
 0:282        move second child to first child ( temp 4-component vector of float)
-0:282          'r018' ( temp 4-component vector of float)
-0:282          dPdxFine ( temp 4-component vector of float)
+0:282          'r021' ( temp 4-component vector of float)
+0:282          dPdyFine ( temp 4-component vector of float)
 0:282            'inF0' ( in 4-component vector of float)
 0:283      Sequence
 0:283        move second child to first child ( temp 4-component vector of float)
-0:283          'r019' ( temp 4-component vector of float)
-0:283          dPdy ( temp 4-component vector of float)
+0:283          'r022' ( temp 4-component vector of float)
+0:283          degrees ( temp 4-component vector of float)
 0:283            'inF0' ( in 4-component vector of float)
 0:284      Sequence
-0:284        move second child to first child ( temp 4-component vector of float)
-0:284          'r020' ( temp 4-component vector of float)
-0:284          dPdyCoarse ( temp 4-component vector of float)
+0:284        move second child to first child ( temp float)
+0:284          'r023' ( temp float)
+0:284          distance ( temp float)
 0:284            'inF0' ( in 4-component vector of float)
+0:284            'inF1' ( in 4-component vector of float)
 0:285      Sequence
-0:285        move second child to first child ( temp 4-component vector of float)
-0:285          'r021' ( temp 4-component vector of float)
-0:285          dPdyFine ( temp 4-component vector of float)
+0:285        move second child to first child ( temp float)
+0:285          'r024' ( temp float)
+0:285          dot-product ( temp float)
 0:285            'inF0' ( in 4-component vector of float)
+0:285            'inF1' ( in 4-component vector of float)
 0:286      Sequence
 0:286        move second child to first child ( temp 4-component vector of float)
-0:286          'r022' ( temp 4-component vector of float)
-0:286          degrees ( temp 4-component vector of float)
-0:286            'inF0' ( in 4-component vector of float)
-0:287      Sequence
-0:287        move second child to first child ( temp float)
-0:287          'r023' ( temp float)
-0:287          distance ( temp float)
-0:287            'inF0' ( in 4-component vector of float)
-0:287            'inF1' ( in 4-component vector of float)
-0:288      Sequence
-0:288        move second child to first child ( temp float)
-0:288          'r024' ( temp float)
-0:288          dot-product ( temp float)
-0:288            'inF0' ( in 4-component vector of float)
-0:288            'inF1' ( in 4-component vector of float)
-0:289      Sequence
-0:289        move second child to first child ( temp 4-component vector of float)
-0:289          'r025' ( temp 4-component vector of float)
-0:289          Construct vec4 ( temp 4-component vector of float)
-0:289            Constant:
-0:289              1.000000
-0:289            component-wise multiply ( temp float)
-0:289              direct index ( temp float)
-0:289                'inF0' ( in 4-component vector of float)
-0:289                Constant:
-0:289                  1 (const int)
-0:289              direct index ( temp float)
-0:289                'inF1' ( in 4-component vector of float)
-0:289                Constant:
-0:289                  1 (const int)
-0:289            direct index ( temp float)
-0:289              'inF0' ( in 4-component vector of float)
-0:289              Constant:
-0:289                2 (const int)
-0:289            direct index ( temp float)
-0:289              'inF1' ( in 4-component vector of float)
-0:289              Constant:
-0:289                3 (const int)
+0:286          'r025' ( temp 4-component vector of float)
+0:286          Construct vec4 ( temp 4-component vector of float)
+0:286            Constant:
+0:286              1.000000
+0:286            component-wise multiply ( temp float)
+0:286              direct index ( temp float)
+0:286                'inF0' ( in 4-component vector of float)
+0:286                Constant:
+0:286                  1 (const int)
+0:286              direct index ( temp float)
+0:286                'inF1' ( in 4-component vector of float)
+0:286                Constant:
+0:286                  1 (const int)
+0:286            direct index ( temp float)
+0:286              'inF0' ( in 4-component vector of float)
+0:286              Constant:
+0:286                2 (const int)
+0:286            direct index ( temp float)
+0:286              'inF1' ( in 4-component vector of float)
+0:286              Constant:
+0:286                3 (const int)
+0:290      Sequence
+0:290        move second child to first child ( temp 4-component vector of float)
+0:290          'r029' ( temp 4-component vector of float)
+0:290          exp ( temp 4-component vector of float)
+0:290            'inF0' ( in 4-component vector of float)
+0:291      Sequence
+0:291        move second child to first child ( temp 4-component vector of float)
+0:291          'r030' ( temp 4-component vector of float)
+0:291          exp2 ( temp 4-component vector of float)
+0:291            'inF0' ( in 4-component vector of float)
+0:292      Sequence
+0:292        move second child to first child ( temp 4-component vector of float)
+0:292          'r031' ( temp 4-component vector of float)
+0:292          face-forward ( temp 4-component vector of float)
+0:292            'inF0' ( in 4-component vector of float)
+0:292            'inF1' ( in 4-component vector of float)
+0:292            'inF2' ( in 4-component vector of float)
 0:293      Sequence
-0:293        move second child to first child ( temp 4-component vector of float)
-0:293          'r029' ( temp 4-component vector of float)
-0:293          exp ( temp 4-component vector of float)
-0:293            'inF0' ( in 4-component vector of float)
-0:294      Sequence
-0:294        move second child to first child ( temp 4-component vector of float)
-0:294          'r030' ( temp 4-component vector of float)
-0:294          exp2 ( temp 4-component vector of float)
-0:294            'inF0' ( in 4-component vector of float)
-0:295      Sequence
-0:295        move second child to first child ( temp 4-component vector of float)
-0:295          'r031' ( temp 4-component vector of float)
-0:295          face-forward ( temp 4-component vector of float)
-0:295            'inF0' ( in 4-component vector of float)
-0:295            'inF1' ( in 4-component vector of float)
-0:295            'inF2' ( in 4-component vector of float)
-0:296      Sequence
-0:296        move second child to first child ( temp 4-component vector of uint)
-0:296          'r032' ( temp 4-component vector of uint)
+0:293        move second child to first child ( temp 4-component vector of uint)
+0:293          'r032' ( temp 4-component vector of uint)
 0:?           findMSB ( temp 4-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               8 (const uint)
 0:?               9 (const uint)
 0:?               10 (const uint)
-0:297      Sequence
-0:297        move second child to first child ( temp 4-component vector of uint)
-0:297          'r033' ( temp 4-component vector of uint)
+0:294      Sequence
+0:294        move second child to first child ( temp 4-component vector of uint)
+0:294          'r033' ( temp 4-component vector of uint)
 0:?           findLSB ( temp 4-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               8 (const uint)
 0:?               9 (const uint)
 0:?               10 (const uint)
+0:295      Sequence
+0:295        move second child to first child ( temp 4-component vector of float)
+0:295          'r034' ( temp 4-component vector of float)
+0:295          Floor ( temp 4-component vector of float)
+0:295            'inF0' ( in 4-component vector of float)
+0:297      Sequence
+0:297        move second child to first child ( temp 4-component vector of float)
+0:297          'r036' ( temp 4-component vector of float)
+0:297          mod ( temp 4-component vector of float)
+0:297            'inF0' ( in 4-component vector of float)
+0:297            'inF1' ( in 4-component vector of float)
 0:298      Sequence
 0:298        move second child to first child ( temp 4-component vector of float)
-0:298          'r034' ( temp 4-component vector of float)
-0:298          Floor ( temp 4-component vector of float)
+0:298          'r037' ( temp 4-component vector of float)
+0:298          Fraction ( temp 4-component vector of float)
 0:298            'inF0' ( in 4-component vector of float)
+0:299      Sequence
+0:299        move second child to first child ( temp 4-component vector of float)
+0:299          'r039' ( temp 4-component vector of float)
+0:299          fwidth ( temp 4-component vector of float)
+0:299            'inF0' ( in 4-component vector of float)
 0:300      Sequence
-0:300        move second child to first child ( temp 4-component vector of float)
-0:300          'r036' ( temp 4-component vector of float)
-0:300          mod ( temp 4-component vector of float)
+0:300        move second child to first child ( temp 4-component vector of bool)
+0:300          'r040' ( temp 4-component vector of bool)
+0:300          isinf ( temp 4-component vector of bool)
 0:300            'inF0' ( in 4-component vector of float)
-0:300            'inF1' ( in 4-component vector of float)
 0:301      Sequence
-0:301        move second child to first child ( temp 4-component vector of float)
-0:301          'r037' ( temp 4-component vector of float)
-0:301          Fraction ( temp 4-component vector of float)
+0:301        move second child to first child ( temp 4-component vector of bool)
+0:301          'r041' ( temp 4-component vector of bool)
+0:301          isnan ( temp 4-component vector of bool)
 0:301            'inF0' ( in 4-component vector of float)
 0:302      Sequence
 0:302        move second child to first child ( temp 4-component vector of float)
-0:302          'r038' ( temp 4-component vector of float)
-0:302          frexp ( temp 4-component vector of float)
+0:302          'r042' ( temp 4-component vector of float)
+0:302          ldexp ( temp 4-component vector of float)
 0:302            'inF0' ( in 4-component vector of float)
 0:302            'inF1' ( in 4-component vector of float)
 0:303      Sequence
 0:303        move second child to first child ( temp 4-component vector of float)
-0:303          'r039' ( temp 4-component vector of float)
-0:303          fwidth ( temp 4-component vector of float)
+0:303          'r039a' ( temp 4-component vector of float)
+0:303          mix ( temp 4-component vector of float)
 0:303            'inF0' ( in 4-component vector of float)
+0:303            'inF1' ( in 4-component vector of float)
+0:303            'inF2' ( in 4-component vector of float)
 0:304      Sequence
-0:304        move second child to first child ( temp 4-component vector of bool)
-0:304          'r040' ( temp 4-component vector of bool)
-0:304          isinf ( temp 4-component vector of bool)
+0:304        move second child to first child ( temp float)
+0:304          'r043' ( temp float)
+0:304          length ( temp float)
 0:304            'inF0' ( in 4-component vector of float)
 0:305      Sequence
-0:305        move second child to first child ( temp 4-component vector of bool)
-0:305          'r041' ( temp 4-component vector of bool)
-0:305          isnan ( temp 4-component vector of bool)
+0:305        move second child to first child ( temp 4-component vector of float)
+0:305          'r044' ( temp 4-component vector of float)
+0:305          log ( temp 4-component vector of float)
 0:305            'inF0' ( in 4-component vector of float)
 0:306      Sequence
 0:306        move second child to first child ( temp 4-component vector of float)
-0:306          'r042' ( temp 4-component vector of float)
-0:306          ldexp ( temp 4-component vector of float)
-0:306            'inF0' ( in 4-component vector of float)
-0:306            'inF1' ( in 4-component vector of float)
+0:306          'r045' ( temp 4-component vector of float)
+0:306          vector-scale ( temp 4-component vector of float)
+0:306            log2 ( temp 4-component vector of float)
+0:306              'inF0' ( in 4-component vector of float)
+0:306            Constant:
+0:306              0.301030
 0:307      Sequence
 0:307        move second child to first child ( temp 4-component vector of float)
-0:307          'r039a' ( temp 4-component vector of float)
-0:307          mix ( temp 4-component vector of float)
+0:307          'r046' ( temp 4-component vector of float)
+0:307          log2 ( temp 4-component vector of float)
 0:307            'inF0' ( in 4-component vector of float)
-0:307            'inF1' ( in 4-component vector of float)
-0:307            'inF2' ( in 4-component vector of float)
 0:308      Sequence
-0:308        move second child to first child ( temp float)
-0:308          'r043' ( temp float)
-0:308          length ( temp float)
+0:308        move second child to first child ( temp 4-component vector of float)
+0:308          'r047' ( temp 4-component vector of float)
+0:308          max ( temp 4-component vector of float)
 0:308            'inF0' ( in 4-component vector of float)
+0:308            'inF1' ( in 4-component vector of float)
 0:309      Sequence
 0:309        move second child to first child ( temp 4-component vector of float)
-0:309          'r044' ( temp 4-component vector of float)
-0:309          log ( temp 4-component vector of float)
+0:309          'r048' ( temp 4-component vector of float)
+0:309          min ( temp 4-component vector of float)
 0:309            'inF0' ( in 4-component vector of float)
+0:309            'inF1' ( in 4-component vector of float)
 0:310      Sequence
 0:310        move second child to first child ( temp 4-component vector of float)
-0:310          'r045' ( temp 4-component vector of float)
-0:310          vector-scale ( temp 4-component vector of float)
-0:310            log2 ( temp 4-component vector of float)
-0:310              'inF0' ( in 4-component vector of float)
-0:310            Constant:
-0:310              0.301030
+0:310          'r049' ( temp 4-component vector of float)
+0:310          normalize ( temp 4-component vector of float)
+0:310            'inF0' ( in 4-component vector of float)
 0:311      Sequence
 0:311        move second child to first child ( temp 4-component vector of float)
-0:311          'r046' ( temp 4-component vector of float)
-0:311          log2 ( temp 4-component vector of float)
+0:311          'r050' ( temp 4-component vector of float)
+0:311          pow ( temp 4-component vector of float)
 0:311            'inF0' ( in 4-component vector of float)
+0:311            'inF1' ( in 4-component vector of float)
 0:312      Sequence
 0:312        move second child to first child ( temp 4-component vector of float)
-0:312          'r047' ( temp 4-component vector of float)
-0:312          max ( temp 4-component vector of float)
+0:312          'r051' ( temp 4-component vector of float)
+0:312          radians ( temp 4-component vector of float)
 0:312            'inF0' ( in 4-component vector of float)
-0:312            'inF1' ( in 4-component vector of float)
 0:313      Sequence
 0:313        move second child to first child ( temp 4-component vector of float)
-0:313          'r048' ( temp 4-component vector of float)
-0:313          min ( temp 4-component vector of float)
+0:313          'r052' ( temp 4-component vector of float)
+0:313          divide ( temp 4-component vector of float)
+0:313            Constant:
+0:313              1.000000
 0:313            'inF0' ( in 4-component vector of float)
-0:313            'inF1' ( in 4-component vector of float)
 0:314      Sequence
 0:314        move second child to first child ( temp 4-component vector of float)
-0:314          'r049' ( temp 4-component vector of float)
-0:314          normalize ( temp 4-component vector of float)
+0:314          'r053' ( temp 4-component vector of float)
+0:314          reflect ( temp 4-component vector of float)
 0:314            'inF0' ( in 4-component vector of float)
+0:314            'inF1' ( in 4-component vector of float)
 0:315      Sequence
 0:315        move second child to first child ( temp 4-component vector of float)
-0:315          'r050' ( temp 4-component vector of float)
-0:315          pow ( temp 4-component vector of float)
+0:315          'r054' ( temp 4-component vector of float)
+0:315          refract ( temp 4-component vector of float)
 0:315            'inF0' ( in 4-component vector of float)
 0:315            'inF1' ( in 4-component vector of float)
+0:315            Constant:
+0:315              2.000000
 0:316      Sequence
-0:316        move second child to first child ( temp 4-component vector of float)
-0:316          'r051' ( temp 4-component vector of float)
-0:316          radians ( temp 4-component vector of float)
-0:316            'inF0' ( in 4-component vector of float)
-0:317      Sequence
-0:317        move second child to first child ( temp 4-component vector of float)
-0:317          'r052' ( temp 4-component vector of float)
-0:317          divide ( temp 4-component vector of float)
-0:317            Constant:
-0:317              1.000000
-0:317            'inF0' ( in 4-component vector of float)
-0:318      Sequence
-0:318        move second child to first child ( temp 4-component vector of float)
-0:318          'r053' ( temp 4-component vector of float)
-0:318          reflect ( temp 4-component vector of float)
-0:318            'inF0' ( in 4-component vector of float)
-0:318            'inF1' ( in 4-component vector of float)
-0:319      Sequence
-0:319        move second child to first child ( temp 4-component vector of float)
-0:319          'r054' ( temp 4-component vector of float)
-0:319          refract ( temp 4-component vector of float)
-0:319            'inF0' ( in 4-component vector of float)
-0:319            'inF1' ( in 4-component vector of float)
-0:319            Constant:
-0:319              2.000000
-0:320      Sequence
-0:320        move second child to first child ( temp 4-component vector of uint)
-0:320          'r055' ( temp 4-component vector of uint)
+0:316        move second child to first child ( temp 4-component vector of uint)
+0:316          'r055' ( temp 4-component vector of uint)
 0:?           bitFieldReverse ( temp 4-component vector of uint)
 0:?             Constant:
 0:?               1 (const uint)
 0:?               2 (const uint)
 0:?               3 (const uint)
 0:?               4 (const uint)
+0:317      Sequence
+0:317        move second child to first child ( temp 4-component vector of float)
+0:317          'r056' ( temp 4-component vector of float)
+0:317          roundEven ( temp 4-component vector of float)
+0:317            'inF0' ( in 4-component vector of float)
+0:318      Sequence
+0:318        move second child to first child ( temp 4-component vector of float)
+0:318          'r057' ( temp 4-component vector of float)
+0:318          inverse sqrt ( temp 4-component vector of float)
+0:318            'inF0' ( in 4-component vector of float)
+0:319      Sequence
+0:319        move second child to first child ( temp 4-component vector of float)
+0:319          'r058' ( temp 4-component vector of float)
+0:319          clamp ( temp 4-component vector of float)
+0:319            'inF0' ( in 4-component vector of float)
+0:319            Constant:
+0:319              0.000000
+0:319            Constant:
+0:319              1.000000
+0:320      Sequence
+0:320        move second child to first child ( temp 4-component vector of float)
+0:320          'r059' ( temp 4-component vector of float)
+0:320          Sign ( temp 4-component vector of float)
+0:320            'inF0' ( in 4-component vector of float)
 0:321      Sequence
 0:321        move second child to first child ( temp 4-component vector of float)
-0:321          'r056' ( temp 4-component vector of float)
-0:321          roundEven ( temp 4-component vector of float)
+0:321          'r060' ( temp 4-component vector of float)
+0:321          sine ( temp 4-component vector of float)
 0:321            'inF0' ( in 4-component vector of float)
 0:322      Sequence
 0:322        move second child to first child ( temp 4-component vector of float)
-0:322          'r057' ( temp 4-component vector of float)
-0:322          inverse sqrt ( temp 4-component vector of float)
+0:322          'inF1' ( in 4-component vector of float)
+0:322          sine ( temp 4-component vector of float)
+0:322            'inF0' ( in 4-component vector of float)
+0:322        move second child to first child ( temp 4-component vector of float)
+0:322          'inF2' ( in 4-component vector of float)
+0:322          cosine ( temp 4-component vector of float)
 0:322            'inF0' ( in 4-component vector of float)
 0:323      Sequence
 0:323        move second child to first child ( temp 4-component vector of float)
-0:323          'r058' ( temp 4-component vector of float)
-0:323          clamp ( temp 4-component vector of float)
+0:323          'r061' ( temp 4-component vector of float)
+0:323          hyp. sine ( temp 4-component vector of float)
 0:323            'inF0' ( in 4-component vector of float)
-0:323            Constant:
-0:323              0.000000
-0:323            Constant:
-0:323              1.000000
 0:324      Sequence
 0:324        move second child to first child ( temp 4-component vector of float)
-0:324          'r059' ( temp 4-component vector of float)
-0:324          Sign ( temp 4-component vector of float)
+0:324          'r062' ( temp 4-component vector of float)
+0:324          smoothstep ( temp 4-component vector of float)
 0:324            'inF0' ( in 4-component vector of float)
+0:324            'inF1' ( in 4-component vector of float)
+0:324            'inF2' ( in 4-component vector of float)
 0:325      Sequence
 0:325        move second child to first child ( temp 4-component vector of float)
-0:325          'r060' ( temp 4-component vector of float)
-0:325          sine ( temp 4-component vector of float)
+0:325          'r063' ( temp 4-component vector of float)
+0:325          sqrt ( temp 4-component vector of float)
 0:325            'inF0' ( in 4-component vector of float)
 0:326      Sequence
 0:326        move second child to first child ( temp 4-component vector of float)
-0:326          'inF1' ( in 4-component vector of float)
-0:326          sine ( temp 4-component vector of float)
-0:326            'inF0' ( in 4-component vector of float)
-0:326        move second child to first child ( temp 4-component vector of float)
-0:326          'inF2' ( in 4-component vector of float)
-0:326          cosine ( temp 4-component vector of float)
+0:326          'r064' ( temp 4-component vector of float)
+0:326          step ( temp 4-component vector of float)
 0:326            'inF0' ( in 4-component vector of float)
+0:326            'inF1' ( in 4-component vector of float)
 0:327      Sequence
 0:327        move second child to first child ( temp 4-component vector of float)
-0:327          'r061' ( temp 4-component vector of float)
-0:327          hyp. sine ( temp 4-component vector of float)
+0:327          'r065' ( temp 4-component vector of float)
+0:327          tangent ( temp 4-component vector of float)
 0:327            'inF0' ( in 4-component vector of float)
 0:328      Sequence
 0:328        move second child to first child ( temp 4-component vector of float)
-0:328          'r062' ( temp 4-component vector of float)
-0:328          smoothstep ( temp 4-component vector of float)
+0:328          'r066' ( temp 4-component vector of float)
+0:328          hyp. tangent ( temp 4-component vector of float)
 0:328            'inF0' ( in 4-component vector of float)
-0:328            'inF1' ( in 4-component vector of float)
-0:328            'inF2' ( in 4-component vector of float)
-0:329      Sequence
-0:329        move second child to first child ( temp 4-component vector of float)
-0:329          'r063' ( temp 4-component vector of float)
-0:329          sqrt ( temp 4-component vector of float)
-0:329            'inF0' ( in 4-component vector of float)
 0:330      Sequence
 0:330        move second child to first child ( temp 4-component vector of float)
-0:330          'r064' ( temp 4-component vector of float)
-0:330          step ( temp 4-component vector of float)
+0:330          'r067' ( temp 4-component vector of float)
+0:330          trunc ( temp 4-component vector of float)
 0:330            'inF0' ( in 4-component vector of float)
-0:330            'inF1' ( in 4-component vector of float)
-0:331      Sequence
-0:331        move second child to first child ( temp 4-component vector of float)
-0:331          'r065' ( temp 4-component vector of float)
-0:331          tangent ( temp 4-component vector of float)
-0:331            'inF0' ( in 4-component vector of float)
-0:332      Sequence
-0:332        move second child to first child ( temp 4-component vector of float)
-0:332          'r066' ( temp 4-component vector of float)
-0:332          hyp. tangent ( temp 4-component vector of float)
-0:332            'inF0' ( in 4-component vector of float)
-0:334      Sequence
-0:334        move second child to first child ( temp 4-component vector of float)
-0:334          'r067' ( temp 4-component vector of float)
-0:334          trunc ( temp 4-component vector of float)
-0:334            'inF0' ( in 4-component vector of float)
-0:337      Branch: Return with expression
+0:333      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
 0:?           3.000000
 0:?           4.000000
-0:401  Function Definition: PixelShaderFunction2x2(mf22;mf22;mf22; ( temp 2X2 matrix of float)
-0:401    Function Parameters: 
-0:401      'inF0' ( in 2X2 matrix of float)
-0:401      'inF1' ( in 2X2 matrix of float)
-0:401      'inF2' ( in 2X2 matrix of float)
+0:396  Function Definition: PixelShaderFunction2x2(mf22;mf22;mf22; ( temp 2X2 matrix of float)
+0:396    Function Parameters: 
+0:396      'inF0' ( in 2X2 matrix of float)
+0:396      'inF1' ( in 2X2 matrix of float)
+0:396      'inF2' ( in 2X2 matrix of float)
 0:?     Sequence
-0:403      Sequence
-0:403        move second child to first child ( temp bool)
-0:403          'r000' ( temp bool)
-0:403          all ( temp bool)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r001' ( temp 2X2 matrix of float)
-0:403          Absolute value ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      arc cosine ( temp 2X2 matrix of float)
-0:403        'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp bool)
-0:403          'r003' ( temp bool)
-0:403          any ( temp bool)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r004' ( temp 2X2 matrix of float)
-0:403          arc sine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r005' ( temp 2X2 matrix of float)
-0:403          arc tangent ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r006' ( temp 2X2 matrix of float)
-0:403          arc tangent ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r007' ( temp 2X2 matrix of float)
-0:403          Ceiling ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Test condition and select ( temp void)
-0:403        Condition
-0:403        any ( temp bool)
-0:403          Compare Less Than ( temp 2X2 matrix of bool)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            Constant:
-0:403              0.000000
-0:403              0.000000
-0:403              0.000000
-0:403              0.000000
-0:403        true case
-0:403        Branch: Kill
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r008' ( temp 2X2 matrix of float)
-0:403          clamp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403            'inF2' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r009' ( temp 2X2 matrix of float)
-0:403          cosine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r010' ( temp 2X2 matrix of float)
-0:403          hyp. cosine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r011' ( temp 2X2 matrix of float)
-0:403          dPdx ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r012' ( temp 2X2 matrix of float)
-0:403          dPdxCoarse ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r013' ( temp 2X2 matrix of float)
-0:403          dPdxFine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r014' ( temp 2X2 matrix of float)
-0:403          dPdy ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r015' ( temp 2X2 matrix of float)
-0:403          dPdyCoarse ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r016' ( temp 2X2 matrix of float)
-0:403          dPdyFine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r017' ( temp 2X2 matrix of float)
-0:403          degrees ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp float)
-0:403          'r018' ( temp float)
-0:403          determinant ( temp float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r019' ( temp 2X2 matrix of float)
-0:403          exp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'R020' ( temp 2X2 matrix of float)
-0:403          exp2 ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r021' ( temp 2X2 matrix of float)
-0:403          Floor ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r022' ( temp 2X2 matrix of float)
-0:403          mod ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r023' ( temp 2X2 matrix of float)
-0:403          Fraction ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r024' ( temp 2X2 matrix of float)
-0:403          frexp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r025' ( temp 2X2 matrix of float)
-0:403          fwidth ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r026' ( temp 2X2 matrix of float)
-0:403          ldexp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r026a' ( temp 2X2 matrix of float)
-0:403          mix ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403            'inF2' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r027' ( temp 2X2 matrix of float)
-0:403          log ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r028' ( temp 2X2 matrix of float)
-0:403          matrix-scale ( temp 2X2 matrix of float)
-0:403            log2 ( temp 2X2 matrix of float)
-0:403              'inF0' ( in 2X2 matrix of float)
-0:403            Constant:
-0:403              0.301030
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r029' ( temp 2X2 matrix of float)
-0:403          log2 ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r030' ( temp 2X2 matrix of float)
-0:403          max ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r031' ( temp 2X2 matrix of float)
-0:403          min ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r032' ( temp 2X2 matrix of float)
-0:403          pow ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r033' ( temp 2X2 matrix of float)
-0:403          radians ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r034' ( temp 2X2 matrix of float)
-0:403          roundEven ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r035' ( temp 2X2 matrix of float)
-0:403          inverse sqrt ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r036' ( temp 2X2 matrix of float)
-0:403          clamp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            Constant:
-0:403              0.000000
-0:403            Constant:
-0:403              1.000000
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r037' ( temp 2X2 matrix of float)
-0:403          Sign ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r038' ( temp 2X2 matrix of float)
-0:403          sine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'inF1' ( in 2X2 matrix of float)
-0:403          sine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'inF2' ( in 2X2 matrix of float)
-0:403          cosine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r039' ( temp 2X2 matrix of float)
-0:403          hyp. sine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r049' ( temp 2X2 matrix of float)
-0:403          smoothstep ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403            'inF2' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r041' ( temp 2X2 matrix of float)
-0:403          sqrt ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r042' ( temp 2X2 matrix of float)
-0:403          step ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r043' ( temp 2X2 matrix of float)
-0:403          tangent ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r044' ( temp 2X2 matrix of float)
-0:403          hyp. tangent ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      transpose ( temp 2X2 matrix of float)
-0:403        'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r046' ( temp 2X2 matrix of float)
-0:403          trunc ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:406      Branch: Return with expression
+0:398      Sequence
+0:398        move second child to first child ( temp bool)
+0:398          'r000' ( temp bool)
+0:398          all ( temp bool)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r001' ( temp 2X2 matrix of float)
+0:398          Absolute value ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      arc cosine ( temp 2X2 matrix of float)
+0:398        'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp bool)
+0:398          'r003' ( temp bool)
+0:398          any ( temp bool)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r004' ( temp 2X2 matrix of float)
+0:398          arc sine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r005' ( temp 2X2 matrix of float)
+0:398          arc tangent ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r006' ( temp 2X2 matrix of float)
+0:398          arc tangent ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r007' ( temp 2X2 matrix of float)
+0:398          Ceiling ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Test condition and select ( temp void)
+0:398        Condition
+0:398        any ( temp bool)
+0:398          Compare Less Than ( temp 2X2 matrix of bool)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            Constant:
+0:398              0.000000
+0:398              0.000000
+0:398              0.000000
+0:398              0.000000
+0:398        true case
+0:398        Branch: Kill
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r008' ( temp 2X2 matrix of float)
+0:398          clamp ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398            'inF2' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r009' ( temp 2X2 matrix of float)
+0:398          cosine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r010' ( temp 2X2 matrix of float)
+0:398          hyp. cosine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r011' ( temp 2X2 matrix of float)
+0:398          dPdx ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r012' ( temp 2X2 matrix of float)
+0:398          dPdxCoarse ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r013' ( temp 2X2 matrix of float)
+0:398          dPdxFine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r014' ( temp 2X2 matrix of float)
+0:398          dPdy ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r015' ( temp 2X2 matrix of float)
+0:398          dPdyCoarse ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r016' ( temp 2X2 matrix of float)
+0:398          dPdyFine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r017' ( temp 2X2 matrix of float)
+0:398          degrees ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp float)
+0:398          'r018' ( temp float)
+0:398          determinant ( temp float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r019' ( temp 2X2 matrix of float)
+0:398          exp ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'R020' ( temp 2X2 matrix of float)
+0:398          exp2 ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r021' ( temp 2X2 matrix of float)
+0:398          Floor ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r022' ( temp 2X2 matrix of float)
+0:398          mod ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r023' ( temp 2X2 matrix of float)
+0:398          Fraction ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r025' ( temp 2X2 matrix of float)
+0:398          fwidth ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r026' ( temp 2X2 matrix of float)
+0:398          ldexp ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r026a' ( temp 2X2 matrix of float)
+0:398          mix ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398            'inF2' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r027' ( temp 2X2 matrix of float)
+0:398          log ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r028' ( temp 2X2 matrix of float)
+0:398          matrix-scale ( temp 2X2 matrix of float)
+0:398            log2 ( temp 2X2 matrix of float)
+0:398              'inF0' ( in 2X2 matrix of float)
+0:398            Constant:
+0:398              0.301030
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r029' ( temp 2X2 matrix of float)
+0:398          log2 ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r030' ( temp 2X2 matrix of float)
+0:398          max ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r031' ( temp 2X2 matrix of float)
+0:398          min ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r032' ( temp 2X2 matrix of float)
+0:398          pow ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r033' ( temp 2X2 matrix of float)
+0:398          radians ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r034' ( temp 2X2 matrix of float)
+0:398          roundEven ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r035' ( temp 2X2 matrix of float)
+0:398          inverse sqrt ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r036' ( temp 2X2 matrix of float)
+0:398          clamp ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            Constant:
+0:398              0.000000
+0:398            Constant:
+0:398              1.000000
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r037' ( temp 2X2 matrix of float)
+0:398          Sign ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r038' ( temp 2X2 matrix of float)
+0:398          sine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'inF1' ( in 2X2 matrix of float)
+0:398          sine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'inF2' ( in 2X2 matrix of float)
+0:398          cosine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r039' ( temp 2X2 matrix of float)
+0:398          hyp. sine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r049' ( temp 2X2 matrix of float)
+0:398          smoothstep ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398            'inF2' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r041' ( temp 2X2 matrix of float)
+0:398          sqrt ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r042' ( temp 2X2 matrix of float)
+0:398          step ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r043' ( temp 2X2 matrix of float)
+0:398          tangent ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r044' ( temp 2X2 matrix of float)
+0:398          hyp. tangent ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      transpose ( temp 2X2 matrix of float)
+0:398        'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r046' ( temp 2X2 matrix of float)
+0:398          trunc ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:401      Branch: Return with expression
 0:?         Constant:
 0:?           2.000000
 0:?           2.000000
 0:?           2.000000
 0:?           2.000000
-0:410  Function Definition: PixelShaderFunction3x3(mf33;mf33;mf33; ( temp 3X3 matrix of float)
-0:410    Function Parameters: 
-0:410      'inF0' ( in 3X3 matrix of float)
-0:410      'inF1' ( in 3X3 matrix of float)
-0:410      'inF2' ( in 3X3 matrix of float)
+0:405  Function Definition: PixelShaderFunction3x3(mf33;mf33;mf33; ( temp 3X3 matrix of float)
+0:405    Function Parameters: 
+0:405      'inF0' ( in 3X3 matrix of float)
+0:405      'inF1' ( in 3X3 matrix of float)
+0:405      'inF2' ( in 3X3 matrix of float)
 0:?     Sequence
-0:412      Sequence
-0:412        move second child to first child ( temp bool)
-0:412          'r000' ( temp bool)
-0:412          all ( temp bool)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r001' ( temp 3X3 matrix of float)
-0:412          Absolute value ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      arc cosine ( temp 3X3 matrix of float)
-0:412        'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp bool)
-0:412          'r003' ( temp bool)
-0:412          any ( temp bool)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r004' ( temp 3X3 matrix of float)
-0:412          arc sine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r005' ( temp 3X3 matrix of float)
-0:412          arc tangent ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r006' ( temp 3X3 matrix of float)
-0:412          arc tangent ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r007' ( temp 3X3 matrix of float)
-0:412          Ceiling ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Test condition and select ( temp void)
-0:412        Condition
-0:412        any ( temp bool)
-0:412          Compare Less Than ( temp 3X3 matrix of bool)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            Constant:
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412        true case
-0:412        Branch: Kill
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r008' ( temp 3X3 matrix of float)
-0:412          clamp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412            'inF2' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r009' ( temp 3X3 matrix of float)
-0:412          cosine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r010' ( temp 3X3 matrix of float)
-0:412          hyp. cosine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r011' ( temp 3X3 matrix of float)
-0:412          dPdx ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r012' ( temp 3X3 matrix of float)
-0:412          dPdxCoarse ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r013' ( temp 3X3 matrix of float)
-0:412          dPdxFine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r014' ( temp 3X3 matrix of float)
-0:412          dPdy ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r015' ( temp 3X3 matrix of float)
-0:412          dPdyCoarse ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r016' ( temp 3X3 matrix of float)
-0:412          dPdyFine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r017' ( temp 3X3 matrix of float)
-0:412          degrees ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp float)
-0:412          'r018' ( temp float)
-0:412          determinant ( temp float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r019' ( temp 3X3 matrix of float)
-0:412          exp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'R020' ( temp 3X3 matrix of float)
-0:412          exp2 ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r021' ( temp 3X3 matrix of float)
-0:412          Floor ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r022' ( temp 3X3 matrix of float)
-0:412          mod ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r023' ( temp 3X3 matrix of float)
-0:412          Fraction ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r024' ( temp 3X3 matrix of float)
-0:412          frexp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r025' ( temp 3X3 matrix of float)
-0:412          fwidth ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r026' ( temp 3X3 matrix of float)
-0:412          ldexp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r026a' ( temp 3X3 matrix of float)
-0:412          mix ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412            'inF2' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r027' ( temp 3X3 matrix of float)
-0:412          log ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r028' ( temp 3X3 matrix of float)
-0:412          matrix-scale ( temp 3X3 matrix of float)
-0:412            log2 ( temp 3X3 matrix of float)
-0:412              'inF0' ( in 3X3 matrix of float)
-0:412            Constant:
-0:412              0.301030
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r029' ( temp 3X3 matrix of float)
-0:412          log2 ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r030' ( temp 3X3 matrix of float)
-0:412          max ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r031' ( temp 3X3 matrix of float)
-0:412          min ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r032' ( temp 3X3 matrix of float)
-0:412          pow ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r033' ( temp 3X3 matrix of float)
-0:412          radians ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r034' ( temp 3X3 matrix of float)
-0:412          roundEven ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r035' ( temp 3X3 matrix of float)
-0:412          inverse sqrt ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r036' ( temp 3X3 matrix of float)
-0:412          clamp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            Constant:
-0:412              0.000000
-0:412            Constant:
-0:412              1.000000
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r037' ( temp 3X3 matrix of float)
-0:412          Sign ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r038' ( temp 3X3 matrix of float)
-0:412          sine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'inF1' ( in 3X3 matrix of float)
-0:412          sine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'inF2' ( in 3X3 matrix of float)
-0:412          cosine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r039' ( temp 3X3 matrix of float)
-0:412          hyp. sine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r049' ( temp 3X3 matrix of float)
-0:412          smoothstep ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412            'inF2' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r041' ( temp 3X3 matrix of float)
-0:412          sqrt ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r042' ( temp 3X3 matrix of float)
-0:412          step ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r043' ( temp 3X3 matrix of float)
-0:412          tangent ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r044' ( temp 3X3 matrix of float)
-0:412          hyp. tangent ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      transpose ( temp 3X3 matrix of float)
-0:412        'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r046' ( temp 3X3 matrix of float)
-0:412          trunc ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:415      Branch: Return with expression
+0:407      Sequence
+0:407        move second child to first child ( temp bool)
+0:407          'r000' ( temp bool)
+0:407          all ( temp bool)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r001' ( temp 3X3 matrix of float)
+0:407          Absolute value ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      arc cosine ( temp 3X3 matrix of float)
+0:407        'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp bool)
+0:407          'r003' ( temp bool)
+0:407          any ( temp bool)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r004' ( temp 3X3 matrix of float)
+0:407          arc sine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r005' ( temp 3X3 matrix of float)
+0:407          arc tangent ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r006' ( temp 3X3 matrix of float)
+0:407          arc tangent ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r007' ( temp 3X3 matrix of float)
+0:407          Ceiling ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Test condition and select ( temp void)
+0:407        Condition
+0:407        any ( temp bool)
+0:407          Compare Less Than ( temp 3X3 matrix of bool)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            Constant:
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407        true case
+0:407        Branch: Kill
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r008' ( temp 3X3 matrix of float)
+0:407          clamp ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407            'inF2' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r009' ( temp 3X3 matrix of float)
+0:407          cosine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r010' ( temp 3X3 matrix of float)
+0:407          hyp. cosine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r011' ( temp 3X3 matrix of float)
+0:407          dPdx ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r012' ( temp 3X3 matrix of float)
+0:407          dPdxCoarse ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r013' ( temp 3X3 matrix of float)
+0:407          dPdxFine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r014' ( temp 3X3 matrix of float)
+0:407          dPdy ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r015' ( temp 3X3 matrix of float)
+0:407          dPdyCoarse ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r016' ( temp 3X3 matrix of float)
+0:407          dPdyFine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r017' ( temp 3X3 matrix of float)
+0:407          degrees ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp float)
+0:407          'r018' ( temp float)
+0:407          determinant ( temp float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r019' ( temp 3X3 matrix of float)
+0:407          exp ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'R020' ( temp 3X3 matrix of float)
+0:407          exp2 ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r021' ( temp 3X3 matrix of float)
+0:407          Floor ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r022' ( temp 3X3 matrix of float)
+0:407          mod ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r023' ( temp 3X3 matrix of float)
+0:407          Fraction ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r025' ( temp 3X3 matrix of float)
+0:407          fwidth ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r026' ( temp 3X3 matrix of float)
+0:407          ldexp ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r026a' ( temp 3X3 matrix of float)
+0:407          mix ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407            'inF2' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r027' ( temp 3X3 matrix of float)
+0:407          log ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r028' ( temp 3X3 matrix of float)
+0:407          matrix-scale ( temp 3X3 matrix of float)
+0:407            log2 ( temp 3X3 matrix of float)
+0:407              'inF0' ( in 3X3 matrix of float)
+0:407            Constant:
+0:407              0.301030
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r029' ( temp 3X3 matrix of float)
+0:407          log2 ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r030' ( temp 3X3 matrix of float)
+0:407          max ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r031' ( temp 3X3 matrix of float)
+0:407          min ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r032' ( temp 3X3 matrix of float)
+0:407          pow ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r033' ( temp 3X3 matrix of float)
+0:407          radians ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r034' ( temp 3X3 matrix of float)
+0:407          roundEven ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r035' ( temp 3X3 matrix of float)
+0:407          inverse sqrt ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r036' ( temp 3X3 matrix of float)
+0:407          clamp ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            Constant:
+0:407              0.000000
+0:407            Constant:
+0:407              1.000000
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r037' ( temp 3X3 matrix of float)
+0:407          Sign ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r038' ( temp 3X3 matrix of float)
+0:407          sine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'inF1' ( in 3X3 matrix of float)
+0:407          sine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'inF2' ( in 3X3 matrix of float)
+0:407          cosine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r039' ( temp 3X3 matrix of float)
+0:407          hyp. sine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r049' ( temp 3X3 matrix of float)
+0:407          smoothstep ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407            'inF2' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r041' ( temp 3X3 matrix of float)
+0:407          sqrt ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r042' ( temp 3X3 matrix of float)
+0:407          step ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r043' ( temp 3X3 matrix of float)
+0:407          tangent ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r044' ( temp 3X3 matrix of float)
+0:407          hyp. tangent ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      transpose ( temp 3X3 matrix of float)
+0:407        'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r046' ( temp 3X3 matrix of float)
+0:407          trunc ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:410      Branch: Return with expression
 0:?         Constant:
 0:?           3.000000
 0:?           3.000000
@@ -2145,301 +2109,295 @@ gl_FragCoord origin is upper left
 0:?           3.000000
 0:?           3.000000
 0:?           3.000000
-0:419  Function Definition: PixelShaderFunction4x4(mf44;mf44;mf44; ( temp 4X4 matrix of float)
-0:419    Function Parameters: 
-0:419      'inF0' ( in 4X4 matrix of float)
-0:419      'inF1' ( in 4X4 matrix of float)
-0:419      'inF2' ( in 4X4 matrix of float)
+0:414  Function Definition: PixelShaderFunction4x4(mf44;mf44;mf44; ( temp 4X4 matrix of float)
+0:414    Function Parameters: 
+0:414      'inF0' ( in 4X4 matrix of float)
+0:414      'inF1' ( in 4X4 matrix of float)
+0:414      'inF2' ( in 4X4 matrix of float)
 0:?     Sequence
-0:421      Sequence
-0:421        move second child to first child ( temp bool)
-0:421          'r000' ( temp bool)
-0:421          all ( temp bool)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r001' ( temp 4X4 matrix of float)
-0:421          Absolute value ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      arc cosine ( temp 4X4 matrix of float)
-0:421        'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp bool)
-0:421          'r003' ( temp bool)
-0:421          any ( temp bool)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r004' ( temp 4X4 matrix of float)
-0:421          arc sine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r005' ( temp 4X4 matrix of float)
-0:421          arc tangent ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r006' ( temp 4X4 matrix of float)
-0:421          arc tangent ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r007' ( temp 4X4 matrix of float)
-0:421          Ceiling ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Test condition and select ( temp void)
-0:421        Condition
-0:421        any ( temp bool)
-0:421          Compare Less Than ( temp 4X4 matrix of bool)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            Constant:
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421        true case
-0:421        Branch: Kill
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r008' ( temp 4X4 matrix of float)
-0:421          clamp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421            'inF2' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r009' ( temp 4X4 matrix of float)
-0:421          cosine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r010' ( temp 4X4 matrix of float)
-0:421          hyp. cosine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r011' ( temp 4X4 matrix of float)
-0:421          dPdx ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r012' ( temp 4X4 matrix of float)
-0:421          dPdxCoarse ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r013' ( temp 4X4 matrix of float)
-0:421          dPdxFine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r014' ( temp 4X4 matrix of float)
-0:421          dPdy ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r015' ( temp 4X4 matrix of float)
-0:421          dPdyCoarse ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r016' ( temp 4X4 matrix of float)
-0:421          dPdyFine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r017' ( temp 4X4 matrix of float)
-0:421          degrees ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp float)
-0:421          'r018' ( temp float)
-0:421          determinant ( temp float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r019' ( temp 4X4 matrix of float)
-0:421          exp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'R020' ( temp 4X4 matrix of float)
-0:421          exp2 ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r021' ( temp 4X4 matrix of float)
-0:421          Floor ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r022' ( temp 4X4 matrix of float)
-0:421          mod ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r023' ( temp 4X4 matrix of float)
-0:421          Fraction ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r024' ( temp 4X4 matrix of float)
-0:421          frexp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r025' ( temp 4X4 matrix of float)
-0:421          fwidth ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r026' ( temp 4X4 matrix of float)
-0:421          ldexp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r026a' ( temp 4X4 matrix of float)
-0:421          mix ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421            'inF2' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r027' ( temp 4X4 matrix of float)
-0:421          log ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r028' ( temp 4X4 matrix of float)
-0:421          matrix-scale ( temp 4X4 matrix of float)
-0:421            log2 ( temp 4X4 matrix of float)
-0:421              'inF0' ( in 4X4 matrix of float)
-0:421            Constant:
-0:421              0.301030
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r029' ( temp 4X4 matrix of float)
-0:421          log2 ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r030' ( temp 4X4 matrix of float)
-0:421          max ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r031' ( temp 4X4 matrix of float)
-0:421          min ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r032' ( temp 4X4 matrix of float)
-0:421          pow ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r033' ( temp 4X4 matrix of float)
-0:421          radians ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r034' ( temp 4X4 matrix of float)
-0:421          roundEven ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r035' ( temp 4X4 matrix of float)
-0:421          inverse sqrt ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r036' ( temp 4X4 matrix of float)
-0:421          clamp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            Constant:
-0:421              0.000000
-0:421            Constant:
-0:421              1.000000
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r037' ( temp 4X4 matrix of float)
-0:421          Sign ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r038' ( temp 4X4 matrix of float)
-0:421          sine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'inF1' ( in 4X4 matrix of float)
-0:421          sine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'inF2' ( in 4X4 matrix of float)
-0:421          cosine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r039' ( temp 4X4 matrix of float)
-0:421          hyp. sine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r049' ( temp 4X4 matrix of float)
-0:421          smoothstep ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421            'inF2' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r041' ( temp 4X4 matrix of float)
-0:421          sqrt ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r042' ( temp 4X4 matrix of float)
-0:421          step ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r043' ( temp 4X4 matrix of float)
-0:421          tangent ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r044' ( temp 4X4 matrix of float)
-0:421          hyp. tangent ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      transpose ( temp 4X4 matrix of float)
-0:421        'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r046' ( temp 4X4 matrix of float)
-0:421          trunc ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:424      Branch: Return with expression
+0:416      Sequence
+0:416        move second child to first child ( temp bool)
+0:416          'r000' ( temp bool)
+0:416          all ( temp bool)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r001' ( temp 4X4 matrix of float)
+0:416          Absolute value ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      arc cosine ( temp 4X4 matrix of float)
+0:416        'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp bool)
+0:416          'r003' ( temp bool)
+0:416          any ( temp bool)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r004' ( temp 4X4 matrix of float)
+0:416          arc sine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r005' ( temp 4X4 matrix of float)
+0:416          arc tangent ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r006' ( temp 4X4 matrix of float)
+0:416          arc tangent ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r007' ( temp 4X4 matrix of float)
+0:416          Ceiling ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Test condition and select ( temp void)
+0:416        Condition
+0:416        any ( temp bool)
+0:416          Compare Less Than ( temp 4X4 matrix of bool)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            Constant:
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416        true case
+0:416        Branch: Kill
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r008' ( temp 4X4 matrix of float)
+0:416          clamp ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416            'inF2' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r009' ( temp 4X4 matrix of float)
+0:416          cosine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r010' ( temp 4X4 matrix of float)
+0:416          hyp. cosine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r011' ( temp 4X4 matrix of float)
+0:416          dPdx ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r012' ( temp 4X4 matrix of float)
+0:416          dPdxCoarse ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r013' ( temp 4X4 matrix of float)
+0:416          dPdxFine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r014' ( temp 4X4 matrix of float)
+0:416          dPdy ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r015' ( temp 4X4 matrix of float)
+0:416          dPdyCoarse ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r016' ( temp 4X4 matrix of float)
+0:416          dPdyFine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r017' ( temp 4X4 matrix of float)
+0:416          degrees ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp float)
+0:416          'r018' ( temp float)
+0:416          determinant ( temp float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r019' ( temp 4X4 matrix of float)
+0:416          exp ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'R020' ( temp 4X4 matrix of float)
+0:416          exp2 ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r021' ( temp 4X4 matrix of float)
+0:416          Floor ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r022' ( temp 4X4 matrix of float)
+0:416          mod ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r023' ( temp 4X4 matrix of float)
+0:416          Fraction ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r025' ( temp 4X4 matrix of float)
+0:416          fwidth ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r026' ( temp 4X4 matrix of float)
+0:416          ldexp ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r026a' ( temp 4X4 matrix of float)
+0:416          mix ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416            'inF2' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r027' ( temp 4X4 matrix of float)
+0:416          log ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r028' ( temp 4X4 matrix of float)
+0:416          matrix-scale ( temp 4X4 matrix of float)
+0:416            log2 ( temp 4X4 matrix of float)
+0:416              'inF0' ( in 4X4 matrix of float)
+0:416            Constant:
+0:416              0.301030
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r029' ( temp 4X4 matrix of float)
+0:416          log2 ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r030' ( temp 4X4 matrix of float)
+0:416          max ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r031' ( temp 4X4 matrix of float)
+0:416          min ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r032' ( temp 4X4 matrix of float)
+0:416          pow ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r033' ( temp 4X4 matrix of float)
+0:416          radians ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r034' ( temp 4X4 matrix of float)
+0:416          roundEven ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r035' ( temp 4X4 matrix of float)
+0:416          inverse sqrt ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r036' ( temp 4X4 matrix of float)
+0:416          clamp ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            Constant:
+0:416              0.000000
+0:416            Constant:
+0:416              1.000000
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r037' ( temp 4X4 matrix of float)
+0:416          Sign ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r038' ( temp 4X4 matrix of float)
+0:416          sine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'inF1' ( in 4X4 matrix of float)
+0:416          sine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'inF2' ( in 4X4 matrix of float)
+0:416          cosine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r039' ( temp 4X4 matrix of float)
+0:416          hyp. sine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r049' ( temp 4X4 matrix of float)
+0:416          smoothstep ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416            'inF2' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r041' ( temp 4X4 matrix of float)
+0:416          sqrt ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r042' ( temp 4X4 matrix of float)
+0:416          step ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r043' ( temp 4X4 matrix of float)
+0:416          tangent ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r044' ( temp 4X4 matrix of float)
+0:416          hyp. tangent ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      transpose ( temp 4X4 matrix of float)
+0:416        'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r046' ( temp 4X4 matrix of float)
+0:416          trunc ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:419      Branch: Return with expression
 0:?         Constant:
 0:?           4.000000
 0:?           4.000000
@@ -2457,334 +2415,334 @@ gl_FragCoord origin is upper left
 0:?           4.000000
 0:?           4.000000
 0:?           4.000000
-0:442  Function Definition: TestGenMul2(f1;f1;vf2;vf2;mf22;mf22; ( temp void)
-0:442    Function Parameters: 
-0:442      'inF0' ( in float)
-0:442      'inF1' ( in float)
-0:442      'inFV0' ( in 2-component vector of float)
-0:442      'inFV1' ( in 2-component vector of float)
-0:442      'inFM0' ( in 2X2 matrix of float)
-0:442      'inFM1' ( in 2X2 matrix of float)
+0:437  Function Definition: TestGenMul2(f1;f1;vf2;vf2;mf22;mf22; ( temp void)
+0:437    Function Parameters: 
+0:437      'inF0' ( in float)
+0:437      'inF1' ( in float)
+0:437      'inFV0' ( in 2-component vector of float)
+0:437      'inFV1' ( in 2-component vector of float)
+0:437      'inFM0' ( in 2X2 matrix of float)
+0:437      'inFM1' ( in 2X2 matrix of float)
 0:?     Sequence
-0:443      Sequence
-0:443        move second child to first child ( temp float)
-0:443          'r0' ( temp float)
-0:443          component-wise multiply ( temp float)
-0:443            'inF1' ( in float)
-0:443            'inF0' ( in float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2-component vector of float)
-0:443          'r1' ( temp 2-component vector of float)
-0:443          vector-scale ( temp 2-component vector of float)
-0:443            'inF0' ( in float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2-component vector of float)
-0:443          'r2' ( temp 2-component vector of float)
-0:443          vector-scale ( temp 2-component vector of float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443            'inF0' ( in float)
-0:443      Sequence
-0:443        move second child to first child ( temp float)
-0:443          'r3' ( temp float)
-0:443          dot-product ( temp float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443            'inFV1' ( in 2-component vector of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2-component vector of float)
-0:443          'r4' ( temp 2-component vector of float)
-0:443          vector-times-matrix ( temp 2-component vector of float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2-component vector of float)
-0:443          'r5' ( temp 2-component vector of float)
-0:443          matrix-times-vector ( temp 2-component vector of float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2X2 matrix of float)
-0:443          'r6' ( temp 2X2 matrix of float)
-0:443          matrix-scale ( temp 2X2 matrix of float)
-0:443            'inF0' ( in float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2X2 matrix of float)
-0:443          'r7' ( temp 2X2 matrix of float)
-0:443          matrix-scale ( temp 2X2 matrix of float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:443            'inF0' ( in float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2X2 matrix of float)
-0:443          'r8' ( temp 2X2 matrix of float)
-0:443          matrix-multiply ( temp 2X2 matrix of float)
-0:443            'inFM1' ( in 2X2 matrix of float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:449  Function Definition: TestGenMul3(f1;f1;vf3;vf3;mf33;mf33; ( temp void)
-0:449    Function Parameters: 
-0:449      'inF0' ( in float)
-0:449      'inF1' ( in float)
-0:449      'inFV0' ( in 3-component vector of float)
-0:449      'inFV1' ( in 3-component vector of float)
-0:449      'inFM0' ( in 3X3 matrix of float)
-0:449      'inFM1' ( in 3X3 matrix of float)
+0:438      Sequence
+0:438        move second child to first child ( temp float)
+0:438          'r0' ( temp float)
+0:438          component-wise multiply ( temp float)
+0:438            'inF1' ( in float)
+0:438            'inF0' ( in float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2-component vector of float)
+0:438          'r1' ( temp 2-component vector of float)
+0:438          vector-scale ( temp 2-component vector of float)
+0:438            'inF0' ( in float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2-component vector of float)
+0:438          'r2' ( temp 2-component vector of float)
+0:438          vector-scale ( temp 2-component vector of float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438            'inF0' ( in float)
+0:438      Sequence
+0:438        move second child to first child ( temp float)
+0:438          'r3' ( temp float)
+0:438          dot-product ( temp float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438            'inFV1' ( in 2-component vector of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2-component vector of float)
+0:438          'r4' ( temp 2-component vector of float)
+0:438          vector-times-matrix ( temp 2-component vector of float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2-component vector of float)
+0:438          'r5' ( temp 2-component vector of float)
+0:438          matrix-times-vector ( temp 2-component vector of float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2X2 matrix of float)
+0:438          'r6' ( temp 2X2 matrix of float)
+0:438          matrix-scale ( temp 2X2 matrix of float)
+0:438            'inF0' ( in float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2X2 matrix of float)
+0:438          'r7' ( temp 2X2 matrix of float)
+0:438          matrix-scale ( temp 2X2 matrix of float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:438            'inF0' ( in float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2X2 matrix of float)
+0:438          'r8' ( temp 2X2 matrix of float)
+0:438          matrix-multiply ( temp 2X2 matrix of float)
+0:438            'inFM1' ( in 2X2 matrix of float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:444  Function Definition: TestGenMul3(f1;f1;vf3;vf3;mf33;mf33; ( temp void)
+0:444    Function Parameters: 
+0:444      'inF0' ( in float)
+0:444      'inF1' ( in float)
+0:444      'inFV0' ( in 3-component vector of float)
+0:444      'inFV1' ( in 3-component vector of float)
+0:444      'inFM0' ( in 3X3 matrix of float)
+0:444      'inFM1' ( in 3X3 matrix of float)
 0:?     Sequence
-0:450      Sequence
-0:450        move second child to first child ( temp float)
-0:450          'r0' ( temp float)
-0:450          component-wise multiply ( temp float)
-0:450            'inF1' ( in float)
-0:450            'inF0' ( in float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3-component vector of float)
-0:450          'r1' ( temp 3-component vector of float)
-0:450          vector-scale ( temp 3-component vector of float)
-0:450            'inF0' ( in float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3-component vector of float)
-0:450          'r2' ( temp 3-component vector of float)
-0:450          vector-scale ( temp 3-component vector of float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450            'inF0' ( in float)
-0:450      Sequence
-0:450        move second child to first child ( temp float)
-0:450          'r3' ( temp float)
-0:450          dot-product ( temp float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450            'inFV1' ( in 3-component vector of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3-component vector of float)
-0:450          'r4' ( temp 3-component vector of float)
-0:450          vector-times-matrix ( temp 3-component vector of float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3-component vector of float)
-0:450          'r5' ( temp 3-component vector of float)
-0:450          matrix-times-vector ( temp 3-component vector of float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3X3 matrix of float)
-0:450          'r6' ( temp 3X3 matrix of float)
-0:450          matrix-scale ( temp 3X3 matrix of float)
-0:450            'inF0' ( in float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3X3 matrix of float)
-0:450          'r7' ( temp 3X3 matrix of float)
-0:450          matrix-scale ( temp 3X3 matrix of float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:450            'inF0' ( in float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3X3 matrix of float)
-0:450          'r8' ( temp 3X3 matrix of float)
-0:450          matrix-multiply ( temp 3X3 matrix of float)
-0:450            'inFM1' ( in 3X3 matrix of float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:456  Function Definition: TestGenMul4(f1;f1;vf4;vf4;mf44;mf44; ( temp void)
-0:456    Function Parameters: 
-0:456      'inF0' ( in float)
-0:456      'inF1' ( in float)
-0:456      'inFV0' ( in 4-component vector of float)
-0:456      'inFV1' ( in 4-component vector of float)
-0:456      'inFM0' ( in 4X4 matrix of float)
-0:456      'inFM1' ( in 4X4 matrix of float)
+0:445      Sequence
+0:445        move second child to first child ( temp float)
+0:445          'r0' ( temp float)
+0:445          component-wise multiply ( temp float)
+0:445            'inF1' ( in float)
+0:445            'inF0' ( in float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3-component vector of float)
+0:445          'r1' ( temp 3-component vector of float)
+0:445          vector-scale ( temp 3-component vector of float)
+0:445            'inF0' ( in float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3-component vector of float)
+0:445          'r2' ( temp 3-component vector of float)
+0:445          vector-scale ( temp 3-component vector of float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445            'inF0' ( in float)
+0:445      Sequence
+0:445        move second child to first child ( temp float)
+0:445          'r3' ( temp float)
+0:445          dot-product ( temp float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445            'inFV1' ( in 3-component vector of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3-component vector of float)
+0:445          'r4' ( temp 3-component vector of float)
+0:445          vector-times-matrix ( temp 3-component vector of float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3-component vector of float)
+0:445          'r5' ( temp 3-component vector of float)
+0:445          matrix-times-vector ( temp 3-component vector of float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3X3 matrix of float)
+0:445          'r6' ( temp 3X3 matrix of float)
+0:445          matrix-scale ( temp 3X3 matrix of float)
+0:445            'inF0' ( in float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3X3 matrix of float)
+0:445          'r7' ( temp 3X3 matrix of float)
+0:445          matrix-scale ( temp 3X3 matrix of float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:445            'inF0' ( in float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3X3 matrix of float)
+0:445          'r8' ( temp 3X3 matrix of float)
+0:445          matrix-multiply ( temp 3X3 matrix of float)
+0:445            'inFM1' ( in 3X3 matrix of float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:451  Function Definition: TestGenMul4(f1;f1;vf4;vf4;mf44;mf44; ( temp void)
+0:451    Function Parameters: 
+0:451      'inF0' ( in float)
+0:451      'inF1' ( in float)
+0:451      'inFV0' ( in 4-component vector of float)
+0:451      'inFV1' ( in 4-component vector of float)
+0:451      'inFM0' ( in 4X4 matrix of float)
+0:451      'inFM1' ( in 4X4 matrix of float)
 0:?     Sequence
-0:457      Sequence
-0:457        move second child to first child ( temp float)
-0:457          'r0' ( temp float)
-0:457          component-wise multiply ( temp float)
-0:457            'inF1' ( in float)
-0:457            'inF0' ( in float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4-component vector of float)
-0:457          'r1' ( temp 4-component vector of float)
-0:457          vector-scale ( temp 4-component vector of float)
-0:457            'inF0' ( in float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4-component vector of float)
-0:457          'r2' ( temp 4-component vector of float)
-0:457          vector-scale ( temp 4-component vector of float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457            'inF0' ( in float)
-0:457      Sequence
-0:457        move second child to first child ( temp float)
-0:457          'r3' ( temp float)
-0:457          dot-product ( temp float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457            'inFV1' ( in 4-component vector of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4-component vector of float)
-0:457          'r4' ( temp 4-component vector of float)
-0:457          vector-times-matrix ( temp 4-component vector of float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4-component vector of float)
-0:457          'r5' ( temp 4-component vector of float)
-0:457          matrix-times-vector ( temp 4-component vector of float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4X4 matrix of float)
-0:457          'r6' ( temp 4X4 matrix of float)
-0:457          matrix-scale ( temp 4X4 matrix of float)
-0:457            'inF0' ( in float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4X4 matrix of float)
-0:457          'r7' ( temp 4X4 matrix of float)
-0:457          matrix-scale ( temp 4X4 matrix of float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:457            'inF0' ( in float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4X4 matrix of float)
-0:457          'r8' ( temp 4X4 matrix of float)
-0:457          matrix-multiply ( temp 4X4 matrix of float)
-0:457            'inFM1' ( in 4X4 matrix of float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:466  Function Definition: TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24; ( temp void)
-0:466    Function Parameters: 
-0:466      'inF0' ( in float)
-0:466      'inF1' ( in float)
-0:466      'inFV2' ( in 2-component vector of float)
-0:466      'inFV3' ( in 3-component vector of float)
-0:466      'inFM2x3' ( in 2X3 matrix of float)
-0:466      'inFM3x2' ( in 3X2 matrix of float)
-0:466      'inFM3x3' ( in 3X3 matrix of float)
-0:466      'inFM3x4' ( in 3X4 matrix of float)
-0:466      'inFM2x4' ( in 2X4 matrix of float)
+0:452      Sequence
+0:452        move second child to first child ( temp float)
+0:452          'r0' ( temp float)
+0:452          component-wise multiply ( temp float)
+0:452            'inF1' ( in float)
+0:452            'inF0' ( in float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4-component vector of float)
+0:452          'r1' ( temp 4-component vector of float)
+0:452          vector-scale ( temp 4-component vector of float)
+0:452            'inF0' ( in float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4-component vector of float)
+0:452          'r2' ( temp 4-component vector of float)
+0:452          vector-scale ( temp 4-component vector of float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452            'inF0' ( in float)
+0:452      Sequence
+0:452        move second child to first child ( temp float)
+0:452          'r3' ( temp float)
+0:452          dot-product ( temp float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452            'inFV1' ( in 4-component vector of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4-component vector of float)
+0:452          'r4' ( temp 4-component vector of float)
+0:452          vector-times-matrix ( temp 4-component vector of float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4-component vector of float)
+0:452          'r5' ( temp 4-component vector of float)
+0:452          matrix-times-vector ( temp 4-component vector of float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4X4 matrix of float)
+0:452          'r6' ( temp 4X4 matrix of float)
+0:452          matrix-scale ( temp 4X4 matrix of float)
+0:452            'inF0' ( in float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4X4 matrix of float)
+0:452          'r7' ( temp 4X4 matrix of float)
+0:452          matrix-scale ( temp 4X4 matrix of float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:452            'inF0' ( in float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4X4 matrix of float)
+0:452          'r8' ( temp 4X4 matrix of float)
+0:452          matrix-multiply ( temp 4X4 matrix of float)
+0:452            'inFM1' ( in 4X4 matrix of float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:461  Function Definition: TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24; ( temp void)
+0:461    Function Parameters: 
+0:461      'inF0' ( in float)
+0:461      'inF1' ( in float)
+0:461      'inFV2' ( in 2-component vector of float)
+0:461      'inFV3' ( in 3-component vector of float)
+0:461      'inFM2x3' ( in 2X3 matrix of float)
+0:461      'inFM3x2' ( in 3X2 matrix of float)
+0:461      'inFM3x3' ( in 3X3 matrix of float)
+0:461      'inFM3x4' ( in 3X4 matrix of float)
+0:461      'inFM2x4' ( in 2X4 matrix of float)
 0:?     Sequence
+0:462      Sequence
+0:462        move second child to first child ( temp float)
+0:462          'r00' ( temp float)
+0:462          component-wise multiply ( temp float)
+0:462            'inF1' ( in float)
+0:462            'inF0' ( in float)
+0:463      Sequence
+0:463        move second child to first child ( temp 2-component vector of float)
+0:463          'r01' ( temp 2-component vector of float)
+0:463          vector-scale ( temp 2-component vector of float)
+0:463            'inF0' ( in float)
+0:463            'inFV2' ( in 2-component vector of float)
+0:464      Sequence
+0:464        move second child to first child ( temp 3-component vector of float)
+0:464          'r02' ( temp 3-component vector of float)
+0:464          vector-scale ( temp 3-component vector of float)
+0:464            'inF0' ( in float)
+0:464            'inFV3' ( in 3-component vector of float)
+0:465      Sequence
+0:465        move second child to first child ( temp 2-component vector of float)
+0:465          'r03' ( temp 2-component vector of float)
+0:465          vector-scale ( temp 2-component vector of float)
+0:465            'inFV2' ( in 2-component vector of float)
+0:465            'inF0' ( in float)
+0:466      Sequence
+0:466        move second child to first child ( temp 3-component vector of float)
+0:466          'r04' ( temp 3-component vector of float)
+0:466          vector-scale ( temp 3-component vector of float)
+0:466            'inFV3' ( in 3-component vector of float)
+0:466            'inF0' ( in float)
 0:467      Sequence
 0:467        move second child to first child ( temp float)
-0:467          'r00' ( temp float)
-0:467          component-wise multiply ( temp float)
-0:467            'inF1' ( in float)
-0:467            'inF0' ( in float)
+0:467          'r05' ( temp float)
+0:467          dot-product ( temp float)
+0:467            'inFV2' ( in 2-component vector of float)
+0:467            'inFV2' ( in 2-component vector of float)
 0:468      Sequence
-0:468        move second child to first child ( temp 2-component vector of float)
-0:468          'r01' ( temp 2-component vector of float)
-0:468          vector-scale ( temp 2-component vector of float)
-0:468            'inF0' ( in float)
-0:468            'inFV2' ( in 2-component vector of float)
+0:468        move second child to first child ( temp float)
+0:468          'r06' ( temp float)
+0:468          dot-product ( temp float)
+0:468            'inFV3' ( in 3-component vector of float)
+0:468            'inFV3' ( in 3-component vector of float)
 0:469      Sequence
 0:469        move second child to first child ( temp 3-component vector of float)
-0:469          'r02' ( temp 3-component vector of float)
-0:469          vector-scale ( temp 3-component vector of float)
-0:469            'inF0' ( in float)
-0:469            'inFV3' ( in 3-component vector of float)
+0:469          'r07' ( temp 3-component vector of float)
+0:469          matrix-times-vector ( temp 3-component vector of float)
+0:469            'inFM2x3' ( in 2X3 matrix of float)
+0:469            'inFV2' ( in 2-component vector of float)
 0:470      Sequence
 0:470        move second child to first child ( temp 2-component vector of float)
-0:470          'r03' ( temp 2-component vector of float)
-0:470          vector-scale ( temp 2-component vector of float)
-0:470            'inFV2' ( in 2-component vector of float)
-0:470            'inF0' ( in float)
+0:470          'r08' ( temp 2-component vector of float)
+0:470          matrix-times-vector ( temp 2-component vector of float)
+0:470            'inFM3x2' ( in 3X2 matrix of float)
+0:470            'inFV3' ( in 3-component vector of float)
 0:471      Sequence
-0:471        move second child to first child ( temp 3-component vector of float)
-0:471          'r04' ( temp 3-component vector of float)
-0:471          vector-scale ( temp 3-component vector of float)
+0:471        move second child to first child ( temp 2-component vector of float)
+0:471          'r09' ( temp 2-component vector of float)
+0:471          vector-times-matrix ( temp 2-component vector of float)
 0:471            'inFV3' ( in 3-component vector of float)
-0:471            'inF0' ( in float)
+0:471            'inFM2x3' ( in 2X3 matrix of float)
 0:472      Sequence
-0:472        move second child to first child ( temp float)
-0:472          'r05' ( temp float)
-0:472          dot-product ( temp float)
-0:472            'inFV2' ( in 2-component vector of float)
+0:472        move second child to first child ( temp 3-component vector of float)
+0:472          'r10' ( temp 3-component vector of float)
+0:472          vector-times-matrix ( temp 3-component vector of float)
 0:472            'inFV2' ( in 2-component vector of float)
+0:472            'inFM3x2' ( in 3X2 matrix of float)
 0:473      Sequence
-0:473        move second child to first child ( temp float)
-0:473          'r06' ( temp float)
-0:473          dot-product ( temp float)
-0:473            'inFV3' ( in 3-component vector of float)
-0:473            'inFV3' ( in 3-component vector of float)
+0:473        move second child to first child ( temp 2X3 matrix of float)
+0:473          'r11' ( temp 2X3 matrix of float)
+0:473          matrix-scale ( temp 2X3 matrix of float)
+0:473            'inF0' ( in float)
+0:473            'inFM2x3' ( in 2X3 matrix of float)
 0:474      Sequence
-0:474        move second child to first child ( temp 3-component vector of float)
-0:474          'r07' ( temp 3-component vector of float)
-0:474          matrix-times-vector ( temp 3-component vector of float)
-0:474            'inFM2x3' ( in 2X3 matrix of float)
-0:474            'inFV2' ( in 2-component vector of float)
+0:474        move second child to first child ( temp 3X2 matrix of float)
+0:474          'r12' ( temp 3X2 matrix of float)
+0:474          matrix-scale ( temp 3X2 matrix of float)
+0:474            'inF0' ( in float)
+0:474            'inFM3x2' ( in 3X2 matrix of float)
 0:475      Sequence
-0:475        move second child to first child ( temp 2-component vector of float)
-0:475          'r08' ( temp 2-component vector of float)
-0:475          matrix-times-vector ( temp 2-component vector of float)
+0:475        move second child to first child ( temp 2X2 matrix of float)
+0:475          'r13' ( temp 2X2 matrix of float)
+0:475          matrix-multiply ( temp 2X2 matrix of float)
 0:475            'inFM3x2' ( in 3X2 matrix of float)
-0:475            'inFV3' ( in 3-component vector of float)
+0:475            'inFM2x3' ( in 2X3 matrix of float)
 0:476      Sequence
-0:476        move second child to first child ( temp 2-component vector of float)
-0:476          'r09' ( temp 2-component vector of float)
-0:476          vector-times-matrix ( temp 2-component vector of float)
-0:476            'inFV3' ( in 3-component vector of float)
+0:476        move second child to first child ( temp 2X3 matrix of float)
+0:476          'r14' ( temp 2X3 matrix of float)
+0:476          matrix-multiply ( temp 2X3 matrix of float)
+0:476            'inFM3x3' ( in 3X3 matrix of float)
 0:476            'inFM2x3' ( in 2X3 matrix of float)
 0:477      Sequence
-0:477        move second child to first child ( temp 3-component vector of float)
-0:477          'r10' ( temp 3-component vector of float)
-0:477          vector-times-matrix ( temp 3-component vector of float)
-0:477            'inFV2' ( in 2-component vector of float)
-0:477            'inFM3x2' ( in 3X2 matrix of float)
+0:477        move second child to first child ( temp 2X4 matrix of float)
+0:477          'r15' ( temp 2X4 matrix of float)
+0:477          matrix-multiply ( temp 2X4 matrix of float)
+0:477            'inFM3x4' ( in 3X4 matrix of float)
+0:477            'inFM2x3' ( in 2X3 matrix of float)
 0:478      Sequence
-0:478        move second child to first child ( temp 2X3 matrix of float)
-0:478          'r11' ( temp 2X3 matrix of float)
-0:478          matrix-scale ( temp 2X3 matrix of float)
-0:478            'inF0' ( in float)
-0:478            'inFM2x3' ( in 2X3 matrix of float)
-0:479      Sequence
-0:479        move second child to first child ( temp 3X2 matrix of float)
-0:479          'r12' ( temp 3X2 matrix of float)
-0:479          matrix-scale ( temp 3X2 matrix of float)
-0:479            'inF0' ( in float)
-0:479            'inFM3x2' ( in 3X2 matrix of float)
-0:480      Sequence
-0:480        move second child to first child ( temp 2X2 matrix of float)
-0:480          'r13' ( temp 2X2 matrix of float)
-0:480          matrix-multiply ( temp 2X2 matrix of float)
-0:480            'inFM3x2' ( in 3X2 matrix of float)
-0:480            'inFM2x3' ( in 2X3 matrix of float)
-0:481      Sequence
-0:481        move second child to first child ( temp 2X3 matrix of float)
-0:481          'r14' ( temp 2X3 matrix of float)
-0:481          matrix-multiply ( temp 2X3 matrix of float)
-0:481            'inFM3x3' ( in 3X3 matrix of float)
-0:481            'inFM2x3' ( in 2X3 matrix of float)
-0:482      Sequence
-0:482        move second child to first child ( temp 2X4 matrix of float)
-0:482          'r15' ( temp 2X4 matrix of float)
-0:482          matrix-multiply ( temp 2X4 matrix of float)
-0:482            'inFM3x4' ( in 3X4 matrix of float)
-0:482            'inFM2x3' ( in 2X3 matrix of float)
-0:483      Sequence
-0:483        move second child to first child ( temp 3X4 matrix of float)
-0:483          'r16' ( temp 3X4 matrix of float)
-0:483          matrix-multiply ( temp 3X4 matrix of float)
-0:483            'inFM2x4' ( in 2X4 matrix of float)
-0:483            'inFM3x2' ( in 3X2 matrix of float)
-0:489  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
-0:489    Function Parameters: 
+0:478        move second child to first child ( temp 3X4 matrix of float)
+0:478          'r16' ( temp 3X4 matrix of float)
+0:478          matrix-multiply ( temp 3X4 matrix of float)
+0:478            'inFM2x4' ( in 2X4 matrix of float)
+0:478            'inFM3x2' ( in 3X2 matrix of float)
+0:484  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
+0:484    Function Parameters: 
 0:?     Sequence
-0:491      move second child to first child ( temp 4-component vector of float)
-0:491        color: direct index for structure ( temp 4-component vector of float)
-0:491          'ps_output' ( temp structure{ temp 4-component vector of float color})
-0:491          Constant:
-0:491            0 (const int)
-0:491        Constant:
-0:491          1.000000
-0:491          1.000000
-0:491          1.000000
-0:491          1.000000
-0:492      Branch: Return with expression
-0:492        'ps_output' ( temp structure{ temp 4-component vector of float color})
-0:489  Function Definition: main( ( temp void)
-0:489    Function Parameters: 
+0:486      move second child to first child ( temp 4-component vector of float)
+0:486        color: direct index for structure ( temp 4-component vector of float)
+0:486          'ps_output' ( temp structure{ temp 4-component vector of float color})
+0:486          Constant:
+0:486            0 (const int)
+0:486        Constant:
+0:486          1.000000
+0:486          1.000000
+0:486          1.000000
+0:486          1.000000
+0:487      Branch: Return with expression
+0:487        'ps_output' ( temp structure{ temp 4-component vector of float color})
+0:484  Function Definition: main( ( temp void)
+0:484    Function Parameters: 
 0:?     Sequence
-0:489      Sequence
-0:489        move second child to first child ( temp 4-component vector of float)
+0:484      Sequence
+0:484        move second child to first child ( temp 4-component vector of float)
 0:?           'color' (layout( location=0) out 4-component vector of float)
-0:489          color: direct index for structure ( temp 4-component vector of float)
-0:489            Function Call: @main( ( temp structure{ temp 4-component vector of float color})
-0:489            Constant:
-0:489              0 (const int)
+0:484          color: direct index for structure ( temp 4-component vector of float)
+0:484            Function Call: @main( ( temp structure{ temp 4-component vector of float color})
+0:484            Constant:
+0:484              0 (const int)
 0:?   Linker Objects
 0:?     'gs_ua' ( shared uint)
 0:?     'gs_ub' ( shared uint)
@@ -2804,7 +2762,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:17  Function Definition: PixelShaderFunctionS(f1;f1;f1;u1;u1; ( temp float)
@@ -2979,1967 +2937,1931 @@ gl_FragCoord origin is upper left
 0:54            'inF0' ( in float)
 0:55      Sequence
 0:55        move second child to first child ( temp float)
-0:55          'r035' ( temp float)
-0:55          frexp ( temp float)
+0:55          'r036' ( temp float)
+0:55          fwidth ( temp float)
 0:55            'inF0' ( in float)
-0:55            'inF1' ( in float)
 0:56      Sequence
-0:56        move second child to first child ( temp float)
-0:56          'r036' ( temp float)
-0:56          fwidth ( temp float)
+0:56        move second child to first child ( temp bool)
+0:56          'r037' ( temp bool)
+0:56          isinf ( temp bool)
 0:56            'inF0' ( in float)
 0:57      Sequence
 0:57        move second child to first child ( temp bool)
-0:57          'r037' ( temp bool)
-0:57          isinf ( temp bool)
+0:57          'r038' ( temp bool)
+0:57          isnan ( temp bool)
 0:57            'inF0' ( in float)
 0:58      Sequence
-0:58        move second child to first child ( temp bool)
-0:58          'r038' ( temp bool)
-0:58          isnan ( temp bool)
+0:58        move second child to first child ( temp float)
+0:58          'r039' ( temp float)
+0:58          ldexp ( temp float)
 0:58            'inF0' ( in float)
+0:58            'inF1' ( in float)
 0:59      Sequence
 0:59        move second child to first child ( temp float)
-0:59          'r039' ( temp float)
-0:59          ldexp ( temp float)
+0:59          'r039a' ( temp float)
+0:59          mix ( temp float)
 0:59            'inF0' ( in float)
 0:59            'inF1' ( in float)
+0:59            'inF2' ( in float)
 0:60      Sequence
 0:60        move second child to first child ( temp float)
-0:60          'r039a' ( temp float)
-0:60          mix ( temp float)
+0:60          'r040' ( temp float)
+0:60          log ( temp float)
 0:60            'inF0' ( in float)
-0:60            'inF1' ( in float)
-0:60            'inF2' ( in float)
 0:61      Sequence
 0:61        move second child to first child ( temp float)
-0:61          'r040' ( temp float)
-0:61          log ( temp float)
-0:61            'inF0' ( in float)
+0:61          'r041' ( temp float)
+0:61          component-wise multiply ( temp float)
+0:61            log2 ( temp float)
+0:61              'inF0' ( in float)
+0:61            Constant:
+0:61              0.301030
 0:62      Sequence
 0:62        move second child to first child ( temp float)
-0:62          'r041' ( temp float)
-0:62          component-wise multiply ( temp float)
-0:62            log2 ( temp float)
-0:62              'inF0' ( in float)
-0:62            Constant:
-0:62              0.301030
+0:62          'r042' ( temp float)
+0:62          log2 ( temp float)
+0:62            'inF0' ( in float)
 0:63      Sequence
 0:63        move second child to first child ( temp float)
-0:63          'r042' ( temp float)
-0:63          log2 ( temp float)
+0:63          'r043' ( temp float)
+0:63          max ( temp float)
 0:63            'inF0' ( in float)
+0:63            'inF1' ( in float)
 0:64      Sequence
 0:64        move second child to first child ( temp float)
-0:64          'r043' ( temp float)
-0:64          max ( temp float)
+0:64          'r044' ( temp float)
+0:64          min ( temp float)
 0:64            'inF0' ( in float)
 0:64            'inF1' ( in float)
 0:65      Sequence
 0:65        move second child to first child ( temp float)
-0:65          'r044' ( temp float)
-0:65          min ( temp float)
+0:65          'r045' ( temp float)
+0:65          pow ( temp float)
 0:65            'inF0' ( in float)
 0:65            'inF1' ( in float)
 0:66      Sequence
 0:66        move second child to first child ( temp float)
-0:66          'r045' ( temp float)
-0:66          pow ( temp float)
+0:66          'r046' ( temp float)
+0:66          radians ( temp float)
 0:66            'inF0' ( in float)
-0:66            'inF1' ( in float)
 0:67      Sequence
 0:67        move second child to first child ( temp float)
-0:67          'r046' ( temp float)
-0:67          radians ( temp float)
+0:67          'r047' ( temp float)
+0:67          divide ( temp float)
+0:67            Constant:
+0:67              1.000000
 0:67            'inF0' ( in float)
 0:68      Sequence
-0:68        move second child to first child ( temp float)
-0:68          'r047' ( temp float)
-0:68          divide ( temp float)
-0:68            Constant:
-0:68              1.000000
-0:68            'inF0' ( in float)
+0:68        move second child to first child ( temp uint)
+0:68          'r048' ( temp uint)
+0:68          Convert int to uint ( temp uint)
+0:68            bitFieldReverse ( temp int)
+0:68              Constant:
+0:68                2 (const int)
 0:69      Sequence
-0:69        move second child to first child ( temp uint)
-0:69          'r048' ( temp uint)
-0:69          Convert int to uint ( temp uint)
-0:69            bitFieldReverse ( temp int)
-0:69              Constant:
-0:69                2 (const int)
+0:69        move second child to first child ( temp float)
+0:69          'r049' ( temp float)
+0:69          roundEven ( temp float)
+0:69            'inF0' ( in float)
 0:70      Sequence
 0:70        move second child to first child ( temp float)
-0:70          'r049' ( temp float)
-0:70          roundEven ( temp float)
+0:70          'r050' ( temp float)
+0:70          inverse sqrt ( temp float)
 0:70            'inF0' ( in float)
 0:71      Sequence
 0:71        move second child to first child ( temp float)
-0:71          'r050' ( temp float)
-0:71          inverse sqrt ( temp float)
+0:71          'r051' ( temp float)
+0:71          clamp ( temp float)
 0:71            'inF0' ( in float)
+0:71            Constant:
+0:71              0.000000
+0:71            Constant:
+0:71              1.000000
 0:72      Sequence
 0:72        move second child to first child ( temp float)
-0:72          'r051' ( temp float)
-0:72          clamp ( temp float)
+0:72          'r052' ( temp float)
+0:72          Sign ( temp float)
 0:72            'inF0' ( in float)
-0:72            Constant:
-0:72              0.000000
-0:72            Constant:
-0:72              1.000000
 0:73      Sequence
 0:73        move second child to first child ( temp float)
-0:73          'r052' ( temp float)
-0:73          Sign ( temp float)
+0:73          'r053' ( temp float)
+0:73          sine ( temp float)
 0:73            'inF0' ( in float)
 0:74      Sequence
 0:74        move second child to first child ( temp float)
-0:74          'r053' ( temp float)
+0:74          'inF1' ( in float)
 0:74          sine ( temp float)
 0:74            'inF0' ( in float)
+0:74        move second child to first child ( temp float)
+0:74          'inF2' ( in float)
+0:74          cosine ( temp float)
+0:74            'inF0' ( in float)
 0:75      Sequence
 0:75        move second child to first child ( temp float)
-0:75          'inF1' ( in float)
-0:75          sine ( temp float)
-0:75            'inF0' ( in float)
-0:75        move second child to first child ( temp float)
-0:75          'inF2' ( in float)
-0:75          cosine ( temp float)
+0:75          'r055' ( temp float)
+0:75          hyp. sine ( temp float)
 0:75            'inF0' ( in float)
 0:76      Sequence
 0:76        move second child to first child ( temp float)
-0:76          'r055' ( temp float)
-0:76          hyp. sine ( temp float)
+0:76          'r056' ( temp float)
+0:76          smoothstep ( temp float)
 0:76            'inF0' ( in float)
+0:76            'inF1' ( in float)
+0:76            'inF2' ( in float)
 0:77      Sequence
 0:77        move second child to first child ( temp float)
-0:77          'r056' ( temp float)
-0:77          smoothstep ( temp float)
+0:77          'r057' ( temp float)
+0:77          sqrt ( temp float)
 0:77            'inF0' ( in float)
-0:77            'inF1' ( in float)
-0:77            'inF2' ( in float)
 0:78      Sequence
 0:78        move second child to first child ( temp float)
-0:78          'r057' ( temp float)
-0:78          sqrt ( temp float)
+0:78          'r058' ( temp float)
+0:78          step ( temp float)
 0:78            'inF0' ( in float)
+0:78            'inF1' ( in float)
 0:79      Sequence
 0:79        move second child to first child ( temp float)
-0:79          'r058' ( temp float)
-0:79          step ( temp float)
+0:79          'r059' ( temp float)
+0:79          tangent ( temp float)
 0:79            'inF0' ( in float)
-0:79            'inF1' ( in float)
 0:80      Sequence
 0:80        move second child to first child ( temp float)
-0:80          'r059' ( temp float)
-0:80          tangent ( temp float)
+0:80          'r060' ( temp float)
+0:80          hyp. tangent ( temp float)
 0:80            'inF0' ( in float)
-0:81      Sequence
-0:81        move second child to first child ( temp float)
-0:81          'r060' ( temp float)
-0:81          hyp. tangent ( temp float)
-0:81            'inF0' ( in float)
-0:83      Sequence
-0:83        move second child to first child ( temp float)
-0:83          'r061' ( temp float)
-0:83          trunc ( temp float)
-0:83            'inF0' ( in float)
-0:85      Branch: Return with expression
-0:85        Constant:
-0:85          0.000000
-0:89  Function Definition: PixelShaderFunction1(vf1;vf1;vf1; ( temp 1-component vector of float)
-0:89    Function Parameters: 
-0:89      'inF0' ( in 1-component vector of float)
-0:89      'inF1' ( in 1-component vector of float)
-0:89      'inF2' ( in 1-component vector of float)
+0:82      Sequence
+0:82        move second child to first child ( temp float)
+0:82          'r061' ( temp float)
+0:82          trunc ( temp float)
+0:82            'inF0' ( in float)
+0:84      Branch: Return with expression
+0:84        Constant:
+0:84          0.000000
+0:88  Function Definition: PixelShaderFunction1(vf1;vf1;vf1; ( temp 1-component vector of float)
+0:88    Function Parameters: 
+0:88      'inF0' ( in 1-component vector of float)
+0:88      'inF1' ( in 1-component vector of float)
+0:88      'inF2' ( in 1-component vector of float)
 0:?     Sequence
-0:91      Branch: Return with expression
-0:91        Constant:
-0:91          0.000000
-0:95  Function Definition: PixelShaderFunction2(vf2;vf2;vf2;vu2;vu2; ( temp 2-component vector of float)
-0:95    Function Parameters: 
-0:95      'inF0' ( in 2-component vector of float)
-0:95      'inF1' ( in 2-component vector of float)
-0:95      'inF2' ( in 2-component vector of float)
-0:95      'inU0' ( in 2-component vector of uint)
-0:95      'inU1' ( in 2-component vector of uint)
+0:90      Branch: Return with expression
+0:90        Constant:
+0:90          0.000000
+0:94  Function Definition: PixelShaderFunction2(vf2;vf2;vf2;vu2;vu2; ( temp 2-component vector of float)
+0:94    Function Parameters: 
+0:94      'inF0' ( in 2-component vector of float)
+0:94      'inF1' ( in 2-component vector of float)
+0:94      'inF2' ( in 2-component vector of float)
+0:94      'inU0' ( in 2-component vector of uint)
+0:94      'inU1' ( in 2-component vector of uint)
 0:?     Sequence
+0:97      Sequence
+0:97        move second child to first child ( temp bool)
+0:97          'r000' ( temp bool)
+0:97          all ( temp bool)
+0:97            'inF0' ( in 2-component vector of float)
 0:98      Sequence
-0:98        move second child to first child ( temp bool)
-0:98          'r000' ( temp bool)
-0:98          all ( temp bool)
+0:98        move second child to first child ( temp 2-component vector of float)
+0:98          'r001' ( temp 2-component vector of float)
+0:98          Absolute value ( temp 2-component vector of float)
 0:98            'inF0' ( in 2-component vector of float)
 0:99      Sequence
 0:99        move second child to first child ( temp 2-component vector of float)
-0:99          'r001' ( temp 2-component vector of float)
-0:99          Absolute value ( temp 2-component vector of float)
+0:99          'r002' ( temp 2-component vector of float)
+0:99          arc cosine ( temp 2-component vector of float)
 0:99            'inF0' ( in 2-component vector of float)
 0:100      Sequence
-0:100        move second child to first child ( temp 2-component vector of float)
-0:100          'r002' ( temp 2-component vector of float)
-0:100          arc cosine ( temp 2-component vector of float)
+0:100        move second child to first child ( temp bool)
+0:100          'r003' ( temp bool)
+0:100          any ( temp bool)
 0:100            'inF0' ( in 2-component vector of float)
 0:101      Sequence
-0:101        move second child to first child ( temp bool)
-0:101          'r003' ( temp bool)
-0:101          any ( temp bool)
+0:101        move second child to first child ( temp 2-component vector of float)
+0:101          'r004' ( temp 2-component vector of float)
+0:101          arc sine ( temp 2-component vector of float)
 0:101            'inF0' ( in 2-component vector of float)
 0:102      Sequence
-0:102        move second child to first child ( temp 2-component vector of float)
-0:102          'r004' ( temp 2-component vector of float)
-0:102          arc sine ( temp 2-component vector of float)
+0:102        move second child to first child ( temp 2-component vector of int)
+0:102          'r005' ( temp 2-component vector of int)
+0:102          floatBitsToInt ( temp 2-component vector of int)
 0:102            'inF0' ( in 2-component vector of float)
 0:103      Sequence
-0:103        move second child to first child ( temp 2-component vector of int)
-0:103          'r005' ( temp 2-component vector of int)
-0:103          floatBitsToInt ( temp 2-component vector of int)
+0:103        move second child to first child ( temp 2-component vector of uint)
+0:103          'r006' ( temp 2-component vector of uint)
+0:103          floatBitsToUint ( temp 2-component vector of uint)
 0:103            'inF0' ( in 2-component vector of float)
 0:104      Sequence
-0:104        move second child to first child ( temp 2-component vector of uint)
-0:104          'r006' ( temp 2-component vector of uint)
-0:104          floatBitsToUint ( temp 2-component vector of uint)
-0:104            'inF0' ( in 2-component vector of float)
-0:105      Sequence
-0:105        move second child to first child ( temp 2-component vector of float)
-0:105          'r007' ( temp 2-component vector of float)
-0:105          intBitsToFloat ( temp 2-component vector of float)
-0:105            'inU0' ( in 2-component vector of uint)
+0:104        move second child to first child ( temp 2-component vector of float)
+0:104          'r007' ( temp 2-component vector of float)
+0:104          intBitsToFloat ( temp 2-component vector of float)
+0:104            'inU0' ( in 2-component vector of uint)
+0:106      Sequence
+0:106        move second child to first child ( temp 2-component vector of float)
+0:106          'r009' ( temp 2-component vector of float)
+0:106          arc tangent ( temp 2-component vector of float)
+0:106            'inF0' ( in 2-component vector of float)
 0:107      Sequence
 0:107        move second child to first child ( temp 2-component vector of float)
-0:107          'r009' ( temp 2-component vector of float)
+0:107          'r010' ( temp 2-component vector of float)
 0:107          arc tangent ( temp 2-component vector of float)
 0:107            'inF0' ( in 2-component vector of float)
+0:107            'inF1' ( in 2-component vector of float)
 0:108      Sequence
 0:108        move second child to first child ( temp 2-component vector of float)
-0:108          'r010' ( temp 2-component vector of float)
-0:108          arc tangent ( temp 2-component vector of float)
+0:108          'r011' ( temp 2-component vector of float)
+0:108          Ceiling ( temp 2-component vector of float)
 0:108            'inF0' ( in 2-component vector of float)
-0:108            'inF1' ( in 2-component vector of float)
 0:109      Sequence
 0:109        move second child to first child ( temp 2-component vector of float)
-0:109          'r011' ( temp 2-component vector of float)
-0:109          Ceiling ( temp 2-component vector of float)
+0:109          'r012' ( temp 2-component vector of float)
+0:109          clamp ( temp 2-component vector of float)
 0:109            'inF0' ( in 2-component vector of float)
-0:110      Sequence
-0:110        move second child to first child ( temp 2-component vector of float)
-0:110          'r012' ( temp 2-component vector of float)
-0:110          clamp ( temp 2-component vector of float)
+0:109            'inF1' ( in 2-component vector of float)
+0:109            'inF2' ( in 2-component vector of float)
+0:110      Test condition and select ( temp void)
+0:110        Condition
+0:110        any ( temp bool)
+0:110          Compare Less Than ( temp 2-component vector of bool)
 0:110            'inF0' ( in 2-component vector of float)
-0:110            'inF1' ( in 2-component vector of float)
-0:110            'inF2' ( in 2-component vector of float)
-0:111      Test condition and select ( temp void)
-0:111        Condition
-0:111        any ( temp bool)
-0:111          Compare Less Than ( temp 2-component vector of bool)
+0:110            Constant:
+0:110              0.000000
+0:110              0.000000
+0:110        true case
+0:110        Branch: Kill
+0:111      Sequence
+0:111        move second child to first child ( temp 2-component vector of float)
+0:111          'r013' ( temp 2-component vector of float)
+0:111          cosine ( temp 2-component vector of float)
 0:111            'inF0' ( in 2-component vector of float)
-0:111            Constant:
-0:111              0.000000
-0:111              0.000000
-0:111        true case
-0:111        Branch: Kill
 0:112      Sequence
 0:112        move second child to first child ( temp 2-component vector of float)
-0:112          'r013' ( temp 2-component vector of float)
-0:112          cosine ( temp 2-component vector of float)
+0:112          'r015' ( temp 2-component vector of float)
+0:112          hyp. cosine ( temp 2-component vector of float)
 0:112            'inF0' ( in 2-component vector of float)
 0:113      Sequence
-0:113        move second child to first child ( temp 2-component vector of float)
-0:113          'r015' ( temp 2-component vector of float)
-0:113          hyp. cosine ( temp 2-component vector of float)
-0:113            'inF0' ( in 2-component vector of float)
-0:114      Sequence
-0:114        move second child to first child ( temp 2-component vector of int)
-0:114          'r016' ( temp 2-component vector of int)
+0:113        move second child to first child ( temp 2-component vector of int)
+0:113          'r016' ( temp 2-component vector of int)
 0:?           bitCount ( temp 2-component vector of int)
 0:?             Constant:
 0:?               7 (const int)
 0:?               3 (const int)
+0:114      Sequence
+0:114        move second child to first child ( temp 2-component vector of float)
+0:114          'r017' ( temp 2-component vector of float)
+0:114          dPdx ( temp 2-component vector of float)
+0:114            'inF0' ( in 2-component vector of float)
 0:115      Sequence
 0:115        move second child to first child ( temp 2-component vector of float)
-0:115          'r017' ( temp 2-component vector of float)
-0:115          dPdx ( temp 2-component vector of float)
+0:115          'r018' ( temp 2-component vector of float)
+0:115          dPdxCoarse ( temp 2-component vector of float)
 0:115            'inF0' ( in 2-component vector of float)
 0:116      Sequence
 0:116        move second child to first child ( temp 2-component vector of float)
-0:116          'r018' ( temp 2-component vector of float)
-0:116          dPdxCoarse ( temp 2-component vector of float)
+0:116          'r019' ( temp 2-component vector of float)
+0:116          dPdxFine ( temp 2-component vector of float)
 0:116            'inF0' ( in 2-component vector of float)
 0:117      Sequence
 0:117        move second child to first child ( temp 2-component vector of float)
-0:117          'r019' ( temp 2-component vector of float)
-0:117          dPdxFine ( temp 2-component vector of float)
+0:117          'r020' ( temp 2-component vector of float)
+0:117          dPdy ( temp 2-component vector of float)
 0:117            'inF0' ( in 2-component vector of float)
 0:118      Sequence
 0:118        move second child to first child ( temp 2-component vector of float)
-0:118          'r020' ( temp 2-component vector of float)
-0:118          dPdy ( temp 2-component vector of float)
+0:118          'r021' ( temp 2-component vector of float)
+0:118          dPdyCoarse ( temp 2-component vector of float)
 0:118            'inF0' ( in 2-component vector of float)
 0:119      Sequence
 0:119        move second child to first child ( temp 2-component vector of float)
-0:119          'r021' ( temp 2-component vector of float)
-0:119          dPdyCoarse ( temp 2-component vector of float)
+0:119          'r022' ( temp 2-component vector of float)
+0:119          dPdyFine ( temp 2-component vector of float)
 0:119            'inF0' ( in 2-component vector of float)
 0:120      Sequence
 0:120        move second child to first child ( temp 2-component vector of float)
-0:120          'r022' ( temp 2-component vector of float)
-0:120          dPdyFine ( temp 2-component vector of float)
+0:120          'r023' ( temp 2-component vector of float)
+0:120          degrees ( temp 2-component vector of float)
 0:120            'inF0' ( in 2-component vector of float)
-0:121      Sequence
-0:121        move second child to first child ( temp 2-component vector of float)
-0:121          'r023' ( temp 2-component vector of float)
-0:121          degrees ( temp 2-component vector of float)
-0:121            'inF0' ( in 2-component vector of float)
+0:124      Sequence
+0:124        move second child to first child ( temp float)
+0:124          'r026' ( temp float)
+0:124          distance ( temp float)
+0:124            'inF0' ( in 2-component vector of float)
+0:124            'inF1' ( in 2-component vector of float)
 0:125      Sequence
 0:125        move second child to first child ( temp float)
-0:125          'r026' ( temp float)
-0:125          distance ( temp float)
+0:125          'r027' ( temp float)
+0:125          dot-product ( temp float)
 0:125            'inF0' ( in 2-component vector of float)
 0:125            'inF1' ( in 2-component vector of float)
-0:126      Sequence
-0:126        move second child to first child ( temp float)
-0:126          'r027' ( temp float)
-0:126          dot-product ( temp float)
-0:126            'inF0' ( in 2-component vector of float)
-0:126            'inF1' ( in 2-component vector of float)
+0:129      Sequence
+0:129        move second child to first child ( temp 2-component vector of float)
+0:129          'r028' ( temp 2-component vector of float)
+0:129          exp ( temp 2-component vector of float)
+0:129            'inF0' ( in 2-component vector of float)
 0:130      Sequence
 0:130        move second child to first child ( temp 2-component vector of float)
-0:130          'r028' ( temp 2-component vector of float)
-0:130          exp ( temp 2-component vector of float)
+0:130          'r029' ( temp 2-component vector of float)
+0:130          exp2 ( temp 2-component vector of float)
 0:130            'inF0' ( in 2-component vector of float)
 0:131      Sequence
 0:131        move second child to first child ( temp 2-component vector of float)
-0:131          'r029' ( temp 2-component vector of float)
-0:131          exp2 ( temp 2-component vector of float)
+0:131          'r030' ( temp 2-component vector of float)
+0:131          face-forward ( temp 2-component vector of float)
 0:131            'inF0' ( in 2-component vector of float)
+0:131            'inF1' ( in 2-component vector of float)
+0:131            'inF2' ( in 2-component vector of float)
 0:132      Sequence
-0:132        move second child to first child ( temp 2-component vector of float)
-0:132          'r030' ( temp 2-component vector of float)
-0:132          face-forward ( temp 2-component vector of float)
-0:132            'inF0' ( in 2-component vector of float)
-0:132            'inF1' ( in 2-component vector of float)
-0:132            'inF2' ( in 2-component vector of float)
-0:133      Sequence
-0:133        move second child to first child ( temp 2-component vector of uint)
-0:133          'r031' ( temp 2-component vector of uint)
+0:132        move second child to first child ( temp 2-component vector of uint)
+0:132          'r031' ( temp 2-component vector of uint)
 0:?           findMSB ( temp 2-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               8 (const uint)
-0:134      Sequence
-0:134        move second child to first child ( temp 2-component vector of uint)
-0:134          'r032' ( temp 2-component vector of uint)
+0:133      Sequence
+0:133        move second child to first child ( temp 2-component vector of uint)
+0:133          'r032' ( temp 2-component vector of uint)
 0:?           findLSB ( temp 2-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               8 (const uint)
-0:135      Sequence
-0:135        move second child to first child ( temp 2-component vector of float)
-0:135          'r033' ( temp 2-component vector of float)
-0:135          Floor ( temp 2-component vector of float)
-0:135            'inF0' ( in 2-component vector of float)
+0:134      Sequence
+0:134        move second child to first child ( temp 2-component vector of float)
+0:134          'r033' ( temp 2-component vector of float)
+0:134          Floor ( temp 2-component vector of float)
+0:134            'inF0' ( in 2-component vector of float)
+0:136      Sequence
+0:136        move second child to first child ( temp 2-component vector of float)
+0:136          'r035' ( temp 2-component vector of float)
+0:136          mod ( temp 2-component vector of float)
+0:136            'inF0' ( in 2-component vector of float)
+0:136            'inF1' ( in 2-component vector of float)
 0:137      Sequence
 0:137        move second child to first child ( temp 2-component vector of float)
-0:137          'r035' ( temp 2-component vector of float)
-0:137          mod ( temp 2-component vector of float)
+0:137          'r036' ( temp 2-component vector of float)
+0:137          Fraction ( temp 2-component vector of float)
 0:137            'inF0' ( in 2-component vector of float)
-0:137            'inF1' ( in 2-component vector of float)
 0:138      Sequence
 0:138        move second child to first child ( temp 2-component vector of float)
-0:138          'r036' ( temp 2-component vector of float)
-0:138          Fraction ( temp 2-component vector of float)
+0:138          'r038' ( temp 2-component vector of float)
+0:138          fwidth ( temp 2-component vector of float)
 0:138            'inF0' ( in 2-component vector of float)
 0:139      Sequence
-0:139        move second child to first child ( temp 2-component vector of float)
-0:139          'r037' ( temp 2-component vector of float)
-0:139          frexp ( temp 2-component vector of float)
+0:139        move second child to first child ( temp 2-component vector of bool)
+0:139          'r039' ( temp 2-component vector of bool)
+0:139          isinf ( temp 2-component vector of bool)
 0:139            'inF0' ( in 2-component vector of float)
-0:139            'inF1' ( in 2-component vector of float)
 0:140      Sequence
-0:140        move second child to first child ( temp 2-component vector of float)
-0:140          'r038' ( temp 2-component vector of float)
-0:140          fwidth ( temp 2-component vector of float)
+0:140        move second child to first child ( temp 2-component vector of bool)
+0:140          'r040' ( temp 2-component vector of bool)
+0:140          isnan ( temp 2-component vector of bool)
 0:140            'inF0' ( in 2-component vector of float)
 0:141      Sequence
-0:141        move second child to first child ( temp 2-component vector of bool)
-0:141          'r039' ( temp 2-component vector of bool)
-0:141          isinf ( temp 2-component vector of bool)
+0:141        move second child to first child ( temp 2-component vector of float)
+0:141          'r041' ( temp 2-component vector of float)
+0:141          ldexp ( temp 2-component vector of float)
 0:141            'inF0' ( in 2-component vector of float)
+0:141            'inF1' ( in 2-component vector of float)
 0:142      Sequence
-0:142        move second child to first child ( temp 2-component vector of bool)
-0:142          'r040' ( temp 2-component vector of bool)
-0:142          isnan ( temp 2-component vector of bool)
+0:142        move second child to first child ( temp 2-component vector of float)
+0:142          'r039a' ( temp 2-component vector of float)
+0:142          mix ( temp 2-component vector of float)
 0:142            'inF0' ( in 2-component vector of float)
+0:142            'inF1' ( in 2-component vector of float)
+0:142            'inF2' ( in 2-component vector of float)
 0:143      Sequence
-0:143        move second child to first child ( temp 2-component vector of float)
-0:143          'r041' ( temp 2-component vector of float)
-0:143          ldexp ( temp 2-component vector of float)
+0:143        move second child to first child ( temp float)
+0:143          'r042' ( temp float)
+0:143          length ( temp float)
 0:143            'inF0' ( in 2-component vector of float)
-0:143            'inF1' ( in 2-component vector of float)
 0:144      Sequence
 0:144        move second child to first child ( temp 2-component vector of float)
-0:144          'r039a' ( temp 2-component vector of float)
-0:144          mix ( temp 2-component vector of float)
+0:144          'r043' ( temp 2-component vector of float)
+0:144          log ( temp 2-component vector of float)
 0:144            'inF0' ( in 2-component vector of float)
-0:144            'inF1' ( in 2-component vector of float)
-0:144            'inF2' ( in 2-component vector of float)
 0:145      Sequence
-0:145        move second child to first child ( temp float)
-0:145          'r042' ( temp float)
-0:145          length ( temp float)
-0:145            'inF0' ( in 2-component vector of float)
+0:145        move second child to first child ( temp 2-component vector of float)
+0:145          'r044' ( temp 2-component vector of float)
+0:145          vector-scale ( temp 2-component vector of float)
+0:145            log2 ( temp 2-component vector of float)
+0:145              'inF0' ( in 2-component vector of float)
+0:145            Constant:
+0:145              0.301030
 0:146      Sequence
 0:146        move second child to first child ( temp 2-component vector of float)
-0:146          'r043' ( temp 2-component vector of float)
-0:146          log ( temp 2-component vector of float)
+0:146          'r045' ( temp 2-component vector of float)
+0:146          log2 ( temp 2-component vector of float)
 0:146            'inF0' ( in 2-component vector of float)
 0:147      Sequence
 0:147        move second child to first child ( temp 2-component vector of float)
-0:147          'r044' ( temp 2-component vector of float)
-0:147          vector-scale ( temp 2-component vector of float)
-0:147            log2 ( temp 2-component vector of float)
-0:147              'inF0' ( in 2-component vector of float)
-0:147            Constant:
-0:147              0.301030
+0:147          'r046' ( temp 2-component vector of float)
+0:147          max ( temp 2-component vector of float)
+0:147            'inF0' ( in 2-component vector of float)
+0:147            'inF1' ( in 2-component vector of float)
 0:148      Sequence
 0:148        move second child to first child ( temp 2-component vector of float)
-0:148          'r045' ( temp 2-component vector of float)
-0:148          log2 ( temp 2-component vector of float)
+0:148          'r047' ( temp 2-component vector of float)
+0:148          min ( temp 2-component vector of float)
 0:148            'inF0' ( in 2-component vector of float)
+0:148            'inF1' ( in 2-component vector of float)
 0:149      Sequence
 0:149        move second child to first child ( temp 2-component vector of float)
-0:149          'r046' ( temp 2-component vector of float)
-0:149          max ( temp 2-component vector of float)
+0:149          'r048' ( temp 2-component vector of float)
+0:149          normalize ( temp 2-component vector of float)
 0:149            'inF0' ( in 2-component vector of float)
-0:149            'inF1' ( in 2-component vector of float)
 0:150      Sequence
 0:150        move second child to first child ( temp 2-component vector of float)
-0:150          'r047' ( temp 2-component vector of float)
-0:150          min ( temp 2-component vector of float)
+0:150          'r049' ( temp 2-component vector of float)
+0:150          pow ( temp 2-component vector of float)
 0:150            'inF0' ( in 2-component vector of float)
 0:150            'inF1' ( in 2-component vector of float)
 0:151      Sequence
 0:151        move second child to first child ( temp 2-component vector of float)
-0:151          'r048' ( temp 2-component vector of float)
-0:151          normalize ( temp 2-component vector of float)
+0:151          'r050' ( temp 2-component vector of float)
+0:151          radians ( temp 2-component vector of float)
 0:151            'inF0' ( in 2-component vector of float)
 0:152      Sequence
 0:152        move second child to first child ( temp 2-component vector of float)
-0:152          'r049' ( temp 2-component vector of float)
-0:152          pow ( temp 2-component vector of float)
+0:152          'r051' ( temp 2-component vector of float)
+0:152          divide ( temp 2-component vector of float)
+0:152            Constant:
+0:152              1.000000
 0:152            'inF0' ( in 2-component vector of float)
-0:152            'inF1' ( in 2-component vector of float)
 0:153      Sequence
 0:153        move second child to first child ( temp 2-component vector of float)
-0:153          'r050' ( temp 2-component vector of float)
-0:153          radians ( temp 2-component vector of float)
+0:153          'r052' ( temp 2-component vector of float)
+0:153          reflect ( temp 2-component vector of float)
 0:153            'inF0' ( in 2-component vector of float)
+0:153            'inF1' ( in 2-component vector of float)
 0:154      Sequence
 0:154        move second child to first child ( temp 2-component vector of float)
-0:154          'r051' ( temp 2-component vector of float)
-0:154          divide ( temp 2-component vector of float)
-0:154            Constant:
-0:154              1.000000
+0:154          'r053' ( temp 2-component vector of float)
+0:154          refract ( temp 2-component vector of float)
 0:154            'inF0' ( in 2-component vector of float)
+0:154            'inF1' ( in 2-component vector of float)
+0:154            Constant:
+0:154              2.000000
 0:155      Sequence
-0:155        move second child to first child ( temp 2-component vector of float)
-0:155          'r052' ( temp 2-component vector of float)
-0:155          reflect ( temp 2-component vector of float)
-0:155            'inF0' ( in 2-component vector of float)
-0:155            'inF1' ( in 2-component vector of float)
-0:156      Sequence
-0:156        move second child to first child ( temp 2-component vector of float)
-0:156          'r053' ( temp 2-component vector of float)
-0:156          refract ( temp 2-component vector of float)
-0:156            'inF0' ( in 2-component vector of float)
-0:156            'inF1' ( in 2-component vector of float)
-0:156            Constant:
-0:156              2.000000
-0:157      Sequence
-0:157        move second child to first child ( temp 2-component vector of uint)
-0:157          'r054' ( temp 2-component vector of uint)
+0:155        move second child to first child ( temp 2-component vector of uint)
+0:155          'r054' ( temp 2-component vector of uint)
 0:?           bitFieldReverse ( temp 2-component vector of uint)
 0:?             Constant:
 0:?               1 (const uint)
 0:?               2 (const uint)
+0:156      Sequence
+0:156        move second child to first child ( temp 2-component vector of float)
+0:156          'r055' ( temp 2-component vector of float)
+0:156          roundEven ( temp 2-component vector of float)
+0:156            'inF0' ( in 2-component vector of float)
+0:157      Sequence
+0:157        move second child to first child ( temp 2-component vector of float)
+0:157          'r056' ( temp 2-component vector of float)
+0:157          inverse sqrt ( temp 2-component vector of float)
+0:157            'inF0' ( in 2-component vector of float)
 0:158      Sequence
 0:158        move second child to first child ( temp 2-component vector of float)
-0:158          'r055' ( temp 2-component vector of float)
-0:158          roundEven ( temp 2-component vector of float)
+0:158          'r057' ( temp 2-component vector of float)
+0:158          clamp ( temp 2-component vector of float)
 0:158            'inF0' ( in 2-component vector of float)
+0:158            Constant:
+0:158              0.000000
+0:158            Constant:
+0:158              1.000000
 0:159      Sequence
 0:159        move second child to first child ( temp 2-component vector of float)
-0:159          'r056' ( temp 2-component vector of float)
-0:159          inverse sqrt ( temp 2-component vector of float)
+0:159          'r058' ( temp 2-component vector of float)
+0:159          Sign ( temp 2-component vector of float)
 0:159            'inF0' ( in 2-component vector of float)
 0:160      Sequence
 0:160        move second child to first child ( temp 2-component vector of float)
-0:160          'r057' ( temp 2-component vector of float)
-0:160          clamp ( temp 2-component vector of float)
+0:160          'r059' ( temp 2-component vector of float)
+0:160          sine ( temp 2-component vector of float)
 0:160            'inF0' ( in 2-component vector of float)
-0:160            Constant:
-0:160              0.000000
-0:160            Constant:
-0:160              1.000000
 0:161      Sequence
 0:161        move second child to first child ( temp 2-component vector of float)
-0:161          'r058' ( temp 2-component vector of float)
-0:161          Sign ( temp 2-component vector of float)
+0:161          'inF1' ( in 2-component vector of float)
+0:161          sine ( temp 2-component vector of float)
+0:161            'inF0' ( in 2-component vector of float)
+0:161        move second child to first child ( temp 2-component vector of float)
+0:161          'inF2' ( in 2-component vector of float)
+0:161          cosine ( temp 2-component vector of float)
 0:161            'inF0' ( in 2-component vector of float)
 0:162      Sequence
 0:162        move second child to first child ( temp 2-component vector of float)
-0:162          'r059' ( temp 2-component vector of float)
-0:162          sine ( temp 2-component vector of float)
+0:162          'r060' ( temp 2-component vector of float)
+0:162          hyp. sine ( temp 2-component vector of float)
 0:162            'inF0' ( in 2-component vector of float)
 0:163      Sequence
 0:163        move second child to first child ( temp 2-component vector of float)
-0:163          'inF1' ( in 2-component vector of float)
-0:163          sine ( temp 2-component vector of float)
-0:163            'inF0' ( in 2-component vector of float)
-0:163        move second child to first child ( temp 2-component vector of float)
-0:163          'inF2' ( in 2-component vector of float)
-0:163          cosine ( temp 2-component vector of float)
+0:163          'r061' ( temp 2-component vector of float)
+0:163          smoothstep ( temp 2-component vector of float)
 0:163            'inF0' ( in 2-component vector of float)
+0:163            'inF1' ( in 2-component vector of float)
+0:163            'inF2' ( in 2-component vector of float)
 0:164      Sequence
 0:164        move second child to first child ( temp 2-component vector of float)
-0:164          'r060' ( temp 2-component vector of float)
-0:164          hyp. sine ( temp 2-component vector of float)
+0:164          'r062' ( temp 2-component vector of float)
+0:164          sqrt ( temp 2-component vector of float)
 0:164            'inF0' ( in 2-component vector of float)
 0:165      Sequence
 0:165        move second child to first child ( temp 2-component vector of float)
-0:165          'r061' ( temp 2-component vector of float)
-0:165          smoothstep ( temp 2-component vector of float)
+0:165          'r063' ( temp 2-component vector of float)
+0:165          step ( temp 2-component vector of float)
 0:165            'inF0' ( in 2-component vector of float)
 0:165            'inF1' ( in 2-component vector of float)
-0:165            'inF2' ( in 2-component vector of float)
 0:166      Sequence
 0:166        move second child to first child ( temp 2-component vector of float)
-0:166          'r062' ( temp 2-component vector of float)
-0:166          sqrt ( temp 2-component vector of float)
+0:166          'r064' ( temp 2-component vector of float)
+0:166          tangent ( temp 2-component vector of float)
 0:166            'inF0' ( in 2-component vector of float)
 0:167      Sequence
 0:167        move second child to first child ( temp 2-component vector of float)
-0:167          'r063' ( temp 2-component vector of float)
-0:167          step ( temp 2-component vector of float)
+0:167          'r065' ( temp 2-component vector of float)
+0:167          hyp. tangent ( temp 2-component vector of float)
 0:167            'inF0' ( in 2-component vector of float)
-0:167            'inF1' ( in 2-component vector of float)
-0:168      Sequence
-0:168        move second child to first child ( temp 2-component vector of float)
-0:168          'r064' ( temp 2-component vector of float)
-0:168          tangent ( temp 2-component vector of float)
-0:168            'inF0' ( in 2-component vector of float)
 0:169      Sequence
 0:169        move second child to first child ( temp 2-component vector of float)
-0:169          'r065' ( temp 2-component vector of float)
-0:169          hyp. tangent ( temp 2-component vector of float)
+0:169          'r066' ( temp 2-component vector of float)
+0:169          trunc ( temp 2-component vector of float)
 0:169            'inF0' ( in 2-component vector of float)
-0:171      Sequence
-0:171        move second child to first child ( temp 2-component vector of float)
-0:171          'r066' ( temp 2-component vector of float)
-0:171          trunc ( temp 2-component vector of float)
-0:171            'inF0' ( in 2-component vector of float)
-0:174      Branch: Return with expression
+0:172      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
-0:178  Function Definition: PixelShaderFunction3(vf3;vf3;vf3;vu3;vu3; ( temp 3-component vector of float)
-0:178    Function Parameters: 
-0:178      'inF0' ( in 3-component vector of float)
-0:178      'inF1' ( in 3-component vector of float)
-0:178      'inF2' ( in 3-component vector of float)
-0:178      'inU0' ( in 3-component vector of uint)
-0:178      'inU1' ( in 3-component vector of uint)
+0:176  Function Definition: PixelShaderFunction3(vf3;vf3;vf3;vu3;vu3; ( temp 3-component vector of float)
+0:176    Function Parameters: 
+0:176      'inF0' ( in 3-component vector of float)
+0:176      'inF1' ( in 3-component vector of float)
+0:176      'inF2' ( in 3-component vector of float)
+0:176      'inU0' ( in 3-component vector of uint)
+0:176      'inU1' ( in 3-component vector of uint)
 0:?     Sequence
+0:179      Sequence
+0:179        move second child to first child ( temp bool)
+0:179          'r000' ( temp bool)
+0:179          all ( temp bool)
+0:179            'inF0' ( in 3-component vector of float)
+0:180      Sequence
+0:180        move second child to first child ( temp 3-component vector of float)
+0:180          'r001' ( temp 3-component vector of float)
+0:180          Absolute value ( temp 3-component vector of float)
+0:180            'inF0' ( in 3-component vector of float)
 0:181      Sequence
-0:181        move second child to first child ( temp bool)
-0:181          'r000' ( temp bool)
-0:181          all ( temp bool)
+0:181        move second child to first child ( temp 3-component vector of float)
+0:181          'r002' ( temp 3-component vector of float)
+0:181          arc cosine ( temp 3-component vector of float)
 0:181            'inF0' ( in 3-component vector of float)
 0:182      Sequence
-0:182        move second child to first child ( temp 3-component vector of float)
-0:182          'r001' ( temp 3-component vector of float)
-0:182          Absolute value ( temp 3-component vector of float)
+0:182        move second child to first child ( temp bool)
+0:182          'r003' ( temp bool)
+0:182          any ( temp bool)
 0:182            'inF0' ( in 3-component vector of float)
 0:183      Sequence
 0:183        move second child to first child ( temp 3-component vector of float)
-0:183          'r002' ( temp 3-component vector of float)
-0:183          arc cosine ( temp 3-component vector of float)
+0:183          'r004' ( temp 3-component vector of float)
+0:183          arc sine ( temp 3-component vector of float)
 0:183            'inF0' ( in 3-component vector of float)
 0:184      Sequence
-0:184        move second child to first child ( temp bool)
-0:184          'r003' ( temp bool)
-0:184          any ( temp bool)
+0:184        move second child to first child ( temp 3-component vector of int)
+0:184          'r005' ( temp 3-component vector of int)
+0:184          floatBitsToInt ( temp 3-component vector of int)
 0:184            'inF0' ( in 3-component vector of float)
 0:185      Sequence
-0:185        move second child to first child ( temp 3-component vector of float)
-0:185          'r004' ( temp 3-component vector of float)
-0:185          arc sine ( temp 3-component vector of float)
+0:185        move second child to first child ( temp 3-component vector of uint)
+0:185          'r006' ( temp 3-component vector of uint)
+0:185          floatBitsToUint ( temp 3-component vector of uint)
 0:185            'inF0' ( in 3-component vector of float)
 0:186      Sequence
-0:186        move second child to first child ( temp 3-component vector of int)
-0:186          'r005' ( temp 3-component vector of int)
-0:186          floatBitsToInt ( temp 3-component vector of int)
-0:186            'inF0' ( in 3-component vector of float)
-0:187      Sequence
-0:187        move second child to first child ( temp 3-component vector of uint)
-0:187          'r006' ( temp 3-component vector of uint)
-0:187          floatBitsToUint ( temp 3-component vector of uint)
-0:187            'inF0' ( in 3-component vector of float)
+0:186        move second child to first child ( temp 3-component vector of float)
+0:186          'r007' ( temp 3-component vector of float)
+0:186          intBitsToFloat ( temp 3-component vector of float)
+0:186            'inU0' ( in 3-component vector of uint)
 0:188      Sequence
 0:188        move second child to first child ( temp 3-component vector of float)
-0:188          'r007' ( temp 3-component vector of float)
-0:188          intBitsToFloat ( temp 3-component vector of float)
-0:188            'inU0' ( in 3-component vector of uint)
+0:188          'r009' ( temp 3-component vector of float)
+0:188          arc tangent ( temp 3-component vector of float)
+0:188            'inF0' ( in 3-component vector of float)
+0:189      Sequence
+0:189        move second child to first child ( temp 3-component vector of float)
+0:189          'r010' ( temp 3-component vector of float)
+0:189          arc tangent ( temp 3-component vector of float)
+0:189            'inF0' ( in 3-component vector of float)
+0:189            'inF1' ( in 3-component vector of float)
 0:190      Sequence
 0:190        move second child to first child ( temp 3-component vector of float)
-0:190          'r009' ( temp 3-component vector of float)
-0:190          arc tangent ( temp 3-component vector of float)
+0:190          'r011' ( temp 3-component vector of float)
+0:190          Ceiling ( temp 3-component vector of float)
 0:190            'inF0' ( in 3-component vector of float)
 0:191      Sequence
 0:191        move second child to first child ( temp 3-component vector of float)
-0:191          'r010' ( temp 3-component vector of float)
-0:191          arc tangent ( temp 3-component vector of float)
+0:191          'r012' ( temp 3-component vector of float)
+0:191          clamp ( temp 3-component vector of float)
 0:191            'inF0' ( in 3-component vector of float)
 0:191            'inF1' ( in 3-component vector of float)
-0:192      Sequence
-0:192        move second child to first child ( temp 3-component vector of float)
-0:192          'r011' ( temp 3-component vector of float)
-0:192          Ceiling ( temp 3-component vector of float)
+0:191            'inF2' ( in 3-component vector of float)
+0:192      Test condition and select ( temp void)
+0:192        Condition
+0:192        any ( temp bool)
+0:192          Compare Less Than ( temp 3-component vector of bool)
 0:192            'inF0' ( in 3-component vector of float)
+0:192            Constant:
+0:192              0.000000
+0:192              0.000000
+0:192              0.000000
+0:192        true case
+0:192        Branch: Kill
 0:193      Sequence
 0:193        move second child to first child ( temp 3-component vector of float)
-0:193          'r012' ( temp 3-component vector of float)
-0:193          clamp ( temp 3-component vector of float)
+0:193          'r013' ( temp 3-component vector of float)
+0:193          cosine ( temp 3-component vector of float)
 0:193            'inF0' ( in 3-component vector of float)
-0:193            'inF1' ( in 3-component vector of float)
-0:193            'inF2' ( in 3-component vector of float)
-0:194      Test condition and select ( temp void)
-0:194        Condition
-0:194        any ( temp bool)
-0:194          Compare Less Than ( temp 3-component vector of bool)
+0:194      Sequence
+0:194        move second child to first child ( temp 3-component vector of float)
+0:194          'r014' ( temp 3-component vector of float)
+0:194          hyp. cosine ( temp 3-component vector of float)
 0:194            'inF0' ( in 3-component vector of float)
-0:194            Constant:
-0:194              0.000000
-0:194              0.000000
-0:194              0.000000
-0:194        true case
-0:194        Branch: Kill
 0:195      Sequence
-0:195        move second child to first child ( temp 3-component vector of float)
-0:195          'r013' ( temp 3-component vector of float)
-0:195          cosine ( temp 3-component vector of float)
-0:195            'inF0' ( in 3-component vector of float)
-0:196      Sequence
-0:196        move second child to first child ( temp 3-component vector of float)
-0:196          'r014' ( temp 3-component vector of float)
-0:196          hyp. cosine ( temp 3-component vector of float)
-0:196            'inF0' ( in 3-component vector of float)
-0:197      Sequence
-0:197        move second child to first child ( temp 3-component vector of uint)
-0:197          'r015' ( temp 3-component vector of uint)
+0:195        move second child to first child ( temp 3-component vector of uint)
+0:195          'r015' ( temp 3-component vector of uint)
 0:?           bitCount ( temp 3-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               3 (const uint)
 0:?               5 (const uint)
+0:196      Sequence
+0:196        move second child to first child ( temp 3-component vector of float)
+0:196          'r016' ( temp 3-component vector of float)
+0:196          cross-product ( temp 3-component vector of float)
+0:196            'inF0' ( in 3-component vector of float)
+0:196            'inF1' ( in 3-component vector of float)
+0:197      Sequence
+0:197        move second child to first child ( temp 3-component vector of float)
+0:197          'r017' ( temp 3-component vector of float)
+0:197          dPdx ( temp 3-component vector of float)
+0:197            'inF0' ( in 3-component vector of float)
 0:198      Sequence
 0:198        move second child to first child ( temp 3-component vector of float)
-0:198          'r016' ( temp 3-component vector of float)
-0:198          cross-product ( temp 3-component vector of float)
+0:198          'r018' ( temp 3-component vector of float)
+0:198          dPdxCoarse ( temp 3-component vector of float)
 0:198            'inF0' ( in 3-component vector of float)
-0:198            'inF1' ( in 3-component vector of float)
 0:199      Sequence
 0:199        move second child to first child ( temp 3-component vector of float)
-0:199          'r017' ( temp 3-component vector of float)
-0:199          dPdx ( temp 3-component vector of float)
+0:199          'r019' ( temp 3-component vector of float)
+0:199          dPdxFine ( temp 3-component vector of float)
 0:199            'inF0' ( in 3-component vector of float)
 0:200      Sequence
 0:200        move second child to first child ( temp 3-component vector of float)
-0:200          'r018' ( temp 3-component vector of float)
-0:200          dPdxCoarse ( temp 3-component vector of float)
+0:200          'r020' ( temp 3-component vector of float)
+0:200          dPdy ( temp 3-component vector of float)
 0:200            'inF0' ( in 3-component vector of float)
 0:201      Sequence
 0:201        move second child to first child ( temp 3-component vector of float)
-0:201          'r019' ( temp 3-component vector of float)
-0:201          dPdxFine ( temp 3-component vector of float)
+0:201          'r021' ( temp 3-component vector of float)
+0:201          dPdyCoarse ( temp 3-component vector of float)
 0:201            'inF0' ( in 3-component vector of float)
 0:202      Sequence
 0:202        move second child to first child ( temp 3-component vector of float)
-0:202          'r020' ( temp 3-component vector of float)
-0:202          dPdy ( temp 3-component vector of float)
+0:202          'r022' ( temp 3-component vector of float)
+0:202          dPdyFine ( temp 3-component vector of float)
 0:202            'inF0' ( in 3-component vector of float)
 0:203      Sequence
 0:203        move second child to first child ( temp 3-component vector of float)
-0:203          'r021' ( temp 3-component vector of float)
-0:203          dPdyCoarse ( temp 3-component vector of float)
+0:203          'r023' ( temp 3-component vector of float)
+0:203          degrees ( temp 3-component vector of float)
 0:203            'inF0' ( in 3-component vector of float)
 0:204      Sequence
-0:204        move second child to first child ( temp 3-component vector of float)
-0:204          'r022' ( temp 3-component vector of float)
-0:204          dPdyFine ( temp 3-component vector of float)
+0:204        move second child to first child ( temp float)
+0:204          'r024' ( temp float)
+0:204          distance ( temp float)
 0:204            'inF0' ( in 3-component vector of float)
+0:204            'inF1' ( in 3-component vector of float)
 0:205      Sequence
-0:205        move second child to first child ( temp 3-component vector of float)
-0:205          'r023' ( temp 3-component vector of float)
-0:205          degrees ( temp 3-component vector of float)
+0:205        move second child to first child ( temp float)
+0:205          'r025' ( temp float)
+0:205          dot-product ( temp float)
 0:205            'inF0' ( in 3-component vector of float)
-0:206      Sequence
-0:206        move second child to first child ( temp float)
-0:206          'r024' ( temp float)
-0:206          distance ( temp float)
-0:206            'inF0' ( in 3-component vector of float)
-0:206            'inF1' ( in 3-component vector of float)
-0:207      Sequence
-0:207        move second child to first child ( temp float)
-0:207          'r025' ( temp float)
-0:207          dot-product ( temp float)
-0:207            'inF0' ( in 3-component vector of float)
-0:207            'inF1' ( in 3-component vector of float)
+0:205            'inF1' ( in 3-component vector of float)
+0:209      Sequence
+0:209        move second child to first child ( temp 3-component vector of float)
+0:209          'r029' ( temp 3-component vector of float)
+0:209          exp ( temp 3-component vector of float)
+0:209            'inF0' ( in 3-component vector of float)
+0:210      Sequence
+0:210        move second child to first child ( temp 3-component vector of float)
+0:210          'r030' ( temp 3-component vector of float)
+0:210          exp2 ( temp 3-component vector of float)
+0:210            'inF0' ( in 3-component vector of float)
 0:211      Sequence
 0:211        move second child to first child ( temp 3-component vector of float)
-0:211          'r029' ( temp 3-component vector of float)
-0:211          exp ( temp 3-component vector of float)
+0:211          'r031' ( temp 3-component vector of float)
+0:211          face-forward ( temp 3-component vector of float)
 0:211            'inF0' ( in 3-component vector of float)
+0:211            'inF1' ( in 3-component vector of float)
+0:211            'inF2' ( in 3-component vector of float)
 0:212      Sequence
-0:212        move second child to first child ( temp 3-component vector of float)
-0:212          'r030' ( temp 3-component vector of float)
-0:212          exp2 ( temp 3-component vector of float)
-0:212            'inF0' ( in 3-component vector of float)
-0:213      Sequence
-0:213        move second child to first child ( temp 3-component vector of float)
-0:213          'r031' ( temp 3-component vector of float)
-0:213          face-forward ( temp 3-component vector of float)
-0:213            'inF0' ( in 3-component vector of float)
-0:213            'inF1' ( in 3-component vector of float)
-0:213            'inF2' ( in 3-component vector of float)
-0:214      Sequence
-0:214        move second child to first child ( temp 3-component vector of uint)
-0:214          'r032' ( temp 3-component vector of uint)
+0:212        move second child to first child ( temp 3-component vector of uint)
+0:212          'r032' ( temp 3-component vector of uint)
 0:?           findMSB ( temp 3-component vector of uint)
 0:?             Constant:
 0:?               2 (const uint)
 0:?               3 (const uint)
 0:?               4 (const uint)
-0:215      Sequence
-0:215        move second child to first child ( temp 3-component vector of uint)
-0:215          'r033' ( temp 3-component vector of uint)
+0:213      Sequence
+0:213        move second child to first child ( temp 3-component vector of uint)
+0:213          'r033' ( temp 3-component vector of uint)
 0:?           findLSB ( temp 3-component vector of uint)
 0:?             Constant:
 0:?               2 (const uint)
 0:?               3 (const uint)
 0:?               4 (const uint)
+0:214      Sequence
+0:214        move second child to first child ( temp 3-component vector of float)
+0:214          'r034' ( temp 3-component vector of float)
+0:214          Floor ( temp 3-component vector of float)
+0:214            'inF0' ( in 3-component vector of float)
 0:216      Sequence
 0:216        move second child to first child ( temp 3-component vector of float)
-0:216          'r034' ( temp 3-component vector of float)
-0:216          Floor ( temp 3-component vector of float)
+0:216          'r036' ( temp 3-component vector of float)
+0:216          mod ( temp 3-component vector of float)
 0:216            'inF0' ( in 3-component vector of float)
+0:216            'inF1' ( in 3-component vector of float)
+0:217      Sequence
+0:217        move second child to first child ( temp 3-component vector of float)
+0:217          'r037' ( temp 3-component vector of float)
+0:217          Fraction ( temp 3-component vector of float)
+0:217            'inF0' ( in 3-component vector of float)
 0:218      Sequence
 0:218        move second child to first child ( temp 3-component vector of float)
-0:218          'r036' ( temp 3-component vector of float)
-0:218          mod ( temp 3-component vector of float)
+0:218          'r039' ( temp 3-component vector of float)
+0:218          fwidth ( temp 3-component vector of float)
 0:218            'inF0' ( in 3-component vector of float)
-0:218            'inF1' ( in 3-component vector of float)
 0:219      Sequence
-0:219        move second child to first child ( temp 3-component vector of float)
-0:219          'r037' ( temp 3-component vector of float)
-0:219          Fraction ( temp 3-component vector of float)
+0:219        move second child to first child ( temp 3-component vector of bool)
+0:219          'r040' ( temp 3-component vector of bool)
+0:219          isinf ( temp 3-component vector of bool)
 0:219            'inF0' ( in 3-component vector of float)
 0:220      Sequence
-0:220        move second child to first child ( temp 3-component vector of float)
-0:220          'r038' ( temp 3-component vector of float)
-0:220          frexp ( temp 3-component vector of float)
+0:220        move second child to first child ( temp 3-component vector of bool)
+0:220          'r041' ( temp 3-component vector of bool)
+0:220          isnan ( temp 3-component vector of bool)
 0:220            'inF0' ( in 3-component vector of float)
-0:220            'inF1' ( in 3-component vector of float)
 0:221      Sequence
 0:221        move second child to first child ( temp 3-component vector of float)
-0:221          'r039' ( temp 3-component vector of float)
-0:221          fwidth ( temp 3-component vector of float)
+0:221          'r042' ( temp 3-component vector of float)
+0:221          ldexp ( temp 3-component vector of float)
 0:221            'inF0' ( in 3-component vector of float)
+0:221            'inF1' ( in 3-component vector of float)
 0:222      Sequence
-0:222        move second child to first child ( temp 3-component vector of bool)
-0:222          'r040' ( temp 3-component vector of bool)
-0:222          isinf ( temp 3-component vector of bool)
+0:222        move second child to first child ( temp 3-component vector of float)
+0:222          'r039a' ( temp 3-component vector of float)
+0:222          mix ( temp 3-component vector of float)
 0:222            'inF0' ( in 3-component vector of float)
+0:222            'inF1' ( in 3-component vector of float)
+0:222            'inF2' ( in 3-component vector of float)
 0:223      Sequence
-0:223        move second child to first child ( temp 3-component vector of bool)
-0:223          'r041' ( temp 3-component vector of bool)
-0:223          isnan ( temp 3-component vector of bool)
+0:223        move second child to first child ( temp 3-component vector of float)
+0:223          'r039b' ( temp 3-component vector of float)
+0:223          mix ( temp 3-component vector of float)
 0:223            'inF0' ( in 3-component vector of float)
+0:223            'inF1' ( in 3-component vector of float)
+0:223            Constant:
+0:223              0.300000
 0:224      Sequence
-0:224        move second child to first child ( temp 3-component vector of float)
-0:224          'r042' ( temp 3-component vector of float)
-0:224          ldexp ( temp 3-component vector of float)
+0:224        move second child to first child ( temp float)
+0:224          'r043' ( temp float)
+0:224          length ( temp float)
 0:224            'inF0' ( in 3-component vector of float)
-0:224            'inF1' ( in 3-component vector of float)
 0:225      Sequence
 0:225        move second child to first child ( temp 3-component vector of float)
-0:225          'r039a' ( temp 3-component vector of float)
-0:225          mix ( temp 3-component vector of float)
+0:225          'r044' ( temp 3-component vector of float)
+0:225          log ( temp 3-component vector of float)
 0:225            'inF0' ( in 3-component vector of float)
-0:225            'inF1' ( in 3-component vector of float)
-0:225            'inF2' ( in 3-component vector of float)
 0:226      Sequence
 0:226        move second child to first child ( temp 3-component vector of float)
-0:226          'r039b' ( temp 3-component vector of float)
-0:226          mix ( temp 3-component vector of float)
-0:226            'inF0' ( in 3-component vector of float)
-0:226            'inF1' ( in 3-component vector of float)
+0:226          'r045' ( temp 3-component vector of float)
+0:226          vector-scale ( temp 3-component vector of float)
+0:226            log2 ( temp 3-component vector of float)
+0:226              'inF0' ( in 3-component vector of float)
 0:226            Constant:
-0:226              0.300000
+0:226              0.301030
 0:227      Sequence
-0:227        move second child to first child ( temp float)
-0:227          'r043' ( temp float)
-0:227          length ( temp float)
+0:227        move second child to first child ( temp 3-component vector of float)
+0:227          'r046' ( temp 3-component vector of float)
+0:227          log2 ( temp 3-component vector of float)
 0:227            'inF0' ( in 3-component vector of float)
 0:228      Sequence
 0:228        move second child to first child ( temp 3-component vector of float)
-0:228          'r044' ( temp 3-component vector of float)
-0:228          log ( temp 3-component vector of float)
+0:228          'r047' ( temp 3-component vector of float)
+0:228          max ( temp 3-component vector of float)
 0:228            'inF0' ( in 3-component vector of float)
+0:228            'inF1' ( in 3-component vector of float)
 0:229      Sequence
 0:229        move second child to first child ( temp 3-component vector of float)
-0:229          'r045' ( temp 3-component vector of float)
-0:229          vector-scale ( temp 3-component vector of float)
-0:229            log2 ( temp 3-component vector of float)
-0:229              'inF0' ( in 3-component vector of float)
-0:229            Constant:
-0:229              0.301030
+0:229          'r048' ( temp 3-component vector of float)
+0:229          min ( temp 3-component vector of float)
+0:229            'inF0' ( in 3-component vector of float)
+0:229            'inF1' ( in 3-component vector of float)
 0:230      Sequence
 0:230        move second child to first child ( temp 3-component vector of float)
-0:230          'r046' ( temp 3-component vector of float)
-0:230          log2 ( temp 3-component vector of float)
+0:230          'r049' ( temp 3-component vector of float)
+0:230          normalize ( temp 3-component vector of float)
 0:230            'inF0' ( in 3-component vector of float)
 0:231      Sequence
 0:231        move second child to first child ( temp 3-component vector of float)
-0:231          'r047' ( temp 3-component vector of float)
-0:231          max ( temp 3-component vector of float)
+0:231          'r050' ( temp 3-component vector of float)
+0:231          pow ( temp 3-component vector of float)
 0:231            'inF0' ( in 3-component vector of float)
 0:231            'inF1' ( in 3-component vector of float)
 0:232      Sequence
 0:232        move second child to first child ( temp 3-component vector of float)
-0:232          'r048' ( temp 3-component vector of float)
-0:232          min ( temp 3-component vector of float)
+0:232          'r051' ( temp 3-component vector of float)
+0:232          radians ( temp 3-component vector of float)
 0:232            'inF0' ( in 3-component vector of float)
-0:232            'inF1' ( in 3-component vector of float)
 0:233      Sequence
 0:233        move second child to first child ( temp 3-component vector of float)
-0:233          'r049' ( temp 3-component vector of float)
-0:233          normalize ( temp 3-component vector of float)
+0:233          'r052' ( temp 3-component vector of float)
+0:233          divide ( temp 3-component vector of float)
+0:233            Constant:
+0:233              1.000000
 0:233            'inF0' ( in 3-component vector of float)
 0:234      Sequence
 0:234        move second child to first child ( temp 3-component vector of float)
-0:234          'r050' ( temp 3-component vector of float)
-0:234          pow ( temp 3-component vector of float)
+0:234          'r053' ( temp 3-component vector of float)
+0:234          reflect ( temp 3-component vector of float)
 0:234            'inF0' ( in 3-component vector of float)
 0:234            'inF1' ( in 3-component vector of float)
 0:235      Sequence
 0:235        move second child to first child ( temp 3-component vector of float)
-0:235          'r051' ( temp 3-component vector of float)
-0:235          radians ( temp 3-component vector of float)
+0:235          'r054' ( temp 3-component vector of float)
+0:235          refract ( temp 3-component vector of float)
 0:235            'inF0' ( in 3-component vector of float)
+0:235            'inF1' ( in 3-component vector of float)
+0:235            Constant:
+0:235              2.000000
 0:236      Sequence
-0:236        move second child to first child ( temp 3-component vector of float)
-0:236          'r052' ( temp 3-component vector of float)
-0:236          divide ( temp 3-component vector of float)
-0:236            Constant:
-0:236              1.000000
-0:236            'inF0' ( in 3-component vector of float)
-0:237      Sequence
-0:237        move second child to first child ( temp 3-component vector of float)
-0:237          'r053' ( temp 3-component vector of float)
-0:237          reflect ( temp 3-component vector of float)
-0:237            'inF0' ( in 3-component vector of float)
-0:237            'inF1' ( in 3-component vector of float)
-0:238      Sequence
-0:238        move second child to first child ( temp 3-component vector of float)
-0:238          'r054' ( temp 3-component vector of float)
-0:238          refract ( temp 3-component vector of float)
-0:238            'inF0' ( in 3-component vector of float)
-0:238            'inF1' ( in 3-component vector of float)
-0:238            Constant:
-0:238              2.000000
-0:239      Sequence
-0:239        move second child to first child ( temp 3-component vector of uint)
-0:239          'r055' ( temp 3-component vector of uint)
+0:236        move second child to first child ( temp 3-component vector of uint)
+0:236          'r055' ( temp 3-component vector of uint)
 0:?           bitFieldReverse ( temp 3-component vector of uint)
 0:?             Constant:
 0:?               1 (const uint)
 0:?               2 (const uint)
 0:?               3 (const uint)
+0:237      Sequence
+0:237        move second child to first child ( temp 3-component vector of float)
+0:237          'r056' ( temp 3-component vector of float)
+0:237          roundEven ( temp 3-component vector of float)
+0:237            'inF0' ( in 3-component vector of float)
+0:238      Sequence
+0:238        move second child to first child ( temp 3-component vector of float)
+0:238          'r057' ( temp 3-component vector of float)
+0:238          inverse sqrt ( temp 3-component vector of float)
+0:238            'inF0' ( in 3-component vector of float)
+0:239      Sequence
+0:239        move second child to first child ( temp 3-component vector of float)
+0:239          'r058' ( temp 3-component vector of float)
+0:239          clamp ( temp 3-component vector of float)
+0:239            'inF0' ( in 3-component vector of float)
+0:239            Constant:
+0:239              0.000000
+0:239            Constant:
+0:239              1.000000
 0:240      Sequence
 0:240        move second child to first child ( temp 3-component vector of float)
-0:240          'r056' ( temp 3-component vector of float)
-0:240          roundEven ( temp 3-component vector of float)
+0:240          'r059' ( temp 3-component vector of float)
+0:240          Sign ( temp 3-component vector of float)
 0:240            'inF0' ( in 3-component vector of float)
 0:241      Sequence
 0:241        move second child to first child ( temp 3-component vector of float)
-0:241          'r057' ( temp 3-component vector of float)
-0:241          inverse sqrt ( temp 3-component vector of float)
+0:241          'r060' ( temp 3-component vector of float)
+0:241          sine ( temp 3-component vector of float)
 0:241            'inF0' ( in 3-component vector of float)
 0:242      Sequence
 0:242        move second child to first child ( temp 3-component vector of float)
-0:242          'r058' ( temp 3-component vector of float)
-0:242          clamp ( temp 3-component vector of float)
+0:242          'inF1' ( in 3-component vector of float)
+0:242          sine ( temp 3-component vector of float)
+0:242            'inF0' ( in 3-component vector of float)
+0:242        move second child to first child ( temp 3-component vector of float)
+0:242          'inF2' ( in 3-component vector of float)
+0:242          cosine ( temp 3-component vector of float)
 0:242            'inF0' ( in 3-component vector of float)
-0:242            Constant:
-0:242              0.000000
-0:242            Constant:
-0:242              1.000000
 0:243      Sequence
 0:243        move second child to first child ( temp 3-component vector of float)
-0:243          'r059' ( temp 3-component vector of float)
-0:243          Sign ( temp 3-component vector of float)
+0:243          'r061' ( temp 3-component vector of float)
+0:243          hyp. sine ( temp 3-component vector of float)
 0:243            'inF0' ( in 3-component vector of float)
 0:244      Sequence
 0:244        move second child to first child ( temp 3-component vector of float)
-0:244          'r060' ( temp 3-component vector of float)
-0:244          sine ( temp 3-component vector of float)
+0:244          'r062' ( temp 3-component vector of float)
+0:244          smoothstep ( temp 3-component vector of float)
 0:244            'inF0' ( in 3-component vector of float)
+0:244            'inF1' ( in 3-component vector of float)
+0:244            'inF2' ( in 3-component vector of float)
 0:245      Sequence
 0:245        move second child to first child ( temp 3-component vector of float)
-0:245          'inF1' ( in 3-component vector of float)
-0:245          sine ( temp 3-component vector of float)
-0:245            'inF0' ( in 3-component vector of float)
-0:245        move second child to first child ( temp 3-component vector of float)
-0:245          'inF2' ( in 3-component vector of float)
-0:245          cosine ( temp 3-component vector of float)
+0:245          'r063' ( temp 3-component vector of float)
+0:245          sqrt ( temp 3-component vector of float)
 0:245            'inF0' ( in 3-component vector of float)
 0:246      Sequence
 0:246        move second child to first child ( temp 3-component vector of float)
-0:246          'r061' ( temp 3-component vector of float)
-0:246          hyp. sine ( temp 3-component vector of float)
+0:246          'r064' ( temp 3-component vector of float)
+0:246          step ( temp 3-component vector of float)
 0:246            'inF0' ( in 3-component vector of float)
+0:246            'inF1' ( in 3-component vector of float)
 0:247      Sequence
 0:247        move second child to first child ( temp 3-component vector of float)
-0:247          'r062' ( temp 3-component vector of float)
-0:247          smoothstep ( temp 3-component vector of float)
+0:247          'r065' ( temp 3-component vector of float)
+0:247          tangent ( temp 3-component vector of float)
 0:247            'inF0' ( in 3-component vector of float)
-0:247            'inF1' ( in 3-component vector of float)
-0:247            'inF2' ( in 3-component vector of float)
 0:248      Sequence
 0:248        move second child to first child ( temp 3-component vector of float)
-0:248          'r063' ( temp 3-component vector of float)
-0:248          sqrt ( temp 3-component vector of float)
+0:248          'r066' ( temp 3-component vector of float)
+0:248          hyp. tangent ( temp 3-component vector of float)
 0:248            'inF0' ( in 3-component vector of float)
-0:249      Sequence
-0:249        move second child to first child ( temp 3-component vector of float)
-0:249          'r064' ( temp 3-component vector of float)
-0:249          step ( temp 3-component vector of float)
-0:249            'inF0' ( in 3-component vector of float)
-0:249            'inF1' ( in 3-component vector of float)
 0:250      Sequence
 0:250        move second child to first child ( temp 3-component vector of float)
-0:250          'r065' ( temp 3-component vector of float)
-0:250          tangent ( temp 3-component vector of float)
+0:250          'r067' ( temp 3-component vector of float)
+0:250          trunc ( temp 3-component vector of float)
 0:250            'inF0' ( in 3-component vector of float)
-0:251      Sequence
-0:251        move second child to first child ( temp 3-component vector of float)
-0:251          'r066' ( temp 3-component vector of float)
-0:251          hyp. tangent ( temp 3-component vector of float)
-0:251            'inF0' ( in 3-component vector of float)
-0:253      Sequence
-0:253        move second child to first child ( temp 3-component vector of float)
-0:253          'r067' ( temp 3-component vector of float)
-0:253          trunc ( temp 3-component vector of float)
-0:253            'inF0' ( in 3-component vector of float)
-0:256      Branch: Return with expression
+0:253      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
 0:?           3.000000
-0:260  Function Definition: PixelShaderFunction(vf4;vf4;vf4;vu4;vu4; ( temp 4-component vector of float)
-0:260    Function Parameters: 
-0:260      'inF0' ( in 4-component vector of float)
-0:260      'inF1' ( in 4-component vector of float)
-0:260      'inF2' ( in 4-component vector of float)
-0:260      'inU0' ( in 4-component vector of uint)
-0:260      'inU1' ( in 4-component vector of uint)
+0:257  Function Definition: PixelShaderFunction(vf4;vf4;vf4;vu4;vu4; ( temp 4-component vector of float)
+0:257    Function Parameters: 
+0:257      'inF0' ( in 4-component vector of float)
+0:257      'inF1' ( in 4-component vector of float)
+0:257      'inF2' ( in 4-component vector of float)
+0:257      'inU0' ( in 4-component vector of uint)
+0:257      'inU1' ( in 4-component vector of uint)
 0:?     Sequence
+0:260      Sequence
+0:260        move second child to first child ( temp bool)
+0:260          'r000' ( temp bool)
+0:260          all ( temp bool)
+0:260            'inF0' ( in 4-component vector of float)
+0:261      Sequence
+0:261        move second child to first child ( temp 4-component vector of float)
+0:261          'r001' ( temp 4-component vector of float)
+0:261          Absolute value ( temp 4-component vector of float)
+0:261            'inF0' ( in 4-component vector of float)
+0:262      Sequence
+0:262        move second child to first child ( temp 4-component vector of float)
+0:262          'r002' ( temp 4-component vector of float)
+0:262          arc cosine ( temp 4-component vector of float)
+0:262            'inF0' ( in 4-component vector of float)
 0:263      Sequence
 0:263        move second child to first child ( temp bool)
-0:263          'r000' ( temp bool)
-0:263          all ( temp bool)
+0:263          'r003' ( temp bool)
+0:263          any ( temp bool)
 0:263            'inF0' ( in 4-component vector of float)
 0:264      Sequence
 0:264        move second child to first child ( temp 4-component vector of float)
-0:264          'r001' ( temp 4-component vector of float)
-0:264          Absolute value ( temp 4-component vector of float)
+0:264          'r004' ( temp 4-component vector of float)
+0:264          arc sine ( temp 4-component vector of float)
 0:264            'inF0' ( in 4-component vector of float)
 0:265      Sequence
-0:265        move second child to first child ( temp 4-component vector of float)
-0:265          'r002' ( temp 4-component vector of float)
-0:265          arc cosine ( temp 4-component vector of float)
+0:265        move second child to first child ( temp 4-component vector of int)
+0:265          'r005' ( temp 4-component vector of int)
+0:265          floatBitsToInt ( temp 4-component vector of int)
 0:265            'inF0' ( in 4-component vector of float)
 0:266      Sequence
-0:266        move second child to first child ( temp bool)
-0:266          'r003' ( temp bool)
-0:266          any ( temp bool)
+0:266        move second child to first child ( temp 4-component vector of uint)
+0:266          'r006' ( temp 4-component vector of uint)
+0:266          floatBitsToUint ( temp 4-component vector of uint)
 0:266            'inF0' ( in 4-component vector of float)
 0:267      Sequence
 0:267        move second child to first child ( temp 4-component vector of float)
-0:267          'r004' ( temp 4-component vector of float)
-0:267          arc sine ( temp 4-component vector of float)
-0:267            'inF0' ( in 4-component vector of float)
-0:268      Sequence
-0:268        move second child to first child ( temp 4-component vector of int)
-0:268          'r005' ( temp 4-component vector of int)
-0:268          floatBitsToInt ( temp 4-component vector of int)
-0:268            'inF0' ( in 4-component vector of float)
+0:267          'r007' ( temp 4-component vector of float)
+0:267          intBitsToFloat ( temp 4-component vector of float)
+0:267            'inU0' ( in 4-component vector of uint)
 0:269      Sequence
-0:269        move second child to first child ( temp 4-component vector of uint)
-0:269          'r006' ( temp 4-component vector of uint)
-0:269          floatBitsToUint ( temp 4-component vector of uint)
+0:269        move second child to first child ( temp 4-component vector of float)
+0:269          'r009' ( temp 4-component vector of float)
+0:269          arc tangent ( temp 4-component vector of float)
 0:269            'inF0' ( in 4-component vector of float)
 0:270      Sequence
 0:270        move second child to first child ( temp 4-component vector of float)
-0:270          'r007' ( temp 4-component vector of float)
-0:270          intBitsToFloat ( temp 4-component vector of float)
-0:270            'inU0' ( in 4-component vector of uint)
+0:270          'r010' ( temp 4-component vector of float)
+0:270          arc tangent ( temp 4-component vector of float)
+0:270            'inF0' ( in 4-component vector of float)
+0:270            'inF1' ( in 4-component vector of float)
+0:271      Sequence
+0:271        move second child to first child ( temp 4-component vector of float)
+0:271          'r011' ( temp 4-component vector of float)
+0:271          Ceiling ( temp 4-component vector of float)
+0:271            'inF0' ( in 4-component vector of float)
 0:272      Sequence
 0:272        move second child to first child ( temp 4-component vector of float)
-0:272          'r009' ( temp 4-component vector of float)
-0:272          arc tangent ( temp 4-component vector of float)
+0:272          'r012' ( temp 4-component vector of float)
+0:272          clamp ( temp 4-component vector of float)
 0:272            'inF0' ( in 4-component vector of float)
-0:273      Sequence
-0:273        move second child to first child ( temp 4-component vector of float)
-0:273          'r010' ( temp 4-component vector of float)
-0:273          arc tangent ( temp 4-component vector of float)
+0:272            'inF1' ( in 4-component vector of float)
+0:272            'inF2' ( in 4-component vector of float)
+0:273      Test condition and select ( temp void)
+0:273        Condition
+0:273        any ( temp bool)
+0:273          Compare Less Than ( temp 4-component vector of bool)
 0:273            'inF0' ( in 4-component vector of float)
-0:273            'inF1' ( in 4-component vector of float)
+0:273            Constant:
+0:273              0.000000
+0:273              0.000000
+0:273              0.000000
+0:273              0.000000
+0:273        true case
+0:273        Branch: Kill
 0:274      Sequence
 0:274        move second child to first child ( temp 4-component vector of float)
-0:274          'r011' ( temp 4-component vector of float)
-0:274          Ceiling ( temp 4-component vector of float)
+0:274          'r013' ( temp 4-component vector of float)
+0:274          cosine ( temp 4-component vector of float)
 0:274            'inF0' ( in 4-component vector of float)
 0:275      Sequence
 0:275        move second child to first child ( temp 4-component vector of float)
-0:275          'r012' ( temp 4-component vector of float)
-0:275          clamp ( temp 4-component vector of float)
+0:275          'r014' ( temp 4-component vector of float)
+0:275          hyp. cosine ( temp 4-component vector of float)
 0:275            'inF0' ( in 4-component vector of float)
-0:275            'inF1' ( in 4-component vector of float)
-0:275            'inF2' ( in 4-component vector of float)
-0:276      Test condition and select ( temp void)
-0:276        Condition
-0:276        any ( temp bool)
-0:276          Compare Less Than ( temp 4-component vector of bool)
-0:276            'inF0' ( in 4-component vector of float)
-0:276            Constant:
-0:276              0.000000
-0:276              0.000000
-0:276              0.000000
-0:276              0.000000
-0:276        true case
-0:276        Branch: Kill
-0:277      Sequence
-0:277        move second child to first child ( temp 4-component vector of float)
-0:277          'r013' ( temp 4-component vector of float)
-0:277          cosine ( temp 4-component vector of float)
-0:277            'inF0' ( in 4-component vector of float)
-0:278      Sequence
-0:278        move second child to first child ( temp 4-component vector of float)
-0:278          'r014' ( temp 4-component vector of float)
-0:278          hyp. cosine ( temp 4-component vector of float)
-0:278            'inF0' ( in 4-component vector of float)
-0:279      Sequence
-0:279        move second child to first child ( temp 4-component vector of uint)
-0:279          'r015' ( temp 4-component vector of uint)
+0:276      Sequence
+0:276        move second child to first child ( temp 4-component vector of uint)
+0:276          'r015' ( temp 4-component vector of uint)
 0:?           bitCount ( temp 4-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               3 (const uint)
 0:?               5 (const uint)
 0:?               2 (const uint)
+0:277      Sequence
+0:277        move second child to first child ( temp 4-component vector of float)
+0:277          'r016' ( temp 4-component vector of float)
+0:277          dPdx ( temp 4-component vector of float)
+0:277            'inF0' ( in 4-component vector of float)
+0:278      Sequence
+0:278        move second child to first child ( temp 4-component vector of float)
+0:278          'r017' ( temp 4-component vector of float)
+0:278          dPdxCoarse ( temp 4-component vector of float)
+0:278            'inF0' ( in 4-component vector of float)
+0:279      Sequence
+0:279        move second child to first child ( temp 4-component vector of float)
+0:279          'r018' ( temp 4-component vector of float)
+0:279          dPdxFine ( temp 4-component vector of float)
+0:279            'inF0' ( in 4-component vector of float)
 0:280      Sequence
 0:280        move second child to first child ( temp 4-component vector of float)
-0:280          'r016' ( temp 4-component vector of float)
-0:280          dPdx ( temp 4-component vector of float)
+0:280          'r019' ( temp 4-component vector of float)
+0:280          dPdy ( temp 4-component vector of float)
 0:280            'inF0' ( in 4-component vector of float)
 0:281      Sequence
 0:281        move second child to first child ( temp 4-component vector of float)
-0:281          'r017' ( temp 4-component vector of float)
-0:281          dPdxCoarse ( temp 4-component vector of float)
+0:281          'r020' ( temp 4-component vector of float)
+0:281          dPdyCoarse ( temp 4-component vector of float)
 0:281            'inF0' ( in 4-component vector of float)
 0:282      Sequence
 0:282        move second child to first child ( temp 4-component vector of float)
-0:282          'r018' ( temp 4-component vector of float)
-0:282          dPdxFine ( temp 4-component vector of float)
+0:282          'r021' ( temp 4-component vector of float)
+0:282          dPdyFine ( temp 4-component vector of float)
 0:282            'inF0' ( in 4-component vector of float)
 0:283      Sequence
 0:283        move second child to first child ( temp 4-component vector of float)
-0:283          'r019' ( temp 4-component vector of float)
-0:283          dPdy ( temp 4-component vector of float)
+0:283          'r022' ( temp 4-component vector of float)
+0:283          degrees ( temp 4-component vector of float)
 0:283            'inF0' ( in 4-component vector of float)
 0:284      Sequence
-0:284        move second child to first child ( temp 4-component vector of float)
-0:284          'r020' ( temp 4-component vector of float)
-0:284          dPdyCoarse ( temp 4-component vector of float)
+0:284        move second child to first child ( temp float)
+0:284          'r023' ( temp float)
+0:284          distance ( temp float)
 0:284            'inF0' ( in 4-component vector of float)
+0:284            'inF1' ( in 4-component vector of float)
 0:285      Sequence
-0:285        move second child to first child ( temp 4-component vector of float)
-0:285          'r021' ( temp 4-component vector of float)
-0:285          dPdyFine ( temp 4-component vector of float)
+0:285        move second child to first child ( temp float)
+0:285          'r024' ( temp float)
+0:285          dot-product ( temp float)
 0:285            'inF0' ( in 4-component vector of float)
+0:285            'inF1' ( in 4-component vector of float)
 0:286      Sequence
 0:286        move second child to first child ( temp 4-component vector of float)
-0:286          'r022' ( temp 4-component vector of float)
-0:286          degrees ( temp 4-component vector of float)
-0:286            'inF0' ( in 4-component vector of float)
-0:287      Sequence
-0:287        move second child to first child ( temp float)
-0:287          'r023' ( temp float)
-0:287          distance ( temp float)
-0:287            'inF0' ( in 4-component vector of float)
-0:287            'inF1' ( in 4-component vector of float)
-0:288      Sequence
-0:288        move second child to first child ( temp float)
-0:288          'r024' ( temp float)
-0:288          dot-product ( temp float)
-0:288            'inF0' ( in 4-component vector of float)
-0:288            'inF1' ( in 4-component vector of float)
-0:289      Sequence
-0:289        move second child to first child ( temp 4-component vector of float)
-0:289          'r025' ( temp 4-component vector of float)
-0:289          Construct vec4 ( temp 4-component vector of float)
-0:289            Constant:
-0:289              1.000000
-0:289            component-wise multiply ( temp float)
-0:289              direct index ( temp float)
-0:289                'inF0' ( in 4-component vector of float)
-0:289                Constant:
-0:289                  1 (const int)
-0:289              direct index ( temp float)
-0:289                'inF1' ( in 4-component vector of float)
-0:289                Constant:
-0:289                  1 (const int)
-0:289            direct index ( temp float)
-0:289              'inF0' ( in 4-component vector of float)
-0:289              Constant:
-0:289                2 (const int)
-0:289            direct index ( temp float)
-0:289              'inF1' ( in 4-component vector of float)
-0:289              Constant:
-0:289                3 (const int)
+0:286          'r025' ( temp 4-component vector of float)
+0:286          Construct vec4 ( temp 4-component vector of float)
+0:286            Constant:
+0:286              1.000000
+0:286            component-wise multiply ( temp float)
+0:286              direct index ( temp float)
+0:286                'inF0' ( in 4-component vector of float)
+0:286                Constant:
+0:286                  1 (const int)
+0:286              direct index ( temp float)
+0:286                'inF1' ( in 4-component vector of float)
+0:286                Constant:
+0:286                  1 (const int)
+0:286            direct index ( temp float)
+0:286              'inF0' ( in 4-component vector of float)
+0:286              Constant:
+0:286                2 (const int)
+0:286            direct index ( temp float)
+0:286              'inF1' ( in 4-component vector of float)
+0:286              Constant:
+0:286                3 (const int)
+0:290      Sequence
+0:290        move second child to first child ( temp 4-component vector of float)
+0:290          'r029' ( temp 4-component vector of float)
+0:290          exp ( temp 4-component vector of float)
+0:290            'inF0' ( in 4-component vector of float)
+0:291      Sequence
+0:291        move second child to first child ( temp 4-component vector of float)
+0:291          'r030' ( temp 4-component vector of float)
+0:291          exp2 ( temp 4-component vector of float)
+0:291            'inF0' ( in 4-component vector of float)
+0:292      Sequence
+0:292        move second child to first child ( temp 4-component vector of float)
+0:292          'r031' ( temp 4-component vector of float)
+0:292          face-forward ( temp 4-component vector of float)
+0:292            'inF0' ( in 4-component vector of float)
+0:292            'inF1' ( in 4-component vector of float)
+0:292            'inF2' ( in 4-component vector of float)
 0:293      Sequence
-0:293        move second child to first child ( temp 4-component vector of float)
-0:293          'r029' ( temp 4-component vector of float)
-0:293          exp ( temp 4-component vector of float)
-0:293            'inF0' ( in 4-component vector of float)
-0:294      Sequence
-0:294        move second child to first child ( temp 4-component vector of float)
-0:294          'r030' ( temp 4-component vector of float)
-0:294          exp2 ( temp 4-component vector of float)
-0:294            'inF0' ( in 4-component vector of float)
-0:295      Sequence
-0:295        move second child to first child ( temp 4-component vector of float)
-0:295          'r031' ( temp 4-component vector of float)
-0:295          face-forward ( temp 4-component vector of float)
-0:295            'inF0' ( in 4-component vector of float)
-0:295            'inF1' ( in 4-component vector of float)
-0:295            'inF2' ( in 4-component vector of float)
-0:296      Sequence
-0:296        move second child to first child ( temp 4-component vector of uint)
-0:296          'r032' ( temp 4-component vector of uint)
+0:293        move second child to first child ( temp 4-component vector of uint)
+0:293          'r032' ( temp 4-component vector of uint)
 0:?           findMSB ( temp 4-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               8 (const uint)
 0:?               9 (const uint)
 0:?               10 (const uint)
-0:297      Sequence
-0:297        move second child to first child ( temp 4-component vector of uint)
-0:297          'r033' ( temp 4-component vector of uint)
+0:294      Sequence
+0:294        move second child to first child ( temp 4-component vector of uint)
+0:294          'r033' ( temp 4-component vector of uint)
 0:?           findLSB ( temp 4-component vector of uint)
 0:?             Constant:
 0:?               7 (const uint)
 0:?               8 (const uint)
 0:?               9 (const uint)
 0:?               10 (const uint)
+0:295      Sequence
+0:295        move second child to first child ( temp 4-component vector of float)
+0:295          'r034' ( temp 4-component vector of float)
+0:295          Floor ( temp 4-component vector of float)
+0:295            'inF0' ( in 4-component vector of float)
+0:297      Sequence
+0:297        move second child to first child ( temp 4-component vector of float)
+0:297          'r036' ( temp 4-component vector of float)
+0:297          mod ( temp 4-component vector of float)
+0:297            'inF0' ( in 4-component vector of float)
+0:297            'inF1' ( in 4-component vector of float)
 0:298      Sequence
 0:298        move second child to first child ( temp 4-component vector of float)
-0:298          'r034' ( temp 4-component vector of float)
-0:298          Floor ( temp 4-component vector of float)
+0:298          'r037' ( temp 4-component vector of float)
+0:298          Fraction ( temp 4-component vector of float)
 0:298            'inF0' ( in 4-component vector of float)
+0:299      Sequence
+0:299        move second child to first child ( temp 4-component vector of float)
+0:299          'r039' ( temp 4-component vector of float)
+0:299          fwidth ( temp 4-component vector of float)
+0:299            'inF0' ( in 4-component vector of float)
 0:300      Sequence
-0:300        move second child to first child ( temp 4-component vector of float)
-0:300          'r036' ( temp 4-component vector of float)
-0:300          mod ( temp 4-component vector of float)
+0:300        move second child to first child ( temp 4-component vector of bool)
+0:300          'r040' ( temp 4-component vector of bool)
+0:300          isinf ( temp 4-component vector of bool)
 0:300            'inF0' ( in 4-component vector of float)
-0:300            'inF1' ( in 4-component vector of float)
 0:301      Sequence
-0:301        move second child to first child ( temp 4-component vector of float)
-0:301          'r037' ( temp 4-component vector of float)
-0:301          Fraction ( temp 4-component vector of float)
+0:301        move second child to first child ( temp 4-component vector of bool)
+0:301          'r041' ( temp 4-component vector of bool)
+0:301          isnan ( temp 4-component vector of bool)
 0:301            'inF0' ( in 4-component vector of float)
 0:302      Sequence
 0:302        move second child to first child ( temp 4-component vector of float)
-0:302          'r038' ( temp 4-component vector of float)
-0:302          frexp ( temp 4-component vector of float)
+0:302          'r042' ( temp 4-component vector of float)
+0:302          ldexp ( temp 4-component vector of float)
 0:302            'inF0' ( in 4-component vector of float)
 0:302            'inF1' ( in 4-component vector of float)
 0:303      Sequence
 0:303        move second child to first child ( temp 4-component vector of float)
-0:303          'r039' ( temp 4-component vector of float)
-0:303          fwidth ( temp 4-component vector of float)
+0:303          'r039a' ( temp 4-component vector of float)
+0:303          mix ( temp 4-component vector of float)
 0:303            'inF0' ( in 4-component vector of float)
+0:303            'inF1' ( in 4-component vector of float)
+0:303            'inF2' ( in 4-component vector of float)
 0:304      Sequence
-0:304        move second child to first child ( temp 4-component vector of bool)
-0:304          'r040' ( temp 4-component vector of bool)
-0:304          isinf ( temp 4-component vector of bool)
+0:304        move second child to first child ( temp float)
+0:304          'r043' ( temp float)
+0:304          length ( temp float)
 0:304            'inF0' ( in 4-component vector of float)
 0:305      Sequence
-0:305        move second child to first child ( temp 4-component vector of bool)
-0:305          'r041' ( temp 4-component vector of bool)
-0:305          isnan ( temp 4-component vector of bool)
+0:305        move second child to first child ( temp 4-component vector of float)
+0:305          'r044' ( temp 4-component vector of float)
+0:305          log ( temp 4-component vector of float)
 0:305            'inF0' ( in 4-component vector of float)
 0:306      Sequence
 0:306        move second child to first child ( temp 4-component vector of float)
-0:306          'r042' ( temp 4-component vector of float)
-0:306          ldexp ( temp 4-component vector of float)
-0:306            'inF0' ( in 4-component vector of float)
-0:306            'inF1' ( in 4-component vector of float)
+0:306          'r045' ( temp 4-component vector of float)
+0:306          vector-scale ( temp 4-component vector of float)
+0:306            log2 ( temp 4-component vector of float)
+0:306              'inF0' ( in 4-component vector of float)
+0:306            Constant:
+0:306              0.301030
 0:307      Sequence
 0:307        move second child to first child ( temp 4-component vector of float)
-0:307          'r039a' ( temp 4-component vector of float)
-0:307          mix ( temp 4-component vector of float)
+0:307          'r046' ( temp 4-component vector of float)
+0:307          log2 ( temp 4-component vector of float)
 0:307            'inF0' ( in 4-component vector of float)
-0:307            'inF1' ( in 4-component vector of float)
-0:307            'inF2' ( in 4-component vector of float)
 0:308      Sequence
-0:308        move second child to first child ( temp float)
-0:308          'r043' ( temp float)
-0:308          length ( temp float)
+0:308        move second child to first child ( temp 4-component vector of float)
+0:308          'r047' ( temp 4-component vector of float)
+0:308          max ( temp 4-component vector of float)
 0:308            'inF0' ( in 4-component vector of float)
+0:308            'inF1' ( in 4-component vector of float)
 0:309      Sequence
 0:309        move second child to first child ( temp 4-component vector of float)
-0:309          'r044' ( temp 4-component vector of float)
-0:309          log ( temp 4-component vector of float)
+0:309          'r048' ( temp 4-component vector of float)
+0:309          min ( temp 4-component vector of float)
 0:309            'inF0' ( in 4-component vector of float)
+0:309            'inF1' ( in 4-component vector of float)
 0:310      Sequence
 0:310        move second child to first child ( temp 4-component vector of float)
-0:310          'r045' ( temp 4-component vector of float)
-0:310          vector-scale ( temp 4-component vector of float)
-0:310            log2 ( temp 4-component vector of float)
-0:310              'inF0' ( in 4-component vector of float)
-0:310            Constant:
-0:310              0.301030
+0:310          'r049' ( temp 4-component vector of float)
+0:310          normalize ( temp 4-component vector of float)
+0:310            'inF0' ( in 4-component vector of float)
 0:311      Sequence
 0:311        move second child to first child ( temp 4-component vector of float)
-0:311          'r046' ( temp 4-component vector of float)
-0:311          log2 ( temp 4-component vector of float)
+0:311          'r050' ( temp 4-component vector of float)
+0:311          pow ( temp 4-component vector of float)
 0:311            'inF0' ( in 4-component vector of float)
+0:311            'inF1' ( in 4-component vector of float)
 0:312      Sequence
 0:312        move second child to first child ( temp 4-component vector of float)
-0:312          'r047' ( temp 4-component vector of float)
-0:312          max ( temp 4-component vector of float)
+0:312          'r051' ( temp 4-component vector of float)
+0:312          radians ( temp 4-component vector of float)
 0:312            'inF0' ( in 4-component vector of float)
-0:312            'inF1' ( in 4-component vector of float)
 0:313      Sequence
 0:313        move second child to first child ( temp 4-component vector of float)
-0:313          'r048' ( temp 4-component vector of float)
-0:313          min ( temp 4-component vector of float)
+0:313          'r052' ( temp 4-component vector of float)
+0:313          divide ( temp 4-component vector of float)
+0:313            Constant:
+0:313              1.000000
 0:313            'inF0' ( in 4-component vector of float)
-0:313            'inF1' ( in 4-component vector of float)
 0:314      Sequence
 0:314        move second child to first child ( temp 4-component vector of float)
-0:314          'r049' ( temp 4-component vector of float)
-0:314          normalize ( temp 4-component vector of float)
+0:314          'r053' ( temp 4-component vector of float)
+0:314          reflect ( temp 4-component vector of float)
 0:314            'inF0' ( in 4-component vector of float)
+0:314            'inF1' ( in 4-component vector of float)
 0:315      Sequence
 0:315        move second child to first child ( temp 4-component vector of float)
-0:315          'r050' ( temp 4-component vector of float)
-0:315          pow ( temp 4-component vector of float)
+0:315          'r054' ( temp 4-component vector of float)
+0:315          refract ( temp 4-component vector of float)
 0:315            'inF0' ( in 4-component vector of float)
 0:315            'inF1' ( in 4-component vector of float)
+0:315            Constant:
+0:315              2.000000
 0:316      Sequence
-0:316        move second child to first child ( temp 4-component vector of float)
-0:316          'r051' ( temp 4-component vector of float)
-0:316          radians ( temp 4-component vector of float)
-0:316            'inF0' ( in 4-component vector of float)
-0:317      Sequence
-0:317        move second child to first child ( temp 4-component vector of float)
-0:317          'r052' ( temp 4-component vector of float)
-0:317          divide ( temp 4-component vector of float)
-0:317            Constant:
-0:317              1.000000
-0:317            'inF0' ( in 4-component vector of float)
-0:318      Sequence
-0:318        move second child to first child ( temp 4-component vector of float)
-0:318          'r053' ( temp 4-component vector of float)
-0:318          reflect ( temp 4-component vector of float)
-0:318            'inF0' ( in 4-component vector of float)
-0:318            'inF1' ( in 4-component vector of float)
-0:319      Sequence
-0:319        move second child to first child ( temp 4-component vector of float)
-0:319          'r054' ( temp 4-component vector of float)
-0:319          refract ( temp 4-component vector of float)
-0:319            'inF0' ( in 4-component vector of float)
-0:319            'inF1' ( in 4-component vector of float)
-0:319            Constant:
-0:319              2.000000
-0:320      Sequence
-0:320        move second child to first child ( temp 4-component vector of uint)
-0:320          'r055' ( temp 4-component vector of uint)
+0:316        move second child to first child ( temp 4-component vector of uint)
+0:316          'r055' ( temp 4-component vector of uint)
 0:?           bitFieldReverse ( temp 4-component vector of uint)
 0:?             Constant:
 0:?               1 (const uint)
 0:?               2 (const uint)
 0:?               3 (const uint)
 0:?               4 (const uint)
+0:317      Sequence
+0:317        move second child to first child ( temp 4-component vector of float)
+0:317          'r056' ( temp 4-component vector of float)
+0:317          roundEven ( temp 4-component vector of float)
+0:317            'inF0' ( in 4-component vector of float)
+0:318      Sequence
+0:318        move second child to first child ( temp 4-component vector of float)
+0:318          'r057' ( temp 4-component vector of float)
+0:318          inverse sqrt ( temp 4-component vector of float)
+0:318            'inF0' ( in 4-component vector of float)
+0:319      Sequence
+0:319        move second child to first child ( temp 4-component vector of float)
+0:319          'r058' ( temp 4-component vector of float)
+0:319          clamp ( temp 4-component vector of float)
+0:319            'inF0' ( in 4-component vector of float)
+0:319            Constant:
+0:319              0.000000
+0:319            Constant:
+0:319              1.000000
+0:320      Sequence
+0:320        move second child to first child ( temp 4-component vector of float)
+0:320          'r059' ( temp 4-component vector of float)
+0:320          Sign ( temp 4-component vector of float)
+0:320            'inF0' ( in 4-component vector of float)
 0:321      Sequence
 0:321        move second child to first child ( temp 4-component vector of float)
-0:321          'r056' ( temp 4-component vector of float)
-0:321          roundEven ( temp 4-component vector of float)
+0:321          'r060' ( temp 4-component vector of float)
+0:321          sine ( temp 4-component vector of float)
 0:321            'inF0' ( in 4-component vector of float)
 0:322      Sequence
 0:322        move second child to first child ( temp 4-component vector of float)
-0:322          'r057' ( temp 4-component vector of float)
-0:322          inverse sqrt ( temp 4-component vector of float)
+0:322          'inF1' ( in 4-component vector of float)
+0:322          sine ( temp 4-component vector of float)
+0:322            'inF0' ( in 4-component vector of float)
+0:322        move second child to first child ( temp 4-component vector of float)
+0:322          'inF2' ( in 4-component vector of float)
+0:322          cosine ( temp 4-component vector of float)
 0:322            'inF0' ( in 4-component vector of float)
 0:323      Sequence
 0:323        move second child to first child ( temp 4-component vector of float)
-0:323          'r058' ( temp 4-component vector of float)
-0:323          clamp ( temp 4-component vector of float)
+0:323          'r061' ( temp 4-component vector of float)
+0:323          hyp. sine ( temp 4-component vector of float)
 0:323            'inF0' ( in 4-component vector of float)
-0:323            Constant:
-0:323              0.000000
-0:323            Constant:
-0:323              1.000000
 0:324      Sequence
 0:324        move second child to first child ( temp 4-component vector of float)
-0:324          'r059' ( temp 4-component vector of float)
-0:324          Sign ( temp 4-component vector of float)
+0:324          'r062' ( temp 4-component vector of float)
+0:324          smoothstep ( temp 4-component vector of float)
 0:324            'inF0' ( in 4-component vector of float)
+0:324            'inF1' ( in 4-component vector of float)
+0:324            'inF2' ( in 4-component vector of float)
 0:325      Sequence
 0:325        move second child to first child ( temp 4-component vector of float)
-0:325          'r060' ( temp 4-component vector of float)
-0:325          sine ( temp 4-component vector of float)
+0:325          'r063' ( temp 4-component vector of float)
+0:325          sqrt ( temp 4-component vector of float)
 0:325            'inF0' ( in 4-component vector of float)
 0:326      Sequence
 0:326        move second child to first child ( temp 4-component vector of float)
-0:326          'inF1' ( in 4-component vector of float)
-0:326          sine ( temp 4-component vector of float)
-0:326            'inF0' ( in 4-component vector of float)
-0:326        move second child to first child ( temp 4-component vector of float)
-0:326          'inF2' ( in 4-component vector of float)
-0:326          cosine ( temp 4-component vector of float)
+0:326          'r064' ( temp 4-component vector of float)
+0:326          step ( temp 4-component vector of float)
 0:326            'inF0' ( in 4-component vector of float)
+0:326            'inF1' ( in 4-component vector of float)
 0:327      Sequence
 0:327        move second child to first child ( temp 4-component vector of float)
-0:327          'r061' ( temp 4-component vector of float)
-0:327          hyp. sine ( temp 4-component vector of float)
+0:327          'r065' ( temp 4-component vector of float)
+0:327          tangent ( temp 4-component vector of float)
 0:327            'inF0' ( in 4-component vector of float)
 0:328      Sequence
 0:328        move second child to first child ( temp 4-component vector of float)
-0:328          'r062' ( temp 4-component vector of float)
-0:328          smoothstep ( temp 4-component vector of float)
+0:328          'r066' ( temp 4-component vector of float)
+0:328          hyp. tangent ( temp 4-component vector of float)
 0:328            'inF0' ( in 4-component vector of float)
-0:328            'inF1' ( in 4-component vector of float)
-0:328            'inF2' ( in 4-component vector of float)
-0:329      Sequence
-0:329        move second child to first child ( temp 4-component vector of float)
-0:329          'r063' ( temp 4-component vector of float)
-0:329          sqrt ( temp 4-component vector of float)
-0:329            'inF0' ( in 4-component vector of float)
 0:330      Sequence
 0:330        move second child to first child ( temp 4-component vector of float)
-0:330          'r064' ( temp 4-component vector of float)
-0:330          step ( temp 4-component vector of float)
+0:330          'r067' ( temp 4-component vector of float)
+0:330          trunc ( temp 4-component vector of float)
 0:330            'inF0' ( in 4-component vector of float)
-0:330            'inF1' ( in 4-component vector of float)
-0:331      Sequence
-0:331        move second child to first child ( temp 4-component vector of float)
-0:331          'r065' ( temp 4-component vector of float)
-0:331          tangent ( temp 4-component vector of float)
-0:331            'inF0' ( in 4-component vector of float)
-0:332      Sequence
-0:332        move second child to first child ( temp 4-component vector of float)
-0:332          'r066' ( temp 4-component vector of float)
-0:332          hyp. tangent ( temp 4-component vector of float)
-0:332            'inF0' ( in 4-component vector of float)
-0:334      Sequence
-0:334        move second child to first child ( temp 4-component vector of float)
-0:334          'r067' ( temp 4-component vector of float)
-0:334          trunc ( temp 4-component vector of float)
-0:334            'inF0' ( in 4-component vector of float)
-0:337      Branch: Return with expression
+0:333      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
 0:?           3.000000
 0:?           4.000000
-0:401  Function Definition: PixelShaderFunction2x2(mf22;mf22;mf22; ( temp 2X2 matrix of float)
-0:401    Function Parameters: 
-0:401      'inF0' ( in 2X2 matrix of float)
-0:401      'inF1' ( in 2X2 matrix of float)
-0:401      'inF2' ( in 2X2 matrix of float)
+0:396  Function Definition: PixelShaderFunction2x2(mf22;mf22;mf22; ( temp 2X2 matrix of float)
+0:396    Function Parameters: 
+0:396      'inF0' ( in 2X2 matrix of float)
+0:396      'inF1' ( in 2X2 matrix of float)
+0:396      'inF2' ( in 2X2 matrix of float)
 0:?     Sequence
-0:403      Sequence
-0:403        move second child to first child ( temp bool)
-0:403          'r000' ( temp bool)
-0:403          all ( temp bool)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r001' ( temp 2X2 matrix of float)
-0:403          Absolute value ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      arc cosine ( temp 2X2 matrix of float)
-0:403        'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp bool)
-0:403          'r003' ( temp bool)
-0:403          any ( temp bool)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r004' ( temp 2X2 matrix of float)
-0:403          arc sine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r005' ( temp 2X2 matrix of float)
-0:403          arc tangent ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r006' ( temp 2X2 matrix of float)
-0:403          arc tangent ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r007' ( temp 2X2 matrix of float)
-0:403          Ceiling ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Test condition and select ( temp void)
-0:403        Condition
-0:403        any ( temp bool)
-0:403          Compare Less Than ( temp 2X2 matrix of bool)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            Constant:
-0:403              0.000000
-0:403              0.000000
-0:403              0.000000
-0:403              0.000000
-0:403        true case
-0:403        Branch: Kill
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r008' ( temp 2X2 matrix of float)
-0:403          clamp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403            'inF2' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r009' ( temp 2X2 matrix of float)
-0:403          cosine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r010' ( temp 2X2 matrix of float)
-0:403          hyp. cosine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r011' ( temp 2X2 matrix of float)
-0:403          dPdx ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r012' ( temp 2X2 matrix of float)
-0:403          dPdxCoarse ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r013' ( temp 2X2 matrix of float)
-0:403          dPdxFine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r014' ( temp 2X2 matrix of float)
-0:403          dPdy ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r015' ( temp 2X2 matrix of float)
-0:403          dPdyCoarse ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r016' ( temp 2X2 matrix of float)
-0:403          dPdyFine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r017' ( temp 2X2 matrix of float)
-0:403          degrees ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp float)
-0:403          'r018' ( temp float)
-0:403          determinant ( temp float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r019' ( temp 2X2 matrix of float)
-0:403          exp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'R020' ( temp 2X2 matrix of float)
-0:403          exp2 ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r021' ( temp 2X2 matrix of float)
-0:403          Floor ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r022' ( temp 2X2 matrix of float)
-0:403          mod ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r023' ( temp 2X2 matrix of float)
-0:403          Fraction ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r024' ( temp 2X2 matrix of float)
-0:403          frexp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r025' ( temp 2X2 matrix of float)
-0:403          fwidth ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r026' ( temp 2X2 matrix of float)
-0:403          ldexp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r026a' ( temp 2X2 matrix of float)
-0:403          mix ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403            'inF2' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r027' ( temp 2X2 matrix of float)
-0:403          log ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r028' ( temp 2X2 matrix of float)
-0:403          matrix-scale ( temp 2X2 matrix of float)
-0:403            log2 ( temp 2X2 matrix of float)
-0:403              'inF0' ( in 2X2 matrix of float)
-0:403            Constant:
-0:403              0.301030
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r029' ( temp 2X2 matrix of float)
-0:403          log2 ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r030' ( temp 2X2 matrix of float)
-0:403          max ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r031' ( temp 2X2 matrix of float)
-0:403          min ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r032' ( temp 2X2 matrix of float)
-0:403          pow ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r033' ( temp 2X2 matrix of float)
-0:403          radians ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r034' ( temp 2X2 matrix of float)
-0:403          roundEven ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r035' ( temp 2X2 matrix of float)
-0:403          inverse sqrt ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r036' ( temp 2X2 matrix of float)
-0:403          clamp ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            Constant:
-0:403              0.000000
-0:403            Constant:
-0:403              1.000000
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r037' ( temp 2X2 matrix of float)
-0:403          Sign ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r038' ( temp 2X2 matrix of float)
-0:403          sine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'inF1' ( in 2X2 matrix of float)
-0:403          sine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'inF2' ( in 2X2 matrix of float)
-0:403          cosine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r039' ( temp 2X2 matrix of float)
-0:403          hyp. sine ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r049' ( temp 2X2 matrix of float)
-0:403          smoothstep ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403            'inF2' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r041' ( temp 2X2 matrix of float)
-0:403          sqrt ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r042' ( temp 2X2 matrix of float)
-0:403          step ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403            'inF1' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r043' ( temp 2X2 matrix of float)
-0:403          tangent ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r044' ( temp 2X2 matrix of float)
-0:403          hyp. tangent ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:403      transpose ( temp 2X2 matrix of float)
-0:403        'inF0' ( in 2X2 matrix of float)
-0:403      Sequence
-0:403        move second child to first child ( temp 2X2 matrix of float)
-0:403          'r046' ( temp 2X2 matrix of float)
-0:403          trunc ( temp 2X2 matrix of float)
-0:403            'inF0' ( in 2X2 matrix of float)
-0:406      Branch: Return with expression
+0:398      Sequence
+0:398        move second child to first child ( temp bool)
+0:398          'r000' ( temp bool)
+0:398          all ( temp bool)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r001' ( temp 2X2 matrix of float)
+0:398          Absolute value ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      arc cosine ( temp 2X2 matrix of float)
+0:398        'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp bool)
+0:398          'r003' ( temp bool)
+0:398          any ( temp bool)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r004' ( temp 2X2 matrix of float)
+0:398          arc sine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r005' ( temp 2X2 matrix of float)
+0:398          arc tangent ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r006' ( temp 2X2 matrix of float)
+0:398          arc tangent ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r007' ( temp 2X2 matrix of float)
+0:398          Ceiling ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Test condition and select ( temp void)
+0:398        Condition
+0:398        any ( temp bool)
+0:398          Compare Less Than ( temp 2X2 matrix of bool)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            Constant:
+0:398              0.000000
+0:398              0.000000
+0:398              0.000000
+0:398              0.000000
+0:398        true case
+0:398        Branch: Kill
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r008' ( temp 2X2 matrix of float)
+0:398          clamp ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398            'inF2' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r009' ( temp 2X2 matrix of float)
+0:398          cosine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r010' ( temp 2X2 matrix of float)
+0:398          hyp. cosine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r011' ( temp 2X2 matrix of float)
+0:398          dPdx ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r012' ( temp 2X2 matrix of float)
+0:398          dPdxCoarse ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r013' ( temp 2X2 matrix of float)
+0:398          dPdxFine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r014' ( temp 2X2 matrix of float)
+0:398          dPdy ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r015' ( temp 2X2 matrix of float)
+0:398          dPdyCoarse ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r016' ( temp 2X2 matrix of float)
+0:398          dPdyFine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r017' ( temp 2X2 matrix of float)
+0:398          degrees ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp float)
+0:398          'r018' ( temp float)
+0:398          determinant ( temp float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r019' ( temp 2X2 matrix of float)
+0:398          exp ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'R020' ( temp 2X2 matrix of float)
+0:398          exp2 ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r021' ( temp 2X2 matrix of float)
+0:398          Floor ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r022' ( temp 2X2 matrix of float)
+0:398          mod ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r023' ( temp 2X2 matrix of float)
+0:398          Fraction ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r025' ( temp 2X2 matrix of float)
+0:398          fwidth ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r026' ( temp 2X2 matrix of float)
+0:398          ldexp ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r026a' ( temp 2X2 matrix of float)
+0:398          mix ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398            'inF2' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r027' ( temp 2X2 matrix of float)
+0:398          log ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r028' ( temp 2X2 matrix of float)
+0:398          matrix-scale ( temp 2X2 matrix of float)
+0:398            log2 ( temp 2X2 matrix of float)
+0:398              'inF0' ( in 2X2 matrix of float)
+0:398            Constant:
+0:398              0.301030
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r029' ( temp 2X2 matrix of float)
+0:398          log2 ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r030' ( temp 2X2 matrix of float)
+0:398          max ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r031' ( temp 2X2 matrix of float)
+0:398          min ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r032' ( temp 2X2 matrix of float)
+0:398          pow ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r033' ( temp 2X2 matrix of float)
+0:398          radians ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r034' ( temp 2X2 matrix of float)
+0:398          roundEven ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r035' ( temp 2X2 matrix of float)
+0:398          inverse sqrt ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r036' ( temp 2X2 matrix of float)
+0:398          clamp ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            Constant:
+0:398              0.000000
+0:398            Constant:
+0:398              1.000000
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r037' ( temp 2X2 matrix of float)
+0:398          Sign ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r038' ( temp 2X2 matrix of float)
+0:398          sine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'inF1' ( in 2X2 matrix of float)
+0:398          sine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'inF2' ( in 2X2 matrix of float)
+0:398          cosine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r039' ( temp 2X2 matrix of float)
+0:398          hyp. sine ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r049' ( temp 2X2 matrix of float)
+0:398          smoothstep ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398            'inF2' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r041' ( temp 2X2 matrix of float)
+0:398          sqrt ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r042' ( temp 2X2 matrix of float)
+0:398          step ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398            'inF1' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r043' ( temp 2X2 matrix of float)
+0:398          tangent ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r044' ( temp 2X2 matrix of float)
+0:398          hyp. tangent ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:398      transpose ( temp 2X2 matrix of float)
+0:398        'inF0' ( in 2X2 matrix of float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2X2 matrix of float)
+0:398          'r046' ( temp 2X2 matrix of float)
+0:398          trunc ( temp 2X2 matrix of float)
+0:398            'inF0' ( in 2X2 matrix of float)
+0:401      Branch: Return with expression
 0:?         Constant:
 0:?           2.000000
 0:?           2.000000
 0:?           2.000000
 0:?           2.000000
-0:410  Function Definition: PixelShaderFunction3x3(mf33;mf33;mf33; ( temp 3X3 matrix of float)
-0:410    Function Parameters: 
-0:410      'inF0' ( in 3X3 matrix of float)
-0:410      'inF1' ( in 3X3 matrix of float)
-0:410      'inF2' ( in 3X3 matrix of float)
+0:405  Function Definition: PixelShaderFunction3x3(mf33;mf33;mf33; ( temp 3X3 matrix of float)
+0:405    Function Parameters: 
+0:405      'inF0' ( in 3X3 matrix of float)
+0:405      'inF1' ( in 3X3 matrix of float)
+0:405      'inF2' ( in 3X3 matrix of float)
 0:?     Sequence
-0:412      Sequence
-0:412        move second child to first child ( temp bool)
-0:412          'r000' ( temp bool)
-0:412          all ( temp bool)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r001' ( temp 3X3 matrix of float)
-0:412          Absolute value ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      arc cosine ( temp 3X3 matrix of float)
-0:412        'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp bool)
-0:412          'r003' ( temp bool)
-0:412          any ( temp bool)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r004' ( temp 3X3 matrix of float)
-0:412          arc sine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r005' ( temp 3X3 matrix of float)
-0:412          arc tangent ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r006' ( temp 3X3 matrix of float)
-0:412          arc tangent ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r007' ( temp 3X3 matrix of float)
-0:412          Ceiling ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Test condition and select ( temp void)
-0:412        Condition
-0:412        any ( temp bool)
-0:412          Compare Less Than ( temp 3X3 matrix of bool)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            Constant:
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412              0.000000
-0:412        true case
-0:412        Branch: Kill
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r008' ( temp 3X3 matrix of float)
-0:412          clamp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412            'inF2' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r009' ( temp 3X3 matrix of float)
-0:412          cosine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r010' ( temp 3X3 matrix of float)
-0:412          hyp. cosine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r011' ( temp 3X3 matrix of float)
-0:412          dPdx ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r012' ( temp 3X3 matrix of float)
-0:412          dPdxCoarse ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r013' ( temp 3X3 matrix of float)
-0:412          dPdxFine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r014' ( temp 3X3 matrix of float)
-0:412          dPdy ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r015' ( temp 3X3 matrix of float)
-0:412          dPdyCoarse ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r016' ( temp 3X3 matrix of float)
-0:412          dPdyFine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r017' ( temp 3X3 matrix of float)
-0:412          degrees ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp float)
-0:412          'r018' ( temp float)
-0:412          determinant ( temp float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r019' ( temp 3X3 matrix of float)
-0:412          exp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'R020' ( temp 3X3 matrix of float)
-0:412          exp2 ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r021' ( temp 3X3 matrix of float)
-0:412          Floor ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r022' ( temp 3X3 matrix of float)
-0:412          mod ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r023' ( temp 3X3 matrix of float)
-0:412          Fraction ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r024' ( temp 3X3 matrix of float)
-0:412          frexp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r025' ( temp 3X3 matrix of float)
-0:412          fwidth ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r026' ( temp 3X3 matrix of float)
-0:412          ldexp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r026a' ( temp 3X3 matrix of float)
-0:412          mix ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412            'inF2' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r027' ( temp 3X3 matrix of float)
-0:412          log ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r028' ( temp 3X3 matrix of float)
-0:412          matrix-scale ( temp 3X3 matrix of float)
-0:412            log2 ( temp 3X3 matrix of float)
-0:412              'inF0' ( in 3X3 matrix of float)
-0:412            Constant:
-0:412              0.301030
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r029' ( temp 3X3 matrix of float)
-0:412          log2 ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r030' ( temp 3X3 matrix of float)
-0:412          max ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r031' ( temp 3X3 matrix of float)
-0:412          min ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r032' ( temp 3X3 matrix of float)
-0:412          pow ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r033' ( temp 3X3 matrix of float)
-0:412          radians ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r034' ( temp 3X3 matrix of float)
-0:412          roundEven ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r035' ( temp 3X3 matrix of float)
-0:412          inverse sqrt ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r036' ( temp 3X3 matrix of float)
-0:412          clamp ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            Constant:
-0:412              0.000000
-0:412            Constant:
-0:412              1.000000
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r037' ( temp 3X3 matrix of float)
-0:412          Sign ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r038' ( temp 3X3 matrix of float)
-0:412          sine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'inF1' ( in 3X3 matrix of float)
-0:412          sine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'inF2' ( in 3X3 matrix of float)
-0:412          cosine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r039' ( temp 3X3 matrix of float)
-0:412          hyp. sine ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r049' ( temp 3X3 matrix of float)
-0:412          smoothstep ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412            'inF2' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r041' ( temp 3X3 matrix of float)
-0:412          sqrt ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r042' ( temp 3X3 matrix of float)
-0:412          step ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412            'inF1' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r043' ( temp 3X3 matrix of float)
-0:412          tangent ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r044' ( temp 3X3 matrix of float)
-0:412          hyp. tangent ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:412      transpose ( temp 3X3 matrix of float)
-0:412        'inF0' ( in 3X3 matrix of float)
-0:412      Sequence
-0:412        move second child to first child ( temp 3X3 matrix of float)
-0:412          'r046' ( temp 3X3 matrix of float)
-0:412          trunc ( temp 3X3 matrix of float)
-0:412            'inF0' ( in 3X3 matrix of float)
-0:415      Branch: Return with expression
+0:407      Sequence
+0:407        move second child to first child ( temp bool)
+0:407          'r000' ( temp bool)
+0:407          all ( temp bool)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r001' ( temp 3X3 matrix of float)
+0:407          Absolute value ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      arc cosine ( temp 3X3 matrix of float)
+0:407        'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp bool)
+0:407          'r003' ( temp bool)
+0:407          any ( temp bool)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r004' ( temp 3X3 matrix of float)
+0:407          arc sine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r005' ( temp 3X3 matrix of float)
+0:407          arc tangent ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r006' ( temp 3X3 matrix of float)
+0:407          arc tangent ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r007' ( temp 3X3 matrix of float)
+0:407          Ceiling ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Test condition and select ( temp void)
+0:407        Condition
+0:407        any ( temp bool)
+0:407          Compare Less Than ( temp 3X3 matrix of bool)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            Constant:
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407              0.000000
+0:407        true case
+0:407        Branch: Kill
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r008' ( temp 3X3 matrix of float)
+0:407          clamp ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407            'inF2' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r009' ( temp 3X3 matrix of float)
+0:407          cosine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r010' ( temp 3X3 matrix of float)
+0:407          hyp. cosine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r011' ( temp 3X3 matrix of float)
+0:407          dPdx ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r012' ( temp 3X3 matrix of float)
+0:407          dPdxCoarse ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r013' ( temp 3X3 matrix of float)
+0:407          dPdxFine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r014' ( temp 3X3 matrix of float)
+0:407          dPdy ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r015' ( temp 3X3 matrix of float)
+0:407          dPdyCoarse ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r016' ( temp 3X3 matrix of float)
+0:407          dPdyFine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r017' ( temp 3X3 matrix of float)
+0:407          degrees ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp float)
+0:407          'r018' ( temp float)
+0:407          determinant ( temp float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r019' ( temp 3X3 matrix of float)
+0:407          exp ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'R020' ( temp 3X3 matrix of float)
+0:407          exp2 ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r021' ( temp 3X3 matrix of float)
+0:407          Floor ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r022' ( temp 3X3 matrix of float)
+0:407          mod ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r023' ( temp 3X3 matrix of float)
+0:407          Fraction ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r025' ( temp 3X3 matrix of float)
+0:407          fwidth ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r026' ( temp 3X3 matrix of float)
+0:407          ldexp ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r026a' ( temp 3X3 matrix of float)
+0:407          mix ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407            'inF2' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r027' ( temp 3X3 matrix of float)
+0:407          log ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r028' ( temp 3X3 matrix of float)
+0:407          matrix-scale ( temp 3X3 matrix of float)
+0:407            log2 ( temp 3X3 matrix of float)
+0:407              'inF0' ( in 3X3 matrix of float)
+0:407            Constant:
+0:407              0.301030
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r029' ( temp 3X3 matrix of float)
+0:407          log2 ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r030' ( temp 3X3 matrix of float)
+0:407          max ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r031' ( temp 3X3 matrix of float)
+0:407          min ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r032' ( temp 3X3 matrix of float)
+0:407          pow ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r033' ( temp 3X3 matrix of float)
+0:407          radians ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r034' ( temp 3X3 matrix of float)
+0:407          roundEven ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r035' ( temp 3X3 matrix of float)
+0:407          inverse sqrt ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r036' ( temp 3X3 matrix of float)
+0:407          clamp ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            Constant:
+0:407              0.000000
+0:407            Constant:
+0:407              1.000000
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r037' ( temp 3X3 matrix of float)
+0:407          Sign ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r038' ( temp 3X3 matrix of float)
+0:407          sine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'inF1' ( in 3X3 matrix of float)
+0:407          sine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'inF2' ( in 3X3 matrix of float)
+0:407          cosine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r039' ( temp 3X3 matrix of float)
+0:407          hyp. sine ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r049' ( temp 3X3 matrix of float)
+0:407          smoothstep ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407            'inF2' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r041' ( temp 3X3 matrix of float)
+0:407          sqrt ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r042' ( temp 3X3 matrix of float)
+0:407          step ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407            'inF1' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r043' ( temp 3X3 matrix of float)
+0:407          tangent ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r044' ( temp 3X3 matrix of float)
+0:407          hyp. tangent ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:407      transpose ( temp 3X3 matrix of float)
+0:407        'inF0' ( in 3X3 matrix of float)
+0:407      Sequence
+0:407        move second child to first child ( temp 3X3 matrix of float)
+0:407          'r046' ( temp 3X3 matrix of float)
+0:407          trunc ( temp 3X3 matrix of float)
+0:407            'inF0' ( in 3X3 matrix of float)
+0:410      Branch: Return with expression
 0:?         Constant:
 0:?           3.000000
 0:?           3.000000
@@ -4950,301 +4872,295 @@ gl_FragCoord origin is upper left
 0:?           3.000000
 0:?           3.000000
 0:?           3.000000
-0:419  Function Definition: PixelShaderFunction4x4(mf44;mf44;mf44; ( temp 4X4 matrix of float)
-0:419    Function Parameters: 
-0:419      'inF0' ( in 4X4 matrix of float)
-0:419      'inF1' ( in 4X4 matrix of float)
-0:419      'inF2' ( in 4X4 matrix of float)
+0:414  Function Definition: PixelShaderFunction4x4(mf44;mf44;mf44; ( temp 4X4 matrix of float)
+0:414    Function Parameters: 
+0:414      'inF0' ( in 4X4 matrix of float)
+0:414      'inF1' ( in 4X4 matrix of float)
+0:414      'inF2' ( in 4X4 matrix of float)
 0:?     Sequence
-0:421      Sequence
-0:421        move second child to first child ( temp bool)
-0:421          'r000' ( temp bool)
-0:421          all ( temp bool)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r001' ( temp 4X4 matrix of float)
-0:421          Absolute value ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      arc cosine ( temp 4X4 matrix of float)
-0:421        'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp bool)
-0:421          'r003' ( temp bool)
-0:421          any ( temp bool)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r004' ( temp 4X4 matrix of float)
-0:421          arc sine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r005' ( temp 4X4 matrix of float)
-0:421          arc tangent ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r006' ( temp 4X4 matrix of float)
-0:421          arc tangent ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r007' ( temp 4X4 matrix of float)
-0:421          Ceiling ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Test condition and select ( temp void)
-0:421        Condition
-0:421        any ( temp bool)
-0:421          Compare Less Than ( temp 4X4 matrix of bool)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            Constant:
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421              0.000000
-0:421        true case
-0:421        Branch: Kill
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r008' ( temp 4X4 matrix of float)
-0:421          clamp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421            'inF2' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r009' ( temp 4X4 matrix of float)
-0:421          cosine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r010' ( temp 4X4 matrix of float)
-0:421          hyp. cosine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r011' ( temp 4X4 matrix of float)
-0:421          dPdx ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r012' ( temp 4X4 matrix of float)
-0:421          dPdxCoarse ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r013' ( temp 4X4 matrix of float)
-0:421          dPdxFine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r014' ( temp 4X4 matrix of float)
-0:421          dPdy ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r015' ( temp 4X4 matrix of float)
-0:421          dPdyCoarse ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r016' ( temp 4X4 matrix of float)
-0:421          dPdyFine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r017' ( temp 4X4 matrix of float)
-0:421          degrees ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp float)
-0:421          'r018' ( temp float)
-0:421          determinant ( temp float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r019' ( temp 4X4 matrix of float)
-0:421          exp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'R020' ( temp 4X4 matrix of float)
-0:421          exp2 ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r021' ( temp 4X4 matrix of float)
-0:421          Floor ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r022' ( temp 4X4 matrix of float)
-0:421          mod ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r023' ( temp 4X4 matrix of float)
-0:421          Fraction ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r024' ( temp 4X4 matrix of float)
-0:421          frexp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r025' ( temp 4X4 matrix of float)
-0:421          fwidth ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r026' ( temp 4X4 matrix of float)
-0:421          ldexp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r026a' ( temp 4X4 matrix of float)
-0:421          mix ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421            'inF2' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r027' ( temp 4X4 matrix of float)
-0:421          log ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r028' ( temp 4X4 matrix of float)
-0:421          matrix-scale ( temp 4X4 matrix of float)
-0:421            log2 ( temp 4X4 matrix of float)
-0:421              'inF0' ( in 4X4 matrix of float)
-0:421            Constant:
-0:421              0.301030
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r029' ( temp 4X4 matrix of float)
-0:421          log2 ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r030' ( temp 4X4 matrix of float)
-0:421          max ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r031' ( temp 4X4 matrix of float)
-0:421          min ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r032' ( temp 4X4 matrix of float)
-0:421          pow ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r033' ( temp 4X4 matrix of float)
-0:421          radians ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r034' ( temp 4X4 matrix of float)
-0:421          roundEven ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r035' ( temp 4X4 matrix of float)
-0:421          inverse sqrt ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r036' ( temp 4X4 matrix of float)
-0:421          clamp ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            Constant:
-0:421              0.000000
-0:421            Constant:
-0:421              1.000000
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r037' ( temp 4X4 matrix of float)
-0:421          Sign ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r038' ( temp 4X4 matrix of float)
-0:421          sine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'inF1' ( in 4X4 matrix of float)
-0:421          sine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'inF2' ( in 4X4 matrix of float)
-0:421          cosine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r039' ( temp 4X4 matrix of float)
-0:421          hyp. sine ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r049' ( temp 4X4 matrix of float)
-0:421          smoothstep ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421            'inF2' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r041' ( temp 4X4 matrix of float)
-0:421          sqrt ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r042' ( temp 4X4 matrix of float)
-0:421          step ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421            'inF1' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r043' ( temp 4X4 matrix of float)
-0:421          tangent ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r044' ( temp 4X4 matrix of float)
-0:421          hyp. tangent ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:421      transpose ( temp 4X4 matrix of float)
-0:421        'inF0' ( in 4X4 matrix of float)
-0:421      Sequence
-0:421        move second child to first child ( temp 4X4 matrix of float)
-0:421          'r046' ( temp 4X4 matrix of float)
-0:421          trunc ( temp 4X4 matrix of float)
-0:421            'inF0' ( in 4X4 matrix of float)
-0:424      Branch: Return with expression
+0:416      Sequence
+0:416        move second child to first child ( temp bool)
+0:416          'r000' ( temp bool)
+0:416          all ( temp bool)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r001' ( temp 4X4 matrix of float)
+0:416          Absolute value ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      arc cosine ( temp 4X4 matrix of float)
+0:416        'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp bool)
+0:416          'r003' ( temp bool)
+0:416          any ( temp bool)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r004' ( temp 4X4 matrix of float)
+0:416          arc sine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r005' ( temp 4X4 matrix of float)
+0:416          arc tangent ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r006' ( temp 4X4 matrix of float)
+0:416          arc tangent ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r007' ( temp 4X4 matrix of float)
+0:416          Ceiling ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Test condition and select ( temp void)
+0:416        Condition
+0:416        any ( temp bool)
+0:416          Compare Less Than ( temp 4X4 matrix of bool)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            Constant:
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416              0.000000
+0:416        true case
+0:416        Branch: Kill
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r008' ( temp 4X4 matrix of float)
+0:416          clamp ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416            'inF2' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r009' ( temp 4X4 matrix of float)
+0:416          cosine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r010' ( temp 4X4 matrix of float)
+0:416          hyp. cosine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r011' ( temp 4X4 matrix of float)
+0:416          dPdx ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r012' ( temp 4X4 matrix of float)
+0:416          dPdxCoarse ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r013' ( temp 4X4 matrix of float)
+0:416          dPdxFine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r014' ( temp 4X4 matrix of float)
+0:416          dPdy ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r015' ( temp 4X4 matrix of float)
+0:416          dPdyCoarse ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r016' ( temp 4X4 matrix of float)
+0:416          dPdyFine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r017' ( temp 4X4 matrix of float)
+0:416          degrees ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp float)
+0:416          'r018' ( temp float)
+0:416          determinant ( temp float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r019' ( temp 4X4 matrix of float)
+0:416          exp ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'R020' ( temp 4X4 matrix of float)
+0:416          exp2 ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r021' ( temp 4X4 matrix of float)
+0:416          Floor ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r022' ( temp 4X4 matrix of float)
+0:416          mod ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r023' ( temp 4X4 matrix of float)
+0:416          Fraction ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r025' ( temp 4X4 matrix of float)
+0:416          fwidth ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r026' ( temp 4X4 matrix of float)
+0:416          ldexp ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r026a' ( temp 4X4 matrix of float)
+0:416          mix ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416            'inF2' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r027' ( temp 4X4 matrix of float)
+0:416          log ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r028' ( temp 4X4 matrix of float)
+0:416          matrix-scale ( temp 4X4 matrix of float)
+0:416            log2 ( temp 4X4 matrix of float)
+0:416              'inF0' ( in 4X4 matrix of float)
+0:416            Constant:
+0:416              0.301030
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r029' ( temp 4X4 matrix of float)
+0:416          log2 ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r030' ( temp 4X4 matrix of float)
+0:416          max ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r031' ( temp 4X4 matrix of float)
+0:416          min ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r032' ( temp 4X4 matrix of float)
+0:416          pow ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r033' ( temp 4X4 matrix of float)
+0:416          radians ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r034' ( temp 4X4 matrix of float)
+0:416          roundEven ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r035' ( temp 4X4 matrix of float)
+0:416          inverse sqrt ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r036' ( temp 4X4 matrix of float)
+0:416          clamp ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            Constant:
+0:416              0.000000
+0:416            Constant:
+0:416              1.000000
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r037' ( temp 4X4 matrix of float)
+0:416          Sign ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r038' ( temp 4X4 matrix of float)
+0:416          sine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'inF1' ( in 4X4 matrix of float)
+0:416          sine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'inF2' ( in 4X4 matrix of float)
+0:416          cosine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r039' ( temp 4X4 matrix of float)
+0:416          hyp. sine ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r049' ( temp 4X4 matrix of float)
+0:416          smoothstep ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416            'inF2' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r041' ( temp 4X4 matrix of float)
+0:416          sqrt ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r042' ( temp 4X4 matrix of float)
+0:416          step ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416            'inF1' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r043' ( temp 4X4 matrix of float)
+0:416          tangent ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r044' ( temp 4X4 matrix of float)
+0:416          hyp. tangent ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:416      transpose ( temp 4X4 matrix of float)
+0:416        'inF0' ( in 4X4 matrix of float)
+0:416      Sequence
+0:416        move second child to first child ( temp 4X4 matrix of float)
+0:416          'r046' ( temp 4X4 matrix of float)
+0:416          trunc ( temp 4X4 matrix of float)
+0:416            'inF0' ( in 4X4 matrix of float)
+0:419      Branch: Return with expression
 0:?         Constant:
 0:?           4.000000
 0:?           4.000000
@@ -5262,334 +5178,334 @@ gl_FragCoord origin is upper left
 0:?           4.000000
 0:?           4.000000
 0:?           4.000000
-0:442  Function Definition: TestGenMul2(f1;f1;vf2;vf2;mf22;mf22; ( temp void)
-0:442    Function Parameters: 
-0:442      'inF0' ( in float)
-0:442      'inF1' ( in float)
-0:442      'inFV0' ( in 2-component vector of float)
-0:442      'inFV1' ( in 2-component vector of float)
-0:442      'inFM0' ( in 2X2 matrix of float)
-0:442      'inFM1' ( in 2X2 matrix of float)
+0:437  Function Definition: TestGenMul2(f1;f1;vf2;vf2;mf22;mf22; ( temp void)
+0:437    Function Parameters: 
+0:437      'inF0' ( in float)
+0:437      'inF1' ( in float)
+0:437      'inFV0' ( in 2-component vector of float)
+0:437      'inFV1' ( in 2-component vector of float)
+0:437      'inFM0' ( in 2X2 matrix of float)
+0:437      'inFM1' ( in 2X2 matrix of float)
 0:?     Sequence
-0:443      Sequence
-0:443        move second child to first child ( temp float)
-0:443          'r0' ( temp float)
-0:443          component-wise multiply ( temp float)
-0:443            'inF1' ( in float)
-0:443            'inF0' ( in float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2-component vector of float)
-0:443          'r1' ( temp 2-component vector of float)
-0:443          vector-scale ( temp 2-component vector of float)
-0:443            'inF0' ( in float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2-component vector of float)
-0:443          'r2' ( temp 2-component vector of float)
-0:443          vector-scale ( temp 2-component vector of float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443            'inF0' ( in float)
-0:443      Sequence
-0:443        move second child to first child ( temp float)
-0:443          'r3' ( temp float)
-0:443          dot-product ( temp float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443            'inFV1' ( in 2-component vector of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2-component vector of float)
-0:443          'r4' ( temp 2-component vector of float)
-0:443          vector-times-matrix ( temp 2-component vector of float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2-component vector of float)
-0:443          'r5' ( temp 2-component vector of float)
-0:443          matrix-times-vector ( temp 2-component vector of float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:443            'inFV0' ( in 2-component vector of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2X2 matrix of float)
-0:443          'r6' ( temp 2X2 matrix of float)
-0:443          matrix-scale ( temp 2X2 matrix of float)
-0:443            'inF0' ( in float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2X2 matrix of float)
-0:443          'r7' ( temp 2X2 matrix of float)
-0:443          matrix-scale ( temp 2X2 matrix of float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:443            'inF0' ( in float)
-0:443      Sequence
-0:443        move second child to first child ( temp 2X2 matrix of float)
-0:443          'r8' ( temp 2X2 matrix of float)
-0:443          matrix-multiply ( temp 2X2 matrix of float)
-0:443            'inFM1' ( in 2X2 matrix of float)
-0:443            'inFM0' ( in 2X2 matrix of float)
-0:449  Function Definition: TestGenMul3(f1;f1;vf3;vf3;mf33;mf33; ( temp void)
-0:449    Function Parameters: 
-0:449      'inF0' ( in float)
-0:449      'inF1' ( in float)
-0:449      'inFV0' ( in 3-component vector of float)
-0:449      'inFV1' ( in 3-component vector of float)
-0:449      'inFM0' ( in 3X3 matrix of float)
-0:449      'inFM1' ( in 3X3 matrix of float)
+0:438      Sequence
+0:438        move second child to first child ( temp float)
+0:438          'r0' ( temp float)
+0:438          component-wise multiply ( temp float)
+0:438            'inF1' ( in float)
+0:438            'inF0' ( in float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2-component vector of float)
+0:438          'r1' ( temp 2-component vector of float)
+0:438          vector-scale ( temp 2-component vector of float)
+0:438            'inF0' ( in float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2-component vector of float)
+0:438          'r2' ( temp 2-component vector of float)
+0:438          vector-scale ( temp 2-component vector of float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438            'inF0' ( in float)
+0:438      Sequence
+0:438        move second child to first child ( temp float)
+0:438          'r3' ( temp float)
+0:438          dot-product ( temp float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438            'inFV1' ( in 2-component vector of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2-component vector of float)
+0:438          'r4' ( temp 2-component vector of float)
+0:438          vector-times-matrix ( temp 2-component vector of float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2-component vector of float)
+0:438          'r5' ( temp 2-component vector of float)
+0:438          matrix-times-vector ( temp 2-component vector of float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:438            'inFV0' ( in 2-component vector of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2X2 matrix of float)
+0:438          'r6' ( temp 2X2 matrix of float)
+0:438          matrix-scale ( temp 2X2 matrix of float)
+0:438            'inF0' ( in float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2X2 matrix of float)
+0:438          'r7' ( temp 2X2 matrix of float)
+0:438          matrix-scale ( temp 2X2 matrix of float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:438            'inF0' ( in float)
+0:438      Sequence
+0:438        move second child to first child ( temp 2X2 matrix of float)
+0:438          'r8' ( temp 2X2 matrix of float)
+0:438          matrix-multiply ( temp 2X2 matrix of float)
+0:438            'inFM1' ( in 2X2 matrix of float)
+0:438            'inFM0' ( in 2X2 matrix of float)
+0:444  Function Definition: TestGenMul3(f1;f1;vf3;vf3;mf33;mf33; ( temp void)
+0:444    Function Parameters: 
+0:444      'inF0' ( in float)
+0:444      'inF1' ( in float)
+0:444      'inFV0' ( in 3-component vector of float)
+0:444      'inFV1' ( in 3-component vector of float)
+0:444      'inFM0' ( in 3X3 matrix of float)
+0:444      'inFM1' ( in 3X3 matrix of float)
 0:?     Sequence
-0:450      Sequence
-0:450        move second child to first child ( temp float)
-0:450          'r0' ( temp float)
-0:450          component-wise multiply ( temp float)
-0:450            'inF1' ( in float)
-0:450            'inF0' ( in float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3-component vector of float)
-0:450          'r1' ( temp 3-component vector of float)
-0:450          vector-scale ( temp 3-component vector of float)
-0:450            'inF0' ( in float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3-component vector of float)
-0:450          'r2' ( temp 3-component vector of float)
-0:450          vector-scale ( temp 3-component vector of float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450            'inF0' ( in float)
-0:450      Sequence
-0:450        move second child to first child ( temp float)
-0:450          'r3' ( temp float)
-0:450          dot-product ( temp float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450            'inFV1' ( in 3-component vector of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3-component vector of float)
-0:450          'r4' ( temp 3-component vector of float)
-0:450          vector-times-matrix ( temp 3-component vector of float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3-component vector of float)
-0:450          'r5' ( temp 3-component vector of float)
-0:450          matrix-times-vector ( temp 3-component vector of float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:450            'inFV0' ( in 3-component vector of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3X3 matrix of float)
-0:450          'r6' ( temp 3X3 matrix of float)
-0:450          matrix-scale ( temp 3X3 matrix of float)
-0:450            'inF0' ( in float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3X3 matrix of float)
-0:450          'r7' ( temp 3X3 matrix of float)
-0:450          matrix-scale ( temp 3X3 matrix of float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:450            'inF0' ( in float)
-0:450      Sequence
-0:450        move second child to first child ( temp 3X3 matrix of float)
-0:450          'r8' ( temp 3X3 matrix of float)
-0:450          matrix-multiply ( temp 3X3 matrix of float)
-0:450            'inFM1' ( in 3X3 matrix of float)
-0:450            'inFM0' ( in 3X3 matrix of float)
-0:456  Function Definition: TestGenMul4(f1;f1;vf4;vf4;mf44;mf44; ( temp void)
-0:456    Function Parameters: 
-0:456      'inF0' ( in float)
-0:456      'inF1' ( in float)
-0:456      'inFV0' ( in 4-component vector of float)
-0:456      'inFV1' ( in 4-component vector of float)
-0:456      'inFM0' ( in 4X4 matrix of float)
-0:456      'inFM1' ( in 4X4 matrix of float)
+0:445      Sequence
+0:445        move second child to first child ( temp float)
+0:445          'r0' ( temp float)
+0:445          component-wise multiply ( temp float)
+0:445            'inF1' ( in float)
+0:445            'inF0' ( in float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3-component vector of float)
+0:445          'r1' ( temp 3-component vector of float)
+0:445          vector-scale ( temp 3-component vector of float)
+0:445            'inF0' ( in float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3-component vector of float)
+0:445          'r2' ( temp 3-component vector of float)
+0:445          vector-scale ( temp 3-component vector of float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445            'inF0' ( in float)
+0:445      Sequence
+0:445        move second child to first child ( temp float)
+0:445          'r3' ( temp float)
+0:445          dot-product ( temp float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445            'inFV1' ( in 3-component vector of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3-component vector of float)
+0:445          'r4' ( temp 3-component vector of float)
+0:445          vector-times-matrix ( temp 3-component vector of float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3-component vector of float)
+0:445          'r5' ( temp 3-component vector of float)
+0:445          matrix-times-vector ( temp 3-component vector of float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:445            'inFV0' ( in 3-component vector of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3X3 matrix of float)
+0:445          'r6' ( temp 3X3 matrix of float)
+0:445          matrix-scale ( temp 3X3 matrix of float)
+0:445            'inF0' ( in float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3X3 matrix of float)
+0:445          'r7' ( temp 3X3 matrix of float)
+0:445          matrix-scale ( temp 3X3 matrix of float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:445            'inF0' ( in float)
+0:445      Sequence
+0:445        move second child to first child ( temp 3X3 matrix of float)
+0:445          'r8' ( temp 3X3 matrix of float)
+0:445          matrix-multiply ( temp 3X3 matrix of float)
+0:445            'inFM1' ( in 3X3 matrix of float)
+0:445            'inFM0' ( in 3X3 matrix of float)
+0:451  Function Definition: TestGenMul4(f1;f1;vf4;vf4;mf44;mf44; ( temp void)
+0:451    Function Parameters: 
+0:451      'inF0' ( in float)
+0:451      'inF1' ( in float)
+0:451      'inFV0' ( in 4-component vector of float)
+0:451      'inFV1' ( in 4-component vector of float)
+0:451      'inFM0' ( in 4X4 matrix of float)
+0:451      'inFM1' ( in 4X4 matrix of float)
 0:?     Sequence
-0:457      Sequence
-0:457        move second child to first child ( temp float)
-0:457          'r0' ( temp float)
-0:457          component-wise multiply ( temp float)
-0:457            'inF1' ( in float)
-0:457            'inF0' ( in float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4-component vector of float)
-0:457          'r1' ( temp 4-component vector of float)
-0:457          vector-scale ( temp 4-component vector of float)
-0:457            'inF0' ( in float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4-component vector of float)
-0:457          'r2' ( temp 4-component vector of float)
-0:457          vector-scale ( temp 4-component vector of float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457            'inF0' ( in float)
-0:457      Sequence
-0:457        move second child to first child ( temp float)
-0:457          'r3' ( temp float)
-0:457          dot-product ( temp float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457            'inFV1' ( in 4-component vector of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4-component vector of float)
-0:457          'r4' ( temp 4-component vector of float)
-0:457          vector-times-matrix ( temp 4-component vector of float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4-component vector of float)
-0:457          'r5' ( temp 4-component vector of float)
-0:457          matrix-times-vector ( temp 4-component vector of float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:457            'inFV0' ( in 4-component vector of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4X4 matrix of float)
-0:457          'r6' ( temp 4X4 matrix of float)
-0:457          matrix-scale ( temp 4X4 matrix of float)
-0:457            'inF0' ( in float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4X4 matrix of float)
-0:457          'r7' ( temp 4X4 matrix of float)
-0:457          matrix-scale ( temp 4X4 matrix of float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:457            'inF0' ( in float)
-0:457      Sequence
-0:457        move second child to first child ( temp 4X4 matrix of float)
-0:457          'r8' ( temp 4X4 matrix of float)
-0:457          matrix-multiply ( temp 4X4 matrix of float)
-0:457            'inFM1' ( in 4X4 matrix of float)
-0:457            'inFM0' ( in 4X4 matrix of float)
-0:466  Function Definition: TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24; ( temp void)
-0:466    Function Parameters: 
-0:466      'inF0' ( in float)
-0:466      'inF1' ( in float)
-0:466      'inFV2' ( in 2-component vector of float)
-0:466      'inFV3' ( in 3-component vector of float)
-0:466      'inFM2x3' ( in 2X3 matrix of float)
-0:466      'inFM3x2' ( in 3X2 matrix of float)
-0:466      'inFM3x3' ( in 3X3 matrix of float)
-0:466      'inFM3x4' ( in 3X4 matrix of float)
-0:466      'inFM2x4' ( in 2X4 matrix of float)
+0:452      Sequence
+0:452        move second child to first child ( temp float)
+0:452          'r0' ( temp float)
+0:452          component-wise multiply ( temp float)
+0:452            'inF1' ( in float)
+0:452            'inF0' ( in float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4-component vector of float)
+0:452          'r1' ( temp 4-component vector of float)
+0:452          vector-scale ( temp 4-component vector of float)
+0:452            'inF0' ( in float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4-component vector of float)
+0:452          'r2' ( temp 4-component vector of float)
+0:452          vector-scale ( temp 4-component vector of float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452            'inF0' ( in float)
+0:452      Sequence
+0:452        move second child to first child ( temp float)
+0:452          'r3' ( temp float)
+0:452          dot-product ( temp float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452            'inFV1' ( in 4-component vector of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4-component vector of float)
+0:452          'r4' ( temp 4-component vector of float)
+0:452          vector-times-matrix ( temp 4-component vector of float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4-component vector of float)
+0:452          'r5' ( temp 4-component vector of float)
+0:452          matrix-times-vector ( temp 4-component vector of float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:452            'inFV0' ( in 4-component vector of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4X4 matrix of float)
+0:452          'r6' ( temp 4X4 matrix of float)
+0:452          matrix-scale ( temp 4X4 matrix of float)
+0:452            'inF0' ( in float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4X4 matrix of float)
+0:452          'r7' ( temp 4X4 matrix of float)
+0:452          matrix-scale ( temp 4X4 matrix of float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:452            'inF0' ( in float)
+0:452      Sequence
+0:452        move second child to first child ( temp 4X4 matrix of float)
+0:452          'r8' ( temp 4X4 matrix of float)
+0:452          matrix-multiply ( temp 4X4 matrix of float)
+0:452            'inFM1' ( in 4X4 matrix of float)
+0:452            'inFM0' ( in 4X4 matrix of float)
+0:461  Function Definition: TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24; ( temp void)
+0:461    Function Parameters: 
+0:461      'inF0' ( in float)
+0:461      'inF1' ( in float)
+0:461      'inFV2' ( in 2-component vector of float)
+0:461      'inFV3' ( in 3-component vector of float)
+0:461      'inFM2x3' ( in 2X3 matrix of float)
+0:461      'inFM3x2' ( in 3X2 matrix of float)
+0:461      'inFM3x3' ( in 3X3 matrix of float)
+0:461      'inFM3x4' ( in 3X4 matrix of float)
+0:461      'inFM2x4' ( in 2X4 matrix of float)
 0:?     Sequence
+0:462      Sequence
+0:462        move second child to first child ( temp float)
+0:462          'r00' ( temp float)
+0:462          component-wise multiply ( temp float)
+0:462            'inF1' ( in float)
+0:462            'inF0' ( in float)
+0:463      Sequence
+0:463        move second child to first child ( temp 2-component vector of float)
+0:463          'r01' ( temp 2-component vector of float)
+0:463          vector-scale ( temp 2-component vector of float)
+0:463            'inF0' ( in float)
+0:463            'inFV2' ( in 2-component vector of float)
+0:464      Sequence
+0:464        move second child to first child ( temp 3-component vector of float)
+0:464          'r02' ( temp 3-component vector of float)
+0:464          vector-scale ( temp 3-component vector of float)
+0:464            'inF0' ( in float)
+0:464            'inFV3' ( in 3-component vector of float)
+0:465      Sequence
+0:465        move second child to first child ( temp 2-component vector of float)
+0:465          'r03' ( temp 2-component vector of float)
+0:465          vector-scale ( temp 2-component vector of float)
+0:465            'inFV2' ( in 2-component vector of float)
+0:465            'inF0' ( in float)
+0:466      Sequence
+0:466        move second child to first child ( temp 3-component vector of float)
+0:466          'r04' ( temp 3-component vector of float)
+0:466          vector-scale ( temp 3-component vector of float)
+0:466            'inFV3' ( in 3-component vector of float)
+0:466            'inF0' ( in float)
 0:467      Sequence
 0:467        move second child to first child ( temp float)
-0:467          'r00' ( temp float)
-0:467          component-wise multiply ( temp float)
-0:467            'inF1' ( in float)
-0:467            'inF0' ( in float)
+0:467          'r05' ( temp float)
+0:467          dot-product ( temp float)
+0:467            'inFV2' ( in 2-component vector of float)
+0:467            'inFV2' ( in 2-component vector of float)
 0:468      Sequence
-0:468        move second child to first child ( temp 2-component vector of float)
-0:468          'r01' ( temp 2-component vector of float)
-0:468          vector-scale ( temp 2-component vector of float)
-0:468            'inF0' ( in float)
-0:468            'inFV2' ( in 2-component vector of float)
+0:468        move second child to first child ( temp float)
+0:468          'r06' ( temp float)
+0:468          dot-product ( temp float)
+0:468            'inFV3' ( in 3-component vector of float)
+0:468            'inFV3' ( in 3-component vector of float)
 0:469      Sequence
 0:469        move second child to first child ( temp 3-component vector of float)
-0:469          'r02' ( temp 3-component vector of float)
-0:469          vector-scale ( temp 3-component vector of float)
-0:469            'inF0' ( in float)
-0:469            'inFV3' ( in 3-component vector of float)
+0:469          'r07' ( temp 3-component vector of float)
+0:469          matrix-times-vector ( temp 3-component vector of float)
+0:469            'inFM2x3' ( in 2X3 matrix of float)
+0:469            'inFV2' ( in 2-component vector of float)
 0:470      Sequence
 0:470        move second child to first child ( temp 2-component vector of float)
-0:470          'r03' ( temp 2-component vector of float)
-0:470          vector-scale ( temp 2-component vector of float)
-0:470            'inFV2' ( in 2-component vector of float)
-0:470            'inF0' ( in float)
+0:470          'r08' ( temp 2-component vector of float)
+0:470          matrix-times-vector ( temp 2-component vector of float)
+0:470            'inFM3x2' ( in 3X2 matrix of float)
+0:470            'inFV3' ( in 3-component vector of float)
 0:471      Sequence
-0:471        move second child to first child ( temp 3-component vector of float)
-0:471          'r04' ( temp 3-component vector of float)
-0:471          vector-scale ( temp 3-component vector of float)
+0:471        move second child to first child ( temp 2-component vector of float)
+0:471          'r09' ( temp 2-component vector of float)
+0:471          vector-times-matrix ( temp 2-component vector of float)
 0:471            'inFV3' ( in 3-component vector of float)
-0:471            'inF0' ( in float)
+0:471            'inFM2x3' ( in 2X3 matrix of float)
 0:472      Sequence
-0:472        move second child to first child ( temp float)
-0:472          'r05' ( temp float)
-0:472          dot-product ( temp float)
-0:472            'inFV2' ( in 2-component vector of float)
+0:472        move second child to first child ( temp 3-component vector of float)
+0:472          'r10' ( temp 3-component vector of float)
+0:472          vector-times-matrix ( temp 3-component vector of float)
 0:472            'inFV2' ( in 2-component vector of float)
+0:472            'inFM3x2' ( in 3X2 matrix of float)
 0:473      Sequence
-0:473        move second child to first child ( temp float)
-0:473          'r06' ( temp float)
-0:473          dot-product ( temp float)
-0:473            'inFV3' ( in 3-component vector of float)
-0:473            'inFV3' ( in 3-component vector of float)
+0:473        move second child to first child ( temp 2X3 matrix of float)
+0:473          'r11' ( temp 2X3 matrix of float)
+0:473          matrix-scale ( temp 2X3 matrix of float)
+0:473            'inF0' ( in float)
+0:473            'inFM2x3' ( in 2X3 matrix of float)
 0:474      Sequence
-0:474        move second child to first child ( temp 3-component vector of float)
-0:474          'r07' ( temp 3-component vector of float)
-0:474          matrix-times-vector ( temp 3-component vector of float)
-0:474            'inFM2x3' ( in 2X3 matrix of float)
-0:474            'inFV2' ( in 2-component vector of float)
+0:474        move second child to first child ( temp 3X2 matrix of float)
+0:474          'r12' ( temp 3X2 matrix of float)
+0:474          matrix-scale ( temp 3X2 matrix of float)
+0:474            'inF0' ( in float)
+0:474            'inFM3x2' ( in 3X2 matrix of float)
 0:475      Sequence
-0:475        move second child to first child ( temp 2-component vector of float)
-0:475          'r08' ( temp 2-component vector of float)
-0:475          matrix-times-vector ( temp 2-component vector of float)
+0:475        move second child to first child ( temp 2X2 matrix of float)
+0:475          'r13' ( temp 2X2 matrix of float)
+0:475          matrix-multiply ( temp 2X2 matrix of float)
 0:475            'inFM3x2' ( in 3X2 matrix of float)
-0:475            'inFV3' ( in 3-component vector of float)
+0:475            'inFM2x3' ( in 2X3 matrix of float)
 0:476      Sequence
-0:476        move second child to first child ( temp 2-component vector of float)
-0:476          'r09' ( temp 2-component vector of float)
-0:476          vector-times-matrix ( temp 2-component vector of float)
-0:476            'inFV3' ( in 3-component vector of float)
+0:476        move second child to first child ( temp 2X3 matrix of float)
+0:476          'r14' ( temp 2X3 matrix of float)
+0:476          matrix-multiply ( temp 2X3 matrix of float)
+0:476            'inFM3x3' ( in 3X3 matrix of float)
 0:476            'inFM2x3' ( in 2X3 matrix of float)
 0:477      Sequence
-0:477        move second child to first child ( temp 3-component vector of float)
-0:477          'r10' ( temp 3-component vector of float)
-0:477          vector-times-matrix ( temp 3-component vector of float)
-0:477            'inFV2' ( in 2-component vector of float)
-0:477            'inFM3x2' ( in 3X2 matrix of float)
+0:477        move second child to first child ( temp 2X4 matrix of float)
+0:477          'r15' ( temp 2X4 matrix of float)
+0:477          matrix-multiply ( temp 2X4 matrix of float)
+0:477            'inFM3x4' ( in 3X4 matrix of float)
+0:477            'inFM2x3' ( in 2X3 matrix of float)
 0:478      Sequence
-0:478        move second child to first child ( temp 2X3 matrix of float)
-0:478          'r11' ( temp 2X3 matrix of float)
-0:478          matrix-scale ( temp 2X3 matrix of float)
-0:478            'inF0' ( in float)
-0:478            'inFM2x3' ( in 2X3 matrix of float)
-0:479      Sequence
-0:479        move second child to first child ( temp 3X2 matrix of float)
-0:479          'r12' ( temp 3X2 matrix of float)
-0:479          matrix-scale ( temp 3X2 matrix of float)
-0:479            'inF0' ( in float)
-0:479            'inFM3x2' ( in 3X2 matrix of float)
-0:480      Sequence
-0:480        move second child to first child ( temp 2X2 matrix of float)
-0:480          'r13' ( temp 2X2 matrix of float)
-0:480          matrix-multiply ( temp 2X2 matrix of float)
-0:480            'inFM3x2' ( in 3X2 matrix of float)
-0:480            'inFM2x3' ( in 2X3 matrix of float)
-0:481      Sequence
-0:481        move second child to first child ( temp 2X3 matrix of float)
-0:481          'r14' ( temp 2X3 matrix of float)
-0:481          matrix-multiply ( temp 2X3 matrix of float)
-0:481            'inFM3x3' ( in 3X3 matrix of float)
-0:481            'inFM2x3' ( in 2X3 matrix of float)
-0:482      Sequence
-0:482        move second child to first child ( temp 2X4 matrix of float)
-0:482          'r15' ( temp 2X4 matrix of float)
-0:482          matrix-multiply ( temp 2X4 matrix of float)
-0:482            'inFM3x4' ( in 3X4 matrix of float)
-0:482            'inFM2x3' ( in 2X3 matrix of float)
-0:483      Sequence
-0:483        move second child to first child ( temp 3X4 matrix of float)
-0:483          'r16' ( temp 3X4 matrix of float)
-0:483          matrix-multiply ( temp 3X4 matrix of float)
-0:483            'inFM2x4' ( in 2X4 matrix of float)
-0:483            'inFM3x2' ( in 3X2 matrix of float)
-0:489  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
-0:489    Function Parameters: 
+0:478        move second child to first child ( temp 3X4 matrix of float)
+0:478          'r16' ( temp 3X4 matrix of float)
+0:478          matrix-multiply ( temp 3X4 matrix of float)
+0:478            'inFM2x4' ( in 2X4 matrix of float)
+0:478            'inFM3x2' ( in 3X2 matrix of float)
+0:484  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
+0:484    Function Parameters: 
 0:?     Sequence
-0:491      move second child to first child ( temp 4-component vector of float)
-0:491        color: direct index for structure ( temp 4-component vector of float)
-0:491          'ps_output' ( temp structure{ temp 4-component vector of float color})
-0:491          Constant:
-0:491            0 (const int)
-0:491        Constant:
-0:491          1.000000
-0:491          1.000000
-0:491          1.000000
-0:491          1.000000
-0:492      Branch: Return with expression
-0:492        'ps_output' ( temp structure{ temp 4-component vector of float color})
-0:489  Function Definition: main( ( temp void)
-0:489    Function Parameters: 
+0:486      move second child to first child ( temp 4-component vector of float)
+0:486        color: direct index for structure ( temp 4-component vector of float)
+0:486          'ps_output' ( temp structure{ temp 4-component vector of float color})
+0:486          Constant:
+0:486            0 (const int)
+0:486        Constant:
+0:486          1.000000
+0:486          1.000000
+0:486          1.000000
+0:486          1.000000
+0:487      Branch: Return with expression
+0:487        'ps_output' ( temp structure{ temp 4-component vector of float color})
+0:484  Function Definition: main( ( temp void)
+0:484    Function Parameters: 
 0:?     Sequence
-0:489      Sequence
-0:489        move second child to first child ( temp 4-component vector of float)
+0:484      Sequence
+0:484        move second child to first child ( temp 4-component vector of float)
 0:?           'color' (layout( location=0) out 4-component vector of float)
-0:489          color: direct index for structure ( temp 4-component vector of float)
-0:489            Function Call: @main( ( temp structure{ temp 4-component vector of float color})
-0:489            Constant:
-0:489              0 (const int)
+0:484          color: direct index for structure ( temp 4-component vector of float)
+0:484            Function Call: @main( ( temp structure{ temp 4-component vector of float color})
+0:484            Constant:
+0:484              0 (const int)
 0:?   Linker Objects
 0:?     'gs_ua' ( shared uint)
 0:?     'gs_ub' ( shared uint)
@@ -5607,14 +5523,15 @@ gl_FragCoord origin is upper left
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 1833
+// Id's are bound by 1791
 
                               Capability Shader
                               Capability DerivativeControl
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Fragment 4  "main" 1814
+                              EntryPoint Fragment 4  "main" 1772
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 16  "PixelShaderFunctionS(f1;f1;f1;u1;u1;"
                               Name 11  "inF0"
@@ -5719,429 +5636,415 @@ gl_FragCoord origin is upper left
                               Name 226  "r031"
                               Name 229  "r033"
                               Name 233  "r034"
-                              Name 236  "r035"
-                              Name 238  "ResType"
-                              Name 242  "r036"
-                              Name 245  "r037"
-                              Name 248  "r038"
-                              Name 251  "r039"
-                              Name 255  "r039a"
-                              Name 260  "r040"
-                              Name 263  "r041"
-                              Name 268  "r042"
-                              Name 271  "r043"
-                              Name 275  "r044"
-                              Name 279  "r045"
-                              Name 283  "r046"
-                              Name 286  "r047"
-                              Name 290  "r048"
-                              Name 294  "r049"
-                              Name 297  "r050"
-                              Name 300  "r051"
-                              Name 303  "r052"
-                              Name 306  "r053"
-                              Name 313  "r055"
-                              Name 316  "r056"
-                              Name 321  "r057"
-                              Name 324  "r058"
-                              Name 328  "r059"
-                              Name 331  "r060"
-                              Name 334  "r061"
-                              Name 341  "r000"
-                              Name 344  "r001"
-                              Name 347  "r002"
-                              Name 350  "r003"
-                              Name 353  "r004"
-                              Name 358  "r005"
-                              Name 361  "r006"
-                              Name 364  "r007"
-                              Name 367  "r009"
-                              Name 370  "r010"
-                              Name 374  "r011"
-                              Name 377  "r012"
-                              Name 390  "r013"
-                              Name 393  "r015"
-                              Name 396  "r016"
-                              Name 400  "r017"
-                              Name 403  "r018"
-                              Name 406  "r019"
-                              Name 409  "r020"
-                              Name 412  "r021"
-                              Name 415  "r022"
-                              Name 418  "r023"
-                              Name 421  "r026"
-                              Name 425  "r027"
-                              Name 429  "r028"
-                              Name 432  "r029"
-                              Name 435  "r030"
-                              Name 440  "r031"
-                              Name 445  "r032"
-                              Name 447  "r033"
-                              Name 450  "r035"
-                              Name 454  "r036"
-                              Name 457  "r037"
-                              Name 459  "ResType"
-                              Name 463  "r038"
-                              Name 467  "r039"
-                              Name 470  "r040"
-                              Name 473  "r041"
-                              Name 477  "r039a"
-                              Name 482  "r042"
-                              Name 485  "r043"
-                              Name 488  "r044"
-                              Name 492  "r045"
-                              Name 495  "r046"
-                              Name 499  "r047"
-                              Name 503  "r048"
-                              Name 506  "r049"
-                              Name 510  "r050"
-                              Name 513  "r051"
-                              Name 517  "r052"
-                              Name 521  "r053"
-                              Name 526  "r054"
-                              Name 531  "r055"
-                              Name 534  "r056"
-                              Name 537  "r057"
-                              Name 542  "r058"
-                              Name 545  "r059"
-                              Name 552  "r060"
-                              Name 555  "r061"
-                              Name 560  "r062"
-                              Name 563  "r063"
-                              Name 567  "r064"
-                              Name 570  "r065"
-                              Name 573  "r066"
-                              Name 579  "r000"
-                              Name 582  "r001"
-                              Name 585  "r002"
-                              Name 588  "r003"
-                              Name 591  "r004"
-                              Name 596  "r005"
-                              Name 599  "r006"
-                              Name 602  "r007"
-                              Name 605  "r009"
-                              Name 608  "r010"
-                              Name 612  "r011"
-                              Name 615  "r012"
-                              Name 628  "r013"
-                              Name 631  "r014"
-                              Name 634  "r015"
-                              Name 639  "r016"
-                              Name 643  "r017"
-                              Name 646  "r018"
-                              Name 649  "r019"
-                              Name 652  "r020"
-                              Name 655  "r021"
-                              Name 658  "r022"
-                              Name 661  "r023"
-                              Name 664  "r024"
-                              Name 668  "r025"
-                              Name 672  "r029"
-                              Name 675  "r030"
-                              Name 678  "r031"
-                              Name 683  "r032"
-                              Name 687  "r033"
-                              Name 689  "r034"
-                              Name 692  "r036"
-                              Name 696  "r037"
-                              Name 699  "r038"
-                              Name 701  "ResType"
-                              Name 705  "r039"
-                              Name 709  "r040"
-                              Name 712  "r041"
-                              Name 715  "r042"
-                              Name 719  "r039a"
-                              Name 724  "r039b"
-                              Name 730  "r043"
-                              Name 733  "r044"
-                              Name 736  "r045"
-                              Name 740  "r046"
-                              Name 743  "r047"
-                              Name 747  "r048"
-                              Name 751  "r049"
-                              Name 754  "r050"
-                              Name 758  "r051"
-                              Name 761  "r052"
-                              Name 765  "r053"
-                              Name 769  "r054"
-                              Name 773  "r055"
-                              Name 776  "r056"
-                              Name 779  "r057"
-                              Name 782  "r058"
-                              Name 787  "r059"
-                              Name 790  "r060"
-                              Name 797  "r061"
-                              Name 800  "r062"
-                              Name 805  "r063"
-                              Name 808  "r064"
-                              Name 812  "r065"
-                              Name 815  "r066"
-                              Name 818  "r067"
-                              Name 825  "r000"
-                              Name 828  "r001"
-                              Name 831  "r002"
-                              Name 834  "r003"
-                              Name 837  "r004"
-                              Name 842  "r005"
-                              Name 845  "r006"
-                              Name 848  "r007"
-                              Name 851  "r009"
-                              Name 854  "r010"
-                              Name 858  "r011"
-                              Name 861  "r012"
-                              Name 874  "r013"
-                              Name 877  "r014"
-                              Name 880  "r015"
-                              Name 883  "r016"
-                              Name 886  "r017"
-                              Name 889  "r018"
-                              Name 892  "r019"
-                              Name 895  "r020"
-                              Name 898  "r021"
-                              Name 901  "r022"
-                              Name 904  "r023"
-                              Name 908  "r024"
-                              Name 912  "r025"
-                              Name 923  "r029"
-                              Name 926  "r030"
-                              Name 929  "r031"
-                              Name 934  "r032"
-                              Name 939  "r033"
-                              Name 941  "r034"
-                              Name 944  "r036"
-                              Name 948  "r037"
-                              Name 951  "r038"
-                              Name 953  "ResType"
-                              Name 957  "r039"
-                              Name 961  "r040"
-                              Name 964  "r041"
-                              Name 967  "r042"
-                              Name 971  "r039a"
-                              Name 976  "r043"
-                              Name 979  "r044"
-                              Name 982  "r045"
-                              Name 986  "r046"
-                              Name 989  "r047"
-                              Name 993  "r048"
-                              Name 997  "r049"
-                              Name 1000  "r050"
-                              Name 1004  "r051"
-                              Name 1007  "r052"
-                              Name 1011  "r053"
-                              Name 1015  "r054"
-                              Name 1019  "r055"
-                              Name 1022  "r056"
-                              Name 1025  "r057"
-                              Name 1028  "r058"
-                              Name 1033  "r059"
-                              Name 1036  "r060"
-                              Name 1043  "r061"
-                              Name 1046  "r062"
-                              Name 1051  "r063"
-                              Name 1054  "r064"
-                              Name 1058  "r065"
-                              Name 1061  "r066"
-                              Name 1064  "r067"
-                              Name 1071  "r000"
-                              Name 1074  "r001"
-                              Name 1079  "r003"
-                              Name 1082  "r004"
-                              Name 1085  "r005"
-                              Name 1088  "r006"
-                              Name 1092  "r007"
-                              Name 1103  "r008"
-                              Name 1108  "r009"
-                              Name 1111  "r010"
-                              Name 1114  "r011"
-                              Name 1117  "r012"
-                              Name 1120  "r013"
-                              Name 1123  "r014"
-                              Name 1126  "r015"
-                              Name 1129  "r016"
-                              Name 1132  "r017"
-                              Name 1135  "r018"
-                              Name 1138  "r019"
-                              Name 1141  "R020"
-                              Name 1144  "r021"
-                              Name 1147  "r022"
-                              Name 1157  "r023"
-                              Name 1160  "r024"
-                              Name 1162  "ResType"
-                              Name 1166  "r025"
-                              Name 1169  "r026"
-                              Name 1173  "r026a"
-                              Name 1178  "r027"
-                              Name 1181  "r028"
-                              Name 1185  "r029"
-                              Name 1188  "r030"
-                              Name 1192  "r031"
-                              Name 1196  "r032"
-                              Name 1200  "r033"
-                              Name 1203  "r034"
-                              Name 1206  "r035"
-                              Name 1209  "r036"
-                              Name 1214  "r037"
-                              Name 1217  "r038"
-                              Name 1224  "r039"
-                              Name 1227  "r049"
-                              Name 1232  "r041"
-                              Name 1235  "r042"
-                              Name 1239  "r043"
-                              Name 1242  "r044"
-                              Name 1247  "r046"
-                              Name 1254  "r000"
-                              Name 1257  "r001"
-                              Name 1262  "r003"
-                              Name 1265  "r004"
-                              Name 1268  "r005"
-                              Name 1271  "r006"
-                              Name 1275  "r007"
-                              Name 1286  "r008"
-                              Name 1291  "r009"
-                              Name 1294  "r010"
-                              Name 1297  "r011"
-                              Name 1300  "r012"
-                              Name 1303  "r013"
-                              Name 1306  "r014"
-                              Name 1309  "r015"
-                              Name 1312  "r016"
-                              Name 1315  "r017"
-                              Name 1318  "r018"
-                              Name 1321  "r019"
-                              Name 1324  "R020"
-                              Name 1327  "r021"
-                              Name 1330  "r022"
-                              Name 1343  "r023"
-                              Name 1346  "r024"
-                              Name 1348  "ResType"
-                              Name 1352  "r025"
-                              Name 1355  "r026"
-                              Name 1359  "r026a"
-                              Name 1364  "r027"
-                              Name 1367  "r028"
-                              Name 1371  "r029"
-                              Name 1374  "r030"
-                              Name 1378  "r031"
-                              Name 1382  "r032"
-                              Name 1386  "r033"
-                              Name 1389  "r034"
-                              Name 1392  "r035"
-                              Name 1395  "r036"
-                              Name 1400  "r037"
-                              Name 1403  "r038"
-                              Name 1410  "r039"
-                              Name 1413  "r049"
-                              Name 1418  "r041"
-                              Name 1421  "r042"
-                              Name 1425  "r043"
-                              Name 1428  "r044"
-                              Name 1433  "r046"
-                              Name 1440  "r000"
-                              Name 1443  "r001"
-                              Name 1448  "r003"
-                              Name 1451  "r004"
-                              Name 1454  "r005"
-                              Name 1457  "r006"
-                              Name 1461  "r007"
-                              Name 1472  "r008"
-                              Name 1477  "r009"
-                              Name 1480  "r010"
-                              Name 1483  "r011"
-                              Name 1486  "r012"
-                              Name 1489  "r013"
-                              Name 1492  "r014"
-                              Name 1495  "r015"
-                              Name 1498  "r016"
-                              Name 1501  "r017"
-                              Name 1504  "r018"
-                              Name 1507  "r019"
-                              Name 1510  "R020"
-                              Name 1513  "r021"
-                              Name 1516  "r022"
-                              Name 1532  "r023"
-                              Name 1535  "r024"
-                              Name 1537  "ResType"
-                              Name 1541  "r025"
-                              Name 1544  "r026"
-                              Name 1548  "r026a"
-                              Name 1553  "r027"
-                              Name 1556  "r028"
-                              Name 1560  "r029"
-                              Name 1563  "r030"
-                              Name 1567  "r031"
-                              Name 1571  "r032"
-                              Name 1575  "r033"
-                              Name 1578  "r034"
-                              Name 1581  "r035"
-                              Name 1584  "r036"
-                              Name 1589  "r037"
-                              Name 1592  "r038"
-                              Name 1599  "r039"
-                              Name 1602  "r049"
-                              Name 1607  "r041"
-                              Name 1610  "r042"
-                              Name 1614  "r043"
-                              Name 1617  "r044"
-                              Name 1622  "r046"
-                              Name 1629  "r0"
-                              Name 1633  "r1"
-                              Name 1637  "r2"
-                              Name 1641  "r3"
-                              Name 1645  "r4"
-                              Name 1649  "r5"
-                              Name 1653  "r6"
-                              Name 1657  "r7"
-                              Name 1661  "r8"
-                              Name 1665  "r0"
-                              Name 1669  "r1"
-                              Name 1673  "r2"
-                              Name 1677  "r3"
-                              Name 1681  "r4"
-                              Name 1685  "r5"
-                              Name 1689  "r6"
-                              Name 1693  "r7"
-                              Name 1697  "r8"
-                              Name 1701  "r0"
-                              Name 1705  "r1"
-                              Name 1709  "r2"
-                              Name 1713  "r3"
-                              Name 1717  "r4"
-                              Name 1721  "r5"
-                              Name 1725  "r6"
-                              Name 1729  "r7"
-                              Name 1733  "r8"
-                              Name 1737  "r00"
-                              Name 1741  "r01"
-                              Name 1745  "r02"
-                              Name 1749  "r03"
-                              Name 1753  "r04"
-                              Name 1757  "r05"
-                              Name 1761  "r06"
-                              Name 1765  "r07"
-                              Name 1769  "r08"
-                              Name 1773  "r09"
-                              Name 1777  "r10"
-                              Name 1781  "r11"
-                              Name 1785  "r12"
-                              Name 1789  "r13"
-                              Name 1793  "r14"
-                              Name 1797  "r15"
-                              Name 1801  "r16"
-                              Name 1806  "ps_output"
-                              Name 1814  "color"
-                              Name 1818  "gs_ua"
-                              Name 1819  "gs_ub"
-                              Name 1820  "gs_uc"
-                              Name 1822  "gs_ua2"
-                              Name 1823  "gs_ub2"
-                              Name 1824  "gs_uc2"
-                              Name 1826  "gs_ua3"
-                              Name 1827  "gs_ub3"
-                              Name 1828  "gs_uc3"
-                              Name 1830  "gs_ua4"
-                              Name 1831  "gs_ub4"
-                              Name 1832  "gs_uc4"
-                              Decorate 1814(color) Location 0
+                              Name 236  "r036"
+                              Name 239  "r037"
+                              Name 242  "r038"
+                              Name 245  "r039"
+                              Name 249  "r039a"
+                              Name 254  "r040"
+                              Name 257  "r041"
+                              Name 262  "r042"
+                              Name 265  "r043"
+                              Name 269  "r044"
+                              Name 273  "r045"
+                              Name 277  "r046"
+                              Name 280  "r047"
+                              Name 284  "r048"
+                              Name 288  "r049"
+                              Name 291  "r050"
+                              Name 294  "r051"
+                              Name 297  "r052"
+                              Name 300  "r053"
+                              Name 307  "r055"
+                              Name 310  "r056"
+                              Name 315  "r057"
+                              Name 318  "r058"
+                              Name 322  "r059"
+                              Name 325  "r060"
+                              Name 328  "r061"
+                              Name 335  "r000"
+                              Name 338  "r001"
+                              Name 341  "r002"
+                              Name 344  "r003"
+                              Name 347  "r004"
+                              Name 352  "r005"
+                              Name 355  "r006"
+                              Name 358  "r007"
+                              Name 361  "r009"
+                              Name 364  "r010"
+                              Name 368  "r011"
+                              Name 371  "r012"
+                              Name 384  "r013"
+                              Name 387  "r015"
+                              Name 390  "r016"
+                              Name 394  "r017"
+                              Name 397  "r018"
+                              Name 400  "r019"
+                              Name 403  "r020"
+                              Name 406  "r021"
+                              Name 409  "r022"
+                              Name 412  "r023"
+                              Name 415  "r026"
+                              Name 419  "r027"
+                              Name 423  "r028"
+                              Name 426  "r029"
+                              Name 429  "r030"
+                              Name 434  "r031"
+                              Name 439  "r032"
+                              Name 441  "r033"
+                              Name 444  "r035"
+                              Name 448  "r036"
+                              Name 451  "r038"
+                              Name 455  "r039"
+                              Name 458  "r040"
+                              Name 461  "r041"
+                              Name 465  "r039a"
+                              Name 470  "r042"
+                              Name 473  "r043"
+                              Name 476  "r044"
+                              Name 480  "r045"
+                              Name 483  "r046"
+                              Name 487  "r047"
+                              Name 491  "r048"
+                              Name 494  "r049"
+                              Name 498  "r050"
+                              Name 501  "r051"
+                              Name 505  "r052"
+                              Name 509  "r053"
+                              Name 514  "r054"
+                              Name 519  "r055"
+                              Name 522  "r056"
+                              Name 525  "r057"
+                              Name 530  "r058"
+                              Name 533  "r059"
+                              Name 540  "r060"
+                              Name 543  "r061"
+                              Name 548  "r062"
+                              Name 551  "r063"
+                              Name 555  "r064"
+                              Name 558  "r065"
+                              Name 561  "r066"
+                              Name 567  "r000"
+                              Name 570  "r001"
+                              Name 573  "r002"
+                              Name 576  "r003"
+                              Name 579  "r004"
+                              Name 584  "r005"
+                              Name 587  "r006"
+                              Name 590  "r007"
+                              Name 593  "r009"
+                              Name 596  "r010"
+                              Name 600  "r011"
+                              Name 603  "r012"
+                              Name 616  "r013"
+                              Name 619  "r014"
+                              Name 622  "r015"
+                              Name 627  "r016"
+                              Name 631  "r017"
+                              Name 634  "r018"
+                              Name 637  "r019"
+                              Name 640  "r020"
+                              Name 643  "r021"
+                              Name 646  "r022"
+                              Name 649  "r023"
+                              Name 652  "r024"
+                              Name 656  "r025"
+                              Name 660  "r029"
+                              Name 663  "r030"
+                              Name 666  "r031"
+                              Name 671  "r032"
+                              Name 675  "r033"
+                              Name 677  "r034"
+                              Name 680  "r036"
+                              Name 684  "r037"
+                              Name 687  "r039"
+                              Name 691  "r040"
+                              Name 694  "r041"
+                              Name 697  "r042"
+                              Name 701  "r039a"
+                              Name 706  "r039b"
+                              Name 712  "r043"
+                              Name 715  "r044"
+                              Name 718  "r045"
+                              Name 722  "r046"
+                              Name 725  "r047"
+                              Name 729  "r048"
+                              Name 733  "r049"
+                              Name 736  "r050"
+                              Name 740  "r051"
+                              Name 743  "r052"
+                              Name 747  "r053"
+                              Name 751  "r054"
+                              Name 755  "r055"
+                              Name 758  "r056"
+                              Name 761  "r057"
+                              Name 764  "r058"
+                              Name 769  "r059"
+                              Name 772  "r060"
+                              Name 779  "r061"
+                              Name 782  "r062"
+                              Name 787  "r063"
+                              Name 790  "r064"
+                              Name 794  "r065"
+                              Name 797  "r066"
+                              Name 800  "r067"
+                              Name 807  "r000"
+                              Name 810  "r001"
+                              Name 813  "r002"
+                              Name 816  "r003"
+                              Name 819  "r004"
+                              Name 824  "r005"
+                              Name 827  "r006"
+                              Name 830  "r007"
+                              Name 833  "r009"
+                              Name 836  "r010"
+                              Name 840  "r011"
+                              Name 843  "r012"
+                              Name 856  "r013"
+                              Name 859  "r014"
+                              Name 862  "r015"
+                              Name 865  "r016"
+                              Name 868  "r017"
+                              Name 871  "r018"
+                              Name 874  "r019"
+                              Name 877  "r020"
+                              Name 880  "r021"
+                              Name 883  "r022"
+                              Name 886  "r023"
+                              Name 890  "r024"
+                              Name 894  "r025"
+                              Name 905  "r029"
+                              Name 908  "r030"
+                              Name 911  "r031"
+                              Name 916  "r032"
+                              Name 921  "r033"
+                              Name 923  "r034"
+                              Name 926  "r036"
+                              Name 930  "r037"
+                              Name 933  "r039"
+                              Name 937  "r040"
+                              Name 940  "r041"
+                              Name 943  "r042"
+                              Name 947  "r039a"
+                              Name 952  "r043"
+                              Name 955  "r044"
+                              Name 958  "r045"
+                              Name 962  "r046"
+                              Name 965  "r047"
+                              Name 969  "r048"
+                              Name 973  "r049"
+                              Name 976  "r050"
+                              Name 980  "r051"
+                              Name 983  "r052"
+                              Name 987  "r053"
+                              Name 991  "r054"
+                              Name 995  "r055"
+                              Name 998  "r056"
+                              Name 1001  "r057"
+                              Name 1004  "r058"
+                              Name 1009  "r059"
+                              Name 1012  "r060"
+                              Name 1019  "r061"
+                              Name 1022  "r062"
+                              Name 1027  "r063"
+                              Name 1030  "r064"
+                              Name 1034  "r065"
+                              Name 1037  "r066"
+                              Name 1040  "r067"
+                              Name 1047  "r000"
+                              Name 1050  "r001"
+                              Name 1055  "r003"
+                              Name 1058  "r004"
+                              Name 1061  "r005"
+                              Name 1064  "r006"
+                              Name 1068  "r007"
+                              Name 1079  "r008"
+                              Name 1084  "r009"
+                              Name 1087  "r010"
+                              Name 1090  "r011"
+                              Name 1093  "r012"
+                              Name 1096  "r013"
+                              Name 1099  "r014"
+                              Name 1102  "r015"
+                              Name 1105  "r016"
+                              Name 1108  "r017"
+                              Name 1111  "r018"
+                              Name 1114  "r019"
+                              Name 1117  "R020"
+                              Name 1120  "r021"
+                              Name 1123  "r022"
+                              Name 1133  "r023"
+                              Name 1136  "r025"
+                              Name 1139  "r026"
+                              Name 1143  "r026a"
+                              Name 1148  "r027"
+                              Name 1151  "r028"
+                              Name 1155  "r029"
+                              Name 1158  "r030"
+                              Name 1162  "r031"
+                              Name 1166  "r032"
+                              Name 1170  "r033"
+                              Name 1173  "r034"
+                              Name 1176  "r035"
+                              Name 1179  "r036"
+                              Name 1184  "r037"
+                              Name 1187  "r038"
+                              Name 1194  "r039"
+                              Name 1197  "r049"
+                              Name 1202  "r041"
+                              Name 1205  "r042"
+                              Name 1209  "r043"
+                              Name 1212  "r044"
+                              Name 1217  "r046"
+                              Name 1224  "r000"
+                              Name 1227  "r001"
+                              Name 1232  "r003"
+                              Name 1235  "r004"
+                              Name 1238  "r005"
+                              Name 1241  "r006"
+                              Name 1245  "r007"
+                              Name 1256  "r008"
+                              Name 1261  "r009"
+                              Name 1264  "r010"
+                              Name 1267  "r011"
+                              Name 1270  "r012"
+                              Name 1273  "r013"
+                              Name 1276  "r014"
+                              Name 1279  "r015"
+                              Name 1282  "r016"
+                              Name 1285  "r017"
+                              Name 1288  "r018"
+                              Name 1291  "r019"
+                              Name 1294  "R020"
+                              Name 1297  "r021"
+                              Name 1300  "r022"
+                              Name 1313  "r023"
+                              Name 1316  "r025"
+                              Name 1319  "r026"
+                              Name 1323  "r026a"
+                              Name 1328  "r027"
+                              Name 1331  "r028"
+                              Name 1335  "r029"
+                              Name 1338  "r030"
+                              Name 1342  "r031"
+                              Name 1346  "r032"
+                              Name 1350  "r033"
+                              Name 1353  "r034"
+                              Name 1356  "r035"
+                              Name 1359  "r036"
+                              Name 1364  "r037"
+                              Name 1367  "r038"
+                              Name 1374  "r039"
+                              Name 1377  "r049"
+                              Name 1382  "r041"
+                              Name 1385  "r042"
+                              Name 1389  "r043"
+                              Name 1392  "r044"
+                              Name 1397  "r046"
+                              Name 1404  "r000"
+                              Name 1407  "r001"
+                              Name 1412  "r003"
+                              Name 1415  "r004"
+                              Name 1418  "r005"
+                              Name 1421  "r006"
+                              Name 1425  "r007"
+                              Name 1436  "r008"
+                              Name 1441  "r009"
+                              Name 1444  "r010"
+                              Name 1447  "r011"
+                              Name 1450  "r012"
+                              Name 1453  "r013"
+                              Name 1456  "r014"
+                              Name 1459  "r015"
+                              Name 1462  "r016"
+                              Name 1465  "r017"
+                              Name 1468  "r018"
+                              Name 1471  "r019"
+                              Name 1474  "R020"
+                              Name 1477  "r021"
+                              Name 1480  "r022"
+                              Name 1496  "r023"
+                              Name 1499  "r025"
+                              Name 1502  "r026"
+                              Name 1506  "r026a"
+                              Name 1511  "r027"
+                              Name 1514  "r028"
+                              Name 1518  "r029"
+                              Name 1521  "r030"
+                              Name 1525  "r031"
+                              Name 1529  "r032"
+                              Name 1533  "r033"
+                              Name 1536  "r034"
+                              Name 1539  "r035"
+                              Name 1542  "r036"
+                              Name 1547  "r037"
+                              Name 1550  "r038"
+                              Name 1557  "r039"
+                              Name 1560  "r049"
+                              Name 1565  "r041"
+                              Name 1568  "r042"
+                              Name 1572  "r043"
+                              Name 1575  "r044"
+                              Name 1580  "r046"
+                              Name 1587  "r0"
+                              Name 1591  "r1"
+                              Name 1595  "r2"
+                              Name 1599  "r3"
+                              Name 1603  "r4"
+                              Name 1607  "r5"
+                              Name 1611  "r6"
+                              Name 1615  "r7"
+                              Name 1619  "r8"
+                              Name 1623  "r0"
+                              Name 1627  "r1"
+                              Name 1631  "r2"
+                              Name 1635  "r3"
+                              Name 1639  "r4"
+                              Name 1643  "r5"
+                              Name 1647  "r6"
+                              Name 1651  "r7"
+                              Name 1655  "r8"
+                              Name 1659  "r0"
+                              Name 1663  "r1"
+                              Name 1667  "r2"
+                              Name 1671  "r3"
+                              Name 1675  "r4"
+                              Name 1679  "r5"
+                              Name 1683  "r6"
+                              Name 1687  "r7"
+                              Name 1691  "r8"
+                              Name 1695  "r00"
+                              Name 1699  "r01"
+                              Name 1703  "r02"
+                              Name 1707  "r03"
+                              Name 1711  "r04"
+                              Name 1715  "r05"
+                              Name 1719  "r06"
+                              Name 1723  "r07"
+                              Name 1727  "r08"
+                              Name 1731  "r09"
+                              Name 1735  "r10"
+                              Name 1739  "r11"
+                              Name 1743  "r12"
+                              Name 1747  "r13"
+                              Name 1751  "r14"
+                              Name 1755  "r15"
+                              Name 1759  "r16"
+                              Name 1764  "ps_output"
+                              Name 1772  "color"
+                              Name 1776  "gs_ua"
+                              Name 1777  "gs_ub"
+                              Name 1778  "gs_uc"
+                              Name 1780  "gs_ua2"
+                              Name 1781  "gs_ub2"
+                              Name 1782  "gs_uc2"
+                              Name 1784  "gs_ua3"
+                              Name 1785  "gs_ub3"
+                              Name 1786  "gs_uc3"
+                              Name 1788  "gs_ua4"
+                              Name 1789  "gs_ub4"
+                              Name 1790  "gs_uc4"
+                              Decorate 1772(color) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -6194,95 +6097,88 @@ gl_FragCoord origin is upper left
              153:             TypePointer Function 152(int)
              179:    6(float) Constant 0
              191:    152(int) Constant 7
-    238(ResType):             TypeStruct 6(float) 152(int)
-             266:    6(float) Constant 1050288283
-             287:    6(float) Constant 1065353216
-             291:    152(int) Constant 2
-             356:             TypeVector 152(int) 2
-             357:             TypePointer Function 356(ivec2)
-             383:   24(fvec2) ConstantComposite 179 179
-             384:             TypeVector 135(bool) 2
-             397:    152(int) Constant 3
-             398:  356(ivec2) ConstantComposite 191 397
-             441:      8(int) Constant 7
-             442:      8(int) Constant 8
-             443:   26(ivec2) ConstantComposite 441 442
-    459(ResType):             TypeStruct 24(fvec2) 356(ivec2)
-             466:             TypePointer Function 384(bvec2)
-             524:    6(float) Constant 1073741824
-             527:      8(int) Constant 1
-             528:      8(int) Constant 2
-             529:   26(ivec2) ConstantComposite 527 528
-             576:   24(fvec2) ConstantComposite 287 524
-             594:             TypeVector 152(int) 3
-             595:             TypePointer Function 594(ivec3)
-             621:   36(fvec3) ConstantComposite 179 179 179
-             622:             TypeVector 135(bool) 3
-             635:      8(int) Constant 3
-             636:      8(int) Constant 5
-             637:   38(ivec3) ConstantComposite 441 635 636
-             684:      8(int) Constant 4
-             685:   38(ivec3) ConstantComposite 528 635 684
-    701(ResType):             TypeStruct 36(fvec3) 594(ivec3)
-             708:             TypePointer Function 622(bvec3)
-             727:    6(float) Constant 1050253722
-             774:   38(ivec3) ConstantComposite 527 528 635
-             821:    6(float) Constant 1077936128
-             822:   36(fvec3) ConstantComposite 287 524 821
-             840:             TypeVector 152(int) 4
-             841:             TypePointer Function 840(ivec4)
-             867:   48(fvec4) ConstantComposite 179 179 179 179
-             868:             TypeVector 135(bool) 4
-             881:   50(ivec4) ConstantComposite 441 635 636 528
-             935:      8(int) Constant 9
-             936:      8(int) Constant 10
-             937:   50(ivec4) ConstantComposite 441 442 935 936
-    953(ResType):             TypeStruct 48(fvec4) 840(ivec4)
-             960:             TypePointer Function 868(bvec4)
-            1020:   50(ivec4) ConstantComposite 527 528 635 684
-            1067:    6(float) Constant 1082130432
-            1068:   48(fvec4) ConstantComposite 287 524 821 1067
-            1096:          60 ConstantComposite 383 383
-            1097:             TypeMatrix 384(bvec2) 2
-   1162(ResType):             TypeStruct 60 356(ivec2)
-            1250:   24(fvec2) ConstantComposite 524 524
-            1251:          60 ConstantComposite 1250 1250
-            1279:          68 ConstantComposite 621 621 621
-            1280:             TypeMatrix 622(bvec3) 3
-   1348(ResType):             TypeStruct 68 594(ivec3)
-            1436:   36(fvec3) ConstantComposite 821 821 821
-            1437:          68 ConstantComposite 1436 1436 1436
-            1465:          76 ConstantComposite 867 867 867 867
-            1466:             TypeMatrix 868(bvec4) 4
-   1537(ResType):             TypeStruct 76 840(ivec4)
-            1625:   48(fvec4) ConstantComposite 1067 1067 1067 1067
-            1626:          76 ConstantComposite 1625 1625 1625 1625
-            1805:             TypePointer Function 131(PS_OUTPUT)
-            1807:    152(int) Constant 0
-            1808:   48(fvec4) ConstantComposite 287 287 287 287
-            1813:             TypePointer Output 48(fvec4)
-     1814(color):   1813(ptr) Variable Output
-            1817:             TypePointer Workgroup 8(int)
-     1818(gs_ua):   1817(ptr) Variable Workgroup
-     1819(gs_ub):   1817(ptr) Variable Workgroup
-     1820(gs_uc):   1817(ptr) Variable Workgroup
-            1821:             TypePointer Workgroup 26(ivec2)
-    1822(gs_ua2):   1821(ptr) Variable Workgroup
-    1823(gs_ub2):   1821(ptr) Variable Workgroup
-    1824(gs_uc2):   1821(ptr) Variable Workgroup
-            1825:             TypePointer Workgroup 38(ivec3)
-    1826(gs_ua3):   1825(ptr) Variable Workgroup
-    1827(gs_ub3):   1825(ptr) Variable Workgroup
-    1828(gs_uc3):   1825(ptr) Variable Workgroup
-            1829:             TypePointer Workgroup 50(ivec4)
-    1830(gs_ua4):   1829(ptr) Variable Workgroup
-    1831(gs_ub4):   1829(ptr) Variable Workgroup
-    1832(gs_uc4):   1829(ptr) Variable Workgroup
+             260:    6(float) Constant 1050288283
+             281:    6(float) Constant 1065353216
+             285:    152(int) Constant 2
+             350:             TypeVector 152(int) 2
+             351:             TypePointer Function 350(ivec2)
+             377:   24(fvec2) ConstantComposite 179 179
+             378:             TypeVector 135(bool) 2
+             391:    152(int) Constant 3
+             392:  350(ivec2) ConstantComposite 191 391
+             435:      8(int) Constant 7
+             436:      8(int) Constant 8
+             437:   26(ivec2) ConstantComposite 435 436
+             454:             TypePointer Function 378(bvec2)
+             512:    6(float) Constant 1073741824
+             515:      8(int) Constant 1
+             516:      8(int) Constant 2
+             517:   26(ivec2) ConstantComposite 515 516
+             564:   24(fvec2) ConstantComposite 281 512
+             582:             TypeVector 152(int) 3
+             583:             TypePointer Function 582(ivec3)
+             609:   36(fvec3) ConstantComposite 179 179 179
+             610:             TypeVector 135(bool) 3
+             623:      8(int) Constant 3
+             624:      8(int) Constant 5
+             625:   38(ivec3) ConstantComposite 435 623 624
+             672:      8(int) Constant 4
+             673:   38(ivec3) ConstantComposite 516 623 672
+             690:             TypePointer Function 610(bvec3)
+             709:    6(float) Constant 1050253722
+             756:   38(ivec3) ConstantComposite 515 516 623
+             803:    6(float) Constant 1077936128
+             804:   36(fvec3) ConstantComposite 281 512 803
+             822:             TypeVector 152(int) 4
+             823:             TypePointer Function 822(ivec4)
+             849:   48(fvec4) ConstantComposite 179 179 179 179
+             850:             TypeVector 135(bool) 4
+             863:   50(ivec4) ConstantComposite 435 623 624 516
+             917:      8(int) Constant 9
+             918:      8(int) Constant 10
+             919:   50(ivec4) ConstantComposite 435 436 917 918
+             936:             TypePointer Function 850(bvec4)
+             996:   50(ivec4) ConstantComposite 515 516 623 672
+            1043:    6(float) Constant 1082130432
+            1044:   48(fvec4) ConstantComposite 281 512 803 1043
+            1072:          60 ConstantComposite 377 377
+            1073:             TypeMatrix 378(bvec2) 2
+            1220:   24(fvec2) ConstantComposite 512 512
+            1221:          60 ConstantComposite 1220 1220
+            1249:          68 ConstantComposite 609 609 609
+            1250:             TypeMatrix 610(bvec3) 3
+            1400:   36(fvec3) ConstantComposite 803 803 803
+            1401:          68 ConstantComposite 1400 1400 1400
+            1429:          76 ConstantComposite 849 849 849 849
+            1430:             TypeMatrix 850(bvec4) 4
+            1583:   48(fvec4) ConstantComposite 1043 1043 1043 1043
+            1584:          76 ConstantComposite 1583 1583 1583 1583
+            1763:             TypePointer Function 131(PS_OUTPUT)
+            1765:    152(int) Constant 0
+            1766:   48(fvec4) ConstantComposite 281 281 281 281
+            1771:             TypePointer Output 48(fvec4)
+     1772(color):   1771(ptr) Variable Output
+            1775:             TypePointer Workgroup 8(int)
+     1776(gs_ua):   1775(ptr) Variable Workgroup
+     1777(gs_ub):   1775(ptr) Variable Workgroup
+     1778(gs_uc):   1775(ptr) Variable Workgroup
+            1779:             TypePointer Workgroup 26(ivec2)
+    1780(gs_ua2):   1779(ptr) Variable Workgroup
+    1781(gs_ub2):   1779(ptr) Variable Workgroup
+    1782(gs_uc2):   1779(ptr) Variable Workgroup
+            1783:             TypePointer Workgroup 38(ivec3)
+    1784(gs_ua3):   1783(ptr) Variable Workgroup
+    1785(gs_ub3):   1783(ptr) Variable Workgroup
+    1786(gs_uc3):   1783(ptr) Variable Workgroup
+            1787:             TypePointer Workgroup 50(ivec4)
+    1788(gs_ua4):   1787(ptr) Variable Workgroup
+    1789(gs_ub4):   1787(ptr) Variable Workgroup
+    1790(gs_uc4):   1787(ptr) Variable Workgroup
          4(main):           2 Function None 3
                5:             Label
-            1815:131(PS_OUTPUT) FunctionCall 133(@main()
-            1816:   48(fvec4) CompositeExtract 1815 0
-                              Store 1814(color) 1816
+            1773:131(PS_OUTPUT) FunctionCall 133(@main()
+            1774:   48(fvec4) CompositeExtract 1773 0
+                              Store 1772(color) 1774
                               Return
                               FunctionEnd
 16(PixelShaderFunctionS(f1;f1;f1;u1;u1;):    6(float) Function None 10
@@ -6321,33 +6217,32 @@ gl_FragCoord origin is upper left
        226(r031):      7(ptr) Variable Function
        229(r033):      7(ptr) Variable Function
        233(r034):      7(ptr) Variable Function
-       236(r035):      7(ptr) Variable Function
-       242(r036):      7(ptr) Variable Function
-       245(r037):    136(ptr) Variable Function
-       248(r038):    136(ptr) Variable Function
-       251(r039):      7(ptr) Variable Function
-      255(r039a):      7(ptr) Variable Function
-       260(r040):      7(ptr) Variable Function
-       263(r041):      7(ptr) Variable Function
-       268(r042):      7(ptr) Variable Function
-       271(r043):      7(ptr) Variable Function
-       275(r044):      7(ptr) Variable Function
-       279(r045):      7(ptr) Variable Function
-       283(r046):      7(ptr) Variable Function
-       286(r047):      7(ptr) Variable Function
-       290(r048):      9(ptr) Variable Function
-       294(r049):      7(ptr) Variable Function
-       297(r050):      7(ptr) Variable Function
-       300(r051):      7(ptr) Variable Function
-       303(r052):      7(ptr) Variable Function
-       306(r053):      7(ptr) Variable Function
-       313(r055):      7(ptr) Variable Function
-       316(r056):      7(ptr) Variable Function
-       321(r057):      7(ptr) Variable Function
-       324(r058):      7(ptr) Variable Function
-       328(r059):      7(ptr) Variable Function
-       331(r060):      7(ptr) Variable Function
-       334(r061):      7(ptr) Variable Function
+       236(r036):      7(ptr) Variable Function
+       239(r037):    136(ptr) Variable Function
+       242(r038):    136(ptr) Variable Function
+       245(r039):      7(ptr) Variable Function
+      249(r039a):      7(ptr) Variable Function
+       254(r040):      7(ptr) Variable Function
+       257(r041):      7(ptr) Variable Function
+       262(r042):      7(ptr) Variable Function
+       265(r043):      7(ptr) Variable Function
+       269(r044):      7(ptr) Variable Function
+       273(r045):      7(ptr) Variable Function
+       277(r046):      7(ptr) Variable Function
+       280(r047):      7(ptr) Variable Function
+       284(r048):      9(ptr) Variable Function
+       288(r049):      7(ptr) Variable Function
+       291(r050):      7(ptr) Variable Function
+       294(r051):      7(ptr) Variable Function
+       297(r052):      7(ptr) Variable Function
+       300(r053):      7(ptr) Variable Function
+       307(r055):      7(ptr) Variable Function
+       310(r056):      7(ptr) Variable Function
+       315(r057):      7(ptr) Variable Function
+       318(r058):      7(ptr) Variable Function
+       322(r059):      7(ptr) Variable Function
+       325(r060):      7(ptr) Variable Function
+       328(r061):      7(ptr) Variable Function
              138:    6(float) Load 11(inF0)
              139:   135(bool) All 138
                               Store 137(r000) 139
@@ -6446,105 +6341,99 @@ gl_FragCoord origin is upper left
              235:    6(float) ExtInst 1(GLSL.std.450) 10(Fract) 234
                               Store 233(r034) 235
              237:    6(float) Load 11(inF0)
-             239:238(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 237
-             240:    152(int) CompositeExtract 239 1
-                              Store 12(inF1) 240
-             241:    6(float) CompositeExtract 239 0
-                              Store 236(r035) 241
+             238:    6(float) Fwidth 237
+                              Store 236(r036) 238
+             240:    6(float) Load 11(inF0)
+             241:   135(bool) IsInf 240
+                              Store 239(r037) 241
              243:    6(float) Load 11(inF0)
-             244:    6(float) Fwidth 243
-                              Store 242(r036) 244
+             244:   135(bool) IsNan 243
+                              Store 242(r038) 244
              246:    6(float) Load 11(inF0)
-             247:   135(bool) IsInf 246
-                              Store 245(r037) 247
-             249:    6(float) Load 11(inF0)
-             250:   135(bool) IsNan 249
-                              Store 248(r038) 250
-             252:    6(float) Load 11(inF0)
-             253:    6(float) Load 12(inF1)
-             254:    6(float) ExtInst 1(GLSL.std.450) 53(Ldexp) 252 253
-                              Store 251(r039) 254
-             256:    6(float) Load 11(inF0)
-             257:    6(float) Load 12(inF1)
-             258:    6(float) Load 13(inF2)
-             259:    6(float) ExtInst 1(GLSL.std.450) 46(FMix) 256 257 258
-                              Store 255(r039a) 259
-             261:    6(float) Load 11(inF0)
-             262:    6(float) ExtInst 1(GLSL.std.450) 28(Log) 261
-                              Store 260(r040) 262
-             264:    6(float) Load 11(inF0)
-             265:    6(float) ExtInst 1(GLSL.std.450) 30(Log2) 264
-             267:    6(float) FMul 265 266
-                              Store 263(r041) 267
-             269:    6(float) Load 11(inF0)
-             270:    6(float) ExtInst 1(GLSL.std.450) 30(Log2) 269
-                              Store 268(r042) 270
-             272:    6(float) Load 11(inF0)
-             273:    6(float) Load 12(inF1)
-             274:    6(float) ExtInst 1(GLSL.std.450) 40(FMax) 272 273
-                              Store 271(r043) 274
-             276:    6(float) Load 11(inF0)
-             277:    6(float) Load 12(inF1)
-             278:    6(float) ExtInst 1(GLSL.std.450) 37(FMin) 276 277
-                              Store 275(r044) 278
-             280:    6(float) Load 11(inF0)
-             281:    6(float) Load 12(inF1)
-             282:    6(float) ExtInst 1(GLSL.std.450) 26(Pow) 280 281
-                              Store 279(r045) 282
-             284:    6(float) Load 11(inF0)
-             285:    6(float) ExtInst 1(GLSL.std.450) 11(Radians) 284
-                              Store 283(r046) 285
-             288:    6(float) Load 11(inF0)
-             289:    6(float) FDiv 287 288
-                              Store 286(r047) 289
-             292:    152(int) BitReverse 291
-             293:      8(int) Bitcast 292
-                              Store 290(r048) 293
+             247:    6(float) Load 12(inF1)
+             248:    6(float) ExtInst 1(GLSL.std.450) 53(Ldexp) 246 247
+                              Store 245(r039) 248
+             250:    6(float) Load 11(inF0)
+             251:    6(float) Load 12(inF1)
+             252:    6(float) Load 13(inF2)
+             253:    6(float) ExtInst 1(GLSL.std.450) 46(FMix) 250 251 252
+                              Store 249(r039a) 253
+             255:    6(float) Load 11(inF0)
+             256:    6(float) ExtInst 1(GLSL.std.450) 28(Log) 255
+                              Store 254(r040) 256
+             258:    6(float) Load 11(inF0)
+             259:    6(float) ExtInst 1(GLSL.std.450) 30(Log2) 258
+             261:    6(float) FMul 259 260
+                              Store 257(r041) 261
+             263:    6(float) Load 11(inF0)
+             264:    6(float) ExtInst 1(GLSL.std.450) 30(Log2) 263
+                              Store 262(r042) 264
+             266:    6(float) Load 11(inF0)
+             267:    6(float) Load 12(inF1)
+             268:    6(float) ExtInst 1(GLSL.std.450) 40(FMax) 266 267
+                              Store 265(r043) 268
+             270:    6(float) Load 11(inF0)
+             271:    6(float) Load 12(inF1)
+             272:    6(float) ExtInst 1(GLSL.std.450) 37(FMin) 270 271
+                              Store 269(r044) 272
+             274:    6(float) Load 11(inF0)
+             275:    6(float) Load 12(inF1)
+             276:    6(float) ExtInst 1(GLSL.std.450) 26(Pow) 274 275
+                              Store 273(r045) 276
+             278:    6(float) Load 11(inF0)
+             279:    6(float) ExtInst 1(GLSL.std.450) 11(Radians) 278
+                              Store 277(r046) 279
+             282:    6(float) Load 11(inF0)
+             283:    6(float) FDiv 281 282
+                              Store 280(r047) 283
+             286:    152(int) BitReverse 285
+             287:      8(int) Bitcast 286
+                              Store 284(r048) 287
+             289:    6(float) Load 11(inF0)
+             290:    6(float) ExtInst 1(GLSL.std.450) 2(RoundEven) 289
+                              Store 288(r049) 290
+             292:    6(float) Load 11(inF0)
+             293:    6(float) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 292
+                              Store 291(r050) 293
              295:    6(float) Load 11(inF0)
-             296:    6(float) ExtInst 1(GLSL.std.450) 2(RoundEven) 295
-                              Store 294(r049) 296
+             296:    6(float) ExtInst 1(GLSL.std.450) 43(FClamp) 295 179 281
+                              Store 294(r051) 296
              298:    6(float) Load 11(inF0)
-             299:    6(float) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 298
-                              Store 297(r050) 299
+             299:    6(float) ExtInst 1(GLSL.std.450) 6(FSign) 298
+                              Store 297(r052) 299
              301:    6(float) Load 11(inF0)
-             302:    6(float) ExtInst 1(GLSL.std.450) 43(FClamp) 301 179 287
-                              Store 300(r051) 302
-             304:    6(float) Load 11(inF0)
-             305:    6(float) ExtInst 1(GLSL.std.450) 6(FSign) 304
-                              Store 303(r052) 305
-             307:    6(float) Load 11(inF0)
-             308:    6(float) ExtInst 1(GLSL.std.450) 13(Sin) 307
-                              Store 306(r053) 308
-             309:    6(float) Load 11(inF0)
-             310:    6(float) ExtInst 1(GLSL.std.450) 13(Sin) 309
-                              Store 12(inF1) 310
+             302:    6(float) ExtInst 1(GLSL.std.450) 13(Sin) 301
+                              Store 300(r053) 302
+             303:    6(float) Load 11(inF0)
+             304:    6(float) ExtInst 1(GLSL.std.450) 13(Sin) 303
+                              Store 12(inF1) 304
+             305:    6(float) Load 11(inF0)
+             306:    6(float) ExtInst 1(GLSL.std.450) 14(Cos) 305
+                              Store 13(inF2) 306
+             308:    6(float) Load 11(inF0)
+             309:    6(float) ExtInst 1(GLSL.std.450) 19(Sinh) 308
+                              Store 307(r055) 309
              311:    6(float) Load 11(inF0)
-             312:    6(float) ExtInst 1(GLSL.std.450) 14(Cos) 311
-                              Store 13(inF2) 312
-             314:    6(float) Load 11(inF0)
-             315:    6(float) ExtInst 1(GLSL.std.450) 19(Sinh) 314
-                              Store 313(r055) 315
-             317:    6(float) Load 11(inF0)
-             318:    6(float) Load 12(inF1)
-             319:    6(float) Load 13(inF2)
-             320:    6(float) ExtInst 1(GLSL.std.450) 49(SmoothStep) 317 318 319
-                              Store 316(r056) 320
-             322:    6(float) Load 11(inF0)
-             323:    6(float) ExtInst 1(GLSL.std.450) 31(Sqrt) 322
-                              Store 321(r057) 323
-             325:    6(float) Load 11(inF0)
-             326:    6(float) Load 12(inF1)
-             327:    6(float) ExtInst 1(GLSL.std.450) 48(Step) 325 326
-                              Store 324(r058) 327
+             312:    6(float) Load 12(inF1)
+             313:    6(float) Load 13(inF2)
+             314:    6(float) ExtInst 1(GLSL.std.450) 49(SmoothStep) 311 312 313
+                              Store 310(r056) 314
+             316:    6(float) Load 11(inF0)
+             317:    6(float) ExtInst 1(GLSL.std.450) 31(Sqrt) 316
+                              Store 315(r057) 317
+             319:    6(float) Load 11(inF0)
+             320:    6(float) Load 12(inF1)
+             321:    6(float) ExtInst 1(GLSL.std.450) 48(Step) 319 320
+                              Store 318(r058) 321
+             323:    6(float) Load 11(inF0)
+             324:    6(float) ExtInst 1(GLSL.std.450) 15(Tan) 323
+                              Store 322(r059) 324
+             326:    6(float) Load 11(inF0)
+             327:    6(float) ExtInst 1(GLSL.std.450) 21(Tanh) 326
+                              Store 325(r060) 327
              329:    6(float) Load 11(inF0)
-             330:    6(float) ExtInst 1(GLSL.std.450) 15(Tan) 329
-                              Store 328(r059) 330
-             332:    6(float) Load 11(inF0)
-             333:    6(float) ExtInst 1(GLSL.std.450) 21(Tanh) 332
-                              Store 331(r060) 333
-             335:    6(float) Load 11(inF0)
-             336:    6(float) ExtInst 1(GLSL.std.450) 3(Trunc) 335
-                              Store 334(r061) 336
+             330:    6(float) ExtInst 1(GLSL.std.450) 3(Trunc) 329
+                              Store 328(r061) 330
                               ReturnValue 179
                               FunctionEnd
 22(PixelShaderFunction1(vf1;vf1;vf1;):    6(float) Function None 18
@@ -6561,295 +6450,288 @@ gl_FragCoord origin is upper left
         32(inU0):     27(ptr) FunctionParameter
         33(inU1):     27(ptr) FunctionParameter
               35:             Label
-       341(r000):    136(ptr) Variable Function
-       344(r001):     25(ptr) Variable Function
-       347(r002):     25(ptr) Variable Function
-       350(r003):    136(ptr) Variable Function
-       353(r004):     25(ptr) Variable Function
-       358(r005):    357(ptr) Variable Function
-       361(r006):     27(ptr) Variable Function
-       364(r007):     25(ptr) Variable Function
-       367(r009):     25(ptr) Variable Function
-       370(r010):     25(ptr) Variable Function
-       374(r011):     25(ptr) Variable Function
-       377(r012):     25(ptr) Variable Function
-       390(r013):     25(ptr) Variable Function
-       393(r015):     25(ptr) Variable Function
-       396(r016):    357(ptr) Variable Function
-       400(r017):     25(ptr) Variable Function
-       403(r018):     25(ptr) Variable Function
-       406(r019):     25(ptr) Variable Function
-       409(r020):     25(ptr) Variable Function
-       412(r021):     25(ptr) Variable Function
-       415(r022):     25(ptr) Variable Function
-       418(r023):     25(ptr) Variable Function
-       421(r026):      7(ptr) Variable Function
-       425(r027):      7(ptr) Variable Function
-       429(r028):     25(ptr) Variable Function
-       432(r029):     25(ptr) Variable Function
-       435(r030):     25(ptr) Variable Function
-       440(r031):     27(ptr) Variable Function
-       445(r032):     27(ptr) Variable Function
-       447(r033):     25(ptr) Variable Function
-       450(r035):     25(ptr) Variable Function
-       454(r036):     25(ptr) Variable Function
-       457(r037):     25(ptr) Variable Function
-       463(r038):     25(ptr) Variable Function
-       467(r039):    466(ptr) Variable Function
-       470(r040):    466(ptr) Variable Function
-       473(r041):     25(ptr) Variable Function
-      477(r039a):     25(ptr) Variable Function
-       482(r042):      7(ptr) Variable Function
-       485(r043):     25(ptr) Variable Function
-       488(r044):     25(ptr) Variable Function
-       492(r045):     25(ptr) Variable Function
-       495(r046):     25(ptr) Variable Function
-       499(r047):     25(ptr) Variable Function
-       503(r048):     25(ptr) Variable Function
-       506(r049):     25(ptr) Variable Function
-       510(r050):     25(ptr) Variable Function
-       513(r051):     25(ptr) Variable Function
-       517(r052):     25(ptr) Variable Function
-       521(r053):     25(ptr) Variable Function
-       526(r054):     27(ptr) Variable Function
-       531(r055):     25(ptr) Variable Function
-       534(r056):     25(ptr) Variable Function
-       537(r057):     25(ptr) Variable Function
-       542(r058):     25(ptr) Variable Function
-       545(r059):     25(ptr) Variable Function
-       552(r060):     25(ptr) Variable Function
-       555(r061):     25(ptr) Variable Function
-       560(r062):     25(ptr) Variable Function
-       563(r063):     25(ptr) Variable Function
-       567(r064):     25(ptr) Variable Function
-       570(r065):     25(ptr) Variable Function
-       573(r066):     25(ptr) Variable Function
+       335(r000):    136(ptr) Variable Function
+       338(r001):     25(ptr) Variable Function
+       341(r002):     25(ptr) Variable Function
+       344(r003):    136(ptr) Variable Function
+       347(r004):     25(ptr) Variable Function
+       352(r005):    351(ptr) Variable Function
+       355(r006):     27(ptr) Variable Function
+       358(r007):     25(ptr) Variable Function
+       361(r009):     25(ptr) Variable Function
+       364(r010):     25(ptr) Variable Function
+       368(r011):     25(ptr) Variable Function
+       371(r012):     25(ptr) Variable Function
+       384(r013):     25(ptr) Variable Function
+       387(r015):     25(ptr) Variable Function
+       390(r016):    351(ptr) Variable Function
+       394(r017):     25(ptr) Variable Function
+       397(r018):     25(ptr) Variable Function
+       400(r019):     25(ptr) Variable Function
+       403(r020):     25(ptr) Variable Function
+       406(r021):     25(ptr) Variable Function
+       409(r022):     25(ptr) Variable Function
+       412(r023):     25(ptr) Variable Function
+       415(r026):      7(ptr) Variable Function
+       419(r027):      7(ptr) Variable Function
+       423(r028):     25(ptr) Variable Function
+       426(r029):     25(ptr) Variable Function
+       429(r030):     25(ptr) Variable Function
+       434(r031):     27(ptr) Variable Function
+       439(r032):     27(ptr) Variable Function
+       441(r033):     25(ptr) Variable Function
+       444(r035):     25(ptr) Variable Function
+       448(r036):     25(ptr) Variable Function
+       451(r038):     25(ptr) Variable Function
+       455(r039):    454(ptr) Variable Function
+       458(r040):    454(ptr) Variable Function
+       461(r041):     25(ptr) Variable Function
+      465(r039a):     25(ptr) Variable Function
+       470(r042):      7(ptr) Variable Function
+       473(r043):     25(ptr) Variable Function
+       476(r044):     25(ptr) Variable Function
+       480(r045):     25(ptr) Variable Function
+       483(r046):     25(ptr) Variable Function
+       487(r047):     25(ptr) Variable Function
+       491(r048):     25(ptr) Variable Function
+       494(r049):     25(ptr) Variable Function
+       498(r050):     25(ptr) Variable Function
+       501(r051):     25(ptr) Variable Function
+       505(r052):     25(ptr) Variable Function
+       509(r053):     25(ptr) Variable Function
+       514(r054):     27(ptr) Variable Function
+       519(r055):     25(ptr) Variable Function
+       522(r056):     25(ptr) Variable Function
+       525(r057):     25(ptr) Variable Function
+       530(r058):     25(ptr) Variable Function
+       533(r059):     25(ptr) Variable Function
+       540(r060):     25(ptr) Variable Function
+       543(r061):     25(ptr) Variable Function
+       548(r062):     25(ptr) Variable Function
+       551(r063):     25(ptr) Variable Function
+       555(r064):     25(ptr) Variable Function
+       558(r065):     25(ptr) Variable Function
+       561(r066):     25(ptr) Variable Function
+             336:   24(fvec2) Load 29(inF0)
+             337:   135(bool) All 336
+                              Store 335(r000) 337
+             339:   24(fvec2) Load 29(inF0)
+             340:   24(fvec2) ExtInst 1(GLSL.std.450) 4(FAbs) 339
+                              Store 338(r001) 340
              342:   24(fvec2) Load 29(inF0)
-             343:   135(bool) All 342
-                              Store 341(r000) 343
+             343:   24(fvec2) ExtInst 1(GLSL.std.450) 17(Acos) 342
+                              Store 341(r002) 343
              345:   24(fvec2) Load 29(inF0)
-             346:   24(fvec2) ExtInst 1(GLSL.std.450) 4(FAbs) 345
-                              Store 344(r001) 346
+             346:   135(bool) Any 345
+                              Store 344(r003) 346
              348:   24(fvec2) Load 29(inF0)
-             349:   24(fvec2) ExtInst 1(GLSL.std.450) 17(Acos) 348
-                              Store 347(r002) 349
-             351:   24(fvec2) Load 29(inF0)
-             352:   135(bool) Any 351
-                              Store 350(r003) 352
-             354:   24(fvec2) Load 29(inF0)
-             355:   24(fvec2) ExtInst 1(GLSL.std.450) 16(Asin) 354
-                              Store 353(r004) 355
-             359:   24(fvec2) Load 29(inF0)
-             360:  356(ivec2) Bitcast 359
-                              Store 358(r005) 360
+             349:   24(fvec2) ExtInst 1(GLSL.std.450) 16(Asin) 348
+                              Store 347(r004) 349
+             353:   24(fvec2) Load 29(inF0)
+             354:  350(ivec2) Bitcast 353
+                              Store 352(r005) 354
+             356:   24(fvec2) Load 29(inF0)
+             357:   26(ivec2) Bitcast 356
+                              Store 355(r006) 357
+             359:   26(ivec2) Load 32(inU0)
+             360:   24(fvec2) Bitcast 359
+                              Store 358(r007) 360
              362:   24(fvec2) Load 29(inF0)
-             363:   26(ivec2) Bitcast 362
-                              Store 361(r006) 363
-             365:   26(ivec2) Load 32(inU0)
-             366:   24(fvec2) Bitcast 365
-                              Store 364(r007) 366
-             368:   24(fvec2) Load 29(inF0)
-             369:   24(fvec2) ExtInst 1(GLSL.std.450) 18(Atan) 368
-                              Store 367(r009) 369
-             371:   24(fvec2) Load 29(inF0)
-             372:   24(fvec2) Load 30(inF1)
-             373:   24(fvec2) ExtInst 1(GLSL.std.450) 25(Atan2) 371 372
-                              Store 370(r010) 373
-             375:   24(fvec2) Load 29(inF0)
-             376:   24(fvec2) ExtInst 1(GLSL.std.450) 9(Ceil) 375
-                              Store 374(r011) 376
-             378:   24(fvec2) Load 29(inF0)
-             379:   24(fvec2) Load 30(inF1)
-             380:   24(fvec2) Load 31(inF2)
-             381:   24(fvec2) ExtInst 1(GLSL.std.450) 43(FClamp) 378 379 380
-                              Store 377(r012) 381
-             382:   24(fvec2) Load 29(inF0)
-             385:  384(bvec2) FOrdLessThan 382 383
-             386:   135(bool) Any 385
-                              SelectionMerge 388 None
-                              BranchConditional 386 387 388
-             387:               Label
+             363:   24(fvec2) ExtInst 1(GLSL.std.450) 18(Atan) 362
+                              Store 361(r009) 363
+             365:   24(fvec2) Load 29(inF0)
+             366:   24(fvec2) Load 30(inF1)
+             367:   24(fvec2) ExtInst 1(GLSL.std.450) 25(Atan2) 365 366
+                              Store 364(r010) 367
+             369:   24(fvec2) Load 29(inF0)
+             370:   24(fvec2) ExtInst 1(GLSL.std.450) 9(Ceil) 369
+                              Store 368(r011) 370
+             372:   24(fvec2) Load 29(inF0)
+             373:   24(fvec2) Load 30(inF1)
+             374:   24(fvec2) Load 31(inF2)
+             375:   24(fvec2) ExtInst 1(GLSL.std.450) 43(FClamp) 372 373 374
+                              Store 371(r012) 375
+             376:   24(fvec2) Load 29(inF0)
+             379:  378(bvec2) FOrdLessThan 376 377
+             380:   135(bool) Any 379
+                              SelectionMerge 382 None
+                              BranchConditional 380 381 382
+             381:               Label
                                 Kill
-             388:             Label
-             391:   24(fvec2) Load 29(inF0)
-             392:   24(fvec2) ExtInst 1(GLSL.std.450) 14(Cos) 391
-                              Store 390(r013) 392
-             394:   24(fvec2) Load 29(inF0)
-             395:   24(fvec2) ExtInst 1(GLSL.std.450) 20(Cosh) 394
-                              Store 393(r015) 395
-             399:  356(ivec2) BitCount 398
-                              Store 396(r016) 399
+             382:             Label
+             385:   24(fvec2) Load 29(inF0)
+             386:   24(fvec2) ExtInst 1(GLSL.std.450) 14(Cos) 385
+                              Store 384(r013) 386
+             388:   24(fvec2) Load 29(inF0)
+             389:   24(fvec2) ExtInst 1(GLSL.std.450) 20(Cosh) 388
+                              Store 387(r015) 389
+             393:  350(ivec2) BitCount 392
+                              Store 390(r016) 393
+             395:   24(fvec2) Load 29(inF0)
+             396:   24(fvec2) DPdx 395
+                              Store 394(r017) 396
+             398:   24(fvec2) Load 29(inF0)
+             399:   24(fvec2) DPdxCoarse 398
+                              Store 397(r018) 399
              401:   24(fvec2) Load 29(inF0)
-             402:   24(fvec2) DPdx 401
-                              Store 400(r017) 402
+             402:   24(fvec2) DPdxFine 401
+                              Store 400(r019) 402
              404:   24(fvec2) Load 29(inF0)
-             405:   24(fvec2) DPdxCoarse 404
-                              Store 403(r018) 405
+             405:   24(fvec2) DPdy 404
+                              Store 403(r020) 405
              407:   24(fvec2) Load 29(inF0)
-             408:   24(fvec2) DPdxFine 407
-                              Store 406(r019) 408
+             408:   24(fvec2) DPdyCoarse 407
+                              Store 406(r021) 408
              410:   24(fvec2) Load 29(inF0)
-             411:   24(fvec2) DPdy 410
-                              Store 409(r020) 411
+             411:   24(fvec2) DPdyFine 410
+                              Store 409(r022) 411
              413:   24(fvec2) Load 29(inF0)
-             414:   24(fvec2) DPdyCoarse 413
-                              Store 412(r021) 414
+             414:   24(fvec2) ExtInst 1(GLSL.std.450) 12(Degrees) 413
+                              Store 412(r023) 414
              416:   24(fvec2) Load 29(inF0)
-             417:   24(fvec2) DPdyFine 416
-                              Store 415(r022) 417
-             419:   24(fvec2) Load 29(inF0)
-             420:   24(fvec2) ExtInst 1(GLSL.std.450) 12(Degrees) 419
-                              Store 418(r023) 420
-             422:   24(fvec2) Load 29(inF0)
-             423:   24(fvec2) Load 30(inF1)
-             424:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 422 423
-                              Store 421(r026) 424
-             426:   24(fvec2) Load 29(inF0)
-             427:   24(fvec2) Load 30(inF1)
-             428:    6(float) Dot 426 427
-                              Store 425(r027) 428
+             417:   24(fvec2) Load 30(inF1)
+             418:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 416 417
+                              Store 415(r026) 418
+             420:   24(fvec2) Load 29(inF0)
+             421:   24(fvec2) Load 30(inF1)
+             422:    6(float) Dot 420 421
+                              Store 419(r027) 422
+             424:   24(fvec2) Load 29(inF0)
+             425:   24(fvec2) ExtInst 1(GLSL.std.450) 27(Exp) 424
+                              Store 423(r028) 425
+             427:   24(fvec2) Load 29(inF0)
+             428:   24(fvec2) ExtInst 1(GLSL.std.450) 29(Exp2) 427
+                              Store 426(r029) 428
              430:   24(fvec2) Load 29(inF0)
-             431:   24(fvec2) ExtInst 1(GLSL.std.450) 27(Exp) 430
-                              Store 429(r028) 431
-             433:   24(fvec2) Load 29(inF0)
-             434:   24(fvec2) ExtInst 1(GLSL.std.450) 29(Exp2) 433
-                              Store 432(r029) 434
-             436:   24(fvec2) Load 29(inF0)
-             437:   24(fvec2) Load 30(inF1)
-             438:   24(fvec2) Load 31(inF2)
-             439:   24(fvec2) ExtInst 1(GLSL.std.450) 70(FaceForward) 436 437 438
-                              Store 435(r030) 439
-             444:   26(ivec2) ExtInst 1(GLSL.std.450) 75(FindUMsb) 443
-                              Store 440(r031) 444
-             446:   26(ivec2) ExtInst 1(GLSL.std.450) 73(FindILsb) 443
-                              Store 445(r032) 446
-             448:   24(fvec2) Load 29(inF0)
-             449:   24(fvec2) ExtInst 1(GLSL.std.450) 8(Floor) 448
-                              Store 447(r033) 449
-             451:   24(fvec2) Load 29(inF0)
-             452:   24(fvec2) Load 30(inF1)
-             453:   24(fvec2) FMod 451 452
-                              Store 450(r035) 453
-             455:   24(fvec2) Load 29(inF0)
-             456:   24(fvec2) ExtInst 1(GLSL.std.450) 10(Fract) 455
-                              Store 454(r036) 456
-             458:   24(fvec2) Load 29(inF0)
-             460:459(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 458
-             461:  356(ivec2) CompositeExtract 460 1
-                              Store 30(inF1) 461
-             462:   24(fvec2) CompositeExtract 460 0
-                              Store 457(r037) 462
-             464:   24(fvec2) Load 29(inF0)
-             465:   24(fvec2) Fwidth 464
-                              Store 463(r038) 465
-             468:   24(fvec2) Load 29(inF0)
-             469:  384(bvec2) IsInf 468
-                              Store 467(r039) 469
+             431:   24(fvec2) Load 30(inF1)
+             432:   24(fvec2) Load 31(inF2)
+             433:   24(fvec2) ExtInst 1(GLSL.std.450) 70(FaceForward) 430 431 432
+                              Store 429(r030) 433
+             438:   26(ivec2) ExtInst 1(GLSL.std.450) 75(FindUMsb) 437
+                              Store 434(r031) 438
+             440:   26(ivec2) ExtInst 1(GLSL.std.450) 73(FindILsb) 437
+                              Store 439(r032) 440
+             442:   24(fvec2) Load 29(inF0)
+             443:   24(fvec2) ExtInst 1(GLSL.std.450) 8(Floor) 442
+                              Store 441(r033) 443
+             445:   24(fvec2) Load 29(inF0)
+             446:   24(fvec2) Load 30(inF1)
+             447:   24(fvec2) FMod 445 446
+                              Store 444(r035) 447
+             449:   24(fvec2) Load 29(inF0)
+             450:   24(fvec2) ExtInst 1(GLSL.std.450) 10(Fract) 449
+                              Store 448(r036) 450
+             452:   24(fvec2) Load 29(inF0)
+             453:   24(fvec2) Fwidth 452
+                              Store 451(r038) 453
+             456:   24(fvec2) Load 29(inF0)
+             457:  378(bvec2) IsInf 456
+                              Store 455(r039) 457
+             459:   24(fvec2) Load 29(inF0)
+             460:  378(bvec2) IsNan 459
+                              Store 458(r040) 460
+             462:   24(fvec2) Load 29(inF0)
+             463:   24(fvec2) Load 30(inF1)
+             464:   24(fvec2) ExtInst 1(GLSL.std.450) 53(Ldexp) 462 463
+                              Store 461(r041) 464
+             466:   24(fvec2) Load 29(inF0)
+             467:   24(fvec2) Load 30(inF1)
+             468:   24(fvec2) Load 31(inF2)
+             469:   24(fvec2) ExtInst 1(GLSL.std.450) 46(FMix) 466 467 468
+                              Store 465(r039a) 469
              471:   24(fvec2) Load 29(inF0)
-             472:  384(bvec2) IsNan 471
-                              Store 470(r040) 472
+             472:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 471
+                              Store 470(r042) 472
              474:   24(fvec2) Load 29(inF0)
-             475:   24(fvec2) Load 30(inF1)
-             476:   24(fvec2) ExtInst 1(GLSL.std.450) 53(Ldexp) 474 475
-                              Store 473(r041) 476
-             478:   24(fvec2) Load 29(inF0)
-             479:   24(fvec2) Load 30(inF1)
-             480:   24(fvec2) Load 31(inF2)
-             481:   24(fvec2) ExtInst 1(GLSL.std.450) 46(FMix) 478 479 480
-                              Store 477(r039a) 481
-             483:   24(fvec2) Load 29(inF0)
-             484:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 483
-                              Store 482(r042) 484
-             486:   24(fvec2) Load 29(inF0)
-             487:   24(fvec2) ExtInst 1(GLSL.std.450) 28(Log) 486
-                              Store 485(r043) 487
-             489:   24(fvec2) Load 29(inF0)
-             490:   24(fvec2) ExtInst 1(GLSL.std.450) 30(Log2) 489
-             491:   24(fvec2) VectorTimesScalar 490 266
-                              Store 488(r044) 491
-             493:   24(fvec2) Load 29(inF0)
-             494:   24(fvec2) ExtInst 1(GLSL.std.450) 30(Log2) 493
-                              Store 492(r045) 494
-             496:   24(fvec2) Load 29(inF0)
-             497:   24(fvec2) Load 30(inF1)
-             498:   24(fvec2) ExtInst 1(GLSL.std.450) 40(FMax) 496 497
-                              Store 495(r046) 498
-             500:   24(fvec2) Load 29(inF0)
-             501:   24(fvec2) Load 30(inF1)
-             502:   24(fvec2) ExtInst 1(GLSL.std.450) 37(FMin) 500 501
-                              Store 499(r047) 502
-             504:   24(fvec2) Load 29(inF0)
-             505:   24(fvec2) ExtInst 1(GLSL.std.450) 69(Normalize) 504
-                              Store 503(r048) 505
-             507:   24(fvec2) Load 29(inF0)
-             508:   24(fvec2) Load 30(inF1)
-             509:   24(fvec2) ExtInst 1(GLSL.std.450) 26(Pow) 507 508
-                              Store 506(r049) 509
-             511:   24(fvec2) Load 29(inF0)
-             512:   24(fvec2) ExtInst 1(GLSL.std.450) 11(Radians) 511
-                              Store 510(r050) 512
-             514:   24(fvec2) Load 29(inF0)
-             515:   24(fvec2) CompositeConstruct 287 287
-             516:   24(fvec2) FDiv 515 514
-                              Store 513(r051) 516
-             518:   24(fvec2) Load 29(inF0)
-             519:   24(fvec2) Load 30(inF1)
-             520:   24(fvec2) ExtInst 1(GLSL.std.450) 71(Reflect) 518 519
-                              Store 517(r052) 520
-             522:   24(fvec2) Load 29(inF0)
-             523:   24(fvec2) Load 30(inF1)
-             525:   24(fvec2) ExtInst 1(GLSL.std.450) 72(Refract) 522 523 524
-                              Store 521(r053) 525
-             530:   26(ivec2) BitReverse 529
-                              Store 526(r054) 530
-             532:   24(fvec2) Load 29(inF0)
-             533:   24(fvec2) ExtInst 1(GLSL.std.450) 2(RoundEven) 532
-                              Store 531(r055) 533
-             535:   24(fvec2) Load 29(inF0)
-             536:   24(fvec2) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 535
-                              Store 534(r056) 536
+             475:   24(fvec2) ExtInst 1(GLSL.std.450) 28(Log) 474
+                              Store 473(r043) 475
+             477:   24(fvec2) Load 29(inF0)
+             478:   24(fvec2) ExtInst 1(GLSL.std.450) 30(Log2) 477
+             479:   24(fvec2) VectorTimesScalar 478 260
+                              Store 476(r044) 479
+             481:   24(fvec2) Load 29(inF0)
+             482:   24(fvec2) ExtInst 1(GLSL.std.450) 30(Log2) 481
+                              Store 480(r045) 482
+             484:   24(fvec2) Load 29(inF0)
+             485:   24(fvec2) Load 30(inF1)
+             486:   24(fvec2) ExtInst 1(GLSL.std.450) 40(FMax) 484 485
+                              Store 483(r046) 486
+             488:   24(fvec2) Load 29(inF0)
+             489:   24(fvec2) Load 30(inF1)
+             490:   24(fvec2) ExtInst 1(GLSL.std.450) 37(FMin) 488 489
+                              Store 487(r047) 490
+             492:   24(fvec2) Load 29(inF0)
+             493:   24(fvec2) ExtInst 1(GLSL.std.450) 69(Normalize) 492
+                              Store 491(r048) 493
+             495:   24(fvec2) Load 29(inF0)
+             496:   24(fvec2) Load 30(inF1)
+             497:   24(fvec2) ExtInst 1(GLSL.std.450) 26(Pow) 495 496
+                              Store 494(r049) 497
+             499:   24(fvec2) Load 29(inF0)
+             500:   24(fvec2) ExtInst 1(GLSL.std.450) 11(Radians) 499
+                              Store 498(r050) 500
+             502:   24(fvec2) Load 29(inF0)
+             503:   24(fvec2) CompositeConstruct 281 281
+             504:   24(fvec2) FDiv 503 502
+                              Store 501(r051) 504
+             506:   24(fvec2) Load 29(inF0)
+             507:   24(fvec2) Load 30(inF1)
+             508:   24(fvec2) ExtInst 1(GLSL.std.450) 71(Reflect) 506 507
+                              Store 505(r052) 508
+             510:   24(fvec2) Load 29(inF0)
+             511:   24(fvec2) Load 30(inF1)
+             513:   24(fvec2) ExtInst 1(GLSL.std.450) 72(Refract) 510 511 512
+                              Store 509(r053) 513
+             518:   26(ivec2) BitReverse 517
+                              Store 514(r054) 518
+             520:   24(fvec2) Load 29(inF0)
+             521:   24(fvec2) ExtInst 1(GLSL.std.450) 2(RoundEven) 520
+                              Store 519(r055) 521
+             523:   24(fvec2) Load 29(inF0)
+             524:   24(fvec2) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 523
+                              Store 522(r056) 524
+             526:   24(fvec2) Load 29(inF0)
+             527:   24(fvec2) CompositeConstruct 179 179
+             528:   24(fvec2) CompositeConstruct 281 281
+             529:   24(fvec2) ExtInst 1(GLSL.std.450) 43(FClamp) 526 527 528
+                              Store 525(r057) 529
+             531:   24(fvec2) Load 29(inF0)
+             532:   24(fvec2) ExtInst 1(GLSL.std.450) 6(FSign) 531
+                              Store 530(r058) 532
+             534:   24(fvec2) Load 29(inF0)
+             535:   24(fvec2) ExtInst 1(GLSL.std.450) 13(Sin) 534
+                              Store 533(r059) 535
+             536:   24(fvec2) Load 29(inF0)
+             537:   24(fvec2) ExtInst 1(GLSL.std.450) 13(Sin) 536
+                              Store 30(inF1) 537
              538:   24(fvec2) Load 29(inF0)
-             539:   24(fvec2) CompositeConstruct 179 179
-             540:   24(fvec2) CompositeConstruct 287 287
-             541:   24(fvec2) ExtInst 1(GLSL.std.450) 43(FClamp) 538 539 540
-                              Store 537(r057) 541
-             543:   24(fvec2) Load 29(inF0)
-             544:   24(fvec2) ExtInst 1(GLSL.std.450) 6(FSign) 543
-                              Store 542(r058) 544
-             546:   24(fvec2) Load 29(inF0)
-             547:   24(fvec2) ExtInst 1(GLSL.std.450) 13(Sin) 546
-                              Store 545(r059) 547
-             548:   24(fvec2) Load 29(inF0)
-             549:   24(fvec2) ExtInst 1(GLSL.std.450) 13(Sin) 548
-                              Store 30(inF1) 549
-             550:   24(fvec2) Load 29(inF0)
-             551:   24(fvec2) ExtInst 1(GLSL.std.450) 14(Cos) 550
-                              Store 31(inF2) 551
-             553:   24(fvec2) Load 29(inF0)
-             554:   24(fvec2) ExtInst 1(GLSL.std.450) 19(Sinh) 553
-                              Store 552(r060) 554
+             539:   24(fvec2) ExtInst 1(GLSL.std.450) 14(Cos) 538
+                              Store 31(inF2) 539
+             541:   24(fvec2) Load 29(inF0)
+             542:   24(fvec2) ExtInst 1(GLSL.std.450) 19(Sinh) 541
+                              Store 540(r060) 542
+             544:   24(fvec2) Load 29(inF0)
+             545:   24(fvec2) Load 30(inF1)
+             546:   24(fvec2) Load 31(inF2)
+             547:   24(fvec2) ExtInst 1(GLSL.std.450) 49(SmoothStep) 544 545 546
+                              Store 543(r061) 547
+             549:   24(fvec2) Load 29(inF0)
+             550:   24(fvec2) ExtInst 1(GLSL.std.450) 31(Sqrt) 549
+                              Store 548(r062) 550
+             552:   24(fvec2) Load 29(inF0)
+             553:   24(fvec2) Load 30(inF1)
+             554:   24(fvec2) ExtInst 1(GLSL.std.450) 48(Step) 552 553
+                              Store 551(r063) 554
              556:   24(fvec2) Load 29(inF0)
-             557:   24(fvec2) Load 30(inF1)
-             558:   24(fvec2) Load 31(inF2)
-             559:   24(fvec2) ExtInst 1(GLSL.std.450) 49(SmoothStep) 556 557 558
-                              Store 555(r061) 559
-             561:   24(fvec2) Load 29(inF0)
-             562:   24(fvec2) ExtInst 1(GLSL.std.450) 31(Sqrt) 561
-                              Store 560(r062) 562
-             564:   24(fvec2) Load 29(inF0)
-             565:   24(fvec2) Load 30(inF1)
-             566:   24(fvec2) ExtInst 1(GLSL.std.450) 48(Step) 564 565
-                              Store 563(r063) 566
-             568:   24(fvec2) Load 29(inF0)
-             569:   24(fvec2) ExtInst 1(GLSL.std.450) 15(Tan) 568
-                              Store 567(r064) 569
-             571:   24(fvec2) Load 29(inF0)
-             572:   24(fvec2) ExtInst 1(GLSL.std.450) 21(Tanh) 571
-                              Store 570(r065) 572
-             574:   24(fvec2) Load 29(inF0)
-             575:   24(fvec2) ExtInst 1(GLSL.std.450) 3(Trunc) 574
-                              Store 573(r066) 575
-                              ReturnValue 576
+             557:   24(fvec2) ExtInst 1(GLSL.std.450) 15(Tan) 556
+                              Store 555(r064) 557
+             559:   24(fvec2) Load 29(inF0)
+             560:   24(fvec2) ExtInst 1(GLSL.std.450) 21(Tanh) 559
+                              Store 558(r065) 560
+             562:   24(fvec2) Load 29(inF0)
+             563:   24(fvec2) ExtInst 1(GLSL.std.450) 3(Trunc) 562
+                              Store 561(r066) 563
+                              ReturnValue 564
                               FunctionEnd
 46(PixelShaderFunction3(vf3;vf3;vf3;vu3;vu3;):   36(fvec3) Function None 40
         41(inF0):     37(ptr) FunctionParameter
@@ -6858,306 +6740,299 @@ gl_FragCoord origin is upper left
         44(inU0):     39(ptr) FunctionParameter
         45(inU1):     39(ptr) FunctionParameter
               47:             Label
-       579(r000):    136(ptr) Variable Function
-       582(r001):     37(ptr) Variable Function
-       585(r002):     37(ptr) Variable Function
-       588(r003):    136(ptr) Variable Function
-       591(r004):     37(ptr) Variable Function
-       596(r005):    595(ptr) Variable Function
-       599(r006):     39(ptr) Variable Function
-       602(r007):     37(ptr) Variable Function
-       605(r009):     37(ptr) Variable Function
-       608(r010):     37(ptr) Variable Function
-       612(r011):     37(ptr) Variable Function
-       615(r012):     37(ptr) Variable Function
-       628(r013):     37(ptr) Variable Function
-       631(r014):     37(ptr) Variable Function
-       634(r015):     39(ptr) Variable Function
-       639(r016):     37(ptr) Variable Function
-       643(r017):     37(ptr) Variable Function
-       646(r018):     37(ptr) Variable Function
-       649(r019):     37(ptr) Variable Function
-       652(r020):     37(ptr) Variable Function
-       655(r021):     37(ptr) Variable Function
-       658(r022):     37(ptr) Variable Function
-       661(r023):     37(ptr) Variable Function
-       664(r024):      7(ptr) Variable Function
-       668(r025):      7(ptr) Variable Function
-       672(r029):     37(ptr) Variable Function
-       675(r030):     37(ptr) Variable Function
-       678(r031):     37(ptr) Variable Function
-       683(r032):     39(ptr) Variable Function
-       687(r033):     39(ptr) Variable Function
-       689(r034):     37(ptr) Variable Function
-       692(r036):     37(ptr) Variable Function
-       696(r037):     37(ptr) Variable Function
-       699(r038):     37(ptr) Variable Function
-       705(r039):     37(ptr) Variable Function
-       709(r040):    708(ptr) Variable Function
-       712(r041):    708(ptr) Variable Function
-       715(r042):     37(ptr) Variable Function
-      719(r039a):     37(ptr) Variable Function
-      724(r039b):     37(ptr) Variable Function
-       730(r043):      7(ptr) Variable Function
-       733(r044):     37(ptr) Variable Function
-       736(r045):     37(ptr) Variable Function
-       740(r046):     37(ptr) Variable Function
-       743(r047):     37(ptr) Variable Function
-       747(r048):     37(ptr) Variable Function
-       751(r049):     37(ptr) Variable Function
-       754(r050):     37(ptr) Variable Function
-       758(r051):     37(ptr) Variable Function
-       761(r052):     37(ptr) Variable Function
-       765(r053):     37(ptr) Variable Function
-       769(r054):     37(ptr) Variable Function
-       773(r055):     39(ptr) Variable Function
-       776(r056):     37(ptr) Variable Function
-       779(r057):     37(ptr) Variable Function
-       782(r058):     37(ptr) Variable Function
-       787(r059):     37(ptr) Variable Function
-       790(r060):     37(ptr) Variable Function
-       797(r061):     37(ptr) Variable Function
-       800(r062):     37(ptr) Variable Function
-       805(r063):     37(ptr) Variable Function
-       808(r064):     37(ptr) Variable Function
-       812(r065):     37(ptr) Variable Function
-       815(r066):     37(ptr) Variable Function
-       818(r067):     37(ptr) Variable Function
+       567(r000):    136(ptr) Variable Function
+       570(r001):     37(ptr) Variable Function
+       573(r002):     37(ptr) Variable Function
+       576(r003):    136(ptr) Variable Function
+       579(r004):     37(ptr) Variable Function
+       584(r005):    583(ptr) Variable Function
+       587(r006):     39(ptr) Variable Function
+       590(r007):     37(ptr) Variable Function
+       593(r009):     37(ptr) Variable Function
+       596(r010):     37(ptr) Variable Function
+       600(r011):     37(ptr) Variable Function
+       603(r012):     37(ptr) Variable Function
+       616(r013):     37(ptr) Variable Function
+       619(r014):     37(ptr) Variable Function
+       622(r015):     39(ptr) Variable Function
+       627(r016):     37(ptr) Variable Function
+       631(r017):     37(ptr) Variable Function
+       634(r018):     37(ptr) Variable Function
+       637(r019):     37(ptr) Variable Function
+       640(r020):     37(ptr) Variable Function
+       643(r021):     37(ptr) Variable Function
+       646(r022):     37(ptr) Variable Function
+       649(r023):     37(ptr) Variable Function
+       652(r024):      7(ptr) Variable Function
+       656(r025):      7(ptr) Variable Function
+       660(r029):     37(ptr) Variable Function
+       663(r030):     37(ptr) Variable Function
+       666(r031):     37(ptr) Variable Function
+       671(r032):     39(ptr) Variable Function
+       675(r033):     39(ptr) Variable Function
+       677(r034):     37(ptr) Variable Function
+       680(r036):     37(ptr) Variable Function
+       684(r037):     37(ptr) Variable Function
+       687(r039):     37(ptr) Variable Function
+       691(r040):    690(ptr) Variable Function
+       694(r041):    690(ptr) Variable Function
+       697(r042):     37(ptr) Variable Function
+      701(r039a):     37(ptr) Variable Function
+      706(r039b):     37(ptr) Variable Function
+       712(r043):      7(ptr) Variable Function
+       715(r044):     37(ptr) Variable Function
+       718(r045):     37(ptr) Variable Function
+       722(r046):     37(ptr) Variable Function
+       725(r047):     37(ptr) Variable Function
+       729(r048):     37(ptr) Variable Function
+       733(r049):     37(ptr) Variable Function
+       736(r050):     37(ptr) Variable Function
+       740(r051):     37(ptr) Variable Function
+       743(r052):     37(ptr) Variable Function
+       747(r053):     37(ptr) Variable Function
+       751(r054):     37(ptr) Variable Function
+       755(r055):     39(ptr) Variable Function
+       758(r056):     37(ptr) Variable Function
+       761(r057):     37(ptr) Variable Function
+       764(r058):     37(ptr) Variable Function
+       769(r059):     37(ptr) Variable Function
+       772(r060):     37(ptr) Variable Function
+       779(r061):     37(ptr) Variable Function
+       782(r062):     37(ptr) Variable Function
+       787(r063):     37(ptr) Variable Function
+       790(r064):     37(ptr) Variable Function
+       794(r065):     37(ptr) Variable Function
+       797(r066):     37(ptr) Variable Function
+       800(r067):     37(ptr) Variable Function
+             568:   36(fvec3) Load 41(inF0)
+             569:   135(bool) All 568
+                              Store 567(r000) 569
+             571:   36(fvec3) Load 41(inF0)
+             572:   36(fvec3) ExtInst 1(GLSL.std.450) 4(FAbs) 571
+                              Store 570(r001) 572
+             574:   36(fvec3) Load 41(inF0)
+             575:   36(fvec3) ExtInst 1(GLSL.std.450) 17(Acos) 574
+                              Store 573(r002) 575
+             577:   36(fvec3) Load 41(inF0)
+             578:   135(bool) Any 577
+                              Store 576(r003) 578
              580:   36(fvec3) Load 41(inF0)
-             581:   135(bool) All 580
-                              Store 579(r000) 581
-             583:   36(fvec3) Load 41(inF0)
-             584:   36(fvec3) ExtInst 1(GLSL.std.450) 4(FAbs) 583
-                              Store 582(r001) 584
-             586:   36(fvec3) Load 41(inF0)
-             587:   36(fvec3) ExtInst 1(GLSL.std.450) 17(Acos) 586
-                              Store 585(r002) 587
-             589:   36(fvec3) Load 41(inF0)
-             590:   135(bool) Any 589
-                              Store 588(r003) 590
-             592:   36(fvec3) Load 41(inF0)
-             593:   36(fvec3) ExtInst 1(GLSL.std.450) 16(Asin) 592
-                              Store 591(r004) 593
+             581:   36(fvec3) ExtInst 1(GLSL.std.450) 16(Asin) 580
+                              Store 579(r004) 581
+             585:   36(fvec3) Load 41(inF0)
+             586:  582(ivec3) Bitcast 585
+                              Store 584(r005) 586
+             588:   36(fvec3) Load 41(inF0)
+             589:   38(ivec3) Bitcast 588
+                              Store 587(r006) 589
+             591:   38(ivec3) Load 44(inU0)
+             592:   36(fvec3) Bitcast 591
+                              Store 590(r007) 592
+             594:   36(fvec3) Load 41(inF0)
+             595:   36(fvec3) ExtInst 1(GLSL.std.450) 18(Atan) 594
+                              Store 593(r009) 595
              597:   36(fvec3) Load 41(inF0)
-             598:  594(ivec3) Bitcast 597
-                              Store 596(r005) 598
-             600:   36(fvec3) Load 41(inF0)
-             601:   38(ivec3) Bitcast 600
-                              Store 599(r006) 601
-             603:   38(ivec3) Load 44(inU0)
-             604:   36(fvec3) Bitcast 603
-                              Store 602(r007) 604
-             606:   36(fvec3) Load 41(inF0)
-             607:   36(fvec3) ExtInst 1(GLSL.std.450) 18(Atan) 606
-                              Store 605(r009) 607
-             609:   36(fvec3) Load 41(inF0)
-             610:   36(fvec3) Load 42(inF1)
-             611:   36(fvec3) ExtInst 1(GLSL.std.450) 25(Atan2) 609 610
-                              Store 608(r010) 611
-             613:   36(fvec3) Load 41(inF0)
-             614:   36(fvec3) ExtInst 1(GLSL.std.450) 9(Ceil) 613
-                              Store 612(r011) 614
-             616:   36(fvec3) Load 41(inF0)
-             617:   36(fvec3) Load 42(inF1)
-             618:   36(fvec3) Load 43(inF2)
-             619:   36(fvec3) ExtInst 1(GLSL.std.450) 43(FClamp) 616 617 618
-                              Store 615(r012) 619
-             620:   36(fvec3) Load 41(inF0)
-             623:  622(bvec3) FOrdLessThan 620 621
-             624:   135(bool) Any 623
-                              SelectionMerge 626 None
-                              BranchConditional 624 625 626
-             625:               Label
+             598:   36(fvec3) Load 42(inF1)
+             599:   36(fvec3) ExtInst 1(GLSL.std.450) 25(Atan2) 597 598
+                              Store 596(r010) 599
+             601:   36(fvec3) Load 41(inF0)
+             602:   36(fvec3) ExtInst 1(GLSL.std.450) 9(Ceil) 601
+                              Store 600(r011) 602
+             604:   36(fvec3) Load 41(inF0)
+             605:   36(fvec3) Load 42(inF1)
+             606:   36(fvec3) Load 43(inF2)
+             607:   36(fvec3) ExtInst 1(GLSL.std.450) 43(FClamp) 604 605 606
+                              Store 603(r012) 607
+             608:   36(fvec3) Load 41(inF0)
+             611:  610(bvec3) FOrdLessThan 608 609
+             612:   135(bool) Any 611
+                              SelectionMerge 614 None
+                              BranchConditional 612 613 614
+             613:               Label
                                 Kill
-             626:             Label
-             629:   36(fvec3) Load 41(inF0)
-             630:   36(fvec3) ExtInst 1(GLSL.std.450) 14(Cos) 629
-                              Store 628(r013) 630
+             614:             Label
+             617:   36(fvec3) Load 41(inF0)
+             618:   36(fvec3) ExtInst 1(GLSL.std.450) 14(Cos) 617
+                              Store 616(r013) 618
+             620:   36(fvec3) Load 41(inF0)
+             621:   36(fvec3) ExtInst 1(GLSL.std.450) 20(Cosh) 620
+                              Store 619(r014) 621
+             626:   38(ivec3) BitCount 625
+                              Store 622(r015) 626
+             628:   36(fvec3) Load 41(inF0)
+             629:   36(fvec3) Load 42(inF1)
+             630:   36(fvec3) ExtInst 1(GLSL.std.450) 68(Cross) 628 629
+                              Store 627(r016) 630
              632:   36(fvec3) Load 41(inF0)
-             633:   36(fvec3) ExtInst 1(GLSL.std.450) 20(Cosh) 632
-                              Store 631(r014) 633
-             638:   38(ivec3) BitCount 637
-                              Store 634(r015) 638
-             640:   36(fvec3) Load 41(inF0)
-             641:   36(fvec3) Load 42(inF1)
-             642:   36(fvec3) ExtInst 1(GLSL.std.450) 68(Cross) 640 641
-                              Store 639(r016) 642
+             633:   36(fvec3) DPdx 632
+                              Store 631(r017) 633
+             635:   36(fvec3) Load 41(inF0)
+             636:   36(fvec3) DPdxCoarse 635
+                              Store 634(r018) 636
+             638:   36(fvec3) Load 41(inF0)
+             639:   36(fvec3) DPdxFine 638
+                              Store 637(r019) 639
+             641:   36(fvec3) Load 41(inF0)
+             642:   36(fvec3) DPdy 641
+                              Store 640(r020) 642
              644:   36(fvec3) Load 41(inF0)
-             645:   36(fvec3) DPdx 644
-                              Store 643(r017) 645
+             645:   36(fvec3) DPdyCoarse 644
+                              Store 643(r021) 645
              647:   36(fvec3) Load 41(inF0)
-             648:   36(fvec3) DPdxCoarse 647
-                              Store 646(r018) 648
+             648:   36(fvec3) DPdyFine 647
+                              Store 646(r022) 648
              650:   36(fvec3) Load 41(inF0)
-             651:   36(fvec3) DPdxFine 650
-                              Store 649(r019) 651
+             651:   36(fvec3) ExtInst 1(GLSL.std.450) 12(Degrees) 650
+                              Store 649(r023) 651
              653:   36(fvec3) Load 41(inF0)
-             654:   36(fvec3) DPdy 653
-                              Store 652(r020) 654
-             656:   36(fvec3) Load 41(inF0)
-             657:   36(fvec3) DPdyCoarse 656
-                              Store 655(r021) 657
-             659:   36(fvec3) Load 41(inF0)
-             660:   36(fvec3) DPdyFine 659
-                              Store 658(r022) 660
-             662:   36(fvec3) Load 41(inF0)
-             663:   36(fvec3) ExtInst 1(GLSL.std.450) 12(Degrees) 662
-                              Store 661(r023) 663
-             665:   36(fvec3) Load 41(inF0)
-             666:   36(fvec3) Load 42(inF1)
-             667:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 665 666
-                              Store 664(r024) 667
-             669:   36(fvec3) Load 41(inF0)
-             670:   36(fvec3) Load 42(inF1)
-             671:    6(float) Dot 669 670
-                              Store 668(r025) 671
-             673:   36(fvec3) Load 41(inF0)
-             674:   36(fvec3) ExtInst 1(GLSL.std.450) 27(Exp) 673
-                              Store 672(r029) 674
-             676:   36(fvec3) Load 41(inF0)
-             677:   36(fvec3) ExtInst 1(GLSL.std.450) 29(Exp2) 676
-                              Store 675(r030) 677
-             679:   36(fvec3) Load 41(inF0)
-             680:   36(fvec3) Load 42(inF1)
-             681:   36(fvec3) Load 43(inF2)
-             682:   36(fvec3) ExtInst 1(GLSL.std.450) 70(FaceForward) 679 680 681
-                              Store 678(r031) 682
-             686:   38(ivec3) ExtInst 1(GLSL.std.450) 75(FindUMsb) 685
-                              Store 683(r032) 686
-             688:   38(ivec3) ExtInst 1(GLSL.std.450) 73(FindILsb) 685
-                              Store 687(r033) 688
-             690:   36(fvec3) Load 41(inF0)
-             691:   36(fvec3) ExtInst 1(GLSL.std.450) 8(Floor) 690
-                              Store 689(r034) 691
-             693:   36(fvec3) Load 41(inF0)
-             694:   36(fvec3) Load 42(inF1)
-             695:   36(fvec3) FMod 693 694
-                              Store 692(r036) 695
-             697:   36(fvec3) Load 41(inF0)
-             698:   36(fvec3) ExtInst 1(GLSL.std.450) 10(Fract) 697
-                              Store 696(r037) 698
-             700:   36(fvec3) Load 41(inF0)
-             702:701(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 700
-             703:  594(ivec3) CompositeExtract 702 1
-                              Store 42(inF1) 703
-             704:   36(fvec3) CompositeExtract 702 0
-                              Store 699(r038) 704
-             706:   36(fvec3) Load 41(inF0)
-             707:   36(fvec3) Fwidth 706
-                              Store 705(r039) 707
-             710:   36(fvec3) Load 41(inF0)
-             711:  622(bvec3) IsInf 710
-                              Store 709(r040) 711
+             654:   36(fvec3) Load 42(inF1)
+             655:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 653 654
+                              Store 652(r024) 655
+             657:   36(fvec3) Load 41(inF0)
+             658:   36(fvec3) Load 42(inF1)
+             659:    6(float) Dot 657 658
+                              Store 656(r025) 659
+             661:   36(fvec3) Load 41(inF0)
+             662:   36(fvec3) ExtInst 1(GLSL.std.450) 27(Exp) 661
+                              Store 660(r029) 662
+             664:   36(fvec3) Load 41(inF0)
+             665:   36(fvec3) ExtInst 1(GLSL.std.450) 29(Exp2) 664
+                              Store 663(r030) 665
+             667:   36(fvec3) Load 41(inF0)
+             668:   36(fvec3) Load 42(inF1)
+             669:   36(fvec3) Load 43(inF2)
+             670:   36(fvec3) ExtInst 1(GLSL.std.450) 70(FaceForward) 667 668 669
+                              Store 666(r031) 670
+             674:   38(ivec3) ExtInst 1(GLSL.std.450) 75(FindUMsb) 673
+                              Store 671(r032) 674
+             676:   38(ivec3) ExtInst 1(GLSL.std.450) 73(FindILsb) 673
+                              Store 675(r033) 676
+             678:   36(fvec3) Load 41(inF0)
+             679:   36(fvec3) ExtInst 1(GLSL.std.450) 8(Floor) 678
+                              Store 677(r034) 679
+             681:   36(fvec3) Load 41(inF0)
+             682:   36(fvec3) Load 42(inF1)
+             683:   36(fvec3) FMod 681 682
+                              Store 680(r036) 683
+             685:   36(fvec3) Load 41(inF0)
+             686:   36(fvec3) ExtInst 1(GLSL.std.450) 10(Fract) 685
+                              Store 684(r037) 686
+             688:   36(fvec3) Load 41(inF0)
+             689:   36(fvec3) Fwidth 688
+                              Store 687(r039) 689
+             692:   36(fvec3) Load 41(inF0)
+             693:  610(bvec3) IsInf 692
+                              Store 691(r040) 693
+             695:   36(fvec3) Load 41(inF0)
+             696:  610(bvec3) IsNan 695
+                              Store 694(r041) 696
+             698:   36(fvec3) Load 41(inF0)
+             699:   36(fvec3) Load 42(inF1)
+             700:   36(fvec3) ExtInst 1(GLSL.std.450) 53(Ldexp) 698 699
+                              Store 697(r042) 700
+             702:   36(fvec3) Load 41(inF0)
+             703:   36(fvec3) Load 42(inF1)
+             704:   36(fvec3) Load 43(inF2)
+             705:   36(fvec3) ExtInst 1(GLSL.std.450) 46(FMix) 702 703 704
+                              Store 701(r039a) 705
+             707:   36(fvec3) Load 41(inF0)
+             708:   36(fvec3) Load 42(inF1)
+             710:   36(fvec3) CompositeConstruct 709 709 709
+             711:   36(fvec3) ExtInst 1(GLSL.std.450) 46(FMix) 707 708 710
+                              Store 706(r039b) 711
              713:   36(fvec3) Load 41(inF0)
-             714:  622(bvec3) IsNan 713
-                              Store 712(r041) 714
+             714:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 713
+                              Store 712(r043) 714
              716:   36(fvec3) Load 41(inF0)
-             717:   36(fvec3) Load 42(inF1)
-             718:   36(fvec3) ExtInst 1(GLSL.std.450) 53(Ldexp) 716 717
-                              Store 715(r042) 718
-             720:   36(fvec3) Load 41(inF0)
-             721:   36(fvec3) Load 42(inF1)
-             722:   36(fvec3) Load 43(inF2)
-             723:   36(fvec3) ExtInst 1(GLSL.std.450) 46(FMix) 720 721 722
-                              Store 719(r039a) 723
-             725:   36(fvec3) Load 41(inF0)
-             726:   36(fvec3) Load 42(inF1)
-             728:   36(fvec3) CompositeConstruct 727 727 727
-             729:   36(fvec3) ExtInst 1(GLSL.std.450) 46(FMix) 725 726 728
-                              Store 724(r039b) 729
-             731:   36(fvec3) Load 41(inF0)
-             732:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 731
-                              Store 730(r043) 732
+             717:   36(fvec3) ExtInst 1(GLSL.std.450) 28(Log) 716
+                              Store 715(r044) 717
+             719:   36(fvec3) Load 41(inF0)
+             720:   36(fvec3) ExtInst 1(GLSL.std.450) 30(Log2) 719
+             721:   36(fvec3) VectorTimesScalar 720 260
+                              Store 718(r045) 721
+             723:   36(fvec3) Load 41(inF0)
+             724:   36(fvec3) ExtInst 1(GLSL.std.450) 30(Log2) 723
+                              Store 722(r046) 724
+             726:   36(fvec3) Load 41(inF0)
+             727:   36(fvec3) Load 42(inF1)
+             728:   36(fvec3) ExtInst 1(GLSL.std.450) 40(FMax) 726 727
+                              Store 725(r047) 728
+             730:   36(fvec3) Load 41(inF0)
+             731:   36(fvec3) Load 42(inF1)
+             732:   36(fvec3) ExtInst 1(GLSL.std.450) 37(FMin) 730 731
+                              Store 729(r048) 732
              734:   36(fvec3) Load 41(inF0)
-             735:   36(fvec3) ExtInst 1(GLSL.std.450) 28(Log) 734
-                              Store 733(r044) 735
+             735:   36(fvec3) ExtInst 1(GLSL.std.450) 69(Normalize) 734
+                              Store 733(r049) 735
              737:   36(fvec3) Load 41(inF0)
-             738:   36(fvec3) ExtInst 1(GLSL.std.450) 30(Log2) 737
-             739:   36(fvec3) VectorTimesScalar 738 266
-                              Store 736(r045) 739
+             738:   36(fvec3) Load 42(inF1)
+             739:   36(fvec3) ExtInst 1(GLSL.std.450) 26(Pow) 737 738
+                              Store 736(r050) 739
              741:   36(fvec3) Load 41(inF0)
-             742:   36(fvec3) ExtInst 1(GLSL.std.450) 30(Log2) 741
-                              Store 740(r046) 742
+             742:   36(fvec3) ExtInst 1(GLSL.std.450) 11(Radians) 741
+                              Store 740(r051) 742
              744:   36(fvec3) Load 41(inF0)
-             745:   36(fvec3) Load 42(inF1)
-             746:   36(fvec3) ExtInst 1(GLSL.std.450) 40(FMax) 744 745
-                              Store 743(r047) 746
+             745:   36(fvec3) CompositeConstruct 281 281 281
+             746:   36(fvec3) FDiv 745 744
+                              Store 743(r052) 746
              748:   36(fvec3) Load 41(inF0)
              749:   36(fvec3) Load 42(inF1)
-             750:   36(fvec3) ExtInst 1(GLSL.std.450) 37(FMin) 748 749
-                              Store 747(r048) 750
+             750:   36(fvec3) ExtInst 1(GLSL.std.450) 71(Reflect) 748 749
+                              Store 747(r053) 750
              752:   36(fvec3) Load 41(inF0)
-             753:   36(fvec3) ExtInst 1(GLSL.std.450) 69(Normalize) 752
-                              Store 751(r049) 753
-             755:   36(fvec3) Load 41(inF0)
-             756:   36(fvec3) Load 42(inF1)
-             757:   36(fvec3) ExtInst 1(GLSL.std.450) 26(Pow) 755 756
-                              Store 754(r050) 757
+             753:   36(fvec3) Load 42(inF1)
+             754:   36(fvec3) ExtInst 1(GLSL.std.450) 72(Refract) 752 753 512
+                              Store 751(r054) 754
+             757:   38(ivec3) BitReverse 756
+                              Store 755(r055) 757
              759:   36(fvec3) Load 41(inF0)
-             760:   36(fvec3) ExtInst 1(GLSL.std.450) 11(Radians) 759
-                              Store 758(r051) 760
+             760:   36(fvec3) ExtInst 1(GLSL.std.450) 2(RoundEven) 759
+                              Store 758(r056) 760
              762:   36(fvec3) Load 41(inF0)
-             763:   36(fvec3) CompositeConstruct 287 287 287
-             764:   36(fvec3) FDiv 763 762
-                              Store 761(r052) 764
-             766:   36(fvec3) Load 41(inF0)
-             767:   36(fvec3) Load 42(inF1)
-             768:   36(fvec3) ExtInst 1(GLSL.std.450) 71(Reflect) 766 767
-                              Store 765(r053) 768
+             763:   36(fvec3) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 762
+                              Store 761(r057) 763
+             765:   36(fvec3) Load 41(inF0)
+             766:   36(fvec3) CompositeConstruct 179 179 179
+             767:   36(fvec3) CompositeConstruct 281 281 281
+             768:   36(fvec3) ExtInst 1(GLSL.std.450) 43(FClamp) 765 766 767
+                              Store 764(r058) 768
              770:   36(fvec3) Load 41(inF0)
-             771:   36(fvec3) Load 42(inF1)
-             772:   36(fvec3) ExtInst 1(GLSL.std.450) 72(Refract) 770 771 524
-                              Store 769(r054) 772
-             775:   38(ivec3) BitReverse 774
-                              Store 773(r055) 775
+             771:   36(fvec3) ExtInst 1(GLSL.std.450) 6(FSign) 770
+                              Store 769(r059) 771
+             773:   36(fvec3) Load 41(inF0)
+             774:   36(fvec3) ExtInst 1(GLSL.std.450) 13(Sin) 773
+                              Store 772(r060) 774
+             775:   36(fvec3) Load 41(inF0)
+             776:   36(fvec3) ExtInst 1(GLSL.std.450) 13(Sin) 775
+                              Store 42(inF1) 776
              777:   36(fvec3) Load 41(inF0)
-             778:   36(fvec3) ExtInst 1(GLSL.std.450) 2(RoundEven) 777
-                              Store 776(r056) 778
+             778:   36(fvec3) ExtInst 1(GLSL.std.450) 14(Cos) 777
+                              Store 43(inF2) 778
              780:   36(fvec3) Load 41(inF0)
-             781:   36(fvec3) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 780
-                              Store 779(r057) 781
+             781:   36(fvec3) ExtInst 1(GLSL.std.450) 19(Sinh) 780
+                              Store 779(r061) 781
              783:   36(fvec3) Load 41(inF0)
-             784:   36(fvec3) CompositeConstruct 179 179 179
-             785:   36(fvec3) CompositeConstruct 287 287 287
-             786:   36(fvec3) ExtInst 1(GLSL.std.450) 43(FClamp) 783 784 785
-                              Store 782(r058) 786
+             784:   36(fvec3) Load 42(inF1)
+             785:   36(fvec3) Load 43(inF2)
+             786:   36(fvec3) ExtInst 1(GLSL.std.450) 49(SmoothStep) 783 784 785
+                              Store 782(r062) 786
              788:   36(fvec3) Load 41(inF0)
-             789:   36(fvec3) ExtInst 1(GLSL.std.450) 6(FSign) 788
-                              Store 787(r059) 789
+             789:   36(fvec3) ExtInst 1(GLSL.std.450) 31(Sqrt) 788
+                              Store 787(r063) 789
              791:   36(fvec3) Load 41(inF0)
-             792:   36(fvec3) ExtInst 1(GLSL.std.450) 13(Sin) 791
-                              Store 790(r060) 792
-             793:   36(fvec3) Load 41(inF0)
-             794:   36(fvec3) ExtInst 1(GLSL.std.450) 13(Sin) 793
-                              Store 42(inF1) 794
+             792:   36(fvec3) Load 42(inF1)
+             793:   36(fvec3) ExtInst 1(GLSL.std.450) 48(Step) 791 792
+                              Store 790(r064) 793
              795:   36(fvec3) Load 41(inF0)
-             796:   36(fvec3) ExtInst 1(GLSL.std.450) 14(Cos) 795
-                              Store 43(inF2) 796
+             796:   36(fvec3) ExtInst 1(GLSL.std.450) 15(Tan) 795
+                              Store 794(r065) 796
              798:   36(fvec3) Load 41(inF0)
-             799:   36(fvec3) ExtInst 1(GLSL.std.450) 19(Sinh) 798
-                              Store 797(r061) 799
+             799:   36(fvec3) ExtInst 1(GLSL.std.450) 21(Tanh) 798
+                              Store 797(r066) 799
              801:   36(fvec3) Load 41(inF0)
-             802:   36(fvec3) Load 42(inF1)
-             803:   36(fvec3) Load 43(inF2)
-             804:   36(fvec3) ExtInst 1(GLSL.std.450) 49(SmoothStep) 801 802 803
-                              Store 800(r062) 804
-             806:   36(fvec3) Load 41(inF0)
-             807:   36(fvec3) ExtInst 1(GLSL.std.450) 31(Sqrt) 806
-                              Store 805(r063) 807
-             809:   36(fvec3) Load 41(inF0)
-             810:   36(fvec3) Load 42(inF1)
-             811:   36(fvec3) ExtInst 1(GLSL.std.450) 48(Step) 809 810
-                              Store 808(r064) 811
-             813:   36(fvec3) Load 41(inF0)
-             814:   36(fvec3) ExtInst 1(GLSL.std.450) 15(Tan) 813
-                              Store 812(r065) 814
-             816:   36(fvec3) Load 41(inF0)
-             817:   36(fvec3) ExtInst 1(GLSL.std.450) 21(Tanh) 816
-                              Store 815(r066) 817
-             819:   36(fvec3) Load 41(inF0)
-             820:   36(fvec3) ExtInst 1(GLSL.std.450) 3(Trunc) 819
-                              Store 818(r067) 820
-                              ReturnValue 822
+             802:   36(fvec3) ExtInst 1(GLSL.std.450) 3(Trunc) 801
+                              Store 800(r067) 802
+                              ReturnValue 804
                               FunctionEnd
 58(PixelShaderFunction(vf4;vf4;vf4;vu4;vu4;):   48(fvec4) Function None 52
         53(inF0):     49(ptr) FunctionParameter
@@ -7166,1018 +7041,990 @@ gl_FragCoord origin is upper left
         56(inU0):     51(ptr) FunctionParameter
         57(inU1):     51(ptr) FunctionParameter
               59:             Label
-       825(r000):    136(ptr) Variable Function
-       828(r001):     49(ptr) Variable Function
-       831(r002):     49(ptr) Variable Function
-       834(r003):    136(ptr) Variable Function
-       837(r004):     49(ptr) Variable Function
-       842(r005):    841(ptr) Variable Function
-       845(r006):     51(ptr) Variable Function
-       848(r007):     49(ptr) Variable Function
-       851(r009):     49(ptr) Variable Function
-       854(r010):     49(ptr) Variable Function
-       858(r011):     49(ptr) Variable Function
-       861(r012):     49(ptr) Variable Function
-       874(r013):     49(ptr) Variable Function
-       877(r014):     49(ptr) Variable Function
-       880(r015):     51(ptr) Variable Function
-       883(r016):     49(ptr) Variable Function
-       886(r017):     49(ptr) Variable Function
-       889(r018):     49(ptr) Variable Function
-       892(r019):     49(ptr) Variable Function
-       895(r020):     49(ptr) Variable Function
-       898(r021):     49(ptr) Variable Function
-       901(r022):     49(ptr) Variable Function
-       904(r023):      7(ptr) Variable Function
-       908(r024):      7(ptr) Variable Function
-       912(r025):     49(ptr) Variable Function
-       923(r029):     49(ptr) Variable Function
-       926(r030):     49(ptr) Variable Function
-       929(r031):     49(ptr) Variable Function
-       934(r032):     51(ptr) Variable Function
-       939(r033):     51(ptr) Variable Function
-       941(r034):     49(ptr) Variable Function
-       944(r036):     49(ptr) Variable Function
-       948(r037):     49(ptr) Variable Function
-       951(r038):     49(ptr) Variable Function
-       957(r039):     49(ptr) Variable Function
-       961(r040):    960(ptr) Variable Function
-       964(r041):    960(ptr) Variable Function
-       967(r042):     49(ptr) Variable Function
-      971(r039a):     49(ptr) Variable Function
-       976(r043):      7(ptr) Variable Function
-       979(r044):     49(ptr) Variable Function
-       982(r045):     49(ptr) Variable Function
-       986(r046):     49(ptr) Variable Function
-       989(r047):     49(ptr) Variable Function
-       993(r048):     49(ptr) Variable Function
-       997(r049):     49(ptr) Variable Function
-      1000(r050):     49(ptr) Variable Function
-      1004(r051):     49(ptr) Variable Function
-      1007(r052):     49(ptr) Variable Function
-      1011(r053):     49(ptr) Variable Function
-      1015(r054):     49(ptr) Variable Function
-      1019(r055):     51(ptr) Variable Function
-      1022(r056):     49(ptr) Variable Function
-      1025(r057):     49(ptr) Variable Function
-      1028(r058):     49(ptr) Variable Function
-      1033(r059):     49(ptr) Variable Function
-      1036(r060):     49(ptr) Variable Function
-      1043(r061):     49(ptr) Variable Function
-      1046(r062):     49(ptr) Variable Function
-      1051(r063):     49(ptr) Variable Function
-      1054(r064):     49(ptr) Variable Function
-      1058(r065):     49(ptr) Variable Function
-      1061(r066):     49(ptr) Variable Function
-      1064(r067):     49(ptr) Variable Function
-             826:   48(fvec4) Load 53(inF0)
-             827:   135(bool) All 826
-                              Store 825(r000) 827
-             829:   48(fvec4) Load 53(inF0)
-             830:   48(fvec4) ExtInst 1(GLSL.std.450) 4(FAbs) 829
-                              Store 828(r001) 830
-             832:   48(fvec4) Load 53(inF0)
-             833:   48(fvec4) ExtInst 1(GLSL.std.450) 17(Acos) 832
-                              Store 831(r002) 833
-             835:   48(fvec4) Load 53(inF0)
-             836:   135(bool) Any 835
-                              Store 834(r003) 836
-             838:   48(fvec4) Load 53(inF0)
-             839:   48(fvec4) ExtInst 1(GLSL.std.450) 16(Asin) 838
-                              Store 837(r004) 839
-             843:   48(fvec4) Load 53(inF0)
-             844:  840(ivec4) Bitcast 843
-                              Store 842(r005) 844
-             846:   48(fvec4) Load 53(inF0)
-             847:   50(ivec4) Bitcast 846
-                              Store 845(r006) 847
-             849:   50(ivec4) Load 56(inU0)
-             850:   48(fvec4) Bitcast 849
-                              Store 848(r007) 850
-             852:   48(fvec4) Load 53(inF0)
-             853:   48(fvec4) ExtInst 1(GLSL.std.450) 18(Atan) 852
-                              Store 851(r009) 853
-             855:   48(fvec4) Load 53(inF0)
-             856:   48(fvec4) Load 54(inF1)
-             857:   48(fvec4) ExtInst 1(GLSL.std.450) 25(Atan2) 855 856
-                              Store 854(r010) 857
-             859:   48(fvec4) Load 53(inF0)
-             860:   48(fvec4) ExtInst 1(GLSL.std.450) 9(Ceil) 859
-                              Store 858(r011) 860
-             862:   48(fvec4) Load 53(inF0)
-             863:   48(fvec4) Load 54(inF1)
-             864:   48(fvec4) Load 55(inF2)
-             865:   48(fvec4) ExtInst 1(GLSL.std.450) 43(FClamp) 862 863 864
-                              Store 861(r012) 865
-             866:   48(fvec4) Load 53(inF0)
-             869:  868(bvec4) FOrdLessThan 866 867
-             870:   135(bool) Any 869
-                              SelectionMerge 872 None
-                              BranchConditional 870 871 872
-             871:               Label
+       807(r000):    136(ptr) Variable Function
+       810(r001):     49(ptr) Variable Function
+       813(r002):     49(ptr) Variable Function
+       816(r003):    136(ptr) Variable Function
+       819(r004):     49(ptr) Variable Function
+       824(r005):    823(ptr) Variable Function
+       827(r006):     51(ptr) Variable Function
+       830(r007):     49(ptr) Variable Function
+       833(r009):     49(ptr) Variable Function
+       836(r010):     49(ptr) Variable Function
+       840(r011):     49(ptr) Variable Function
+       843(r012):     49(ptr) Variable Function
+       856(r013):     49(ptr) Variable Function
+       859(r014):     49(ptr) Variable Function
+       862(r015):     51(ptr) Variable Function
+       865(r016):     49(ptr) Variable Function
+       868(r017):     49(ptr) Variable Function
+       871(r018):     49(ptr) Variable Function
+       874(r019):     49(ptr) Variable Function
+       877(r020):     49(ptr) Variable Function
+       880(r021):     49(ptr) Variable Function
+       883(r022):     49(ptr) Variable Function
+       886(r023):      7(ptr) Variable Function
+       890(r024):      7(ptr) Variable Function
+       894(r025):     49(ptr) Variable Function
+       905(r029):     49(ptr) Variable Function
+       908(r030):     49(ptr) Variable Function
+       911(r031):     49(ptr) Variable Function
+       916(r032):     51(ptr) Variable Function
+       921(r033):     51(ptr) Variable Function
+       923(r034):     49(ptr) Variable Function
+       926(r036):     49(ptr) Variable Function
+       930(r037):     49(ptr) Variable Function
+       933(r039):     49(ptr) Variable Function
+       937(r040):    936(ptr) Variable Function
+       940(r041):    936(ptr) Variable Function
+       943(r042):     49(ptr) Variable Function
+      947(r039a):     49(ptr) Variable Function
+       952(r043):      7(ptr) Variable Function
+       955(r044):     49(ptr) Variable Function
+       958(r045):     49(ptr) Variable Function
+       962(r046):     49(ptr) Variable Function
+       965(r047):     49(ptr) Variable Function
+       969(r048):     49(ptr) Variable Function
+       973(r049):     49(ptr) Variable Function
+       976(r050):     49(ptr) Variable Function
+       980(r051):     49(ptr) Variable Function
+       983(r052):     49(ptr) Variable Function
+       987(r053):     49(ptr) Variable Function
+       991(r054):     49(ptr) Variable Function
+       995(r055):     51(ptr) Variable Function
+       998(r056):     49(ptr) Variable Function
+      1001(r057):     49(ptr) Variable Function
+      1004(r058):     49(ptr) Variable Function
+      1009(r059):     49(ptr) Variable Function
+      1012(r060):     49(ptr) Variable Function
+      1019(r061):     49(ptr) Variable Function
+      1022(r062):     49(ptr) Variable Function
+      1027(r063):     49(ptr) Variable Function
+      1030(r064):     49(ptr) Variable Function
+      1034(r065):     49(ptr) Variable Function
+      1037(r066):     49(ptr) Variable Function
+      1040(r067):     49(ptr) Variable Function
+             808:   48(fvec4) Load 53(inF0)
+             809:   135(bool) All 808
+                              Store 807(r000) 809
+             811:   48(fvec4) Load 53(inF0)
+             812:   48(fvec4) ExtInst 1(GLSL.std.450) 4(FAbs) 811
+                              Store 810(r001) 812
+             814:   48(fvec4) Load 53(inF0)
+             815:   48(fvec4) ExtInst 1(GLSL.std.450) 17(Acos) 814
+                              Store 813(r002) 815
+             817:   48(fvec4) Load 53(inF0)
+             818:   135(bool) Any 817
+                              Store 816(r003) 818
+             820:   48(fvec4) Load 53(inF0)
+             821:   48(fvec4) ExtInst 1(GLSL.std.450) 16(Asin) 820
+                              Store 819(r004) 821
+             825:   48(fvec4) Load 53(inF0)
+             826:  822(ivec4) Bitcast 825
+                              Store 824(r005) 826
+             828:   48(fvec4) Load 53(inF0)
+             829:   50(ivec4) Bitcast 828
+                              Store 827(r006) 829
+             831:   50(ivec4) Load 56(inU0)
+             832:   48(fvec4) Bitcast 831
+                              Store 830(r007) 832
+             834:   48(fvec4) Load 53(inF0)
+             835:   48(fvec4) ExtInst 1(GLSL.std.450) 18(Atan) 834
+                              Store 833(r009) 835
+             837:   48(fvec4) Load 53(inF0)
+             838:   48(fvec4) Load 54(inF1)
+             839:   48(fvec4) ExtInst 1(GLSL.std.450) 25(Atan2) 837 838
+                              Store 836(r010) 839
+             841:   48(fvec4) Load 53(inF0)
+             842:   48(fvec4) ExtInst 1(GLSL.std.450) 9(Ceil) 841
+                              Store 840(r011) 842
+             844:   48(fvec4) Load 53(inF0)
+             845:   48(fvec4) Load 54(inF1)
+             846:   48(fvec4) Load 55(inF2)
+             847:   48(fvec4) ExtInst 1(GLSL.std.450) 43(FClamp) 844 845 846
+                              Store 843(r012) 847
+             848:   48(fvec4) Load 53(inF0)
+             851:  850(bvec4) FOrdLessThan 848 849
+             852:   135(bool) Any 851
+                              SelectionMerge 854 None
+                              BranchConditional 852 853 854
+             853:               Label
                                 Kill
-             872:             Label
+             854:             Label
+             857:   48(fvec4) Load 53(inF0)
+             858:   48(fvec4) ExtInst 1(GLSL.std.450) 14(Cos) 857
+                              Store 856(r013) 858
+             860:   48(fvec4) Load 53(inF0)
+             861:   48(fvec4) ExtInst 1(GLSL.std.450) 20(Cosh) 860
+                              Store 859(r014) 861
+             864:   50(ivec4) BitCount 863
+                              Store 862(r015) 864
+             866:   48(fvec4) Load 53(inF0)
+             867:   48(fvec4) DPdx 866
+                              Store 865(r016) 867
+             869:   48(fvec4) Load 53(inF0)
+             870:   48(fvec4) DPdxCoarse 869
+                              Store 868(r017) 870
+             872:   48(fvec4) Load 53(inF0)
+             873:   48(fvec4) DPdxFine 872
+                              Store 871(r018) 873
              875:   48(fvec4) Load 53(inF0)
-             876:   48(fvec4) ExtInst 1(GLSL.std.450) 14(Cos) 875
-                              Store 874(r013) 876
+             876:   48(fvec4) DPdy 875
+                              Store 874(r019) 876
              878:   48(fvec4) Load 53(inF0)
-             879:   48(fvec4) ExtInst 1(GLSL.std.450) 20(Cosh) 878
-                              Store 877(r014) 879
-             882:   50(ivec4) BitCount 881
-                              Store 880(r015) 882
+             879:   48(fvec4) DPdyCoarse 878
+                              Store 877(r020) 879
+             881:   48(fvec4) Load 53(inF0)
+             882:   48(fvec4) DPdyFine 881
+                              Store 880(r021) 882
              884:   48(fvec4) Load 53(inF0)
-             885:   48(fvec4) DPdx 884
-                              Store 883(r016) 885
+             885:   48(fvec4) ExtInst 1(GLSL.std.450) 12(Degrees) 884
+                              Store 883(r022) 885
              887:   48(fvec4) Load 53(inF0)
-             888:   48(fvec4) DPdxCoarse 887
-                              Store 886(r017) 888
-             890:   48(fvec4) Load 53(inF0)
-             891:   48(fvec4) DPdxFine 890
-                              Store 889(r018) 891
-             893:   48(fvec4) Load 53(inF0)
-             894:   48(fvec4) DPdy 893
-                              Store 892(r019) 894
-             896:   48(fvec4) Load 53(inF0)
-             897:   48(fvec4) DPdyCoarse 896
-                              Store 895(r020) 897
-             899:   48(fvec4) Load 53(inF0)
-             900:   48(fvec4) DPdyFine 899
-                              Store 898(r021) 900
-             902:   48(fvec4) Load 53(inF0)
-             903:   48(fvec4) ExtInst 1(GLSL.std.450) 12(Degrees) 902
-                              Store 901(r022) 903
-             905:   48(fvec4) Load 53(inF0)
-             906:   48(fvec4) Load 54(inF1)
-             907:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 905 906
-                              Store 904(r023) 907
+             888:   48(fvec4) Load 54(inF1)
+             889:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 887 888
+                              Store 886(r023) 889
+             891:   48(fvec4) Load 53(inF0)
+             892:   48(fvec4) Load 54(inF1)
+             893:    6(float) Dot 891 892
+                              Store 890(r024) 893
+             895:      7(ptr) AccessChain 53(inF0) 515
+             896:    6(float) Load 895
+             897:      7(ptr) AccessChain 54(inF1) 515
+             898:    6(float) Load 897
+             899:    6(float) FMul 896 898
+             900:      7(ptr) AccessChain 53(inF0) 516
+             901:    6(float) Load 900
+             902:      7(ptr) AccessChain 54(inF1) 623
+             903:    6(float) Load 902
+             904:   48(fvec4) CompositeConstruct 281 899 901 903
+                              Store 894(r025) 904
+             906:   48(fvec4) Load 53(inF0)
+             907:   48(fvec4) ExtInst 1(GLSL.std.450) 27(Exp) 906
+                              Store 905(r029) 907
              909:   48(fvec4) Load 53(inF0)
-             910:   48(fvec4) Load 54(inF1)
-             911:    6(float) Dot 909 910
-                              Store 908(r024) 911
-             913:      7(ptr) AccessChain 53(inF0) 527
-             914:    6(float) Load 913
-             915:      7(ptr) AccessChain 54(inF1) 527
-             916:    6(float) Load 915
-             917:    6(float) FMul 914 916
-             918:      7(ptr) AccessChain 53(inF0) 528
-             919:    6(float) Load 918
-             920:      7(ptr) AccessChain 54(inF1) 635
-             921:    6(float) Load 920
-             922:   48(fvec4) CompositeConstruct 287 917 919 921
-                              Store 912(r025) 922
+             910:   48(fvec4) ExtInst 1(GLSL.std.450) 29(Exp2) 909
+                              Store 908(r030) 910
+             912:   48(fvec4) Load 53(inF0)
+             913:   48(fvec4) Load 54(inF1)
+             914:   48(fvec4) Load 55(inF2)
+             915:   48(fvec4) ExtInst 1(GLSL.std.450) 70(FaceForward) 912 913 914
+                              Store 911(r031) 915
+             920:   50(ivec4) ExtInst 1(GLSL.std.450) 75(FindUMsb) 919
+                              Store 916(r032) 920
+             922:   50(ivec4) ExtInst 1(GLSL.std.450) 73(FindILsb) 919
+                              Store 921(r033) 922
              924:   48(fvec4) Load 53(inF0)
-             925:   48(fvec4) ExtInst 1(GLSL.std.450) 27(Exp) 924
-                              Store 923(r029) 925
+             925:   48(fvec4) ExtInst 1(GLSL.std.450) 8(Floor) 924
+                              Store 923(r034) 925
              927:   48(fvec4) Load 53(inF0)
-             928:   48(fvec4) ExtInst 1(GLSL.std.450) 29(Exp2) 927
-                              Store 926(r030) 928
-             930:   48(fvec4) Load 53(inF0)
-             931:   48(fvec4) Load 54(inF1)
-             932:   48(fvec4) Load 55(inF2)
-             933:   48(fvec4) ExtInst 1(GLSL.std.450) 70(FaceForward) 930 931 932
-                              Store 929(r031) 933
-             938:   50(ivec4) ExtInst 1(GLSL.std.450) 75(FindUMsb) 937
-                              Store 934(r032) 938
-             940:   50(ivec4) ExtInst 1(GLSL.std.450) 73(FindILsb) 937
-                              Store 939(r033) 940
-             942:   48(fvec4) Load 53(inF0)
-             943:   48(fvec4) ExtInst 1(GLSL.std.450) 8(Floor) 942
-                              Store 941(r034) 943
-             945:   48(fvec4) Load 53(inF0)
-             946:   48(fvec4) Load 54(inF1)
-             947:   48(fvec4) FMod 945 946
-                              Store 944(r036) 947
-             949:   48(fvec4) Load 53(inF0)
-             950:   48(fvec4) ExtInst 1(GLSL.std.450) 10(Fract) 949
-                              Store 948(r037) 950
-             952:   48(fvec4) Load 53(inF0)
-             954:953(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 952
-             955:  840(ivec4) CompositeExtract 954 1
-                              Store 54(inF1) 955
-             956:   48(fvec4) CompositeExtract 954 0
-                              Store 951(r038) 956
-             958:   48(fvec4) Load 53(inF0)
-             959:   48(fvec4) Fwidth 958
-                              Store 957(r039) 959
-             962:   48(fvec4) Load 53(inF0)
-             963:  868(bvec4) IsInf 962
-                              Store 961(r040) 963
-             965:   48(fvec4) Load 53(inF0)
-             966:  868(bvec4) IsNan 965
-                              Store 964(r041) 966
-             968:   48(fvec4) Load 53(inF0)
-             969:   48(fvec4) Load 54(inF1)
-             970:   48(fvec4) ExtInst 1(GLSL.std.450) 53(Ldexp) 968 969
-                              Store 967(r042) 970
-             972:   48(fvec4) Load 53(inF0)
-             973:   48(fvec4) Load 54(inF1)
-             974:   48(fvec4) Load 55(inF2)
-             975:   48(fvec4) ExtInst 1(GLSL.std.450) 46(FMix) 972 973 974
-                              Store 971(r039a) 975
+             928:   48(fvec4) Load 54(inF1)
+             929:   48(fvec4) FMod 927 928
+                              Store 926(r036) 929
+             931:   48(fvec4) Load 53(inF0)
+             932:   48(fvec4) ExtInst 1(GLSL.std.450) 10(Fract) 931
+                              Store 930(r037) 932
+             934:   48(fvec4) Load 53(inF0)
+             935:   48(fvec4) Fwidth 934
+                              Store 933(r039) 935
+             938:   48(fvec4) Load 53(inF0)
+             939:  850(bvec4) IsInf 938
+                              Store 937(r040) 939
+             941:   48(fvec4) Load 53(inF0)
+             942:  850(bvec4) IsNan 941
+                              Store 940(r041) 942
+             944:   48(fvec4) Load 53(inF0)
+             945:   48(fvec4) Load 54(inF1)
+             946:   48(fvec4) ExtInst 1(GLSL.std.450) 53(Ldexp) 944 945
+                              Store 943(r042) 946
+             948:   48(fvec4) Load 53(inF0)
+             949:   48(fvec4) Load 54(inF1)
+             950:   48(fvec4) Load 55(inF2)
+             951:   48(fvec4) ExtInst 1(GLSL.std.450) 46(FMix) 948 949 950
+                              Store 947(r039a) 951
+             953:   48(fvec4) Load 53(inF0)
+             954:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 953
+                              Store 952(r043) 954
+             956:   48(fvec4) Load 53(inF0)
+             957:   48(fvec4) ExtInst 1(GLSL.std.450) 28(Log) 956
+                              Store 955(r044) 957
+             959:   48(fvec4) Load 53(inF0)
+             960:   48(fvec4) ExtInst 1(GLSL.std.450) 30(Log2) 959
+             961:   48(fvec4) VectorTimesScalar 960 260
+                              Store 958(r045) 961
+             963:   48(fvec4) Load 53(inF0)
+             964:   48(fvec4) ExtInst 1(GLSL.std.450) 30(Log2) 963
+                              Store 962(r046) 964
+             966:   48(fvec4) Load 53(inF0)
+             967:   48(fvec4) Load 54(inF1)
+             968:   48(fvec4) ExtInst 1(GLSL.std.450) 40(FMax) 966 967
+                              Store 965(r047) 968
+             970:   48(fvec4) Load 53(inF0)
+             971:   48(fvec4) Load 54(inF1)
+             972:   48(fvec4) ExtInst 1(GLSL.std.450) 37(FMin) 970 971
+                              Store 969(r048) 972
+             974:   48(fvec4) Load 53(inF0)
+             975:   48(fvec4) ExtInst 1(GLSL.std.450) 69(Normalize) 974
+                              Store 973(r049) 975
              977:   48(fvec4) Load 53(inF0)
-             978:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 977
-                              Store 976(r043) 978
-             980:   48(fvec4) Load 53(inF0)
-             981:   48(fvec4) ExtInst 1(GLSL.std.450) 28(Log) 980
-                              Store 979(r044) 981
-             983:   48(fvec4) Load 53(inF0)
-             984:   48(fvec4) ExtInst 1(GLSL.std.450) 30(Log2) 983
-             985:   48(fvec4) VectorTimesScalar 984 266
-                              Store 982(r045) 985
-             987:   48(fvec4) Load 53(inF0)
-             988:   48(fvec4) ExtInst 1(GLSL.std.450) 30(Log2) 987
-                              Store 986(r046) 988
-             990:   48(fvec4) Load 53(inF0)
-             991:   48(fvec4) Load 54(inF1)
-             992:   48(fvec4) ExtInst 1(GLSL.std.450) 40(FMax) 990 991
-                              Store 989(r047) 992
-             994:   48(fvec4) Load 53(inF0)
-             995:   48(fvec4) Load 54(inF1)
-             996:   48(fvec4) ExtInst 1(GLSL.std.450) 37(FMin) 994 995
-                              Store 993(r048) 996
-             998:   48(fvec4) Load 53(inF0)
-             999:   48(fvec4) ExtInst 1(GLSL.std.450) 69(Normalize) 998
-                              Store 997(r049) 999
-            1001:   48(fvec4) Load 53(inF0)
-            1002:   48(fvec4) Load 54(inF1)
-            1003:   48(fvec4) ExtInst 1(GLSL.std.450) 26(Pow) 1001 1002
-                              Store 1000(r050) 1003
+             978:   48(fvec4) Load 54(inF1)
+             979:   48(fvec4) ExtInst 1(GLSL.std.450) 26(Pow) 977 978
+                              Store 976(r050) 979
+             981:   48(fvec4) Load 53(inF0)
+             982:   48(fvec4) ExtInst 1(GLSL.std.450) 11(Radians) 981
+                              Store 980(r051) 982
+             984:   48(fvec4) Load 53(inF0)
+             985:   48(fvec4) CompositeConstruct 281 281 281 281
+             986:   48(fvec4) FDiv 985 984
+                              Store 983(r052) 986
+             988:   48(fvec4) Load 53(inF0)
+             989:   48(fvec4) Load 54(inF1)
+             990:   48(fvec4) ExtInst 1(GLSL.std.450) 71(Reflect) 988 989
+                              Store 987(r053) 990
+             992:   48(fvec4) Load 53(inF0)
+             993:   48(fvec4) Load 54(inF1)
+             994:   48(fvec4) ExtInst 1(GLSL.std.450) 72(Refract) 992 993 512
+                              Store 991(r054) 994
+             997:   50(ivec4) BitReverse 996
+                              Store 995(r055) 997
+             999:   48(fvec4) Load 53(inF0)
+            1000:   48(fvec4) ExtInst 1(GLSL.std.450) 2(RoundEven) 999
+                              Store 998(r056) 1000
+            1002:   48(fvec4) Load 53(inF0)
+            1003:   48(fvec4) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1002
+                              Store 1001(r057) 1003
             1005:   48(fvec4) Load 53(inF0)
-            1006:   48(fvec4) ExtInst 1(GLSL.std.450) 11(Radians) 1005
-                              Store 1004(r051) 1006
-            1008:   48(fvec4) Load 53(inF0)
-            1009:   48(fvec4) CompositeConstruct 287 287 287 287
-            1010:   48(fvec4) FDiv 1009 1008
-                              Store 1007(r052) 1010
-            1012:   48(fvec4) Load 53(inF0)
-            1013:   48(fvec4) Load 54(inF1)
-            1014:   48(fvec4) ExtInst 1(GLSL.std.450) 71(Reflect) 1012 1013
-                              Store 1011(r053) 1014
-            1016:   48(fvec4) Load 53(inF0)
-            1017:   48(fvec4) Load 54(inF1)
-            1018:   48(fvec4) ExtInst 1(GLSL.std.450) 72(Refract) 1016 1017 524
-                              Store 1015(r054) 1018
-            1021:   50(ivec4) BitReverse 1020
-                              Store 1019(r055) 1021
+            1006:   48(fvec4) CompositeConstruct 179 179 179 179
+            1007:   48(fvec4) CompositeConstruct 281 281 281 281
+            1008:   48(fvec4) ExtInst 1(GLSL.std.450) 43(FClamp) 1005 1006 1007
+                              Store 1004(r058) 1008
+            1010:   48(fvec4) Load 53(inF0)
+            1011:   48(fvec4) ExtInst 1(GLSL.std.450) 6(FSign) 1010
+                              Store 1009(r059) 1011
+            1013:   48(fvec4) Load 53(inF0)
+            1014:   48(fvec4) ExtInst 1(GLSL.std.450) 13(Sin) 1013
+                              Store 1012(r060) 1014
+            1015:   48(fvec4) Load 53(inF0)
+            1016:   48(fvec4) ExtInst 1(GLSL.std.450) 13(Sin) 1015
+                              Store 54(inF1) 1016
+            1017:   48(fvec4) Load 53(inF0)
+            1018:   48(fvec4) ExtInst 1(GLSL.std.450) 14(Cos) 1017
+                              Store 55(inF2) 1018
+            1020:   48(fvec4) Load 53(inF0)
+            1021:   48(fvec4) ExtInst 1(GLSL.std.450) 19(Sinh) 1020
+                              Store 1019(r061) 1021
             1023:   48(fvec4) Load 53(inF0)
-            1024:   48(fvec4) ExtInst 1(GLSL.std.450) 2(RoundEven) 1023
-                              Store 1022(r056) 1024
-            1026:   48(fvec4) Load 53(inF0)
-            1027:   48(fvec4) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1026
-                              Store 1025(r057) 1027
-            1029:   48(fvec4) Load 53(inF0)
-            1030:   48(fvec4) CompositeConstruct 179 179 179 179
-            1031:   48(fvec4) CompositeConstruct 287 287 287 287
-            1032:   48(fvec4) ExtInst 1(GLSL.std.450) 43(FClamp) 1029 1030 1031
-                              Store 1028(r058) 1032
-            1034:   48(fvec4) Load 53(inF0)
-            1035:   48(fvec4) ExtInst 1(GLSL.std.450) 6(FSign) 1034
-                              Store 1033(r059) 1035
-            1037:   48(fvec4) Load 53(inF0)
-            1038:   48(fvec4) ExtInst 1(GLSL.std.450) 13(Sin) 1037
-                              Store 1036(r060) 1038
-            1039:   48(fvec4) Load 53(inF0)
-            1040:   48(fvec4) ExtInst 1(GLSL.std.450) 13(Sin) 1039
-                              Store 54(inF1) 1040
+            1024:   48(fvec4) Load 54(inF1)
+            1025:   48(fvec4) Load 55(inF2)
+            1026:   48(fvec4) ExtInst 1(GLSL.std.450) 49(SmoothStep) 1023 1024 1025
+                              Store 1022(r062) 1026
+            1028:   48(fvec4) Load 53(inF0)
+            1029:   48(fvec4) ExtInst 1(GLSL.std.450) 31(Sqrt) 1028
+                              Store 1027(r063) 1029
+            1031:   48(fvec4) Load 53(inF0)
+            1032:   48(fvec4) Load 54(inF1)
+            1033:   48(fvec4) ExtInst 1(GLSL.std.450) 48(Step) 1031 1032
+                              Store 1030(r064) 1033
+            1035:   48(fvec4) Load 53(inF0)
+            1036:   48(fvec4) ExtInst 1(GLSL.std.450) 15(Tan) 1035
+                              Store 1034(r065) 1036
+            1038:   48(fvec4) Load 53(inF0)
+            1039:   48(fvec4) ExtInst 1(GLSL.std.450) 21(Tanh) 1038
+                              Store 1037(r066) 1039
             1041:   48(fvec4) Load 53(inF0)
-            1042:   48(fvec4) ExtInst 1(GLSL.std.450) 14(Cos) 1041
-                              Store 55(inF2) 1042
-            1044:   48(fvec4) Load 53(inF0)
-            1045:   48(fvec4) ExtInst 1(GLSL.std.450) 19(Sinh) 1044
-                              Store 1043(r061) 1045
-            1047:   48(fvec4) Load 53(inF0)
-            1048:   48(fvec4) Load 54(inF1)
-            1049:   48(fvec4) Load 55(inF2)
-            1050:   48(fvec4) ExtInst 1(GLSL.std.450) 49(SmoothStep) 1047 1048 1049
-                              Store 1046(r062) 1050
-            1052:   48(fvec4) Load 53(inF0)
-            1053:   48(fvec4) ExtInst 1(GLSL.std.450) 31(Sqrt) 1052
-                              Store 1051(r063) 1053
-            1055:   48(fvec4) Load 53(inF0)
-            1056:   48(fvec4) Load 54(inF1)
-            1057:   48(fvec4) ExtInst 1(GLSL.std.450) 48(Step) 1055 1056
-                              Store 1054(r064) 1057
-            1059:   48(fvec4) Load 53(inF0)
-            1060:   48(fvec4) ExtInst 1(GLSL.std.450) 15(Tan) 1059
-                              Store 1058(r065) 1060
-            1062:   48(fvec4) Load 53(inF0)
-            1063:   48(fvec4) ExtInst 1(GLSL.std.450) 21(Tanh) 1062
-                              Store 1061(r066) 1063
-            1065:   48(fvec4) Load 53(inF0)
-            1066:   48(fvec4) ExtInst 1(GLSL.std.450) 3(Trunc) 1065
-                              Store 1064(r067) 1066
-                              ReturnValue 1068
+            1042:   48(fvec4) ExtInst 1(GLSL.std.450) 3(Trunc) 1041
+                              Store 1040(r067) 1042
+                              ReturnValue 1044
                               FunctionEnd
 66(PixelShaderFunction2x2(mf22;mf22;mf22;):          60 Function None 62
         63(inF0):     61(ptr) FunctionParameter
         64(inF1):     61(ptr) FunctionParameter
         65(inF2):     61(ptr) FunctionParameter
               67:             Label
-      1071(r000):    136(ptr) Variable Function
-      1074(r001):     61(ptr) Variable Function
-      1079(r003):    136(ptr) Variable Function
-      1082(r004):     61(ptr) Variable Function
-      1085(r005):     61(ptr) Variable Function
-      1088(r006):     61(ptr) Variable Function
-      1092(r007):     61(ptr) Variable Function
-      1103(r008):     61(ptr) Variable Function
-      1108(r009):     61(ptr) Variable Function
-      1111(r010):     61(ptr) Variable Function
-      1114(r011):     61(ptr) Variable Function
-      1117(r012):     61(ptr) Variable Function
-      1120(r013):     61(ptr) Variable Function
-      1123(r014):     61(ptr) Variable Function
-      1126(r015):     61(ptr) Variable Function
-      1129(r016):     61(ptr) Variable Function
-      1132(r017):     61(ptr) Variable Function
-      1135(r018):      7(ptr) Variable Function
-      1138(r019):     61(ptr) Variable Function
-      1141(R020):     61(ptr) Variable Function
-      1144(r021):     61(ptr) Variable Function
-      1147(r022):     61(ptr) Variable Function
-      1157(r023):     61(ptr) Variable Function
-      1160(r024):     61(ptr) Variable Function
-      1166(r025):     61(ptr) Variable Function
-      1169(r026):     61(ptr) Variable Function
-     1173(r026a):     61(ptr) Variable Function
-      1178(r027):     61(ptr) Variable Function
-      1181(r028):     61(ptr) Variable Function
-      1185(r029):     61(ptr) Variable Function
-      1188(r030):     61(ptr) Variable Function
-      1192(r031):     61(ptr) Variable Function
-      1196(r032):     61(ptr) Variable Function
-      1200(r033):     61(ptr) Variable Function
-      1203(r034):     61(ptr) Variable Function
-      1206(r035):     61(ptr) Variable Function
-      1209(r036):     61(ptr) Variable Function
-      1214(r037):     61(ptr) Variable Function
-      1217(r038):     61(ptr) Variable Function
-      1224(r039):     61(ptr) Variable Function
-      1227(r049):     61(ptr) Variable Function
-      1232(r041):     61(ptr) Variable Function
-      1235(r042):     61(ptr) Variable Function
-      1239(r043):     61(ptr) Variable Function
-      1242(r044):     61(ptr) Variable Function
-      1247(r046):     61(ptr) Variable Function
-            1072:          60 Load 63(inF0)
-            1073:   135(bool) All 1072
-                              Store 1071(r000) 1073
-            1075:          60 Load 63(inF0)
-            1076:          60 ExtInst 1(GLSL.std.450) 4(FAbs) 1075
-                              Store 1074(r001) 1076
-            1077:          60 Load 63(inF0)
-            1078:          60 ExtInst 1(GLSL.std.450) 17(Acos) 1077
-            1080:          60 Load 63(inF0)
-            1081:   135(bool) Any 1080
-                              Store 1079(r003) 1081
-            1083:          60 Load 63(inF0)
-            1084:          60 ExtInst 1(GLSL.std.450) 16(Asin) 1083
-                              Store 1082(r004) 1084
-            1086:          60 Load 63(inF0)
-            1087:          60 ExtInst 1(GLSL.std.450) 18(Atan) 1086
-                              Store 1085(r005) 1087
-            1089:          60 Load 63(inF0)
-            1090:          60 Load 64(inF1)
-            1091:          60 ExtInst 1(GLSL.std.450) 25(Atan2) 1089 1090
-                              Store 1088(r006) 1091
-            1093:          60 Load 63(inF0)
-            1094:          60 ExtInst 1(GLSL.std.450) 9(Ceil) 1093
-                              Store 1092(r007) 1094
-            1095:          60 Load 63(inF0)
-            1098:        1097 FOrdLessThan 1095 1096
-            1099:   135(bool) Any 1098
-                              SelectionMerge 1101 None
-                              BranchConditional 1099 1100 1101
-            1100:               Label
+      1047(r000):    136(ptr) Variable Function
+      1050(r001):     61(ptr) Variable Function
+      1055(r003):    136(ptr) Variable Function
+      1058(r004):     61(ptr) Variable Function
+      1061(r005):     61(ptr) Variable Function
+      1064(r006):     61(ptr) Variable Function
+      1068(r007):     61(ptr) Variable Function
+      1079(r008):     61(ptr) Variable Function
+      1084(r009):     61(ptr) Variable Function
+      1087(r010):     61(ptr) Variable Function
+      1090(r011):     61(ptr) Variable Function
+      1093(r012):     61(ptr) Variable Function
+      1096(r013):     61(ptr) Variable Function
+      1099(r014):     61(ptr) Variable Function
+      1102(r015):     61(ptr) Variable Function
+      1105(r016):     61(ptr) Variable Function
+      1108(r017):     61(ptr) Variable Function
+      1111(r018):      7(ptr) Variable Function
+      1114(r019):     61(ptr) Variable Function
+      1117(R020):     61(ptr) Variable Function
+      1120(r021):     61(ptr) Variable Function
+      1123(r022):     61(ptr) Variable Function
+      1133(r023):     61(ptr) Variable Function
+      1136(r025):     61(ptr) Variable Function
+      1139(r026):     61(ptr) Variable Function
+     1143(r026a):     61(ptr) Variable Function
+      1148(r027):     61(ptr) Variable Function
+      1151(r028):     61(ptr) Variable Function
+      1155(r029):     61(ptr) Variable Function
+      1158(r030):     61(ptr) Variable Function
+      1162(r031):     61(ptr) Variable Function
+      1166(r032):     61(ptr) Variable Function
+      1170(r033):     61(ptr) Variable Function
+      1173(r034):     61(ptr) Variable Function
+      1176(r035):     61(ptr) Variable Function
+      1179(r036):     61(ptr) Variable Function
+      1184(r037):     61(ptr) Variable Function
+      1187(r038):     61(ptr) Variable Function
+      1194(r039):     61(ptr) Variable Function
+      1197(r049):     61(ptr) Variable Function
+      1202(r041):     61(ptr) Variable Function
+      1205(r042):     61(ptr) Variable Function
+      1209(r043):     61(ptr) Variable Function
+      1212(r044):     61(ptr) Variable Function
+      1217(r046):     61(ptr) Variable Function
+            1048:          60 Load 63(inF0)
+            1049:   135(bool) All 1048
+                              Store 1047(r000) 1049
+            1051:          60 Load 63(inF0)
+            1052:          60 ExtInst 1(GLSL.std.450) 4(FAbs) 1051
+                              Store 1050(r001) 1052
+            1053:          60 Load 63(inF0)
+            1054:          60 ExtInst 1(GLSL.std.450) 17(Acos) 1053
+            1056:          60 Load 63(inF0)
+            1057:   135(bool) Any 1056
+                              Store 1055(r003) 1057
+            1059:          60 Load 63(inF0)
+            1060:          60 ExtInst 1(GLSL.std.450) 16(Asin) 1059
+                              Store 1058(r004) 1060
+            1062:          60 Load 63(inF0)
+            1063:          60 ExtInst 1(GLSL.std.450) 18(Atan) 1062
+                              Store 1061(r005) 1063
+            1065:          60 Load 63(inF0)
+            1066:          60 Load 64(inF1)
+            1067:          60 ExtInst 1(GLSL.std.450) 25(Atan2) 1065 1066
+                              Store 1064(r006) 1067
+            1069:          60 Load 63(inF0)
+            1070:          60 ExtInst 1(GLSL.std.450) 9(Ceil) 1069
+                              Store 1068(r007) 1070
+            1071:          60 Load 63(inF0)
+            1074:        1073 FOrdLessThan 1071 1072
+            1075:   135(bool) Any 1074
+                              SelectionMerge 1077 None
+                              BranchConditional 1075 1076 1077
+            1076:               Label
                                 Kill
-            1101:             Label
-            1104:          60 Load 63(inF0)
-            1105:          60 Load 64(inF1)
-            1106:          60 Load 65(inF2)
-            1107:          60 ExtInst 1(GLSL.std.450) 43(FClamp) 1104 1105 1106
-                              Store 1103(r008) 1107
+            1077:             Label
+            1080:          60 Load 63(inF0)
+            1081:          60 Load 64(inF1)
+            1082:          60 Load 65(inF2)
+            1083:          60 ExtInst 1(GLSL.std.450) 43(FClamp) 1080 1081 1082
+                              Store 1079(r008) 1083
+            1085:          60 Load 63(inF0)
+            1086:          60 ExtInst 1(GLSL.std.450) 14(Cos) 1085
+                              Store 1084(r009) 1086
+            1088:          60 Load 63(inF0)
+            1089:          60 ExtInst 1(GLSL.std.450) 20(Cosh) 1088
+                              Store 1087(r010) 1089
+            1091:          60 Load 63(inF0)
+            1092:          60 DPdx 1091
+                              Store 1090(r011) 1092
+            1094:          60 Load 63(inF0)
+            1095:          60 DPdxCoarse 1094
+                              Store 1093(r012) 1095
+            1097:          60 Load 63(inF0)
+            1098:          60 DPdxFine 1097
+                              Store 1096(r013) 1098
+            1100:          60 Load 63(inF0)
+            1101:          60 DPdy 1100
+                              Store 1099(r014) 1101
+            1103:          60 Load 63(inF0)
+            1104:          60 DPdyCoarse 1103
+                              Store 1102(r015) 1104
+            1106:          60 Load 63(inF0)
+            1107:          60 DPdyFine 1106
+                              Store 1105(r016) 1107
             1109:          60 Load 63(inF0)
-            1110:          60 ExtInst 1(GLSL.std.450) 14(Cos) 1109
-                              Store 1108(r009) 1110
+            1110:          60 ExtInst 1(GLSL.std.450) 12(Degrees) 1109
+                              Store 1108(r017) 1110
             1112:          60 Load 63(inF0)
-            1113:          60 ExtInst 1(GLSL.std.450) 20(Cosh) 1112
-                              Store 1111(r010) 1113
+            1113:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 1112
+                              Store 1111(r018) 1113
             1115:          60 Load 63(inF0)
-            1116:          60 DPdx 1115
-                              Store 1114(r011) 1116
+            1116:          60 ExtInst 1(GLSL.std.450) 27(Exp) 1115
+                              Store 1114(r019) 1116
             1118:          60 Load 63(inF0)
-            1119:          60 DPdxCoarse 1118
-                              Store 1117(r012) 1119
+            1119:          60 ExtInst 1(GLSL.std.450) 29(Exp2) 1118
+                              Store 1117(R020) 1119
             1121:          60 Load 63(inF0)
-            1122:          60 DPdxFine 1121
-                              Store 1120(r013) 1122
+            1122:          60 ExtInst 1(GLSL.std.450) 8(Floor) 1121
+                              Store 1120(r021) 1122
             1124:          60 Load 63(inF0)
-            1125:          60 DPdy 1124
-                              Store 1123(r014) 1125
-            1127:          60 Load 63(inF0)
-            1128:          60 DPdyCoarse 1127
-                              Store 1126(r015) 1128
-            1130:          60 Load 63(inF0)
-            1131:          60 DPdyFine 1130
-                              Store 1129(r016) 1131
-            1133:          60 Load 63(inF0)
-            1134:          60 ExtInst 1(GLSL.std.450) 12(Degrees) 1133
-                              Store 1132(r017) 1134
-            1136:          60 Load 63(inF0)
-            1137:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 1136
-                              Store 1135(r018) 1137
-            1139:          60 Load 63(inF0)
-            1140:          60 ExtInst 1(GLSL.std.450) 27(Exp) 1139
-                              Store 1138(r019) 1140
-            1142:          60 Load 63(inF0)
-            1143:          60 ExtInst 1(GLSL.std.450) 29(Exp2) 1142
-                              Store 1141(R020) 1143
-            1145:          60 Load 63(inF0)
-            1146:          60 ExtInst 1(GLSL.std.450) 8(Floor) 1145
-                              Store 1144(r021) 1146
-            1148:          60 Load 63(inF0)
-            1149:          60 Load 64(inF1)
-            1150:   24(fvec2) CompositeExtract 1148 0
-            1151:   24(fvec2) CompositeExtract 1149 0
-            1152:   24(fvec2) FMod 1150 1151
-            1153:   24(fvec2) CompositeExtract 1148 1
-            1154:   24(fvec2) CompositeExtract 1149 1
-            1155:   24(fvec2) FMod 1153 1154
-            1156:          60 CompositeConstruct 1152 1155
-                              Store 1147(r022) 1156
-            1158:          60 Load 63(inF0)
-            1159:          60 ExtInst 1(GLSL.std.450) 10(Fract) 1158
-                              Store 1157(r023) 1159
-            1161:          60 Load 63(inF0)
-            1163:1162(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 1161
-            1164:  356(ivec2) CompositeExtract 1163 1
-                              Store 64(inF1) 1164
-            1165:          60 CompositeExtract 1163 0
-                              Store 1160(r024) 1165
+            1125:          60 Load 64(inF1)
+            1126:   24(fvec2) CompositeExtract 1124 0
+            1127:   24(fvec2) CompositeExtract 1125 0
+            1128:   24(fvec2) FMod 1126 1127
+            1129:   24(fvec2) CompositeExtract 1124 1
+            1130:   24(fvec2) CompositeExtract 1125 1
+            1131:   24(fvec2) FMod 1129 1130
+            1132:          60 CompositeConstruct 1128 1131
+                              Store 1123(r022) 1132
+            1134:          60 Load 63(inF0)
+            1135:          60 ExtInst 1(GLSL.std.450) 10(Fract) 1134
+                              Store 1133(r023) 1135
+            1137:          60 Load 63(inF0)
+            1138:          60 Fwidth 1137
+                              Store 1136(r025) 1138
+            1140:          60 Load 63(inF0)
+            1141:          60 Load 64(inF1)
+            1142:          60 ExtInst 1(GLSL.std.450) 53(Ldexp) 1140 1141
+                              Store 1139(r026) 1142
+            1144:          60 Load 63(inF0)
+            1145:          60 Load 64(inF1)
+            1146:          60 Load 65(inF2)
+            1147:          60 ExtInst 1(GLSL.std.450) 46(FMix) 1144 1145 1146
+                              Store 1143(r026a) 1147
+            1149:          60 Load 63(inF0)
+            1150:          60 ExtInst 1(GLSL.std.450) 28(Log) 1149
+                              Store 1148(r027) 1150
+            1152:          60 Load 63(inF0)
+            1153:          60 ExtInst 1(GLSL.std.450) 30(Log2) 1152
+            1154:          60 MatrixTimesScalar 1153 260
+                              Store 1151(r028) 1154
+            1156:          60 Load 63(inF0)
+            1157:          60 ExtInst 1(GLSL.std.450) 30(Log2) 1156
+                              Store 1155(r029) 1157
+            1159:          60 Load 63(inF0)
+            1160:          60 Load 64(inF1)
+            1161:          60 ExtInst 1(GLSL.std.450) 40(FMax) 1159 1160
+                              Store 1158(r030) 1161
+            1163:          60 Load 63(inF0)
+            1164:          60 Load 64(inF1)
+            1165:          60 ExtInst 1(GLSL.std.450) 37(FMin) 1163 1164
+                              Store 1162(r031) 1165
             1167:          60 Load 63(inF0)
-            1168:          60 Fwidth 1167
-                              Store 1166(r025) 1168
-            1170:          60 Load 63(inF0)
-            1171:          60 Load 64(inF1)
-            1172:          60 ExtInst 1(GLSL.std.450) 53(Ldexp) 1170 1171
-                              Store 1169(r026) 1172
+            1168:          60 Load 64(inF1)
+            1169:          60 ExtInst 1(GLSL.std.450) 26(Pow) 1167 1168
+                              Store 1166(r032) 1169
+            1171:          60 Load 63(inF0)
+            1172:          60 ExtInst 1(GLSL.std.450) 11(Radians) 1171
+                              Store 1170(r033) 1172
             1174:          60 Load 63(inF0)
-            1175:          60 Load 64(inF1)
-            1176:          60 Load 65(inF2)
-            1177:          60 ExtInst 1(GLSL.std.450) 46(FMix) 1174 1175 1176
-                              Store 1173(r026a) 1177
-            1179:          60 Load 63(inF0)
-            1180:          60 ExtInst 1(GLSL.std.450) 28(Log) 1179
-                              Store 1178(r027) 1180
-            1182:          60 Load 63(inF0)
-            1183:          60 ExtInst 1(GLSL.std.450) 30(Log2) 1182
-            1184:          60 MatrixTimesScalar 1183 266
-                              Store 1181(r028) 1184
-            1186:          60 Load 63(inF0)
-            1187:          60 ExtInst 1(GLSL.std.450) 30(Log2) 1186
-                              Store 1185(r029) 1187
-            1189:          60 Load 63(inF0)
-            1190:          60 Load 64(inF1)
-            1191:          60 ExtInst 1(GLSL.std.450) 40(FMax) 1189 1190
-                              Store 1188(r030) 1191
-            1193:          60 Load 63(inF0)
-            1194:          60 Load 64(inF1)
-            1195:          60 ExtInst 1(GLSL.std.450) 37(FMin) 1193 1194
-                              Store 1192(r031) 1195
-            1197:          60 Load 63(inF0)
-            1198:          60 Load 64(inF1)
-            1199:          60 ExtInst 1(GLSL.std.450) 26(Pow) 1197 1198
-                              Store 1196(r032) 1199
-            1201:          60 Load 63(inF0)
-            1202:          60 ExtInst 1(GLSL.std.450) 11(Radians) 1201
-                              Store 1200(r033) 1202
-            1204:          60 Load 63(inF0)
-            1205:          60 ExtInst 1(GLSL.std.450) 2(RoundEven) 1204
-                              Store 1203(r034) 1205
-            1207:          60 Load 63(inF0)
-            1208:          60 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1207
-                              Store 1206(r035) 1208
+            1175:          60 ExtInst 1(GLSL.std.450) 2(RoundEven) 1174
+                              Store 1173(r034) 1175
+            1177:          60 Load 63(inF0)
+            1178:          60 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1177
+                              Store 1176(r035) 1178
+            1180:          60 Load 63(inF0)
+            1181:   24(fvec2) CompositeConstruct 179 179
+            1182:   24(fvec2) CompositeConstruct 281 281
+            1183:          60 ExtInst 1(GLSL.std.450) 43(FClamp) 1180 1181 1182
+                              Store 1179(r036) 1183
+            1185:          60 Load 63(inF0)
+            1186:          60 ExtInst 1(GLSL.std.450) 6(FSign) 1185
+                              Store 1184(r037) 1186
+            1188:          60 Load 63(inF0)
+            1189:          60 ExtInst 1(GLSL.std.450) 13(Sin) 1188
+                              Store 1187(r038) 1189
+            1190:          60 Load 63(inF0)
+            1191:          60 ExtInst 1(GLSL.std.450) 13(Sin) 1190
+                              Store 64(inF1) 1191
+            1192:          60 Load 63(inF0)
+            1193:          60 ExtInst 1(GLSL.std.450) 14(Cos) 1192
+                              Store 65(inF2) 1193
+            1195:          60 Load 63(inF0)
+            1196:          60 ExtInst 1(GLSL.std.450) 19(Sinh) 1195
+                              Store 1194(r039) 1196
+            1198:          60 Load 63(inF0)
+            1199:          60 Load 64(inF1)
+            1200:          60 Load 65(inF2)
+            1201:          60 ExtInst 1(GLSL.std.450) 49(SmoothStep) 1198 1199 1200
+                              Store 1197(r049) 1201
+            1203:          60 Load 63(inF0)
+            1204:          60 ExtInst 1(GLSL.std.450) 31(Sqrt) 1203
+                              Store 1202(r041) 1204
+            1206:          60 Load 63(inF0)
+            1207:          60 Load 64(inF1)
+            1208:          60 ExtInst 1(GLSL.std.450) 48(Step) 1206 1207
+                              Store 1205(r042) 1208
             1210:          60 Load 63(inF0)
-            1211:   24(fvec2) CompositeConstruct 179 179
-            1212:   24(fvec2) CompositeConstruct 287 287
-            1213:          60 ExtInst 1(GLSL.std.450) 43(FClamp) 1210 1211 1212
-                              Store 1209(r036) 1213
+            1211:          60 ExtInst 1(GLSL.std.450) 15(Tan) 1210
+                              Store 1209(r043) 1211
+            1213:          60 Load 63(inF0)
+            1214:          60 ExtInst 1(GLSL.std.450) 21(Tanh) 1213
+                              Store 1212(r044) 1214
             1215:          60 Load 63(inF0)
-            1216:          60 ExtInst 1(GLSL.std.450) 6(FSign) 1215
-                              Store 1214(r037) 1216
+            1216:          60 Transpose 1215
             1218:          60 Load 63(inF0)
-            1219:          60 ExtInst 1(GLSL.std.450) 13(Sin) 1218
-                              Store 1217(r038) 1219
-            1220:          60 Load 63(inF0)
-            1221:          60 ExtInst 1(GLSL.std.450) 13(Sin) 1220
-                              Store 64(inF1) 1221
-            1222:          60 Load 63(inF0)
-            1223:          60 ExtInst 1(GLSL.std.450) 14(Cos) 1222
-                              Store 65(inF2) 1223
-            1225:          60 Load 63(inF0)
-            1226:          60 ExtInst 1(GLSL.std.450) 19(Sinh) 1225
-                              Store 1224(r039) 1226
-            1228:          60 Load 63(inF0)
-            1229:          60 Load 64(inF1)
-            1230:          60 Load 65(inF2)
-            1231:          60 ExtInst 1(GLSL.std.450) 49(SmoothStep) 1228 1229 1230
-                              Store 1227(r049) 1231
-            1233:          60 Load 63(inF0)
-            1234:          60 ExtInst 1(GLSL.std.450) 31(Sqrt) 1233
-                              Store 1232(r041) 1234
-            1236:          60 Load 63(inF0)
-            1237:          60 Load 64(inF1)
-            1238:          60 ExtInst 1(GLSL.std.450) 48(Step) 1236 1237
-                              Store 1235(r042) 1238
-            1240:          60 Load 63(inF0)
-            1241:          60 ExtInst 1(GLSL.std.450) 15(Tan) 1240
-                              Store 1239(r043) 1241
-            1243:          60 Load 63(inF0)
-            1244:          60 ExtInst 1(GLSL.std.450) 21(Tanh) 1243
-                              Store 1242(r044) 1244
-            1245:          60 Load 63(inF0)
-            1246:          60 Transpose 1245
-            1248:          60 Load 63(inF0)
-            1249:          60 ExtInst 1(GLSL.std.450) 3(Trunc) 1248
-                              Store 1247(r046) 1249
-                              ReturnValue 1251
+            1219:          60 ExtInst 1(GLSL.std.450) 3(Trunc) 1218
+                              Store 1217(r046) 1219
+                              ReturnValue 1221
                               FunctionEnd
 74(PixelShaderFunction3x3(mf33;mf33;mf33;):          68 Function None 70
         71(inF0):     69(ptr) FunctionParameter
         72(inF1):     69(ptr) FunctionParameter
         73(inF2):     69(ptr) FunctionParameter
               75:             Label
-      1254(r000):    136(ptr) Variable Function
-      1257(r001):     69(ptr) Variable Function
-      1262(r003):    136(ptr) Variable Function
-      1265(r004):     69(ptr) Variable Function
-      1268(r005):     69(ptr) Variable Function
-      1271(r006):     69(ptr) Variable Function
-      1275(r007):     69(ptr) Variable Function
-      1286(r008):     69(ptr) Variable Function
-      1291(r009):     69(ptr) Variable Function
-      1294(r010):     69(ptr) Variable Function
-      1297(r011):     69(ptr) Variable Function
-      1300(r012):     69(ptr) Variable Function
-      1303(r013):     69(ptr) Variable Function
-      1306(r014):     69(ptr) Variable Function
-      1309(r015):     69(ptr) Variable Function
-      1312(r016):     69(ptr) Variable Function
-      1315(r017):     69(ptr) Variable Function
-      1318(r018):      7(ptr) Variable Function
-      1321(r019):     69(ptr) Variable Function
-      1324(R020):     69(ptr) Variable Function
-      1327(r021):     69(ptr) Variable Function
-      1330(r022):     69(ptr) Variable Function
-      1343(r023):     69(ptr) Variable Function
-      1346(r024):     69(ptr) Variable Function
-      1352(r025):     69(ptr) Variable Function
-      1355(r026):     69(ptr) Variable Function
-     1359(r026a):     69(ptr) Variable Function
-      1364(r027):     69(ptr) Variable Function
-      1367(r028):     69(ptr) Variable Function
-      1371(r029):     69(ptr) Variable Function
-      1374(r030):     69(ptr) Variable Function
-      1378(r031):     69(ptr) Variable Function
-      1382(r032):     69(ptr) Variable Function
-      1386(r033):     69(ptr) Variable Function
-      1389(r034):     69(ptr) Variable Function
-      1392(r035):     69(ptr) Variable Function
-      1395(r036):     69(ptr) Variable Function
-      1400(r037):     69(ptr) Variable Function
-      1403(r038):     69(ptr) Variable Function
-      1410(r039):     69(ptr) Variable Function
-      1413(r049):     69(ptr) Variable Function
-      1418(r041):     69(ptr) Variable Function
-      1421(r042):     69(ptr) Variable Function
-      1425(r043):     69(ptr) Variable Function
-      1428(r044):     69(ptr) Variable Function
-      1433(r046):     69(ptr) Variable Function
-            1255:          68 Load 71(inF0)
-            1256:   135(bool) All 1255
-                              Store 1254(r000) 1256
-            1258:          68 Load 71(inF0)
-            1259:          68 ExtInst 1(GLSL.std.450) 4(FAbs) 1258
-                              Store 1257(r001) 1259
-            1260:          68 Load 71(inF0)
-            1261:          68 ExtInst 1(GLSL.std.450) 17(Acos) 1260
-            1263:          68 Load 71(inF0)
-            1264:   135(bool) Any 1263
-                              Store 1262(r003) 1264
-            1266:          68 Load 71(inF0)
-            1267:          68 ExtInst 1(GLSL.std.450) 16(Asin) 1266
-                              Store 1265(r004) 1267
-            1269:          68 Load 71(inF0)
-            1270:          68 ExtInst 1(GLSL.std.450) 18(Atan) 1269
-                              Store 1268(r005) 1270
-            1272:          68 Load 71(inF0)
-            1273:          68 Load 72(inF1)
-            1274:          68 ExtInst 1(GLSL.std.450) 25(Atan2) 1272 1273
-                              Store 1271(r006) 1274
-            1276:          68 Load 71(inF0)
-            1277:          68 ExtInst 1(GLSL.std.450) 9(Ceil) 1276
-                              Store 1275(r007) 1277
-            1278:          68 Load 71(inF0)
-            1281:        1280 FOrdLessThan 1278 1279
-            1282:   135(bool) Any 1281
-                              SelectionMerge 1284 None
-                              BranchConditional 1282 1283 1284
-            1283:               Label
+      1224(r000):    136(ptr) Variable Function
+      1227(r001):     69(ptr) Variable Function
+      1232(r003):    136(ptr) Variable Function
+      1235(r004):     69(ptr) Variable Function
+      1238(r005):     69(ptr) Variable Function
+      1241(r006):     69(ptr) Variable Function
+      1245(r007):     69(ptr) Variable Function
+      1256(r008):     69(ptr) Variable Function
+      1261(r009):     69(ptr) Variable Function
+      1264(r010):     69(ptr) Variable Function
+      1267(r011):     69(ptr) Variable Function
+      1270(r012):     69(ptr) Variable Function
+      1273(r013):     69(ptr) Variable Function
+      1276(r014):     69(ptr) Variable Function
+      1279(r015):     69(ptr) Variable Function
+      1282(r016):     69(ptr) Variable Function
+      1285(r017):     69(ptr) Variable Function
+      1288(r018):      7(ptr) Variable Function
+      1291(r019):     69(ptr) Variable Function
+      1294(R020):     69(ptr) Variable Function
+      1297(r021):     69(ptr) Variable Function
+      1300(r022):     69(ptr) Variable Function
+      1313(r023):     69(ptr) Variable Function
+      1316(r025):     69(ptr) Variable Function
+      1319(r026):     69(ptr) Variable Function
+     1323(r026a):     69(ptr) Variable Function
+      1328(r027):     69(ptr) Variable Function
+      1331(r028):     69(ptr) Variable Function
+      1335(r029):     69(ptr) Variable Function
+      1338(r030):     69(ptr) Variable Function
+      1342(r031):     69(ptr) Variable Function
+      1346(r032):     69(ptr) Variable Function
+      1350(r033):     69(ptr) Variable Function
+      1353(r034):     69(ptr) Variable Function
+      1356(r035):     69(ptr) Variable Function
+      1359(r036):     69(ptr) Variable Function
+      1364(r037):     69(ptr) Variable Function
+      1367(r038):     69(ptr) Variable Function
+      1374(r039):     69(ptr) Variable Function
+      1377(r049):     69(ptr) Variable Function
+      1382(r041):     69(ptr) Variable Function
+      1385(r042):     69(ptr) Variable Function
+      1389(r043):     69(ptr) Variable Function
+      1392(r044):     69(ptr) Variable Function
+      1397(r046):     69(ptr) Variable Function
+            1225:          68 Load 71(inF0)
+            1226:   135(bool) All 1225
+                              Store 1224(r000) 1226
+            1228:          68 Load 71(inF0)
+            1229:          68 ExtInst 1(GLSL.std.450) 4(FAbs) 1228
+                              Store 1227(r001) 1229
+            1230:          68 Load 71(inF0)
+            1231:          68 ExtInst 1(GLSL.std.450) 17(Acos) 1230
+            1233:          68 Load 71(inF0)
+            1234:   135(bool) Any 1233
+                              Store 1232(r003) 1234
+            1236:          68 Load 71(inF0)
+            1237:          68 ExtInst 1(GLSL.std.450) 16(Asin) 1236
+                              Store 1235(r004) 1237
+            1239:          68 Load 71(inF0)
+            1240:          68 ExtInst 1(GLSL.std.450) 18(Atan) 1239
+                              Store 1238(r005) 1240
+            1242:          68 Load 71(inF0)
+            1243:          68 Load 72(inF1)
+            1244:          68 ExtInst 1(GLSL.std.450) 25(Atan2) 1242 1243
+                              Store 1241(r006) 1244
+            1246:          68 Load 71(inF0)
+            1247:          68 ExtInst 1(GLSL.std.450) 9(Ceil) 1246
+                              Store 1245(r007) 1247
+            1248:          68 Load 71(inF0)
+            1251:        1250 FOrdLessThan 1248 1249
+            1252:   135(bool) Any 1251
+                              SelectionMerge 1254 None
+                              BranchConditional 1252 1253 1254
+            1253:               Label
                                 Kill
-            1284:             Label
-            1287:          68 Load 71(inF0)
-            1288:          68 Load 72(inF1)
-            1289:          68 Load 73(inF2)
-            1290:          68 ExtInst 1(GLSL.std.450) 43(FClamp) 1287 1288 1289
-                              Store 1286(r008) 1290
+            1254:             Label
+            1257:          68 Load 71(inF0)
+            1258:          68 Load 72(inF1)
+            1259:          68 Load 73(inF2)
+            1260:          68 ExtInst 1(GLSL.std.450) 43(FClamp) 1257 1258 1259
+                              Store 1256(r008) 1260
+            1262:          68 Load 71(inF0)
+            1263:          68 ExtInst 1(GLSL.std.450) 14(Cos) 1262
+                              Store 1261(r009) 1263
+            1265:          68 Load 71(inF0)
+            1266:          68 ExtInst 1(GLSL.std.450) 20(Cosh) 1265
+                              Store 1264(r010) 1266
+            1268:          68 Load 71(inF0)
+            1269:          68 DPdx 1268
+                              Store 1267(r011) 1269
+            1271:          68 Load 71(inF0)
+            1272:          68 DPdxCoarse 1271
+                              Store 1270(r012) 1272
+            1274:          68 Load 71(inF0)
+            1275:          68 DPdxFine 1274
+                              Store 1273(r013) 1275
+            1277:          68 Load 71(inF0)
+            1278:          68 DPdy 1277
+                              Store 1276(r014) 1278
+            1280:          68 Load 71(inF0)
+            1281:          68 DPdyCoarse 1280
+                              Store 1279(r015) 1281
+            1283:          68 Load 71(inF0)
+            1284:          68 DPdyFine 1283
+                              Store 1282(r016) 1284
+            1286:          68 Load 71(inF0)
+            1287:          68 ExtInst 1(GLSL.std.450) 12(Degrees) 1286
+                              Store 1285(r017) 1287
+            1289:          68 Load 71(inF0)
+            1290:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 1289
+                              Store 1288(r018) 1290
             1292:          68 Load 71(inF0)
-            1293:          68 ExtInst 1(GLSL.std.450) 14(Cos) 1292
-                              Store 1291(r009) 1293
+            1293:          68 ExtInst 1(GLSL.std.450) 27(Exp) 1292
+                              Store 1291(r019) 1293
             1295:          68 Load 71(inF0)
-            1296:          68 ExtInst 1(GLSL.std.450) 20(Cosh) 1295
-                              Store 1294(r010) 1296
+            1296:          68 ExtInst 1(GLSL.std.450) 29(Exp2) 1295
+                              Store 1294(R020) 1296
             1298:          68 Load 71(inF0)
-            1299:          68 DPdx 1298
-                              Store 1297(r011) 1299
+            1299:          68 ExtInst 1(GLSL.std.450) 8(Floor) 1298
+                              Store 1297(r021) 1299
             1301:          68 Load 71(inF0)
-            1302:          68 DPdxCoarse 1301
-                              Store 1300(r012) 1302
-            1304:          68 Load 71(inF0)
-            1305:          68 DPdxFine 1304
-                              Store 1303(r013) 1305
-            1307:          68 Load 71(inF0)
-            1308:          68 DPdy 1307
-                              Store 1306(r014) 1308
-            1310:          68 Load 71(inF0)
-            1311:          68 DPdyCoarse 1310
-                              Store 1309(r015) 1311
-            1313:          68 Load 71(inF0)
-            1314:          68 DPdyFine 1313
-                              Store 1312(r016) 1314
-            1316:          68 Load 71(inF0)
-            1317:          68 ExtInst 1(GLSL.std.450) 12(Degrees) 1316
-                              Store 1315(r017) 1317
-            1319:          68 Load 71(inF0)
-            1320:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 1319
-                              Store 1318(r018) 1320
-            1322:          68 Load 71(inF0)
-            1323:          68 ExtInst 1(GLSL.std.450) 27(Exp) 1322
-                              Store 1321(r019) 1323
-            1325:          68 Load 71(inF0)
-            1326:          68 ExtInst 1(GLSL.std.450) 29(Exp2) 1325
-                              Store 1324(R020) 1326
-            1328:          68 Load 71(inF0)
-            1329:          68 ExtInst 1(GLSL.std.450) 8(Floor) 1328
-                              Store 1327(r021) 1329
-            1331:          68 Load 71(inF0)
-            1332:          68 Load 72(inF1)
-            1333:   36(fvec3) CompositeExtract 1331 0
-            1334:   36(fvec3) CompositeExtract 1332 0
-            1335:   36(fvec3) FMod 1333 1334
-            1336:   36(fvec3) CompositeExtract 1331 1
-            1337:   36(fvec3) CompositeExtract 1332 1
-            1338:   36(fvec3) FMod 1336 1337
-            1339:   36(fvec3) CompositeExtract 1331 2
-            1340:   36(fvec3) CompositeExtract 1332 2
-            1341:   36(fvec3) FMod 1339 1340
-            1342:          68 CompositeConstruct 1335 1338 1341
-                              Store 1330(r022) 1342
-            1344:          68 Load 71(inF0)
-            1345:          68 ExtInst 1(GLSL.std.450) 10(Fract) 1344
-                              Store 1343(r023) 1345
+            1302:          68 Load 72(inF1)
+            1303:   36(fvec3) CompositeExtract 1301 0
+            1304:   36(fvec3) CompositeExtract 1302 0
+            1305:   36(fvec3) FMod 1303 1304
+            1306:   36(fvec3) CompositeExtract 1301 1
+            1307:   36(fvec3) CompositeExtract 1302 1
+            1308:   36(fvec3) FMod 1306 1307
+            1309:   36(fvec3) CompositeExtract 1301 2
+            1310:   36(fvec3) CompositeExtract 1302 2
+            1311:   36(fvec3) FMod 1309 1310
+            1312:          68 CompositeConstruct 1305 1308 1311
+                              Store 1300(r022) 1312
+            1314:          68 Load 71(inF0)
+            1315:          68 ExtInst 1(GLSL.std.450) 10(Fract) 1314
+                              Store 1313(r023) 1315
+            1317:          68 Load 71(inF0)
+            1318:          68 Fwidth 1317
+                              Store 1316(r025) 1318
+            1320:          68 Load 71(inF0)
+            1321:          68 Load 72(inF1)
+            1322:          68 ExtInst 1(GLSL.std.450) 53(Ldexp) 1320 1321
+                              Store 1319(r026) 1322
+            1324:          68 Load 71(inF0)
+            1325:          68 Load 72(inF1)
+            1326:          68 Load 73(inF2)
+            1327:          68 ExtInst 1(GLSL.std.450) 46(FMix) 1324 1325 1326
+                              Store 1323(r026a) 1327
+            1329:          68 Load 71(inF0)
+            1330:          68 ExtInst 1(GLSL.std.450) 28(Log) 1329
+                              Store 1328(r027) 1330
+            1332:          68 Load 71(inF0)
+            1333:          68 ExtInst 1(GLSL.std.450) 30(Log2) 1332
+            1334:          68 MatrixTimesScalar 1333 260
+                              Store 1331(r028) 1334
+            1336:          68 Load 71(inF0)
+            1337:          68 ExtInst 1(GLSL.std.450) 30(Log2) 1336
+                              Store 1335(r029) 1337
+            1339:          68 Load 71(inF0)
+            1340:          68 Load 72(inF1)
+            1341:          68 ExtInst 1(GLSL.std.450) 40(FMax) 1339 1340
+                              Store 1338(r030) 1341
+            1343:          68 Load 71(inF0)
+            1344:          68 Load 72(inF1)
+            1345:          68 ExtInst 1(GLSL.std.450) 37(FMin) 1343 1344
+                              Store 1342(r031) 1345
             1347:          68 Load 71(inF0)
-            1349:1348(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 1347
-            1350:  594(ivec3) CompositeExtract 1349 1
-                              Store 72(inF1) 1350
-            1351:          68 CompositeExtract 1349 0
-                              Store 1346(r024) 1351
-            1353:          68 Load 71(inF0)
-            1354:          68 Fwidth 1353
-                              Store 1352(r025) 1354
-            1356:          68 Load 71(inF0)
-            1357:          68 Load 72(inF1)
-            1358:          68 ExtInst 1(GLSL.std.450) 53(Ldexp) 1356 1357
-                              Store 1355(r026) 1358
+            1348:          68 Load 72(inF1)
+            1349:          68 ExtInst 1(GLSL.std.450) 26(Pow) 1347 1348
+                              Store 1346(r032) 1349
+            1351:          68 Load 71(inF0)
+            1352:          68 ExtInst 1(GLSL.std.450) 11(Radians) 1351
+                              Store 1350(r033) 1352
+            1354:          68 Load 71(inF0)
+            1355:          68 ExtInst 1(GLSL.std.450) 2(RoundEven) 1354
+                              Store 1353(r034) 1355
+            1357:          68 Load 71(inF0)
+            1358:          68 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1357
+                              Store 1356(r035) 1358
             1360:          68 Load 71(inF0)
-            1361:          68 Load 72(inF1)
-            1362:          68 Load 73(inF2)
-            1363:          68 ExtInst 1(GLSL.std.450) 46(FMix) 1360 1361 1362
-                              Store 1359(r026a) 1363
+            1361:   36(fvec3) CompositeConstruct 179 179 179
+            1362:   36(fvec3) CompositeConstruct 281 281 281
+            1363:          68 ExtInst 1(GLSL.std.450) 43(FClamp) 1360 1361 1362
+                              Store 1359(r036) 1363
             1365:          68 Load 71(inF0)
-            1366:          68 ExtInst 1(GLSL.std.450) 28(Log) 1365
-                              Store 1364(r027) 1366
+            1366:          68 ExtInst 1(GLSL.std.450) 6(FSign) 1365
+                              Store 1364(r037) 1366
             1368:          68 Load 71(inF0)
-            1369:          68 ExtInst 1(GLSL.std.450) 30(Log2) 1368
-            1370:          68 MatrixTimesScalar 1369 266
-                              Store 1367(r028) 1370
+            1369:          68 ExtInst 1(GLSL.std.450) 13(Sin) 1368
+                              Store 1367(r038) 1369
+            1370:          68 Load 71(inF0)
+            1371:          68 ExtInst 1(GLSL.std.450) 13(Sin) 1370
+                              Store 72(inF1) 1371
             1372:          68 Load 71(inF0)
-            1373:          68 ExtInst 1(GLSL.std.450) 30(Log2) 1372
-                              Store 1371(r029) 1373
+            1373:          68 ExtInst 1(GLSL.std.450) 14(Cos) 1372
+                              Store 73(inF2) 1373
             1375:          68 Load 71(inF0)
-            1376:          68 Load 72(inF1)
-            1377:          68 ExtInst 1(GLSL.std.450) 40(FMax) 1375 1376
-                              Store 1374(r030) 1377
-            1379:          68 Load 71(inF0)
-            1380:          68 Load 72(inF1)
-            1381:          68 ExtInst 1(GLSL.std.450) 37(FMin) 1379 1380
-                              Store 1378(r031) 1381
+            1376:          68 ExtInst 1(GLSL.std.450) 19(Sinh) 1375
+                              Store 1374(r039) 1376
+            1378:          68 Load 71(inF0)
+            1379:          68 Load 72(inF1)
+            1380:          68 Load 73(inF2)
+            1381:          68 ExtInst 1(GLSL.std.450) 49(SmoothStep) 1378 1379 1380
+                              Store 1377(r049) 1381
             1383:          68 Load 71(inF0)
-            1384:          68 Load 72(inF1)
-            1385:          68 ExtInst 1(GLSL.std.450) 26(Pow) 1383 1384
-                              Store 1382(r032) 1385
-            1387:          68 Load 71(inF0)
-            1388:          68 ExtInst 1(GLSL.std.450) 11(Radians) 1387
-                              Store 1386(r033) 1388
+            1384:          68 ExtInst 1(GLSL.std.450) 31(Sqrt) 1383
+                              Store 1382(r041) 1384
+            1386:          68 Load 71(inF0)
+            1387:          68 Load 72(inF1)
+            1388:          68 ExtInst 1(GLSL.std.450) 48(Step) 1386 1387
+                              Store 1385(r042) 1388
             1390:          68 Load 71(inF0)
-            1391:          68 ExtInst 1(GLSL.std.450) 2(RoundEven) 1390
-                              Store 1389(r034) 1391
+            1391:          68 ExtInst 1(GLSL.std.450) 15(Tan) 1390
+                              Store 1389(r043) 1391
             1393:          68 Load 71(inF0)
-            1394:          68 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1393
-                              Store 1392(r035) 1394
-            1396:          68 Load 71(inF0)
-            1397:   36(fvec3) CompositeConstruct 179 179 179
-            1398:   36(fvec3) CompositeConstruct 287 287 287
-            1399:          68 ExtInst 1(GLSL.std.450) 43(FClamp) 1396 1397 1398
-                              Store 1395(r036) 1399
-            1401:          68 Load 71(inF0)
-            1402:          68 ExtInst 1(GLSL.std.450) 6(FSign) 1401
-                              Store 1400(r037) 1402
-            1404:          68 Load 71(inF0)
-            1405:          68 ExtInst 1(GLSL.std.450) 13(Sin) 1404
-                              Store 1403(r038) 1405
-            1406:          68 Load 71(inF0)
-            1407:          68 ExtInst 1(GLSL.std.450) 13(Sin) 1406
-                              Store 72(inF1) 1407
-            1408:          68 Load 71(inF0)
-            1409:          68 ExtInst 1(GLSL.std.450) 14(Cos) 1408
-                              Store 73(inF2) 1409
-            1411:          68 Load 71(inF0)
-            1412:          68 ExtInst 1(GLSL.std.450) 19(Sinh) 1411
-                              Store 1410(r039) 1412
-            1414:          68 Load 71(inF0)
-            1415:          68 Load 72(inF1)
-            1416:          68 Load 73(inF2)
-            1417:          68 ExtInst 1(GLSL.std.450) 49(SmoothStep) 1414 1415 1416
-                              Store 1413(r049) 1417
-            1419:          68 Load 71(inF0)
-            1420:          68 ExtInst 1(GLSL.std.450) 31(Sqrt) 1419
-                              Store 1418(r041) 1420
-            1422:          68 Load 71(inF0)
-            1423:          68 Load 72(inF1)
-            1424:          68 ExtInst 1(GLSL.std.450) 48(Step) 1422 1423
-                              Store 1421(r042) 1424
-            1426:          68 Load 71(inF0)
-            1427:          68 ExtInst 1(GLSL.std.450) 15(Tan) 1426
-                              Store 1425(r043) 1427
-            1429:          68 Load 71(inF0)
-            1430:          68 ExtInst 1(GLSL.std.450) 21(Tanh) 1429
-                              Store 1428(r044) 1430
-            1431:          68 Load 71(inF0)
-            1432:          68 Transpose 1431
-            1434:          68 Load 71(inF0)
-            1435:          68 ExtInst 1(GLSL.std.450) 3(Trunc) 1434
-                              Store 1433(r046) 1435
-                              ReturnValue 1437
+            1394:          68 ExtInst 1(GLSL.std.450) 21(Tanh) 1393
+                              Store 1392(r044) 1394
+            1395:          68 Load 71(inF0)
+            1396:          68 Transpose 1395
+            1398:          68 Load 71(inF0)
+            1399:          68 ExtInst 1(GLSL.std.450) 3(Trunc) 1398
+                              Store 1397(r046) 1399
+                              ReturnValue 1401
                               FunctionEnd
 82(PixelShaderFunction4x4(mf44;mf44;mf44;):          76 Function None 78
         79(inF0):     77(ptr) FunctionParameter
         80(inF1):     77(ptr) FunctionParameter
         81(inF2):     77(ptr) FunctionParameter
               83:             Label
-      1440(r000):    136(ptr) Variable Function
-      1443(r001):     77(ptr) Variable Function
-      1448(r003):    136(ptr) Variable Function
-      1451(r004):     77(ptr) Variable Function
-      1454(r005):     77(ptr) Variable Function
-      1457(r006):     77(ptr) Variable Function
-      1461(r007):     77(ptr) Variable Function
-      1472(r008):     77(ptr) Variable Function
-      1477(r009):     77(ptr) Variable Function
-      1480(r010):     77(ptr) Variable Function
-      1483(r011):     77(ptr) Variable Function
-      1486(r012):     77(ptr) Variable Function
-      1489(r013):     77(ptr) Variable Function
-      1492(r014):     77(ptr) Variable Function
-      1495(r015):     77(ptr) Variable Function
-      1498(r016):     77(ptr) Variable Function
-      1501(r017):     77(ptr) Variable Function
-      1504(r018):      7(ptr) Variable Function
-      1507(r019):     77(ptr) Variable Function
-      1510(R020):     77(ptr) Variable Function
-      1513(r021):     77(ptr) Variable Function
-      1516(r022):     77(ptr) Variable Function
-      1532(r023):     77(ptr) Variable Function
-      1535(r024):     77(ptr) Variable Function
-      1541(r025):     77(ptr) Variable Function
-      1544(r026):     77(ptr) Variable Function
-     1548(r026a):     77(ptr) Variable Function
-      1553(r027):     77(ptr) Variable Function
-      1556(r028):     77(ptr) Variable Function
-      1560(r029):     77(ptr) Variable Function
-      1563(r030):     77(ptr) Variable Function
-      1567(r031):     77(ptr) Variable Function
-      1571(r032):     77(ptr) Variable Function
-      1575(r033):     77(ptr) Variable Function
-      1578(r034):     77(ptr) Variable Function
-      1581(r035):     77(ptr) Variable Function
-      1584(r036):     77(ptr) Variable Function
-      1589(r037):     77(ptr) Variable Function
-      1592(r038):     77(ptr) Variable Function
-      1599(r039):     77(ptr) Variable Function
-      1602(r049):     77(ptr) Variable Function
-      1607(r041):     77(ptr) Variable Function
-      1610(r042):     77(ptr) Variable Function
-      1614(r043):     77(ptr) Variable Function
-      1617(r044):     77(ptr) Variable Function
-      1622(r046):     77(ptr) Variable Function
-            1441:          76 Load 79(inF0)
-            1442:   135(bool) All 1441
-                              Store 1440(r000) 1442
-            1444:          76 Load 79(inF0)
-            1445:          76 ExtInst 1(GLSL.std.450) 4(FAbs) 1444
-                              Store 1443(r001) 1445
-            1446:          76 Load 79(inF0)
-            1447:          76 ExtInst 1(GLSL.std.450) 17(Acos) 1446
-            1449:          76 Load 79(inF0)
-            1450:   135(bool) Any 1449
-                              Store 1448(r003) 1450
-            1452:          76 Load 79(inF0)
-            1453:          76 ExtInst 1(GLSL.std.450) 16(Asin) 1452
-                              Store 1451(r004) 1453
-            1455:          76 Load 79(inF0)
-            1456:          76 ExtInst 1(GLSL.std.450) 18(Atan) 1455
-                              Store 1454(r005) 1456
-            1458:          76 Load 79(inF0)
-            1459:          76 Load 80(inF1)
-            1460:          76 ExtInst 1(GLSL.std.450) 25(Atan2) 1458 1459
-                              Store 1457(r006) 1460
-            1462:          76 Load 79(inF0)
-            1463:          76 ExtInst 1(GLSL.std.450) 9(Ceil) 1462
-                              Store 1461(r007) 1463
-            1464:          76 Load 79(inF0)
-            1467:        1466 FOrdLessThan 1464 1465
-            1468:   135(bool) Any 1467
-                              SelectionMerge 1470 None
-                              BranchConditional 1468 1469 1470
-            1469:               Label
+      1404(r000):    136(ptr) Variable Function
+      1407(r001):     77(ptr) Variable Function
+      1412(r003):    136(ptr) Variable Function
+      1415(r004):     77(ptr) Variable Function
+      1418(r005):     77(ptr) Variable Function
+      1421(r006):     77(ptr) Variable Function
+      1425(r007):     77(ptr) Variable Function
+      1436(r008):     77(ptr) Variable Function
+      1441(r009):     77(ptr) Variable Function
+      1444(r010):     77(ptr) Variable Function
+      1447(r011):     77(ptr) Variable Function
+      1450(r012):     77(ptr) Variable Function
+      1453(r013):     77(ptr) Variable Function
+      1456(r014):     77(ptr) Variable Function
+      1459(r015):     77(ptr) Variable Function
+      1462(r016):     77(ptr) Variable Function
+      1465(r017):     77(ptr) Variable Function
+      1468(r018):      7(ptr) Variable Function
+      1471(r019):     77(ptr) Variable Function
+      1474(R020):     77(ptr) Variable Function
+      1477(r021):     77(ptr) Variable Function
+      1480(r022):     77(ptr) Variable Function
+      1496(r023):     77(ptr) Variable Function
+      1499(r025):     77(ptr) Variable Function
+      1502(r026):     77(ptr) Variable Function
+     1506(r026a):     77(ptr) Variable Function
+      1511(r027):     77(ptr) Variable Function
+      1514(r028):     77(ptr) Variable Function
+      1518(r029):     77(ptr) Variable Function
+      1521(r030):     77(ptr) Variable Function
+      1525(r031):     77(ptr) Variable Function
+      1529(r032):     77(ptr) Variable Function
+      1533(r033):     77(ptr) Variable Function
+      1536(r034):     77(ptr) Variable Function
+      1539(r035):     77(ptr) Variable Function
+      1542(r036):     77(ptr) Variable Function
+      1547(r037):     77(ptr) Variable Function
+      1550(r038):     77(ptr) Variable Function
+      1557(r039):     77(ptr) Variable Function
+      1560(r049):     77(ptr) Variable Function
+      1565(r041):     77(ptr) Variable Function
+      1568(r042):     77(ptr) Variable Function
+      1572(r043):     77(ptr) Variable Function
+      1575(r044):     77(ptr) Variable Function
+      1580(r046):     77(ptr) Variable Function
+            1405:          76 Load 79(inF0)
+            1406:   135(bool) All 1405
+                              Store 1404(r000) 1406
+            1408:          76 Load 79(inF0)
+            1409:          76 ExtInst 1(GLSL.std.450) 4(FAbs) 1408
+                              Store 1407(r001) 1409
+            1410:          76 Load 79(inF0)
+            1411:          76 ExtInst 1(GLSL.std.450) 17(Acos) 1410
+            1413:          76 Load 79(inF0)
+            1414:   135(bool) Any 1413
+                              Store 1412(r003) 1414
+            1416:          76 Load 79(inF0)
+            1417:          76 ExtInst 1(GLSL.std.450) 16(Asin) 1416
+                              Store 1415(r004) 1417
+            1419:          76 Load 79(inF0)
+            1420:          76 ExtInst 1(GLSL.std.450) 18(Atan) 1419
+                              Store 1418(r005) 1420
+            1422:          76 Load 79(inF0)
+            1423:          76 Load 80(inF1)
+            1424:          76 ExtInst 1(GLSL.std.450) 25(Atan2) 1422 1423
+                              Store 1421(r006) 1424
+            1426:          76 Load 79(inF0)
+            1427:          76 ExtInst 1(GLSL.std.450) 9(Ceil) 1426
+                              Store 1425(r007) 1427
+            1428:          76 Load 79(inF0)
+            1431:        1430 FOrdLessThan 1428 1429
+            1432:   135(bool) Any 1431
+                              SelectionMerge 1434 None
+                              BranchConditional 1432 1433 1434
+            1433:               Label
                                 Kill
-            1470:             Label
-            1473:          76 Load 79(inF0)
-            1474:          76 Load 80(inF1)
-            1475:          76 Load 81(inF2)
-            1476:          76 ExtInst 1(GLSL.std.450) 43(FClamp) 1473 1474 1475
-                              Store 1472(r008) 1476
+            1434:             Label
+            1437:          76 Load 79(inF0)
+            1438:          76 Load 80(inF1)
+            1439:          76 Load 81(inF2)
+            1440:          76 ExtInst 1(GLSL.std.450) 43(FClamp) 1437 1438 1439
+                              Store 1436(r008) 1440
+            1442:          76 Load 79(inF0)
+            1443:          76 ExtInst 1(GLSL.std.450) 14(Cos) 1442
+                              Store 1441(r009) 1443
+            1445:          76 Load 79(inF0)
+            1446:          76 ExtInst 1(GLSL.std.450) 20(Cosh) 1445
+                              Store 1444(r010) 1446
+            1448:          76 Load 79(inF0)
+            1449:          76 DPdx 1448
+                              Store 1447(r011) 1449
+            1451:          76 Load 79(inF0)
+            1452:          76 DPdxCoarse 1451
+                              Store 1450(r012) 1452
+            1454:          76 Load 79(inF0)
+            1455:          76 DPdxFine 1454
+                              Store 1453(r013) 1455
+            1457:          76 Load 79(inF0)
+            1458:          76 DPdy 1457
+                              Store 1456(r014) 1458
+            1460:          76 Load 79(inF0)
+            1461:          76 DPdyCoarse 1460
+                              Store 1459(r015) 1461
+            1463:          76 Load 79(inF0)
+            1464:          76 DPdyFine 1463
+                              Store 1462(r016) 1464
+            1466:          76 Load 79(inF0)
+            1467:          76 ExtInst 1(GLSL.std.450) 12(Degrees) 1466
+                              Store 1465(r017) 1467
+            1469:          76 Load 79(inF0)
+            1470:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 1469
+                              Store 1468(r018) 1470
+            1472:          76 Load 79(inF0)
+            1473:          76 ExtInst 1(GLSL.std.450) 27(Exp) 1472
+                              Store 1471(r019) 1473
+            1475:          76 Load 79(inF0)
+            1476:          76 ExtInst 1(GLSL.std.450) 29(Exp2) 1475
+                              Store 1474(R020) 1476
             1478:          76 Load 79(inF0)
-            1479:          76 ExtInst 1(GLSL.std.450) 14(Cos) 1478
-                              Store 1477(r009) 1479
+            1479:          76 ExtInst 1(GLSL.std.450) 8(Floor) 1478
+                              Store 1477(r021) 1479
             1481:          76 Load 79(inF0)
-            1482:          76 ExtInst 1(GLSL.std.450) 20(Cosh) 1481
-                              Store 1480(r010) 1482
-            1484:          76 Load 79(inF0)
-            1485:          76 DPdx 1484
-                              Store 1483(r011) 1485
-            1487:          76 Load 79(inF0)
-            1488:          76 DPdxCoarse 1487
-                              Store 1486(r012) 1488
-            1490:          76 Load 79(inF0)
-            1491:          76 DPdxFine 1490
-                              Store 1489(r013) 1491
-            1493:          76 Load 79(inF0)
-            1494:          76 DPdy 1493
-                              Store 1492(r014) 1494
-            1496:          76 Load 79(inF0)
-            1497:          76 DPdyCoarse 1496
-                              Store 1495(r015) 1497
-            1499:          76 Load 79(inF0)
-            1500:          76 DPdyFine 1499
-                              Store 1498(r016) 1500
-            1502:          76 Load 79(inF0)
-            1503:          76 ExtInst 1(GLSL.std.450) 12(Degrees) 1502
-                              Store 1501(r017) 1503
-            1505:          76 Load 79(inF0)
-            1506:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 1505
-                              Store 1504(r018) 1506
-            1508:          76 Load 79(inF0)
-            1509:          76 ExtInst 1(GLSL.std.450) 27(Exp) 1508
-                              Store 1507(r019) 1509
-            1511:          76 Load 79(inF0)
-            1512:          76 ExtInst 1(GLSL.std.450) 29(Exp2) 1511
-                              Store 1510(R020) 1512
-            1514:          76 Load 79(inF0)
-            1515:          76 ExtInst 1(GLSL.std.450) 8(Floor) 1514
-                              Store 1513(r021) 1515
-            1517:          76 Load 79(inF0)
-            1518:          76 Load 80(inF1)
-            1519:   48(fvec4) CompositeExtract 1517 0
-            1520:   48(fvec4) CompositeExtract 1518 0
-            1521:   48(fvec4) FMod 1519 1520
-            1522:   48(fvec4) CompositeExtract 1517 1
-            1523:   48(fvec4) CompositeExtract 1518 1
-            1524:   48(fvec4) FMod 1522 1523
-            1525:   48(fvec4) CompositeExtract 1517 2
-            1526:   48(fvec4) CompositeExtract 1518 2
-            1527:   48(fvec4) FMod 1525 1526
-            1528:   48(fvec4) CompositeExtract 1517 3
-            1529:   48(fvec4) CompositeExtract 1518 3
-            1530:   48(fvec4) FMod 1528 1529
-            1531:          76 CompositeConstruct 1521 1524 1527 1530
-                              Store 1516(r022) 1531
-            1533:          76 Load 79(inF0)
-            1534:          76 ExtInst 1(GLSL.std.450) 10(Fract) 1533
-                              Store 1532(r023) 1534
-            1536:          76 Load 79(inF0)
-            1538:1537(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 1536
-            1539:  840(ivec4) CompositeExtract 1538 1
-                              Store 80(inF1) 1539
-            1540:          76 CompositeExtract 1538 0
-                              Store 1535(r024) 1540
-            1542:          76 Load 79(inF0)
-            1543:          76 Fwidth 1542
-                              Store 1541(r025) 1543
-            1545:          76 Load 79(inF0)
-            1546:          76 Load 80(inF1)
-            1547:          76 ExtInst 1(GLSL.std.450) 53(Ldexp) 1545 1546
-                              Store 1544(r026) 1547
-            1549:          76 Load 79(inF0)
-            1550:          76 Load 80(inF1)
-            1551:          76 Load 81(inF2)
-            1552:          76 ExtInst 1(GLSL.std.450) 46(FMix) 1549 1550 1551
-                              Store 1548(r026a) 1552
-            1554:          76 Load 79(inF0)
-            1555:          76 ExtInst 1(GLSL.std.450) 28(Log) 1554
-                              Store 1553(r027) 1555
-            1557:          76 Load 79(inF0)
-            1558:          76 ExtInst 1(GLSL.std.450) 30(Log2) 1557
-            1559:          76 MatrixTimesScalar 1558 266
-                              Store 1556(r028) 1559
+            1482:          76 Load 80(inF1)
+            1483:   48(fvec4) CompositeExtract 1481 0
+            1484:   48(fvec4) CompositeExtract 1482 0
+            1485:   48(fvec4) FMod 1483 1484
+            1486:   48(fvec4) CompositeExtract 1481 1
+            1487:   48(fvec4) CompositeExtract 1482 1
+            1488:   48(fvec4) FMod 1486 1487
+            1489:   48(fvec4) CompositeExtract 1481 2
+            1490:   48(fvec4) CompositeExtract 1482 2
+            1491:   48(fvec4) FMod 1489 1490
+            1492:   48(fvec4) CompositeExtract 1481 3
+            1493:   48(fvec4) CompositeExtract 1482 3
+            1494:   48(fvec4) FMod 1492 1493
+            1495:          76 CompositeConstruct 1485 1488 1491 1494
+                              Store 1480(r022) 1495
+            1497:          76 Load 79(inF0)
+            1498:          76 ExtInst 1(GLSL.std.450) 10(Fract) 1497
+                              Store 1496(r023) 1498
+            1500:          76 Load 79(inF0)
+            1501:          76 Fwidth 1500
+                              Store 1499(r025) 1501
+            1503:          76 Load 79(inF0)
+            1504:          76 Load 80(inF1)
+            1505:          76 ExtInst 1(GLSL.std.450) 53(Ldexp) 1503 1504
+                              Store 1502(r026) 1505
+            1507:          76 Load 79(inF0)
+            1508:          76 Load 80(inF1)
+            1509:          76 Load 81(inF2)
+            1510:          76 ExtInst 1(GLSL.std.450) 46(FMix) 1507 1508 1509
+                              Store 1506(r026a) 1510
+            1512:          76 Load 79(inF0)
+            1513:          76 ExtInst 1(GLSL.std.450) 28(Log) 1512
+                              Store 1511(r027) 1513
+            1515:          76 Load 79(inF0)
+            1516:          76 ExtInst 1(GLSL.std.450) 30(Log2) 1515
+            1517:          76 MatrixTimesScalar 1516 260
+                              Store 1514(r028) 1517
+            1519:          76 Load 79(inF0)
+            1520:          76 ExtInst 1(GLSL.std.450) 30(Log2) 1519
+                              Store 1518(r029) 1520
+            1522:          76 Load 79(inF0)
+            1523:          76 Load 80(inF1)
+            1524:          76 ExtInst 1(GLSL.std.450) 40(FMax) 1522 1523
+                              Store 1521(r030) 1524
+            1526:          76 Load 79(inF0)
+            1527:          76 Load 80(inF1)
+            1528:          76 ExtInst 1(GLSL.std.450) 37(FMin) 1526 1527
+                              Store 1525(r031) 1528
+            1530:          76 Load 79(inF0)
+            1531:          76 Load 80(inF1)
+            1532:          76 ExtInst 1(GLSL.std.450) 26(Pow) 1530 1531
+                              Store 1529(r032) 1532
+            1534:          76 Load 79(inF0)
+            1535:          76 ExtInst 1(GLSL.std.450) 11(Radians) 1534
+                              Store 1533(r033) 1535
+            1537:          76 Load 79(inF0)
+            1538:          76 ExtInst 1(GLSL.std.450) 2(RoundEven) 1537
+                              Store 1536(r034) 1538
+            1540:          76 Load 79(inF0)
+            1541:          76 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1540
+                              Store 1539(r035) 1541
+            1543:          76 Load 79(inF0)
+            1544:   48(fvec4) CompositeConstruct 179 179 179 179
+            1545:   48(fvec4) CompositeConstruct 281 281 281 281
+            1546:          76 ExtInst 1(GLSL.std.450) 43(FClamp) 1543 1544 1545
+                              Store 1542(r036) 1546
+            1548:          76 Load 79(inF0)
+            1549:          76 ExtInst 1(GLSL.std.450) 6(FSign) 1548
+                              Store 1547(r037) 1549
+            1551:          76 Load 79(inF0)
+            1552:          76 ExtInst 1(GLSL.std.450) 13(Sin) 1551
+                              Store 1550(r038) 1552
+            1553:          76 Load 79(inF0)
+            1554:          76 ExtInst 1(GLSL.std.450) 13(Sin) 1553
+                              Store 80(inF1) 1554
+            1555:          76 Load 79(inF0)
+            1556:          76 ExtInst 1(GLSL.std.450) 14(Cos) 1555
+                              Store 81(inF2) 1556
+            1558:          76 Load 79(inF0)
+            1559:          76 ExtInst 1(GLSL.std.450) 19(Sinh) 1558
+                              Store 1557(r039) 1559
             1561:          76 Load 79(inF0)
-            1562:          76 ExtInst 1(GLSL.std.450) 30(Log2) 1561
-                              Store 1560(r029) 1562
-            1564:          76 Load 79(inF0)
-            1565:          76 Load 80(inF1)
-            1566:          76 ExtInst 1(GLSL.std.450) 40(FMax) 1564 1565
-                              Store 1563(r030) 1566
-            1568:          76 Load 79(inF0)
-            1569:          76 Load 80(inF1)
-            1570:          76 ExtInst 1(GLSL.std.450) 37(FMin) 1568 1569
-                              Store 1567(r031) 1570
-            1572:          76 Load 79(inF0)
-            1573:          76 Load 80(inF1)
-            1574:          76 ExtInst 1(GLSL.std.450) 26(Pow) 1572 1573
-                              Store 1571(r032) 1574
+            1562:          76 Load 80(inF1)
+            1563:          76 Load 81(inF2)
+            1564:          76 ExtInst 1(GLSL.std.450) 49(SmoothStep) 1561 1562 1563
+                              Store 1560(r049) 1564
+            1566:          76 Load 79(inF0)
+            1567:          76 ExtInst 1(GLSL.std.450) 31(Sqrt) 1566
+                              Store 1565(r041) 1567
+            1569:          76 Load 79(inF0)
+            1570:          76 Load 80(inF1)
+            1571:          76 ExtInst 1(GLSL.std.450) 48(Step) 1569 1570
+                              Store 1568(r042) 1571
+            1573:          76 Load 79(inF0)
+            1574:          76 ExtInst 1(GLSL.std.450) 15(Tan) 1573
+                              Store 1572(r043) 1574
             1576:          76 Load 79(inF0)
-            1577:          76 ExtInst 1(GLSL.std.450) 11(Radians) 1576
-                              Store 1575(r033) 1577
-            1579:          76 Load 79(inF0)
-            1580:          76 ExtInst 1(GLSL.std.450) 2(RoundEven) 1579
-                              Store 1578(r034) 1580
-            1582:          76 Load 79(inF0)
-            1583:          76 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1582
-                              Store 1581(r035) 1583
-            1585:          76 Load 79(inF0)
-            1586:   48(fvec4) CompositeConstruct 179 179 179 179
-            1587:   48(fvec4) CompositeConstruct 287 287 287 287
-            1588:          76 ExtInst 1(GLSL.std.450) 43(FClamp) 1585 1586 1587
-                              Store 1584(r036) 1588
-            1590:          76 Load 79(inF0)
-            1591:          76 ExtInst 1(GLSL.std.450) 6(FSign) 1590
-                              Store 1589(r037) 1591
-            1593:          76 Load 79(inF0)
-            1594:          76 ExtInst 1(GLSL.std.450) 13(Sin) 1593
-                              Store 1592(r038) 1594
-            1595:          76 Load 79(inF0)
-            1596:          76 ExtInst 1(GLSL.std.450) 13(Sin) 1595
-                              Store 80(inF1) 1596
-            1597:          76 Load 79(inF0)
-            1598:          76 ExtInst 1(GLSL.std.450) 14(Cos) 1597
-                              Store 81(inF2) 1598
-            1600:          76 Load 79(inF0)
-            1601:          76 ExtInst 1(GLSL.std.450) 19(Sinh) 1600
-                              Store 1599(r039) 1601
-            1603:          76 Load 79(inF0)
-            1604:          76 Load 80(inF1)
-            1605:          76 Load 81(inF2)
-            1606:          76 ExtInst 1(GLSL.std.450) 49(SmoothStep) 1603 1604 1605
-                              Store 1602(r049) 1606
-            1608:          76 Load 79(inF0)
-            1609:          76 ExtInst 1(GLSL.std.450) 31(Sqrt) 1608
-                              Store 1607(r041) 1609
-            1611:          76 Load 79(inF0)
-            1612:          76 Load 80(inF1)
-            1613:          76 ExtInst 1(GLSL.std.450) 48(Step) 1611 1612
-                              Store 1610(r042) 1613
-            1615:          76 Load 79(inF0)
-            1616:          76 ExtInst 1(GLSL.std.450) 15(Tan) 1615
-                              Store 1614(r043) 1616
-            1618:          76 Load 79(inF0)
-            1619:          76 ExtInst 1(GLSL.std.450) 21(Tanh) 1618
-                              Store 1617(r044) 1619
-            1620:          76 Load 79(inF0)
-            1621:          76 Transpose 1620
-            1623:          76 Load 79(inF0)
-            1624:          76 ExtInst 1(GLSL.std.450) 3(Trunc) 1623
-                              Store 1622(r046) 1624
-                              ReturnValue 1626
+            1577:          76 ExtInst 1(GLSL.std.450) 21(Tanh) 1576
+                              Store 1575(r044) 1577
+            1578:          76 Load 79(inF0)
+            1579:          76 Transpose 1578
+            1581:          76 Load 79(inF0)
+            1582:          76 ExtInst 1(GLSL.std.450) 3(Trunc) 1581
+                              Store 1580(r046) 1582
+                              ReturnValue 1584
                               FunctionEnd
 91(TestGenMul2(f1;f1;vf2;vf2;mf22;mf22;):           2 Function None 84
         85(inF0):      7(ptr) FunctionParameter
@@ -8187,51 +8034,51 @@ gl_FragCoord origin is upper left
        89(inFM0):     61(ptr) FunctionParameter
        90(inFM1):     61(ptr) FunctionParameter
               92:             Label
-        1629(r0):      7(ptr) Variable Function
-        1633(r1):     25(ptr) Variable Function
-        1637(r2):     25(ptr) Variable Function
-        1641(r3):      7(ptr) Variable Function
-        1645(r4):     25(ptr) Variable Function
-        1649(r5):     25(ptr) Variable Function
-        1653(r6):     61(ptr) Variable Function
-        1657(r7):     61(ptr) Variable Function
-        1661(r8):     61(ptr) Variable Function
-            1630:    6(float) Load 86(inF1)
-            1631:    6(float) Load 85(inF0)
-            1632:    6(float) FMul 1630 1631
-                              Store 1629(r0) 1632
-            1634:    6(float) Load 85(inF0)
-            1635:   24(fvec2) Load 87(inFV0)
-            1636:   24(fvec2) VectorTimesScalar 1635 1634
-                              Store 1633(r1) 1636
-            1638:   24(fvec2) Load 87(inFV0)
-            1639:    6(float) Load 85(inF0)
-            1640:   24(fvec2) VectorTimesScalar 1638 1639
-                              Store 1637(r2) 1640
-            1642:   24(fvec2) Load 87(inFV0)
-            1643:   24(fvec2) Load 88(inFV1)
-            1644:    6(float) Dot 1642 1643
-                              Store 1641(r3) 1644
-            1646:   24(fvec2) Load 87(inFV0)
-            1647:          60 Load 89(inFM0)
-            1648:   24(fvec2) VectorTimesMatrix 1646 1647
-                              Store 1645(r4) 1648
-            1650:          60 Load 89(inFM0)
-            1651:   24(fvec2) Load 87(inFV0)
-            1652:   24(fvec2) MatrixTimesVector 1650 1651
-                              Store 1649(r5) 1652
-            1654:    6(float) Load 85(inF0)
-            1655:          60 Load 89(inFM0)
-            1656:          60 MatrixTimesScalar 1655 1654
-                              Store 1653(r6) 1656
-            1658:          60 Load 89(inFM0)
-            1659:    6(float) Load 85(inF0)
-            1660:          60 MatrixTimesScalar 1658 1659
-                              Store 1657(r7) 1660
-            1662:          60 Load 90(inFM1)
-            1663:          60 Load 89(inFM0)
-            1664:          60 MatrixTimesMatrix 1662 1663
-                              Store 1661(r8) 1664
+        1587(r0):      7(ptr) Variable Function
+        1591(r1):     25(ptr) Variable Function
+        1595(r2):     25(ptr) Variable Function
+        1599(r3):      7(ptr) Variable Function
+        1603(r4):     25(ptr) Variable Function
+        1607(r5):     25(ptr) Variable Function
+        1611(r6):     61(ptr) Variable Function
+        1615(r7):     61(ptr) Variable Function
+        1619(r8):     61(ptr) Variable Function
+            1588:    6(float) Load 86(inF1)
+            1589:    6(float) Load 85(inF0)
+            1590:    6(float) FMul 1588 1589
+                              Store 1587(r0) 1590
+            1592:    6(float) Load 85(inF0)
+            1593:   24(fvec2) Load 87(inFV0)
+            1594:   24(fvec2) VectorTimesScalar 1593 1592
+                              Store 1591(r1) 1594
+            1596:   24(fvec2) Load 87(inFV0)
+            1597:    6(float) Load 85(inF0)
+            1598:   24(fvec2) VectorTimesScalar 1596 1597
+                              Store 1595(r2) 1598
+            1600:   24(fvec2) Load 87(inFV0)
+            1601:   24(fvec2) Load 88(inFV1)
+            1602:    6(float) Dot 1600 1601
+                              Store 1599(r3) 1602
+            1604:   24(fvec2) Load 87(inFV0)
+            1605:          60 Load 89(inFM0)
+            1606:   24(fvec2) VectorTimesMatrix 1604 1605
+                              Store 1603(r4) 1606
+            1608:          60 Load 89(inFM0)
+            1609:   24(fvec2) Load 87(inFV0)
+            1610:   24(fvec2) MatrixTimesVector 1608 1609
+                              Store 1607(r5) 1610
+            1612:    6(float) Load 85(inF0)
+            1613:          60 Load 89(inFM0)
+            1614:          60 MatrixTimesScalar 1613 1612
+                              Store 1611(r6) 1614
+            1616:          60 Load 89(inFM0)
+            1617:    6(float) Load 85(inF0)
+            1618:          60 MatrixTimesScalar 1616 1617
+                              Store 1615(r7) 1618
+            1620:          60 Load 90(inFM1)
+            1621:          60 Load 89(inFM0)
+            1622:          60 MatrixTimesMatrix 1620 1621
+                              Store 1619(r8) 1622
                               Return
                               FunctionEnd
 100(TestGenMul3(f1;f1;vf3;vf3;mf33;mf33;):           2 Function None 93
@@ -8242,51 +8089,51 @@ gl_FragCoord origin is upper left
        98(inFM0):     69(ptr) FunctionParameter
        99(inFM1):     69(ptr) FunctionParameter
              101:             Label
-        1665(r0):      7(ptr) Variable Function
-        1669(r1):     37(ptr) Variable Function
-        1673(r2):     37(ptr) Variable Function
-        1677(r3):      7(ptr) Variable Function
-        1681(r4):     37(ptr) Variable Function
-        1685(r5):     37(ptr) Variable Function
-        1689(r6):     69(ptr) Variable Function
-        1693(r7):     69(ptr) Variable Function
-        1697(r8):     69(ptr) Variable Function
-            1666:    6(float) Load 95(inF1)
-            1667:    6(float) Load 94(inF0)
-            1668:    6(float) FMul 1666 1667
-                              Store 1665(r0) 1668
-            1670:    6(float) Load 94(inF0)
-            1671:   36(fvec3) Load 96(inFV0)
-            1672:   36(fvec3) VectorTimesScalar 1671 1670
-                              Store 1669(r1) 1672
-            1674:   36(fvec3) Load 96(inFV0)
-            1675:    6(float) Load 94(inF0)
-            1676:   36(fvec3) VectorTimesScalar 1674 1675
-                              Store 1673(r2) 1676
-            1678:   36(fvec3) Load 96(inFV0)
-            1679:   36(fvec3) Load 97(inFV1)
-            1680:    6(float) Dot 1678 1679
-                              Store 1677(r3) 1680
-            1682:   36(fvec3) Load 96(inFV0)
-            1683:          68 Load 98(inFM0)
-            1684:   36(fvec3) VectorTimesMatrix 1682 1683
-                              Store 1681(r4) 1684
-            1686:          68 Load 98(inFM0)
-            1687:   36(fvec3) Load 96(inFV0)
-            1688:   36(fvec3) MatrixTimesVector 1686 1687
-                              Store 1685(r5) 1688
-            1690:    6(float) Load 94(inF0)
-            1691:          68 Load 98(inFM0)
-            1692:          68 MatrixTimesScalar 1691 1690
-                              Store 1689(r6) 1692
-            1694:          68 Load 98(inFM0)
-            1695:    6(float) Load 94(inF0)
-            1696:          68 MatrixTimesScalar 1694 1695
-                              Store 1693(r7) 1696
-            1698:          68 Load 99(inFM1)
-            1699:          68 Load 98(inFM0)
-            1700:          68 MatrixTimesMatrix 1698 1699
-                              Store 1697(r8) 1700
+        1623(r0):      7(ptr) Variable Function
+        1627(r1):     37(ptr) Variable Function
+        1631(r2):     37(ptr) Variable Function
+        1635(r3):      7(ptr) Variable Function
+        1639(r4):     37(ptr) Variable Function
+        1643(r5):     37(ptr) Variable Function
+        1647(r6):     69(ptr) Variable Function
+        1651(r7):     69(ptr) Variable Function
+        1655(r8):     69(ptr) Variable Function
+            1624:    6(float) Load 95(inF1)
+            1625:    6(float) Load 94(inF0)
+            1626:    6(float) FMul 1624 1625
+                              Store 1623(r0) 1626
+            1628:    6(float) Load 94(inF0)
+            1629:   36(fvec3) Load 96(inFV0)
+            1630:   36(fvec3) VectorTimesScalar 1629 1628
+                              Store 1627(r1) 1630
+            1632:   36(fvec3) Load 96(inFV0)
+            1633:    6(float) Load 94(inF0)
+            1634:   36(fvec3) VectorTimesScalar 1632 1633
+                              Store 1631(r2) 1634
+            1636:   36(fvec3) Load 96(inFV0)
+            1637:   36(fvec3) Load 97(inFV1)
+            1638:    6(float) Dot 1636 1637
+                              Store 1635(r3) 1638
+            1640:   36(fvec3) Load 96(inFV0)
+            1641:          68 Load 98(inFM0)
+            1642:   36(fvec3) VectorTimesMatrix 1640 1641
+                              Store 1639(r4) 1642
+            1644:          68 Load 98(inFM0)
+            1645:   36(fvec3) Load 96(inFV0)
+            1646:   36(fvec3) MatrixTimesVector 1644 1645
+                              Store 1643(r5) 1646
+            1648:    6(float) Load 94(inF0)
+            1649:          68 Load 98(inFM0)
+            1650:          68 MatrixTimesScalar 1649 1648
+                              Store 1647(r6) 1650
+            1652:          68 Load 98(inFM0)
+            1653:    6(float) Load 94(inF0)
+            1654:          68 MatrixTimesScalar 1652 1653
+                              Store 1651(r7) 1654
+            1656:          68 Load 99(inFM1)
+            1657:          68 Load 98(inFM0)
+            1658:          68 MatrixTimesMatrix 1656 1657
+                              Store 1655(r8) 1658
                               Return
                               FunctionEnd
 109(TestGenMul4(f1;f1;vf4;vf4;mf44;mf44;):           2 Function None 102
@@ -8297,51 +8144,51 @@ gl_FragCoord origin is upper left
       107(inFM0):     77(ptr) FunctionParameter
       108(inFM1):     77(ptr) FunctionParameter
              110:             Label
-        1701(r0):      7(ptr) Variable Function
-        1705(r1):     49(ptr) Variable Function
-        1709(r2):     49(ptr) Variable Function
-        1713(r3):      7(ptr) Variable Function
-        1717(r4):     49(ptr) Variable Function
-        1721(r5):     49(ptr) Variable Function
-        1725(r6):     77(ptr) Variable Function
-        1729(r7):     77(ptr) Variable Function
-        1733(r8):     77(ptr) Variable Function
-            1702:    6(float) Load 104(inF1)
-            1703:    6(float) Load 103(inF0)
-            1704:    6(float) FMul 1702 1703
-                              Store 1701(r0) 1704
-            1706:    6(float) Load 103(inF0)
-            1707:   48(fvec4) Load 105(inFV0)
-            1708:   48(fvec4) VectorTimesScalar 1707 1706
-                              Store 1705(r1) 1708
-            1710:   48(fvec4) Load 105(inFV0)
-            1711:    6(float) Load 103(inF0)
-            1712:   48(fvec4) VectorTimesScalar 1710 1711
-                              Store 1709(r2) 1712
-            1714:   48(fvec4) Load 105(inFV0)
-            1715:   48(fvec4) Load 106(inFV1)
-            1716:    6(float) Dot 1714 1715
-                              Store 1713(r3) 1716
-            1718:   48(fvec4) Load 105(inFV0)
-            1719:          76 Load 107(inFM0)
-            1720:   48(fvec4) VectorTimesMatrix 1718 1719
-                              Store 1717(r4) 1720
-            1722:          76 Load 107(inFM0)
-            1723:   48(fvec4) Load 105(inFV0)
-            1724:   48(fvec4) MatrixTimesVector 1722 1723
-                              Store 1721(r5) 1724
-            1726:    6(float) Load 103(inF0)
-            1727:          76 Load 107(inFM0)
-            1728:          76 MatrixTimesScalar 1727 1726
-                              Store 1725(r6) 1728
-            1730:          76 Load 107(inFM0)
-            1731:    6(float) Load 103(inF0)
-            1732:          76 MatrixTimesScalar 1730 1731
-                              Store 1729(r7) 1732
-            1734:          76 Load 108(inFM1)
-            1735:          76 Load 107(inFM0)
-            1736:          76 MatrixTimesMatrix 1734 1735
-                              Store 1733(r8) 1736
+        1659(r0):      7(ptr) Variable Function
+        1663(r1):     49(ptr) Variable Function
+        1667(r2):     49(ptr) Variable Function
+        1671(r3):      7(ptr) Variable Function
+        1675(r4):     49(ptr) Variable Function
+        1679(r5):     49(ptr) Variable Function
+        1683(r6):     77(ptr) Variable Function
+        1687(r7):     77(ptr) Variable Function
+        1691(r8):     77(ptr) Variable Function
+            1660:    6(float) Load 104(inF1)
+            1661:    6(float) Load 103(inF0)
+            1662:    6(float) FMul 1660 1661
+                              Store 1659(r0) 1662
+            1664:    6(float) Load 103(inF0)
+            1665:   48(fvec4) Load 105(inFV0)
+            1666:   48(fvec4) VectorTimesScalar 1665 1664
+                              Store 1663(r1) 1666
+            1668:   48(fvec4) Load 105(inFV0)
+            1669:    6(float) Load 103(inF0)
+            1670:   48(fvec4) VectorTimesScalar 1668 1669
+                              Store 1667(r2) 1670
+            1672:   48(fvec4) Load 105(inFV0)
+            1673:   48(fvec4) Load 106(inFV1)
+            1674:    6(float) Dot 1672 1673
+                              Store 1671(r3) 1674
+            1676:   48(fvec4) Load 105(inFV0)
+            1677:          76 Load 107(inFM0)
+            1678:   48(fvec4) VectorTimesMatrix 1676 1677
+                              Store 1675(r4) 1678
+            1680:          76 Load 107(inFM0)
+            1681:   48(fvec4) Load 105(inFV0)
+            1682:   48(fvec4) MatrixTimesVector 1680 1681
+                              Store 1679(r5) 1682
+            1684:    6(float) Load 103(inF0)
+            1685:          76 Load 107(inFM0)
+            1686:          76 MatrixTimesScalar 1685 1684
+                              Store 1683(r6) 1686
+            1688:          76 Load 107(inFM0)
+            1689:    6(float) Load 103(inF0)
+            1690:          76 MatrixTimesScalar 1688 1689
+                              Store 1687(r7) 1690
+            1692:          76 Load 108(inFM1)
+            1693:          76 Load 107(inFM0)
+            1694:          76 MatrixTimesMatrix 1692 1693
+                              Store 1691(r8) 1694
                               Return
                               FunctionEnd
 129(TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24;):           2 Function None 119
@@ -8355,98 +8202,98 @@ gl_FragCoord origin is upper left
     127(inFM3x4):    116(ptr) FunctionParameter
     128(inFM2x4):    118(ptr) FunctionParameter
              130:             Label
-       1737(r00):      7(ptr) Variable Function
-       1741(r01):     25(ptr) Variable Function
-       1745(r02):     37(ptr) Variable Function
-       1749(r03):     25(ptr) Variable Function
-       1753(r04):     37(ptr) Variable Function
-       1757(r05):      7(ptr) Variable Function
-       1761(r06):      7(ptr) Variable Function
-       1765(r07):     37(ptr) Variable Function
-       1769(r08):     25(ptr) Variable Function
-       1773(r09):     25(ptr) Variable Function
-       1777(r10):     37(ptr) Variable Function
-       1781(r11):    112(ptr) Variable Function
-       1785(r12):    114(ptr) Variable Function
-       1789(r13):     61(ptr) Variable Function
-       1793(r14):    112(ptr) Variable Function
-       1797(r15):    118(ptr) Variable Function
-       1801(r16):    116(ptr) Variable Function
-            1738:    6(float) Load 121(inF1)
-            1739:    6(float) Load 120(inF0)
-            1740:    6(float) FMul 1738 1739
-                              Store 1737(r00) 1740
-            1742:    6(float) Load 120(inF0)
-            1743:   24(fvec2) Load 122(inFV2)
-            1744:   24(fvec2) VectorTimesScalar 1743 1742
-                              Store 1741(r01) 1744
-            1746:    6(float) Load 120(inF0)
-            1747:   36(fvec3) Load 123(inFV3)
-            1748:   36(fvec3) VectorTimesScalar 1747 1746
-                              Store 1745(r02) 1748
-            1750:   24(fvec2) Load 122(inFV2)
-            1751:    6(float) Load 120(inF0)
-            1752:   24(fvec2) VectorTimesScalar 1750 1751
-                              Store 1749(r03) 1752
-            1754:   36(fvec3) Load 123(inFV3)
-            1755:    6(float) Load 120(inF0)
-            1756:   36(fvec3) VectorTimesScalar 1754 1755
-                              Store 1753(r04) 1756
-            1758:   24(fvec2) Load 122(inFV2)
-            1759:   24(fvec2) Load 122(inFV2)
-            1760:    6(float) Dot 1758 1759
-                              Store 1757(r05) 1760
-            1762:   36(fvec3) Load 123(inFV3)
-            1763:   36(fvec3) Load 123(inFV3)
-            1764:    6(float) Dot 1762 1763
-                              Store 1761(r06) 1764
-            1766:         111 Load 124(inFM2x3)
-            1767:   24(fvec2) Load 122(inFV2)
-            1768:   36(fvec3) MatrixTimesVector 1766 1767
-                              Store 1765(r07) 1768
-            1770:         113 Load 125(inFM3x2)
-            1771:   36(fvec3) Load 123(inFV3)
-            1772:   24(fvec2) MatrixTimesVector 1770 1771
-                              Store 1769(r08) 1772
-            1774:   36(fvec3) Load 123(inFV3)
-            1775:         111 Load 124(inFM2x3)
-            1776:   24(fvec2) VectorTimesMatrix 1774 1775
-                              Store 1773(r09) 1776
-            1778:   24(fvec2) Load 122(inFV2)
-            1779:         113 Load 125(inFM3x2)
-            1780:   36(fvec3) VectorTimesMatrix 1778 1779
-                              Store 1777(r10) 1780
-            1782:    6(float) Load 120(inF0)
-            1783:         111 Load 124(inFM2x3)
-            1784:         111 MatrixTimesScalar 1783 1782
-                              Store 1781(r11) 1784
-            1786:    6(float) Load 120(inF0)
-            1787:         113 Load 125(inFM3x2)
-            1788:         113 MatrixTimesScalar 1787 1786
-                              Store 1785(r12) 1788
-            1790:         113 Load 125(inFM3x2)
-            1791:         111 Load 124(inFM2x3)
-            1792:          60 MatrixTimesMatrix 1790 1791
-                              Store 1789(r13) 1792
-            1794:          68 Load 126(inFM3x3)
-            1795:         111 Load 124(inFM2x3)
-            1796:         111 MatrixTimesMatrix 1794 1795
-                              Store 1793(r14) 1796
-            1798:         115 Load 127(inFM3x4)
-            1799:         111 Load 124(inFM2x3)
-            1800:         117 MatrixTimesMatrix 1798 1799
-                              Store 1797(r15) 1800
-            1802:         117 Load 128(inFM2x4)
-            1803:         113 Load 125(inFM3x2)
-            1804:         115 MatrixTimesMatrix 1802 1803
-                              Store 1801(r16) 1804
+       1695(r00):      7(ptr) Variable Function
+       1699(r01):     25(ptr) Variable Function
+       1703(r02):     37(ptr) Variable Function
+       1707(r03):     25(ptr) Variable Function
+       1711(r04):     37(ptr) Variable Function
+       1715(r05):      7(ptr) Variable Function
+       1719(r06):      7(ptr) Variable Function
+       1723(r07):     37(ptr) Variable Function
+       1727(r08):     25(ptr) Variable Function
+       1731(r09):     25(ptr) Variable Function
+       1735(r10):     37(ptr) Variable Function
+       1739(r11):    112(ptr) Variable Function
+       1743(r12):    114(ptr) Variable Function
+       1747(r13):     61(ptr) Variable Function
+       1751(r14):    112(ptr) Variable Function
+       1755(r15):    118(ptr) Variable Function
+       1759(r16):    116(ptr) Variable Function
+            1696:    6(float) Load 121(inF1)
+            1697:    6(float) Load 120(inF0)
+            1698:    6(float) FMul 1696 1697
+                              Store 1695(r00) 1698
+            1700:    6(float) Load 120(inF0)
+            1701:   24(fvec2) Load 122(inFV2)
+            1702:   24(fvec2) VectorTimesScalar 1701 1700
+                              Store 1699(r01) 1702
+            1704:    6(float) Load 120(inF0)
+            1705:   36(fvec3) Load 123(inFV3)
+            1706:   36(fvec3) VectorTimesScalar 1705 1704
+                              Store 1703(r02) 1706
+            1708:   24(fvec2) Load 122(inFV2)
+            1709:    6(float) Load 120(inF0)
+            1710:   24(fvec2) VectorTimesScalar 1708 1709
+                              Store 1707(r03) 1710
+            1712:   36(fvec3) Load 123(inFV3)
+            1713:    6(float) Load 120(inF0)
+            1714:   36(fvec3) VectorTimesScalar 1712 1713
+                              Store 1711(r04) 1714
+            1716:   24(fvec2) Load 122(inFV2)
+            1717:   24(fvec2) Load 122(inFV2)
+            1718:    6(float) Dot 1716 1717
+                              Store 1715(r05) 1718
+            1720:   36(fvec3) Load 123(inFV3)
+            1721:   36(fvec3) Load 123(inFV3)
+            1722:    6(float) Dot 1720 1721
+                              Store 1719(r06) 1722
+            1724:         111 Load 124(inFM2x3)
+            1725:   24(fvec2) Load 122(inFV2)
+            1726:   36(fvec3) MatrixTimesVector 1724 1725
+                              Store 1723(r07) 1726
+            1728:         113 Load 125(inFM3x2)
+            1729:   36(fvec3) Load 123(inFV3)
+            1730:   24(fvec2) MatrixTimesVector 1728 1729
+                              Store 1727(r08) 1730
+            1732:   36(fvec3) Load 123(inFV3)
+            1733:         111 Load 124(inFM2x3)
+            1734:   24(fvec2) VectorTimesMatrix 1732 1733
+                              Store 1731(r09) 1734
+            1736:   24(fvec2) Load 122(inFV2)
+            1737:         113 Load 125(inFM3x2)
+            1738:   36(fvec3) VectorTimesMatrix 1736 1737
+                              Store 1735(r10) 1738
+            1740:    6(float) Load 120(inF0)
+            1741:         111 Load 124(inFM2x3)
+            1742:         111 MatrixTimesScalar 1741 1740
+                              Store 1739(r11) 1742
+            1744:    6(float) Load 120(inF0)
+            1745:         113 Load 125(inFM3x2)
+            1746:         113 MatrixTimesScalar 1745 1744
+                              Store 1743(r12) 1746
+            1748:         113 Load 125(inFM3x2)
+            1749:         111 Load 124(inFM2x3)
+            1750:          60 MatrixTimesMatrix 1748 1749
+                              Store 1747(r13) 1750
+            1752:          68 Load 126(inFM3x3)
+            1753:         111 Load 124(inFM2x3)
+            1754:         111 MatrixTimesMatrix 1752 1753
+                              Store 1751(r14) 1754
+            1756:         115 Load 127(inFM3x4)
+            1757:         111 Load 124(inFM2x3)
+            1758:         117 MatrixTimesMatrix 1756 1757
+                              Store 1755(r15) 1758
+            1760:         117 Load 128(inFM2x4)
+            1761:         113 Load 125(inFM3x2)
+            1762:         115 MatrixTimesMatrix 1760 1761
+                              Store 1759(r16) 1762
                               Return
                               FunctionEnd
      133(@main():131(PS_OUTPUT) Function None 132
              134:             Label
- 1806(ps_output):   1805(ptr) Variable Function
-            1809:     49(ptr) AccessChain 1806(ps_output) 1807
-                              Store 1809 1808
-            1810:131(PS_OUTPUT) Load 1806(ps_output)
-                              ReturnValue 1810
+ 1764(ps_output):   1763(ptr) Variable Function
+            1767:     49(ptr) AccessChain 1764(ps_output) 1765
+                              Store 1767 1766
+            1768:131(PS_OUTPUT) Load 1764(ps_output)
+                              ReturnValue 1768
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.lit.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.lit.frag.out
index affed16..61024f4 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.lit.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.lit.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.lit.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(f1;f1;f1; ( temp void)
@@ -60,7 +60,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(f1;f1;f1; ( temp void)
@@ -126,6 +126,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 37 40 43
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 12  "@PixelShaderFunction(f1;f1;f1;"
                               Name 9  "n_dot_l"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.comp.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.comp.out
index 2c8b915..176cf23 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.comp.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.comp.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.negative.comp
-Shader version: 450
+Shader version: 500
 local_size = (1, 1, 1)
 0:? Sequence
 0:2  Function Definition: ComputeShaderFunctionS(f1;f1;f1;i1; ( temp float)
@@ -91,7 +91,7 @@ local_size = (1, 1, 1)
 Linked compute stage:
 
 
-Shader version: 450
+Shader version: 500
 local_size = (1, 1, 1)
 0:? Sequence
 0:2  Function Definition: ComputeShaderFunctionS(f1;f1;f1;i1; ( temp float)
@@ -188,6 +188,7 @@ local_size = (1, 1, 1)
                               MemoryModel Logical GLSL450
                               EntryPoint GLCompute 4  "ComputeShaderFunction" 76 79 82 86 89
                               ExecutionMode 4 LocalSize 1 1 1
+                              Source HLSL 500
                               Name 4  "ComputeShaderFunction"
                               Name 15  "ComputeShaderFunctionS(f1;f1;f1;i1;"
                               Name 11  "inF0"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.frag.out
index 8f770c8..591ad13 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.frag.out
@@ -1,6 +1,5 @@
 hlsl.intrinsics.negative.frag
 ERROR: 0:10: 'determinant' : no matching overloaded function found 
-ERROR: 0:23: 'length' : ambiguous best function under implicit type conversion 
 ERROR: 0:25: 'normalize' : ambiguous best function under implicit type conversion 
 ERROR: 0:26: 'reflect' : ambiguous best function under implicit type conversion 
 ERROR: 0:27: 'refract' : ambiguous best function under implicit type conversion 
@@ -59,10 +58,10 @@ ERROR: 0:133: 'normalize' : no matching overloaded function found
 ERROR: 0:133: 'reflect' : no matching overloaded function found 
 ERROR: 0:133: 'refract' : no matching overloaded function found 
 ERROR: 0:133: 'reversebits' : no matching overloaded function found 
-ERROR: 60 compilation errors.  No code generated.
+ERROR: 59 compilation errors.  No code generated.
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 ERROR: node is still EOpNull!
 0:2  Function Definition: PixelShaderFunctionS(f1;f1;f1;i1; ( temp float)
@@ -120,8 +119,7 @@ ERROR: node is still EOpNull!
 0:14        Convert float to uint ( temp uint)
 0:14          'inF0' ( in float)
 0:23      length ( temp float)
-0:23        Construct vec2 ( in 2-component vector of float)
-0:23          'inF0' ( in float)
+0:23        'inF0' ( in float)
 0:24      Function Call: msad4(u1;vu2;vu4; ( temp 4-component vector of uint)
 0:24        Convert float to uint ( temp uint)
 0:24          'inF0' ( in float)
@@ -524,7 +522,7 @@ ERROR: node is still EOpNull!
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 ERROR: node is still EOpNull!
 0:2  Function Definition: PixelShaderFunctionS(f1;f1;f1;i1; ( temp float)
@@ -582,8 +580,7 @@ ERROR: node is still EOpNull!
 0:14        Convert float to uint ( temp uint)
 0:14          'inF0' ( in float)
 0:23      length ( temp float)
-0:23        Construct vec2 ( in 2-component vector of float)
-0:23          'inF0' ( in float)
+0:23        'inF0' ( in float)
 0:24      Function Call: msad4(u1;vu2;vu4; ( temp 4-component vector of uint)
 0:24        Convert float to uint ( temp uint)
 0:24          'inF0' ( in float)
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.vert.out
index 9808b1c..aabcda8 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.negative.vert.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.negative.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:15  Function Definition: VertexShaderFunctionS(f1;f1;f1;i1; ( temp float)
 0:15    Function Parameters: 
@@ -155,7 +155,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:15  Function Definition: VertexShaderFunctionS(f1;f1;f1;i1; ( temp float)
 0:15    Function Parameters: 
@@ -315,6 +315,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "VertexShaderFunction" 100 103 106 110 113
+                              Source HLSL 500
                               Name 4  "VertexShaderFunction"
                               Name 15  "VertexShaderFunctionS(f1;f1;f1;i1;"
                               Name 11  "inF0"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.down.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.down.frag.out
index 4f47d86..d67b2e8 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.down.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.down.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.promote.down.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:15  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -53,7 +53,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:15  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -112,6 +112,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 47
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.frag.out
index 7fec7a2..b62d30d 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.promote.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -341,8 +341,8 @@ gl_FragCoord origin is upper left
 0:51        move second child to first child ( temp float)
 0:51          'r50' ( temp float)
 0:51          Construct float ( temp float)
-0:?             imageLoad ( temp 4-component vector of float)
-0:51              'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?             textureFetch ( temp 4-component vector of float)
+0:51              'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:51              Convert uint to int ( temp int)
 0:51                upos: direct index for structure ( uniform uint)
 0:51                  'anon@0' (layout( row_major std140) uniform block{ uniform int i,  uniform uint u,  uniform float f,  uniform bool b,  uniform 2-component vector of int i2,  uniform 2-component vector of uint u2,  uniform 2-component vector of float f2,  uniform 2-component vector of bool b2,  uniform uint upos,  uniform float fpos})
@@ -352,8 +352,8 @@ gl_FragCoord origin is upper left
 0:52        move second child to first child ( temp float)
 0:52          'r51' ( temp float)
 0:52          Construct float ( temp float)
-0:?             imageLoad ( temp 4-component vector of float)
-0:52              'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?             textureFetch ( temp 4-component vector of float)
+0:52              'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:52              Convert float to int ( temp int)
 0:52                fpos: direct index for structure ( uniform float)
 0:52                  'anon@0' (layout( row_major std140) uniform block{ uniform int i,  uniform uint u,  uniform float f,  uniform bool b,  uniform 2-component vector of int i2,  uniform 2-component vector of uint u2,  uniform 2-component vector of float f2,  uniform 2-component vector of bool b2,  uniform uint upos,  uniform float fpos})
@@ -437,7 +437,7 @@ gl_FragCoord origin is upper left
 0:20              0 (const int)
 0:?   Linker Objects
 0:?     'anon@0' (layout( row_major std140) uniform block{ uniform int i,  uniform uint u,  uniform float f,  uniform bool b,  uniform 2-component vector of int i2,  uniform 2-component vector of uint u2,  uniform 2-component vector of float f2,  uniform 2-component vector of bool b2,  uniform uint upos,  uniform float fpos})
-0:?     'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?     'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:?     'g_tTex1df4' ( uniform texture1D)
 0:?     'color' (layout( location=0) out 4-component vector of float)
 
@@ -445,7 +445,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -787,8 +787,8 @@ gl_FragCoord origin is upper left
 0:51        move second child to first child ( temp float)
 0:51          'r50' ( temp float)
 0:51          Construct float ( temp float)
-0:?             imageLoad ( temp 4-component vector of float)
-0:51              'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?             textureFetch ( temp 4-component vector of float)
+0:51              'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:51              Convert uint to int ( temp int)
 0:51                upos: direct index for structure ( uniform uint)
 0:51                  'anon@0' (layout( row_major std140) uniform block{ uniform int i,  uniform uint u,  uniform float f,  uniform bool b,  uniform 2-component vector of int i2,  uniform 2-component vector of uint u2,  uniform 2-component vector of float f2,  uniform 2-component vector of bool b2,  uniform uint upos,  uniform float fpos})
@@ -798,8 +798,8 @@ gl_FragCoord origin is upper left
 0:52        move second child to first child ( temp float)
 0:52          'r51' ( temp float)
 0:52          Construct float ( temp float)
-0:?             imageLoad ( temp 4-component vector of float)
-0:52              'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?             textureFetch ( temp 4-component vector of float)
+0:52              'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:52              Convert float to int ( temp int)
 0:52                fpos: direct index for structure ( uniform float)
 0:52                  'anon@0' (layout( row_major std140) uniform block{ uniform int i,  uniform uint u,  uniform float f,  uniform bool b,  uniform 2-component vector of int i2,  uniform 2-component vector of uint u2,  uniform 2-component vector of float f2,  uniform 2-component vector of bool b2,  uniform uint upos,  uniform float fpos})
@@ -883,7 +883,7 @@ gl_FragCoord origin is upper left
 0:20              0 (const int)
 0:?   Linker Objects
 0:?     'anon@0' (layout( row_major std140) uniform block{ uniform int i,  uniform uint u,  uniform float f,  uniform bool b,  uniform 2-component vector of int i2,  uniform 2-component vector of uint u2,  uniform 2-component vector of float f2,  uniform 2-component vector of bool b2,  uniform uint upos,  uniform float fpos})
-0:?     'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?     'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:?     'g_tTex1df4' ( uniform texture1D)
 0:?     'color' (layout( location=0) out 4-component vector of float)
 
@@ -899,6 +899,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 319
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "color"
@@ -964,7 +965,6 @@ gl_FragCoord origin is upper left
                               Decorate 19($Global) Block
                               Decorate 21 DescriptorSet 0
                               Decorate 258(g_tTexbfs) DescriptorSet 0
-                              Decorate 258(g_tTexbfs) NonWritable
                               Decorate 277(g_tTex1df4) DescriptorSet 0
                               Decorate 319(color) Location 0
                2:             TypeVoid
@@ -1013,7 +1013,7 @@ gl_FragCoord origin is upper left
              107:   16(ivec2) ConstantComposite 44 44
              109:     14(int) Constant 4
              110:             TypePointer Uniform 16(ivec2)
-             256:             TypeImage 6(float) Buffer nonsampled format:R32f
+             256:             TypeImage 6(float) Buffer sampled format:R32f
              257:             TypePointer UniformConstant 256
   258(g_tTexbfs):    257(ptr) Variable UniformConstant
              260:     14(int) Constant 8
@@ -1275,14 +1275,14 @@ gl_FragCoord origin is upper left
              261:     23(ptr) AccessChain 21 260
              262:     15(int) Load 261
              263:     14(int) Bitcast 262
-             264:    7(fvec4) ImageRead 259 263
+             264:    7(fvec4) ImageFetch 259 263
              265:    6(float) CompositeExtract 264 0
                               Store 255(r50) 265
              267:         256 Load 258(g_tTexbfs)
              269:     33(ptr) AccessChain 21 268
              270:    6(float) Load 269
              271:     14(int) ConvertFToS 270
-             272:    7(fvec4) ImageRead 267 271
+             272:    7(fvec4) ImageFetch 267 271
              273:    6(float) CompositeExtract 272 0
                               Store 266(r51) 273
              278:         275 Load 277(g_tTex1df4)
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.outputs.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.outputs.frag.out
index 7ccc596..87b425f 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.outputs.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.promote.outputs.frag.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.promote.outputs.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -95,7 +95,7 @@ gl_FragCoord origin is upper left
 0:20              0 (const int)
 0:?   Linker Objects
 0:?     'anon@0' (layout( row_major std140) uniform block{ uniform int i,  uniform uint u,  uniform float f,  uniform bool b,  uniform 2-component vector of int i2,  uniform 2-component vector of uint u2,  uniform 2-component vector of float f2,  uniform 2-component vector of bool b2,  uniform uint upos,  uniform float fpos})
-0:?     'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?     'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:?     'g_tTex1df4' ( uniform texture1D)
 0:?     'color' (layout( location=0) out 4-component vector of float)
 
@@ -103,7 +103,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -199,7 +199,7 @@ gl_FragCoord origin is upper left
 0:20              0 (const int)
 0:?   Linker Objects
 0:?     'anon@0' (layout( row_major std140) uniform block{ uniform int i,  uniform uint u,  uniform float f,  uniform bool b,  uniform 2-component vector of int i2,  uniform 2-component vector of uint u2,  uniform 2-component vector of float f2,  uniform 2-component vector of bool b2,  uniform uint upos,  uniform float fpos})
-0:?     'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?     'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:?     'g_tTex1df4' ( uniform texture1D)
 0:?     'color' (layout( location=0) out 4-component vector of float)
 
@@ -215,6 +215,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 74
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "color"
@@ -258,7 +259,6 @@ gl_FragCoord origin is upper left
                               Decorate 31(g_tTex1df4) DescriptorSet 0
                               Decorate 74(color) Location 0
                               Decorate 79(g_tTexbfs) DescriptorSet 0
-                              Decorate 79(g_tTexbfs) NonWritable
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -289,7 +289,7 @@ gl_FragCoord origin is upper left
               68:             TypePointer Function 7(fvec4)
               73:             TypePointer Output 7(fvec4)
        74(color):     73(ptr) Variable Output
-              77:             TypeImage 6(float) Buffer nonsampled format:R32f
+              77:             TypeImage 6(float) Buffer sampled format:R32f
               78:             TypePointer UniformConstant 77
    79(g_tTexbfs):     78(ptr) Variable UniformConstant
          4(main):           2 Function None 3
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.vert.out
index 107575b..82bb18e 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.intrinsics.vert.out
@@ -1,5 +1,5 @@
 hlsl.intrinsics.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:2  Function Definition: VertexShaderFunctionS(f1;f1;f1;u1;u1; ( temp float)
 0:2    Function Parameters: 
@@ -62,880 +62,862 @@ Shader version: 450
 0:29        'inF1' ( in float)
 0:30      Fraction ( temp float)
 0:30        'inF0' ( in float)
-0:31      frexp ( temp float)
+0:31      isinf ( temp bool)
 0:31        'inF0' ( in float)
-0:31        'inF1' ( in float)
-0:32      isinf ( temp bool)
+0:32      isnan ( temp bool)
 0:32        'inF0' ( in float)
-0:33      isnan ( temp bool)
+0:33      ldexp ( temp float)
 0:33        'inF0' ( in float)
-0:34      ldexp ( temp float)
+0:33        'inF1' ( in float)
+0:34      mix ( temp float)
 0:34        'inF0' ( in float)
 0:34        'inF1' ( in float)
-0:35      mix ( temp float)
+0:34        'inF2' ( in float)
+0:35      log ( temp float)
 0:35        'inF0' ( in float)
-0:35        'inF1' ( in float)
-0:35        'inF2' ( in float)
-0:36      log ( temp float)
-0:36        'inF0' ( in float)
-0:37      component-wise multiply ( temp float)
-0:37        log2 ( temp float)
-0:37          'inF0' ( in float)
-0:37        Constant:
-0:37          0.301030
-0:38      log2 ( temp float)
+0:36      component-wise multiply ( temp float)
+0:36        log2 ( temp float)
+0:36          'inF0' ( in float)
+0:36        Constant:
+0:36          0.301030
+0:37      log2 ( temp float)
+0:37        'inF0' ( in float)
+0:38      max ( temp float)
 0:38        'inF0' ( in float)
-0:39      max ( temp float)
+0:38        'inF1' ( in float)
+0:39      min ( temp float)
 0:39        'inF0' ( in float)
 0:39        'inF1' ( in float)
-0:40      min ( temp float)
-0:40        'inF0' ( in float)
-0:40        'inF1' ( in float)
-0:42      pow ( temp float)
+0:41      pow ( temp float)
+0:41        'inF0' ( in float)
+0:41        'inF1' ( in float)
+0:42      radians ( temp float)
 0:42        'inF0' ( in float)
-0:42        'inF1' ( in float)
-0:43      radians ( temp float)
-0:43        'inF0' ( in float)
-0:44      bitFieldReverse ( temp int)
-0:44        Constant:
-0:44          2 (const int)
-0:45      roundEven ( temp float)
+0:43      bitFieldReverse ( temp int)
+0:43        Constant:
+0:43          2 (const int)
+0:44      roundEven ( temp float)
+0:44        'inF0' ( in float)
+0:45      inverse sqrt ( temp float)
 0:45        'inF0' ( in float)
-0:46      inverse sqrt ( temp float)
+0:46      clamp ( temp float)
 0:46        'inF0' ( in float)
-0:47      clamp ( temp float)
+0:46        Constant:
+0:46          0.000000
+0:46        Constant:
+0:46          1.000000
+0:47      Sign ( temp float)
 0:47        'inF0' ( in float)
-0:47        Constant:
-0:47          0.000000
-0:47        Constant:
-0:47          1.000000
-0:48      Sign ( temp float)
+0:48      sine ( temp float)
 0:48        'inF0' ( in float)
-0:49      sine ( temp float)
-0:49        'inF0' ( in float)
-0:50      Sequence
-0:50        move second child to first child ( temp float)
-0:50          'inF1' ( in float)
-0:50          sine ( temp float)
-0:50            'inF0' ( in float)
-0:50        move second child to first child ( temp float)
-0:50          'inF2' ( in float)
-0:50          cosine ( temp float)
-0:50            'inF0' ( in float)
-0:51      hyp. sine ( temp float)
+0:49      Sequence
+0:49        move second child to first child ( temp float)
+0:49          'inF1' ( in float)
+0:49          sine ( temp float)
+0:49            'inF0' ( in float)
+0:49        move second child to first child ( temp float)
+0:49          'inF2' ( in float)
+0:49          cosine ( temp float)
+0:49            'inF0' ( in float)
+0:50      hyp. sine ( temp float)
+0:50        'inF0' ( in float)
+0:51      smoothstep ( temp float)
 0:51        'inF0' ( in float)
-0:52      smoothstep ( temp float)
+0:51        'inF1' ( in float)
+0:51        'inF2' ( in float)
+0:52      sqrt ( temp float)
 0:52        'inF0' ( in float)
-0:52        'inF1' ( in float)
-0:52        'inF2' ( in float)
-0:53      sqrt ( temp float)
+0:53      step ( temp float)
 0:53        'inF0' ( in float)
-0:54      step ( temp float)
+0:53        'inF1' ( in float)
+0:54      tangent ( temp float)
 0:54        'inF0' ( in float)
-0:54        'inF1' ( in float)
-0:55      tangent ( temp float)
+0:55      hyp. tangent ( temp float)
 0:55        'inF0' ( in float)
-0:56      hyp. tangent ( temp float)
-0:56        'inF0' ( in float)
-0:58      trunc ( temp float)
-0:58        'inF0' ( in float)
-0:60      Branch: Return with expression
-0:60        Constant:
-0:60          0.000000
-0:64  Function Definition: VertexShaderFunction1(vf1;vf1;vf1; ( temp 1-component vector of float)
-0:64    Function Parameters: 
-0:64      'inF0' ( in 1-component vector of float)
-0:64      'inF1' ( in 1-component vector of float)
-0:64      'inF2' ( in 1-component vector of float)
+0:57      trunc ( temp float)
+0:57        'inF0' ( in float)
+0:59      Branch: Return with expression
+0:59        Constant:
+0:59          0.000000
+0:63  Function Definition: VertexShaderFunction1(vf1;vf1;vf1; ( temp 1-component vector of float)
+0:63    Function Parameters: 
+0:63      'inF0' ( in 1-component vector of float)
+0:63      'inF1' ( in 1-component vector of float)
+0:63      'inF2' ( in 1-component vector of float)
 0:?     Sequence
-0:66      Branch: Return with expression
-0:66        Constant:
-0:66          0.000000
-0:70  Function Definition: VertexShaderFunction2(vf2;vf2;vf2;vu2;vu2; ( temp 2-component vector of float)
-0:70    Function Parameters: 
-0:70      'inF0' ( in 2-component vector of float)
-0:70      'inF1' ( in 2-component vector of float)
-0:70      'inF2' ( in 2-component vector of float)
-0:70      'inU0' ( in 2-component vector of uint)
-0:70      'inU1' ( in 2-component vector of uint)
+0:65      Branch: Return with expression
+0:65        Constant:
+0:65          0.000000
+0:69  Function Definition: VertexShaderFunction2(vf2;vf2;vf2;vu2;vu2; ( temp 2-component vector of float)
+0:69    Function Parameters: 
+0:69      'inF0' ( in 2-component vector of float)
+0:69      'inF1' ( in 2-component vector of float)
+0:69      'inF2' ( in 2-component vector of float)
+0:69      'inU0' ( in 2-component vector of uint)
+0:69      'inU1' ( in 2-component vector of uint)
 0:?     Sequence
-0:71      all ( temp bool)
+0:70      all ( temp bool)
+0:70        'inF0' ( in 2-component vector of float)
+0:71      Absolute value ( temp 2-component vector of float)
 0:71        'inF0' ( in 2-component vector of float)
-0:72      Absolute value ( temp 2-component vector of float)
+0:72      arc cosine ( temp 2-component vector of float)
 0:72        'inF0' ( in 2-component vector of float)
-0:73      arc cosine ( temp 2-component vector of float)
+0:73      any ( temp bool)
 0:73        'inF0' ( in 2-component vector of float)
-0:74      any ( temp bool)
+0:74      arc sine ( temp 2-component vector of float)
 0:74        'inF0' ( in 2-component vector of float)
-0:75      arc sine ( temp 2-component vector of float)
+0:75      floatBitsToInt ( temp 2-component vector of int)
 0:75        'inF0' ( in 2-component vector of float)
-0:76      floatBitsToInt ( temp 2-component vector of int)
+0:76      floatBitsToUint ( temp 2-component vector of uint)
 0:76        'inF0' ( in 2-component vector of float)
-0:77      floatBitsToUint ( temp 2-component vector of uint)
-0:77        'inF0' ( in 2-component vector of float)
-0:78      intBitsToFloat ( temp 2-component vector of float)
-0:78        'inU0' ( in 2-component vector of uint)
+0:77      intBitsToFloat ( temp 2-component vector of float)
+0:77        'inU0' ( in 2-component vector of uint)
+0:79      arc tangent ( temp 2-component vector of float)
+0:79        'inF0' ( in 2-component vector of float)
 0:80      arc tangent ( temp 2-component vector of float)
 0:80        'inF0' ( in 2-component vector of float)
-0:81      arc tangent ( temp 2-component vector of float)
+0:80        'inF1' ( in 2-component vector of float)
+0:81      Ceiling ( temp 2-component vector of float)
 0:81        'inF0' ( in 2-component vector of float)
-0:81        'inF1' ( in 2-component vector of float)
-0:82      Ceiling ( temp 2-component vector of float)
+0:82      clamp ( temp 2-component vector of float)
 0:82        'inF0' ( in 2-component vector of float)
-0:83      clamp ( temp 2-component vector of float)
+0:82        'inF1' ( in 2-component vector of float)
+0:82        'inF2' ( in 2-component vector of float)
+0:83      cosine ( temp 2-component vector of float)
 0:83        'inF0' ( in 2-component vector of float)
-0:83        'inF1' ( in 2-component vector of float)
-0:83        'inF2' ( in 2-component vector of float)
-0:84      cosine ( temp 2-component vector of float)
+0:84      hyp. cosine ( temp 2-component vector of float)
 0:84        'inF0' ( in 2-component vector of float)
-0:85      hyp. cosine ( temp 2-component vector of float)
-0:85        'inF0' ( in 2-component vector of float)
 0:?       bitCount ( temp 2-component vector of int)
 0:?         Constant:
 0:?           7 (const int)
 0:?           3 (const int)
-0:87      degrees ( temp 2-component vector of float)
+0:86      degrees ( temp 2-component vector of float)
+0:86        'inF0' ( in 2-component vector of float)
+0:87      distance ( temp float)
 0:87        'inF0' ( in 2-component vector of float)
-0:88      distance ( temp float)
+0:87        'inF1' ( in 2-component vector of float)
+0:88      dot-product ( temp float)
 0:88        'inF0' ( in 2-component vector of float)
 0:88        'inF1' ( in 2-component vector of float)
-0:89      dot-product ( temp float)
-0:89        'inF0' ( in 2-component vector of float)
-0:89        'inF1' ( in 2-component vector of float)
-0:93      exp ( temp 2-component vector of float)
+0:92      exp ( temp 2-component vector of float)
+0:92        'inF0' ( in 2-component vector of float)
+0:93      exp2 ( temp 2-component vector of float)
 0:93        'inF0' ( in 2-component vector of float)
-0:94      exp2 ( temp 2-component vector of float)
+0:94      face-forward ( temp 2-component vector of float)
 0:94        'inF0' ( in 2-component vector of float)
-0:95      face-forward ( temp 2-component vector of float)
-0:95        'inF0' ( in 2-component vector of float)
-0:95        'inF1' ( in 2-component vector of float)
-0:95        'inF2' ( in 2-component vector of float)
-0:96      findMSB ( temp int)
+0:94        'inF1' ( in 2-component vector of float)
+0:94        'inF2' ( in 2-component vector of float)
+0:95      findMSB ( temp int)
+0:95        Constant:
+0:95          7 (const int)
+0:96      findLSB ( temp int)
 0:96        Constant:
 0:96          7 (const int)
-0:97      findLSB ( temp int)
-0:97        Constant:
-0:97          7 (const int)
-0:98      Floor ( temp 2-component vector of float)
-0:98        'inF0' ( in 2-component vector of float)
-0:100      mod ( temp 2-component vector of float)
+0:97      Floor ( temp 2-component vector of float)
+0:97        'inF0' ( in 2-component vector of float)
+0:99      mod ( temp 2-component vector of float)
+0:99        'inF0' ( in 2-component vector of float)
+0:99        'inF1' ( in 2-component vector of float)
+0:100      Fraction ( temp 2-component vector of float)
 0:100        'inF0' ( in 2-component vector of float)
-0:100        'inF1' ( in 2-component vector of float)
-0:101      Fraction ( temp 2-component vector of float)
+0:101      isinf ( temp 2-component vector of bool)
 0:101        'inF0' ( in 2-component vector of float)
-0:102      frexp ( temp 2-component vector of float)
+0:102      isnan ( temp 2-component vector of bool)
 0:102        'inF0' ( in 2-component vector of float)
-0:102        'inF1' ( in 2-component vector of float)
-0:103      isinf ( temp 2-component vector of bool)
+0:103      ldexp ( temp 2-component vector of float)
 0:103        'inF0' ( in 2-component vector of float)
-0:104      isnan ( temp 2-component vector of bool)
+0:103        'inF1' ( in 2-component vector of float)
+0:104      mix ( temp 2-component vector of float)
 0:104        'inF0' ( in 2-component vector of float)
-0:105      ldexp ( temp 2-component vector of float)
+0:104        'inF1' ( in 2-component vector of float)
+0:104        'inF2' ( in 2-component vector of float)
+0:105      length ( temp float)
 0:105        'inF0' ( in 2-component vector of float)
-0:105        'inF1' ( in 2-component vector of float)
-0:106      mix ( temp 2-component vector of float)
+0:106      log ( temp 2-component vector of float)
 0:106        'inF0' ( in 2-component vector of float)
-0:106        'inF1' ( in 2-component vector of float)
-0:106        'inF2' ( in 2-component vector of float)
-0:107      length ( temp float)
-0:107        'inF0' ( in 2-component vector of float)
-0:108      log ( temp 2-component vector of float)
+0:107      vector-scale ( temp 2-component vector of float)
+0:107        log2 ( temp 2-component vector of float)
+0:107          'inF0' ( in 2-component vector of float)
+0:107        Constant:
+0:107          0.301030
+0:108      log2 ( temp 2-component vector of float)
 0:108        'inF0' ( in 2-component vector of float)
-0:109      vector-scale ( temp 2-component vector of float)
-0:109        log2 ( temp 2-component vector of float)
-0:109          'inF0' ( in 2-component vector of float)
-0:109        Constant:
-0:109          0.301030
-0:110      log2 ( temp 2-component vector of float)
+0:109      max ( temp 2-component vector of float)
+0:109        'inF0' ( in 2-component vector of float)
+0:109        'inF1' ( in 2-component vector of float)
+0:110      min ( temp 2-component vector of float)
 0:110        'inF0' ( in 2-component vector of float)
-0:111      max ( temp 2-component vector of float)
-0:111        'inF0' ( in 2-component vector of float)
-0:111        'inF1' ( in 2-component vector of float)
-0:112      min ( temp 2-component vector of float)
+0:110        'inF1' ( in 2-component vector of float)
+0:112      normalize ( temp 2-component vector of float)
 0:112        'inF0' ( in 2-component vector of float)
-0:112        'inF1' ( in 2-component vector of float)
-0:114      normalize ( temp 2-component vector of float)
+0:113      pow ( temp 2-component vector of float)
+0:113        'inF0' ( in 2-component vector of float)
+0:113        'inF1' ( in 2-component vector of float)
+0:114      radians ( temp 2-component vector of float)
 0:114        'inF0' ( in 2-component vector of float)
-0:115      pow ( temp 2-component vector of float)
+0:115      reflect ( temp 2-component vector of float)
 0:115        'inF0' ( in 2-component vector of float)
 0:115        'inF1' ( in 2-component vector of float)
-0:116      radians ( temp 2-component vector of float)
+0:116      refract ( temp 2-component vector of float)
 0:116        'inF0' ( in 2-component vector of float)
-0:117      reflect ( temp 2-component vector of float)
-0:117        'inF0' ( in 2-component vector of float)
-0:117        'inF1' ( in 2-component vector of float)
-0:118      refract ( temp 2-component vector of float)
-0:118        'inF0' ( in 2-component vector of float)
-0:118        'inF1' ( in 2-component vector of float)
-0:118        Constant:
-0:118          2.000000
+0:116        'inF1' ( in 2-component vector of float)
+0:116        Constant:
+0:116          2.000000
 0:?       bitFieldReverse ( temp 2-component vector of int)
 0:?         Constant:
 0:?           1 (const int)
 0:?           2 (const int)
-0:120      roundEven ( temp 2-component vector of float)
+0:118      roundEven ( temp 2-component vector of float)
+0:118        'inF0' ( in 2-component vector of float)
+0:119      inverse sqrt ( temp 2-component vector of float)
+0:119        'inF0' ( in 2-component vector of float)
+0:120      clamp ( temp 2-component vector of float)
 0:120        'inF0' ( in 2-component vector of float)
-0:121      inverse sqrt ( temp 2-component vector of float)
+0:120        Constant:
+0:120          0.000000
+0:120        Constant:
+0:120          1.000000
+0:121      Sign ( temp 2-component vector of float)
 0:121        'inF0' ( in 2-component vector of float)
-0:122      clamp ( temp 2-component vector of float)
+0:122      sine ( temp 2-component vector of float)
 0:122        'inF0' ( in 2-component vector of float)
-0:122        Constant:
-0:122          0.000000
-0:122        Constant:
-0:122          1.000000
-0:123      Sign ( temp 2-component vector of float)
-0:123        'inF0' ( in 2-component vector of float)
-0:124      sine ( temp 2-component vector of float)
+0:123      Sequence
+0:123        move second child to first child ( temp 2-component vector of float)
+0:123          'inF1' ( in 2-component vector of float)
+0:123          sine ( temp 2-component vector of float)
+0:123            'inF0' ( in 2-component vector of float)
+0:123        move second child to first child ( temp 2-component vector of float)
+0:123          'inF2' ( in 2-component vector of float)
+0:123          cosine ( temp 2-component vector of float)
+0:123            'inF0' ( in 2-component vector of float)
+0:124      hyp. sine ( temp 2-component vector of float)
 0:124        'inF0' ( in 2-component vector of float)
-0:125      Sequence
-0:125        move second child to first child ( temp 2-component vector of float)
-0:125          'inF1' ( in 2-component vector of float)
-0:125          sine ( temp 2-component vector of float)
-0:125            'inF0' ( in 2-component vector of float)
-0:125        move second child to first child ( temp 2-component vector of float)
-0:125          'inF2' ( in 2-component vector of float)
-0:125          cosine ( temp 2-component vector of float)
-0:125            'inF0' ( in 2-component vector of float)
-0:126      hyp. sine ( temp 2-component vector of float)
+0:125      smoothstep ( temp 2-component vector of float)
+0:125        'inF0' ( in 2-component vector of float)
+0:125        'inF1' ( in 2-component vector of float)
+0:125        'inF2' ( in 2-component vector of float)
+0:126      sqrt ( temp 2-component vector of float)
 0:126        'inF0' ( in 2-component vector of float)
-0:127      smoothstep ( temp 2-component vector of float)
+0:127      step ( temp 2-component vector of float)
 0:127        'inF0' ( in 2-component vector of float)
 0:127        'inF1' ( in 2-component vector of float)
-0:127        'inF2' ( in 2-component vector of float)
-0:128      sqrt ( temp 2-component vector of float)
+0:128      tangent ( temp 2-component vector of float)
 0:128        'inF0' ( in 2-component vector of float)
-0:129      step ( temp 2-component vector of float)
+0:129      hyp. tangent ( temp 2-component vector of float)
 0:129        'inF0' ( in 2-component vector of float)
-0:129        'inF1' ( in 2-component vector of float)
-0:130      tangent ( temp 2-component vector of float)
-0:130        'inF0' ( in 2-component vector of float)
-0:131      hyp. tangent ( temp 2-component vector of float)
+0:131      trunc ( temp 2-component vector of float)
 0:131        'inF0' ( in 2-component vector of float)
-0:133      trunc ( temp 2-component vector of float)
-0:133        'inF0' ( in 2-component vector of float)
-0:136      Branch: Return with expression
+0:134      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
-0:140  Function Definition: VertexShaderFunction3(vf3;vf3;vf3;vu3;vu3; ( temp 3-component vector of float)
-0:140    Function Parameters: 
-0:140      'inF0' ( in 3-component vector of float)
-0:140      'inF1' ( in 3-component vector of float)
-0:140      'inF2' ( in 3-component vector of float)
-0:140      'inU0' ( in 3-component vector of uint)
-0:140      'inU1' ( in 3-component vector of uint)
+0:138  Function Definition: VertexShaderFunction3(vf3;vf3;vf3;vu3;vu3; ( temp 3-component vector of float)
+0:138    Function Parameters: 
+0:138      'inF0' ( in 3-component vector of float)
+0:138      'inF1' ( in 3-component vector of float)
+0:138      'inF2' ( in 3-component vector of float)
+0:138      'inU0' ( in 3-component vector of uint)
+0:138      'inU1' ( in 3-component vector of uint)
 0:?     Sequence
-0:141      all ( temp bool)
+0:139      all ( temp bool)
+0:139        'inF0' ( in 3-component vector of float)
+0:140      Absolute value ( temp 3-component vector of float)
+0:140        'inF0' ( in 3-component vector of float)
+0:141      arc cosine ( temp 3-component vector of float)
 0:141        'inF0' ( in 3-component vector of float)
-0:142      Absolute value ( temp 3-component vector of float)
+0:142      any ( temp bool)
 0:142        'inF0' ( in 3-component vector of float)
-0:143      arc cosine ( temp 3-component vector of float)
+0:143      arc sine ( temp 3-component vector of float)
 0:143        'inF0' ( in 3-component vector of float)
-0:144      any ( temp bool)
+0:144      floatBitsToInt ( temp 3-component vector of int)
 0:144        'inF0' ( in 3-component vector of float)
-0:145      arc sine ( temp 3-component vector of float)
+0:145      floatBitsToUint ( temp 3-component vector of uint)
 0:145        'inF0' ( in 3-component vector of float)
-0:146      floatBitsToInt ( temp 3-component vector of int)
-0:146        'inF0' ( in 3-component vector of float)
-0:147      floatBitsToUint ( temp 3-component vector of uint)
-0:147        'inF0' ( in 3-component vector of float)
-0:148      intBitsToFloat ( temp 3-component vector of float)
-0:148        'inU0' ( in 3-component vector of uint)
-0:150      arc tangent ( temp 3-component vector of float)
+0:146      intBitsToFloat ( temp 3-component vector of float)
+0:146        'inU0' ( in 3-component vector of uint)
+0:148      arc tangent ( temp 3-component vector of float)
+0:148        'inF0' ( in 3-component vector of float)
+0:149      arc tangent ( temp 3-component vector of float)
+0:149        'inF0' ( in 3-component vector of float)
+0:149        'inF1' ( in 3-component vector of float)
+0:150      Ceiling ( temp 3-component vector of float)
 0:150        'inF0' ( in 3-component vector of float)
-0:151      arc tangent ( temp 3-component vector of float)
+0:151      clamp ( temp 3-component vector of float)
 0:151        'inF0' ( in 3-component vector of float)
 0:151        'inF1' ( in 3-component vector of float)
-0:152      Ceiling ( temp 3-component vector of float)
+0:151        'inF2' ( in 3-component vector of float)
+0:152      cosine ( temp 3-component vector of float)
 0:152        'inF0' ( in 3-component vector of float)
-0:153      clamp ( temp 3-component vector of float)
+0:153      hyp. cosine ( temp 3-component vector of float)
 0:153        'inF0' ( in 3-component vector of float)
-0:153        'inF1' ( in 3-component vector of float)
-0:153        'inF2' ( in 3-component vector of float)
-0:154      cosine ( temp 3-component vector of float)
-0:154        'inF0' ( in 3-component vector of float)
-0:155      hyp. cosine ( temp 3-component vector of float)
-0:155        'inF0' ( in 3-component vector of float)
 0:?       bitCount ( temp 3-component vector of int)
 0:?         Constant:
 0:?           7 (const int)
 0:?           3 (const int)
 0:?           5 (const int)
-0:157      cross-product ( temp 3-component vector of float)
+0:155      cross-product ( temp 3-component vector of float)
+0:155        'inF0' ( in 3-component vector of float)
+0:155        'inF1' ( in 3-component vector of float)
+0:156      degrees ( temp 3-component vector of float)
+0:156        'inF0' ( in 3-component vector of float)
+0:157      distance ( temp float)
 0:157        'inF0' ( in 3-component vector of float)
 0:157        'inF1' ( in 3-component vector of float)
-0:158      degrees ( temp 3-component vector of float)
+0:158      dot-product ( temp float)
 0:158        'inF0' ( in 3-component vector of float)
-0:159      distance ( temp float)
-0:159        'inF0' ( in 3-component vector of float)
-0:159        'inF1' ( in 3-component vector of float)
-0:160      dot-product ( temp float)
-0:160        'inF0' ( in 3-component vector of float)
-0:160        'inF1' ( in 3-component vector of float)
-0:164      exp ( temp 3-component vector of float)
+0:158        'inF1' ( in 3-component vector of float)
+0:162      exp ( temp 3-component vector of float)
+0:162        'inF0' ( in 3-component vector of float)
+0:163      exp2 ( temp 3-component vector of float)
+0:163        'inF0' ( in 3-component vector of float)
+0:164      face-forward ( temp 3-component vector of float)
 0:164        'inF0' ( in 3-component vector of float)
-0:165      exp2 ( temp 3-component vector of float)
-0:165        'inF0' ( in 3-component vector of float)
-0:166      face-forward ( temp 3-component vector of float)
-0:166        'inF0' ( in 3-component vector of float)
-0:166        'inF1' ( in 3-component vector of float)
-0:166        'inF2' ( in 3-component vector of float)
-0:167      findMSB ( temp int)
-0:167        Constant:
-0:167          7 (const int)
-0:168      findLSB ( temp int)
-0:168        Constant:
-0:168          7 (const int)
-0:169      Floor ( temp 3-component vector of float)
+0:164        'inF1' ( in 3-component vector of float)
+0:164        'inF2' ( in 3-component vector of float)
+0:165      findMSB ( temp int)
+0:165        Constant:
+0:165          7 (const int)
+0:166      findLSB ( temp int)
+0:166        Constant:
+0:166          7 (const int)
+0:167      Floor ( temp 3-component vector of float)
+0:167        'inF0' ( in 3-component vector of float)
+0:169      mod ( temp 3-component vector of float)
 0:169        'inF0' ( in 3-component vector of float)
-0:171      mod ( temp 3-component vector of float)
+0:169        'inF1' ( in 3-component vector of float)
+0:170      Fraction ( temp 3-component vector of float)
+0:170        'inF0' ( in 3-component vector of float)
+0:171      isinf ( temp 3-component vector of bool)
 0:171        'inF0' ( in 3-component vector of float)
-0:171        'inF1' ( in 3-component vector of float)
-0:172      Fraction ( temp 3-component vector of float)
+0:172      isnan ( temp 3-component vector of bool)
 0:172        'inF0' ( in 3-component vector of float)
-0:173      frexp ( temp 3-component vector of float)
+0:173      ldexp ( temp 3-component vector of float)
 0:173        'inF0' ( in 3-component vector of float)
 0:173        'inF1' ( in 3-component vector of float)
-0:174      isinf ( temp 3-component vector of bool)
+0:174      mix ( temp 3-component vector of float)
 0:174        'inF0' ( in 3-component vector of float)
-0:175      isnan ( temp 3-component vector of bool)
+0:174        'inF1' ( in 3-component vector of float)
+0:174        'inF2' ( in 3-component vector of float)
+0:175      length ( temp float)
 0:175        'inF0' ( in 3-component vector of float)
-0:176      ldexp ( temp 3-component vector of float)
+0:176      log ( temp 3-component vector of float)
 0:176        'inF0' ( in 3-component vector of float)
-0:176        'inF1' ( in 3-component vector of float)
-0:177      mix ( temp 3-component vector of float)
-0:177        'inF0' ( in 3-component vector of float)
-0:177        'inF1' ( in 3-component vector of float)
-0:177        'inF2' ( in 3-component vector of float)
-0:178      length ( temp float)
+0:177      vector-scale ( temp 3-component vector of float)
+0:177        log2 ( temp 3-component vector of float)
+0:177          'inF0' ( in 3-component vector of float)
+0:177        Constant:
+0:177          0.301030
+0:178      log2 ( temp 3-component vector of float)
 0:178        'inF0' ( in 3-component vector of float)
-0:179      log ( temp 3-component vector of float)
+0:179      max ( temp 3-component vector of float)
 0:179        'inF0' ( in 3-component vector of float)
-0:180      vector-scale ( temp 3-component vector of float)
-0:180        log2 ( temp 3-component vector of float)
-0:180          'inF0' ( in 3-component vector of float)
-0:180        Constant:
-0:180          0.301030
-0:181      log2 ( temp 3-component vector of float)
-0:181        'inF0' ( in 3-component vector of float)
-0:182      max ( temp 3-component vector of float)
+0:179        'inF1' ( in 3-component vector of float)
+0:180      min ( temp 3-component vector of float)
+0:180        'inF0' ( in 3-component vector of float)
+0:180        'inF1' ( in 3-component vector of float)
+0:182      normalize ( temp 3-component vector of float)
 0:182        'inF0' ( in 3-component vector of float)
-0:182        'inF1' ( in 3-component vector of float)
-0:183      min ( temp 3-component vector of float)
+0:183      pow ( temp 3-component vector of float)
 0:183        'inF0' ( in 3-component vector of float)
 0:183        'inF1' ( in 3-component vector of float)
-0:185      normalize ( temp 3-component vector of float)
+0:184      radians ( temp 3-component vector of float)
+0:184        'inF0' ( in 3-component vector of float)
+0:185      reflect ( temp 3-component vector of float)
 0:185        'inF0' ( in 3-component vector of float)
-0:186      pow ( temp 3-component vector of float)
+0:185        'inF1' ( in 3-component vector of float)
+0:186      refract ( temp 3-component vector of float)
 0:186        'inF0' ( in 3-component vector of float)
 0:186        'inF1' ( in 3-component vector of float)
-0:187      radians ( temp 3-component vector of float)
-0:187        'inF0' ( in 3-component vector of float)
-0:188      reflect ( temp 3-component vector of float)
-0:188        'inF0' ( in 3-component vector of float)
-0:188        'inF1' ( in 3-component vector of float)
-0:189      refract ( temp 3-component vector of float)
-0:189        'inF0' ( in 3-component vector of float)
-0:189        'inF1' ( in 3-component vector of float)
-0:189        Constant:
-0:189          2.000000
+0:186        Constant:
+0:186          2.000000
 0:?       bitFieldReverse ( temp 3-component vector of int)
 0:?         Constant:
 0:?           1 (const int)
 0:?           2 (const int)
 0:?           3 (const int)
-0:191      roundEven ( temp 3-component vector of float)
+0:188      roundEven ( temp 3-component vector of float)
+0:188        'inF0' ( in 3-component vector of float)
+0:189      inverse sqrt ( temp 3-component vector of float)
+0:189        'inF0' ( in 3-component vector of float)
+0:190      clamp ( temp 3-component vector of float)
+0:190        'inF0' ( in 3-component vector of float)
+0:190        Constant:
+0:190          0.000000
+0:190        Constant:
+0:190          1.000000
+0:191      Sign ( temp 3-component vector of float)
 0:191        'inF0' ( in 3-component vector of float)
-0:192      inverse sqrt ( temp 3-component vector of float)
+0:192      sine ( temp 3-component vector of float)
 0:192        'inF0' ( in 3-component vector of float)
-0:193      clamp ( temp 3-component vector of float)
-0:193        'inF0' ( in 3-component vector of float)
-0:193        Constant:
-0:193          0.000000
-0:193        Constant:
-0:193          1.000000
-0:194      Sign ( temp 3-component vector of float)
+0:193      Sequence
+0:193        move second child to first child ( temp 3-component vector of float)
+0:193          'inF1' ( in 3-component vector of float)
+0:193          sine ( temp 3-component vector of float)
+0:193            'inF0' ( in 3-component vector of float)
+0:193        move second child to first child ( temp 3-component vector of float)
+0:193          'inF2' ( in 3-component vector of float)
+0:193          cosine ( temp 3-component vector of float)
+0:193            'inF0' ( in 3-component vector of float)
+0:194      hyp. sine ( temp 3-component vector of float)
 0:194        'inF0' ( in 3-component vector of float)
-0:195      sine ( temp 3-component vector of float)
+0:195      smoothstep ( temp 3-component vector of float)
 0:195        'inF0' ( in 3-component vector of float)
-0:196      Sequence
-0:196        move second child to first child ( temp 3-component vector of float)
-0:196          'inF1' ( in 3-component vector of float)
-0:196          sine ( temp 3-component vector of float)
-0:196            'inF0' ( in 3-component vector of float)
-0:196        move second child to first child ( temp 3-component vector of float)
-0:196          'inF2' ( in 3-component vector of float)
-0:196          cosine ( temp 3-component vector of float)
-0:196            'inF0' ( in 3-component vector of float)
-0:197      hyp. sine ( temp 3-component vector of float)
+0:195        'inF1' ( in 3-component vector of float)
+0:195        'inF2' ( in 3-component vector of float)
+0:196      sqrt ( temp 3-component vector of float)
+0:196        'inF0' ( in 3-component vector of float)
+0:197      step ( temp 3-component vector of float)
 0:197        'inF0' ( in 3-component vector of float)
-0:198      smoothstep ( temp 3-component vector of float)
+0:197        'inF1' ( in 3-component vector of float)
+0:198      tangent ( temp 3-component vector of float)
 0:198        'inF0' ( in 3-component vector of float)
-0:198        'inF1' ( in 3-component vector of float)
-0:198        'inF2' ( in 3-component vector of float)
-0:199      sqrt ( temp 3-component vector of float)
+0:199      hyp. tangent ( temp 3-component vector of float)
 0:199        'inF0' ( in 3-component vector of float)
-0:200      step ( temp 3-component vector of float)
-0:200        'inF0' ( in 3-component vector of float)
-0:200        'inF1' ( in 3-component vector of float)
-0:201      tangent ( temp 3-component vector of float)
+0:201      trunc ( temp 3-component vector of float)
 0:201        'inF0' ( in 3-component vector of float)
-0:202      hyp. tangent ( temp 3-component vector of float)
-0:202        'inF0' ( in 3-component vector of float)
-0:204      trunc ( temp 3-component vector of float)
-0:204        'inF0' ( in 3-component vector of float)
-0:207      Branch: Return with expression
+0:204      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
 0:?           3.000000
-0:211  Function Definition: VertexShaderFunction4(vf4;vf4;vf4;vu4;vu4; ( temp 4-component vector of float)
-0:211    Function Parameters: 
-0:211      'inF0' ( in 4-component vector of float)
-0:211      'inF1' ( in 4-component vector of float)
-0:211      'inF2' ( in 4-component vector of float)
-0:211      'inU0' ( in 4-component vector of uint)
-0:211      'inU1' ( in 4-component vector of uint)
+0:208  Function Definition: VertexShaderFunction4(vf4;vf4;vf4;vu4;vu4; ( temp 4-component vector of float)
+0:208    Function Parameters: 
+0:208      'inF0' ( in 4-component vector of float)
+0:208      'inF1' ( in 4-component vector of float)
+0:208      'inF2' ( in 4-component vector of float)
+0:208      'inU0' ( in 4-component vector of uint)
+0:208      'inU1' ( in 4-component vector of uint)
 0:?     Sequence
-0:212      all ( temp bool)
+0:209      all ( temp bool)
+0:209        'inF0' ( in 4-component vector of float)
+0:210      Absolute value ( temp 4-component vector of float)
+0:210        'inF0' ( in 4-component vector of float)
+0:211      arc cosine ( temp 4-component vector of float)
+0:211        'inF0' ( in 4-component vector of float)
+0:212      any ( temp bool)
 0:212        'inF0' ( in 4-component vector of float)
-0:213      Absolute value ( temp 4-component vector of float)
+0:213      arc sine ( temp 4-component vector of float)
 0:213        'inF0' ( in 4-component vector of float)
-0:214      arc cosine ( temp 4-component vector of float)
+0:214      floatBitsToInt ( temp 4-component vector of int)
 0:214        'inF0' ( in 4-component vector of float)
-0:215      any ( temp bool)
+0:215      floatBitsToUint ( temp 4-component vector of uint)
 0:215        'inF0' ( in 4-component vector of float)
-0:216      arc sine ( temp 4-component vector of float)
-0:216        'inF0' ( in 4-component vector of float)
-0:217      floatBitsToInt ( temp 4-component vector of int)
-0:217        'inF0' ( in 4-component vector of float)
-0:218      floatBitsToUint ( temp 4-component vector of uint)
+0:216      intBitsToFloat ( temp 4-component vector of float)
+0:216        'inU0' ( in 4-component vector of uint)
+0:218      arc tangent ( temp 4-component vector of float)
 0:218        'inF0' ( in 4-component vector of float)
-0:219      intBitsToFloat ( temp 4-component vector of float)
-0:219        'inU0' ( in 4-component vector of uint)
-0:221      arc tangent ( temp 4-component vector of float)
+0:219      arc tangent ( temp 4-component vector of float)
+0:219        'inF0' ( in 4-component vector of float)
+0:219        'inF1' ( in 4-component vector of float)
+0:220      Ceiling ( temp 4-component vector of float)
+0:220        'inF0' ( in 4-component vector of float)
+0:221      clamp ( temp 4-component vector of float)
 0:221        'inF0' ( in 4-component vector of float)
-0:222      arc tangent ( temp 4-component vector of float)
+0:221        'inF1' ( in 4-component vector of float)
+0:221        'inF2' ( in 4-component vector of float)
+0:222      cosine ( temp 4-component vector of float)
 0:222        'inF0' ( in 4-component vector of float)
-0:222        'inF1' ( in 4-component vector of float)
-0:223      Ceiling ( temp 4-component vector of float)
+0:223      hyp. cosine ( temp 4-component vector of float)
 0:223        'inF0' ( in 4-component vector of float)
-0:224      clamp ( temp 4-component vector of float)
-0:224        'inF0' ( in 4-component vector of float)
-0:224        'inF1' ( in 4-component vector of float)
-0:224        'inF2' ( in 4-component vector of float)
-0:225      cosine ( temp 4-component vector of float)
-0:225        'inF0' ( in 4-component vector of float)
-0:226      hyp. cosine ( temp 4-component vector of float)
-0:226        'inF0' ( in 4-component vector of float)
 0:?       bitCount ( temp 4-component vector of int)
 0:?         Constant:
 0:?           7 (const int)
 0:?           3 (const int)
 0:?           5 (const int)
 0:?           2 (const int)
-0:228      degrees ( temp 4-component vector of float)
-0:228        'inF0' ( in 4-component vector of float)
-0:229      distance ( temp float)
-0:229        'inF0' ( in 4-component vector of float)
-0:229        'inF1' ( in 4-component vector of float)
-0:230      dot-product ( temp float)
-0:230        'inF0' ( in 4-component vector of float)
-0:230        'inF1' ( in 4-component vector of float)
-0:231      Construct vec4 ( temp 4-component vector of float)
-0:231        Constant:
-0:231          1.000000
-0:231        component-wise multiply ( temp float)
-0:231          direct index ( temp float)
-0:231            'inF0' ( in 4-component vector of float)
-0:231            Constant:
-0:231              1 (const int)
-0:231          direct index ( temp float)
-0:231            'inF1' ( in 4-component vector of float)
-0:231            Constant:
-0:231              1 (const int)
-0:231        direct index ( temp float)
-0:231          'inF0' ( in 4-component vector of float)
-0:231          Constant:
-0:231            2 (const int)
-0:231        direct index ( temp float)
-0:231          'inF1' ( in 4-component vector of float)
-0:231          Constant:
-0:231            3 (const int)
-0:235      exp ( temp 4-component vector of float)
-0:235        'inF0' ( in 4-component vector of float)
-0:236      exp2 ( temp 4-component vector of float)
-0:236        'inF0' ( in 4-component vector of float)
-0:237      face-forward ( temp 4-component vector of float)
+0:225      degrees ( temp 4-component vector of float)
+0:225        'inF0' ( in 4-component vector of float)
+0:226      distance ( temp float)
+0:226        'inF0' ( in 4-component vector of float)
+0:226        'inF1' ( in 4-component vector of float)
+0:227      dot-product ( temp float)
+0:227        'inF0' ( in 4-component vector of float)
+0:227        'inF1' ( in 4-component vector of float)
+0:228      Construct vec4 ( temp 4-component vector of float)
+0:228        Constant:
+0:228          1.000000
+0:228        component-wise multiply ( temp float)
+0:228          direct index ( temp float)
+0:228            'inF0' ( in 4-component vector of float)
+0:228            Constant:
+0:228              1 (const int)
+0:228          direct index ( temp float)
+0:228            'inF1' ( in 4-component vector of float)
+0:228            Constant:
+0:228              1 (const int)
+0:228        direct index ( temp float)
+0:228          'inF0' ( in 4-component vector of float)
+0:228          Constant:
+0:228            2 (const int)
+0:228        direct index ( temp float)
+0:228          'inF1' ( in 4-component vector of float)
+0:228          Constant:
+0:228            3 (const int)
+0:232      exp ( temp 4-component vector of float)
+0:232        'inF0' ( in 4-component vector of float)
+0:233      exp2 ( temp 4-component vector of float)
+0:233        'inF0' ( in 4-component vector of float)
+0:234      face-forward ( temp 4-component vector of float)
+0:234        'inF0' ( in 4-component vector of float)
+0:234        'inF1' ( in 4-component vector of float)
+0:234        'inF2' ( in 4-component vector of float)
+0:235      findMSB ( temp int)
+0:235        Constant:
+0:235          7 (const int)
+0:236      findLSB ( temp int)
+0:236        Constant:
+0:236          7 (const int)
+0:237      Floor ( temp 4-component vector of float)
 0:237        'inF0' ( in 4-component vector of float)
-0:237        'inF1' ( in 4-component vector of float)
-0:237        'inF2' ( in 4-component vector of float)
-0:238      findMSB ( temp int)
-0:238        Constant:
-0:238          7 (const int)
-0:239      findLSB ( temp int)
-0:239        Constant:
-0:239          7 (const int)
-0:240      Floor ( temp 4-component vector of float)
+0:239      mod ( temp 4-component vector of float)
+0:239        'inF0' ( in 4-component vector of float)
+0:239        'inF1' ( in 4-component vector of float)
+0:240      Fraction ( temp 4-component vector of float)
 0:240        'inF0' ( in 4-component vector of float)
-0:242      mod ( temp 4-component vector of float)
+0:241      isinf ( temp 4-component vector of bool)
+0:241        'inF0' ( in 4-component vector of float)
+0:242      isnan ( temp 4-component vector of bool)
 0:242        'inF0' ( in 4-component vector of float)
-0:242        'inF1' ( in 4-component vector of float)
-0:243      Fraction ( temp 4-component vector of float)
+0:243      ldexp ( temp 4-component vector of float)
 0:243        'inF0' ( in 4-component vector of float)
-0:244      frexp ( temp 4-component vector of float)
+0:243        'inF1' ( in 4-component vector of float)
+0:244      mix ( temp 4-component vector of float)
 0:244        'inF0' ( in 4-component vector of float)
 0:244        'inF1' ( in 4-component vector of float)
-0:245      isinf ( temp 4-component vector of bool)
+0:244        'inF2' ( in 4-component vector of float)
+0:245      length ( temp float)
 0:245        'inF0' ( in 4-component vector of float)
-0:246      isnan ( temp 4-component vector of bool)
+0:246      log ( temp 4-component vector of float)
 0:246        'inF0' ( in 4-component vector of float)
-0:247      ldexp ( temp 4-component vector of float)
-0:247        'inF0' ( in 4-component vector of float)
-0:247        'inF1' ( in 4-component vector of float)
-0:248      mix ( temp 4-component vector of float)
+0:247      vector-scale ( temp 4-component vector of float)
+0:247        log2 ( temp 4-component vector of float)
+0:247          'inF0' ( in 4-component vector of float)
+0:247        Constant:
+0:247          0.301030
+0:248      log2 ( temp 4-component vector of float)
 0:248        'inF0' ( in 4-component vector of float)
-0:248        'inF1' ( in 4-component vector of float)
-0:248        'inF2' ( in 4-component vector of float)
-0:249      length ( temp float)
+0:249      max ( temp 4-component vector of float)
 0:249        'inF0' ( in 4-component vector of float)
-0:250      log ( temp 4-component vector of float)
+0:249        'inF1' ( in 4-component vector of float)
+0:250      min ( temp 4-component vector of float)
 0:250        'inF0' ( in 4-component vector of float)
-0:251      vector-scale ( temp 4-component vector of float)
-0:251        log2 ( temp 4-component vector of float)
-0:251          'inF0' ( in 4-component vector of float)
-0:251        Constant:
-0:251          0.301030
-0:252      log2 ( temp 4-component vector of float)
+0:250        'inF1' ( in 4-component vector of float)
+0:252      normalize ( temp 4-component vector of float)
 0:252        'inF0' ( in 4-component vector of float)
-0:253      max ( temp 4-component vector of float)
+0:253      pow ( temp 4-component vector of float)
 0:253        'inF0' ( in 4-component vector of float)
 0:253        'inF1' ( in 4-component vector of float)
-0:254      min ( temp 4-component vector of float)
+0:254      radians ( temp 4-component vector of float)
 0:254        'inF0' ( in 4-component vector of float)
-0:254        'inF1' ( in 4-component vector of float)
-0:256      normalize ( temp 4-component vector of float)
+0:255      reflect ( temp 4-component vector of float)
+0:255        'inF0' ( in 4-component vector of float)
+0:255        'inF1' ( in 4-component vector of float)
+0:256      refract ( temp 4-component vector of float)
 0:256        'inF0' ( in 4-component vector of float)
-0:257      pow ( temp 4-component vector of float)
-0:257        'inF0' ( in 4-component vector of float)
-0:257        'inF1' ( in 4-component vector of float)
-0:258      radians ( temp 4-component vector of float)
-0:258        'inF0' ( in 4-component vector of float)
-0:259      reflect ( temp 4-component vector of float)
-0:259        'inF0' ( in 4-component vector of float)
-0:259        'inF1' ( in 4-component vector of float)
-0:260      refract ( temp 4-component vector of float)
-0:260        'inF0' ( in 4-component vector of float)
-0:260        'inF1' ( in 4-component vector of float)
-0:260        Constant:
-0:260          2.000000
+0:256        'inF1' ( in 4-component vector of float)
+0:256        Constant:
+0:256          2.000000
 0:?       bitFieldReverse ( temp 4-component vector of int)
 0:?         Constant:
 0:?           1 (const int)
 0:?           2 (const int)
 0:?           3 (const int)
 0:?           4 (const int)
-0:262      roundEven ( temp 4-component vector of float)
+0:258      roundEven ( temp 4-component vector of float)
+0:258        'inF0' ( in 4-component vector of float)
+0:259      inverse sqrt ( temp 4-component vector of float)
+0:259        'inF0' ( in 4-component vector of float)
+0:260      clamp ( temp 4-component vector of float)
+0:260        'inF0' ( in 4-component vector of float)
+0:260        Constant:
+0:260          0.000000
+0:260        Constant:
+0:260          1.000000
+0:261      Sign ( temp 4-component vector of float)
+0:261        'inF0' ( in 4-component vector of float)
+0:262      sine ( temp 4-component vector of float)
 0:262        'inF0' ( in 4-component vector of float)
-0:263      inverse sqrt ( temp 4-component vector of float)
-0:263        'inF0' ( in 4-component vector of float)
-0:264      clamp ( temp 4-component vector of float)
+0:263      Sequence
+0:263        move second child to first child ( temp 4-component vector of float)
+0:263          'inF1' ( in 4-component vector of float)
+0:263          sine ( temp 4-component vector of float)
+0:263            'inF0' ( in 4-component vector of float)
+0:263        move second child to first child ( temp 4-component vector of float)
+0:263          'inF2' ( in 4-component vector of float)
+0:263          cosine ( temp 4-component vector of float)
+0:263            'inF0' ( in 4-component vector of float)
+0:264      hyp. sine ( temp 4-component vector of float)
 0:264        'inF0' ( in 4-component vector of float)
-0:264        Constant:
-0:264          0.000000
-0:264        Constant:
-0:264          1.000000
-0:265      Sign ( temp 4-component vector of float)
+0:265      smoothstep ( temp 4-component vector of float)
 0:265        'inF0' ( in 4-component vector of float)
-0:266      sine ( temp 4-component vector of float)
+0:265        'inF1' ( in 4-component vector of float)
+0:265        'inF2' ( in 4-component vector of float)
+0:266      sqrt ( temp 4-component vector of float)
 0:266        'inF0' ( in 4-component vector of float)
-0:267      Sequence
-0:267        move second child to first child ( temp 4-component vector of float)
-0:267          'inF1' ( in 4-component vector of float)
-0:267          sine ( temp 4-component vector of float)
-0:267            'inF0' ( in 4-component vector of float)
-0:267        move second child to first child ( temp 4-component vector of float)
-0:267          'inF2' ( in 4-component vector of float)
-0:267          cosine ( temp 4-component vector of float)
-0:267            'inF0' ( in 4-component vector of float)
-0:268      hyp. sine ( temp 4-component vector of float)
+0:267      step ( temp 4-component vector of float)
+0:267        'inF0' ( in 4-component vector of float)
+0:267        'inF1' ( in 4-component vector of float)
+0:268      tangent ( temp 4-component vector of float)
 0:268        'inF0' ( in 4-component vector of float)
-0:269      smoothstep ( temp 4-component vector of float)
+0:269      hyp. tangent ( temp 4-component vector of float)
 0:269        'inF0' ( in 4-component vector of float)
-0:269        'inF1' ( in 4-component vector of float)
-0:269        'inF2' ( in 4-component vector of float)
-0:270      sqrt ( temp 4-component vector of float)
-0:270        'inF0' ( in 4-component vector of float)
-0:271      step ( temp 4-component vector of float)
+0:271      trunc ( temp 4-component vector of float)
 0:271        'inF0' ( in 4-component vector of float)
-0:271        'inF1' ( in 4-component vector of float)
-0:272      tangent ( temp 4-component vector of float)
-0:272        'inF0' ( in 4-component vector of float)
-0:273      hyp. tangent ( temp 4-component vector of float)
-0:273        'inF0' ( in 4-component vector of float)
-0:275      trunc ( temp 4-component vector of float)
-0:275        'inF0' ( in 4-component vector of float)
-0:278      Branch: Return with expression
+0:274      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
 0:?           3.000000
 0:?           4.000000
-0:336  Function Definition: VertexShaderFunction2x2(mf22;mf22;mf22; ( temp 2X2 matrix of float)
-0:336    Function Parameters: 
-0:336      'inF0' ( in 2X2 matrix of float)
-0:336      'inF1' ( in 2X2 matrix of float)
-0:336      'inF2' ( in 2X2 matrix of float)
+0:331  Function Definition: VertexShaderFunction2x2(mf22;mf22;mf22; ( temp 2X2 matrix of float)
+0:331    Function Parameters: 
+0:331      'inF0' ( in 2X2 matrix of float)
+0:331      'inF1' ( in 2X2 matrix of float)
+0:331      'inF2' ( in 2X2 matrix of float)
 0:?     Sequence
-0:338      all ( temp bool)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      Absolute value ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      arc cosine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      any ( temp bool)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      arc sine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      arc tangent ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      arc tangent ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      Ceiling ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      clamp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338        'inF2' ( in 2X2 matrix of float)
-0:338      cosine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      hyp. cosine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      degrees ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      determinant ( temp float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      exp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      exp2 ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      findMSB ( temp int)
-0:338        Constant:
-0:338          7 (const int)
-0:338      findLSB ( temp int)
-0:338        Constant:
-0:338          7 (const int)
-0:338      Floor ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      mod ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      Fraction ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      frexp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      ldexp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      mix ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338        'inF2' ( in 2X2 matrix of float)
-0:338      log ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      matrix-scale ( temp 2X2 matrix of float)
-0:338        log2 ( temp 2X2 matrix of float)
-0:338          'inF0' ( in 2X2 matrix of float)
-0:338        Constant:
-0:338          0.301030
-0:338      log2 ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      max ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      min ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      pow ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      radians ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      roundEven ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      inverse sqrt ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      clamp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        Constant:
-0:338          0.000000
-0:338        Constant:
-0:338          1.000000
-0:338      Sign ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      sine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      Sequence
-0:338        move second child to first child ( temp 2X2 matrix of float)
-0:338          'inF1' ( in 2X2 matrix of float)
-0:338          sine ( temp 2X2 matrix of float)
-0:338            'inF0' ( in 2X2 matrix of float)
-0:338        move second child to first child ( temp 2X2 matrix of float)
-0:338          'inF2' ( in 2X2 matrix of float)
-0:338          cosine ( temp 2X2 matrix of float)
-0:338            'inF0' ( in 2X2 matrix of float)
-0:338      hyp. sine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      smoothstep ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338        'inF2' ( in 2X2 matrix of float)
-0:338      sqrt ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      step ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      tangent ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      hyp. tangent ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      transpose ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      trunc ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:341      Branch: Return with expression
+0:333      all ( temp bool)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      Absolute value ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      arc cosine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      any ( temp bool)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      arc sine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      arc tangent ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      arc tangent ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      Ceiling ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      clamp ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333        'inF2' ( in 2X2 matrix of float)
+0:333      cosine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      hyp. cosine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      degrees ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      determinant ( temp float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      exp ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      exp2 ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      findMSB ( temp int)
+0:333        Constant:
+0:333          7 (const int)
+0:333      findLSB ( temp int)
+0:333        Constant:
+0:333          7 (const int)
+0:333      Floor ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      mod ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      Fraction ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      ldexp ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      mix ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333        'inF2' ( in 2X2 matrix of float)
+0:333      log ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      matrix-scale ( temp 2X2 matrix of float)
+0:333        log2 ( temp 2X2 matrix of float)
+0:333          'inF0' ( in 2X2 matrix of float)
+0:333        Constant:
+0:333          0.301030
+0:333      log2 ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      max ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      min ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      pow ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      radians ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      roundEven ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      inverse sqrt ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      clamp ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        Constant:
+0:333          0.000000
+0:333        Constant:
+0:333          1.000000
+0:333      Sign ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      sine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      Sequence
+0:333        move second child to first child ( temp 2X2 matrix of float)
+0:333          'inF1' ( in 2X2 matrix of float)
+0:333          sine ( temp 2X2 matrix of float)
+0:333            'inF0' ( in 2X2 matrix of float)
+0:333        move second child to first child ( temp 2X2 matrix of float)
+0:333          'inF2' ( in 2X2 matrix of float)
+0:333          cosine ( temp 2X2 matrix of float)
+0:333            'inF0' ( in 2X2 matrix of float)
+0:333      hyp. sine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      smoothstep ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333        'inF2' ( in 2X2 matrix of float)
+0:333      sqrt ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      step ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      tangent ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      hyp. tangent ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      transpose ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      trunc ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:336      Branch: Return with expression
 0:?         Constant:
 0:?           2.000000
 0:?           2.000000
 0:?           2.000000
 0:?           2.000000
-0:345  Function Definition: VertexShaderFunction3x3(mf33;mf33;mf33; ( temp 3X3 matrix of float)
-0:345    Function Parameters: 
-0:345      'inF0' ( in 3X3 matrix of float)
-0:345      'inF1' ( in 3X3 matrix of float)
-0:345      'inF2' ( in 3X3 matrix of float)
+0:340  Function Definition: VertexShaderFunction3x3(mf33;mf33;mf33; ( temp 3X3 matrix of float)
+0:340    Function Parameters: 
+0:340      'inF0' ( in 3X3 matrix of float)
+0:340      'inF1' ( in 3X3 matrix of float)
+0:340      'inF2' ( in 3X3 matrix of float)
 0:?     Sequence
-0:347      all ( temp bool)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      Absolute value ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      arc cosine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      any ( temp bool)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      arc sine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      arc tangent ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      arc tangent ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      Ceiling ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      clamp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347        'inF2' ( in 3X3 matrix of float)
-0:347      cosine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      hyp. cosine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      degrees ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      determinant ( temp float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      exp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      exp2 ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      findMSB ( temp int)
-0:347        Constant:
-0:347          7 (const int)
-0:347      findLSB ( temp int)
-0:347        Constant:
-0:347          7 (const int)
-0:347      Floor ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      mod ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      Fraction ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      frexp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      ldexp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      mix ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347        'inF2' ( in 3X3 matrix of float)
-0:347      log ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      matrix-scale ( temp 3X3 matrix of float)
-0:347        log2 ( temp 3X3 matrix of float)
-0:347          'inF0' ( in 3X3 matrix of float)
-0:347        Constant:
-0:347          0.301030
-0:347      log2 ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      max ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      min ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      pow ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      radians ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      roundEven ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      inverse sqrt ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      clamp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        Constant:
-0:347          0.000000
-0:347        Constant:
-0:347          1.000000
-0:347      Sign ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      sine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      Sequence
-0:347        move second child to first child ( temp 3X3 matrix of float)
-0:347          'inF1' ( in 3X3 matrix of float)
-0:347          sine ( temp 3X3 matrix of float)
-0:347            'inF0' ( in 3X3 matrix of float)
-0:347        move second child to first child ( temp 3X3 matrix of float)
-0:347          'inF2' ( in 3X3 matrix of float)
-0:347          cosine ( temp 3X3 matrix of float)
-0:347            'inF0' ( in 3X3 matrix of float)
-0:347      hyp. sine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      smoothstep ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347        'inF2' ( in 3X3 matrix of float)
-0:347      sqrt ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      step ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      tangent ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      hyp. tangent ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      transpose ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      trunc ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:350      Branch: Return with expression
+0:342      all ( temp bool)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      Absolute value ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      arc cosine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      any ( temp bool)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      arc sine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      arc tangent ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      arc tangent ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      Ceiling ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      clamp ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342        'inF2' ( in 3X3 matrix of float)
+0:342      cosine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      hyp. cosine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      degrees ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      determinant ( temp float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      exp ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      exp2 ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      findMSB ( temp int)
+0:342        Constant:
+0:342          7 (const int)
+0:342      findLSB ( temp int)
+0:342        Constant:
+0:342          7 (const int)
+0:342      Floor ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      mod ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      Fraction ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      ldexp ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      mix ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342        'inF2' ( in 3X3 matrix of float)
+0:342      log ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      matrix-scale ( temp 3X3 matrix of float)
+0:342        log2 ( temp 3X3 matrix of float)
+0:342          'inF0' ( in 3X3 matrix of float)
+0:342        Constant:
+0:342          0.301030
+0:342      log2 ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      max ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      min ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      pow ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      radians ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      roundEven ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      inverse sqrt ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      clamp ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        Constant:
+0:342          0.000000
+0:342        Constant:
+0:342          1.000000
+0:342      Sign ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      sine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      Sequence
+0:342        move second child to first child ( temp 3X3 matrix of float)
+0:342          'inF1' ( in 3X3 matrix of float)
+0:342          sine ( temp 3X3 matrix of float)
+0:342            'inF0' ( in 3X3 matrix of float)
+0:342        move second child to first child ( temp 3X3 matrix of float)
+0:342          'inF2' ( in 3X3 matrix of float)
+0:342          cosine ( temp 3X3 matrix of float)
+0:342            'inF0' ( in 3X3 matrix of float)
+0:342      hyp. sine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      smoothstep ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342        'inF2' ( in 3X3 matrix of float)
+0:342      sqrt ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      step ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      tangent ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      hyp. tangent ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      transpose ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      trunc ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:345      Branch: Return with expression
 0:?         Constant:
 0:?           3.000000
 0:?           3.000000
@@ -946,131 +928,128 @@ Shader version: 450
 0:?           3.000000
 0:?           3.000000
 0:?           3.000000
-0:354  Function Definition: VertexShaderFunction4x4(mf44;mf44;mf44; ( temp 4X4 matrix of float)
-0:354    Function Parameters: 
-0:354      'inF0' ( in 4X4 matrix of float)
-0:354      'inF1' ( in 4X4 matrix of float)
-0:354      'inF2' ( in 4X4 matrix of float)
+0:349  Function Definition: VertexShaderFunction4x4(mf44;mf44;mf44; ( temp 4X4 matrix of float)
+0:349    Function Parameters: 
+0:349      'inF0' ( in 4X4 matrix of float)
+0:349      'inF1' ( in 4X4 matrix of float)
+0:349      'inF2' ( in 4X4 matrix of float)
 0:?     Sequence
-0:356      all ( temp bool)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      Absolute value ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      arc cosine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      any ( temp bool)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      arc sine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      arc tangent ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      arc tangent ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      Ceiling ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      clamp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356        'inF2' ( in 4X4 matrix of float)
-0:356      cosine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      hyp. cosine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      degrees ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      determinant ( temp float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      exp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      exp2 ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      findMSB ( temp int)
-0:356        Constant:
-0:356          7 (const int)
-0:356      findLSB ( temp int)
-0:356        Constant:
-0:356          7 (const int)
-0:356      Floor ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      mod ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      Fraction ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      frexp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      ldexp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      mix ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356        'inF2' ( in 4X4 matrix of float)
-0:356      log ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      matrix-scale ( temp 4X4 matrix of float)
-0:356        log2 ( temp 4X4 matrix of float)
-0:356          'inF0' ( in 4X4 matrix of float)
-0:356        Constant:
-0:356          0.301030
-0:356      log2 ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      max ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      min ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      pow ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      radians ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      roundEven ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      inverse sqrt ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      clamp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        Constant:
-0:356          0.000000
-0:356        Constant:
-0:356          1.000000
-0:356      Sign ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      sine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      Sequence
-0:356        move second child to first child ( temp 4X4 matrix of float)
-0:356          'inF1' ( in 4X4 matrix of float)
-0:356          sine ( temp 4X4 matrix of float)
-0:356            'inF0' ( in 4X4 matrix of float)
-0:356        move second child to first child ( temp 4X4 matrix of float)
-0:356          'inF2' ( in 4X4 matrix of float)
-0:356          cosine ( temp 4X4 matrix of float)
-0:356            'inF0' ( in 4X4 matrix of float)
-0:356      hyp. sine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      smoothstep ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356        'inF2' ( in 4X4 matrix of float)
-0:356      sqrt ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      step ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      tangent ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      hyp. tangent ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      transpose ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      trunc ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:359      Branch: Return with expression
+0:351      all ( temp bool)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      Absolute value ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      arc cosine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      any ( temp bool)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      arc sine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      arc tangent ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      arc tangent ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      Ceiling ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      clamp ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351        'inF2' ( in 4X4 matrix of float)
+0:351      cosine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      hyp. cosine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      degrees ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      determinant ( temp float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      exp ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      exp2 ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      findMSB ( temp int)
+0:351        Constant:
+0:351          7 (const int)
+0:351      findLSB ( temp int)
+0:351        Constant:
+0:351          7 (const int)
+0:351      Floor ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      mod ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      Fraction ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      ldexp ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      mix ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351        'inF2' ( in 4X4 matrix of float)
+0:351      log ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      matrix-scale ( temp 4X4 matrix of float)
+0:351        log2 ( temp 4X4 matrix of float)
+0:351          'inF0' ( in 4X4 matrix of float)
+0:351        Constant:
+0:351          0.301030
+0:351      log2 ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      max ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      min ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      pow ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      radians ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      roundEven ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      inverse sqrt ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      clamp ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        Constant:
+0:351          0.000000
+0:351        Constant:
+0:351          1.000000
+0:351      Sign ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      sine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      Sequence
+0:351        move second child to first child ( temp 4X4 matrix of float)
+0:351          'inF1' ( in 4X4 matrix of float)
+0:351          sine ( temp 4X4 matrix of float)
+0:351            'inF0' ( in 4X4 matrix of float)
+0:351        move second child to first child ( temp 4X4 matrix of float)
+0:351          'inF2' ( in 4X4 matrix of float)
+0:351          cosine ( temp 4X4 matrix of float)
+0:351            'inF0' ( in 4X4 matrix of float)
+0:351      hyp. sine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      smoothstep ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351        'inF2' ( in 4X4 matrix of float)
+0:351      sqrt ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      step ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      tangent ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      hyp. tangent ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      transpose ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      trunc ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:354      Branch: Return with expression
 0:?         Constant:
 0:?           4.000000
 0:?           4.000000
@@ -1088,309 +1067,309 @@ Shader version: 450
 0:?           4.000000
 0:?           4.000000
 0:?           4.000000
-0:377  Function Definition: TestGenMul2(f1;f1;vf2;vf2;mf22;mf22; ( temp void)
-0:377    Function Parameters: 
-0:377      'inF0' ( in float)
-0:377      'inF1' ( in float)
-0:377      'inFV0' ( in 2-component vector of float)
-0:377      'inFV1' ( in 2-component vector of float)
-0:377      'inFM0' ( in 2X2 matrix of float)
-0:377      'inFM1' ( in 2X2 matrix of float)
+0:372  Function Definition: TestGenMul2(f1;f1;vf2;vf2;mf22;mf22; ( temp void)
+0:372    Function Parameters: 
+0:372      'inF0' ( in float)
+0:372      'inF1' ( in float)
+0:372      'inFV0' ( in 2-component vector of float)
+0:372      'inFV1' ( in 2-component vector of float)
+0:372      'inFM0' ( in 2X2 matrix of float)
+0:372      'inFM1' ( in 2X2 matrix of float)
 0:?     Sequence
-0:378      Sequence
-0:378        move second child to first child ( temp float)
-0:378          'r0' ( temp float)
-0:378          component-wise multiply ( temp float)
-0:378            'inF1' ( in float)
-0:378            'inF0' ( in float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2-component vector of float)
-0:378          'r1' ( temp 2-component vector of float)
-0:378          vector-scale ( temp 2-component vector of float)
-0:378            'inF0' ( in float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2-component vector of float)
-0:378          'r2' ( temp 2-component vector of float)
-0:378          vector-scale ( temp 2-component vector of float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378            'inF0' ( in float)
-0:378      Sequence
-0:378        move second child to first child ( temp float)
-0:378          'r3' ( temp float)
-0:378          dot-product ( temp float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378            'inFV1' ( in 2-component vector of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2-component vector of float)
-0:378          'r4' ( temp 2-component vector of float)
-0:378          vector-times-matrix ( temp 2-component vector of float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2-component vector of float)
-0:378          'r5' ( temp 2-component vector of float)
-0:378          matrix-times-vector ( temp 2-component vector of float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2X2 matrix of float)
-0:378          'r6' ( temp 2X2 matrix of float)
-0:378          matrix-scale ( temp 2X2 matrix of float)
-0:378            'inF0' ( in float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2X2 matrix of float)
-0:378          'r7' ( temp 2X2 matrix of float)
-0:378          matrix-scale ( temp 2X2 matrix of float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:378            'inF0' ( in float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2X2 matrix of float)
-0:378          'r8' ( temp 2X2 matrix of float)
-0:378          matrix-multiply ( temp 2X2 matrix of float)
-0:378            'inFM1' ( in 2X2 matrix of float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:384  Function Definition: TestGenMul3(f1;f1;vf3;vf3;mf33;mf33; ( temp void)
-0:384    Function Parameters: 
-0:384      'inF0' ( in float)
-0:384      'inF1' ( in float)
-0:384      'inFV0' ( in 3-component vector of float)
-0:384      'inFV1' ( in 3-component vector of float)
-0:384      'inFM0' ( in 3X3 matrix of float)
-0:384      'inFM1' ( in 3X3 matrix of float)
+0:373      Sequence
+0:373        move second child to first child ( temp float)
+0:373          'r0' ( temp float)
+0:373          component-wise multiply ( temp float)
+0:373            'inF1' ( in float)
+0:373            'inF0' ( in float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2-component vector of float)
+0:373          'r1' ( temp 2-component vector of float)
+0:373          vector-scale ( temp 2-component vector of float)
+0:373            'inF0' ( in float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2-component vector of float)
+0:373          'r2' ( temp 2-component vector of float)
+0:373          vector-scale ( temp 2-component vector of float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373            'inF0' ( in float)
+0:373      Sequence
+0:373        move second child to first child ( temp float)
+0:373          'r3' ( temp float)
+0:373          dot-product ( temp float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373            'inFV1' ( in 2-component vector of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2-component vector of float)
+0:373          'r4' ( temp 2-component vector of float)
+0:373          vector-times-matrix ( temp 2-component vector of float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2-component vector of float)
+0:373          'r5' ( temp 2-component vector of float)
+0:373          matrix-times-vector ( temp 2-component vector of float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2X2 matrix of float)
+0:373          'r6' ( temp 2X2 matrix of float)
+0:373          matrix-scale ( temp 2X2 matrix of float)
+0:373            'inF0' ( in float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2X2 matrix of float)
+0:373          'r7' ( temp 2X2 matrix of float)
+0:373          matrix-scale ( temp 2X2 matrix of float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:373            'inF0' ( in float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2X2 matrix of float)
+0:373          'r8' ( temp 2X2 matrix of float)
+0:373          matrix-multiply ( temp 2X2 matrix of float)
+0:373            'inFM1' ( in 2X2 matrix of float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:379  Function Definition: TestGenMul3(f1;f1;vf3;vf3;mf33;mf33; ( temp void)
+0:379    Function Parameters: 
+0:379      'inF0' ( in float)
+0:379      'inF1' ( in float)
+0:379      'inFV0' ( in 3-component vector of float)
+0:379      'inFV1' ( in 3-component vector of float)
+0:379      'inFM0' ( in 3X3 matrix of float)
+0:379      'inFM1' ( in 3X3 matrix of float)
 0:?     Sequence
-0:385      Sequence
-0:385        move second child to first child ( temp float)
-0:385          'r0' ( temp float)
-0:385          component-wise multiply ( temp float)
-0:385            'inF1' ( in float)
-0:385            'inF0' ( in float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3-component vector of float)
-0:385          'r1' ( temp 3-component vector of float)
-0:385          vector-scale ( temp 3-component vector of float)
-0:385            'inF0' ( in float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3-component vector of float)
-0:385          'r2' ( temp 3-component vector of float)
-0:385          vector-scale ( temp 3-component vector of float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385            'inF0' ( in float)
-0:385      Sequence
-0:385        move second child to first child ( temp float)
-0:385          'r3' ( temp float)
-0:385          dot-product ( temp float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385            'inFV1' ( in 3-component vector of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3-component vector of float)
-0:385          'r4' ( temp 3-component vector of float)
-0:385          vector-times-matrix ( temp 3-component vector of float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3-component vector of float)
-0:385          'r5' ( temp 3-component vector of float)
-0:385          matrix-times-vector ( temp 3-component vector of float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3X3 matrix of float)
-0:385          'r6' ( temp 3X3 matrix of float)
-0:385          matrix-scale ( temp 3X3 matrix of float)
-0:385            'inF0' ( in float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3X3 matrix of float)
-0:385          'r7' ( temp 3X3 matrix of float)
-0:385          matrix-scale ( temp 3X3 matrix of float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:385            'inF0' ( in float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3X3 matrix of float)
-0:385          'r8' ( temp 3X3 matrix of float)
-0:385          matrix-multiply ( temp 3X3 matrix of float)
-0:385            'inFM1' ( in 3X3 matrix of float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:391  Function Definition: TestGenMul4(f1;f1;vf4;vf4;mf44;mf44; ( temp void)
-0:391    Function Parameters: 
-0:391      'inF0' ( in float)
-0:391      'inF1' ( in float)
-0:391      'inFV0' ( in 4-component vector of float)
-0:391      'inFV1' ( in 4-component vector of float)
-0:391      'inFM0' ( in 4X4 matrix of float)
-0:391      'inFM1' ( in 4X4 matrix of float)
+0:380      Sequence
+0:380        move second child to first child ( temp float)
+0:380          'r0' ( temp float)
+0:380          component-wise multiply ( temp float)
+0:380            'inF1' ( in float)
+0:380            'inF0' ( in float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3-component vector of float)
+0:380          'r1' ( temp 3-component vector of float)
+0:380          vector-scale ( temp 3-component vector of float)
+0:380            'inF0' ( in float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3-component vector of float)
+0:380          'r2' ( temp 3-component vector of float)
+0:380          vector-scale ( temp 3-component vector of float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380            'inF0' ( in float)
+0:380      Sequence
+0:380        move second child to first child ( temp float)
+0:380          'r3' ( temp float)
+0:380          dot-product ( temp float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380            'inFV1' ( in 3-component vector of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3-component vector of float)
+0:380          'r4' ( temp 3-component vector of float)
+0:380          vector-times-matrix ( temp 3-component vector of float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3-component vector of float)
+0:380          'r5' ( temp 3-component vector of float)
+0:380          matrix-times-vector ( temp 3-component vector of float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3X3 matrix of float)
+0:380          'r6' ( temp 3X3 matrix of float)
+0:380          matrix-scale ( temp 3X3 matrix of float)
+0:380            'inF0' ( in float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3X3 matrix of float)
+0:380          'r7' ( temp 3X3 matrix of float)
+0:380          matrix-scale ( temp 3X3 matrix of float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:380            'inF0' ( in float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3X3 matrix of float)
+0:380          'r8' ( temp 3X3 matrix of float)
+0:380          matrix-multiply ( temp 3X3 matrix of float)
+0:380            'inFM1' ( in 3X3 matrix of float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:386  Function Definition: TestGenMul4(f1;f1;vf4;vf4;mf44;mf44; ( temp void)
+0:386    Function Parameters: 
+0:386      'inF0' ( in float)
+0:386      'inF1' ( in float)
+0:386      'inFV0' ( in 4-component vector of float)
+0:386      'inFV1' ( in 4-component vector of float)
+0:386      'inFM0' ( in 4X4 matrix of float)
+0:386      'inFM1' ( in 4X4 matrix of float)
 0:?     Sequence
-0:392      Sequence
-0:392        move second child to first child ( temp float)
-0:392          'r0' ( temp float)
-0:392          component-wise multiply ( temp float)
-0:392            'inF1' ( in float)
-0:392            'inF0' ( in float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4-component vector of float)
-0:392          'r1' ( temp 4-component vector of float)
-0:392          vector-scale ( temp 4-component vector of float)
-0:392            'inF0' ( in float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4-component vector of float)
-0:392          'r2' ( temp 4-component vector of float)
-0:392          vector-scale ( temp 4-component vector of float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392            'inF0' ( in float)
-0:392      Sequence
-0:392        move second child to first child ( temp float)
-0:392          'r3' ( temp float)
-0:392          dot-product ( temp float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392            'inFV1' ( in 4-component vector of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4-component vector of float)
-0:392          'r4' ( temp 4-component vector of float)
-0:392          vector-times-matrix ( temp 4-component vector of float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4-component vector of float)
-0:392          'r5' ( temp 4-component vector of float)
-0:392          matrix-times-vector ( temp 4-component vector of float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4X4 matrix of float)
-0:392          'r6' ( temp 4X4 matrix of float)
-0:392          matrix-scale ( temp 4X4 matrix of float)
-0:392            'inF0' ( in float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4X4 matrix of float)
-0:392          'r7' ( temp 4X4 matrix of float)
-0:392          matrix-scale ( temp 4X4 matrix of float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:392            'inF0' ( in float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4X4 matrix of float)
-0:392          'r8' ( temp 4X4 matrix of float)
-0:392          matrix-multiply ( temp 4X4 matrix of float)
-0:392            'inFM1' ( in 4X4 matrix of float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:401  Function Definition: TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24; ( temp void)
-0:401    Function Parameters: 
-0:401      'inF0' ( in float)
-0:401      'inF1' ( in float)
-0:401      'inFV2' ( in 2-component vector of float)
-0:401      'inFV3' ( in 3-component vector of float)
-0:401      'inFM2x3' ( in 2X3 matrix of float)
-0:401      'inFM3x2' ( in 3X2 matrix of float)
-0:401      'inFM3x3' ( in 3X3 matrix of float)
-0:401      'inFM3x4' ( in 3X4 matrix of float)
-0:401      'inFM2x4' ( in 2X4 matrix of float)
+0:387      Sequence
+0:387        move second child to first child ( temp float)
+0:387          'r0' ( temp float)
+0:387          component-wise multiply ( temp float)
+0:387            'inF1' ( in float)
+0:387            'inF0' ( in float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4-component vector of float)
+0:387          'r1' ( temp 4-component vector of float)
+0:387          vector-scale ( temp 4-component vector of float)
+0:387            'inF0' ( in float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4-component vector of float)
+0:387          'r2' ( temp 4-component vector of float)
+0:387          vector-scale ( temp 4-component vector of float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387            'inF0' ( in float)
+0:387      Sequence
+0:387        move second child to first child ( temp float)
+0:387          'r3' ( temp float)
+0:387          dot-product ( temp float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387            'inFV1' ( in 4-component vector of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4-component vector of float)
+0:387          'r4' ( temp 4-component vector of float)
+0:387          vector-times-matrix ( temp 4-component vector of float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4-component vector of float)
+0:387          'r5' ( temp 4-component vector of float)
+0:387          matrix-times-vector ( temp 4-component vector of float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4X4 matrix of float)
+0:387          'r6' ( temp 4X4 matrix of float)
+0:387          matrix-scale ( temp 4X4 matrix of float)
+0:387            'inF0' ( in float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4X4 matrix of float)
+0:387          'r7' ( temp 4X4 matrix of float)
+0:387          matrix-scale ( temp 4X4 matrix of float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:387            'inF0' ( in float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4X4 matrix of float)
+0:387          'r8' ( temp 4X4 matrix of float)
+0:387          matrix-multiply ( temp 4X4 matrix of float)
+0:387            'inFM1' ( in 4X4 matrix of float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:396  Function Definition: TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24; ( temp void)
+0:396    Function Parameters: 
+0:396      'inF0' ( in float)
+0:396      'inF1' ( in float)
+0:396      'inFV2' ( in 2-component vector of float)
+0:396      'inFV3' ( in 3-component vector of float)
+0:396      'inFM2x3' ( in 2X3 matrix of float)
+0:396      'inFM3x2' ( in 3X2 matrix of float)
+0:396      'inFM3x3' ( in 3X3 matrix of float)
+0:396      'inFM3x4' ( in 3X4 matrix of float)
+0:396      'inFM2x4' ( in 2X4 matrix of float)
 0:?     Sequence
+0:397      Sequence
+0:397        move second child to first child ( temp float)
+0:397          'r00' ( temp float)
+0:397          component-wise multiply ( temp float)
+0:397            'inF1' ( in float)
+0:397            'inF0' ( in float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2-component vector of float)
+0:398          'r01' ( temp 2-component vector of float)
+0:398          vector-scale ( temp 2-component vector of float)
+0:398            'inF0' ( in float)
+0:398            'inFV2' ( in 2-component vector of float)
+0:399      Sequence
+0:399        move second child to first child ( temp 3-component vector of float)
+0:399          'r02' ( temp 3-component vector of float)
+0:399          vector-scale ( temp 3-component vector of float)
+0:399            'inF0' ( in float)
+0:399            'inFV3' ( in 3-component vector of float)
+0:400      Sequence
+0:400        move second child to first child ( temp 2-component vector of float)
+0:400          'r03' ( temp 2-component vector of float)
+0:400          vector-scale ( temp 2-component vector of float)
+0:400            'inFV2' ( in 2-component vector of float)
+0:400            'inF0' ( in float)
+0:401      Sequence
+0:401        move second child to first child ( temp 3-component vector of float)
+0:401          'r04' ( temp 3-component vector of float)
+0:401          vector-scale ( temp 3-component vector of float)
+0:401            'inFV3' ( in 3-component vector of float)
+0:401            'inF0' ( in float)
 0:402      Sequence
 0:402        move second child to first child ( temp float)
-0:402          'r00' ( temp float)
-0:402          component-wise multiply ( temp float)
-0:402            'inF1' ( in float)
-0:402            'inF0' ( in float)
+0:402          'r05' ( temp float)
+0:402          dot-product ( temp float)
+0:402            'inFV2' ( in 2-component vector of float)
+0:402            'inFV2' ( in 2-component vector of float)
 0:403      Sequence
-0:403        move second child to first child ( temp 2-component vector of float)
-0:403          'r01' ( temp 2-component vector of float)
-0:403          vector-scale ( temp 2-component vector of float)
-0:403            'inF0' ( in float)
-0:403            'inFV2' ( in 2-component vector of float)
+0:403        move second child to first child ( temp float)
+0:403          'r06' ( temp float)
+0:403          dot-product ( temp float)
+0:403            'inFV3' ( in 3-component vector of float)
+0:403            'inFV3' ( in 3-component vector of float)
 0:404      Sequence
 0:404        move second child to first child ( temp 3-component vector of float)
-0:404          'r02' ( temp 3-component vector of float)
-0:404          vector-scale ( temp 3-component vector of float)
-0:404            'inF0' ( in float)
-0:404            'inFV3' ( in 3-component vector of float)
+0:404          'r07' ( temp 3-component vector of float)
+0:404          matrix-times-vector ( temp 3-component vector of float)
+0:404            'inFM2x3' ( in 2X3 matrix of float)
+0:404            'inFV2' ( in 2-component vector of float)
 0:405      Sequence
 0:405        move second child to first child ( temp 2-component vector of float)
-0:405          'r03' ( temp 2-component vector of float)
-0:405          vector-scale ( temp 2-component vector of float)
-0:405            'inFV2' ( in 2-component vector of float)
-0:405            'inF0' ( in float)
+0:405          'r08' ( temp 2-component vector of float)
+0:405          matrix-times-vector ( temp 2-component vector of float)
+0:405            'inFM3x2' ( in 3X2 matrix of float)
+0:405            'inFV3' ( in 3-component vector of float)
 0:406      Sequence
-0:406        move second child to first child ( temp 3-component vector of float)
-0:406          'r04' ( temp 3-component vector of float)
-0:406          vector-scale ( temp 3-component vector of float)
+0:406        move second child to first child ( temp 2-component vector of float)
+0:406          'r09' ( temp 2-component vector of float)
+0:406          vector-times-matrix ( temp 2-component vector of float)
 0:406            'inFV3' ( in 3-component vector of float)
-0:406            'inF0' ( in float)
+0:406            'inFM2x3' ( in 2X3 matrix of float)
 0:407      Sequence
-0:407        move second child to first child ( temp float)
-0:407          'r05' ( temp float)
-0:407          dot-product ( temp float)
-0:407            'inFV2' ( in 2-component vector of float)
+0:407        move second child to first child ( temp 3-component vector of float)
+0:407          'r10' ( temp 3-component vector of float)
+0:407          vector-times-matrix ( temp 3-component vector of float)
 0:407            'inFV2' ( in 2-component vector of float)
+0:407            'inFM3x2' ( in 3X2 matrix of float)
 0:408      Sequence
-0:408        move second child to first child ( temp float)
-0:408          'r06' ( temp float)
-0:408          dot-product ( temp float)
-0:408            'inFV3' ( in 3-component vector of float)
-0:408            'inFV3' ( in 3-component vector of float)
+0:408        move second child to first child ( temp 2X3 matrix of float)
+0:408          'r11' ( temp 2X3 matrix of float)
+0:408          matrix-scale ( temp 2X3 matrix of float)
+0:408            'inF0' ( in float)
+0:408            'inFM2x3' ( in 2X3 matrix of float)
 0:409      Sequence
-0:409        move second child to first child ( temp 3-component vector of float)
-0:409          'r07' ( temp 3-component vector of float)
-0:409          matrix-times-vector ( temp 3-component vector of float)
-0:409            'inFM2x3' ( in 2X3 matrix of float)
-0:409            'inFV2' ( in 2-component vector of float)
+0:409        move second child to first child ( temp 3X2 matrix of float)
+0:409          'r12' ( temp 3X2 matrix of float)
+0:409          matrix-scale ( temp 3X2 matrix of float)
+0:409            'inF0' ( in float)
+0:409            'inFM3x2' ( in 3X2 matrix of float)
 0:410      Sequence
-0:410        move second child to first child ( temp 2-component vector of float)
-0:410          'r08' ( temp 2-component vector of float)
-0:410          matrix-times-vector ( temp 2-component vector of float)
+0:410        move second child to first child ( temp 2X2 matrix of float)
+0:410          'r13' ( temp 2X2 matrix of float)
+0:410          matrix-multiply ( temp 2X2 matrix of float)
 0:410            'inFM3x2' ( in 3X2 matrix of float)
-0:410            'inFV3' ( in 3-component vector of float)
+0:410            'inFM2x3' ( in 2X3 matrix of float)
 0:411      Sequence
-0:411        move second child to first child ( temp 2-component vector of float)
-0:411          'r09' ( temp 2-component vector of float)
-0:411          vector-times-matrix ( temp 2-component vector of float)
-0:411            'inFV3' ( in 3-component vector of float)
+0:411        move second child to first child ( temp 2X3 matrix of float)
+0:411          'r14' ( temp 2X3 matrix of float)
+0:411          matrix-multiply ( temp 2X3 matrix of float)
+0:411            'inFM3x3' ( in 3X3 matrix of float)
 0:411            'inFM2x3' ( in 2X3 matrix of float)
 0:412      Sequence
-0:412        move second child to first child ( temp 3-component vector of float)
-0:412          'r10' ( temp 3-component vector of float)
-0:412          vector-times-matrix ( temp 3-component vector of float)
-0:412            'inFV2' ( in 2-component vector of float)
-0:412            'inFM3x2' ( in 3X2 matrix of float)
+0:412        move second child to first child ( temp 2X4 matrix of float)
+0:412          'r15' ( temp 2X4 matrix of float)
+0:412          matrix-multiply ( temp 2X4 matrix of float)
+0:412            'inFM3x4' ( in 3X4 matrix of float)
+0:412            'inFM2x3' ( in 2X3 matrix of float)
 0:413      Sequence
-0:413        move second child to first child ( temp 2X3 matrix of float)
-0:413          'r11' ( temp 2X3 matrix of float)
-0:413          matrix-scale ( temp 2X3 matrix of float)
-0:413            'inF0' ( in float)
-0:413            'inFM2x3' ( in 2X3 matrix of float)
-0:414      Sequence
-0:414        move second child to first child ( temp 3X2 matrix of float)
-0:414          'r12' ( temp 3X2 matrix of float)
-0:414          matrix-scale ( temp 3X2 matrix of float)
-0:414            'inF0' ( in float)
-0:414            'inFM3x2' ( in 3X2 matrix of float)
-0:415      Sequence
-0:415        move second child to first child ( temp 2X2 matrix of float)
-0:415          'r13' ( temp 2X2 matrix of float)
-0:415          matrix-multiply ( temp 2X2 matrix of float)
-0:415            'inFM3x2' ( in 3X2 matrix of float)
-0:415            'inFM2x3' ( in 2X3 matrix of float)
-0:416      Sequence
-0:416        move second child to first child ( temp 2X3 matrix of float)
-0:416          'r14' ( temp 2X3 matrix of float)
-0:416          matrix-multiply ( temp 2X3 matrix of float)
-0:416            'inFM3x3' ( in 3X3 matrix of float)
-0:416            'inFM2x3' ( in 2X3 matrix of float)
-0:417      Sequence
-0:417        move second child to first child ( temp 2X4 matrix of float)
-0:417          'r15' ( temp 2X4 matrix of float)
-0:417          matrix-multiply ( temp 2X4 matrix of float)
-0:417            'inFM3x4' ( in 3X4 matrix of float)
-0:417            'inFM2x3' ( in 2X3 matrix of float)
-0:418      Sequence
-0:418        move second child to first child ( temp 3X4 matrix of float)
-0:418          'r16' ( temp 3X4 matrix of float)
-0:418          matrix-multiply ( temp 3X4 matrix of float)
-0:418            'inFM2x4' ( in 2X4 matrix of float)
-0:418            'inFM3x2' ( in 3X2 matrix of float)
+0:413        move second child to first child ( temp 3X4 matrix of float)
+0:413          'r16' ( temp 3X4 matrix of float)
+0:413          matrix-multiply ( temp 3X4 matrix of float)
+0:413            'inFM2x4' ( in 2X4 matrix of float)
+0:413            'inFM3x2' ( in 3X2 matrix of float)
 0:?   Linker Objects
 
 
@@ -1398,7 +1377,7 @@ Linked vertex stage:
 
 WARNING: Linking vertex stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:2  Function Definition: VertexShaderFunctionS(f1;f1;f1;u1;u1; ( temp float)
 0:2    Function Parameters: 
@@ -1461,880 +1440,862 @@ Shader version: 450
 0:29        'inF1' ( in float)
 0:30      Fraction ( temp float)
 0:30        'inF0' ( in float)
-0:31      frexp ( temp float)
+0:31      isinf ( temp bool)
 0:31        'inF0' ( in float)
-0:31        'inF1' ( in float)
-0:32      isinf ( temp bool)
+0:32      isnan ( temp bool)
 0:32        'inF0' ( in float)
-0:33      isnan ( temp bool)
+0:33      ldexp ( temp float)
 0:33        'inF0' ( in float)
-0:34      ldexp ( temp float)
+0:33        'inF1' ( in float)
+0:34      mix ( temp float)
 0:34        'inF0' ( in float)
 0:34        'inF1' ( in float)
-0:35      mix ( temp float)
+0:34        'inF2' ( in float)
+0:35      log ( temp float)
 0:35        'inF0' ( in float)
-0:35        'inF1' ( in float)
-0:35        'inF2' ( in float)
-0:36      log ( temp float)
-0:36        'inF0' ( in float)
-0:37      component-wise multiply ( temp float)
-0:37        log2 ( temp float)
-0:37          'inF0' ( in float)
-0:37        Constant:
-0:37          0.301030
-0:38      log2 ( temp float)
+0:36      component-wise multiply ( temp float)
+0:36        log2 ( temp float)
+0:36          'inF0' ( in float)
+0:36        Constant:
+0:36          0.301030
+0:37      log2 ( temp float)
+0:37        'inF0' ( in float)
+0:38      max ( temp float)
 0:38        'inF0' ( in float)
-0:39      max ( temp float)
+0:38        'inF1' ( in float)
+0:39      min ( temp float)
 0:39        'inF0' ( in float)
 0:39        'inF1' ( in float)
-0:40      min ( temp float)
-0:40        'inF0' ( in float)
-0:40        'inF1' ( in float)
-0:42      pow ( temp float)
+0:41      pow ( temp float)
+0:41        'inF0' ( in float)
+0:41        'inF1' ( in float)
+0:42      radians ( temp float)
 0:42        'inF0' ( in float)
-0:42        'inF1' ( in float)
-0:43      radians ( temp float)
-0:43        'inF0' ( in float)
-0:44      bitFieldReverse ( temp int)
-0:44        Constant:
-0:44          2 (const int)
-0:45      roundEven ( temp float)
+0:43      bitFieldReverse ( temp int)
+0:43        Constant:
+0:43          2 (const int)
+0:44      roundEven ( temp float)
+0:44        'inF0' ( in float)
+0:45      inverse sqrt ( temp float)
 0:45        'inF0' ( in float)
-0:46      inverse sqrt ( temp float)
+0:46      clamp ( temp float)
 0:46        'inF0' ( in float)
-0:47      clamp ( temp float)
+0:46        Constant:
+0:46          0.000000
+0:46        Constant:
+0:46          1.000000
+0:47      Sign ( temp float)
 0:47        'inF0' ( in float)
-0:47        Constant:
-0:47          0.000000
-0:47        Constant:
-0:47          1.000000
-0:48      Sign ( temp float)
+0:48      sine ( temp float)
 0:48        'inF0' ( in float)
-0:49      sine ( temp float)
-0:49        'inF0' ( in float)
-0:50      Sequence
-0:50        move second child to first child ( temp float)
-0:50          'inF1' ( in float)
-0:50          sine ( temp float)
-0:50            'inF0' ( in float)
-0:50        move second child to first child ( temp float)
-0:50          'inF2' ( in float)
-0:50          cosine ( temp float)
-0:50            'inF0' ( in float)
-0:51      hyp. sine ( temp float)
+0:49      Sequence
+0:49        move second child to first child ( temp float)
+0:49          'inF1' ( in float)
+0:49          sine ( temp float)
+0:49            'inF0' ( in float)
+0:49        move second child to first child ( temp float)
+0:49          'inF2' ( in float)
+0:49          cosine ( temp float)
+0:49            'inF0' ( in float)
+0:50      hyp. sine ( temp float)
+0:50        'inF0' ( in float)
+0:51      smoothstep ( temp float)
 0:51        'inF0' ( in float)
-0:52      smoothstep ( temp float)
+0:51        'inF1' ( in float)
+0:51        'inF2' ( in float)
+0:52      sqrt ( temp float)
 0:52        'inF0' ( in float)
-0:52        'inF1' ( in float)
-0:52        'inF2' ( in float)
-0:53      sqrt ( temp float)
+0:53      step ( temp float)
 0:53        'inF0' ( in float)
-0:54      step ( temp float)
+0:53        'inF1' ( in float)
+0:54      tangent ( temp float)
 0:54        'inF0' ( in float)
-0:54        'inF1' ( in float)
-0:55      tangent ( temp float)
+0:55      hyp. tangent ( temp float)
 0:55        'inF0' ( in float)
-0:56      hyp. tangent ( temp float)
-0:56        'inF0' ( in float)
-0:58      trunc ( temp float)
-0:58        'inF0' ( in float)
-0:60      Branch: Return with expression
-0:60        Constant:
-0:60          0.000000
-0:64  Function Definition: VertexShaderFunction1(vf1;vf1;vf1; ( temp 1-component vector of float)
-0:64    Function Parameters: 
-0:64      'inF0' ( in 1-component vector of float)
-0:64      'inF1' ( in 1-component vector of float)
-0:64      'inF2' ( in 1-component vector of float)
+0:57      trunc ( temp float)
+0:57        'inF0' ( in float)
+0:59      Branch: Return with expression
+0:59        Constant:
+0:59          0.000000
+0:63  Function Definition: VertexShaderFunction1(vf1;vf1;vf1; ( temp 1-component vector of float)
+0:63    Function Parameters: 
+0:63      'inF0' ( in 1-component vector of float)
+0:63      'inF1' ( in 1-component vector of float)
+0:63      'inF2' ( in 1-component vector of float)
 0:?     Sequence
-0:66      Branch: Return with expression
-0:66        Constant:
-0:66          0.000000
-0:70  Function Definition: VertexShaderFunction2(vf2;vf2;vf2;vu2;vu2; ( temp 2-component vector of float)
-0:70    Function Parameters: 
-0:70      'inF0' ( in 2-component vector of float)
-0:70      'inF1' ( in 2-component vector of float)
-0:70      'inF2' ( in 2-component vector of float)
-0:70      'inU0' ( in 2-component vector of uint)
-0:70      'inU1' ( in 2-component vector of uint)
+0:65      Branch: Return with expression
+0:65        Constant:
+0:65          0.000000
+0:69  Function Definition: VertexShaderFunction2(vf2;vf2;vf2;vu2;vu2; ( temp 2-component vector of float)
+0:69    Function Parameters: 
+0:69      'inF0' ( in 2-component vector of float)
+0:69      'inF1' ( in 2-component vector of float)
+0:69      'inF2' ( in 2-component vector of float)
+0:69      'inU0' ( in 2-component vector of uint)
+0:69      'inU1' ( in 2-component vector of uint)
 0:?     Sequence
-0:71      all ( temp bool)
+0:70      all ( temp bool)
+0:70        'inF0' ( in 2-component vector of float)
+0:71      Absolute value ( temp 2-component vector of float)
 0:71        'inF0' ( in 2-component vector of float)
-0:72      Absolute value ( temp 2-component vector of float)
+0:72      arc cosine ( temp 2-component vector of float)
 0:72        'inF0' ( in 2-component vector of float)
-0:73      arc cosine ( temp 2-component vector of float)
+0:73      any ( temp bool)
 0:73        'inF0' ( in 2-component vector of float)
-0:74      any ( temp bool)
+0:74      arc sine ( temp 2-component vector of float)
 0:74        'inF0' ( in 2-component vector of float)
-0:75      arc sine ( temp 2-component vector of float)
+0:75      floatBitsToInt ( temp 2-component vector of int)
 0:75        'inF0' ( in 2-component vector of float)
-0:76      floatBitsToInt ( temp 2-component vector of int)
+0:76      floatBitsToUint ( temp 2-component vector of uint)
 0:76        'inF0' ( in 2-component vector of float)
-0:77      floatBitsToUint ( temp 2-component vector of uint)
-0:77        'inF0' ( in 2-component vector of float)
-0:78      intBitsToFloat ( temp 2-component vector of float)
-0:78        'inU0' ( in 2-component vector of uint)
+0:77      intBitsToFloat ( temp 2-component vector of float)
+0:77        'inU0' ( in 2-component vector of uint)
+0:79      arc tangent ( temp 2-component vector of float)
+0:79        'inF0' ( in 2-component vector of float)
 0:80      arc tangent ( temp 2-component vector of float)
 0:80        'inF0' ( in 2-component vector of float)
-0:81      arc tangent ( temp 2-component vector of float)
+0:80        'inF1' ( in 2-component vector of float)
+0:81      Ceiling ( temp 2-component vector of float)
 0:81        'inF0' ( in 2-component vector of float)
-0:81        'inF1' ( in 2-component vector of float)
-0:82      Ceiling ( temp 2-component vector of float)
+0:82      clamp ( temp 2-component vector of float)
 0:82        'inF0' ( in 2-component vector of float)
-0:83      clamp ( temp 2-component vector of float)
+0:82        'inF1' ( in 2-component vector of float)
+0:82        'inF2' ( in 2-component vector of float)
+0:83      cosine ( temp 2-component vector of float)
 0:83        'inF0' ( in 2-component vector of float)
-0:83        'inF1' ( in 2-component vector of float)
-0:83        'inF2' ( in 2-component vector of float)
-0:84      cosine ( temp 2-component vector of float)
+0:84      hyp. cosine ( temp 2-component vector of float)
 0:84        'inF0' ( in 2-component vector of float)
-0:85      hyp. cosine ( temp 2-component vector of float)
-0:85        'inF0' ( in 2-component vector of float)
 0:?       bitCount ( temp 2-component vector of int)
 0:?         Constant:
 0:?           7 (const int)
 0:?           3 (const int)
-0:87      degrees ( temp 2-component vector of float)
+0:86      degrees ( temp 2-component vector of float)
+0:86        'inF0' ( in 2-component vector of float)
+0:87      distance ( temp float)
 0:87        'inF0' ( in 2-component vector of float)
-0:88      distance ( temp float)
+0:87        'inF1' ( in 2-component vector of float)
+0:88      dot-product ( temp float)
 0:88        'inF0' ( in 2-component vector of float)
 0:88        'inF1' ( in 2-component vector of float)
-0:89      dot-product ( temp float)
-0:89        'inF0' ( in 2-component vector of float)
-0:89        'inF1' ( in 2-component vector of float)
-0:93      exp ( temp 2-component vector of float)
+0:92      exp ( temp 2-component vector of float)
+0:92        'inF0' ( in 2-component vector of float)
+0:93      exp2 ( temp 2-component vector of float)
 0:93        'inF0' ( in 2-component vector of float)
-0:94      exp2 ( temp 2-component vector of float)
+0:94      face-forward ( temp 2-component vector of float)
 0:94        'inF0' ( in 2-component vector of float)
-0:95      face-forward ( temp 2-component vector of float)
-0:95        'inF0' ( in 2-component vector of float)
-0:95        'inF1' ( in 2-component vector of float)
-0:95        'inF2' ( in 2-component vector of float)
-0:96      findMSB ( temp int)
+0:94        'inF1' ( in 2-component vector of float)
+0:94        'inF2' ( in 2-component vector of float)
+0:95      findMSB ( temp int)
+0:95        Constant:
+0:95          7 (const int)
+0:96      findLSB ( temp int)
 0:96        Constant:
 0:96          7 (const int)
-0:97      findLSB ( temp int)
-0:97        Constant:
-0:97          7 (const int)
-0:98      Floor ( temp 2-component vector of float)
-0:98        'inF0' ( in 2-component vector of float)
-0:100      mod ( temp 2-component vector of float)
+0:97      Floor ( temp 2-component vector of float)
+0:97        'inF0' ( in 2-component vector of float)
+0:99      mod ( temp 2-component vector of float)
+0:99        'inF0' ( in 2-component vector of float)
+0:99        'inF1' ( in 2-component vector of float)
+0:100      Fraction ( temp 2-component vector of float)
 0:100        'inF0' ( in 2-component vector of float)
-0:100        'inF1' ( in 2-component vector of float)
-0:101      Fraction ( temp 2-component vector of float)
+0:101      isinf ( temp 2-component vector of bool)
 0:101        'inF0' ( in 2-component vector of float)
-0:102      frexp ( temp 2-component vector of float)
+0:102      isnan ( temp 2-component vector of bool)
 0:102        'inF0' ( in 2-component vector of float)
-0:102        'inF1' ( in 2-component vector of float)
-0:103      isinf ( temp 2-component vector of bool)
+0:103      ldexp ( temp 2-component vector of float)
 0:103        'inF0' ( in 2-component vector of float)
-0:104      isnan ( temp 2-component vector of bool)
+0:103        'inF1' ( in 2-component vector of float)
+0:104      mix ( temp 2-component vector of float)
 0:104        'inF0' ( in 2-component vector of float)
-0:105      ldexp ( temp 2-component vector of float)
+0:104        'inF1' ( in 2-component vector of float)
+0:104        'inF2' ( in 2-component vector of float)
+0:105      length ( temp float)
 0:105        'inF0' ( in 2-component vector of float)
-0:105        'inF1' ( in 2-component vector of float)
-0:106      mix ( temp 2-component vector of float)
+0:106      log ( temp 2-component vector of float)
 0:106        'inF0' ( in 2-component vector of float)
-0:106        'inF1' ( in 2-component vector of float)
-0:106        'inF2' ( in 2-component vector of float)
-0:107      length ( temp float)
-0:107        'inF0' ( in 2-component vector of float)
-0:108      log ( temp 2-component vector of float)
+0:107      vector-scale ( temp 2-component vector of float)
+0:107        log2 ( temp 2-component vector of float)
+0:107          'inF0' ( in 2-component vector of float)
+0:107        Constant:
+0:107          0.301030
+0:108      log2 ( temp 2-component vector of float)
 0:108        'inF0' ( in 2-component vector of float)
-0:109      vector-scale ( temp 2-component vector of float)
-0:109        log2 ( temp 2-component vector of float)
-0:109          'inF0' ( in 2-component vector of float)
-0:109        Constant:
-0:109          0.301030
-0:110      log2 ( temp 2-component vector of float)
+0:109      max ( temp 2-component vector of float)
+0:109        'inF0' ( in 2-component vector of float)
+0:109        'inF1' ( in 2-component vector of float)
+0:110      min ( temp 2-component vector of float)
 0:110        'inF0' ( in 2-component vector of float)
-0:111      max ( temp 2-component vector of float)
-0:111        'inF0' ( in 2-component vector of float)
-0:111        'inF1' ( in 2-component vector of float)
-0:112      min ( temp 2-component vector of float)
+0:110        'inF1' ( in 2-component vector of float)
+0:112      normalize ( temp 2-component vector of float)
 0:112        'inF0' ( in 2-component vector of float)
-0:112        'inF1' ( in 2-component vector of float)
-0:114      normalize ( temp 2-component vector of float)
+0:113      pow ( temp 2-component vector of float)
+0:113        'inF0' ( in 2-component vector of float)
+0:113        'inF1' ( in 2-component vector of float)
+0:114      radians ( temp 2-component vector of float)
 0:114        'inF0' ( in 2-component vector of float)
-0:115      pow ( temp 2-component vector of float)
+0:115      reflect ( temp 2-component vector of float)
 0:115        'inF0' ( in 2-component vector of float)
 0:115        'inF1' ( in 2-component vector of float)
-0:116      radians ( temp 2-component vector of float)
+0:116      refract ( temp 2-component vector of float)
 0:116        'inF0' ( in 2-component vector of float)
-0:117      reflect ( temp 2-component vector of float)
-0:117        'inF0' ( in 2-component vector of float)
-0:117        'inF1' ( in 2-component vector of float)
-0:118      refract ( temp 2-component vector of float)
-0:118        'inF0' ( in 2-component vector of float)
-0:118        'inF1' ( in 2-component vector of float)
-0:118        Constant:
-0:118          2.000000
+0:116        'inF1' ( in 2-component vector of float)
+0:116        Constant:
+0:116          2.000000
 0:?       bitFieldReverse ( temp 2-component vector of int)
 0:?         Constant:
 0:?           1 (const int)
 0:?           2 (const int)
-0:120      roundEven ( temp 2-component vector of float)
+0:118      roundEven ( temp 2-component vector of float)
+0:118        'inF0' ( in 2-component vector of float)
+0:119      inverse sqrt ( temp 2-component vector of float)
+0:119        'inF0' ( in 2-component vector of float)
+0:120      clamp ( temp 2-component vector of float)
 0:120        'inF0' ( in 2-component vector of float)
-0:121      inverse sqrt ( temp 2-component vector of float)
+0:120        Constant:
+0:120          0.000000
+0:120        Constant:
+0:120          1.000000
+0:121      Sign ( temp 2-component vector of float)
 0:121        'inF0' ( in 2-component vector of float)
-0:122      clamp ( temp 2-component vector of float)
+0:122      sine ( temp 2-component vector of float)
 0:122        'inF0' ( in 2-component vector of float)
-0:122        Constant:
-0:122          0.000000
-0:122        Constant:
-0:122          1.000000
-0:123      Sign ( temp 2-component vector of float)
-0:123        'inF0' ( in 2-component vector of float)
-0:124      sine ( temp 2-component vector of float)
+0:123      Sequence
+0:123        move second child to first child ( temp 2-component vector of float)
+0:123          'inF1' ( in 2-component vector of float)
+0:123          sine ( temp 2-component vector of float)
+0:123            'inF0' ( in 2-component vector of float)
+0:123        move second child to first child ( temp 2-component vector of float)
+0:123          'inF2' ( in 2-component vector of float)
+0:123          cosine ( temp 2-component vector of float)
+0:123            'inF0' ( in 2-component vector of float)
+0:124      hyp. sine ( temp 2-component vector of float)
 0:124        'inF0' ( in 2-component vector of float)
-0:125      Sequence
-0:125        move second child to first child ( temp 2-component vector of float)
-0:125          'inF1' ( in 2-component vector of float)
-0:125          sine ( temp 2-component vector of float)
-0:125            'inF0' ( in 2-component vector of float)
-0:125        move second child to first child ( temp 2-component vector of float)
-0:125          'inF2' ( in 2-component vector of float)
-0:125          cosine ( temp 2-component vector of float)
-0:125            'inF0' ( in 2-component vector of float)
-0:126      hyp. sine ( temp 2-component vector of float)
+0:125      smoothstep ( temp 2-component vector of float)
+0:125        'inF0' ( in 2-component vector of float)
+0:125        'inF1' ( in 2-component vector of float)
+0:125        'inF2' ( in 2-component vector of float)
+0:126      sqrt ( temp 2-component vector of float)
 0:126        'inF0' ( in 2-component vector of float)
-0:127      smoothstep ( temp 2-component vector of float)
+0:127      step ( temp 2-component vector of float)
 0:127        'inF0' ( in 2-component vector of float)
 0:127        'inF1' ( in 2-component vector of float)
-0:127        'inF2' ( in 2-component vector of float)
-0:128      sqrt ( temp 2-component vector of float)
+0:128      tangent ( temp 2-component vector of float)
 0:128        'inF0' ( in 2-component vector of float)
-0:129      step ( temp 2-component vector of float)
+0:129      hyp. tangent ( temp 2-component vector of float)
 0:129        'inF0' ( in 2-component vector of float)
-0:129        'inF1' ( in 2-component vector of float)
-0:130      tangent ( temp 2-component vector of float)
-0:130        'inF0' ( in 2-component vector of float)
-0:131      hyp. tangent ( temp 2-component vector of float)
+0:131      trunc ( temp 2-component vector of float)
 0:131        'inF0' ( in 2-component vector of float)
-0:133      trunc ( temp 2-component vector of float)
-0:133        'inF0' ( in 2-component vector of float)
-0:136      Branch: Return with expression
+0:134      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
-0:140  Function Definition: VertexShaderFunction3(vf3;vf3;vf3;vu3;vu3; ( temp 3-component vector of float)
-0:140    Function Parameters: 
-0:140      'inF0' ( in 3-component vector of float)
-0:140      'inF1' ( in 3-component vector of float)
-0:140      'inF2' ( in 3-component vector of float)
-0:140      'inU0' ( in 3-component vector of uint)
-0:140      'inU1' ( in 3-component vector of uint)
+0:138  Function Definition: VertexShaderFunction3(vf3;vf3;vf3;vu3;vu3; ( temp 3-component vector of float)
+0:138    Function Parameters: 
+0:138      'inF0' ( in 3-component vector of float)
+0:138      'inF1' ( in 3-component vector of float)
+0:138      'inF2' ( in 3-component vector of float)
+0:138      'inU0' ( in 3-component vector of uint)
+0:138      'inU1' ( in 3-component vector of uint)
 0:?     Sequence
-0:141      all ( temp bool)
+0:139      all ( temp bool)
+0:139        'inF0' ( in 3-component vector of float)
+0:140      Absolute value ( temp 3-component vector of float)
+0:140        'inF0' ( in 3-component vector of float)
+0:141      arc cosine ( temp 3-component vector of float)
 0:141        'inF0' ( in 3-component vector of float)
-0:142      Absolute value ( temp 3-component vector of float)
+0:142      any ( temp bool)
 0:142        'inF0' ( in 3-component vector of float)
-0:143      arc cosine ( temp 3-component vector of float)
+0:143      arc sine ( temp 3-component vector of float)
 0:143        'inF0' ( in 3-component vector of float)
-0:144      any ( temp bool)
+0:144      floatBitsToInt ( temp 3-component vector of int)
 0:144        'inF0' ( in 3-component vector of float)
-0:145      arc sine ( temp 3-component vector of float)
+0:145      floatBitsToUint ( temp 3-component vector of uint)
 0:145        'inF0' ( in 3-component vector of float)
-0:146      floatBitsToInt ( temp 3-component vector of int)
-0:146        'inF0' ( in 3-component vector of float)
-0:147      floatBitsToUint ( temp 3-component vector of uint)
-0:147        'inF0' ( in 3-component vector of float)
-0:148      intBitsToFloat ( temp 3-component vector of float)
-0:148        'inU0' ( in 3-component vector of uint)
-0:150      arc tangent ( temp 3-component vector of float)
+0:146      intBitsToFloat ( temp 3-component vector of float)
+0:146        'inU0' ( in 3-component vector of uint)
+0:148      arc tangent ( temp 3-component vector of float)
+0:148        'inF0' ( in 3-component vector of float)
+0:149      arc tangent ( temp 3-component vector of float)
+0:149        'inF0' ( in 3-component vector of float)
+0:149        'inF1' ( in 3-component vector of float)
+0:150      Ceiling ( temp 3-component vector of float)
 0:150        'inF0' ( in 3-component vector of float)
-0:151      arc tangent ( temp 3-component vector of float)
+0:151      clamp ( temp 3-component vector of float)
 0:151        'inF0' ( in 3-component vector of float)
 0:151        'inF1' ( in 3-component vector of float)
-0:152      Ceiling ( temp 3-component vector of float)
+0:151        'inF2' ( in 3-component vector of float)
+0:152      cosine ( temp 3-component vector of float)
 0:152        'inF0' ( in 3-component vector of float)
-0:153      clamp ( temp 3-component vector of float)
+0:153      hyp. cosine ( temp 3-component vector of float)
 0:153        'inF0' ( in 3-component vector of float)
-0:153        'inF1' ( in 3-component vector of float)
-0:153        'inF2' ( in 3-component vector of float)
-0:154      cosine ( temp 3-component vector of float)
-0:154        'inF0' ( in 3-component vector of float)
-0:155      hyp. cosine ( temp 3-component vector of float)
-0:155        'inF0' ( in 3-component vector of float)
 0:?       bitCount ( temp 3-component vector of int)
 0:?         Constant:
 0:?           7 (const int)
 0:?           3 (const int)
 0:?           5 (const int)
-0:157      cross-product ( temp 3-component vector of float)
+0:155      cross-product ( temp 3-component vector of float)
+0:155        'inF0' ( in 3-component vector of float)
+0:155        'inF1' ( in 3-component vector of float)
+0:156      degrees ( temp 3-component vector of float)
+0:156        'inF0' ( in 3-component vector of float)
+0:157      distance ( temp float)
 0:157        'inF0' ( in 3-component vector of float)
 0:157        'inF1' ( in 3-component vector of float)
-0:158      degrees ( temp 3-component vector of float)
+0:158      dot-product ( temp float)
 0:158        'inF0' ( in 3-component vector of float)
-0:159      distance ( temp float)
-0:159        'inF0' ( in 3-component vector of float)
-0:159        'inF1' ( in 3-component vector of float)
-0:160      dot-product ( temp float)
-0:160        'inF0' ( in 3-component vector of float)
-0:160        'inF1' ( in 3-component vector of float)
-0:164      exp ( temp 3-component vector of float)
+0:158        'inF1' ( in 3-component vector of float)
+0:162      exp ( temp 3-component vector of float)
+0:162        'inF0' ( in 3-component vector of float)
+0:163      exp2 ( temp 3-component vector of float)
+0:163        'inF0' ( in 3-component vector of float)
+0:164      face-forward ( temp 3-component vector of float)
 0:164        'inF0' ( in 3-component vector of float)
-0:165      exp2 ( temp 3-component vector of float)
-0:165        'inF0' ( in 3-component vector of float)
-0:166      face-forward ( temp 3-component vector of float)
-0:166        'inF0' ( in 3-component vector of float)
-0:166        'inF1' ( in 3-component vector of float)
-0:166        'inF2' ( in 3-component vector of float)
-0:167      findMSB ( temp int)
-0:167        Constant:
-0:167          7 (const int)
-0:168      findLSB ( temp int)
-0:168        Constant:
-0:168          7 (const int)
-0:169      Floor ( temp 3-component vector of float)
+0:164        'inF1' ( in 3-component vector of float)
+0:164        'inF2' ( in 3-component vector of float)
+0:165      findMSB ( temp int)
+0:165        Constant:
+0:165          7 (const int)
+0:166      findLSB ( temp int)
+0:166        Constant:
+0:166          7 (const int)
+0:167      Floor ( temp 3-component vector of float)
+0:167        'inF0' ( in 3-component vector of float)
+0:169      mod ( temp 3-component vector of float)
 0:169        'inF0' ( in 3-component vector of float)
-0:171      mod ( temp 3-component vector of float)
+0:169        'inF1' ( in 3-component vector of float)
+0:170      Fraction ( temp 3-component vector of float)
+0:170        'inF0' ( in 3-component vector of float)
+0:171      isinf ( temp 3-component vector of bool)
 0:171        'inF0' ( in 3-component vector of float)
-0:171        'inF1' ( in 3-component vector of float)
-0:172      Fraction ( temp 3-component vector of float)
+0:172      isnan ( temp 3-component vector of bool)
 0:172        'inF0' ( in 3-component vector of float)
-0:173      frexp ( temp 3-component vector of float)
+0:173      ldexp ( temp 3-component vector of float)
 0:173        'inF0' ( in 3-component vector of float)
 0:173        'inF1' ( in 3-component vector of float)
-0:174      isinf ( temp 3-component vector of bool)
+0:174      mix ( temp 3-component vector of float)
 0:174        'inF0' ( in 3-component vector of float)
-0:175      isnan ( temp 3-component vector of bool)
+0:174        'inF1' ( in 3-component vector of float)
+0:174        'inF2' ( in 3-component vector of float)
+0:175      length ( temp float)
 0:175        'inF0' ( in 3-component vector of float)
-0:176      ldexp ( temp 3-component vector of float)
+0:176      log ( temp 3-component vector of float)
 0:176        'inF0' ( in 3-component vector of float)
-0:176        'inF1' ( in 3-component vector of float)
-0:177      mix ( temp 3-component vector of float)
-0:177        'inF0' ( in 3-component vector of float)
-0:177        'inF1' ( in 3-component vector of float)
-0:177        'inF2' ( in 3-component vector of float)
-0:178      length ( temp float)
+0:177      vector-scale ( temp 3-component vector of float)
+0:177        log2 ( temp 3-component vector of float)
+0:177          'inF0' ( in 3-component vector of float)
+0:177        Constant:
+0:177          0.301030
+0:178      log2 ( temp 3-component vector of float)
 0:178        'inF0' ( in 3-component vector of float)
-0:179      log ( temp 3-component vector of float)
+0:179      max ( temp 3-component vector of float)
 0:179        'inF0' ( in 3-component vector of float)
-0:180      vector-scale ( temp 3-component vector of float)
-0:180        log2 ( temp 3-component vector of float)
-0:180          'inF0' ( in 3-component vector of float)
-0:180        Constant:
-0:180          0.301030
-0:181      log2 ( temp 3-component vector of float)
-0:181        'inF0' ( in 3-component vector of float)
-0:182      max ( temp 3-component vector of float)
+0:179        'inF1' ( in 3-component vector of float)
+0:180      min ( temp 3-component vector of float)
+0:180        'inF0' ( in 3-component vector of float)
+0:180        'inF1' ( in 3-component vector of float)
+0:182      normalize ( temp 3-component vector of float)
 0:182        'inF0' ( in 3-component vector of float)
-0:182        'inF1' ( in 3-component vector of float)
-0:183      min ( temp 3-component vector of float)
+0:183      pow ( temp 3-component vector of float)
 0:183        'inF0' ( in 3-component vector of float)
 0:183        'inF1' ( in 3-component vector of float)
-0:185      normalize ( temp 3-component vector of float)
+0:184      radians ( temp 3-component vector of float)
+0:184        'inF0' ( in 3-component vector of float)
+0:185      reflect ( temp 3-component vector of float)
 0:185        'inF0' ( in 3-component vector of float)
-0:186      pow ( temp 3-component vector of float)
+0:185        'inF1' ( in 3-component vector of float)
+0:186      refract ( temp 3-component vector of float)
 0:186        'inF0' ( in 3-component vector of float)
 0:186        'inF1' ( in 3-component vector of float)
-0:187      radians ( temp 3-component vector of float)
-0:187        'inF0' ( in 3-component vector of float)
-0:188      reflect ( temp 3-component vector of float)
-0:188        'inF0' ( in 3-component vector of float)
-0:188        'inF1' ( in 3-component vector of float)
-0:189      refract ( temp 3-component vector of float)
-0:189        'inF0' ( in 3-component vector of float)
-0:189        'inF1' ( in 3-component vector of float)
-0:189        Constant:
-0:189          2.000000
+0:186        Constant:
+0:186          2.000000
 0:?       bitFieldReverse ( temp 3-component vector of int)
 0:?         Constant:
 0:?           1 (const int)
 0:?           2 (const int)
 0:?           3 (const int)
-0:191      roundEven ( temp 3-component vector of float)
+0:188      roundEven ( temp 3-component vector of float)
+0:188        'inF0' ( in 3-component vector of float)
+0:189      inverse sqrt ( temp 3-component vector of float)
+0:189        'inF0' ( in 3-component vector of float)
+0:190      clamp ( temp 3-component vector of float)
+0:190        'inF0' ( in 3-component vector of float)
+0:190        Constant:
+0:190          0.000000
+0:190        Constant:
+0:190          1.000000
+0:191      Sign ( temp 3-component vector of float)
 0:191        'inF0' ( in 3-component vector of float)
-0:192      inverse sqrt ( temp 3-component vector of float)
+0:192      sine ( temp 3-component vector of float)
 0:192        'inF0' ( in 3-component vector of float)
-0:193      clamp ( temp 3-component vector of float)
-0:193        'inF0' ( in 3-component vector of float)
-0:193        Constant:
-0:193          0.000000
-0:193        Constant:
-0:193          1.000000
-0:194      Sign ( temp 3-component vector of float)
+0:193      Sequence
+0:193        move second child to first child ( temp 3-component vector of float)
+0:193          'inF1' ( in 3-component vector of float)
+0:193          sine ( temp 3-component vector of float)
+0:193            'inF0' ( in 3-component vector of float)
+0:193        move second child to first child ( temp 3-component vector of float)
+0:193          'inF2' ( in 3-component vector of float)
+0:193          cosine ( temp 3-component vector of float)
+0:193            'inF0' ( in 3-component vector of float)
+0:194      hyp. sine ( temp 3-component vector of float)
 0:194        'inF0' ( in 3-component vector of float)
-0:195      sine ( temp 3-component vector of float)
+0:195      smoothstep ( temp 3-component vector of float)
 0:195        'inF0' ( in 3-component vector of float)
-0:196      Sequence
-0:196        move second child to first child ( temp 3-component vector of float)
-0:196          'inF1' ( in 3-component vector of float)
-0:196          sine ( temp 3-component vector of float)
-0:196            'inF0' ( in 3-component vector of float)
-0:196        move second child to first child ( temp 3-component vector of float)
-0:196          'inF2' ( in 3-component vector of float)
-0:196          cosine ( temp 3-component vector of float)
-0:196            'inF0' ( in 3-component vector of float)
-0:197      hyp. sine ( temp 3-component vector of float)
+0:195        'inF1' ( in 3-component vector of float)
+0:195        'inF2' ( in 3-component vector of float)
+0:196      sqrt ( temp 3-component vector of float)
+0:196        'inF0' ( in 3-component vector of float)
+0:197      step ( temp 3-component vector of float)
 0:197        'inF0' ( in 3-component vector of float)
-0:198      smoothstep ( temp 3-component vector of float)
+0:197        'inF1' ( in 3-component vector of float)
+0:198      tangent ( temp 3-component vector of float)
 0:198        'inF0' ( in 3-component vector of float)
-0:198        'inF1' ( in 3-component vector of float)
-0:198        'inF2' ( in 3-component vector of float)
-0:199      sqrt ( temp 3-component vector of float)
+0:199      hyp. tangent ( temp 3-component vector of float)
 0:199        'inF0' ( in 3-component vector of float)
-0:200      step ( temp 3-component vector of float)
-0:200        'inF0' ( in 3-component vector of float)
-0:200        'inF1' ( in 3-component vector of float)
-0:201      tangent ( temp 3-component vector of float)
+0:201      trunc ( temp 3-component vector of float)
 0:201        'inF0' ( in 3-component vector of float)
-0:202      hyp. tangent ( temp 3-component vector of float)
-0:202        'inF0' ( in 3-component vector of float)
-0:204      trunc ( temp 3-component vector of float)
-0:204        'inF0' ( in 3-component vector of float)
-0:207      Branch: Return with expression
+0:204      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
 0:?           3.000000
-0:211  Function Definition: VertexShaderFunction4(vf4;vf4;vf4;vu4;vu4; ( temp 4-component vector of float)
-0:211    Function Parameters: 
-0:211      'inF0' ( in 4-component vector of float)
-0:211      'inF1' ( in 4-component vector of float)
-0:211      'inF2' ( in 4-component vector of float)
-0:211      'inU0' ( in 4-component vector of uint)
-0:211      'inU1' ( in 4-component vector of uint)
+0:208  Function Definition: VertexShaderFunction4(vf4;vf4;vf4;vu4;vu4; ( temp 4-component vector of float)
+0:208    Function Parameters: 
+0:208      'inF0' ( in 4-component vector of float)
+0:208      'inF1' ( in 4-component vector of float)
+0:208      'inF2' ( in 4-component vector of float)
+0:208      'inU0' ( in 4-component vector of uint)
+0:208      'inU1' ( in 4-component vector of uint)
 0:?     Sequence
-0:212      all ( temp bool)
+0:209      all ( temp bool)
+0:209        'inF0' ( in 4-component vector of float)
+0:210      Absolute value ( temp 4-component vector of float)
+0:210        'inF0' ( in 4-component vector of float)
+0:211      arc cosine ( temp 4-component vector of float)
+0:211        'inF0' ( in 4-component vector of float)
+0:212      any ( temp bool)
 0:212        'inF0' ( in 4-component vector of float)
-0:213      Absolute value ( temp 4-component vector of float)
+0:213      arc sine ( temp 4-component vector of float)
 0:213        'inF0' ( in 4-component vector of float)
-0:214      arc cosine ( temp 4-component vector of float)
+0:214      floatBitsToInt ( temp 4-component vector of int)
 0:214        'inF0' ( in 4-component vector of float)
-0:215      any ( temp bool)
+0:215      floatBitsToUint ( temp 4-component vector of uint)
 0:215        'inF0' ( in 4-component vector of float)
-0:216      arc sine ( temp 4-component vector of float)
-0:216        'inF0' ( in 4-component vector of float)
-0:217      floatBitsToInt ( temp 4-component vector of int)
-0:217        'inF0' ( in 4-component vector of float)
-0:218      floatBitsToUint ( temp 4-component vector of uint)
+0:216      intBitsToFloat ( temp 4-component vector of float)
+0:216        'inU0' ( in 4-component vector of uint)
+0:218      arc tangent ( temp 4-component vector of float)
 0:218        'inF0' ( in 4-component vector of float)
-0:219      intBitsToFloat ( temp 4-component vector of float)
-0:219        'inU0' ( in 4-component vector of uint)
-0:221      arc tangent ( temp 4-component vector of float)
+0:219      arc tangent ( temp 4-component vector of float)
+0:219        'inF0' ( in 4-component vector of float)
+0:219        'inF1' ( in 4-component vector of float)
+0:220      Ceiling ( temp 4-component vector of float)
+0:220        'inF0' ( in 4-component vector of float)
+0:221      clamp ( temp 4-component vector of float)
 0:221        'inF0' ( in 4-component vector of float)
-0:222      arc tangent ( temp 4-component vector of float)
+0:221        'inF1' ( in 4-component vector of float)
+0:221        'inF2' ( in 4-component vector of float)
+0:222      cosine ( temp 4-component vector of float)
 0:222        'inF0' ( in 4-component vector of float)
-0:222        'inF1' ( in 4-component vector of float)
-0:223      Ceiling ( temp 4-component vector of float)
+0:223      hyp. cosine ( temp 4-component vector of float)
 0:223        'inF0' ( in 4-component vector of float)
-0:224      clamp ( temp 4-component vector of float)
-0:224        'inF0' ( in 4-component vector of float)
-0:224        'inF1' ( in 4-component vector of float)
-0:224        'inF2' ( in 4-component vector of float)
-0:225      cosine ( temp 4-component vector of float)
-0:225        'inF0' ( in 4-component vector of float)
-0:226      hyp. cosine ( temp 4-component vector of float)
-0:226        'inF0' ( in 4-component vector of float)
 0:?       bitCount ( temp 4-component vector of int)
 0:?         Constant:
 0:?           7 (const int)
 0:?           3 (const int)
 0:?           5 (const int)
 0:?           2 (const int)
-0:228      degrees ( temp 4-component vector of float)
-0:228        'inF0' ( in 4-component vector of float)
-0:229      distance ( temp float)
-0:229        'inF0' ( in 4-component vector of float)
-0:229        'inF1' ( in 4-component vector of float)
-0:230      dot-product ( temp float)
-0:230        'inF0' ( in 4-component vector of float)
-0:230        'inF1' ( in 4-component vector of float)
-0:231      Construct vec4 ( temp 4-component vector of float)
-0:231        Constant:
-0:231          1.000000
-0:231        component-wise multiply ( temp float)
-0:231          direct index ( temp float)
-0:231            'inF0' ( in 4-component vector of float)
-0:231            Constant:
-0:231              1 (const int)
-0:231          direct index ( temp float)
-0:231            'inF1' ( in 4-component vector of float)
-0:231            Constant:
-0:231              1 (const int)
-0:231        direct index ( temp float)
-0:231          'inF0' ( in 4-component vector of float)
-0:231          Constant:
-0:231            2 (const int)
-0:231        direct index ( temp float)
-0:231          'inF1' ( in 4-component vector of float)
-0:231          Constant:
-0:231            3 (const int)
-0:235      exp ( temp 4-component vector of float)
-0:235        'inF0' ( in 4-component vector of float)
-0:236      exp2 ( temp 4-component vector of float)
-0:236        'inF0' ( in 4-component vector of float)
-0:237      face-forward ( temp 4-component vector of float)
+0:225      degrees ( temp 4-component vector of float)
+0:225        'inF0' ( in 4-component vector of float)
+0:226      distance ( temp float)
+0:226        'inF0' ( in 4-component vector of float)
+0:226        'inF1' ( in 4-component vector of float)
+0:227      dot-product ( temp float)
+0:227        'inF0' ( in 4-component vector of float)
+0:227        'inF1' ( in 4-component vector of float)
+0:228      Construct vec4 ( temp 4-component vector of float)
+0:228        Constant:
+0:228          1.000000
+0:228        component-wise multiply ( temp float)
+0:228          direct index ( temp float)
+0:228            'inF0' ( in 4-component vector of float)
+0:228            Constant:
+0:228              1 (const int)
+0:228          direct index ( temp float)
+0:228            'inF1' ( in 4-component vector of float)
+0:228            Constant:
+0:228              1 (const int)
+0:228        direct index ( temp float)
+0:228          'inF0' ( in 4-component vector of float)
+0:228          Constant:
+0:228            2 (const int)
+0:228        direct index ( temp float)
+0:228          'inF1' ( in 4-component vector of float)
+0:228          Constant:
+0:228            3 (const int)
+0:232      exp ( temp 4-component vector of float)
+0:232        'inF0' ( in 4-component vector of float)
+0:233      exp2 ( temp 4-component vector of float)
+0:233        'inF0' ( in 4-component vector of float)
+0:234      face-forward ( temp 4-component vector of float)
+0:234        'inF0' ( in 4-component vector of float)
+0:234        'inF1' ( in 4-component vector of float)
+0:234        'inF2' ( in 4-component vector of float)
+0:235      findMSB ( temp int)
+0:235        Constant:
+0:235          7 (const int)
+0:236      findLSB ( temp int)
+0:236        Constant:
+0:236          7 (const int)
+0:237      Floor ( temp 4-component vector of float)
 0:237        'inF0' ( in 4-component vector of float)
-0:237        'inF1' ( in 4-component vector of float)
-0:237        'inF2' ( in 4-component vector of float)
-0:238      findMSB ( temp int)
-0:238        Constant:
-0:238          7 (const int)
-0:239      findLSB ( temp int)
-0:239        Constant:
-0:239          7 (const int)
-0:240      Floor ( temp 4-component vector of float)
+0:239      mod ( temp 4-component vector of float)
+0:239        'inF0' ( in 4-component vector of float)
+0:239        'inF1' ( in 4-component vector of float)
+0:240      Fraction ( temp 4-component vector of float)
 0:240        'inF0' ( in 4-component vector of float)
-0:242      mod ( temp 4-component vector of float)
+0:241      isinf ( temp 4-component vector of bool)
+0:241        'inF0' ( in 4-component vector of float)
+0:242      isnan ( temp 4-component vector of bool)
 0:242        'inF0' ( in 4-component vector of float)
-0:242        'inF1' ( in 4-component vector of float)
-0:243      Fraction ( temp 4-component vector of float)
+0:243      ldexp ( temp 4-component vector of float)
 0:243        'inF0' ( in 4-component vector of float)
-0:244      frexp ( temp 4-component vector of float)
+0:243        'inF1' ( in 4-component vector of float)
+0:244      mix ( temp 4-component vector of float)
 0:244        'inF0' ( in 4-component vector of float)
 0:244        'inF1' ( in 4-component vector of float)
-0:245      isinf ( temp 4-component vector of bool)
+0:244        'inF2' ( in 4-component vector of float)
+0:245      length ( temp float)
 0:245        'inF0' ( in 4-component vector of float)
-0:246      isnan ( temp 4-component vector of bool)
+0:246      log ( temp 4-component vector of float)
 0:246        'inF0' ( in 4-component vector of float)
-0:247      ldexp ( temp 4-component vector of float)
-0:247        'inF0' ( in 4-component vector of float)
-0:247        'inF1' ( in 4-component vector of float)
-0:248      mix ( temp 4-component vector of float)
+0:247      vector-scale ( temp 4-component vector of float)
+0:247        log2 ( temp 4-component vector of float)
+0:247          'inF0' ( in 4-component vector of float)
+0:247        Constant:
+0:247          0.301030
+0:248      log2 ( temp 4-component vector of float)
 0:248        'inF0' ( in 4-component vector of float)
-0:248        'inF1' ( in 4-component vector of float)
-0:248        'inF2' ( in 4-component vector of float)
-0:249      length ( temp float)
+0:249      max ( temp 4-component vector of float)
 0:249        'inF0' ( in 4-component vector of float)
-0:250      log ( temp 4-component vector of float)
+0:249        'inF1' ( in 4-component vector of float)
+0:250      min ( temp 4-component vector of float)
 0:250        'inF0' ( in 4-component vector of float)
-0:251      vector-scale ( temp 4-component vector of float)
-0:251        log2 ( temp 4-component vector of float)
-0:251          'inF0' ( in 4-component vector of float)
-0:251        Constant:
-0:251          0.301030
-0:252      log2 ( temp 4-component vector of float)
+0:250        'inF1' ( in 4-component vector of float)
+0:252      normalize ( temp 4-component vector of float)
 0:252        'inF0' ( in 4-component vector of float)
-0:253      max ( temp 4-component vector of float)
+0:253      pow ( temp 4-component vector of float)
 0:253        'inF0' ( in 4-component vector of float)
 0:253        'inF1' ( in 4-component vector of float)
-0:254      min ( temp 4-component vector of float)
+0:254      radians ( temp 4-component vector of float)
 0:254        'inF0' ( in 4-component vector of float)
-0:254        'inF1' ( in 4-component vector of float)
-0:256      normalize ( temp 4-component vector of float)
+0:255      reflect ( temp 4-component vector of float)
+0:255        'inF0' ( in 4-component vector of float)
+0:255        'inF1' ( in 4-component vector of float)
+0:256      refract ( temp 4-component vector of float)
 0:256        'inF0' ( in 4-component vector of float)
-0:257      pow ( temp 4-component vector of float)
-0:257        'inF0' ( in 4-component vector of float)
-0:257        'inF1' ( in 4-component vector of float)
-0:258      radians ( temp 4-component vector of float)
-0:258        'inF0' ( in 4-component vector of float)
-0:259      reflect ( temp 4-component vector of float)
-0:259        'inF0' ( in 4-component vector of float)
-0:259        'inF1' ( in 4-component vector of float)
-0:260      refract ( temp 4-component vector of float)
-0:260        'inF0' ( in 4-component vector of float)
-0:260        'inF1' ( in 4-component vector of float)
-0:260        Constant:
-0:260          2.000000
+0:256        'inF1' ( in 4-component vector of float)
+0:256        Constant:
+0:256          2.000000
 0:?       bitFieldReverse ( temp 4-component vector of int)
 0:?         Constant:
 0:?           1 (const int)
 0:?           2 (const int)
 0:?           3 (const int)
 0:?           4 (const int)
-0:262      roundEven ( temp 4-component vector of float)
+0:258      roundEven ( temp 4-component vector of float)
+0:258        'inF0' ( in 4-component vector of float)
+0:259      inverse sqrt ( temp 4-component vector of float)
+0:259        'inF0' ( in 4-component vector of float)
+0:260      clamp ( temp 4-component vector of float)
+0:260        'inF0' ( in 4-component vector of float)
+0:260        Constant:
+0:260          0.000000
+0:260        Constant:
+0:260          1.000000
+0:261      Sign ( temp 4-component vector of float)
+0:261        'inF0' ( in 4-component vector of float)
+0:262      sine ( temp 4-component vector of float)
 0:262        'inF0' ( in 4-component vector of float)
-0:263      inverse sqrt ( temp 4-component vector of float)
-0:263        'inF0' ( in 4-component vector of float)
-0:264      clamp ( temp 4-component vector of float)
+0:263      Sequence
+0:263        move second child to first child ( temp 4-component vector of float)
+0:263          'inF1' ( in 4-component vector of float)
+0:263          sine ( temp 4-component vector of float)
+0:263            'inF0' ( in 4-component vector of float)
+0:263        move second child to first child ( temp 4-component vector of float)
+0:263          'inF2' ( in 4-component vector of float)
+0:263          cosine ( temp 4-component vector of float)
+0:263            'inF0' ( in 4-component vector of float)
+0:264      hyp. sine ( temp 4-component vector of float)
 0:264        'inF0' ( in 4-component vector of float)
-0:264        Constant:
-0:264          0.000000
-0:264        Constant:
-0:264          1.000000
-0:265      Sign ( temp 4-component vector of float)
+0:265      smoothstep ( temp 4-component vector of float)
 0:265        'inF0' ( in 4-component vector of float)
-0:266      sine ( temp 4-component vector of float)
+0:265        'inF1' ( in 4-component vector of float)
+0:265        'inF2' ( in 4-component vector of float)
+0:266      sqrt ( temp 4-component vector of float)
 0:266        'inF0' ( in 4-component vector of float)
-0:267      Sequence
-0:267        move second child to first child ( temp 4-component vector of float)
-0:267          'inF1' ( in 4-component vector of float)
-0:267          sine ( temp 4-component vector of float)
-0:267            'inF0' ( in 4-component vector of float)
-0:267        move second child to first child ( temp 4-component vector of float)
-0:267          'inF2' ( in 4-component vector of float)
-0:267          cosine ( temp 4-component vector of float)
-0:267            'inF0' ( in 4-component vector of float)
-0:268      hyp. sine ( temp 4-component vector of float)
+0:267      step ( temp 4-component vector of float)
+0:267        'inF0' ( in 4-component vector of float)
+0:267        'inF1' ( in 4-component vector of float)
+0:268      tangent ( temp 4-component vector of float)
 0:268        'inF0' ( in 4-component vector of float)
-0:269      smoothstep ( temp 4-component vector of float)
+0:269      hyp. tangent ( temp 4-component vector of float)
 0:269        'inF0' ( in 4-component vector of float)
-0:269        'inF1' ( in 4-component vector of float)
-0:269        'inF2' ( in 4-component vector of float)
-0:270      sqrt ( temp 4-component vector of float)
-0:270        'inF0' ( in 4-component vector of float)
-0:271      step ( temp 4-component vector of float)
+0:271      trunc ( temp 4-component vector of float)
 0:271        'inF0' ( in 4-component vector of float)
-0:271        'inF1' ( in 4-component vector of float)
-0:272      tangent ( temp 4-component vector of float)
-0:272        'inF0' ( in 4-component vector of float)
-0:273      hyp. tangent ( temp 4-component vector of float)
-0:273        'inF0' ( in 4-component vector of float)
-0:275      trunc ( temp 4-component vector of float)
-0:275        'inF0' ( in 4-component vector of float)
-0:278      Branch: Return with expression
+0:274      Branch: Return with expression
 0:?         Constant:
 0:?           1.000000
 0:?           2.000000
 0:?           3.000000
 0:?           4.000000
-0:336  Function Definition: VertexShaderFunction2x2(mf22;mf22;mf22; ( temp 2X2 matrix of float)
-0:336    Function Parameters: 
-0:336      'inF0' ( in 2X2 matrix of float)
-0:336      'inF1' ( in 2X2 matrix of float)
-0:336      'inF2' ( in 2X2 matrix of float)
+0:331  Function Definition: VertexShaderFunction2x2(mf22;mf22;mf22; ( temp 2X2 matrix of float)
+0:331    Function Parameters: 
+0:331      'inF0' ( in 2X2 matrix of float)
+0:331      'inF1' ( in 2X2 matrix of float)
+0:331      'inF2' ( in 2X2 matrix of float)
 0:?     Sequence
-0:338      all ( temp bool)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      Absolute value ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      arc cosine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      any ( temp bool)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      arc sine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      arc tangent ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      arc tangent ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      Ceiling ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      clamp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338        'inF2' ( in 2X2 matrix of float)
-0:338      cosine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      hyp. cosine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      degrees ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      determinant ( temp float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      exp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      exp2 ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      findMSB ( temp int)
-0:338        Constant:
-0:338          7 (const int)
-0:338      findLSB ( temp int)
-0:338        Constant:
-0:338          7 (const int)
-0:338      Floor ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      mod ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      Fraction ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      frexp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      ldexp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      mix ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338        'inF2' ( in 2X2 matrix of float)
-0:338      log ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      matrix-scale ( temp 2X2 matrix of float)
-0:338        log2 ( temp 2X2 matrix of float)
-0:338          'inF0' ( in 2X2 matrix of float)
-0:338        Constant:
-0:338          0.301030
-0:338      log2 ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      max ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      min ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      pow ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      radians ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      roundEven ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      inverse sqrt ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      clamp ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        Constant:
-0:338          0.000000
-0:338        Constant:
-0:338          1.000000
-0:338      Sign ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      sine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      Sequence
-0:338        move second child to first child ( temp 2X2 matrix of float)
-0:338          'inF1' ( in 2X2 matrix of float)
-0:338          sine ( temp 2X2 matrix of float)
-0:338            'inF0' ( in 2X2 matrix of float)
-0:338        move second child to first child ( temp 2X2 matrix of float)
-0:338          'inF2' ( in 2X2 matrix of float)
-0:338          cosine ( temp 2X2 matrix of float)
-0:338            'inF0' ( in 2X2 matrix of float)
-0:338      hyp. sine ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      smoothstep ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338        'inF2' ( in 2X2 matrix of float)
-0:338      sqrt ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      step ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338        'inF1' ( in 2X2 matrix of float)
-0:338      tangent ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      hyp. tangent ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      transpose ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:338      trunc ( temp 2X2 matrix of float)
-0:338        'inF0' ( in 2X2 matrix of float)
-0:341      Branch: Return with expression
+0:333      all ( temp bool)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      Absolute value ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      arc cosine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      any ( temp bool)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      arc sine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      arc tangent ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      arc tangent ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      Ceiling ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      clamp ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333        'inF2' ( in 2X2 matrix of float)
+0:333      cosine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      hyp. cosine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      degrees ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      determinant ( temp float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      exp ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      exp2 ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      findMSB ( temp int)
+0:333        Constant:
+0:333          7 (const int)
+0:333      findLSB ( temp int)
+0:333        Constant:
+0:333          7 (const int)
+0:333      Floor ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      mod ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      Fraction ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      ldexp ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      mix ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333        'inF2' ( in 2X2 matrix of float)
+0:333      log ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      matrix-scale ( temp 2X2 matrix of float)
+0:333        log2 ( temp 2X2 matrix of float)
+0:333          'inF0' ( in 2X2 matrix of float)
+0:333        Constant:
+0:333          0.301030
+0:333      log2 ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      max ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      min ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      pow ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      radians ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      roundEven ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      inverse sqrt ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      clamp ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        Constant:
+0:333          0.000000
+0:333        Constant:
+0:333          1.000000
+0:333      Sign ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      sine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      Sequence
+0:333        move second child to first child ( temp 2X2 matrix of float)
+0:333          'inF1' ( in 2X2 matrix of float)
+0:333          sine ( temp 2X2 matrix of float)
+0:333            'inF0' ( in 2X2 matrix of float)
+0:333        move second child to first child ( temp 2X2 matrix of float)
+0:333          'inF2' ( in 2X2 matrix of float)
+0:333          cosine ( temp 2X2 matrix of float)
+0:333            'inF0' ( in 2X2 matrix of float)
+0:333      hyp. sine ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      smoothstep ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333        'inF2' ( in 2X2 matrix of float)
+0:333      sqrt ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      step ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333        'inF1' ( in 2X2 matrix of float)
+0:333      tangent ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      hyp. tangent ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      transpose ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:333      trunc ( temp 2X2 matrix of float)
+0:333        'inF0' ( in 2X2 matrix of float)
+0:336      Branch: Return with expression
 0:?         Constant:
 0:?           2.000000
 0:?           2.000000
 0:?           2.000000
 0:?           2.000000
-0:345  Function Definition: VertexShaderFunction3x3(mf33;mf33;mf33; ( temp 3X3 matrix of float)
-0:345    Function Parameters: 
-0:345      'inF0' ( in 3X3 matrix of float)
-0:345      'inF1' ( in 3X3 matrix of float)
-0:345      'inF2' ( in 3X3 matrix of float)
+0:340  Function Definition: VertexShaderFunction3x3(mf33;mf33;mf33; ( temp 3X3 matrix of float)
+0:340    Function Parameters: 
+0:340      'inF0' ( in 3X3 matrix of float)
+0:340      'inF1' ( in 3X3 matrix of float)
+0:340      'inF2' ( in 3X3 matrix of float)
 0:?     Sequence
-0:347      all ( temp bool)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      Absolute value ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      arc cosine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      any ( temp bool)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      arc sine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      arc tangent ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      arc tangent ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      Ceiling ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      clamp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347        'inF2' ( in 3X3 matrix of float)
-0:347      cosine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      hyp. cosine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      degrees ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      determinant ( temp float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      exp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      exp2 ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      findMSB ( temp int)
-0:347        Constant:
-0:347          7 (const int)
-0:347      findLSB ( temp int)
-0:347        Constant:
-0:347          7 (const int)
-0:347      Floor ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      mod ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      Fraction ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      frexp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      ldexp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      mix ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347        'inF2' ( in 3X3 matrix of float)
-0:347      log ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      matrix-scale ( temp 3X3 matrix of float)
-0:347        log2 ( temp 3X3 matrix of float)
-0:347          'inF0' ( in 3X3 matrix of float)
-0:347        Constant:
-0:347          0.301030
-0:347      log2 ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      max ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      min ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      pow ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      radians ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      roundEven ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      inverse sqrt ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      clamp ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        Constant:
-0:347          0.000000
-0:347        Constant:
-0:347          1.000000
-0:347      Sign ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      sine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      Sequence
-0:347        move second child to first child ( temp 3X3 matrix of float)
-0:347          'inF1' ( in 3X3 matrix of float)
-0:347          sine ( temp 3X3 matrix of float)
-0:347            'inF0' ( in 3X3 matrix of float)
-0:347        move second child to first child ( temp 3X3 matrix of float)
-0:347          'inF2' ( in 3X3 matrix of float)
-0:347          cosine ( temp 3X3 matrix of float)
-0:347            'inF0' ( in 3X3 matrix of float)
-0:347      hyp. sine ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      smoothstep ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347        'inF2' ( in 3X3 matrix of float)
-0:347      sqrt ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      step ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347        'inF1' ( in 3X3 matrix of float)
-0:347      tangent ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      hyp. tangent ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      transpose ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:347      trunc ( temp 3X3 matrix of float)
-0:347        'inF0' ( in 3X3 matrix of float)
-0:350      Branch: Return with expression
+0:342      all ( temp bool)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      Absolute value ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      arc cosine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      any ( temp bool)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      arc sine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      arc tangent ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      arc tangent ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      Ceiling ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      clamp ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342        'inF2' ( in 3X3 matrix of float)
+0:342      cosine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      hyp. cosine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      degrees ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      determinant ( temp float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      exp ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      exp2 ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      findMSB ( temp int)
+0:342        Constant:
+0:342          7 (const int)
+0:342      findLSB ( temp int)
+0:342        Constant:
+0:342          7 (const int)
+0:342      Floor ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      mod ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      Fraction ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      ldexp ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      mix ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342        'inF2' ( in 3X3 matrix of float)
+0:342      log ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      matrix-scale ( temp 3X3 matrix of float)
+0:342        log2 ( temp 3X3 matrix of float)
+0:342          'inF0' ( in 3X3 matrix of float)
+0:342        Constant:
+0:342          0.301030
+0:342      log2 ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      max ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      min ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      pow ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      radians ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      roundEven ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      inverse sqrt ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      clamp ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        Constant:
+0:342          0.000000
+0:342        Constant:
+0:342          1.000000
+0:342      Sign ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      sine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      Sequence
+0:342        move second child to first child ( temp 3X3 matrix of float)
+0:342          'inF1' ( in 3X3 matrix of float)
+0:342          sine ( temp 3X3 matrix of float)
+0:342            'inF0' ( in 3X3 matrix of float)
+0:342        move second child to first child ( temp 3X3 matrix of float)
+0:342          'inF2' ( in 3X3 matrix of float)
+0:342          cosine ( temp 3X3 matrix of float)
+0:342            'inF0' ( in 3X3 matrix of float)
+0:342      hyp. sine ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      smoothstep ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342        'inF2' ( in 3X3 matrix of float)
+0:342      sqrt ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      step ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342        'inF1' ( in 3X3 matrix of float)
+0:342      tangent ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      hyp. tangent ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      transpose ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:342      trunc ( temp 3X3 matrix of float)
+0:342        'inF0' ( in 3X3 matrix of float)
+0:345      Branch: Return with expression
 0:?         Constant:
 0:?           3.000000
 0:?           3.000000
@@ -2345,131 +2306,128 @@ Shader version: 450
 0:?           3.000000
 0:?           3.000000
 0:?           3.000000
-0:354  Function Definition: VertexShaderFunction4x4(mf44;mf44;mf44; ( temp 4X4 matrix of float)
-0:354    Function Parameters: 
-0:354      'inF0' ( in 4X4 matrix of float)
-0:354      'inF1' ( in 4X4 matrix of float)
-0:354      'inF2' ( in 4X4 matrix of float)
+0:349  Function Definition: VertexShaderFunction4x4(mf44;mf44;mf44; ( temp 4X4 matrix of float)
+0:349    Function Parameters: 
+0:349      'inF0' ( in 4X4 matrix of float)
+0:349      'inF1' ( in 4X4 matrix of float)
+0:349      'inF2' ( in 4X4 matrix of float)
 0:?     Sequence
-0:356      all ( temp bool)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      Absolute value ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      arc cosine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      any ( temp bool)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      arc sine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      arc tangent ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      arc tangent ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      Ceiling ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      clamp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356        'inF2' ( in 4X4 matrix of float)
-0:356      cosine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      hyp. cosine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      degrees ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      determinant ( temp float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      exp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      exp2 ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      findMSB ( temp int)
-0:356        Constant:
-0:356          7 (const int)
-0:356      findLSB ( temp int)
-0:356        Constant:
-0:356          7 (const int)
-0:356      Floor ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      mod ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      Fraction ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      frexp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      ldexp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      mix ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356        'inF2' ( in 4X4 matrix of float)
-0:356      log ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      matrix-scale ( temp 4X4 matrix of float)
-0:356        log2 ( temp 4X4 matrix of float)
-0:356          'inF0' ( in 4X4 matrix of float)
-0:356        Constant:
-0:356          0.301030
-0:356      log2 ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      max ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      min ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      pow ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      radians ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      roundEven ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      inverse sqrt ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      clamp ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        Constant:
-0:356          0.000000
-0:356        Constant:
-0:356          1.000000
-0:356      Sign ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      sine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      Sequence
-0:356        move second child to first child ( temp 4X4 matrix of float)
-0:356          'inF1' ( in 4X4 matrix of float)
-0:356          sine ( temp 4X4 matrix of float)
-0:356            'inF0' ( in 4X4 matrix of float)
-0:356        move second child to first child ( temp 4X4 matrix of float)
-0:356          'inF2' ( in 4X4 matrix of float)
-0:356          cosine ( temp 4X4 matrix of float)
-0:356            'inF0' ( in 4X4 matrix of float)
-0:356      hyp. sine ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      smoothstep ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356        'inF2' ( in 4X4 matrix of float)
-0:356      sqrt ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      step ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356        'inF1' ( in 4X4 matrix of float)
-0:356      tangent ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      hyp. tangent ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      transpose ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:356      trunc ( temp 4X4 matrix of float)
-0:356        'inF0' ( in 4X4 matrix of float)
-0:359      Branch: Return with expression
+0:351      all ( temp bool)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      Absolute value ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      arc cosine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      any ( temp bool)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      arc sine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      arc tangent ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      arc tangent ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      Ceiling ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      clamp ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351        'inF2' ( in 4X4 matrix of float)
+0:351      cosine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      hyp. cosine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      degrees ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      determinant ( temp float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      exp ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      exp2 ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      findMSB ( temp int)
+0:351        Constant:
+0:351          7 (const int)
+0:351      findLSB ( temp int)
+0:351        Constant:
+0:351          7 (const int)
+0:351      Floor ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      mod ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      Fraction ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      ldexp ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      mix ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351        'inF2' ( in 4X4 matrix of float)
+0:351      log ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      matrix-scale ( temp 4X4 matrix of float)
+0:351        log2 ( temp 4X4 matrix of float)
+0:351          'inF0' ( in 4X4 matrix of float)
+0:351        Constant:
+0:351          0.301030
+0:351      log2 ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      max ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      min ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      pow ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      radians ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      roundEven ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      inverse sqrt ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      clamp ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        Constant:
+0:351          0.000000
+0:351        Constant:
+0:351          1.000000
+0:351      Sign ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      sine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      Sequence
+0:351        move second child to first child ( temp 4X4 matrix of float)
+0:351          'inF1' ( in 4X4 matrix of float)
+0:351          sine ( temp 4X4 matrix of float)
+0:351            'inF0' ( in 4X4 matrix of float)
+0:351        move second child to first child ( temp 4X4 matrix of float)
+0:351          'inF2' ( in 4X4 matrix of float)
+0:351          cosine ( temp 4X4 matrix of float)
+0:351            'inF0' ( in 4X4 matrix of float)
+0:351      hyp. sine ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      smoothstep ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351        'inF2' ( in 4X4 matrix of float)
+0:351      sqrt ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      step ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351        'inF1' ( in 4X4 matrix of float)
+0:351      tangent ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      hyp. tangent ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      transpose ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:351      trunc ( temp 4X4 matrix of float)
+0:351        'inF0' ( in 4X4 matrix of float)
+0:354      Branch: Return with expression
 0:?         Constant:
 0:?           4.000000
 0:?           4.000000
@@ -2487,319 +2445,320 @@ Shader version: 450
 0:?           4.000000
 0:?           4.000000
 0:?           4.000000
-0:377  Function Definition: TestGenMul2(f1;f1;vf2;vf2;mf22;mf22; ( temp void)
-0:377    Function Parameters: 
-0:377      'inF0' ( in float)
-0:377      'inF1' ( in float)
-0:377      'inFV0' ( in 2-component vector of float)
-0:377      'inFV1' ( in 2-component vector of float)
-0:377      'inFM0' ( in 2X2 matrix of float)
-0:377      'inFM1' ( in 2X2 matrix of float)
+0:372  Function Definition: TestGenMul2(f1;f1;vf2;vf2;mf22;mf22; ( temp void)
+0:372    Function Parameters: 
+0:372      'inF0' ( in float)
+0:372      'inF1' ( in float)
+0:372      'inFV0' ( in 2-component vector of float)
+0:372      'inFV1' ( in 2-component vector of float)
+0:372      'inFM0' ( in 2X2 matrix of float)
+0:372      'inFM1' ( in 2X2 matrix of float)
 0:?     Sequence
-0:378      Sequence
-0:378        move second child to first child ( temp float)
-0:378          'r0' ( temp float)
-0:378          component-wise multiply ( temp float)
-0:378            'inF1' ( in float)
-0:378            'inF0' ( in float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2-component vector of float)
-0:378          'r1' ( temp 2-component vector of float)
-0:378          vector-scale ( temp 2-component vector of float)
-0:378            'inF0' ( in float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2-component vector of float)
-0:378          'r2' ( temp 2-component vector of float)
-0:378          vector-scale ( temp 2-component vector of float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378            'inF0' ( in float)
-0:378      Sequence
-0:378        move second child to first child ( temp float)
-0:378          'r3' ( temp float)
-0:378          dot-product ( temp float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378            'inFV1' ( in 2-component vector of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2-component vector of float)
-0:378          'r4' ( temp 2-component vector of float)
-0:378          vector-times-matrix ( temp 2-component vector of float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2-component vector of float)
-0:378          'r5' ( temp 2-component vector of float)
-0:378          matrix-times-vector ( temp 2-component vector of float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:378            'inFV0' ( in 2-component vector of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2X2 matrix of float)
-0:378          'r6' ( temp 2X2 matrix of float)
-0:378          matrix-scale ( temp 2X2 matrix of float)
-0:378            'inF0' ( in float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2X2 matrix of float)
-0:378          'r7' ( temp 2X2 matrix of float)
-0:378          matrix-scale ( temp 2X2 matrix of float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:378            'inF0' ( in float)
-0:378      Sequence
-0:378        move second child to first child ( temp 2X2 matrix of float)
-0:378          'r8' ( temp 2X2 matrix of float)
-0:378          matrix-multiply ( temp 2X2 matrix of float)
-0:378            'inFM1' ( in 2X2 matrix of float)
-0:378            'inFM0' ( in 2X2 matrix of float)
-0:384  Function Definition: TestGenMul3(f1;f1;vf3;vf3;mf33;mf33; ( temp void)
-0:384    Function Parameters: 
-0:384      'inF0' ( in float)
-0:384      'inF1' ( in float)
-0:384      'inFV0' ( in 3-component vector of float)
-0:384      'inFV1' ( in 3-component vector of float)
-0:384      'inFM0' ( in 3X3 matrix of float)
-0:384      'inFM1' ( in 3X3 matrix of float)
+0:373      Sequence
+0:373        move second child to first child ( temp float)
+0:373          'r0' ( temp float)
+0:373          component-wise multiply ( temp float)
+0:373            'inF1' ( in float)
+0:373            'inF0' ( in float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2-component vector of float)
+0:373          'r1' ( temp 2-component vector of float)
+0:373          vector-scale ( temp 2-component vector of float)
+0:373            'inF0' ( in float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2-component vector of float)
+0:373          'r2' ( temp 2-component vector of float)
+0:373          vector-scale ( temp 2-component vector of float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373            'inF0' ( in float)
+0:373      Sequence
+0:373        move second child to first child ( temp float)
+0:373          'r3' ( temp float)
+0:373          dot-product ( temp float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373            'inFV1' ( in 2-component vector of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2-component vector of float)
+0:373          'r4' ( temp 2-component vector of float)
+0:373          vector-times-matrix ( temp 2-component vector of float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2-component vector of float)
+0:373          'r5' ( temp 2-component vector of float)
+0:373          matrix-times-vector ( temp 2-component vector of float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:373            'inFV0' ( in 2-component vector of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2X2 matrix of float)
+0:373          'r6' ( temp 2X2 matrix of float)
+0:373          matrix-scale ( temp 2X2 matrix of float)
+0:373            'inF0' ( in float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2X2 matrix of float)
+0:373          'r7' ( temp 2X2 matrix of float)
+0:373          matrix-scale ( temp 2X2 matrix of float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:373            'inF0' ( in float)
+0:373      Sequence
+0:373        move second child to first child ( temp 2X2 matrix of float)
+0:373          'r8' ( temp 2X2 matrix of float)
+0:373          matrix-multiply ( temp 2X2 matrix of float)
+0:373            'inFM1' ( in 2X2 matrix of float)
+0:373            'inFM0' ( in 2X2 matrix of float)
+0:379  Function Definition: TestGenMul3(f1;f1;vf3;vf3;mf33;mf33; ( temp void)
+0:379    Function Parameters: 
+0:379      'inF0' ( in float)
+0:379      'inF1' ( in float)
+0:379      'inFV0' ( in 3-component vector of float)
+0:379      'inFV1' ( in 3-component vector of float)
+0:379      'inFM0' ( in 3X3 matrix of float)
+0:379      'inFM1' ( in 3X3 matrix of float)
 0:?     Sequence
-0:385      Sequence
-0:385        move second child to first child ( temp float)
-0:385          'r0' ( temp float)
-0:385          component-wise multiply ( temp float)
-0:385            'inF1' ( in float)
-0:385            'inF0' ( in float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3-component vector of float)
-0:385          'r1' ( temp 3-component vector of float)
-0:385          vector-scale ( temp 3-component vector of float)
-0:385            'inF0' ( in float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3-component vector of float)
-0:385          'r2' ( temp 3-component vector of float)
-0:385          vector-scale ( temp 3-component vector of float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385            'inF0' ( in float)
-0:385      Sequence
-0:385        move second child to first child ( temp float)
-0:385          'r3' ( temp float)
-0:385          dot-product ( temp float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385            'inFV1' ( in 3-component vector of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3-component vector of float)
-0:385          'r4' ( temp 3-component vector of float)
-0:385          vector-times-matrix ( temp 3-component vector of float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3-component vector of float)
-0:385          'r5' ( temp 3-component vector of float)
-0:385          matrix-times-vector ( temp 3-component vector of float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:385            'inFV0' ( in 3-component vector of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3X3 matrix of float)
-0:385          'r6' ( temp 3X3 matrix of float)
-0:385          matrix-scale ( temp 3X3 matrix of float)
-0:385            'inF0' ( in float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3X3 matrix of float)
-0:385          'r7' ( temp 3X3 matrix of float)
-0:385          matrix-scale ( temp 3X3 matrix of float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:385            'inF0' ( in float)
-0:385      Sequence
-0:385        move second child to first child ( temp 3X3 matrix of float)
-0:385          'r8' ( temp 3X3 matrix of float)
-0:385          matrix-multiply ( temp 3X3 matrix of float)
-0:385            'inFM1' ( in 3X3 matrix of float)
-0:385            'inFM0' ( in 3X3 matrix of float)
-0:391  Function Definition: TestGenMul4(f1;f1;vf4;vf4;mf44;mf44; ( temp void)
-0:391    Function Parameters: 
-0:391      'inF0' ( in float)
-0:391      'inF1' ( in float)
-0:391      'inFV0' ( in 4-component vector of float)
-0:391      'inFV1' ( in 4-component vector of float)
-0:391      'inFM0' ( in 4X4 matrix of float)
-0:391      'inFM1' ( in 4X4 matrix of float)
+0:380      Sequence
+0:380        move second child to first child ( temp float)
+0:380          'r0' ( temp float)
+0:380          component-wise multiply ( temp float)
+0:380            'inF1' ( in float)
+0:380            'inF0' ( in float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3-component vector of float)
+0:380          'r1' ( temp 3-component vector of float)
+0:380          vector-scale ( temp 3-component vector of float)
+0:380            'inF0' ( in float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3-component vector of float)
+0:380          'r2' ( temp 3-component vector of float)
+0:380          vector-scale ( temp 3-component vector of float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380            'inF0' ( in float)
+0:380      Sequence
+0:380        move second child to first child ( temp float)
+0:380          'r3' ( temp float)
+0:380          dot-product ( temp float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380            'inFV1' ( in 3-component vector of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3-component vector of float)
+0:380          'r4' ( temp 3-component vector of float)
+0:380          vector-times-matrix ( temp 3-component vector of float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3-component vector of float)
+0:380          'r5' ( temp 3-component vector of float)
+0:380          matrix-times-vector ( temp 3-component vector of float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:380            'inFV0' ( in 3-component vector of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3X3 matrix of float)
+0:380          'r6' ( temp 3X3 matrix of float)
+0:380          matrix-scale ( temp 3X3 matrix of float)
+0:380            'inF0' ( in float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3X3 matrix of float)
+0:380          'r7' ( temp 3X3 matrix of float)
+0:380          matrix-scale ( temp 3X3 matrix of float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:380            'inF0' ( in float)
+0:380      Sequence
+0:380        move second child to first child ( temp 3X3 matrix of float)
+0:380          'r8' ( temp 3X3 matrix of float)
+0:380          matrix-multiply ( temp 3X3 matrix of float)
+0:380            'inFM1' ( in 3X3 matrix of float)
+0:380            'inFM0' ( in 3X3 matrix of float)
+0:386  Function Definition: TestGenMul4(f1;f1;vf4;vf4;mf44;mf44; ( temp void)
+0:386    Function Parameters: 
+0:386      'inF0' ( in float)
+0:386      'inF1' ( in float)
+0:386      'inFV0' ( in 4-component vector of float)
+0:386      'inFV1' ( in 4-component vector of float)
+0:386      'inFM0' ( in 4X4 matrix of float)
+0:386      'inFM1' ( in 4X4 matrix of float)
 0:?     Sequence
-0:392      Sequence
-0:392        move second child to first child ( temp float)
-0:392          'r0' ( temp float)
-0:392          component-wise multiply ( temp float)
-0:392            'inF1' ( in float)
-0:392            'inF0' ( in float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4-component vector of float)
-0:392          'r1' ( temp 4-component vector of float)
-0:392          vector-scale ( temp 4-component vector of float)
-0:392            'inF0' ( in float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4-component vector of float)
-0:392          'r2' ( temp 4-component vector of float)
-0:392          vector-scale ( temp 4-component vector of float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392            'inF0' ( in float)
-0:392      Sequence
-0:392        move second child to first child ( temp float)
-0:392          'r3' ( temp float)
-0:392          dot-product ( temp float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392            'inFV1' ( in 4-component vector of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4-component vector of float)
-0:392          'r4' ( temp 4-component vector of float)
-0:392          vector-times-matrix ( temp 4-component vector of float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4-component vector of float)
-0:392          'r5' ( temp 4-component vector of float)
-0:392          matrix-times-vector ( temp 4-component vector of float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:392            'inFV0' ( in 4-component vector of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4X4 matrix of float)
-0:392          'r6' ( temp 4X4 matrix of float)
-0:392          matrix-scale ( temp 4X4 matrix of float)
-0:392            'inF0' ( in float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4X4 matrix of float)
-0:392          'r7' ( temp 4X4 matrix of float)
-0:392          matrix-scale ( temp 4X4 matrix of float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:392            'inF0' ( in float)
-0:392      Sequence
-0:392        move second child to first child ( temp 4X4 matrix of float)
-0:392          'r8' ( temp 4X4 matrix of float)
-0:392          matrix-multiply ( temp 4X4 matrix of float)
-0:392            'inFM1' ( in 4X4 matrix of float)
-0:392            'inFM0' ( in 4X4 matrix of float)
-0:401  Function Definition: TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24; ( temp void)
-0:401    Function Parameters: 
-0:401      'inF0' ( in float)
-0:401      'inF1' ( in float)
-0:401      'inFV2' ( in 2-component vector of float)
-0:401      'inFV3' ( in 3-component vector of float)
-0:401      'inFM2x3' ( in 2X3 matrix of float)
-0:401      'inFM3x2' ( in 3X2 matrix of float)
-0:401      'inFM3x3' ( in 3X3 matrix of float)
-0:401      'inFM3x4' ( in 3X4 matrix of float)
-0:401      'inFM2x4' ( in 2X4 matrix of float)
+0:387      Sequence
+0:387        move second child to first child ( temp float)
+0:387          'r0' ( temp float)
+0:387          component-wise multiply ( temp float)
+0:387            'inF1' ( in float)
+0:387            'inF0' ( in float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4-component vector of float)
+0:387          'r1' ( temp 4-component vector of float)
+0:387          vector-scale ( temp 4-component vector of float)
+0:387            'inF0' ( in float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4-component vector of float)
+0:387          'r2' ( temp 4-component vector of float)
+0:387          vector-scale ( temp 4-component vector of float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387            'inF0' ( in float)
+0:387      Sequence
+0:387        move second child to first child ( temp float)
+0:387          'r3' ( temp float)
+0:387          dot-product ( temp float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387            'inFV1' ( in 4-component vector of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4-component vector of float)
+0:387          'r4' ( temp 4-component vector of float)
+0:387          vector-times-matrix ( temp 4-component vector of float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4-component vector of float)
+0:387          'r5' ( temp 4-component vector of float)
+0:387          matrix-times-vector ( temp 4-component vector of float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:387            'inFV0' ( in 4-component vector of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4X4 matrix of float)
+0:387          'r6' ( temp 4X4 matrix of float)
+0:387          matrix-scale ( temp 4X4 matrix of float)
+0:387            'inF0' ( in float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4X4 matrix of float)
+0:387          'r7' ( temp 4X4 matrix of float)
+0:387          matrix-scale ( temp 4X4 matrix of float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:387            'inF0' ( in float)
+0:387      Sequence
+0:387        move second child to first child ( temp 4X4 matrix of float)
+0:387          'r8' ( temp 4X4 matrix of float)
+0:387          matrix-multiply ( temp 4X4 matrix of float)
+0:387            'inFM1' ( in 4X4 matrix of float)
+0:387            'inFM0' ( in 4X4 matrix of float)
+0:396  Function Definition: TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24; ( temp void)
+0:396    Function Parameters: 
+0:396      'inF0' ( in float)
+0:396      'inF1' ( in float)
+0:396      'inFV2' ( in 2-component vector of float)
+0:396      'inFV3' ( in 3-component vector of float)
+0:396      'inFM2x3' ( in 2X3 matrix of float)
+0:396      'inFM3x2' ( in 3X2 matrix of float)
+0:396      'inFM3x3' ( in 3X3 matrix of float)
+0:396      'inFM3x4' ( in 3X4 matrix of float)
+0:396      'inFM2x4' ( in 2X4 matrix of float)
 0:?     Sequence
+0:397      Sequence
+0:397        move second child to first child ( temp float)
+0:397          'r00' ( temp float)
+0:397          component-wise multiply ( temp float)
+0:397            'inF1' ( in float)
+0:397            'inF0' ( in float)
+0:398      Sequence
+0:398        move second child to first child ( temp 2-component vector of float)
+0:398          'r01' ( temp 2-component vector of float)
+0:398          vector-scale ( temp 2-component vector of float)
+0:398            'inF0' ( in float)
+0:398            'inFV2' ( in 2-component vector of float)
+0:399      Sequence
+0:399        move second child to first child ( temp 3-component vector of float)
+0:399          'r02' ( temp 3-component vector of float)
+0:399          vector-scale ( temp 3-component vector of float)
+0:399            'inF0' ( in float)
+0:399            'inFV3' ( in 3-component vector of float)
+0:400      Sequence
+0:400        move second child to first child ( temp 2-component vector of float)
+0:400          'r03' ( temp 2-component vector of float)
+0:400          vector-scale ( temp 2-component vector of float)
+0:400            'inFV2' ( in 2-component vector of float)
+0:400            'inF0' ( in float)
+0:401      Sequence
+0:401        move second child to first child ( temp 3-component vector of float)
+0:401          'r04' ( temp 3-component vector of float)
+0:401          vector-scale ( temp 3-component vector of float)
+0:401            'inFV3' ( in 3-component vector of float)
+0:401            'inF0' ( in float)
 0:402      Sequence
 0:402        move second child to first child ( temp float)
-0:402          'r00' ( temp float)
-0:402          component-wise multiply ( temp float)
-0:402            'inF1' ( in float)
-0:402            'inF0' ( in float)
+0:402          'r05' ( temp float)
+0:402          dot-product ( temp float)
+0:402            'inFV2' ( in 2-component vector of float)
+0:402            'inFV2' ( in 2-component vector of float)
 0:403      Sequence
-0:403        move second child to first child ( temp 2-component vector of float)
-0:403          'r01' ( temp 2-component vector of float)
-0:403          vector-scale ( temp 2-component vector of float)
-0:403            'inF0' ( in float)
-0:403            'inFV2' ( in 2-component vector of float)
+0:403        move second child to first child ( temp float)
+0:403          'r06' ( temp float)
+0:403          dot-product ( temp float)
+0:403            'inFV3' ( in 3-component vector of float)
+0:403            'inFV3' ( in 3-component vector of float)
 0:404      Sequence
 0:404        move second child to first child ( temp 3-component vector of float)
-0:404          'r02' ( temp 3-component vector of float)
-0:404          vector-scale ( temp 3-component vector of float)
-0:404            'inF0' ( in float)
-0:404            'inFV3' ( in 3-component vector of float)
+0:404          'r07' ( temp 3-component vector of float)
+0:404          matrix-times-vector ( temp 3-component vector of float)
+0:404            'inFM2x3' ( in 2X3 matrix of float)
+0:404            'inFV2' ( in 2-component vector of float)
 0:405      Sequence
 0:405        move second child to first child ( temp 2-component vector of float)
-0:405          'r03' ( temp 2-component vector of float)
-0:405          vector-scale ( temp 2-component vector of float)
-0:405            'inFV2' ( in 2-component vector of float)
-0:405            'inF0' ( in float)
+0:405          'r08' ( temp 2-component vector of float)
+0:405          matrix-times-vector ( temp 2-component vector of float)
+0:405            'inFM3x2' ( in 3X2 matrix of float)
+0:405            'inFV3' ( in 3-component vector of float)
 0:406      Sequence
-0:406        move second child to first child ( temp 3-component vector of float)
-0:406          'r04' ( temp 3-component vector of float)
-0:406          vector-scale ( temp 3-component vector of float)
+0:406        move second child to first child ( temp 2-component vector of float)
+0:406          'r09' ( temp 2-component vector of float)
+0:406          vector-times-matrix ( temp 2-component vector of float)
 0:406            'inFV3' ( in 3-component vector of float)
-0:406            'inF0' ( in float)
+0:406            'inFM2x3' ( in 2X3 matrix of float)
 0:407      Sequence
-0:407        move second child to first child ( temp float)
-0:407          'r05' ( temp float)
-0:407          dot-product ( temp float)
-0:407            'inFV2' ( in 2-component vector of float)
+0:407        move second child to first child ( temp 3-component vector of float)
+0:407          'r10' ( temp 3-component vector of float)
+0:407          vector-times-matrix ( temp 3-component vector of float)
 0:407            'inFV2' ( in 2-component vector of float)
+0:407            'inFM3x2' ( in 3X2 matrix of float)
 0:408      Sequence
-0:408        move second child to first child ( temp float)
-0:408          'r06' ( temp float)
-0:408          dot-product ( temp float)
-0:408            'inFV3' ( in 3-component vector of float)
-0:408            'inFV3' ( in 3-component vector of float)
+0:408        move second child to first child ( temp 2X3 matrix of float)
+0:408          'r11' ( temp 2X3 matrix of float)
+0:408          matrix-scale ( temp 2X3 matrix of float)
+0:408            'inF0' ( in float)
+0:408            'inFM2x3' ( in 2X3 matrix of float)
 0:409      Sequence
-0:409        move second child to first child ( temp 3-component vector of float)
-0:409          'r07' ( temp 3-component vector of float)
-0:409          matrix-times-vector ( temp 3-component vector of float)
-0:409            'inFM2x3' ( in 2X3 matrix of float)
-0:409            'inFV2' ( in 2-component vector of float)
+0:409        move second child to first child ( temp 3X2 matrix of float)
+0:409          'r12' ( temp 3X2 matrix of float)
+0:409          matrix-scale ( temp 3X2 matrix of float)
+0:409            'inF0' ( in float)
+0:409            'inFM3x2' ( in 3X2 matrix of float)
 0:410      Sequence
-0:410        move second child to first child ( temp 2-component vector of float)
-0:410          'r08' ( temp 2-component vector of float)
-0:410          matrix-times-vector ( temp 2-component vector of float)
+0:410        move second child to first child ( temp 2X2 matrix of float)
+0:410          'r13' ( temp 2X2 matrix of float)
+0:410          matrix-multiply ( temp 2X2 matrix of float)
 0:410            'inFM3x2' ( in 3X2 matrix of float)
-0:410            'inFV3' ( in 3-component vector of float)
+0:410            'inFM2x3' ( in 2X3 matrix of float)
 0:411      Sequence
-0:411        move second child to first child ( temp 2-component vector of float)
-0:411          'r09' ( temp 2-component vector of float)
-0:411          vector-times-matrix ( temp 2-component vector of float)
-0:411            'inFV3' ( in 3-component vector of float)
+0:411        move second child to first child ( temp 2X3 matrix of float)
+0:411          'r14' ( temp 2X3 matrix of float)
+0:411          matrix-multiply ( temp 2X3 matrix of float)
+0:411            'inFM3x3' ( in 3X3 matrix of float)
 0:411            'inFM2x3' ( in 2X3 matrix of float)
 0:412      Sequence
-0:412        move second child to first child ( temp 3-component vector of float)
-0:412          'r10' ( temp 3-component vector of float)
-0:412          vector-times-matrix ( temp 3-component vector of float)
-0:412            'inFV2' ( in 2-component vector of float)
-0:412            'inFM3x2' ( in 3X2 matrix of float)
+0:412        move second child to first child ( temp 2X4 matrix of float)
+0:412          'r15' ( temp 2X4 matrix of float)
+0:412          matrix-multiply ( temp 2X4 matrix of float)
+0:412            'inFM3x4' ( in 3X4 matrix of float)
+0:412            'inFM2x3' ( in 2X3 matrix of float)
 0:413      Sequence
-0:413        move second child to first child ( temp 2X3 matrix of float)
-0:413          'r11' ( temp 2X3 matrix of float)
-0:413          matrix-scale ( temp 2X3 matrix of float)
-0:413            'inF0' ( in float)
-0:413            'inFM2x3' ( in 2X3 matrix of float)
-0:414      Sequence
-0:414        move second child to first child ( temp 3X2 matrix of float)
-0:414          'r12' ( temp 3X2 matrix of float)
-0:414          matrix-scale ( temp 3X2 matrix of float)
-0:414            'inF0' ( in float)
-0:414            'inFM3x2' ( in 3X2 matrix of float)
-0:415      Sequence
-0:415        move second child to first child ( temp 2X2 matrix of float)
-0:415          'r13' ( temp 2X2 matrix of float)
-0:415          matrix-multiply ( temp 2X2 matrix of float)
-0:415            'inFM3x2' ( in 3X2 matrix of float)
-0:415            'inFM2x3' ( in 2X3 matrix of float)
-0:416      Sequence
-0:416        move second child to first child ( temp 2X3 matrix of float)
-0:416          'r14' ( temp 2X3 matrix of float)
-0:416          matrix-multiply ( temp 2X3 matrix of float)
-0:416            'inFM3x3' ( in 3X3 matrix of float)
-0:416            'inFM2x3' ( in 2X3 matrix of float)
-0:417      Sequence
-0:417        move second child to first child ( temp 2X4 matrix of float)
-0:417          'r15' ( temp 2X4 matrix of float)
-0:417          matrix-multiply ( temp 2X4 matrix of float)
-0:417            'inFM3x4' ( in 3X4 matrix of float)
-0:417            'inFM2x3' ( in 2X3 matrix of float)
-0:418      Sequence
-0:418        move second child to first child ( temp 3X4 matrix of float)
-0:418          'r16' ( temp 3X4 matrix of float)
-0:418          matrix-multiply ( temp 3X4 matrix of float)
-0:418            'inFM2x4' ( in 2X4 matrix of float)
-0:418            'inFM3x2' ( in 3X2 matrix of float)
+0:413        move second child to first child ( temp 3X4 matrix of float)
+0:413          'r16' ( temp 3X4 matrix of float)
+0:413          matrix-multiply ( temp 3X4 matrix of float)
+0:413            'inFM2x4' ( in 2X4 matrix of float)
+0:413            'inFM3x2' ( in 3X2 matrix of float)
 0:?   Linker Objects
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 1240
+// Id's are bound by 1205
 
                               Capability Shader
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "VertexShaderFunction"
+                              Source HLSL 500
                               Name 4  "VertexShaderFunction"
                               Name 16  "VertexShaderFunctionS(f1;f1;f1;u1;u1;"
                               Name 11  "inF0"
@@ -2872,57 +2831,50 @@ Shader version: 450
                               Name 126  "inFM3x3"
                               Name 127  "inFM3x4"
                               Name 128  "inFM2x4"
-                              Name 182  "ResType"
-                              Name 316  "ResType"
-                              Name 464  "ResType"
-                              Name 620  "ResType"
-                              Name 753  "ResType"
-                              Name 873  "ResType"
-                              Name 996  "ResType"
-                              Name 1064  "r0"
-                              Name 1068  "r1"
-                              Name 1072  "r2"
-                              Name 1076  "r3"
-                              Name 1080  "r4"
-                              Name 1084  "r5"
-                              Name 1088  "r6"
-                              Name 1092  "r7"
-                              Name 1096  "r8"
-                              Name 1100  "r0"
-                              Name 1104  "r1"
-                              Name 1108  "r2"
-                              Name 1112  "r3"
-                              Name 1116  "r4"
-                              Name 1120  "r5"
-                              Name 1124  "r6"
-                              Name 1128  "r7"
-                              Name 1132  "r8"
-                              Name 1136  "r0"
-                              Name 1140  "r1"
-                              Name 1144  "r2"
-                              Name 1148  "r3"
-                              Name 1152  "r4"
-                              Name 1156  "r5"
-                              Name 1160  "r6"
-                              Name 1164  "r7"
-                              Name 1168  "r8"
-                              Name 1172  "r00"
-                              Name 1176  "r01"
-                              Name 1180  "r02"
-                              Name 1184  "r03"
-                              Name 1188  "r04"
-                              Name 1192  "r05"
-                              Name 1196  "r06"
-                              Name 1200  "r07"
-                              Name 1204  "r08"
-                              Name 1208  "r09"
-                              Name 1212  "r10"
-                              Name 1216  "r11"
-                              Name 1220  "r12"
-                              Name 1224  "r13"
-                              Name 1228  "r14"
-                              Name 1232  "r15"
-                              Name 1236  "r16"
+                              Name 1029  "r0"
+                              Name 1033  "r1"
+                              Name 1037  "r2"
+                              Name 1041  "r3"
+                              Name 1045  "r4"
+                              Name 1049  "r5"
+                              Name 1053  "r6"
+                              Name 1057  "r7"
+                              Name 1061  "r8"
+                              Name 1065  "r0"
+                              Name 1069  "r1"
+                              Name 1073  "r2"
+                              Name 1077  "r3"
+                              Name 1081  "r4"
+                              Name 1085  "r5"
+                              Name 1089  "r6"
+                              Name 1093  "r7"
+                              Name 1097  "r8"
+                              Name 1101  "r0"
+                              Name 1105  "r1"
+                              Name 1109  "r2"
+                              Name 1113  "r3"
+                              Name 1117  "r4"
+                              Name 1121  "r5"
+                              Name 1125  "r6"
+                              Name 1129  "r7"
+                              Name 1133  "r8"
+                              Name 1137  "r00"
+                              Name 1141  "r01"
+                              Name 1145  "r02"
+                              Name 1149  "r03"
+                              Name 1153  "r04"
+                              Name 1157  "r05"
+                              Name 1161  "r06"
+                              Name 1165  "r07"
+                              Name 1169  "r08"
+                              Name 1173  "r09"
+                              Name 1177  "r10"
+                              Name 1181  "r11"
+                              Name 1185  "r12"
+                              Name 1189  "r13"
+                              Name 1193  "r14"
+                              Name 1197  "r15"
+                              Name 1201  "r16"
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -2970,48 +2922,41 @@ Shader version: 450
              132:             TypeBool
              143:             TypeInt 32 1
              164:    143(int) Constant 7
-    182(ResType):             TypeStruct 6(float) 143(int)
-             201:    6(float) Constant 1050288283
-             216:    143(int) Constant 2
-             223:    6(float) Constant 0
-             224:    6(float) Constant 1065353216
-             266:             TypeVector 143(int) 2
-             287:    143(int) Constant 3
-             288:  266(ivec2) ConstantComposite 164 287
-    316(ResType):             TypeStruct 24(fvec2) 266(ivec2)
-             321:             TypeVector 132(bool) 2
-             359:    6(float) Constant 1073741824
-             361:    143(int) Constant 1
-             362:  266(ivec2) ConstantComposite 361 216
-             397:   24(fvec2) ConstantComposite 224 359
-             411:             TypeVector 143(int) 3
-             432:    143(int) Constant 5
-             433:  411(ivec3) ConstantComposite 164 287 432
-    464(ResType):             TypeStruct 36(fvec3) 411(ivec3)
-             469:             TypeVector 132(bool) 3
-             508:  411(ivec3) ConstantComposite 361 216 287
-             543:    6(float) Constant 1077936128
-             544:   36(fvec3) ConstantComposite 224 359 543
-             558:             TypeVector 143(int) 4
-             579:  558(ivec4) ConstantComposite 164 287 432 216
-             589:      8(int) Constant 1
-             595:      8(int) Constant 2
-             598:      8(int) Constant 3
-    620(ResType):             TypeStruct 48(fvec4) 558(ivec4)
-             625:             TypeVector 132(bool) 4
-             664:    143(int) Constant 4
-             665:  558(ivec4) ConstantComposite 361 216 287 664
-             700:    6(float) Constant 1082130432
-             701:   48(fvec4) ConstantComposite 224 359 543 700
-    753(ResType):             TypeStruct 60 266(ivec2)
-             817:   24(fvec2) ConstantComposite 359 359
-             818:          60 ConstantComposite 817 817
-    873(ResType):             TypeStruct 68 411(ivec3)
-             937:   36(fvec3) ConstantComposite 543 543 543
-             938:          68 ConstantComposite 937 937 937
-    996(ResType):             TypeStruct 76 558(ivec4)
-            1060:   48(fvec4) ConstantComposite 700 700 700 700
-            1061:          76 ConstantComposite 1060 1060 1060 1060
+             196:    6(float) Constant 1050288283
+             211:    143(int) Constant 2
+             218:    6(float) Constant 0
+             219:    6(float) Constant 1065353216
+             261:             TypeVector 143(int) 2
+             282:    143(int) Constant 3
+             283:  261(ivec2) ConstantComposite 164 282
+             311:             TypeVector 132(bool) 2
+             349:    6(float) Constant 1073741824
+             351:    143(int) Constant 1
+             352:  261(ivec2) ConstantComposite 351 211
+             387:   24(fvec2) ConstantComposite 219 349
+             401:             TypeVector 143(int) 3
+             422:    143(int) Constant 5
+             423:  401(ivec3) ConstantComposite 164 282 422
+             454:             TypeVector 132(bool) 3
+             493:  401(ivec3) ConstantComposite 351 211 282
+             528:    6(float) Constant 1077936128
+             529:   36(fvec3) ConstantComposite 219 349 528
+             543:             TypeVector 143(int) 4
+             564:  543(ivec4) ConstantComposite 164 282 422 211
+             574:      8(int) Constant 1
+             580:      8(int) Constant 2
+             583:      8(int) Constant 3
+             605:             TypeVector 132(bool) 4
+             644:    143(int) Constant 4
+             645:  543(ivec4) ConstantComposite 351 211 282 644
+             680:    6(float) Constant 1082130432
+             681:   48(fvec4) ConstantComposite 219 349 528 680
+             792:   24(fvec2) ConstantComposite 349 349
+             793:          60 ConstantComposite 792 792
+             907:   36(fvec3) ConstantComposite 528 528 528
+             908:          68 ConstantComposite 907 907 907
+            1025:   48(fvec4) ConstantComposite 680 680 680 680
+            1026:          76 ConstantComposite 1025 1025 1025 1025
 4(VertexShaderFunction):           2 Function None 3
                5:             Label
                               Return
@@ -3071,81 +3016,76 @@ Shader version: 450
              179:    6(float) Load 11(inF0)
              180:    6(float) ExtInst 1(GLSL.std.450) 10(Fract) 179
              181:    6(float) Load 11(inF0)
-             183:182(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 181
-             184:    143(int) CompositeExtract 183 1
-                              Store 12(inF1) 184
-             185:    6(float) CompositeExtract 183 0
-             186:    6(float) Load 11(inF0)
-             187:   132(bool) IsInf 186
+             182:   132(bool) IsInf 181
+             183:    6(float) Load 11(inF0)
+             184:   132(bool) IsNan 183
+             185:    6(float) Load 11(inF0)
+             186:    6(float) Load 12(inF1)
+             187:    6(float) ExtInst 1(GLSL.std.450) 53(Ldexp) 185 186
              188:    6(float) Load 11(inF0)
-             189:   132(bool) IsNan 188
-             190:    6(float) Load 11(inF0)
-             191:    6(float) Load 12(inF1)
-             192:    6(float) ExtInst 1(GLSL.std.450) 53(Ldexp) 190 191
-             193:    6(float) Load 11(inF0)
-             194:    6(float) Load 12(inF1)
-             195:    6(float) Load 13(inF2)
-             196:    6(float) ExtInst 1(GLSL.std.450) 46(FMix) 193 194 195
-             197:    6(float) Load 11(inF0)
-             198:    6(float) ExtInst 1(GLSL.std.450) 28(Log) 197
-             199:    6(float) Load 11(inF0)
-             200:    6(float) ExtInst 1(GLSL.std.450) 30(Log2) 199
-             202:    6(float) FMul 200 201
+             189:    6(float) Load 12(inF1)
+             190:    6(float) Load 13(inF2)
+             191:    6(float) ExtInst 1(GLSL.std.450) 46(FMix) 188 189 190
+             192:    6(float) Load 11(inF0)
+             193:    6(float) ExtInst 1(GLSL.std.450) 28(Log) 192
+             194:    6(float) Load 11(inF0)
+             195:    6(float) ExtInst 1(GLSL.std.450) 30(Log2) 194
+             197:    6(float) FMul 195 196
+             198:    6(float) Load 11(inF0)
+             199:    6(float) ExtInst 1(GLSL.std.450) 30(Log2) 198
+             200:    6(float) Load 11(inF0)
+             201:    6(float) Load 12(inF1)
+             202:    6(float) ExtInst 1(GLSL.std.450) 40(FMax) 200 201
              203:    6(float) Load 11(inF0)
-             204:    6(float) ExtInst 1(GLSL.std.450) 30(Log2) 203
-             205:    6(float) Load 11(inF0)
-             206:    6(float) Load 12(inF1)
-             207:    6(float) ExtInst 1(GLSL.std.450) 40(FMax) 205 206
-             208:    6(float) Load 11(inF0)
-             209:    6(float) Load 12(inF1)
-             210:    6(float) ExtInst 1(GLSL.std.450) 37(FMin) 208 209
-             211:    6(float) Load 11(inF0)
-             212:    6(float) Load 12(inF1)
-             213:    6(float) ExtInst 1(GLSL.std.450) 26(Pow) 211 212
-             214:    6(float) Load 11(inF0)
-             215:    6(float) ExtInst 1(GLSL.std.450) 11(Radians) 214
-             217:    143(int) BitReverse 216
-             218:    6(float) Load 11(inF0)
-             219:    6(float) ExtInst 1(GLSL.std.450) 2(RoundEven) 218
-             220:    6(float) Load 11(inF0)
-             221:    6(float) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 220
-             222:    6(float) Load 11(inF0)
-             225:    6(float) ExtInst 1(GLSL.std.450) 43(FClamp) 222 223 224
-             226:    6(float) Load 11(inF0)
-             227:    6(float) ExtInst 1(GLSL.std.450) 6(FSign) 226
-             228:    6(float) Load 11(inF0)
-             229:    6(float) ExtInst 1(GLSL.std.450) 13(Sin) 228
-             230:    6(float) Load 11(inF0)
-             231:    6(float) ExtInst 1(GLSL.std.450) 13(Sin) 230
-                              Store 12(inF1) 231
-             232:    6(float) Load 11(inF0)
-             233:    6(float) ExtInst 1(GLSL.std.450) 14(Cos) 232
-                              Store 13(inF2) 233
-             234:    6(float) Load 11(inF0)
-             235:    6(float) ExtInst 1(GLSL.std.450) 19(Sinh) 234
-             236:    6(float) Load 11(inF0)
-             237:    6(float) Load 12(inF1)
-             238:    6(float) Load 13(inF2)
-             239:    6(float) ExtInst 1(GLSL.std.450) 49(SmoothStep) 236 237 238
+             204:    6(float) Load 12(inF1)
+             205:    6(float) ExtInst 1(GLSL.std.450) 37(FMin) 203 204
+             206:    6(float) Load 11(inF0)
+             207:    6(float) Load 12(inF1)
+             208:    6(float) ExtInst 1(GLSL.std.450) 26(Pow) 206 207
+             209:    6(float) Load 11(inF0)
+             210:    6(float) ExtInst 1(GLSL.std.450) 11(Radians) 209
+             212:    143(int) BitReverse 211
+             213:    6(float) Load 11(inF0)
+             214:    6(float) ExtInst 1(GLSL.std.450) 2(RoundEven) 213
+             215:    6(float) Load 11(inF0)
+             216:    6(float) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 215
+             217:    6(float) Load 11(inF0)
+             220:    6(float) ExtInst 1(GLSL.std.450) 43(FClamp) 217 218 219
+             221:    6(float) Load 11(inF0)
+             222:    6(float) ExtInst 1(GLSL.std.450) 6(FSign) 221
+             223:    6(float) Load 11(inF0)
+             224:    6(float) ExtInst 1(GLSL.std.450) 13(Sin) 223
+             225:    6(float) Load 11(inF0)
+             226:    6(float) ExtInst 1(GLSL.std.450) 13(Sin) 225
+                              Store 12(inF1) 226
+             227:    6(float) Load 11(inF0)
+             228:    6(float) ExtInst 1(GLSL.std.450) 14(Cos) 227
+                              Store 13(inF2) 228
+             229:    6(float) Load 11(inF0)
+             230:    6(float) ExtInst 1(GLSL.std.450) 19(Sinh) 229
+             231:    6(float) Load 11(inF0)
+             232:    6(float) Load 12(inF1)
+             233:    6(float) Load 13(inF2)
+             234:    6(float) ExtInst 1(GLSL.std.450) 49(SmoothStep) 231 232 233
+             235:    6(float) Load 11(inF0)
+             236:    6(float) ExtInst 1(GLSL.std.450) 31(Sqrt) 235
+             237:    6(float) Load 11(inF0)
+             238:    6(float) Load 12(inF1)
+             239:    6(float) ExtInst 1(GLSL.std.450) 48(Step) 237 238
              240:    6(float) Load 11(inF0)
-             241:    6(float) ExtInst 1(GLSL.std.450) 31(Sqrt) 240
+             241:    6(float) ExtInst 1(GLSL.std.450) 15(Tan) 240
              242:    6(float) Load 11(inF0)
-             243:    6(float) Load 12(inF1)
-             244:    6(float) ExtInst 1(GLSL.std.450) 48(Step) 242 243
-             245:    6(float) Load 11(inF0)
-             246:    6(float) ExtInst 1(GLSL.std.450) 15(Tan) 245
-             247:    6(float) Load 11(inF0)
-             248:    6(float) ExtInst 1(GLSL.std.450) 21(Tanh) 247
-             249:    6(float) Load 11(inF0)
-             250:    6(float) ExtInst 1(GLSL.std.450) 3(Trunc) 249
-                              ReturnValue 223
+             243:    6(float) ExtInst 1(GLSL.std.450) 21(Tanh) 242
+             244:    6(float) Load 11(inF0)
+             245:    6(float) ExtInst 1(GLSL.std.450) 3(Trunc) 244
+                              ReturnValue 218
                               FunctionEnd
 22(VertexShaderFunction1(vf1;vf1;vf1;):    6(float) Function None 18
         19(inF0):      7(ptr) FunctionParameter
         20(inF1):      7(ptr) FunctionParameter
         21(inF2):      7(ptr) FunctionParameter
               23:             Label
-                              ReturnValue 223
+                              ReturnValue 218
                               FunctionEnd
 34(VertexShaderFunction2(vf2;vf2;vf2;vu2;vu2;):   24(fvec2) Function None 28
         29(inF0):     25(ptr) FunctionParameter
@@ -3154,144 +3094,139 @@ Shader version: 450
         32(inU0):     27(ptr) FunctionParameter
         33(inU1):     27(ptr) FunctionParameter
               35:             Label
-             255:   24(fvec2) Load 29(inF0)
-             256:   132(bool) All 255
-             257:   24(fvec2) Load 29(inF0)
-             258:   24(fvec2) ExtInst 1(GLSL.std.450) 4(FAbs) 257
-             259:   24(fvec2) Load 29(inF0)
-             260:   24(fvec2) ExtInst 1(GLSL.std.450) 17(Acos) 259
-             261:   24(fvec2) Load 29(inF0)
-             262:   132(bool) Any 261
+             250:   24(fvec2) Load 29(inF0)
+             251:   132(bool) All 250
+             252:   24(fvec2) Load 29(inF0)
+             253:   24(fvec2) ExtInst 1(GLSL.std.450) 4(FAbs) 252
+             254:   24(fvec2) Load 29(inF0)
+             255:   24(fvec2) ExtInst 1(GLSL.std.450) 17(Acos) 254
+             256:   24(fvec2) Load 29(inF0)
+             257:   132(bool) Any 256
+             258:   24(fvec2) Load 29(inF0)
+             259:   24(fvec2) ExtInst 1(GLSL.std.450) 16(Asin) 258
+             260:   24(fvec2) Load 29(inF0)
+             262:  261(ivec2) Bitcast 260
              263:   24(fvec2) Load 29(inF0)
-             264:   24(fvec2) ExtInst 1(GLSL.std.450) 16(Asin) 263
-             265:   24(fvec2) Load 29(inF0)
-             267:  266(ivec2) Bitcast 265
-             268:   24(fvec2) Load 29(inF0)
-             269:   26(ivec2) Bitcast 268
-             270:   26(ivec2) Load 32(inU0)
-             271:   24(fvec2) Bitcast 270
+             264:   26(ivec2) Bitcast 263
+             265:   26(ivec2) Load 32(inU0)
+             266:   24(fvec2) Bitcast 265
+             267:   24(fvec2) Load 29(inF0)
+             268:   24(fvec2) ExtInst 1(GLSL.std.450) 18(Atan) 267
+             269:   24(fvec2) Load 29(inF0)
+             270:   24(fvec2) Load 30(inF1)
+             271:   24(fvec2) ExtInst 1(GLSL.std.450) 25(Atan2) 269 270
              272:   24(fvec2) Load 29(inF0)
-             273:   24(fvec2) ExtInst 1(GLSL.std.450) 18(Atan) 272
+             273:   24(fvec2) ExtInst 1(GLSL.std.450) 9(Ceil) 272
              274:   24(fvec2) Load 29(inF0)
              275:   24(fvec2) Load 30(inF1)
-             276:   24(fvec2) ExtInst 1(GLSL.std.450) 25(Atan2) 274 275
-             277:   24(fvec2) Load 29(inF0)
-             278:   24(fvec2) ExtInst 1(GLSL.std.450) 9(Ceil) 277
-             279:   24(fvec2) Load 29(inF0)
-             280:   24(fvec2) Load 30(inF1)
-             281:   24(fvec2) Load 31(inF2)
-             282:   24(fvec2) ExtInst 1(GLSL.std.450) 43(FClamp) 279 280 281
-             283:   24(fvec2) Load 29(inF0)
-             284:   24(fvec2) ExtInst 1(GLSL.std.450) 14(Cos) 283
+             276:   24(fvec2) Load 31(inF2)
+             277:   24(fvec2) ExtInst 1(GLSL.std.450) 43(FClamp) 274 275 276
+             278:   24(fvec2) Load 29(inF0)
+             279:   24(fvec2) ExtInst 1(GLSL.std.450) 14(Cos) 278
+             280:   24(fvec2) Load 29(inF0)
+             281:   24(fvec2) ExtInst 1(GLSL.std.450) 20(Cosh) 280
+             284:  261(ivec2) BitCount 283
              285:   24(fvec2) Load 29(inF0)
-             286:   24(fvec2) ExtInst 1(GLSL.std.450) 20(Cosh) 285
-             289:  266(ivec2) BitCount 288
+             286:   24(fvec2) ExtInst 1(GLSL.std.450) 12(Degrees) 285
+             287:   24(fvec2) Load 29(inF0)
+             288:   24(fvec2) Load 30(inF1)
+             289:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 287 288
              290:   24(fvec2) Load 29(inF0)
-             291:   24(fvec2) ExtInst 1(GLSL.std.450) 12(Degrees) 290
-             292:   24(fvec2) Load 29(inF0)
-             293:   24(fvec2) Load 30(inF1)
-             294:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 292 293
+             291:   24(fvec2) Load 30(inF1)
+             292:    6(float) Dot 290 291
+             293:   24(fvec2) Load 29(inF0)
+             294:   24(fvec2) ExtInst 1(GLSL.std.450) 27(Exp) 293
              295:   24(fvec2) Load 29(inF0)
-             296:   24(fvec2) Load 30(inF1)
-             297:    6(float) Dot 295 296
-             298:   24(fvec2) Load 29(inF0)
-             299:   24(fvec2) ExtInst 1(GLSL.std.450) 27(Exp) 298
-             300:   24(fvec2) Load 29(inF0)
-             301:   24(fvec2) ExtInst 1(GLSL.std.450) 29(Exp2) 300
-             302:   24(fvec2) Load 29(inF0)
-             303:   24(fvec2) Load 30(inF1)
-             304:   24(fvec2) Load 31(inF2)
-             305:   24(fvec2) ExtInst 1(GLSL.std.450) 70(FaceForward) 302 303 304
-             306:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
-             307:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
+             296:   24(fvec2) ExtInst 1(GLSL.std.450) 29(Exp2) 295
+             297:   24(fvec2) Load 29(inF0)
+             298:   24(fvec2) Load 30(inF1)
+             299:   24(fvec2) Load 31(inF2)
+             300:   24(fvec2) ExtInst 1(GLSL.std.450) 70(FaceForward) 297 298 299
+             301:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
+             302:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
+             303:   24(fvec2) Load 29(inF0)
+             304:   24(fvec2) ExtInst 1(GLSL.std.450) 8(Floor) 303
+             305:   24(fvec2) Load 29(inF0)
+             306:   24(fvec2) Load 30(inF1)
+             307:   24(fvec2) FMod 305 306
              308:   24(fvec2) Load 29(inF0)
-             309:   24(fvec2) ExtInst 1(GLSL.std.450) 8(Floor) 308
+             309:   24(fvec2) ExtInst 1(GLSL.std.450) 10(Fract) 308
              310:   24(fvec2) Load 29(inF0)
-             311:   24(fvec2) Load 30(inF1)
-             312:   24(fvec2) FMod 310 311
+             312:  311(bvec2) IsInf 310
              313:   24(fvec2) Load 29(inF0)
-             314:   24(fvec2) ExtInst 1(GLSL.std.450) 10(Fract) 313
+             314:  311(bvec2) IsNan 313
              315:   24(fvec2) Load 29(inF0)
-             317:316(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 315
-             318:  266(ivec2) CompositeExtract 317 1
-                              Store 30(inF1) 318
-             319:   24(fvec2) CompositeExtract 317 0
-             320:   24(fvec2) Load 29(inF0)
-             322:  321(bvec2) IsInf 320
-             323:   24(fvec2) Load 29(inF0)
-             324:  321(bvec2) IsNan 323
-             325:   24(fvec2) Load 29(inF0)
-             326:   24(fvec2) Load 30(inF1)
-             327:   24(fvec2) ExtInst 1(GLSL.std.450) 53(Ldexp) 325 326
-             328:   24(fvec2) Load 29(inF0)
-             329:   24(fvec2) Load 30(inF1)
-             330:   24(fvec2) Load 31(inF2)
-             331:   24(fvec2) ExtInst 1(GLSL.std.450) 46(FMix) 328 329 330
-             332:   24(fvec2) Load 29(inF0)
-             333:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 332
+             316:   24(fvec2) Load 30(inF1)
+             317:   24(fvec2) ExtInst 1(GLSL.std.450) 53(Ldexp) 315 316
+             318:   24(fvec2) Load 29(inF0)
+             319:   24(fvec2) Load 30(inF1)
+             320:   24(fvec2) Load 31(inF2)
+             321:   24(fvec2) ExtInst 1(GLSL.std.450) 46(FMix) 318 319 320
+             322:   24(fvec2) Load 29(inF0)
+             323:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 322
+             324:   24(fvec2) Load 29(inF0)
+             325:   24(fvec2) ExtInst 1(GLSL.std.450) 28(Log) 324
+             326:   24(fvec2) Load 29(inF0)
+             327:   24(fvec2) ExtInst 1(GLSL.std.450) 30(Log2) 326
+             328:   24(fvec2) VectorTimesScalar 327 196
+             329:   24(fvec2) Load 29(inF0)
+             330:   24(fvec2) ExtInst 1(GLSL.std.450) 30(Log2) 329
+             331:   24(fvec2) Load 29(inF0)
+             332:   24(fvec2) Load 30(inF1)
+             333:   24(fvec2) ExtInst 1(GLSL.std.450) 40(FMax) 331 332
              334:   24(fvec2) Load 29(inF0)
-             335:   24(fvec2) ExtInst 1(GLSL.std.450) 28(Log) 334
-             336:   24(fvec2) Load 29(inF0)
-             337:   24(fvec2) ExtInst 1(GLSL.std.450) 30(Log2) 336
-             338:   24(fvec2) VectorTimesScalar 337 201
+             335:   24(fvec2) Load 30(inF1)
+             336:   24(fvec2) ExtInst 1(GLSL.std.450) 37(FMin) 334 335
+             337:   24(fvec2) Load 29(inF0)
+             338:   24(fvec2) ExtInst 1(GLSL.std.450) 69(Normalize) 337
              339:   24(fvec2) Load 29(inF0)
-             340:   24(fvec2) ExtInst 1(GLSL.std.450) 30(Log2) 339
-             341:   24(fvec2) Load 29(inF0)
-             342:   24(fvec2) Load 30(inF1)
-             343:   24(fvec2) ExtInst 1(GLSL.std.450) 40(FMax) 341 342
+             340:   24(fvec2) Load 30(inF1)
+             341:   24(fvec2) ExtInst 1(GLSL.std.450) 26(Pow) 339 340
+             342:   24(fvec2) Load 29(inF0)
+             343:   24(fvec2) ExtInst 1(GLSL.std.450) 11(Radians) 342
              344:   24(fvec2) Load 29(inF0)
              345:   24(fvec2) Load 30(inF1)
-             346:   24(fvec2) ExtInst 1(GLSL.std.450) 37(FMin) 344 345
+             346:   24(fvec2) ExtInst 1(GLSL.std.450) 71(Reflect) 344 345
              347:   24(fvec2) Load 29(inF0)
-             348:   24(fvec2) ExtInst 1(GLSL.std.450) 69(Normalize) 347
-             349:   24(fvec2) Load 29(inF0)
-             350:   24(fvec2) Load 30(inF1)
-             351:   24(fvec2) ExtInst 1(GLSL.std.450) 26(Pow) 349 350
-             352:   24(fvec2) Load 29(inF0)
-             353:   24(fvec2) ExtInst 1(GLSL.std.450) 11(Radians) 352
+             348:   24(fvec2) Load 30(inF1)
+             350:   24(fvec2) ExtInst 1(GLSL.std.450) 72(Refract) 347 348 349
+             353:  261(ivec2) BitReverse 352
              354:   24(fvec2) Load 29(inF0)
-             355:   24(fvec2) Load 30(inF1)
-             356:   24(fvec2) ExtInst 1(GLSL.std.450) 71(Reflect) 354 355
-             357:   24(fvec2) Load 29(inF0)
-             358:   24(fvec2) Load 30(inF1)
-             360:   24(fvec2) ExtInst 1(GLSL.std.450) 72(Refract) 357 358 359
-             363:  266(ivec2) BitReverse 362
+             355:   24(fvec2) ExtInst 1(GLSL.std.450) 2(RoundEven) 354
+             356:   24(fvec2) Load 29(inF0)
+             357:   24(fvec2) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 356
+             358:   24(fvec2) Load 29(inF0)
+             359:   24(fvec2) CompositeConstruct 218 218
+             360:   24(fvec2) CompositeConstruct 219 219
+             361:   24(fvec2) ExtInst 1(GLSL.std.450) 43(FClamp) 358 359 360
+             362:   24(fvec2) Load 29(inF0)
+             363:   24(fvec2) ExtInst 1(GLSL.std.450) 6(FSign) 362
              364:   24(fvec2) Load 29(inF0)
-             365:   24(fvec2) ExtInst 1(GLSL.std.450) 2(RoundEven) 364
+             365:   24(fvec2) ExtInst 1(GLSL.std.450) 13(Sin) 364
              366:   24(fvec2) Load 29(inF0)
-             367:   24(fvec2) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 366
+             367:   24(fvec2) ExtInst 1(GLSL.std.450) 13(Sin) 366
+                              Store 30(inF1) 367
              368:   24(fvec2) Load 29(inF0)
-             369:   24(fvec2) CompositeConstruct 223 223
-             370:   24(fvec2) CompositeConstruct 224 224
-             371:   24(fvec2) ExtInst 1(GLSL.std.450) 43(FClamp) 368 369 370
+             369:   24(fvec2) ExtInst 1(GLSL.std.450) 14(Cos) 368
+                              Store 31(inF2) 369
+             370:   24(fvec2) Load 29(inF0)
+             371:   24(fvec2) ExtInst 1(GLSL.std.450) 19(Sinh) 370
              372:   24(fvec2) Load 29(inF0)
-             373:   24(fvec2) ExtInst 1(GLSL.std.450) 6(FSign) 372
-             374:   24(fvec2) Load 29(inF0)
-             375:   24(fvec2) ExtInst 1(GLSL.std.450) 13(Sin) 374
+             373:   24(fvec2) Load 30(inF1)
+             374:   24(fvec2) Load 31(inF2)
+             375:   24(fvec2) ExtInst 1(GLSL.std.450) 49(SmoothStep) 372 373 374
              376:   24(fvec2) Load 29(inF0)
-             377:   24(fvec2) ExtInst 1(GLSL.std.450) 13(Sin) 376
-                              Store 30(inF1) 377
+             377:   24(fvec2) ExtInst 1(GLSL.std.450) 31(Sqrt) 376
              378:   24(fvec2) Load 29(inF0)
-             379:   24(fvec2) ExtInst 1(GLSL.std.450) 14(Cos) 378
-                              Store 31(inF2) 379
-             380:   24(fvec2) Load 29(inF0)
-             381:   24(fvec2) ExtInst 1(GLSL.std.450) 19(Sinh) 380
-             382:   24(fvec2) Load 29(inF0)
-             383:   24(fvec2) Load 30(inF1)
-             384:   24(fvec2) Load 31(inF2)
-             385:   24(fvec2) ExtInst 1(GLSL.std.450) 49(SmoothStep) 382 383 384
-             386:   24(fvec2) Load 29(inF0)
-             387:   24(fvec2) ExtInst 1(GLSL.std.450) 31(Sqrt) 386
-             388:   24(fvec2) Load 29(inF0)
-             389:   24(fvec2) Load 30(inF1)
-             390:   24(fvec2) ExtInst 1(GLSL.std.450) 48(Step) 388 389
-             391:   24(fvec2) Load 29(inF0)
-             392:   24(fvec2) ExtInst 1(GLSL.std.450) 15(Tan) 391
-             393:   24(fvec2) Load 29(inF0)
-             394:   24(fvec2) ExtInst 1(GLSL.std.450) 21(Tanh) 393
-             395:   24(fvec2) Load 29(inF0)
-             396:   24(fvec2) ExtInst 1(GLSL.std.450) 3(Trunc) 395
-                              ReturnValue 397
+             379:   24(fvec2) Load 30(inF1)
+             380:   24(fvec2) ExtInst 1(GLSL.std.450) 48(Step) 378 379
+             381:   24(fvec2) Load 29(inF0)
+             382:   24(fvec2) ExtInst 1(GLSL.std.450) 15(Tan) 381
+             383:   24(fvec2) Load 29(inF0)
+             384:   24(fvec2) ExtInst 1(GLSL.std.450) 21(Tanh) 383
+             385:   24(fvec2) Load 29(inF0)
+             386:   24(fvec2) ExtInst 1(GLSL.std.450) 3(Trunc) 385
+                              ReturnValue 387
                               FunctionEnd
 46(VertexShaderFunction3(vf3;vf3;vf3;vu3;vu3;):   36(fvec3) Function None 40
         41(inF0):     37(ptr) FunctionParameter
@@ -3300,147 +3235,142 @@ Shader version: 450
         44(inU0):     39(ptr) FunctionParameter
         45(inU1):     39(ptr) FunctionParameter
               47:             Label
+             390:   36(fvec3) Load 41(inF0)
+             391:   132(bool) All 390
+             392:   36(fvec3) Load 41(inF0)
+             393:   36(fvec3) ExtInst 1(GLSL.std.450) 4(FAbs) 392
+             394:   36(fvec3) Load 41(inF0)
+             395:   36(fvec3) ExtInst 1(GLSL.std.450) 17(Acos) 394
+             396:   36(fvec3) Load 41(inF0)
+             397:   132(bool) Any 396
+             398:   36(fvec3) Load 41(inF0)
+             399:   36(fvec3) ExtInst 1(GLSL.std.450) 16(Asin) 398
              400:   36(fvec3) Load 41(inF0)
-             401:   132(bool) All 400
-             402:   36(fvec3) Load 41(inF0)
-             403:   36(fvec3) ExtInst 1(GLSL.std.450) 4(FAbs) 402
-             404:   36(fvec3) Load 41(inF0)
-             405:   36(fvec3) ExtInst 1(GLSL.std.450) 17(Acos) 404
-             406:   36(fvec3) Load 41(inF0)
-             407:   132(bool) Any 406
-             408:   36(fvec3) Load 41(inF0)
-             409:   36(fvec3) ExtInst 1(GLSL.std.450) 16(Asin) 408
-             410:   36(fvec3) Load 41(inF0)
-             412:  411(ivec3) Bitcast 410
-             413:   36(fvec3) Load 41(inF0)
-             414:   38(ivec3) Bitcast 413
-             415:   38(ivec3) Load 44(inU0)
-             416:   36(fvec3) Bitcast 415
-             417:   36(fvec3) Load 41(inF0)
-             418:   36(fvec3) ExtInst 1(GLSL.std.450) 18(Atan) 417
-             419:   36(fvec3) Load 41(inF0)
-             420:   36(fvec3) Load 42(inF1)
-             421:   36(fvec3) ExtInst 1(GLSL.std.450) 25(Atan2) 419 420
-             422:   36(fvec3) Load 41(inF0)
-             423:   36(fvec3) ExtInst 1(GLSL.std.450) 9(Ceil) 422
-             424:   36(fvec3) Load 41(inF0)
-             425:   36(fvec3) Load 42(inF1)
-             426:   36(fvec3) Load 43(inF2)
-             427:   36(fvec3) ExtInst 1(GLSL.std.450) 43(FClamp) 424 425 426
+             402:  401(ivec3) Bitcast 400
+             403:   36(fvec3) Load 41(inF0)
+             404:   38(ivec3) Bitcast 403
+             405:   38(ivec3) Load 44(inU0)
+             406:   36(fvec3) Bitcast 405
+             407:   36(fvec3) Load 41(inF0)
+             408:   36(fvec3) ExtInst 1(GLSL.std.450) 18(Atan) 407
+             409:   36(fvec3) Load 41(inF0)
+             410:   36(fvec3) Load 42(inF1)
+             411:   36(fvec3) ExtInst 1(GLSL.std.450) 25(Atan2) 409 410
+             412:   36(fvec3) Load 41(inF0)
+             413:   36(fvec3) ExtInst 1(GLSL.std.450) 9(Ceil) 412
+             414:   36(fvec3) Load 41(inF0)
+             415:   36(fvec3) Load 42(inF1)
+             416:   36(fvec3) Load 43(inF2)
+             417:   36(fvec3) ExtInst 1(GLSL.std.450) 43(FClamp) 414 415 416
+             418:   36(fvec3) Load 41(inF0)
+             419:   36(fvec3) ExtInst 1(GLSL.std.450) 14(Cos) 418
+             420:   36(fvec3) Load 41(inF0)
+             421:   36(fvec3) ExtInst 1(GLSL.std.450) 20(Cosh) 420
+             424:  401(ivec3) BitCount 423
+             425:   36(fvec3) Load 41(inF0)
+             426:   36(fvec3) Load 42(inF1)
+             427:   36(fvec3) ExtInst 1(GLSL.std.450) 68(Cross) 425 426
              428:   36(fvec3) Load 41(inF0)
-             429:   36(fvec3) ExtInst 1(GLSL.std.450) 14(Cos) 428
+             429:   36(fvec3) ExtInst 1(GLSL.std.450) 12(Degrees) 428
              430:   36(fvec3) Load 41(inF0)
-             431:   36(fvec3) ExtInst 1(GLSL.std.450) 20(Cosh) 430
-             434:  411(ivec3) BitCount 433
-             435:   36(fvec3) Load 41(inF0)
-             436:   36(fvec3) Load 42(inF1)
-             437:   36(fvec3) ExtInst 1(GLSL.std.450) 68(Cross) 435 436
+             431:   36(fvec3) Load 42(inF1)
+             432:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 430 431
+             433:   36(fvec3) Load 41(inF0)
+             434:   36(fvec3) Load 42(inF1)
+             435:    6(float) Dot 433 434
+             436:   36(fvec3) Load 41(inF0)
+             437:   36(fvec3) ExtInst 1(GLSL.std.450) 27(Exp) 436
              438:   36(fvec3) Load 41(inF0)
-             439:   36(fvec3) ExtInst 1(GLSL.std.450) 12(Degrees) 438
+             439:   36(fvec3) ExtInst 1(GLSL.std.450) 29(Exp2) 438
              440:   36(fvec3) Load 41(inF0)
              441:   36(fvec3) Load 42(inF1)
-             442:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 440 441
-             443:   36(fvec3) Load 41(inF0)
-             444:   36(fvec3) Load 42(inF1)
-             445:    6(float) Dot 443 444
+             442:   36(fvec3) Load 43(inF2)
+             443:   36(fvec3) ExtInst 1(GLSL.std.450) 70(FaceForward) 440 441 442
+             444:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
+             445:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
              446:   36(fvec3) Load 41(inF0)
-             447:   36(fvec3) ExtInst 1(GLSL.std.450) 27(Exp) 446
+             447:   36(fvec3) ExtInst 1(GLSL.std.450) 8(Floor) 446
              448:   36(fvec3) Load 41(inF0)
-             449:   36(fvec3) ExtInst 1(GLSL.std.450) 29(Exp2) 448
-             450:   36(fvec3) Load 41(inF0)
-             451:   36(fvec3) Load 42(inF1)
-             452:   36(fvec3) Load 43(inF2)
-             453:   36(fvec3) ExtInst 1(GLSL.std.450) 70(FaceForward) 450 451 452
-             454:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
-             455:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
+             449:   36(fvec3) Load 42(inF1)
+             450:   36(fvec3) FMod 448 449
+             451:   36(fvec3) Load 41(inF0)
+             452:   36(fvec3) ExtInst 1(GLSL.std.450) 10(Fract) 451
+             453:   36(fvec3) Load 41(inF0)
+             455:  454(bvec3) IsInf 453
              456:   36(fvec3) Load 41(inF0)
-             457:   36(fvec3) ExtInst 1(GLSL.std.450) 8(Floor) 456
+             457:  454(bvec3) IsNan 456
              458:   36(fvec3) Load 41(inF0)
              459:   36(fvec3) Load 42(inF1)
-             460:   36(fvec3) FMod 458 459
+             460:   36(fvec3) ExtInst 1(GLSL.std.450) 53(Ldexp) 458 459
              461:   36(fvec3) Load 41(inF0)
-             462:   36(fvec3) ExtInst 1(GLSL.std.450) 10(Fract) 461
-             463:   36(fvec3) Load 41(inF0)
-             465:464(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 463
-             466:  411(ivec3) CompositeExtract 465 1
-                              Store 42(inF1) 466
-             467:   36(fvec3) CompositeExtract 465 0
-             468:   36(fvec3) Load 41(inF0)
-             470:  469(bvec3) IsInf 468
-             471:   36(fvec3) Load 41(inF0)
-             472:  469(bvec3) IsNan 471
-             473:   36(fvec3) Load 41(inF0)
-             474:   36(fvec3) Load 42(inF1)
-             475:   36(fvec3) ExtInst 1(GLSL.std.450) 53(Ldexp) 473 474
-             476:   36(fvec3) Load 41(inF0)
-             477:   36(fvec3) Load 42(inF1)
-             478:   36(fvec3) Load 43(inF2)
-             479:   36(fvec3) ExtInst 1(GLSL.std.450) 46(FMix) 476 477 478
+             462:   36(fvec3) Load 42(inF1)
+             463:   36(fvec3) Load 43(inF2)
+             464:   36(fvec3) ExtInst 1(GLSL.std.450) 46(FMix) 461 462 463
+             465:   36(fvec3) Load 41(inF0)
+             466:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 465
+             467:   36(fvec3) Load 41(inF0)
+             468:   36(fvec3) ExtInst 1(GLSL.std.450) 28(Log) 467
+             469:   36(fvec3) Load 41(inF0)
+             470:   36(fvec3) ExtInst 1(GLSL.std.450) 30(Log2) 469
+             471:   36(fvec3) VectorTimesScalar 470 196
+             472:   36(fvec3) Load 41(inF0)
+             473:   36(fvec3) ExtInst 1(GLSL.std.450) 30(Log2) 472
+             474:   36(fvec3) Load 41(inF0)
+             475:   36(fvec3) Load 42(inF1)
+             476:   36(fvec3) ExtInst 1(GLSL.std.450) 40(FMax) 474 475
+             477:   36(fvec3) Load 41(inF0)
+             478:   36(fvec3) Load 42(inF1)
+             479:   36(fvec3) ExtInst 1(GLSL.std.450) 37(FMin) 477 478
              480:   36(fvec3) Load 41(inF0)
-             481:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 480
+             481:   36(fvec3) ExtInst 1(GLSL.std.450) 69(Normalize) 480
              482:   36(fvec3) Load 41(inF0)
-             483:   36(fvec3) ExtInst 1(GLSL.std.450) 28(Log) 482
-             484:   36(fvec3) Load 41(inF0)
-             485:   36(fvec3) ExtInst 1(GLSL.std.450) 30(Log2) 484
-             486:   36(fvec3) VectorTimesScalar 485 201
+             483:   36(fvec3) Load 42(inF1)
+             484:   36(fvec3) ExtInst 1(GLSL.std.450) 26(Pow) 482 483
+             485:   36(fvec3) Load 41(inF0)
+             486:   36(fvec3) ExtInst 1(GLSL.std.450) 11(Radians) 485
              487:   36(fvec3) Load 41(inF0)
-             488:   36(fvec3) ExtInst 1(GLSL.std.450) 30(Log2) 487
-             489:   36(fvec3) Load 41(inF0)
-             490:   36(fvec3) Load 42(inF1)
-             491:   36(fvec3) ExtInst 1(GLSL.std.450) 40(FMax) 489 490
-             492:   36(fvec3) Load 41(inF0)
-             493:   36(fvec3) Load 42(inF1)
-             494:   36(fvec3) ExtInst 1(GLSL.std.450) 37(FMin) 492 493
+             488:   36(fvec3) Load 42(inF1)
+             489:   36(fvec3) ExtInst 1(GLSL.std.450) 71(Reflect) 487 488
+             490:   36(fvec3) Load 41(inF0)
+             491:   36(fvec3) Load 42(inF1)
+             492:   36(fvec3) ExtInst 1(GLSL.std.450) 72(Refract) 490 491 349
+             494:  401(ivec3) BitReverse 493
              495:   36(fvec3) Load 41(inF0)
-             496:   36(fvec3) ExtInst 1(GLSL.std.450) 69(Normalize) 495
+             496:   36(fvec3) ExtInst 1(GLSL.std.450) 2(RoundEven) 495
              497:   36(fvec3) Load 41(inF0)
-             498:   36(fvec3) Load 42(inF1)
-             499:   36(fvec3) ExtInst 1(GLSL.std.450) 26(Pow) 497 498
-             500:   36(fvec3) Load 41(inF0)
-             501:   36(fvec3) ExtInst 1(GLSL.std.450) 11(Radians) 500
-             502:   36(fvec3) Load 41(inF0)
-             503:   36(fvec3) Load 42(inF1)
-             504:   36(fvec3) ExtInst 1(GLSL.std.450) 71(Reflect) 502 503
+             498:   36(fvec3) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 497
+             499:   36(fvec3) Load 41(inF0)
+             500:   36(fvec3) CompositeConstruct 218 218 218
+             501:   36(fvec3) CompositeConstruct 219 219 219
+             502:   36(fvec3) ExtInst 1(GLSL.std.450) 43(FClamp) 499 500 501
+             503:   36(fvec3) Load 41(inF0)
+             504:   36(fvec3) ExtInst 1(GLSL.std.450) 6(FSign) 503
              505:   36(fvec3) Load 41(inF0)
-             506:   36(fvec3) Load 42(inF1)
-             507:   36(fvec3) ExtInst 1(GLSL.std.450) 72(Refract) 505 506 359
-             509:  411(ivec3) BitReverse 508
-             510:   36(fvec3) Load 41(inF0)
-             511:   36(fvec3) ExtInst 1(GLSL.std.450) 2(RoundEven) 510
-             512:   36(fvec3) Load 41(inF0)
-             513:   36(fvec3) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 512
-             514:   36(fvec3) Load 41(inF0)
-             515:   36(fvec3) CompositeConstruct 223 223 223
-             516:   36(fvec3) CompositeConstruct 224 224 224
-             517:   36(fvec3) ExtInst 1(GLSL.std.450) 43(FClamp) 514 515 516
-             518:   36(fvec3) Load 41(inF0)
-             519:   36(fvec3) ExtInst 1(GLSL.std.450) 6(FSign) 518
-             520:   36(fvec3) Load 41(inF0)
-             521:   36(fvec3) ExtInst 1(GLSL.std.450) 13(Sin) 520
+             506:   36(fvec3) ExtInst 1(GLSL.std.450) 13(Sin) 505
+             507:   36(fvec3) Load 41(inF0)
+             508:   36(fvec3) ExtInst 1(GLSL.std.450) 13(Sin) 507
+                              Store 42(inF1) 508
+             509:   36(fvec3) Load 41(inF0)
+             510:   36(fvec3) ExtInst 1(GLSL.std.450) 14(Cos) 509
+                              Store 43(inF2) 510
+             511:   36(fvec3) Load 41(inF0)
+             512:   36(fvec3) ExtInst 1(GLSL.std.450) 19(Sinh) 511
+             513:   36(fvec3) Load 41(inF0)
+             514:   36(fvec3) Load 42(inF1)
+             515:   36(fvec3) Load 43(inF2)
+             516:   36(fvec3) ExtInst 1(GLSL.std.450) 49(SmoothStep) 513 514 515
+             517:   36(fvec3) Load 41(inF0)
+             518:   36(fvec3) ExtInst 1(GLSL.std.450) 31(Sqrt) 517
+             519:   36(fvec3) Load 41(inF0)
+             520:   36(fvec3) Load 42(inF1)
+             521:   36(fvec3) ExtInst 1(GLSL.std.450) 48(Step) 519 520
              522:   36(fvec3) Load 41(inF0)
-             523:   36(fvec3) ExtInst 1(GLSL.std.450) 13(Sin) 522
-                              Store 42(inF1) 523
+             523:   36(fvec3) ExtInst 1(GLSL.std.450) 15(Tan) 522
              524:   36(fvec3) Load 41(inF0)
-             525:   36(fvec3) ExtInst 1(GLSL.std.450) 14(Cos) 524
-                              Store 43(inF2) 525
+             525:   36(fvec3) ExtInst 1(GLSL.std.450) 21(Tanh) 524
              526:   36(fvec3) Load 41(inF0)
-             527:   36(fvec3) ExtInst 1(GLSL.std.450) 19(Sinh) 526
-             528:   36(fvec3) Load 41(inF0)
-             529:   36(fvec3) Load 42(inF1)
-             530:   36(fvec3) Load 43(inF2)
-             531:   36(fvec3) ExtInst 1(GLSL.std.450) 49(SmoothStep) 528 529 530
-             532:   36(fvec3) Load 41(inF0)
-             533:   36(fvec3) ExtInst 1(GLSL.std.450) 31(Sqrt) 532
-             534:   36(fvec3) Load 41(inF0)
-             535:   36(fvec3) Load 42(inF1)
-             536:   36(fvec3) ExtInst 1(GLSL.std.450) 48(Step) 534 535
-             537:   36(fvec3) Load 41(inF0)
-             538:   36(fvec3) ExtInst 1(GLSL.std.450) 15(Tan) 537
-             539:   36(fvec3) Load 41(inF0)
-             540:   36(fvec3) ExtInst 1(GLSL.std.450) 21(Tanh) 539
-             541:   36(fvec3) Load 41(inF0)
-             542:   36(fvec3) ExtInst 1(GLSL.std.450) 3(Trunc) 541
-                              ReturnValue 544
+             527:   36(fvec3) ExtInst 1(GLSL.std.450) 3(Trunc) 526
+                              ReturnValue 529
                               FunctionEnd
 58(VertexShaderFunction4(vf4;vf4;vf4;vu4;vu4;):   48(fvec4) Function None 52
         53(inF0):     49(ptr) FunctionParameter
@@ -3449,529 +3379,509 @@ Shader version: 450
         56(inU0):     51(ptr) FunctionParameter
         57(inU1):     51(ptr) FunctionParameter
               59:             Label
-             547:   48(fvec4) Load 53(inF0)
-             548:   132(bool) All 547
+             532:   48(fvec4) Load 53(inF0)
+             533:   132(bool) All 532
+             534:   48(fvec4) Load 53(inF0)
+             535:   48(fvec4) ExtInst 1(GLSL.std.450) 4(FAbs) 534
+             536:   48(fvec4) Load 53(inF0)
+             537:   48(fvec4) ExtInst 1(GLSL.std.450) 17(Acos) 536
+             538:   48(fvec4) Load 53(inF0)
+             539:   132(bool) Any 538
+             540:   48(fvec4) Load 53(inF0)
+             541:   48(fvec4) ExtInst 1(GLSL.std.450) 16(Asin) 540
+             542:   48(fvec4) Load 53(inF0)
+             544:  543(ivec4) Bitcast 542
+             545:   48(fvec4) Load 53(inF0)
+             546:   50(ivec4) Bitcast 545
+             547:   50(ivec4) Load 56(inU0)
+             548:   48(fvec4) Bitcast 547
              549:   48(fvec4) Load 53(inF0)
-             550:   48(fvec4) ExtInst 1(GLSL.std.450) 4(FAbs) 549
+             550:   48(fvec4) ExtInst 1(GLSL.std.450) 18(Atan) 549
              551:   48(fvec4) Load 53(inF0)
-             552:   48(fvec4) ExtInst 1(GLSL.std.450) 17(Acos) 551
-             553:   48(fvec4) Load 53(inF0)
-             554:   132(bool) Any 553
-             555:   48(fvec4) Load 53(inF0)
-             556:   48(fvec4) ExtInst 1(GLSL.std.450) 16(Asin) 555
-             557:   48(fvec4) Load 53(inF0)
-             559:  558(ivec4) Bitcast 557
+             552:   48(fvec4) Load 54(inF1)
+             553:   48(fvec4) ExtInst 1(GLSL.std.450) 25(Atan2) 551 552
+             554:   48(fvec4) Load 53(inF0)
+             555:   48(fvec4) ExtInst 1(GLSL.std.450) 9(Ceil) 554
+             556:   48(fvec4) Load 53(inF0)
+             557:   48(fvec4) Load 54(inF1)
+             558:   48(fvec4) Load 55(inF2)
+             559:   48(fvec4) ExtInst 1(GLSL.std.450) 43(FClamp) 556 557 558
              560:   48(fvec4) Load 53(inF0)
-             561:   50(ivec4) Bitcast 560
-             562:   50(ivec4) Load 56(inU0)
-             563:   48(fvec4) Bitcast 562
-             564:   48(fvec4) Load 53(inF0)
-             565:   48(fvec4) ExtInst 1(GLSL.std.450) 18(Atan) 564
+             561:   48(fvec4) ExtInst 1(GLSL.std.450) 14(Cos) 560
+             562:   48(fvec4) Load 53(inF0)
+             563:   48(fvec4) ExtInst 1(GLSL.std.450) 20(Cosh) 562
+             565:  543(ivec4) BitCount 564
              566:   48(fvec4) Load 53(inF0)
-             567:   48(fvec4) Load 54(inF1)
-             568:   48(fvec4) ExtInst 1(GLSL.std.450) 25(Atan2) 566 567
-             569:   48(fvec4) Load 53(inF0)
-             570:   48(fvec4) ExtInst 1(GLSL.std.450) 9(Ceil) 569
+             567:   48(fvec4) ExtInst 1(GLSL.std.450) 12(Degrees) 566
+             568:   48(fvec4) Load 53(inF0)
+             569:   48(fvec4) Load 54(inF1)
+             570:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 568 569
              571:   48(fvec4) Load 53(inF0)
              572:   48(fvec4) Load 54(inF1)
-             573:   48(fvec4) Load 55(inF2)
-             574:   48(fvec4) ExtInst 1(GLSL.std.450) 43(FClamp) 571 572 573
-             575:   48(fvec4) Load 53(inF0)
-             576:   48(fvec4) ExtInst 1(GLSL.std.450) 14(Cos) 575
-             577:   48(fvec4) Load 53(inF0)
-             578:   48(fvec4) ExtInst 1(GLSL.std.450) 20(Cosh) 577
-             580:  558(ivec4) BitCount 579
-             581:   48(fvec4) Load 53(inF0)
-             582:   48(fvec4) ExtInst 1(GLSL.std.450) 12(Degrees) 581
-             583:   48(fvec4) Load 53(inF0)
-             584:   48(fvec4) Load 54(inF1)
-             585:    6(float) ExtInst 1(GLSL.std.450) 67(Distance) 583 584
-             586:   48(fvec4) Load 53(inF0)
-             587:   48(fvec4) Load 54(inF1)
-             588:    6(float) Dot 586 587
-             590:      7(ptr) AccessChain 53(inF0) 589
-             591:    6(float) Load 590
-             592:      7(ptr) AccessChain 54(inF1) 589
-             593:    6(float) Load 592
-             594:    6(float) FMul 591 593
-             596:      7(ptr) AccessChain 53(inF0) 595
-             597:    6(float) Load 596
-             599:      7(ptr) AccessChain 54(inF1) 598
-             600:    6(float) Load 599
-             601:   48(fvec4) CompositeConstruct 224 594 597 600
+             573:    6(float) Dot 571 572
+             575:      7(ptr) AccessChain 53(inF0) 574
+             576:    6(float) Load 575
+             577:      7(ptr) AccessChain 54(inF1) 574
+             578:    6(float) Load 577
+             579:    6(float) FMul 576 578
+             581:      7(ptr) AccessChain 53(inF0) 580
+             582:    6(float) Load 581
+             584:      7(ptr) AccessChain 54(inF1) 583
+             585:    6(float) Load 584
+             586:   48(fvec4) CompositeConstruct 219 579 582 585
+             587:   48(fvec4) Load 53(inF0)
+             588:   48(fvec4) ExtInst 1(GLSL.std.450) 27(Exp) 587
+             589:   48(fvec4) Load 53(inF0)
+             590:   48(fvec4) ExtInst 1(GLSL.std.450) 29(Exp2) 589
+             591:   48(fvec4) Load 53(inF0)
+             592:   48(fvec4) Load 54(inF1)
+             593:   48(fvec4) Load 55(inF2)
+             594:   48(fvec4) ExtInst 1(GLSL.std.450) 70(FaceForward) 591 592 593
+             595:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
+             596:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
+             597:   48(fvec4) Load 53(inF0)
+             598:   48(fvec4) ExtInst 1(GLSL.std.450) 8(Floor) 597
+             599:   48(fvec4) Load 53(inF0)
+             600:   48(fvec4) Load 54(inF1)
+             601:   48(fvec4) FMod 599 600
              602:   48(fvec4) Load 53(inF0)
-             603:   48(fvec4) ExtInst 1(GLSL.std.450) 27(Exp) 602
+             603:   48(fvec4) ExtInst 1(GLSL.std.450) 10(Fract) 602
              604:   48(fvec4) Load 53(inF0)
-             605:   48(fvec4) ExtInst 1(GLSL.std.450) 29(Exp2) 604
-             606:   48(fvec4) Load 53(inF0)
-             607:   48(fvec4) Load 54(inF1)
-             608:   48(fvec4) Load 55(inF2)
-             609:   48(fvec4) ExtInst 1(GLSL.std.450) 70(FaceForward) 606 607 608
-             610:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
-             611:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
+             606:  605(bvec4) IsInf 604
+             607:   48(fvec4) Load 53(inF0)
+             608:  605(bvec4) IsNan 607
+             609:   48(fvec4) Load 53(inF0)
+             610:   48(fvec4) Load 54(inF1)
+             611:   48(fvec4) ExtInst 1(GLSL.std.450) 53(Ldexp) 609 610
              612:   48(fvec4) Load 53(inF0)
-             613:   48(fvec4) ExtInst 1(GLSL.std.450) 8(Floor) 612
-             614:   48(fvec4) Load 53(inF0)
-             615:   48(fvec4) Load 54(inF1)
-             616:   48(fvec4) FMod 614 615
-             617:   48(fvec4) Load 53(inF0)
-             618:   48(fvec4) ExtInst 1(GLSL.std.450) 10(Fract) 617
-             619:   48(fvec4) Load 53(inF0)
-             621:620(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 619
-             622:  558(ivec4) CompositeExtract 621 1
-                              Store 54(inF1) 622
-             623:   48(fvec4) CompositeExtract 621 0
-             624:   48(fvec4) Load 53(inF0)
-             626:  625(bvec4) IsInf 624
-             627:   48(fvec4) Load 53(inF0)
-             628:  625(bvec4) IsNan 627
-             629:   48(fvec4) Load 53(inF0)
-             630:   48(fvec4) Load 54(inF1)
-             631:   48(fvec4) ExtInst 1(GLSL.std.450) 53(Ldexp) 629 630
-             632:   48(fvec4) Load 53(inF0)
-             633:   48(fvec4) Load 54(inF1)
-             634:   48(fvec4) Load 55(inF2)
-             635:   48(fvec4) ExtInst 1(GLSL.std.450) 46(FMix) 632 633 634
+             613:   48(fvec4) Load 54(inF1)
+             614:   48(fvec4) Load 55(inF2)
+             615:   48(fvec4) ExtInst 1(GLSL.std.450) 46(FMix) 612 613 614
+             616:   48(fvec4) Load 53(inF0)
+             617:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 616
+             618:   48(fvec4) Load 53(inF0)
+             619:   48(fvec4) ExtInst 1(GLSL.std.450) 28(Log) 618
+             620:   48(fvec4) Load 53(inF0)
+             621:   48(fvec4) ExtInst 1(GLSL.std.450) 30(Log2) 620
+             622:   48(fvec4) VectorTimesScalar 621 196
+             623:   48(fvec4) Load 53(inF0)
+             624:   48(fvec4) ExtInst 1(GLSL.std.450) 30(Log2) 623
+             625:   48(fvec4) Load 53(inF0)
+             626:   48(fvec4) Load 54(inF1)
+             627:   48(fvec4) ExtInst 1(GLSL.std.450) 40(FMax) 625 626
+             628:   48(fvec4) Load 53(inF0)
+             629:   48(fvec4) Load 54(inF1)
+             630:   48(fvec4) ExtInst 1(GLSL.std.450) 37(FMin) 628 629
+             631:   48(fvec4) Load 53(inF0)
+             632:   48(fvec4) ExtInst 1(GLSL.std.450) 69(Normalize) 631
+             633:   48(fvec4) Load 53(inF0)
+             634:   48(fvec4) Load 54(inF1)
+             635:   48(fvec4) ExtInst 1(GLSL.std.450) 26(Pow) 633 634
              636:   48(fvec4) Load 53(inF0)
-             637:    6(float) ExtInst 1(GLSL.std.450) 66(Length) 636
+             637:   48(fvec4) ExtInst 1(GLSL.std.450) 11(Radians) 636
              638:   48(fvec4) Load 53(inF0)
-             639:   48(fvec4) ExtInst 1(GLSL.std.450) 28(Log) 638
-             640:   48(fvec4) Load 53(inF0)
-             641:   48(fvec4) ExtInst 1(GLSL.std.450) 30(Log2) 640
-             642:   48(fvec4) VectorTimesScalar 641 201
-             643:   48(fvec4) Load 53(inF0)
-             644:   48(fvec4) ExtInst 1(GLSL.std.450) 30(Log2) 643
-             645:   48(fvec4) Load 53(inF0)
-             646:   48(fvec4) Load 54(inF1)
-             647:   48(fvec4) ExtInst 1(GLSL.std.450) 40(FMax) 645 646
-             648:   48(fvec4) Load 53(inF0)
-             649:   48(fvec4) Load 54(inF1)
-             650:   48(fvec4) ExtInst 1(GLSL.std.450) 37(FMin) 648 649
+             639:   48(fvec4) Load 54(inF1)
+             640:   48(fvec4) ExtInst 1(GLSL.std.450) 71(Reflect) 638 639
+             641:   48(fvec4) Load 53(inF0)
+             642:   48(fvec4) Load 54(inF1)
+             643:   48(fvec4) ExtInst 1(GLSL.std.450) 72(Refract) 641 642 349
+             646:  543(ivec4) BitReverse 645
+             647:   48(fvec4) Load 53(inF0)
+             648:   48(fvec4) ExtInst 1(GLSL.std.450) 2(RoundEven) 647
+             649:   48(fvec4) Load 53(inF0)
+             650:   48(fvec4) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 649
              651:   48(fvec4) Load 53(inF0)
-             652:   48(fvec4) ExtInst 1(GLSL.std.450) 69(Normalize) 651
-             653:   48(fvec4) Load 53(inF0)
-             654:   48(fvec4) Load 54(inF1)
-             655:   48(fvec4) ExtInst 1(GLSL.std.450) 26(Pow) 653 654
-             656:   48(fvec4) Load 53(inF0)
-             657:   48(fvec4) ExtInst 1(GLSL.std.450) 11(Radians) 656
-             658:   48(fvec4) Load 53(inF0)
-             659:   48(fvec4) Load 54(inF1)
-             660:   48(fvec4) ExtInst 1(GLSL.std.450) 71(Reflect) 658 659
+             652:   48(fvec4) CompositeConstruct 218 218 218 218
+             653:   48(fvec4) CompositeConstruct 219 219 219 219
+             654:   48(fvec4) ExtInst 1(GLSL.std.450) 43(FClamp) 651 652 653
+             655:   48(fvec4) Load 53(inF0)
+             656:   48(fvec4) ExtInst 1(GLSL.std.450) 6(FSign) 655
+             657:   48(fvec4) Load 53(inF0)
+             658:   48(fvec4) ExtInst 1(GLSL.std.450) 13(Sin) 657
+             659:   48(fvec4) Load 53(inF0)
+             660:   48(fvec4) ExtInst 1(GLSL.std.450) 13(Sin) 659
+                              Store 54(inF1) 660
              661:   48(fvec4) Load 53(inF0)
-             662:   48(fvec4) Load 54(inF1)
-             663:   48(fvec4) ExtInst 1(GLSL.std.450) 72(Refract) 661 662 359
-             666:  558(ivec4) BitReverse 665
-             667:   48(fvec4) Load 53(inF0)
-             668:   48(fvec4) ExtInst 1(GLSL.std.450) 2(RoundEven) 667
+             662:   48(fvec4) ExtInst 1(GLSL.std.450) 14(Cos) 661
+                              Store 55(inF2) 662
+             663:   48(fvec4) Load 53(inF0)
+             664:   48(fvec4) ExtInst 1(GLSL.std.450) 19(Sinh) 663
+             665:   48(fvec4) Load 53(inF0)
+             666:   48(fvec4) Load 54(inF1)
+             667:   48(fvec4) Load 55(inF2)
+             668:   48(fvec4) ExtInst 1(GLSL.std.450) 49(SmoothStep) 665 666 667
              669:   48(fvec4) Load 53(inF0)
-             670:   48(fvec4) ExtInst 1(GLSL.std.450) 32(InverseSqrt) 669
+             670:   48(fvec4) ExtInst 1(GLSL.std.450) 31(Sqrt) 669
              671:   48(fvec4) Load 53(inF0)
-             672:   48(fvec4) CompositeConstruct 223 223 223 223
-             673:   48(fvec4) CompositeConstruct 224 224 224 224
-             674:   48(fvec4) ExtInst 1(GLSL.std.450) 43(FClamp) 671 672 673
-             675:   48(fvec4) Load 53(inF0)
-             676:   48(fvec4) ExtInst 1(GLSL.std.450) 6(FSign) 675
-             677:   48(fvec4) Load 53(inF0)
-             678:   48(fvec4) ExtInst 1(GLSL.std.450) 13(Sin) 677
-             679:   48(fvec4) Load 53(inF0)
-             680:   48(fvec4) ExtInst 1(GLSL.std.450) 13(Sin) 679
-                              Store 54(inF1) 680
-             681:   48(fvec4) Load 53(inF0)
-             682:   48(fvec4) ExtInst 1(GLSL.std.450) 14(Cos) 681
-                              Store 55(inF2) 682
-             683:   48(fvec4) Load 53(inF0)
-             684:   48(fvec4) ExtInst 1(GLSL.std.450) 19(Sinh) 683
-             685:   48(fvec4) Load 53(inF0)
-             686:   48(fvec4) Load 54(inF1)
-             687:   48(fvec4) Load 55(inF2)
-             688:   48(fvec4) ExtInst 1(GLSL.std.450) 49(SmoothStep) 685 686 687
-             689:   48(fvec4) Load 53(inF0)
-             690:   48(fvec4) ExtInst 1(GLSL.std.450) 31(Sqrt) 689
-             691:   48(fvec4) Load 53(inF0)
-             692:   48(fvec4) Load 54(inF1)
-             693:   48(fvec4) ExtInst 1(GLSL.std.450) 48(Step) 691 692
-             694:   48(fvec4) Load 53(inF0)
-             695:   48(fvec4) ExtInst 1(GLSL.std.450) 15(Tan) 694
-             696:   48(fvec4) Load 53(inF0)
-             697:   48(fvec4) ExtInst 1(GLSL.std.450) 21(Tanh) 696
-             698:   48(fvec4) Load 53(inF0)
-             699:   48(fvec4) ExtInst 1(GLSL.std.450) 3(Trunc) 698
-                              ReturnValue 701
+             672:   48(fvec4) Load 54(inF1)
+             673:   48(fvec4) ExtInst 1(GLSL.std.450) 48(Step) 671 672
+             674:   48(fvec4) Load 53(inF0)
+             675:   48(fvec4) ExtInst 1(GLSL.std.450) 15(Tan) 674
+             676:   48(fvec4) Load 53(inF0)
+             677:   48(fvec4) ExtInst 1(GLSL.std.450) 21(Tanh) 676
+             678:   48(fvec4) Load 53(inF0)
+             679:   48(fvec4) ExtInst 1(GLSL.std.450) 3(Trunc) 678
+                              ReturnValue 681
                               FunctionEnd
 66(VertexShaderFunction2x2(mf22;mf22;mf22;):          60 Function None 62
         63(inF0):     61(ptr) FunctionParameter
         64(inF1):     61(ptr) FunctionParameter
         65(inF2):     61(ptr) FunctionParameter
               67:             Label
-             704:          60 Load 63(inF0)
-             705:   132(bool) All 704
-             706:          60 Load 63(inF0)
-             707:          60 ExtInst 1(GLSL.std.450) 4(FAbs) 706
-             708:          60 Load 63(inF0)
-             709:          60 ExtInst 1(GLSL.std.450) 17(Acos) 708
-             710:          60 Load 63(inF0)
-             711:   132(bool) Any 710
-             712:          60 Load 63(inF0)
-             713:          60 ExtInst 1(GLSL.std.450) 16(Asin) 712
-             714:          60 Load 63(inF0)
-             715:          60 ExtInst 1(GLSL.std.450) 18(Atan) 714
-             716:          60 Load 63(inF0)
-             717:          60 Load 64(inF1)
-             718:          60 ExtInst 1(GLSL.std.450) 25(Atan2) 716 717
+             684:          60 Load 63(inF0)
+             685:   132(bool) All 684
+             686:          60 Load 63(inF0)
+             687:          60 ExtInst 1(GLSL.std.450) 4(FAbs) 686
+             688:          60 Load 63(inF0)
+             689:          60 ExtInst 1(GLSL.std.450) 17(Acos) 688
+             690:          60 Load 63(inF0)
+             691:   132(bool) Any 690
+             692:          60 Load 63(inF0)
+             693:          60 ExtInst 1(GLSL.std.450) 16(Asin) 692
+             694:          60 Load 63(inF0)
+             695:          60 ExtInst 1(GLSL.std.450) 18(Atan) 694
+             696:          60 Load 63(inF0)
+             697:          60 Load 64(inF1)
+             698:          60 ExtInst 1(GLSL.std.450) 25(Atan2) 696 697
+             699:          60 Load 63(inF0)
+             700:          60 ExtInst 1(GLSL.std.450) 9(Ceil) 699
+             701:          60 Load 63(inF0)
+             702:          60 Load 64(inF1)
+             703:          60 Load 65(inF2)
+             704:          60 ExtInst 1(GLSL.std.450) 43(FClamp) 701 702 703
+             705:          60 Load 63(inF0)
+             706:          60 ExtInst 1(GLSL.std.450) 14(Cos) 705
+             707:          60 Load 63(inF0)
+             708:          60 ExtInst 1(GLSL.std.450) 20(Cosh) 707
+             709:          60 Load 63(inF0)
+             710:          60 ExtInst 1(GLSL.std.450) 12(Degrees) 709
+             711:          60 Load 63(inF0)
+             712:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 711
+             713:          60 Load 63(inF0)
+             714:          60 ExtInst 1(GLSL.std.450) 27(Exp) 713
+             715:          60 Load 63(inF0)
+             716:          60 ExtInst 1(GLSL.std.450) 29(Exp2) 715
+             717:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
+             718:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
              719:          60 Load 63(inF0)
-             720:          60 ExtInst 1(GLSL.std.450) 9(Ceil) 719
+             720:          60 ExtInst 1(GLSL.std.450) 8(Floor) 719
              721:          60 Load 63(inF0)
              722:          60 Load 64(inF1)
-             723:          60 Load 65(inF2)
-             724:          60 ExtInst 1(GLSL.std.450) 43(FClamp) 721 722 723
-             725:          60 Load 63(inF0)
-             726:          60 ExtInst 1(GLSL.std.450) 14(Cos) 725
-             727:          60 Load 63(inF0)
-             728:          60 ExtInst 1(GLSL.std.450) 20(Cosh) 727
-             729:          60 Load 63(inF0)
-             730:          60 ExtInst 1(GLSL.std.450) 12(Degrees) 729
-             731:          60 Load 63(inF0)
-             732:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 731
-             733:          60 Load 63(inF0)
-             734:          60 ExtInst 1(GLSL.std.450) 27(Exp) 733
+             723:   24(fvec2) CompositeExtract 721 0
+             724:   24(fvec2) CompositeExtract 722 0
+             725:   24(fvec2) FMod 723 724
+             726:   24(fvec2) CompositeExtract 721 1
+             727:   24(fvec2) CompositeExtract 722 1
+             728:   24(fvec2) FMod 726 727
+             729:          60 CompositeConstruct 725 728
+             730:          60 Load 63(inF0)
+             731:          60 ExtInst 1(GLSL.std.450) 10(Fract) 730
+             732:          60 Load 63(inF0)
+             733:          60 Load 64(inF1)
+             734:          60 ExtInst 1(GLSL.std.450) 53(Ldexp) 732 733
              735:          60 Load 63(inF0)
-             736:          60 ExtInst 1(GLSL.std.450) 29(Exp2) 735
-             737:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
-             738:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
+             736:          60 Load 64(inF1)
+             737:          60 Load 65(inF2)
+             738:          60 ExtInst 1(GLSL.std.450) 46(FMix) 735 736 737
              739:          60 Load 63(inF0)
-             740:          60 ExtInst 1(GLSL.std.450) 8(Floor) 739
+             740:          60 ExtInst 1(GLSL.std.450) 28(Log) 739
              741:          60 Load 63(inF0)
-             742:          60 Load 64(inF1)
-             743:   24(fvec2) CompositeExtract 741 0
-             744:   24(fvec2) CompositeExtract 742 0
-             745:   24(fvec2) FMod 743 744
-             746:   24(fvec2) CompositeExtract 741 1
-             747:   24(fvec2) CompositeExtract 742 1
-             748:   24(fvec2) FMod 746 747
-             749:          60 CompositeConstruct 745 748
-             750:          60 Load 63(inF0)
-             751:          60 ExtInst 1(GLSL.std.450) 10(Fract) 750
+             742:          60 ExtInst 1(GLSL.std.450) 30(Log2) 741
+             743:          60 MatrixTimesScalar 742 196
+             744:          60 Load 63(inF0)
+             745:          60 ExtInst 1(GLSL.std.450) 30(Log2) 744
+             746:          60 Load 63(inF0)
+             747:          60 Load 64(inF1)
+             748:          60 ExtInst 1(GLSL.std.450) 40(FMax) 746 747
+             749:          60 Load 63(inF0)
+             750:          60 Load 64(inF1)
+             751:          60 ExtInst 1(GLSL.std.450) 37(FMin) 749 750
              752:          60 Load 63(inF0)
-             754:753(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 752
-             755:  266(ivec2) CompositeExtract 754 1
-                              Store 64(inF1) 755
-             756:          60 CompositeExtract 754 0
+             753:          60 Load 64(inF1)
+             754:          60 ExtInst 1(GLSL.std.450) 26(Pow) 752 753
+             755:          60 Load 63(inF0)
+             756:          60 ExtInst 1(GLSL.std.450) 11(Radians) 755
              757:          60 Load 63(inF0)
-             758:          60 Load 64(inF1)
-             759:          60 ExtInst 1(GLSL.std.450) 53(Ldexp) 757 758
-             760:          60 Load 63(inF0)
-             761:          60 Load 64(inF1)
-             762:          60 Load 65(inF2)
-             763:          60 ExtInst 1(GLSL.std.450) 46(FMix) 760 761 762
-             764:          60 Load 63(inF0)
-             765:          60 ExtInst 1(GLSL.std.450) 28(Log) 764
-             766:          60 Load 63(inF0)
-             767:          60 ExtInst 1(GLSL.std.450) 30(Log2) 766
-             768:          60 MatrixTimesScalar 767 201
+             758:          60 ExtInst 1(GLSL.std.450) 2(RoundEven) 757
+             759:          60 Load 63(inF0)
+             760:          60 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 759
+             761:          60 Load 63(inF0)
+             762:   24(fvec2) CompositeConstruct 218 218
+             763:   24(fvec2) CompositeConstruct 219 219
+             764:          60 ExtInst 1(GLSL.std.450) 43(FClamp) 761 762 763
+             765:          60 Load 63(inF0)
+             766:          60 ExtInst 1(GLSL.std.450) 6(FSign) 765
+             767:          60 Load 63(inF0)
+             768:          60 ExtInst 1(GLSL.std.450) 13(Sin) 767
              769:          60 Load 63(inF0)
-             770:          60 ExtInst 1(GLSL.std.450) 30(Log2) 769
+             770:          60 ExtInst 1(GLSL.std.450) 13(Sin) 769
+                              Store 64(inF1) 770
              771:          60 Load 63(inF0)
-             772:          60 Load 64(inF1)
-             773:          60 ExtInst 1(GLSL.std.450) 40(FMax) 771 772
-             774:          60 Load 63(inF0)
-             775:          60 Load 64(inF1)
-             776:          60 ExtInst 1(GLSL.std.450) 37(FMin) 774 775
-             777:          60 Load 63(inF0)
-             778:          60 Load 64(inF1)
-             779:          60 ExtInst 1(GLSL.std.450) 26(Pow) 777 778
-             780:          60 Load 63(inF0)
-             781:          60 ExtInst 1(GLSL.std.450) 11(Radians) 780
-             782:          60 Load 63(inF0)
-             783:          60 ExtInst 1(GLSL.std.450) 2(RoundEven) 782
+             772:          60 ExtInst 1(GLSL.std.450) 14(Cos) 771
+                              Store 65(inF2) 772
+             773:          60 Load 63(inF0)
+             774:          60 ExtInst 1(GLSL.std.450) 19(Sinh) 773
+             775:          60 Load 63(inF0)
+             776:          60 Load 64(inF1)
+             777:          60 Load 65(inF2)
+             778:          60 ExtInst 1(GLSL.std.450) 49(SmoothStep) 775 776 777
+             779:          60 Load 63(inF0)
+             780:          60 ExtInst 1(GLSL.std.450) 31(Sqrt) 779
+             781:          60 Load 63(inF0)
+             782:          60 Load 64(inF1)
+             783:          60 ExtInst 1(GLSL.std.450) 48(Step) 781 782
              784:          60 Load 63(inF0)
-             785:          60 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 784
+             785:          60 ExtInst 1(GLSL.std.450) 15(Tan) 784
              786:          60 Load 63(inF0)
-             787:   24(fvec2) CompositeConstruct 223 223
-             788:   24(fvec2) CompositeConstruct 224 224
-             789:          60 ExtInst 1(GLSL.std.450) 43(FClamp) 786 787 788
+             787:          60 ExtInst 1(GLSL.std.450) 21(Tanh) 786
+             788:          60 Load 63(inF0)
+             789:          60 Transpose 788
              790:          60 Load 63(inF0)
-             791:          60 ExtInst 1(GLSL.std.450) 6(FSign) 790
-             792:          60 Load 63(inF0)
-             793:          60 ExtInst 1(GLSL.std.450) 13(Sin) 792
-             794:          60 Load 63(inF0)
-             795:          60 ExtInst 1(GLSL.std.450) 13(Sin) 794
-                              Store 64(inF1) 795
-             796:          60 Load 63(inF0)
-             797:          60 ExtInst 1(GLSL.std.450) 14(Cos) 796
-                              Store 65(inF2) 797
-             798:          60 Load 63(inF0)
-             799:          60 ExtInst 1(GLSL.std.450) 19(Sinh) 798
-             800:          60 Load 63(inF0)
-             801:          60 Load 64(inF1)
-             802:          60 Load 65(inF2)
-             803:          60 ExtInst 1(GLSL.std.450) 49(SmoothStep) 800 801 802
-             804:          60 Load 63(inF0)
-             805:          60 ExtInst 1(GLSL.std.450) 31(Sqrt) 804
-             806:          60 Load 63(inF0)
-             807:          60 Load 64(inF1)
-             808:          60 ExtInst 1(GLSL.std.450) 48(Step) 806 807
-             809:          60 Load 63(inF0)
-             810:          60 ExtInst 1(GLSL.std.450) 15(Tan) 809
-             811:          60 Load 63(inF0)
-             812:          60 ExtInst 1(GLSL.std.450) 21(Tanh) 811
-             813:          60 Load 63(inF0)
-             814:          60 Transpose 813
-             815:          60 Load 63(inF0)
-             816:          60 ExtInst 1(GLSL.std.450) 3(Trunc) 815
-                              ReturnValue 818
+             791:          60 ExtInst 1(GLSL.std.450) 3(Trunc) 790
+                              ReturnValue 793
                               FunctionEnd
 74(VertexShaderFunction3x3(mf33;mf33;mf33;):          68 Function None 70
         71(inF0):     69(ptr) FunctionParameter
         72(inF1):     69(ptr) FunctionParameter
         73(inF2):     69(ptr) FunctionParameter
               75:             Label
+             796:          68 Load 71(inF0)
+             797:   132(bool) All 796
+             798:          68 Load 71(inF0)
+             799:          68 ExtInst 1(GLSL.std.450) 4(FAbs) 798
+             800:          68 Load 71(inF0)
+             801:          68 ExtInst 1(GLSL.std.450) 17(Acos) 800
+             802:          68 Load 71(inF0)
+             803:   132(bool) Any 802
+             804:          68 Load 71(inF0)
+             805:          68 ExtInst 1(GLSL.std.450) 16(Asin) 804
+             806:          68 Load 71(inF0)
+             807:          68 ExtInst 1(GLSL.std.450) 18(Atan) 806
+             808:          68 Load 71(inF0)
+             809:          68 Load 72(inF1)
+             810:          68 ExtInst 1(GLSL.std.450) 25(Atan2) 808 809
+             811:          68 Load 71(inF0)
+             812:          68 ExtInst 1(GLSL.std.450) 9(Ceil) 811
+             813:          68 Load 71(inF0)
+             814:          68 Load 72(inF1)
+             815:          68 Load 73(inF2)
+             816:          68 ExtInst 1(GLSL.std.450) 43(FClamp) 813 814 815
+             817:          68 Load 71(inF0)
+             818:          68 ExtInst 1(GLSL.std.450) 14(Cos) 817
+             819:          68 Load 71(inF0)
+             820:          68 ExtInst 1(GLSL.std.450) 20(Cosh) 819
              821:          68 Load 71(inF0)
-             822:   132(bool) All 821
+             822:          68 ExtInst 1(GLSL.std.450) 12(Degrees) 821
              823:          68 Load 71(inF0)
-             824:          68 ExtInst 1(GLSL.std.450) 4(FAbs) 823
+             824:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 823
              825:          68 Load 71(inF0)
-             826:          68 ExtInst 1(GLSL.std.450) 17(Acos) 825
+             826:          68 ExtInst 1(GLSL.std.450) 27(Exp) 825
              827:          68 Load 71(inF0)
-             828:   132(bool) Any 827
-             829:          68 Load 71(inF0)
-             830:          68 ExtInst 1(GLSL.std.450) 16(Asin) 829
+             828:          68 ExtInst 1(GLSL.std.450) 29(Exp2) 827
+             829:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
+             830:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
              831:          68 Load 71(inF0)
-             832:          68 ExtInst 1(GLSL.std.450) 18(Atan) 831
+             832:          68 ExtInst 1(GLSL.std.450) 8(Floor) 831
              833:          68 Load 71(inF0)
              834:          68 Load 72(inF1)
-             835:          68 ExtInst 1(GLSL.std.450) 25(Atan2) 833 834
-             836:          68 Load 71(inF0)
-             837:          68 ExtInst 1(GLSL.std.450) 9(Ceil) 836
-             838:          68 Load 71(inF0)
-             839:          68 Load 72(inF1)
-             840:          68 Load 73(inF2)
-             841:          68 ExtInst 1(GLSL.std.450) 43(FClamp) 838 839 840
-             842:          68 Load 71(inF0)
-             843:          68 ExtInst 1(GLSL.std.450) 14(Cos) 842
-             844:          68 Load 71(inF0)
-             845:          68 ExtInst 1(GLSL.std.450) 20(Cosh) 844
-             846:          68 Load 71(inF0)
-             847:          68 ExtInst 1(GLSL.std.450) 12(Degrees) 846
-             848:          68 Load 71(inF0)
-             849:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 848
+             835:   36(fvec3) CompositeExtract 833 0
+             836:   36(fvec3) CompositeExtract 834 0
+             837:   36(fvec3) FMod 835 836
+             838:   36(fvec3) CompositeExtract 833 1
+             839:   36(fvec3) CompositeExtract 834 1
+             840:   36(fvec3) FMod 838 839
+             841:   36(fvec3) CompositeExtract 833 2
+             842:   36(fvec3) CompositeExtract 834 2
+             843:   36(fvec3) FMod 841 842
+             844:          68 CompositeConstruct 837 840 843
+             845:          68 Load 71(inF0)
+             846:          68 ExtInst 1(GLSL.std.450) 10(Fract) 845
+             847:          68 Load 71(inF0)
+             848:          68 Load 72(inF1)
+             849:          68 ExtInst 1(GLSL.std.450) 53(Ldexp) 847 848
              850:          68 Load 71(inF0)
-             851:          68 ExtInst 1(GLSL.std.450) 27(Exp) 850
-             852:          68 Load 71(inF0)
-             853:          68 ExtInst 1(GLSL.std.450) 29(Exp2) 852
-             854:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
-             855:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
+             851:          68 Load 72(inF1)
+             852:          68 Load 73(inF2)
+             853:          68 ExtInst 1(GLSL.std.450) 46(FMix) 850 851 852
+             854:          68 Load 71(inF0)
+             855:          68 ExtInst 1(GLSL.std.450) 28(Log) 854
              856:          68 Load 71(inF0)
-             857:          68 ExtInst 1(GLSL.std.450) 8(Floor) 856
-             858:          68 Load 71(inF0)
-             859:          68 Load 72(inF1)
-             860:   36(fvec3) CompositeExtract 858 0
-             861:   36(fvec3) CompositeExtract 859 0
-             862:   36(fvec3) FMod 860 861
-             863:   36(fvec3) CompositeExtract 858 1
-             864:   36(fvec3) CompositeExtract 859 1
-             865:   36(fvec3) FMod 863 864
-             866:   36(fvec3) CompositeExtract 858 2
-             867:   36(fvec3) CompositeExtract 859 2
-             868:   36(fvec3) FMod 866 867
-             869:          68 CompositeConstruct 862 865 868
+             857:          68 ExtInst 1(GLSL.std.450) 30(Log2) 856
+             858:          68 MatrixTimesScalar 857 196
+             859:          68 Load 71(inF0)
+             860:          68 ExtInst 1(GLSL.std.450) 30(Log2) 859
+             861:          68 Load 71(inF0)
+             862:          68 Load 72(inF1)
+             863:          68 ExtInst 1(GLSL.std.450) 40(FMax) 861 862
+             864:          68 Load 71(inF0)
+             865:          68 Load 72(inF1)
+             866:          68 ExtInst 1(GLSL.std.450) 37(FMin) 864 865
+             867:          68 Load 71(inF0)
+             868:          68 Load 72(inF1)
+             869:          68 ExtInst 1(GLSL.std.450) 26(Pow) 867 868
              870:          68 Load 71(inF0)
-             871:          68 ExtInst 1(GLSL.std.450) 10(Fract) 870
+             871:          68 ExtInst 1(GLSL.std.450) 11(Radians) 870
              872:          68 Load 71(inF0)
-             874:873(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 872
-             875:  411(ivec3) CompositeExtract 874 1
-                              Store 72(inF1) 875
-             876:          68 CompositeExtract 874 0
-             877:          68 Load 71(inF0)
-             878:          68 Load 72(inF1)
-             879:          68 ExtInst 1(GLSL.std.450) 53(Ldexp) 877 878
+             873:          68 ExtInst 1(GLSL.std.450) 2(RoundEven) 872
+             874:          68 Load 71(inF0)
+             875:          68 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 874
+             876:          68 Load 71(inF0)
+             877:   36(fvec3) CompositeConstruct 218 218 218
+             878:   36(fvec3) CompositeConstruct 219 219 219
+             879:          68 ExtInst 1(GLSL.std.450) 43(FClamp) 876 877 878
              880:          68 Load 71(inF0)
-             881:          68 Load 72(inF1)
-             882:          68 Load 73(inF2)
-             883:          68 ExtInst 1(GLSL.std.450) 46(FMix) 880 881 882
+             881:          68 ExtInst 1(GLSL.std.450) 6(FSign) 880
+             882:          68 Load 71(inF0)
+             883:          68 ExtInst 1(GLSL.std.450) 13(Sin) 882
              884:          68 Load 71(inF0)
-             885:          68 ExtInst 1(GLSL.std.450) 28(Log) 884
+             885:          68 ExtInst 1(GLSL.std.450) 13(Sin) 884
+                              Store 72(inF1) 885
              886:          68 Load 71(inF0)
-             887:          68 ExtInst 1(GLSL.std.450) 30(Log2) 886
-             888:          68 MatrixTimesScalar 887 201
-             889:          68 Load 71(inF0)
-             890:          68 ExtInst 1(GLSL.std.450) 30(Log2) 889
-             891:          68 Load 71(inF0)
-             892:          68 Load 72(inF1)
-             893:          68 ExtInst 1(GLSL.std.450) 40(FMax) 891 892
+             887:          68 ExtInst 1(GLSL.std.450) 14(Cos) 886
+                              Store 73(inF2) 887
+             888:          68 Load 71(inF0)
+             889:          68 ExtInst 1(GLSL.std.450) 19(Sinh) 888
+             890:          68 Load 71(inF0)
+             891:          68 Load 72(inF1)
+             892:          68 Load 73(inF2)
+             893:          68 ExtInst 1(GLSL.std.450) 49(SmoothStep) 890 891 892
              894:          68 Load 71(inF0)
-             895:          68 Load 72(inF1)
-             896:          68 ExtInst 1(GLSL.std.450) 37(FMin) 894 895
-             897:          68 Load 71(inF0)
-             898:          68 Load 72(inF1)
-             899:          68 ExtInst 1(GLSL.std.450) 26(Pow) 897 898
-             900:          68 Load 71(inF0)
-             901:          68 ExtInst 1(GLSL.std.450) 11(Radians) 900
-             902:          68 Load 71(inF0)
-             903:          68 ExtInst 1(GLSL.std.450) 2(RoundEven) 902
-             904:          68 Load 71(inF0)
-             905:          68 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 904
-             906:          68 Load 71(inF0)
-             907:   36(fvec3) CompositeConstruct 223 223 223
-             908:   36(fvec3) CompositeConstruct 224 224 224
-             909:          68 ExtInst 1(GLSL.std.450) 43(FClamp) 906 907 908
-             910:          68 Load 71(inF0)
-             911:          68 ExtInst 1(GLSL.std.450) 6(FSign) 910
-             912:          68 Load 71(inF0)
-             913:          68 ExtInst 1(GLSL.std.450) 13(Sin) 912
-             914:          68 Load 71(inF0)
-             915:          68 ExtInst 1(GLSL.std.450) 13(Sin) 914
-                              Store 72(inF1) 915
-             916:          68 Load 71(inF0)
-             917:          68 ExtInst 1(GLSL.std.450) 14(Cos) 916
-                              Store 73(inF2) 917
-             918:          68 Load 71(inF0)
-             919:          68 ExtInst 1(GLSL.std.450) 19(Sinh) 918
-             920:          68 Load 71(inF0)
-             921:          68 Load 72(inF1)
-             922:          68 Load 73(inF2)
-             923:          68 ExtInst 1(GLSL.std.450) 49(SmoothStep) 920 921 922
-             924:          68 Load 71(inF0)
-             925:          68 ExtInst 1(GLSL.std.450) 31(Sqrt) 924
-             926:          68 Load 71(inF0)
-             927:          68 Load 72(inF1)
-             928:          68 ExtInst 1(GLSL.std.450) 48(Step) 926 927
-             929:          68 Load 71(inF0)
-             930:          68 ExtInst 1(GLSL.std.450) 15(Tan) 929
-             931:          68 Load 71(inF0)
-             932:          68 ExtInst 1(GLSL.std.450) 21(Tanh) 931
-             933:          68 Load 71(inF0)
-             934:          68 Transpose 933
-             935:          68 Load 71(inF0)
-             936:          68 ExtInst 1(GLSL.std.450) 3(Trunc) 935
-                              ReturnValue 938
+             895:          68 ExtInst 1(GLSL.std.450) 31(Sqrt) 894
+             896:          68 Load 71(inF0)
+             897:          68 Load 72(inF1)
+             898:          68 ExtInst 1(GLSL.std.450) 48(Step) 896 897
+             899:          68 Load 71(inF0)
+             900:          68 ExtInst 1(GLSL.std.450) 15(Tan) 899
+             901:          68 Load 71(inF0)
+             902:          68 ExtInst 1(GLSL.std.450) 21(Tanh) 901
+             903:          68 Load 71(inF0)
+             904:          68 Transpose 903
+             905:          68 Load 71(inF0)
+             906:          68 ExtInst 1(GLSL.std.450) 3(Trunc) 905
+                              ReturnValue 908
                               FunctionEnd
 82(VertexShaderFunction4x4(mf44;mf44;mf44;):          76 Function None 78
         79(inF0):     77(ptr) FunctionParameter
         80(inF1):     77(ptr) FunctionParameter
         81(inF2):     77(ptr) FunctionParameter
               83:             Label
-             941:          76 Load 79(inF0)
-             942:   132(bool) All 941
-             943:          76 Load 79(inF0)
-             944:          76 ExtInst 1(GLSL.std.450) 4(FAbs) 943
-             945:          76 Load 79(inF0)
-             946:          76 ExtInst 1(GLSL.std.450) 17(Acos) 945
-             947:          76 Load 79(inF0)
-             948:   132(bool) Any 947
-             949:          76 Load 79(inF0)
-             950:          76 ExtInst 1(GLSL.std.450) 16(Asin) 949
-             951:          76 Load 79(inF0)
-             952:          76 ExtInst 1(GLSL.std.450) 18(Atan) 951
-             953:          76 Load 79(inF0)
-             954:          76 Load 80(inF1)
-             955:          76 ExtInst 1(GLSL.std.450) 25(Atan2) 953 954
-             956:          76 Load 79(inF0)
-             957:          76 ExtInst 1(GLSL.std.450) 9(Ceil) 956
-             958:          76 Load 79(inF0)
-             959:          76 Load 80(inF1)
-             960:          76 Load 81(inF2)
-             961:          76 ExtInst 1(GLSL.std.450) 43(FClamp) 958 959 960
-             962:          76 Load 79(inF0)
-             963:          76 ExtInst 1(GLSL.std.450) 14(Cos) 962
-             964:          76 Load 79(inF0)
-             965:          76 ExtInst 1(GLSL.std.450) 20(Cosh) 964
-             966:          76 Load 79(inF0)
-             967:          76 ExtInst 1(GLSL.std.450) 12(Degrees) 966
+             911:          76 Load 79(inF0)
+             912:   132(bool) All 911
+             913:          76 Load 79(inF0)
+             914:          76 ExtInst 1(GLSL.std.450) 4(FAbs) 913
+             915:          76 Load 79(inF0)
+             916:          76 ExtInst 1(GLSL.std.450) 17(Acos) 915
+             917:          76 Load 79(inF0)
+             918:   132(bool) Any 917
+             919:          76 Load 79(inF0)
+             920:          76 ExtInst 1(GLSL.std.450) 16(Asin) 919
+             921:          76 Load 79(inF0)
+             922:          76 ExtInst 1(GLSL.std.450) 18(Atan) 921
+             923:          76 Load 79(inF0)
+             924:          76 Load 80(inF1)
+             925:          76 ExtInst 1(GLSL.std.450) 25(Atan2) 923 924
+             926:          76 Load 79(inF0)
+             927:          76 ExtInst 1(GLSL.std.450) 9(Ceil) 926
+             928:          76 Load 79(inF0)
+             929:          76 Load 80(inF1)
+             930:          76 Load 81(inF2)
+             931:          76 ExtInst 1(GLSL.std.450) 43(FClamp) 928 929 930
+             932:          76 Load 79(inF0)
+             933:          76 ExtInst 1(GLSL.std.450) 14(Cos) 932
+             934:          76 Load 79(inF0)
+             935:          76 ExtInst 1(GLSL.std.450) 20(Cosh) 934
+             936:          76 Load 79(inF0)
+             937:          76 ExtInst 1(GLSL.std.450) 12(Degrees) 936
+             938:          76 Load 79(inF0)
+             939:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 938
+             940:          76 Load 79(inF0)
+             941:          76 ExtInst 1(GLSL.std.450) 27(Exp) 940
+             942:          76 Load 79(inF0)
+             943:          76 ExtInst 1(GLSL.std.450) 29(Exp2) 942
+             944:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
+             945:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
+             946:          76 Load 79(inF0)
+             947:          76 ExtInst 1(GLSL.std.450) 8(Floor) 946
+             948:          76 Load 79(inF0)
+             949:          76 Load 80(inF1)
+             950:   48(fvec4) CompositeExtract 948 0
+             951:   48(fvec4) CompositeExtract 949 0
+             952:   48(fvec4) FMod 950 951
+             953:   48(fvec4) CompositeExtract 948 1
+             954:   48(fvec4) CompositeExtract 949 1
+             955:   48(fvec4) FMod 953 954
+             956:   48(fvec4) CompositeExtract 948 2
+             957:   48(fvec4) CompositeExtract 949 2
+             958:   48(fvec4) FMod 956 957
+             959:   48(fvec4) CompositeExtract 948 3
+             960:   48(fvec4) CompositeExtract 949 3
+             961:   48(fvec4) FMod 959 960
+             962:          76 CompositeConstruct 952 955 958 961
+             963:          76 Load 79(inF0)
+             964:          76 ExtInst 1(GLSL.std.450) 10(Fract) 963
+             965:          76 Load 79(inF0)
+             966:          76 Load 80(inF1)
+             967:          76 ExtInst 1(GLSL.std.450) 53(Ldexp) 965 966
              968:          76 Load 79(inF0)
-             969:    6(float) ExtInst 1(GLSL.std.450) 33(Determinant) 968
-             970:          76 Load 79(inF0)
-             971:          76 ExtInst 1(GLSL.std.450) 27(Exp) 970
+             969:          76 Load 80(inF1)
+             970:          76 Load 81(inF2)
+             971:          76 ExtInst 1(GLSL.std.450) 46(FMix) 968 969 970
              972:          76 Load 79(inF0)
-             973:          76 ExtInst 1(GLSL.std.450) 29(Exp2) 972
-             974:    143(int) ExtInst 1(GLSL.std.450) 74(FindSMsb) 164
-             975:    143(int) ExtInst 1(GLSL.std.450) 73(FindILsb) 164
-             976:          76 Load 79(inF0)
-             977:          76 ExtInst 1(GLSL.std.450) 8(Floor) 976
-             978:          76 Load 79(inF0)
-             979:          76 Load 80(inF1)
-             980:   48(fvec4) CompositeExtract 978 0
-             981:   48(fvec4) CompositeExtract 979 0
-             982:   48(fvec4) FMod 980 981
-             983:   48(fvec4) CompositeExtract 978 1
-             984:   48(fvec4) CompositeExtract 979 1
-             985:   48(fvec4) FMod 983 984
-             986:   48(fvec4) CompositeExtract 978 2
-             987:   48(fvec4) CompositeExtract 979 2
-             988:   48(fvec4) FMod 986 987
-             989:   48(fvec4) CompositeExtract 978 3
-             990:   48(fvec4) CompositeExtract 979 3
-             991:   48(fvec4) FMod 989 990
-             992:          76 CompositeConstruct 982 985 988 991
-             993:          76 Load 79(inF0)
-             994:          76 ExtInst 1(GLSL.std.450) 10(Fract) 993
-             995:          76 Load 79(inF0)
-             997:996(ResType) ExtInst 1(GLSL.std.450) 52(FrexpStruct) 995
-             998:  558(ivec4) CompositeExtract 997 1
-                              Store 80(inF1) 998
-             999:          76 CompositeExtract 997 0
+             973:          76 ExtInst 1(GLSL.std.450) 28(Log) 972
+             974:          76 Load 79(inF0)
+             975:          76 ExtInst 1(GLSL.std.450) 30(Log2) 974
+             976:          76 MatrixTimesScalar 975 196
+             977:          76 Load 79(inF0)
+             978:          76 ExtInst 1(GLSL.std.450) 30(Log2) 977
+             979:          76 Load 79(inF0)
+             980:          76 Load 80(inF1)
+             981:          76 ExtInst 1(GLSL.std.450) 40(FMax) 979 980
+             982:          76 Load 79(inF0)
+             983:          76 Load 80(inF1)
+             984:          76 ExtInst 1(GLSL.std.450) 37(FMin) 982 983
+             985:          76 Load 79(inF0)
+             986:          76 Load 80(inF1)
+             987:          76 ExtInst 1(GLSL.std.450) 26(Pow) 985 986
+             988:          76 Load 79(inF0)
+             989:          76 ExtInst 1(GLSL.std.450) 11(Radians) 988
+             990:          76 Load 79(inF0)
+             991:          76 ExtInst 1(GLSL.std.450) 2(RoundEven) 990
+             992:          76 Load 79(inF0)
+             993:          76 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 992
+             994:          76 Load 79(inF0)
+             995:   48(fvec4) CompositeConstruct 218 218 218 218
+             996:   48(fvec4) CompositeConstruct 219 219 219 219
+             997:          76 ExtInst 1(GLSL.std.450) 43(FClamp) 994 995 996
+             998:          76 Load 79(inF0)
+             999:          76 ExtInst 1(GLSL.std.450) 6(FSign) 998
             1000:          76 Load 79(inF0)
-            1001:          76 Load 80(inF1)
-            1002:          76 ExtInst 1(GLSL.std.450) 53(Ldexp) 1000 1001
-            1003:          76 Load 79(inF0)
-            1004:          76 Load 80(inF1)
-            1005:          76 Load 81(inF2)
-            1006:          76 ExtInst 1(GLSL.std.450) 46(FMix) 1003 1004 1005
-            1007:          76 Load 79(inF0)
-            1008:          76 ExtInst 1(GLSL.std.450) 28(Log) 1007
-            1009:          76 Load 79(inF0)
-            1010:          76 ExtInst 1(GLSL.std.450) 30(Log2) 1009
-            1011:          76 MatrixTimesScalar 1010 201
+            1001:          76 ExtInst 1(GLSL.std.450) 13(Sin) 1000
+            1002:          76 Load 79(inF0)
+            1003:          76 ExtInst 1(GLSL.std.450) 13(Sin) 1002
+                              Store 80(inF1) 1003
+            1004:          76 Load 79(inF0)
+            1005:          76 ExtInst 1(GLSL.std.450) 14(Cos) 1004
+                              Store 81(inF2) 1005
+            1006:          76 Load 79(inF0)
+            1007:          76 ExtInst 1(GLSL.std.450) 19(Sinh) 1006
+            1008:          76 Load 79(inF0)
+            1009:          76 Load 80(inF1)
+            1010:          76 Load 81(inF2)
+            1011:          76 ExtInst 1(GLSL.std.450) 49(SmoothStep) 1008 1009 1010
             1012:          76 Load 79(inF0)
-            1013:          76 ExtInst 1(GLSL.std.450) 30(Log2) 1012
+            1013:          76 ExtInst 1(GLSL.std.450) 31(Sqrt) 1012
             1014:          76 Load 79(inF0)
             1015:          76 Load 80(inF1)
-            1016:          76 ExtInst 1(GLSL.std.450) 40(FMax) 1014 1015
+            1016:          76 ExtInst 1(GLSL.std.450) 48(Step) 1014 1015
             1017:          76 Load 79(inF0)
-            1018:          76 Load 80(inF1)
-            1019:          76 ExtInst 1(GLSL.std.450) 37(FMin) 1017 1018
-            1020:          76 Load 79(inF0)
-            1021:          76 Load 80(inF1)
-            1022:          76 ExtInst 1(GLSL.std.450) 26(Pow) 1020 1021
+            1018:          76 ExtInst 1(GLSL.std.450) 15(Tan) 1017
+            1019:          76 Load 79(inF0)
+            1020:          76 ExtInst 1(GLSL.std.450) 21(Tanh) 1019
+            1021:          76 Load 79(inF0)
+            1022:          76 Transpose 1021
             1023:          76 Load 79(inF0)
-            1024:          76 ExtInst 1(GLSL.std.450) 11(Radians) 1023
-            1025:          76 Load 79(inF0)
-            1026:          76 ExtInst 1(GLSL.std.450) 2(RoundEven) 1025
-            1027:          76 Load 79(inF0)
-            1028:          76 ExtInst 1(GLSL.std.450) 32(InverseSqrt) 1027
-            1029:          76 Load 79(inF0)
-            1030:   48(fvec4) CompositeConstruct 223 223 223 223
-            1031:   48(fvec4) CompositeConstruct 224 224 224 224
-            1032:          76 ExtInst 1(GLSL.std.450) 43(FClamp) 1029 1030 1031
-            1033:          76 Load 79(inF0)
-            1034:          76 ExtInst 1(GLSL.std.450) 6(FSign) 1033
-            1035:          76 Load 79(inF0)
-            1036:          76 ExtInst 1(GLSL.std.450) 13(Sin) 1035
-            1037:          76 Load 79(inF0)
-            1038:          76 ExtInst 1(GLSL.std.450) 13(Sin) 1037
-                              Store 80(inF1) 1038
-            1039:          76 Load 79(inF0)
-            1040:          76 ExtInst 1(GLSL.std.450) 14(Cos) 1039
-                              Store 81(inF2) 1040
-            1041:          76 Load 79(inF0)
-            1042:          76 ExtInst 1(GLSL.std.450) 19(Sinh) 1041
-            1043:          76 Load 79(inF0)
-            1044:          76 Load 80(inF1)
-            1045:          76 Load 81(inF2)
-            1046:          76 ExtInst 1(GLSL.std.450) 49(SmoothStep) 1043 1044 1045
-            1047:          76 Load 79(inF0)
-            1048:          76 ExtInst 1(GLSL.std.450) 31(Sqrt) 1047
-            1049:          76 Load 79(inF0)
-            1050:          76 Load 80(inF1)
-            1051:          76 ExtInst 1(GLSL.std.450) 48(Step) 1049 1050
-            1052:          76 Load 79(inF0)
-            1053:          76 ExtInst 1(GLSL.std.450) 15(Tan) 1052
-            1054:          76 Load 79(inF0)
-            1055:          76 ExtInst 1(GLSL.std.450) 21(Tanh) 1054
-            1056:          76 Load 79(inF0)
-            1057:          76 Transpose 1056
-            1058:          76 Load 79(inF0)
-            1059:          76 ExtInst 1(GLSL.std.450) 3(Trunc) 1058
-                              ReturnValue 1061
+            1024:          76 ExtInst 1(GLSL.std.450) 3(Trunc) 1023
+                              ReturnValue 1026
                               FunctionEnd
 91(TestGenMul2(f1;f1;vf2;vf2;mf22;mf22;):           2 Function None 84
         85(inF0):      7(ptr) FunctionParameter
@@ -3981,51 +3891,51 @@ Shader version: 450
        89(inFM0):     61(ptr) FunctionParameter
        90(inFM1):     61(ptr) FunctionParameter
               92:             Label
-        1064(r0):      7(ptr) Variable Function
-        1068(r1):     25(ptr) Variable Function
-        1072(r2):     25(ptr) Variable Function
-        1076(r3):      7(ptr) Variable Function
-        1080(r4):     25(ptr) Variable Function
-        1084(r5):     25(ptr) Variable Function
-        1088(r6):     61(ptr) Variable Function
-        1092(r7):     61(ptr) Variable Function
-        1096(r8):     61(ptr) Variable Function
-            1065:    6(float) Load 86(inF1)
-            1066:    6(float) Load 85(inF0)
-            1067:    6(float) FMul 1065 1066
-                              Store 1064(r0) 1067
-            1069:    6(float) Load 85(inF0)
-            1070:   24(fvec2) Load 87(inFV0)
-            1071:   24(fvec2) VectorTimesScalar 1070 1069
-                              Store 1068(r1) 1071
-            1073:   24(fvec2) Load 87(inFV0)
-            1074:    6(float) Load 85(inF0)
-            1075:   24(fvec2) VectorTimesScalar 1073 1074
-                              Store 1072(r2) 1075
-            1077:   24(fvec2) Load 87(inFV0)
-            1078:   24(fvec2) Load 88(inFV1)
-            1079:    6(float) Dot 1077 1078
-                              Store 1076(r3) 1079
-            1081:   24(fvec2) Load 87(inFV0)
-            1082:          60 Load 89(inFM0)
-            1083:   24(fvec2) VectorTimesMatrix 1081 1082
-                              Store 1080(r4) 1083
-            1085:          60 Load 89(inFM0)
-            1086:   24(fvec2) Load 87(inFV0)
-            1087:   24(fvec2) MatrixTimesVector 1085 1086
-                              Store 1084(r5) 1087
-            1089:    6(float) Load 85(inF0)
-            1090:          60 Load 89(inFM0)
-            1091:          60 MatrixTimesScalar 1090 1089
-                              Store 1088(r6) 1091
-            1093:          60 Load 89(inFM0)
-            1094:    6(float) Load 85(inF0)
-            1095:          60 MatrixTimesScalar 1093 1094
-                              Store 1092(r7) 1095
-            1097:          60 Load 90(inFM1)
-            1098:          60 Load 89(inFM0)
-            1099:          60 MatrixTimesMatrix 1097 1098
-                              Store 1096(r8) 1099
+        1029(r0):      7(ptr) Variable Function
+        1033(r1):     25(ptr) Variable Function
+        1037(r2):     25(ptr) Variable Function
+        1041(r3):      7(ptr) Variable Function
+        1045(r4):     25(ptr) Variable Function
+        1049(r5):     25(ptr) Variable Function
+        1053(r6):     61(ptr) Variable Function
+        1057(r7):     61(ptr) Variable Function
+        1061(r8):     61(ptr) Variable Function
+            1030:    6(float) Load 86(inF1)
+            1031:    6(float) Load 85(inF0)
+            1032:    6(float) FMul 1030 1031
+                              Store 1029(r0) 1032
+            1034:    6(float) Load 85(inF0)
+            1035:   24(fvec2) Load 87(inFV0)
+            1036:   24(fvec2) VectorTimesScalar 1035 1034
+                              Store 1033(r1) 1036
+            1038:   24(fvec2) Load 87(inFV0)
+            1039:    6(float) Load 85(inF0)
+            1040:   24(fvec2) VectorTimesScalar 1038 1039
+                              Store 1037(r2) 1040
+            1042:   24(fvec2) Load 87(inFV0)
+            1043:   24(fvec2) Load 88(inFV1)
+            1044:    6(float) Dot 1042 1043
+                              Store 1041(r3) 1044
+            1046:   24(fvec2) Load 87(inFV0)
+            1047:          60 Load 89(inFM0)
+            1048:   24(fvec2) VectorTimesMatrix 1046 1047
+                              Store 1045(r4) 1048
+            1050:          60 Load 89(inFM0)
+            1051:   24(fvec2) Load 87(inFV0)
+            1052:   24(fvec2) MatrixTimesVector 1050 1051
+                              Store 1049(r5) 1052
+            1054:    6(float) Load 85(inF0)
+            1055:          60 Load 89(inFM0)
+            1056:          60 MatrixTimesScalar 1055 1054
+                              Store 1053(r6) 1056
+            1058:          60 Load 89(inFM0)
+            1059:    6(float) Load 85(inF0)
+            1060:          60 MatrixTimesScalar 1058 1059
+                              Store 1057(r7) 1060
+            1062:          60 Load 90(inFM1)
+            1063:          60 Load 89(inFM0)
+            1064:          60 MatrixTimesMatrix 1062 1063
+                              Store 1061(r8) 1064
                               Return
                               FunctionEnd
 100(TestGenMul3(f1;f1;vf3;vf3;mf33;mf33;):           2 Function None 93
@@ -4036,51 +3946,51 @@ Shader version: 450
        98(inFM0):     69(ptr) FunctionParameter
        99(inFM1):     69(ptr) FunctionParameter
              101:             Label
-        1100(r0):      7(ptr) Variable Function
-        1104(r1):     37(ptr) Variable Function
-        1108(r2):     37(ptr) Variable Function
-        1112(r3):      7(ptr) Variable Function
-        1116(r4):     37(ptr) Variable Function
-        1120(r5):     37(ptr) Variable Function
-        1124(r6):     69(ptr) Variable Function
-        1128(r7):     69(ptr) Variable Function
-        1132(r8):     69(ptr) Variable Function
-            1101:    6(float) Load 95(inF1)
-            1102:    6(float) Load 94(inF0)
-            1103:    6(float) FMul 1101 1102
-                              Store 1100(r0) 1103
-            1105:    6(float) Load 94(inF0)
-            1106:   36(fvec3) Load 96(inFV0)
-            1107:   36(fvec3) VectorTimesScalar 1106 1105
-                              Store 1104(r1) 1107
-            1109:   36(fvec3) Load 96(inFV0)
-            1110:    6(float) Load 94(inF0)
-            1111:   36(fvec3) VectorTimesScalar 1109 1110
-                              Store 1108(r2) 1111
-            1113:   36(fvec3) Load 96(inFV0)
-            1114:   36(fvec3) Load 97(inFV1)
-            1115:    6(float) Dot 1113 1114
-                              Store 1112(r3) 1115
-            1117:   36(fvec3) Load 96(inFV0)
-            1118:          68 Load 98(inFM0)
-            1119:   36(fvec3) VectorTimesMatrix 1117 1118
-                              Store 1116(r4) 1119
-            1121:          68 Load 98(inFM0)
-            1122:   36(fvec3) Load 96(inFV0)
-            1123:   36(fvec3) MatrixTimesVector 1121 1122
-                              Store 1120(r5) 1123
-            1125:    6(float) Load 94(inF0)
-            1126:          68 Load 98(inFM0)
-            1127:          68 MatrixTimesScalar 1126 1125
-                              Store 1124(r6) 1127
-            1129:          68 Load 98(inFM0)
-            1130:    6(float) Load 94(inF0)
-            1131:          68 MatrixTimesScalar 1129 1130
-                              Store 1128(r7) 1131
-            1133:          68 Load 99(inFM1)
-            1134:          68 Load 98(inFM0)
-            1135:          68 MatrixTimesMatrix 1133 1134
-                              Store 1132(r8) 1135
+        1065(r0):      7(ptr) Variable Function
+        1069(r1):     37(ptr) Variable Function
+        1073(r2):     37(ptr) Variable Function
+        1077(r3):      7(ptr) Variable Function
+        1081(r4):     37(ptr) Variable Function
+        1085(r5):     37(ptr) Variable Function
+        1089(r6):     69(ptr) Variable Function
+        1093(r7):     69(ptr) Variable Function
+        1097(r8):     69(ptr) Variable Function
+            1066:    6(float) Load 95(inF1)
+            1067:    6(float) Load 94(inF0)
+            1068:    6(float) FMul 1066 1067
+                              Store 1065(r0) 1068
+            1070:    6(float) Load 94(inF0)
+            1071:   36(fvec3) Load 96(inFV0)
+            1072:   36(fvec3) VectorTimesScalar 1071 1070
+                              Store 1069(r1) 1072
+            1074:   36(fvec3) Load 96(inFV0)
+            1075:    6(float) Load 94(inF0)
+            1076:   36(fvec3) VectorTimesScalar 1074 1075
+                              Store 1073(r2) 1076
+            1078:   36(fvec3) Load 96(inFV0)
+            1079:   36(fvec3) Load 97(inFV1)
+            1080:    6(float) Dot 1078 1079
+                              Store 1077(r3) 1080
+            1082:   36(fvec3) Load 96(inFV0)
+            1083:          68 Load 98(inFM0)
+            1084:   36(fvec3) VectorTimesMatrix 1082 1083
+                              Store 1081(r4) 1084
+            1086:          68 Load 98(inFM0)
+            1087:   36(fvec3) Load 96(inFV0)
+            1088:   36(fvec3) MatrixTimesVector 1086 1087
+                              Store 1085(r5) 1088
+            1090:    6(float) Load 94(inF0)
+            1091:          68 Load 98(inFM0)
+            1092:          68 MatrixTimesScalar 1091 1090
+                              Store 1089(r6) 1092
+            1094:          68 Load 98(inFM0)
+            1095:    6(float) Load 94(inF0)
+            1096:          68 MatrixTimesScalar 1094 1095
+                              Store 1093(r7) 1096
+            1098:          68 Load 99(inFM1)
+            1099:          68 Load 98(inFM0)
+            1100:          68 MatrixTimesMatrix 1098 1099
+                              Store 1097(r8) 1100
                               Return
                               FunctionEnd
 109(TestGenMul4(f1;f1;vf4;vf4;mf44;mf44;):           2 Function None 102
@@ -4091,51 +4001,51 @@ Shader version: 450
       107(inFM0):     77(ptr) FunctionParameter
       108(inFM1):     77(ptr) FunctionParameter
              110:             Label
-        1136(r0):      7(ptr) Variable Function
-        1140(r1):     49(ptr) Variable Function
-        1144(r2):     49(ptr) Variable Function
-        1148(r3):      7(ptr) Variable Function
-        1152(r4):     49(ptr) Variable Function
-        1156(r5):     49(ptr) Variable Function
-        1160(r6):     77(ptr) Variable Function
-        1164(r7):     77(ptr) Variable Function
-        1168(r8):     77(ptr) Variable Function
-            1137:    6(float) Load 104(inF1)
-            1138:    6(float) Load 103(inF0)
-            1139:    6(float) FMul 1137 1138
-                              Store 1136(r0) 1139
-            1141:    6(float) Load 103(inF0)
-            1142:   48(fvec4) Load 105(inFV0)
-            1143:   48(fvec4) VectorTimesScalar 1142 1141
-                              Store 1140(r1) 1143
-            1145:   48(fvec4) Load 105(inFV0)
-            1146:    6(float) Load 103(inF0)
-            1147:   48(fvec4) VectorTimesScalar 1145 1146
-                              Store 1144(r2) 1147
-            1149:   48(fvec4) Load 105(inFV0)
-            1150:   48(fvec4) Load 106(inFV1)
-            1151:    6(float) Dot 1149 1150
-                              Store 1148(r3) 1151
-            1153:   48(fvec4) Load 105(inFV0)
-            1154:          76 Load 107(inFM0)
-            1155:   48(fvec4) VectorTimesMatrix 1153 1154
-                              Store 1152(r4) 1155
-            1157:          76 Load 107(inFM0)
-            1158:   48(fvec4) Load 105(inFV0)
-            1159:   48(fvec4) MatrixTimesVector 1157 1158
-                              Store 1156(r5) 1159
-            1161:    6(float) Load 103(inF0)
-            1162:          76 Load 107(inFM0)
-            1163:          76 MatrixTimesScalar 1162 1161
-                              Store 1160(r6) 1163
-            1165:          76 Load 107(inFM0)
-            1166:    6(float) Load 103(inF0)
-            1167:          76 MatrixTimesScalar 1165 1166
-                              Store 1164(r7) 1167
-            1169:          76 Load 108(inFM1)
-            1170:          76 Load 107(inFM0)
-            1171:          76 MatrixTimesMatrix 1169 1170
-                              Store 1168(r8) 1171
+        1101(r0):      7(ptr) Variable Function
+        1105(r1):     49(ptr) Variable Function
+        1109(r2):     49(ptr) Variable Function
+        1113(r3):      7(ptr) Variable Function
+        1117(r4):     49(ptr) Variable Function
+        1121(r5):     49(ptr) Variable Function
+        1125(r6):     77(ptr) Variable Function
+        1129(r7):     77(ptr) Variable Function
+        1133(r8):     77(ptr) Variable Function
+            1102:    6(float) Load 104(inF1)
+            1103:    6(float) Load 103(inF0)
+            1104:    6(float) FMul 1102 1103
+                              Store 1101(r0) 1104
+            1106:    6(float) Load 103(inF0)
+            1107:   48(fvec4) Load 105(inFV0)
+            1108:   48(fvec4) VectorTimesScalar 1107 1106
+                              Store 1105(r1) 1108
+            1110:   48(fvec4) Load 105(inFV0)
+            1111:    6(float) Load 103(inF0)
+            1112:   48(fvec4) VectorTimesScalar 1110 1111
+                              Store 1109(r2) 1112
+            1114:   48(fvec4) Load 105(inFV0)
+            1115:   48(fvec4) Load 106(inFV1)
+            1116:    6(float) Dot 1114 1115
+                              Store 1113(r3) 1116
+            1118:   48(fvec4) Load 105(inFV0)
+            1119:          76 Load 107(inFM0)
+            1120:   48(fvec4) VectorTimesMatrix 1118 1119
+                              Store 1117(r4) 1120
+            1122:          76 Load 107(inFM0)
+            1123:   48(fvec4) Load 105(inFV0)
+            1124:   48(fvec4) MatrixTimesVector 1122 1123
+                              Store 1121(r5) 1124
+            1126:    6(float) Load 103(inF0)
+            1127:          76 Load 107(inFM0)
+            1128:          76 MatrixTimesScalar 1127 1126
+                              Store 1125(r6) 1128
+            1130:          76 Load 107(inFM0)
+            1131:    6(float) Load 103(inF0)
+            1132:          76 MatrixTimesScalar 1130 1131
+                              Store 1129(r7) 1132
+            1134:          76 Load 108(inFM1)
+            1135:          76 Load 107(inFM0)
+            1136:          76 MatrixTimesMatrix 1134 1135
+                              Store 1133(r8) 1136
                               Return
                               FunctionEnd
 129(TestGenMulNxM(f1;f1;vf2;vf3;mf23;mf32;mf33;mf34;mf24;):           2 Function None 119
@@ -4149,90 +4059,90 @@ Shader version: 450
     127(inFM3x4):    116(ptr) FunctionParameter
     128(inFM2x4):    118(ptr) FunctionParameter
              130:             Label
-       1172(r00):      7(ptr) Variable Function
-       1176(r01):     25(ptr) Variable Function
-       1180(r02):     37(ptr) Variable Function
-       1184(r03):     25(ptr) Variable Function
-       1188(r04):     37(ptr) Variable Function
-       1192(r05):      7(ptr) Variable Function
-       1196(r06):      7(ptr) Variable Function
-       1200(r07):     37(ptr) Variable Function
-       1204(r08):     25(ptr) Variable Function
-       1208(r09):     25(ptr) Variable Function
-       1212(r10):     37(ptr) Variable Function
-       1216(r11):    112(ptr) Variable Function
-       1220(r12):    114(ptr) Variable Function
-       1224(r13):     61(ptr) Variable Function
-       1228(r14):    112(ptr) Variable Function
-       1232(r15):    118(ptr) Variable Function
-       1236(r16):    116(ptr) Variable Function
-            1173:    6(float) Load 121(inF1)
-            1174:    6(float) Load 120(inF0)
-            1175:    6(float) FMul 1173 1174
-                              Store 1172(r00) 1175
-            1177:    6(float) Load 120(inF0)
+       1137(r00):      7(ptr) Variable Function
+       1141(r01):     25(ptr) Variable Function
+       1145(r02):     37(ptr) Variable Function
+       1149(r03):     25(ptr) Variable Function
+       1153(r04):     37(ptr) Variable Function
+       1157(r05):      7(ptr) Variable Function
+       1161(r06):      7(ptr) Variable Function
+       1165(r07):     37(ptr) Variable Function
+       1169(r08):     25(ptr) Variable Function
+       1173(r09):     25(ptr) Variable Function
+       1177(r10):     37(ptr) Variable Function
+       1181(r11):    112(ptr) Variable Function
+       1185(r12):    114(ptr) Variable Function
+       1189(r13):     61(ptr) Variable Function
+       1193(r14):    112(ptr) Variable Function
+       1197(r15):    118(ptr) Variable Function
+       1201(r16):    116(ptr) Variable Function
+            1138:    6(float) Load 121(inF1)
+            1139:    6(float) Load 120(inF0)
+            1140:    6(float) FMul 1138 1139
+                              Store 1137(r00) 1140
+            1142:    6(float) Load 120(inF0)
+            1143:   24(fvec2) Load 122(inFV2)
+            1144:   24(fvec2) VectorTimesScalar 1143 1142
+                              Store 1141(r01) 1144
+            1146:    6(float) Load 120(inF0)
+            1147:   36(fvec3) Load 123(inFV3)
+            1148:   36(fvec3) VectorTimesScalar 1147 1146
+                              Store 1145(r02) 1148
+            1150:   24(fvec2) Load 122(inFV2)
+            1151:    6(float) Load 120(inF0)
+            1152:   24(fvec2) VectorTimesScalar 1150 1151
+                              Store 1149(r03) 1152
+            1154:   36(fvec3) Load 123(inFV3)
+            1155:    6(float) Load 120(inF0)
+            1156:   36(fvec3) VectorTimesScalar 1154 1155
+                              Store 1153(r04) 1156
+            1158:   24(fvec2) Load 122(inFV2)
+            1159:   24(fvec2) Load 122(inFV2)
+            1160:    6(float) Dot 1158 1159
+                              Store 1157(r05) 1160
+            1162:   36(fvec3) Load 123(inFV3)
+            1163:   36(fvec3) Load 123(inFV3)
+            1164:    6(float) Dot 1162 1163
+                              Store 1161(r06) 1164
+            1166:         111 Load 124(inFM2x3)
+            1167:   24(fvec2) Load 122(inFV2)
+            1168:   36(fvec3) MatrixTimesVector 1166 1167
+                              Store 1165(r07) 1168
+            1170:         113 Load 125(inFM3x2)
+            1171:   36(fvec3) Load 123(inFV3)
+            1172:   24(fvec2) MatrixTimesVector 1170 1171
+                              Store 1169(r08) 1172
+            1174:   36(fvec3) Load 123(inFV3)
+            1175:         111 Load 124(inFM2x3)
+            1176:   24(fvec2) VectorTimesMatrix 1174 1175
+                              Store 1173(r09) 1176
             1178:   24(fvec2) Load 122(inFV2)
-            1179:   24(fvec2) VectorTimesScalar 1178 1177
-                              Store 1176(r01) 1179
-            1181:    6(float) Load 120(inF0)
-            1182:   36(fvec3) Load 123(inFV3)
-            1183:   36(fvec3) VectorTimesScalar 1182 1181
-                              Store 1180(r02) 1183
-            1185:   24(fvec2) Load 122(inFV2)
+            1179:         113 Load 125(inFM3x2)
+            1180:   36(fvec3) VectorTimesMatrix 1178 1179
+                              Store 1177(r10) 1180
+            1182:    6(float) Load 120(inF0)
+            1183:         111 Load 124(inFM2x3)
+            1184:         111 MatrixTimesScalar 1183 1182
+                              Store 1181(r11) 1184
             1186:    6(float) Load 120(inF0)
-            1187:   24(fvec2) VectorTimesScalar 1185 1186
-                              Store 1184(r03) 1187
-            1189:   36(fvec3) Load 123(inFV3)
-            1190:    6(float) Load 120(inF0)
-            1191:   36(fvec3) VectorTimesScalar 1189 1190
-                              Store 1188(r04) 1191
-            1193:   24(fvec2) Load 122(inFV2)
-            1194:   24(fvec2) Load 122(inFV2)
-            1195:    6(float) Dot 1193 1194
-                              Store 1192(r05) 1195
-            1197:   36(fvec3) Load 123(inFV3)
-            1198:   36(fvec3) Load 123(inFV3)
-            1199:    6(float) Dot 1197 1198
-                              Store 1196(r06) 1199
-            1201:         111 Load 124(inFM2x3)
-            1202:   24(fvec2) Load 122(inFV2)
-            1203:   36(fvec3) MatrixTimesVector 1201 1202
-                              Store 1200(r07) 1203
-            1205:         113 Load 125(inFM3x2)
-            1206:   36(fvec3) Load 123(inFV3)
-            1207:   24(fvec2) MatrixTimesVector 1205 1206
-                              Store 1204(r08) 1207
-            1209:   36(fvec3) Load 123(inFV3)
-            1210:         111 Load 124(inFM2x3)
-            1211:   24(fvec2) VectorTimesMatrix 1209 1210
-                              Store 1208(r09) 1211
-            1213:   24(fvec2) Load 122(inFV2)
-            1214:         113 Load 125(inFM3x2)
-            1215:   36(fvec3) VectorTimesMatrix 1213 1214
-                              Store 1212(r10) 1215
-            1217:    6(float) Load 120(inF0)
-            1218:         111 Load 124(inFM2x3)
-            1219:         111 MatrixTimesScalar 1218 1217
-                              Store 1216(r11) 1219
-            1221:    6(float) Load 120(inF0)
-            1222:         113 Load 125(inFM3x2)
-            1223:         113 MatrixTimesScalar 1222 1221
-                              Store 1220(r12) 1223
-            1225:         113 Load 125(inFM3x2)
-            1226:         111 Load 124(inFM2x3)
-            1227:          60 MatrixTimesMatrix 1225 1226
-                              Store 1224(r13) 1227
-            1229:          68 Load 126(inFM3x3)
-            1230:         111 Load 124(inFM2x3)
-            1231:         111 MatrixTimesMatrix 1229 1230
-                              Store 1228(r14) 1231
-            1233:         115 Load 127(inFM3x4)
-            1234:         111 Load 124(inFM2x3)
-            1235:         117 MatrixTimesMatrix 1233 1234
-                              Store 1232(r15) 1235
-            1237:         117 Load 128(inFM2x4)
-            1238:         113 Load 125(inFM3x2)
-            1239:         115 MatrixTimesMatrix 1237 1238
-                              Store 1236(r16) 1239
+            1187:         113 Load 125(inFM3x2)
+            1188:         113 MatrixTimesScalar 1187 1186
+                              Store 1185(r12) 1188
+            1190:         113 Load 125(inFM3x2)
+            1191:         111 Load 124(inFM2x3)
+            1192:          60 MatrixTimesMatrix 1190 1191
+                              Store 1189(r13) 1192
+            1194:          68 Load 126(inFM3x3)
+            1195:         111 Load 124(inFM2x3)
+            1196:         111 MatrixTimesMatrix 1194 1195
+                              Store 1193(r14) 1196
+            1198:         115 Load 127(inFM3x4)
+            1199:         111 Load 124(inFM2x3)
+            1200:         117 MatrixTimesMatrix 1198 1199
+                              Store 1197(r15) 1200
+            1202:         117 Load 128(inFM2x4)
+            1203:         113 Load 125(inFM3x2)
+            1204:         115 MatrixTimesMatrix 1202 1203
+                              Store 1201(r16) 1204
                               Return
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.layout.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.layout.frag.out
index 6a5eceb..b8a96cf 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.layout.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.layout.frag.out
@@ -1,5 +1,5 @@
 hlsl.layout.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:16  Function Definition: PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -35,7 +35,7 @@ Linked fragment stage:
 
 WARNING: Linking fragment stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:16  Function Definition: PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -75,6 +75,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "PixelShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.2dms.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.2dms.dx10.frag.out
index f436f88..251d56e 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.2dms.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.2dms.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.2dms.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -179,7 +179,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -366,6 +366,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 120 124
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.array.dx10.frag.out
index 9430e75..78e2b2c 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:48  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -194,7 +194,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:48  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -396,6 +396,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 104 108
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.basic.dx10.frag.out
index 6a976f0..0da4048 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.basic.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.basic.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:48  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -245,7 +245,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:48  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -498,6 +498,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 133 137
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.basic.dx10.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.basic.dx10.vert.out
index a41675c..c3b7dc2 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.basic.dx10.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.basic.dx10.vert.out
@@ -1,5 +1,5 @@
 hlsl.load.basic.dx10.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:47  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:47    Function Parameters: 
@@ -227,7 +227,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:47  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:47    Function Parameters: 
@@ -461,6 +461,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 129 173
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_OUTPUT"
                               MemberName 8(VS_OUTPUT) 0  "Pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.buffer.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.buffer.dx10.frag.out
index 134a7fd..e68e5f9 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.buffer.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.buffer.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.buffer.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -8,8 +8,8 @@ gl_FragCoord origin is upper left
 0:28      Sequence
 0:28        move second child to first child ( temp 4-component vector of float)
 0:28          'r00' ( temp 4-component vector of float)
-0:28          imageLoad ( temp 4-component vector of float)
-0:28            'g_tTexbf4' (layout( rgba32f) readonly uniform imageBuffer)
+0:28          textureFetch ( temp 4-component vector of float)
+0:28            'g_tTexbf4' (layout( rgba32f) uniform textureBuffer)
 0:28            c1: direct index for structure ( uniform int)
 0:28              'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:28              Constant:
@@ -17,8 +17,8 @@ gl_FragCoord origin is upper left
 0:29      Sequence
 0:29        move second child to first child ( temp 4-component vector of int)
 0:29          'r01' ( temp 4-component vector of int)
-0:29          imageLoad ( temp 4-component vector of int)
-0:29            'g_tTexbi4' (layout( rgba32i) readonly uniform iimageBuffer)
+0:29          textureFetch ( temp 4-component vector of int)
+0:29            'g_tTexbi4' (layout( rgba32i) uniform itextureBuffer)
 0:29            c1: direct index for structure ( uniform int)
 0:29              'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:29              Constant:
@@ -26,8 +26,8 @@ gl_FragCoord origin is upper left
 0:30      Sequence
 0:30        move second child to first child ( temp 4-component vector of uint)
 0:30          'r02' ( temp 4-component vector of uint)
-0:30          imageLoad ( temp 4-component vector of uint)
-0:30            'g_tTexbu4' (layout( rgba32ui) readonly uniform uimageBuffer)
+0:30          textureFetch ( temp 4-component vector of uint)
+0:30            'g_tTexbu4' (layout( rgba32ui) uniform utextureBuffer)
 0:30            c1: direct index for structure ( uniform int)
 0:30              'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:30              Constant:
@@ -71,10 +71,10 @@ gl_FragCoord origin is upper left
 0:24            Constant:
 0:24              1 (const int)
 0:?   Linker Objects
-0:?     'g_tTexbf4_test' (layout( binding=0 rgba32f) readonly uniform imageBuffer)
-0:?     'g_tTexbf4' (layout( rgba32f) readonly uniform imageBuffer)
-0:?     'g_tTexbi4' (layout( rgba32i) readonly uniform iimageBuffer)
-0:?     'g_tTexbu4' (layout( rgba32ui) readonly uniform uimageBuffer)
+0:?     'g_tTexbf4_test' (layout( binding=0 rgba32f) uniform textureBuffer)
+0:?     'g_tTexbf4' (layout( rgba32f) uniform textureBuffer)
+0:?     'g_tTexbi4' (layout( rgba32i) uniform itextureBuffer)
+0:?     'g_tTexbu4' (layout( rgba32ui) uniform utextureBuffer)
 0:?     'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:?     'Color' (layout( location=0) out 4-component vector of float)
 0:?     'Depth' ( out float FragDepth)
@@ -83,7 +83,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -92,8 +92,8 @@ gl_FragCoord origin is upper left
 0:28      Sequence
 0:28        move second child to first child ( temp 4-component vector of float)
 0:28          'r00' ( temp 4-component vector of float)
-0:28          imageLoad ( temp 4-component vector of float)
-0:28            'g_tTexbf4' (layout( rgba32f) readonly uniform imageBuffer)
+0:28          textureFetch ( temp 4-component vector of float)
+0:28            'g_tTexbf4' (layout( rgba32f) uniform textureBuffer)
 0:28            c1: direct index for structure ( uniform int)
 0:28              'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:28              Constant:
@@ -101,8 +101,8 @@ gl_FragCoord origin is upper left
 0:29      Sequence
 0:29        move second child to first child ( temp 4-component vector of int)
 0:29          'r01' ( temp 4-component vector of int)
-0:29          imageLoad ( temp 4-component vector of int)
-0:29            'g_tTexbi4' (layout( rgba32i) readonly uniform iimageBuffer)
+0:29          textureFetch ( temp 4-component vector of int)
+0:29            'g_tTexbi4' (layout( rgba32i) uniform itextureBuffer)
 0:29            c1: direct index for structure ( uniform int)
 0:29              'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:29              Constant:
@@ -110,8 +110,8 @@ gl_FragCoord origin is upper left
 0:30      Sequence
 0:30        move second child to first child ( temp 4-component vector of uint)
 0:30          'r02' ( temp 4-component vector of uint)
-0:30          imageLoad ( temp 4-component vector of uint)
-0:30            'g_tTexbu4' (layout( rgba32ui) readonly uniform uimageBuffer)
+0:30          textureFetch ( temp 4-component vector of uint)
+0:30            'g_tTexbu4' (layout( rgba32ui) uniform utextureBuffer)
 0:30            c1: direct index for structure ( uniform int)
 0:30              'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:30              Constant:
@@ -155,10 +155,10 @@ gl_FragCoord origin is upper left
 0:24            Constant:
 0:24              1 (const int)
 0:?   Linker Objects
-0:?     'g_tTexbf4_test' (layout( binding=0 rgba32f) readonly uniform imageBuffer)
-0:?     'g_tTexbf4' (layout( rgba32f) readonly uniform imageBuffer)
-0:?     'g_tTexbi4' (layout( rgba32i) readonly uniform iimageBuffer)
-0:?     'g_tTexbu4' (layout( rgba32ui) readonly uniform uimageBuffer)
+0:?     'g_tTexbf4_test' (layout( binding=0 rgba32f) uniform textureBuffer)
+0:?     'g_tTexbf4' (layout( rgba32f) uniform textureBuffer)
+0:?     'g_tTexbi4' (layout( rgba32i) uniform itextureBuffer)
+0:?     'g_tTexbu4' (layout( rgba32ui) uniform utextureBuffer)
 0:?     'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:?     'Color' (layout( location=0) out 4-component vector of float)
 0:?     'Depth' ( out float FragDepth)
@@ -173,6 +173,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 64 68
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
@@ -200,7 +201,6 @@ gl_FragCoord origin is upper left
                               Name 68  "Depth"
                               Name 71  "g_tTexbf4_test"
                               Decorate 16(g_tTexbf4) DescriptorSet 0
-                              Decorate 16(g_tTexbf4) NonWritable
                               MemberDecorate 22($Global) 0 Offset 0
                               MemberDecorate 22($Global) 1 Offset 8
                               MemberDecorate 22($Global) 2 Offset 16
@@ -212,14 +212,11 @@ gl_FragCoord origin is upper left
                               Decorate 22($Global) Block
                               Decorate 24 DescriptorSet 0
                               Decorate 34(g_tTexbi4) DescriptorSet 0
-                              Decorate 34(g_tTexbi4) NonWritable
                               Decorate 45(g_tTexbu4) DescriptorSet 0
-                              Decorate 45(g_tTexbu4) NonWritable
                               Decorate 64(Color) Location 0
                               Decorate 68(Depth) BuiltIn FragDepth
                               Decorate 71(g_tTexbf4_test) DescriptorSet 0
                               Decorate 71(g_tTexbf4_test) Binding 0
-                              Decorate 71(g_tTexbf4_test) NonWritable
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -227,7 +224,7 @@ gl_FragCoord origin is upper left
     8(PS_OUTPUT):             TypeStruct 7(fvec4) 6(float)
                9:             TypeFunction 8(PS_OUTPUT)
               12:             TypePointer Function 7(fvec4)
-              14:             TypeImage 6(float) Buffer nonsampled format:Rgba32f
+              14:             TypeImage 6(float) Buffer sampled format:Rgba32f
               15:             TypePointer UniformConstant 14
    16(g_tTexbf4):     15(ptr) Variable UniformConstant
               18:             TypeInt 32 1
@@ -240,13 +237,13 @@ gl_FragCoord origin is upper left
               25:     18(int) Constant 0
               26:             TypePointer Uniform 18(int)
               30:             TypePointer Function 21(ivec4)
-              32:             TypeImage 18(int) Buffer nonsampled format:Rgba32i
+              32:             TypeImage 18(int) Buffer sampled format:Rgba32i
               33:             TypePointer UniformConstant 32
    34(g_tTexbi4):     33(ptr) Variable UniformConstant
               39:             TypeInt 32 0
               40:             TypeVector 39(int) 4
               41:             TypePointer Function 40(ivec4)
-              43:             TypeImage 39(int) Buffer nonsampled format:Rgba32ui
+              43:             TypeImage 39(int) Buffer sampled format:Rgba32ui
               44:             TypePointer UniformConstant 43
    45(g_tTexbu4):     44(ptr) Variable UniformConstant
               50:             TypePointer Function 8(PS_OUTPUT)
@@ -281,17 +278,17 @@ gl_FragCoord origin is upper left
               17:          14 Load 16(g_tTexbf4)
               27:     26(ptr) AccessChain 24 25
               28:     18(int) Load 27
-              29:    7(fvec4) ImageRead 17 28
+              29:    7(fvec4) ImageFetch 17 28
                               Store 13(r00) 29
               35:          32 Load 34(g_tTexbi4)
               36:     26(ptr) AccessChain 24 25
               37:     18(int) Load 36
-              38:   21(ivec4) ImageRead 35 37
+              38:   21(ivec4) ImageFetch 35 37
                               Store 31(r01) 38
               46:          43 Load 45(g_tTexbu4)
               47:     26(ptr) AccessChain 24 25
               48:     18(int) Load 47
-              49:   40(ivec4) ImageRead 46 48
+              49:   40(ivec4) ImageFetch 46 48
                               Store 42(r02) 49
               54:     12(ptr) AccessChain 51(psout) 25
                               Store 54 53
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.buffer.float.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.buffer.float.dx10.frag.out
index 6ba3d99..ff686a3 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.buffer.float.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.buffer.float.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.buffer.float.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -9,8 +9,8 @@ gl_FragCoord origin is upper left
 0:28        move second child to first child ( temp float)
 0:28          'r00' ( temp float)
 0:28          Construct float ( temp float)
-0:?             imageLoad ( temp 4-component vector of float)
-0:28              'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?             textureFetch ( temp 4-component vector of float)
+0:28              'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:28              c1: direct index for structure ( uniform int)
 0:28                'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:28                Constant:
@@ -19,8 +19,8 @@ gl_FragCoord origin is upper left
 0:29        move second child to first child ( temp int)
 0:29          'r01' ( temp int)
 0:29          Construct int ( temp int)
-0:?             imageLoad ( temp 4-component vector of int)
-0:29              'g_tTexbis' (layout( r32i) readonly uniform iimageBuffer)
+0:?             textureFetch ( temp 4-component vector of int)
+0:29              'g_tTexbis' (layout( r32i) uniform itextureBuffer)
 0:29              c1: direct index for structure ( uniform int)
 0:29                'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:29                Constant:
@@ -29,8 +29,8 @@ gl_FragCoord origin is upper left
 0:30        move second child to first child ( temp uint)
 0:30          'r02' ( temp uint)
 0:30          Construct uint ( temp uint)
-0:?             imageLoad ( temp 4-component vector of uint)
-0:30              'g_tTexbus' (layout( r32ui) readonly uniform uimageBuffer)
+0:?             textureFetch ( temp 4-component vector of uint)
+0:30              'g_tTexbus' (layout( r32ui) uniform utextureBuffer)
 0:30              c1: direct index for structure ( uniform int)
 0:30                'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:30                Constant:
@@ -74,10 +74,10 @@ gl_FragCoord origin is upper left
 0:24            Constant:
 0:24              1 (const int)
 0:?   Linker Objects
-0:?     'g_tTexbfs_test' (layout( binding=0 r32f) readonly uniform imageBuffer)
-0:?     'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
-0:?     'g_tTexbis' (layout( r32i) readonly uniform iimageBuffer)
-0:?     'g_tTexbus' (layout( r32ui) readonly uniform uimageBuffer)
+0:?     'g_tTexbfs_test' (layout( binding=0 r32f) uniform textureBuffer)
+0:?     'g_tTexbfs' (layout( r32f) uniform textureBuffer)
+0:?     'g_tTexbis' (layout( r32i) uniform itextureBuffer)
+0:?     'g_tTexbus' (layout( r32ui) uniform utextureBuffer)
 0:?     'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:?     'Color' (layout( location=0) out 4-component vector of float)
 0:?     'Depth' ( out float FragDepth)
@@ -86,7 +86,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -96,8 +96,8 @@ gl_FragCoord origin is upper left
 0:28        move second child to first child ( temp float)
 0:28          'r00' ( temp float)
 0:28          Construct float ( temp float)
-0:?             imageLoad ( temp 4-component vector of float)
-0:28              'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
+0:?             textureFetch ( temp 4-component vector of float)
+0:28              'g_tTexbfs' (layout( r32f) uniform textureBuffer)
 0:28              c1: direct index for structure ( uniform int)
 0:28                'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:28                Constant:
@@ -106,8 +106,8 @@ gl_FragCoord origin is upper left
 0:29        move second child to first child ( temp int)
 0:29          'r01' ( temp int)
 0:29          Construct int ( temp int)
-0:?             imageLoad ( temp 4-component vector of int)
-0:29              'g_tTexbis' (layout( r32i) readonly uniform iimageBuffer)
+0:?             textureFetch ( temp 4-component vector of int)
+0:29              'g_tTexbis' (layout( r32i) uniform itextureBuffer)
 0:29              c1: direct index for structure ( uniform int)
 0:29                'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:29                Constant:
@@ -116,8 +116,8 @@ gl_FragCoord origin is upper left
 0:30        move second child to first child ( temp uint)
 0:30          'r02' ( temp uint)
 0:30          Construct uint ( temp uint)
-0:?             imageLoad ( temp 4-component vector of uint)
-0:30              'g_tTexbus' (layout( r32ui) readonly uniform uimageBuffer)
+0:?             textureFetch ( temp 4-component vector of uint)
+0:30              'g_tTexbus' (layout( r32ui) uniform utextureBuffer)
 0:30              c1: direct index for structure ( uniform int)
 0:30                'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:30                Constant:
@@ -161,10 +161,10 @@ gl_FragCoord origin is upper left
 0:24            Constant:
 0:24              1 (const int)
 0:?   Linker Objects
-0:?     'g_tTexbfs_test' (layout( binding=0 r32f) readonly uniform imageBuffer)
-0:?     'g_tTexbfs' (layout( r32f) readonly uniform imageBuffer)
-0:?     'g_tTexbis' (layout( r32i) readonly uniform iimageBuffer)
-0:?     'g_tTexbus' (layout( r32ui) readonly uniform uimageBuffer)
+0:?     'g_tTexbfs_test' (layout( binding=0 r32f) uniform textureBuffer)
+0:?     'g_tTexbfs' (layout( r32f) uniform textureBuffer)
+0:?     'g_tTexbis' (layout( r32i) uniform itextureBuffer)
+0:?     'g_tTexbus' (layout( r32ui) uniform utextureBuffer)
 0:?     'anon@0' (layout( row_major std140) uniform block{ uniform int c1,  uniform 2-component vector of int c2,  uniform 3-component vector of int c3,  uniform 4-component vector of int c4,  uniform int o1,  uniform 2-component vector of int o2,  uniform 3-component vector of int o3,  uniform 4-component vector of int o4})
 0:?     'Color' (layout( location=0) out 4-component vector of float)
 0:?     'Depth' ( out float FragDepth)
@@ -179,6 +179,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 67 71
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
@@ -206,7 +207,6 @@ gl_FragCoord origin is upper left
                               Name 71  "Depth"
                               Name 74  "g_tTexbfs_test"
                               Decorate 16(g_tTexbfs) DescriptorSet 0
-                              Decorate 16(g_tTexbfs) NonWritable
                               MemberDecorate 22($Global) 0 Offset 0
                               MemberDecorate 22($Global) 1 Offset 8
                               MemberDecorate 22($Global) 2 Offset 16
@@ -218,14 +218,11 @@ gl_FragCoord origin is upper left
                               Decorate 22($Global) Block
                               Decorate 24 DescriptorSet 0
                               Decorate 35(g_tTexbis) DescriptorSet 0
-                              Decorate 35(g_tTexbis) NonWritable
                               Decorate 46(g_tTexbus) DescriptorSet 0
-                              Decorate 46(g_tTexbus) NonWritable
                               Decorate 67(Color) Location 0
                               Decorate 71(Depth) BuiltIn FragDepth
                               Decorate 74(g_tTexbfs_test) DescriptorSet 0
                               Decorate 74(g_tTexbfs_test) Binding 0
-                              Decorate 74(g_tTexbfs_test) NonWritable
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -233,7 +230,7 @@ gl_FragCoord origin is upper left
     8(PS_OUTPUT):             TypeStruct 7(fvec4) 6(float)
                9:             TypeFunction 8(PS_OUTPUT)
               12:             TypePointer Function 6(float)
-              14:             TypeImage 6(float) Buffer nonsampled format:R32f
+              14:             TypeImage 6(float) Buffer sampled format:R32f
               15:             TypePointer UniformConstant 14
    16(g_tTexbfs):     15(ptr) Variable UniformConstant
               18:             TypeInt 32 1
@@ -246,12 +243,12 @@ gl_FragCoord origin is upper left
               25:     18(int) Constant 0
               26:             TypePointer Uniform 18(int)
               31:             TypePointer Function 18(int)
-              33:             TypeImage 18(int) Buffer nonsampled format:R32i
+              33:             TypeImage 18(int) Buffer sampled format:R32i
               34:             TypePointer UniformConstant 33
    35(g_tTexbis):     34(ptr) Variable UniformConstant
               41:             TypeInt 32 0
               42:             TypePointer Function 41(int)
-              44:             TypeImage 41(int) Buffer nonsampled format:R32ui
+              44:             TypeImage 41(int) Buffer sampled format:R32ui
               45:             TypePointer UniformConstant 44
    46(g_tTexbus):     45(ptr) Variable UniformConstant
               50:             TypeVector 41(int) 4
@@ -287,19 +284,19 @@ gl_FragCoord origin is upper left
               17:          14 Load 16(g_tTexbfs)
               27:     26(ptr) AccessChain 24 25
               28:     18(int) Load 27
-              29:    7(fvec4) ImageRead 17 28
+              29:    7(fvec4) ImageFetch 17 28
               30:    6(float) CompositeExtract 29 0
                               Store 13(r00) 30
               36:          33 Load 35(g_tTexbis)
               37:     26(ptr) AccessChain 24 25
               38:     18(int) Load 37
-              39:   21(ivec4) ImageRead 36 38
+              39:   21(ivec4) ImageFetch 36 38
               40:     18(int) CompositeExtract 39 0
                               Store 32(r01) 40
               47:          44 Load 46(g_tTexbus)
               48:     26(ptr) AccessChain 24 25
               49:     18(int) Load 48
-              51:   50(ivec4) ImageRead 47 49
+              51:   50(ivec4) ImageFetch 47 49
               52:     41(int) CompositeExtract 51 0
                               Store 43(r02) 52
               58:     57(ptr) AccessChain 54(psout) 25
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.offset.dx10.frag.out
index 7d6c63a..473db61 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:48  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -281,7 +281,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:48  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -571,6 +571,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 155 159
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.offsetarray.dx10.frag.out
index f20b607..4b6a09f 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:48  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -218,7 +218,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:48  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -445,6 +445,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 119 123
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwbuffer.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwbuffer.dx10.frag.out
index ea8bdd2..3e748cc 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwbuffer.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwbuffer.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.rwbuffer.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:22  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -56,7 +56,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:22  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -119,6 +119,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 54
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwtexture.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwtexture.array.dx10.frag.out
index fba29c1..884ed8f 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwtexture.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwtexture.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.rwtexture.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:40  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -104,7 +104,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:40  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -215,6 +215,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 82 86
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwtexture.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwtexture.dx10.frag.out
index 2b05b31..3061867 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwtexture.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.load.rwtexture.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.load.rwtexture.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:40  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -122,7 +122,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:40  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -251,6 +251,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 104 108
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.binary.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.binary.frag.out
index 7aec275..587e3d5 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.binary.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.binary.frag.out
@@ -1,5 +1,5 @@
 hlsl.logical.binary.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -65,7 +65,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -136,6 +136,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 59
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.binary.vec.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.binary.vec.frag.out
index 38708ef..a03890c 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.binary.vec.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.binary.vec.frag.out
@@ -1,5 +1,5 @@
 hlsl.logical.binary.vec.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -128,7 +128,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -262,6 +262,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 117
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.unary.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.unary.frag.out
index 4148794..7121d92 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.unary.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.logical.unary.frag.out
@@ -1,5 +1,5 @@
 hlsl.logical.unary.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -31,17 +31,19 @@ gl_FragCoord origin is upper left
 0:17              3 (const uint)
 0:19      Test condition and select ( temp void)
 0:19        Condition
-0:19        ival: direct index for structure ( uniform int)
-0:19          'anon@0' (layout( row_major std140) uniform block{ uniform int ival,  uniform 4-component vector of int ival4,  uniform float fval,  uniform 4-component vector of float fval4})
-0:19          Constant:
-0:19            0 (const uint)
+0:19        Convert int to bool ( temp bool)
+0:19          ival: direct index for structure ( uniform int)
+0:19            'anon@0' (layout( row_major std140) uniform block{ uniform int ival,  uniform 4-component vector of int ival4,  uniform float fval,  uniform 4-component vector of float fval4})
+0:19            Constant:
+0:19              0 (const uint)
 0:19        true case is null
 0:20      Test condition and select ( temp void)
 0:20        Condition
-0:20        fval: direct index for structure ( uniform float)
-0:20          'anon@0' (layout( row_major std140) uniform block{ uniform int ival,  uniform 4-component vector of int ival4,  uniform float fval,  uniform 4-component vector of float fval4})
-0:20          Constant:
-0:20            2 (const uint)
+0:20        Convert float to bool ( temp bool)
+0:20          fval: direct index for structure ( uniform float)
+0:20            'anon@0' (layout( row_major std140) uniform block{ uniform int ival,  uniform 4-component vector of int ival4,  uniform float fval,  uniform 4-component vector of float fval4})
+0:20            Constant:
+0:20              2 (const uint)
 0:20        true case is null
 0:21      Test condition and select ( temp void)
 0:21        Condition
@@ -91,7 +93,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -123,17 +125,19 @@ gl_FragCoord origin is upper left
 0:17              3 (const uint)
 0:19      Test condition and select ( temp void)
 0:19        Condition
-0:19        ival: direct index for structure ( uniform int)
-0:19          'anon@0' (layout( row_major std140) uniform block{ uniform int ival,  uniform 4-component vector of int ival4,  uniform float fval,  uniform 4-component vector of float fval4})
-0:19          Constant:
-0:19            0 (const uint)
+0:19        Convert int to bool ( temp bool)
+0:19          ival: direct index for structure ( uniform int)
+0:19            'anon@0' (layout( row_major std140) uniform block{ uniform int ival,  uniform 4-component vector of int ival4,  uniform float fval,  uniform 4-component vector of float fval4})
+0:19            Constant:
+0:19              0 (const uint)
 0:19        true case is null
 0:20      Test condition and select ( temp void)
 0:20        Condition
-0:20        fval: direct index for structure ( uniform float)
-0:20          'anon@0' (layout( row_major std140) uniform block{ uniform int ival,  uniform 4-component vector of int ival4,  uniform float fval,  uniform 4-component vector of float fval4})
-0:20          Constant:
-0:20            2 (const uint)
+0:20        Convert float to bool ( temp bool)
+0:20          fval: direct index for structure ( uniform float)
+0:20            'anon@0' (layout( row_major std140) uniform block{ uniform int ival,  uniform 4-component vector of int ival4,  uniform float fval,  uniform 4-component vector of float fval4})
+0:20            Constant:
+0:20              2 (const uint)
 0:20        true case is null
 0:21      Test condition and select ( temp void)
 0:21        Condition
@@ -181,13 +185,14 @@ gl_FragCoord origin is upper left
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 82
+// Id's are bound by 84
 
                               Capability Shader
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Fragment 4  "main" 79
+                              EntryPoint Fragment 4  "main" 81
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
@@ -198,15 +203,15 @@ gl_FragCoord origin is upper left
                               MemberName 14($Global) 2  "fval"
                               MemberName 14($Global) 3  "fval4"
                               Name 16  ""
-                              Name 70  "psout"
-                              Name 79  "Color"
+                              Name 72  "psout"
+                              Name 81  "Color"
                               MemberDecorate 14($Global) 0 Offset 0
                               MemberDecorate 14($Global) 1 Offset 16
                               MemberDecorate 14($Global) 2 Offset 32
                               MemberDecorate 14($Global) 3 Offset 48
                               Decorate 14($Global) Block
                               Decorate 16 DescriptorSet 0
-                              Decorate 79(Color) Location 0
+                              Decorate 81(Color) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -234,22 +239,22 @@ gl_FragCoord origin is upper left
               42:     12(int) Constant 3
               43:             TypePointer Uniform 7(fvec4)
               46:    7(fvec4) ConstantComposite 39 39 39 39
-              69:             TypePointer Function 8(PS_OUTPUT)
-              71:    6(float) Constant 1065353216
-              72:    7(fvec4) ConstantComposite 71 71 71 71
-              73:             TypePointer Function 7(fvec4)
-              78:             TypePointer Output 7(fvec4)
-       79(Color):     78(ptr) Variable Output
+              71:             TypePointer Function 8(PS_OUTPUT)
+              73:    6(float) Constant 1065353216
+              74:    7(fvec4) ConstantComposite 73 73 73 73
+              75:             TypePointer Function 7(fvec4)
+              80:             TypePointer Output 7(fvec4)
+       81(Color):     80(ptr) Variable Output
          4(main):           2 Function None 3
                5:             Label
-              80:8(PS_OUTPUT) FunctionCall 10(@main()
-              81:    7(fvec4) CompositeExtract 80 0
-                              Store 79(Color) 81
+              82:8(PS_OUTPUT) FunctionCall 10(@main()
+              83:    7(fvec4) CompositeExtract 82 0
+                              Store 81(Color) 83
                               Return
                               FunctionEnd
       10(@main():8(PS_OUTPUT) Function None 9
               11:             Label
-       70(psout):     69(ptr) Variable Function
+       72(psout):     71(ptr) Variable Function
               19:     18(ptr) AccessChain 16 17
               20:     12(int) Load 19
               24:    21(bool) INotEqual 20 23
@@ -268,38 +273,40 @@ gl_FragCoord origin is upper left
               48:   30(bvec4) LogicalNot 47
               49:     18(ptr) AccessChain 16 17
               50:     12(int) Load 49
-                              SelectionMerge 52 None
-                              BranchConditional 50 51 52
-              51:               Label
-                                Branch 52
-              52:             Label
-              53:     36(ptr) AccessChain 16 35
-              54:    6(float) Load 53
-                              SelectionMerge 56 None
-                              BranchConditional 54 55 56
-              55:               Label
-                                Branch 56
-              56:             Label
-              57:     18(ptr) AccessChain 16 17
-              58:     12(int) Load 57
-              59:    21(bool) INotEqual 58 23
-              60:    21(bool) LogicalNot 59
-                              SelectionMerge 62 None
-                              BranchConditional 60 61 62
-              61:               Label
-                                Branch 62
-              62:             Label
-              63:     36(ptr) AccessChain 16 35
-              64:    6(float) Load 63
-              65:    21(bool) FOrdNotEqual 64 39
-              66:    21(bool) LogicalNot 65
-                              SelectionMerge 68 None
-                              BranchConditional 66 67 68
-              67:               Label
-                                Branch 68
-              68:             Label
-              74:     73(ptr) AccessChain 70(psout) 17
-                              Store 74 72
-              75:8(PS_OUTPUT) Load 70(psout)
-                              ReturnValue 75
+              51:    21(bool) INotEqual 50 23
+                              SelectionMerge 53 None
+                              BranchConditional 51 52 53
+              52:               Label
+                                Branch 53
+              53:             Label
+              54:     36(ptr) AccessChain 16 35
+              55:    6(float) Load 54
+              56:    21(bool) FOrdNotEqual 55 39
+                              SelectionMerge 58 None
+                              BranchConditional 56 57 58
+              57:               Label
+                                Branch 58
+              58:             Label
+              59:     18(ptr) AccessChain 16 17
+              60:     12(int) Load 59
+              61:    21(bool) INotEqual 60 23
+              62:    21(bool) LogicalNot 61
+                              SelectionMerge 64 None
+                              BranchConditional 62 63 64
+              63:               Label
+                                Branch 64
+              64:             Label
+              65:     36(ptr) AccessChain 16 35
+              66:    6(float) Load 65
+              67:    21(bool) FOrdNotEqual 66 39
+              68:    21(bool) LogicalNot 67
+                              SelectionMerge 70 None
+                              BranchConditional 68 69 70
+              69:               Label
+                                Branch 70
+              70:             Label
+              76:     75(ptr) AccessChain 72(psout) 17
+                              Store 76 74
+              77:8(PS_OUTPUT) Load 72(psout)
+                              ReturnValue 77
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matNx1.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matNx1.frag.out
index cd0dbbf..0360db4 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matNx1.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matNx1.frag.out
@@ -1,5 +1,5 @@
 hlsl.matNx1.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: TestMatNx1( ( temp void)
@@ -77,7 +77,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: TestMatNx1( ( temp void)
@@ -160,6 +160,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 74
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 6  "TestMatNx1("
                               Name 10  "PS_OUTPUT"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.bool.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.bool.frag.out
index b1c5762..3f261a3 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.bool.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.bool.frag.out
@@ -1,5 +1,5 @@
 hlsl.matType.bool.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: TestBoolMatTypes( ( temp void)
@@ -117,7 +117,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: TestBoolMatTypes( ( temp void)
@@ -240,6 +240,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 127
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 6  "TestBoolMatTypes("
                               Name 10  "PS_OUTPUT"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.frag.out
index ee40879..d76d55c 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.frag.out
@@ -1,5 +1,5 @@
 hlsl.matType.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: ShaderFunction(vf1;f1; ( temp 1-component vector of float)
@@ -17,7 +17,7 @@ Linked fragment stage:
 
 WARNING: Linking fragment stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: ShaderFunction(vf1;f1; ( temp 1-component vector of float)
@@ -40,6 +40,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "ShaderFunction(vf1;f1;"
                               Name 9  "inFloat1"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.int.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.int.frag.out
index aef7862..602f068 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.int.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matType.int.frag.out
@@ -1,5 +1,5 @@
 hlsl.matType.int.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: TestIntMatTypes( ( temp void)
@@ -200,7 +200,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: TestIntMatTypes( ( temp void)
@@ -406,6 +406,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 229
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 6  "TestIntMatTypes("
                               Name 8  "TestUintMatTypes("
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matrixSwizzle.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matrixSwizzle.vert.out
index 69c774b..7e792b4 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matrixSwizzle.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matrixSwizzle.vert.out
@@ -1,5 +1,5 @@
 hlsl.matrixSwizzle.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:2  Function Definition: @ShaderFunction(f1; ( temp void)
 0:2    Function Parameters: 
@@ -339,7 +339,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:2  Function Definition: @ShaderFunction(f1; ( temp void)
 0:2    Function Parameters: 
@@ -684,6 +684,7 @@ Missing functionality: matrix swizzle
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "ShaderFunction" 81
+                              Source HLSL 500
                               Name 4  "ShaderFunction"
                               Name 10  "@ShaderFunction(f1;"
                               Name 9  "inf"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matrixindex.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matrixindex.frag.out
index 6f2339c..9741c5d 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matrixindex.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.matrixindex.frag.out
@@ -1,5 +1,5 @@
 hlsl.matrixindex.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -137,7 +137,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -280,6 +280,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 80
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.max.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.max.frag.out
index 6a0762e..2c74d72 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.max.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.max.frag.out
@@ -1,5 +1,5 @@
 hlsl.max.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4;vf4; ( temp 4-component vector of float)
@@ -34,7 +34,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4;vf4; ( temp 4-component vector of float)
@@ -74,6 +74,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 21 24 27
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 12  "@PixelShaderFunction(vf4;vf4;"
                               Name 10  "input1"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.mintypes.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.mintypes.frag.out
index 257d7e4..1cbd477 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.mintypes.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.mintypes.frag.out
@@ -1,5 +1,5 @@
 hlsl.mintypes.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -50,7 +50,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -106,6 +106,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 64
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.multiEntry.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.multiEntry.vert.out
index 4873a45..8a309da 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.multiEntry.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.multiEntry.vert.out
@@ -1,13 +1,13 @@
 hlsl.multiEntry.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:4  Function Definition: FakeEntrypoint(u1; ( temp 4-component vector of float)
 0:4    Function Parameters: 
 0:4      'Index' ( in uint)
 0:?     Sequence
 0:5      Branch: Return with expression
-0:5        imageLoad ( temp 4-component vector of float)
-0:5          'Position' (layout( rgba32f) readonly uniform imageBuffer)
+0:5        textureFetch ( temp 4-component vector of float)
+0:5          'Position' (layout( rgba32f) uniform textureBuffer)
 0:5          Convert uint to int ( temp int)
 0:5            'Index' ( in uint)
 0:9  Function Definition: @RealEntrypoint(u1; ( temp 4-component vector of float)
@@ -28,7 +28,7 @@ Shader version: 450
 0:9        Function Call: @RealEntrypoint(u1; ( temp 4-component vector of float)
 0:?           'Index' ( temp uint)
 0:?   Linker Objects
-0:?     'Position' (layout( rgba32f) readonly uniform imageBuffer)
+0:?     'Position' (layout( rgba32f) uniform textureBuffer)
 0:?     '@entryPointOutput' ( out 4-component vector of float Position)
 0:?     'Index' ( in uint VertexIndex)
 
@@ -36,15 +36,15 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:4  Function Definition: FakeEntrypoint(u1; ( temp 4-component vector of float)
 0:4    Function Parameters: 
 0:4      'Index' ( in uint)
 0:?     Sequence
 0:5      Branch: Return with expression
-0:5        imageLoad ( temp 4-component vector of float)
-0:5          'Position' (layout( rgba32f) readonly uniform imageBuffer)
+0:5        textureFetch ( temp 4-component vector of float)
+0:5          'Position' (layout( rgba32f) uniform textureBuffer)
 0:5          Convert uint to int ( temp int)
 0:5            'Index' ( in uint)
 0:9  Function Definition: @RealEntrypoint(u1; ( temp 4-component vector of float)
@@ -65,7 +65,7 @@ Shader version: 450
 0:9        Function Call: @RealEntrypoint(u1; ( temp 4-component vector of float)
 0:?           'Index' ( temp uint)
 0:?   Linker Objects
-0:?     'Position' (layout( rgba32f) readonly uniform imageBuffer)
+0:?     'Position' (layout( rgba32f) uniform textureBuffer)
 0:?     '@entryPointOutput' ( out 4-component vector of float Position)
 0:?     'Index' ( in uint VertexIndex)
 
@@ -78,6 +78,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "RealEntrypoint" 34 37
+                              Source HLSL 500
                               Name 4  "RealEntrypoint"
                               Name 12  "FakeEntrypoint(u1;"
                               Name 11  "Index"
@@ -90,7 +91,6 @@ Shader version: 450
                               Name 37  "@entryPointOutput"
                               Name 38  "param"
                               Decorate 19(Position) DescriptorSet 0
-                              Decorate 19(Position) NonWritable
                               Decorate 34(Index) BuiltIn VertexIndex
                               Decorate 37(@entryPointOutput) BuiltIn Position
                2:             TypeVoid
@@ -100,7 +100,7 @@ Shader version: 450
                8:             TypeFloat 32
                9:             TypeVector 8(float) 4
               10:             TypeFunction 9(fvec4) 7(ptr)
-              17:             TypeImage 8(float) Buffer nonsampled format:Rgba32f
+              17:             TypeImage 8(float) Buffer sampled format:Rgba32f
               18:             TypePointer UniformConstant 17
     19(Position):     18(ptr) Variable UniformConstant
               22:             TypeInt 32 1
@@ -126,7 +126,7 @@ Shader version: 450
               20:          17 Load 19(Position)
               21:      6(int) Load 11(Index)
               23:     22(int) Bitcast 21
-              24:    9(fvec4) ImageRead 20 23
+              24:    9(fvec4) ImageFetch 20 23
                               ReturnValue 24
                               FunctionEnd
 15(@RealEntrypoint(u1;):    9(fvec4) Function None 10
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.multiReturn.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.multiReturn.frag.out
index 9efc9fc..1569b42 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.multiReturn.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.multiReturn.frag.out
@@ -1,5 +1,5 @@
 hlsl.multiReturn.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: foo( ( temp structure{ temp float f,  temp 3-component vector of float v,  temp 3X3 matrix of float m})
@@ -25,7 +25,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: foo( ( temp structure{ temp float f,  temp 3-component vector of float v,  temp 3X3 matrix of float m})
@@ -56,6 +56,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "S"
                               MemberName 9(S) 0  "f"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.nonstaticMemberFunction.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.nonstaticMemberFunction.frag.out
index d3049ec..27326c6 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.nonstaticMemberFunction.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.nonstaticMemberFunction.frag.out
@@ -1,5 +1,5 @@
 hlsl.nonstaticMemberFunction.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -135,7 +135,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -276,6 +276,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 109
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "type1"
                               MemberName 9(type1) 0  "memVar"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.numericsuffixes.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.numericsuffixes.frag.out
index cb24669..ee25cb3 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.numericsuffixes.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.numericsuffixes.frag.out
@@ -1,5 +1,5 @@
 hlsl.numericsuffixes.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -97,7 +97,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main( ( temp structure{ temp 4-component vector of float color})
@@ -200,6 +200,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 51
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.numthreads.comp.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.numthreads.comp.out
index ed0017b..fc1345f 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.numthreads.comp.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.numthreads.comp.out
@@ -1,5 +1,5 @@
 hlsl.numthreads.comp
-Shader version: 450
+Shader version: 500
 local_size = (4, 4, 2)
 0:? Sequence
 0:4  Function Definition: main(vu3; ( temp void)
@@ -23,7 +23,7 @@ local_size = (4, 4, 2)
 Linked compute stage:
 
 
-Shader version: 450
+Shader version: 500
 local_size = (4, 4, 2)
 0:? Sequence
 0:4  Function Definition: main(vu3; ( temp void)
@@ -52,6 +52,7 @@ local_size = (4, 4, 2)
                               MemoryModel Logical GLSL450
                               EntryPoint GLCompute 4  "main_aux1" 18
                               ExecutionMode 4 LocalSize 4 4 2
+                              Source HLSL 500
                               Name 4  "main_aux1"
                               Name 11  "main(vu3;"
                               Name 10  "tid"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.overload.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.overload.frag.out
index 1075ffe..b390526 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.overload.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.overload.frag.out
@@ -1,5 +1,5 @@
 hlsl.overload.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: foo1(d1;b1; ( temp void)
@@ -368,7 +368,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: foo1(d1;b1; ( temp void)
@@ -743,6 +743,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 513 516
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 13  "foo1(d1;b1;"
                               Name 11  "a"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.params.default.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.params.default.frag.out
index b8ddf0d..3643c23 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.params.default.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.params.default.frag.out
@@ -1,5 +1,5 @@
 hlsl.params.default.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: fn1(vi4;b1;b1; ( temp 4-component vector of int)
@@ -189,7 +189,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:9  Function Definition: fn1(vi4;b1;b1; ( temp 4-component vector of int)
@@ -384,6 +384,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 175
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 15  "fn1(vi4;b1;b1;"
                               Name 12  "p0"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.params.default.negative.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.params.default.negative.frag.out
index 219e920..f841bd8 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.params.default.negative.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.params.default.negative.frag.out
@@ -6,7 +6,7 @@ ERROR: 0:47: 'fn2' : ambiguous best function under implicit type conversion
 ERROR: 4 compilation errors.  No code generated.
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 ERROR: node is still EOpNull!
 0:7  Function Definition: fn1(vi4; ( temp 4-component vector of int)
@@ -196,7 +196,7 @@ ERROR: node is still EOpNull!
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 ERROR: node is still EOpNull!
 0:7  Function Definition: fn1(vi4; ( temp 4-component vector of int)
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.partialInit.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.partialInit.frag.out
index 2dc40be..b326799 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.partialInit.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.partialInit.frag.out
@@ -2,7 +2,7 @@ hlsl.partialInit.frag
 WARNING: 0:35: 'cgf2a' : variable with qualifier 'const' not initialized; zero initializing 
 WARNING: 0:36: 'ci' : variable with qualifier 'const' not initialized; zero initializing 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Sequence
@@ -202,7 +202,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:8  Sequence
@@ -407,6 +407,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 80 87 91 95 99
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "outs"
                               MemberName 11(outs) 0  "a"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.pp.line.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.pp.line.frag.out
index 6e8998d..727fdd0 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.pp.line.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.pp.line.frag.out
@@ -1,5 +1,5 @@
 hlsl.pp.line.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -60,7 +60,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -126,6 +126,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 35 39
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precedence.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precedence.frag.out
index 54dcf31..a46c6fb 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precedence.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precedence.frag.out
@@ -1,5 +1,5 @@
 hlsl.precedence.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: @PixelShaderFunction(vf4;vf4;vf4;vf4; ( temp 4-component vector of float)
@@ -75,7 +75,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: @PixelShaderFunction(vf4;vf4;vf4;vf4; ( temp 4-component vector of float)
@@ -156,6 +156,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 43 46 49 52 55
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 14  "@PixelShaderFunction(vf4;vf4;vf4;vf4;"
                               Name 10  "a1"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precedence2.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precedence2.frag.out
index a0ddd44..31e76c3 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precedence2.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precedence2.frag.out
@@ -1,5 +1,5 @@
 hlsl.precedence2.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: @PixelShaderFunction(i1;i1;i1;i1; ( temp int)
@@ -58,7 +58,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: @PixelShaderFunction(i1;i1;i1;i1; ( temp int)
@@ -122,6 +122,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 34 37 40 43 46
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 13  "@PixelShaderFunction(i1;i1;i1;i1;"
                               Name 9  "a1"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precise.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precise.frag.out
index 8a0ac81..442f6c2 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precise.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.precise.frag.out
@@ -1,5 +1,5 @@
 hlsl.precise.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:6  Function Definition: MyFunction(f1;vf3; ( temp void)
@@ -39,7 +39,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:6  Function Definition: MyFunction(f1;vf3; ( temp void)
@@ -84,6 +84,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 32
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 13  "MyFunction(f1;vf3;"
                               Name 11  "myfloat"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.atomic.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.atomic.frag.out
index b09f68d..b31ac61 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.atomic.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.atomic.frag.out
@@ -1,5 +1,5 @@
 hlsl.promote.atomic.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main( ( temp 4-component vector of float)
@@ -33,7 +33,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main( ( temp 4-component vector of float)
@@ -73,6 +73,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 34
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "@main("
                               Name 13  "Orig"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.binary.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.binary.frag.out
index 34a46f9..09c1f93 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.binary.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.binary.frag.out
@@ -1,5 +1,5 @@
 hlsl.promote.binary.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:14  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -87,7 +87,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:14  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -180,6 +180,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 80
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.vec1.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.vec1.frag.out
index c76f5d2..b0536bd 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.vec1.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promote.vec1.frag.out
@@ -1,5 +1,5 @@
 hlsl.promote.vec1.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: @main( ( temp 4-component vector of float)
@@ -41,7 +41,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: @main( ( temp 4-component vector of float)
@@ -88,6 +88,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 29
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "@main("
                               Name 12  "f1a"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promotions.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promotions.frag.out
index d955195..e1953e0 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promotions.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.promotions.frag.out
@@ -1,5 +1,5 @@
 hlsl.promotions.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:19  Function Definition: Fn_F3(vf3; ( temp void)
@@ -792,7 +792,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:19  Function Definition: Fn_F3(vf3; ( temp void)
@@ -1591,6 +1591,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 593
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "Fn_F3(vf3;"
                               Name 10  "x"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.atomics.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.atomics.frag.out
index a2e5358..08369c0 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.atomics.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.atomics.frag.out
@@ -1,5 +1,5 @@
 hlsl.rw.atomics.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:45  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -1974,7 +1974,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:45  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -3956,6 +3956,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 1117
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.bracket.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.bracket.frag.out
index ece7bfa..55264de 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.bracket.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.bracket.frag.out
@@ -1,5 +1,5 @@
 hlsl.rw.bracket.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:42  Function Definition: Fn1(vi4; ( temp 4-component vector of int)
@@ -873,7 +873,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:42  Function Definition: Fn1(vi4; ( temp 4-component vector of int)
@@ -1753,6 +1753,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 583
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "Fn1(vi4;"
                               Name 10  "x"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.register.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.register.frag.out
index 2e0c562..525c6aa 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.register.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.register.frag.out
@@ -1,5 +1,5 @@
 hlsl.rw.register.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:11  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -50,7 +50,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:11  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -108,6 +108,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 39
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.scalar.bracket.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.scalar.bracket.frag.out
index 991d524..e0e88d1 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.scalar.bracket.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.scalar.bracket.frag.out
@@ -1,5 +1,5 @@
 hlsl.rw.scalar.bracket.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:42  Function Definition: Fn1(i1; ( temp int)
@@ -846,7 +846,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:42  Function Definition: Fn1(i1; ( temp int)
@@ -1699,6 +1699,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 547
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 10  "Fn1(i1;"
                               Name 9  "x"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.swizzle.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.swizzle.frag.out
index ae2f4c9..60a6b99 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.swizzle.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.swizzle.frag.out
@@ -1,5 +1,5 @@
 hlsl.rw.swizzle.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: SomeValue( ( temp 3-component vector of float)
@@ -102,7 +102,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: SomeValue( ( temp 3-component vector of float)
@@ -211,6 +211,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 58
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "SomeValue("
                               Name 13  "@main("
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.vec2.bracket.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.vec2.bracket.frag.out
index c34ce90..afcc52a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.vec2.bracket.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.rw.vec2.bracket.frag.out
@@ -1,5 +1,5 @@
 hlsl.rw.vec2.bracket.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:42  Function Definition: Fn1(vi2; ( temp 2-component vector of int)
@@ -855,7 +855,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:42  Function Definition: Fn1(vi2; ( temp 2-component vector of int)
@@ -1718,6 +1718,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 581
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "Fn1(vi2;"
                               Name 10  "x"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.array.dx10.frag.out
index 5a8a79c..1ac123e 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.sample.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -161,7 +161,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -330,6 +330,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 138 142
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.basic.dx10.frag.out
index 725de27..61abf2a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.basic.dx10.frag.out
@@ -1,7 +1,7 @@
 hlsl.sample.basic.dx10.frag
 WARNING: 0:4: 'immediate sampler state' : unimplemented 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:53  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -276,7 +276,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:53  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -557,6 +557,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 188 192
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.offset.dx10.frag.out
index 923db24..f6bfb8d 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.sample.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -182,7 +182,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -371,6 +371,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 144 148
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.offsetarray.dx10.frag.out
index d9cf65c..c08ec5c 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.sample.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -137,7 +137,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -281,6 +281,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 110 114
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.sub-vec4.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.sub-vec4.dx10.frag.out
index c3d6f13..2829c02 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.sub-vec4.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sample.sub-vec4.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.sample.sub-vec4.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:14  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -78,7 +78,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:14  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -163,6 +163,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 69
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.array.dx10.frag.out
index 7f6881f..afed0a4 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplebias.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -179,7 +179,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -366,6 +366,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 138 142
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.basic.dx10.frag.out
index 81e8da9..cff271f 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.basic.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplebias.basic.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -212,7 +212,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -431,6 +431,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 162 166
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.offset.dx10.frag.out
index 1919132..b05d1d7 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplebias.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -200,7 +200,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -407,6 +407,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 144 148
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.offsetarray.dx10.frag.out
index 0aab048..e718743 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplebias.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplebias.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -149,7 +149,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -305,6 +305,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 110 114
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.array.dx10.frag.out
index 6f26c98..1ddff1a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplecmp.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -199,7 +199,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -406,6 +406,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 175 179
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.basic.dx10.frag.out
index df1004e..373423a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.basic.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplecmp.basic.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -190,7 +190,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -388,6 +388,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 164 168
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.offset.dx10.frag.out
index a6193fd..290c8b4 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplecmp.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -163,7 +163,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -334,6 +334,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 121 125
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.offsetarray.dx10.frag.out
index 2fdce67..ed40102 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmp.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplecmp.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -169,7 +169,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -346,6 +346,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 132 136
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.array.dx10.frag.out
index 3a3e169..875d593 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplecmplevelzero.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -217,7 +217,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -442,6 +442,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 176 180
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.basic.dx10.frag.out
index 1fcc001..9d2802d 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.basic.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplecmplevelzero.basic.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -208,7 +208,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -424,6 +424,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 165 169
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.offset.dx10.frag.out
index 1bd82be..2e30e7c 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplecmplevelzero.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -175,7 +175,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -358,6 +358,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 122 126
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.offsetarray.dx10.frag.out
index 8a87e9b..eda17f2 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplecmplevelzero.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplecmplevelzero.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -181,7 +181,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -370,6 +370,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 133 137
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.array.dx10.frag.out
index 5a5159d..03449f8 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplegrad.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -215,7 +215,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -438,6 +438,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 132 136
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.basic.dx10.frag.out
index dc61751..42a27bf 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.basic.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplegrad.basic.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -266,7 +266,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -539,6 +539,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 167 171
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.basic.dx10.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.basic.dx10.vert.out
index afce97f..a93cbdf 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.basic.dx10.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.basic.dx10.vert.out
@@ -1,5 +1,5 @@
 hlsl.samplegrad.basic.dx10.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:27  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:27    Function Parameters: 
@@ -248,7 +248,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:27  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:27    Function Parameters: 
@@ -502,6 +502,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 162 168
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_OUTPUT"
                               MemberName 8(VS_OUTPUT) 0  "Pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.offset.dx10.frag.out
index 9da0f17..4b4e718 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplegrad.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -236,7 +236,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -479,6 +479,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 149 153
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.offsetarray.dx10.frag.out
index a2abd0f..1f714e6 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplegrad.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplegrad.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -170,7 +170,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -348,6 +348,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 103 107
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.array.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.array.dx10.frag.out
index 88c7de9..0a3e7fd 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.array.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.array.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplelevel.array.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -179,7 +179,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:24  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -366,6 +366,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 139 143
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.basic.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.basic.dx10.frag.out
index 7d9f268..6ff36be 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.basic.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.basic.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplelevel.basic.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:29  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -213,7 +213,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:29  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -433,6 +433,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 163 167
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.basic.dx10.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.basic.dx10.vert.out
index 40996e9..5b2f950 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.basic.dx10.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.basic.dx10.vert.out
@@ -1,5 +1,5 @@
 hlsl.samplelevel.basic.dx10.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:27  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:27    Function Parameters: 
@@ -194,7 +194,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:27  Function Definition: @main( ( temp structure{ temp 4-component vector of float Pos})
 0:27    Function Parameters: 
@@ -394,6 +394,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 158 164
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_OUTPUT"
                               MemberName 8(VS_OUTPUT) 0  "Pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.offset.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.offset.dx10.frag.out
index 2c98171..7311ae4 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.offset.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.offset.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplelevel.offset.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -200,7 +200,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:28  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -407,6 +407,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 145 149
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.offsetarray.dx10.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.offsetarray.dx10.frag.out
index 06b4c7b..223bfaf 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.offsetarray.dx10.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.samplelevel.offsetarray.dx10.frag.out
@@ -1,5 +1,5 @@
 hlsl.samplelevel.offsetarray.dx10.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -149,7 +149,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:20  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color,  temp float Depth})
@@ -305,6 +305,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 111 115
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.scope.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.scope.frag.out
index d3409d8..32eeeef 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.scope.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.scope.frag.out
@@ -1,5 +1,5 @@
 hlsl.scope.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp void)
@@ -52,7 +52,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp void)
@@ -110,6 +110,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 44
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semantic.geom.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semantic.geom.out
index caed636..e5d67b9 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semantic.geom.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semantic.geom.out
@@ -1,5 +1,5 @@
 hlsl.semantic.geom
-Shader version: 450
+Shader version: 500
 invocations = -1
 max_vertices = 4
 input primitive = triangles
@@ -25,25 +25,25 @@ output primitive = line_strip
 0:?             'VertexID' ( temp 3-element array of uint)
 0:?             'OutputStream' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12        move second child to first child ( temp float)
-0:?           '@entryPointOutput_clip0' ( out float ClipDistance)
+0:?           'OutputStream_clip0' ( out float ClipDistance)
 0:12          clip0: direct index for structure ( temp float)
 0:12            'flattenTemp' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12            Constant:
 0:12              0 (const int)
 0:12        move second child to first child ( temp float)
-0:?           '@entryPointOutput_cull0' ( out float CullDistance)
+0:?           'OutputStream_cull0' ( out float CullDistance)
 0:12          cull0: direct index for structure ( temp float)
 0:12            'flattenTemp' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12            Constant:
 0:12              1 (const int)
 0:12        move second child to first child ( temp uint)
-0:?           '@entryPointOutput_vpai' ( out uint ViewportIndex)
+0:?           'OutputStream_vpai' ( out uint ViewportIndex)
 0:12          vpai: direct index for structure ( temp uint)
 0:12            'flattenTemp' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12            Constant:
 0:12              2 (const int)
 0:12        move second child to first child ( temp uint)
-0:?           '@entryPointOutput_rtai' ( out uint Layer)
+0:?           'OutputStream_rtai' ( out uint Layer)
 0:12          rtai: direct index for structure ( temp uint)
 0:12            'flattenTemp' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12            Constant:
@@ -60,13 +60,14 @@ output primitive = line_strip
 0:?   Linker Objects
 0:?     '@entryPointOutput' (layout( location=0) out structure{ temp int ii})
 0:?     'VertexID' (layout( location=0) in 3-element array of uint)
-0:?     '@entryPointOutput_vpai' ( out uint ViewportIndex)
+0:?     'OutputStream' (layout( location=1) out structure{ temp int ii})
+0:?     'OutputStream_vpai' ( out uint ViewportIndex)
 
 
 Linked geometry stage:
 
 
-Shader version: 450
+Shader version: 500
 invocations = 1
 max_vertices = 4
 input primitive = triangles
@@ -92,25 +93,25 @@ output primitive = line_strip
 0:?             'VertexID' ( temp 3-element array of uint)
 0:?             'OutputStream' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12        move second child to first child ( temp float)
-0:?           '@entryPointOutput_clip0' ( out float ClipDistance)
+0:?           'OutputStream_clip0' ( out float ClipDistance)
 0:12          clip0: direct index for structure ( temp float)
 0:12            'flattenTemp' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12            Constant:
 0:12              0 (const int)
 0:12        move second child to first child ( temp float)
-0:?           '@entryPointOutput_cull0' ( out float CullDistance)
+0:?           'OutputStream_cull0' ( out float CullDistance)
 0:12          cull0: direct index for structure ( temp float)
 0:12            'flattenTemp' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12            Constant:
 0:12              1 (const int)
 0:12        move second child to first child ( temp uint)
-0:?           '@entryPointOutput_vpai' ( out uint ViewportIndex)
+0:?           'OutputStream_vpai' ( out uint ViewportIndex)
 0:12          vpai: direct index for structure ( temp uint)
 0:12            'flattenTemp' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12            Constant:
 0:12              2 (const int)
 0:12        move second child to first child ( temp uint)
-0:?           '@entryPointOutput_rtai' ( out uint Layer)
+0:?           'OutputStream_rtai' ( out uint Layer)
 0:12          rtai: direct index for structure ( temp uint)
 0:12            'flattenTemp' ( temp structure{ temp float clip0,  temp float cull0,  temp uint vpai,  temp uint rtai,  temp int ii})
 0:12            Constant:
@@ -127,11 +128,12 @@ output primitive = line_strip
 0:?   Linker Objects
 0:?     '@entryPointOutput' (layout( location=0) out structure{ temp int ii})
 0:?     'VertexID' (layout( location=0) in 3-element array of uint)
-0:?     '@entryPointOutput_vpai' ( out uint ViewportIndex)
+0:?     'OutputStream' (layout( location=1) out structure{ temp int ii})
+0:?     'OutputStream_vpai' ( out uint ViewportIndex)
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 62
+// Id's are bound by 66
 
                               Capability Geometry
                               Capability ClipDistance
@@ -139,11 +141,12 @@ output primitive = line_strip
                               Capability MultiViewport
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Geometry 4  "main" 25 34 39 44 49 55
+                              EntryPoint Geometry 4  "main" 25 35 40 45 50 56 65
                               ExecutionMode 4 Triangles
                               ExecutionMode 4 Invocations 1
                               ExecutionMode 4 OutputLineStrip
                               ExecutionMode 4 OutputVertices 4
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "S"
                               MemberName 12(S) 0  "clip0"
@@ -161,19 +164,23 @@ output primitive = line_strip
                               Name 28  "OutputStream"
                               Name 29  "param"
                               Name 31  "param"
-                              Name 34  "@entryPointOutput_clip0"
-                              Name 39  "@entryPointOutput_cull0"
-                              Name 44  "@entryPointOutput_vpai"
-                              Name 49  "@entryPointOutput_rtai"
-                              Name 53  "S"
-                              MemberName 53(S) 0  "ii"
-                              Name 55  "@entryPointOutput"
+                              Name 35  "OutputStream_clip0"
+                              Name 40  "OutputStream_cull0"
+                              Name 45  "OutputStream_vpai"
+                              Name 50  "OutputStream_rtai"
+                              Name 54  "S"
+                              MemberName 54(S) 0  "ii"
+                              Name 56  "@entryPointOutput"
+                              Name 63  "S"
+                              MemberName 63(S) 0  "ii"
+                              Name 65  "OutputStream"
                               Decorate 25(VertexID) Location 0
-                              Decorate 34(@entryPointOutput_clip0) BuiltIn ClipDistance
-                              Decorate 39(@entryPointOutput_cull0) BuiltIn CullDistance
-                              Decorate 44(@entryPointOutput_vpai) BuiltIn ViewportIndex
-                              Decorate 49(@entryPointOutput_rtai) BuiltIn Layer
-                              Decorate 55(@entryPointOutput) Location 0
+                              Decorate 35(OutputStream_clip0) BuiltIn ClipDistance
+                              Decorate 40(OutputStream_cull0) BuiltIn CullDistance
+                              Decorate 45(OutputStream_vpai) BuiltIn ViewportIndex
+                              Decorate 50(OutputStream_rtai) BuiltIn Layer
+                              Decorate 56(@entryPointOutput) Location 0
+                              Decorate 65(OutputStream) Location 1
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeInt 32 0
@@ -187,24 +194,27 @@ output primitive = line_strip
               14:             TypeFunction 12(S) 9(ptr) 13(ptr)
               24:             TypePointer Input 8
     25(VertexID):     24(ptr) Variable Input
-              33:             TypePointer Output 10(float)
-34(@entryPointOutput_clip0):     33(ptr) Variable Output
-              35:     11(int) Constant 0
-              36:             TypePointer Function 10(float)
-39(@entryPointOutput_cull0):     33(ptr) Variable Output
-              40:     11(int) Constant 1
-              43:             TypePointer Output 6(int)
-44(@entryPointOutput_vpai):     43(ptr) Variable Output
-              45:     11(int) Constant 2
-              46:             TypePointer Function 6(int)
-49(@entryPointOutput_rtai):     43(ptr) Variable Output
-              50:     11(int) Constant 3
-           53(S):             TypeStruct 11(int)
-              54:             TypePointer Output 53(S)
-55(@entryPointOutput):     54(ptr) Variable Output
-              56:     11(int) Constant 4
-              57:             TypePointer Function 11(int)
-              60:             TypePointer Output 11(int)
+              34:             TypePointer Output 10(float)
+35(OutputStream_clip0):     34(ptr) Variable Output
+              36:     11(int) Constant 0
+              37:             TypePointer Function 10(float)
+40(OutputStream_cull0):     34(ptr) Variable Output
+              41:     11(int) Constant 1
+              44:             TypePointer Output 6(int)
+45(OutputStream_vpai):     44(ptr) Variable Output
+              46:     11(int) Constant 2
+              47:             TypePointer Function 6(int)
+50(OutputStream_rtai):     44(ptr) Variable Output
+              51:     11(int) Constant 3
+           54(S):             TypeStruct 11(int)
+              55:             TypePointer Output 54(S)
+56(@entryPointOutput):     55(ptr) Variable Output
+              57:     11(int) Constant 4
+              58:             TypePointer Function 11(int)
+              61:             TypePointer Output 11(int)
+           63(S):             TypeStruct 11(int)
+              64:             TypePointer Output 63(S)
+65(OutputStream):     64(ptr) Variable Output
          4(main):           2 Function None 3
                5:             Label
     23(VertexID):      9(ptr) Variable Function
@@ -217,23 +227,25 @@ output primitive = line_strip
               30:           8 Load 23(VertexID)
                               Store 29(param) 30
               32:       12(S) FunctionCall 17(@main(u1[3];struct-S-f1-f1-u1-u1-i11;) 29(param) 31(param)
+              33:       12(S) Load 31(param)
+                              Store 28(OutputStream) 33
                               Store 27(flattenTemp) 32
-              37:     36(ptr) AccessChain 27(flattenTemp) 35
-              38:   10(float) Load 37
-                              Store 34(@entryPointOutput_clip0) 38
-              41:     36(ptr) AccessChain 27(flattenTemp) 40
-              42:   10(float) Load 41
-                              Store 39(@entryPointOutput_cull0) 42
-              47:     46(ptr) AccessChain 27(flattenTemp) 45
-              48:      6(int) Load 47
-                              Store 44(@entryPointOutput_vpai) 48
-              51:     46(ptr) AccessChain 27(flattenTemp) 50
-              52:      6(int) Load 51
-                              Store 49(@entryPointOutput_rtai) 52
-              58:     57(ptr) AccessChain 27(flattenTemp) 56
-              59:     11(int) Load 58
-              61:     60(ptr) AccessChain 55(@entryPointOutput) 35
-                              Store 61 59
+              38:     37(ptr) AccessChain 27(flattenTemp) 36
+              39:   10(float) Load 38
+                              Store 35(OutputStream_clip0) 39
+              42:     37(ptr) AccessChain 27(flattenTemp) 41
+              43:   10(float) Load 42
+                              Store 40(OutputStream_cull0) 43
+              48:     47(ptr) AccessChain 27(flattenTemp) 46
+              49:      6(int) Load 48
+                              Store 45(OutputStream_vpai) 49
+              52:     47(ptr) AccessChain 27(flattenTemp) 51
+              53:      6(int) Load 52
+                              Store 50(OutputStream_rtai) 53
+              59:     58(ptr) AccessChain 27(flattenTemp) 57
+              60:     11(int) Load 59
+              62:     61(ptr) AccessChain 56(@entryPointOutput) 36
+                              Store 62 60
                               Return
                               FunctionEnd
 17(@main(u1[3];struct-S-f1-f1-u1-u1-i11;):       12(S) Function None 14
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semantic.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semantic.vert.out
index 0b08856..3618fee 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semantic.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semantic.vert.out
@@ -1,5 +1,5 @@
 hlsl.semantic.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:12  Function Definition: @main(struct-S-f1-f1-f1-f1-f1-f1-i11; ( temp structure{ temp float clip,  temp float clip0,  temp float clip7,  temp float cull,  temp float cull2,  temp float cull5,  temp int ii})
 0:12    Function Parameters: 
@@ -117,7 +117,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:12  Function Definition: @main(struct-S-f1-f1-f1-f1-f1-f1-i11; ( temp structure{ temp float clip,  temp float clip0,  temp float clip7,  temp float cull,  temp float cull2,  temp float cull5,  temp int ii})
 0:12    Function Parameters: 
@@ -241,6 +241,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 21 26 30 34 38 42 47 56 63 72
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "S"
                               MemberName 8(S) 0  "clip"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semicolons.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semicolons.frag.out
index 9856b5a..2c26daa 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semicolons.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.semicolons.frag.out
@@ -1,5 +1,5 @@
 hlsl.semicolons.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: MyFunc( ( temp void)
@@ -38,7 +38,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: MyFunc( ( temp void)
@@ -82,6 +82,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 28
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 6  "MyFunc("
                               Name 8  "MyFunc2("
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.shapeConv.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.shapeConv.frag.out
index 07fa9fd..775549f 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.shapeConv.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.shapeConv.frag.out
@@ -1,5 +1,5 @@
 hlsl.shapeConv.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: PixelShaderFunction(vf4;f1; ( temp 4-component vector of float)
@@ -117,7 +117,7 @@ Linked fragment stage:
 
 WARNING: Linking fragment stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: PixelShaderFunction(vf4;f1; ( temp 4-component vector of float)
@@ -239,6 +239,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 13  "PixelShaderFunction(vf4;f1;"
                               Name 11  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.shapeConvRet.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.shapeConvRet.frag.out
index 54ca7fa..e9dba22 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.shapeConvRet.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.shapeConvRet.frag.out
@@ -1,5 +1,5 @@
 hlsl.shapeConvRet.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: foo( ( temp 3-component vector of int)
@@ -35,7 +35,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: foo( ( temp 3-component vector of int)
@@ -76,6 +76,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 28 31
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "foo("
                               Name 16  "@main(f1;"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sin.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sin.frag.out
index 1f96440..37baab0 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sin.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.sin.frag.out
@@ -1,5 +1,5 @@
 hlsl.sin.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -27,7 +27,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -60,6 +60,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 19 22
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.staticMemberFunction.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.staticMemberFunction.frag.out
index e8a98cb..e313cbc 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.staticMemberFunction.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.staticMemberFunction.frag.out
@@ -1,5 +1,5 @@
 hlsl.staticMemberFunction.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: Test::staticMemFun(vf4; ( global 4-component vector of float)
@@ -60,7 +60,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: Test::staticMemFun(vf4; ( global 4-component vector of float)
@@ -126,6 +126,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 52
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "Test::staticMemFun(vf4;"
                               Name 10  "a"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.string.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.string.frag.out
index c4a118f..1c6c542 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.string.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.string.frag.out
@@ -1,5 +1,5 @@
 hlsl.string.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main(f1; ( temp float)
@@ -26,7 +26,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:10  Function Definition: @main(f1; ( temp float)
@@ -58,6 +58,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 17 20
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 10  "@main(f1;"
                               Name 9  "f"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.stringtoken.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.stringtoken.frag.out
index c23d663..045b8fe 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.stringtoken.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.stringtoken.frag.out
@@ -1,5 +1,5 @@
 hlsl.stringtoken.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:16  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -36,7 +36,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:16  Function Definition: @main( ( temp structure{ temp 4-component vector of float Color})
@@ -78,6 +78,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 25
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.frag.out
index c01e8fa..6798bae 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.frag.out
@@ -3,7 +3,7 @@ WARNING: 0:26: 'register' : ignoring shader_profile
 WARNING: 0:27: 'register' : ignoring shader_profile 
 WARNING: 0:30: 'register' : ignoring shader_profile 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:40  Function Definition: @PixelShaderFunction(vf4;struct-IN_S-vf4-b1-vf1-vf2-b1-b1-b1-vf41; ( temp 4-component vector of float)
@@ -124,7 +124,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:40  Function Definition: @PixelShaderFunction(vf4;struct-IN_S-vf4-b1-vf1-vf2-b1-b1-b1-vf41; ( temp 4-component vector of float)
@@ -250,6 +250,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 51 56 79 94
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "IN_S"
                               MemberName 11(IN_S) 0  "a"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split-1.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split-1.vert.out
index 9ec01d8..db40682 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split-1.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split-1.vert.out
@@ -1,5 +1,5 @@
 hlsl.struct.split-1.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:17  Function Definition: @main(struct-VS_INPUT-i1-vf4-i11;vf4; ( temp structure{ temp int x0_out,  temp 4-component vector of float Pos_out,  temp int x1_out})
 0:17    Function Parameters: 
@@ -103,7 +103,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:17  Function Definition: @main(struct-VS_INPUT-i1-vf4-i11;vf4; ( temp structure{ temp int x0_out,  temp 4-component vector of float Pos_out,  temp int x1_out})
 0:17    Function Parameters: 
@@ -211,6 +211,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 40 44 47 51 61 67
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "VS_INPUT"
                               MemberName 9(VS_INPUT) 0  "x0_in"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.array.geom.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.array.geom.out
index b907815..5c36e2b 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.array.geom.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.array.geom.out
@@ -1,5 +1,5 @@
 hlsl.struct.split.array.geom
-Shader version: 450
+Shader version: 500
 invocations = -1
 max_vertices = 4
 input primitive = points
@@ -72,12 +72,13 @@ output primitive = triangle_strip
 0:?         'OutputStream' ( temp structure{ temp 4-component vector of float Pos,  temp 2-component vector of float TexCoord,  temp 3-component vector of float TerrainPos,  temp uint VertexID})
 0:?   Linker Objects
 0:?     'v' (layout( location=0) in 1-element array of uint)
+0:?     'OutputStream' (layout( location=0) out structure{ temp 2-component vector of float TexCoord,  temp 3-component vector of float TerrainPos,  temp uint VertexID})
 
 
 Linked geometry stage:
 
 
-Shader version: 450
+Shader version: 500
 invocations = 1
 max_vertices = 4
 input primitive = points
@@ -150,19 +151,21 @@ output primitive = triangle_strip
 0:?         'OutputStream' ( temp structure{ temp 4-component vector of float Pos,  temp 2-component vector of float TexCoord,  temp 3-component vector of float TerrainPos,  temp uint VertexID})
 0:?   Linker Objects
 0:?     'v' (layout( location=0) in 1-element array of uint)
+0:?     'OutputStream' (layout( location=0) out structure{ temp 2-component vector of float TexCoord,  temp 3-component vector of float TerrainPos,  temp uint VertexID})
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 73
+// Id's are bound by 77
 
                               Capability Geometry
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Geometry 4  "main" 66
+                              EntryPoint Geometry 4  "main" 66 76
                               ExecutionMode 4 InputPoints
                               ExecutionMode 4 Invocations 1
                               ExecutionMode 4 OutputTriangleStrip
                               ExecutionMode 4 OutputVertices 4
+                              Source HLSL 500
                               Name 4  "main"
                               Name 14  "PSInput"
                               MemberName 14(PSInput) 0  "Pos"
@@ -181,7 +184,13 @@ output primitive = triangle_strip
                               Name 68  "OutputStream"
                               Name 69  "param"
                               Name 71  "param"
+                              Name 74  "PSInput"
+                              MemberName 74(PSInput) 0  "TexCoord"
+                              MemberName 74(PSInput) 1  "TerrainPos"
+                              MemberName 74(PSInput) 2  "VertexID"
+                              Name 76  "OutputStream"
                               Decorate 66(v) Location 0
+                              Decorate 76(OutputStream) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeInt 32 0
@@ -214,6 +223,9 @@ output primitive = triangle_strip
               60:     28(int) Constant 1
               65:             TypePointer Input 8
            66(v):     65(ptr) Variable Input
+     74(PSInput):             TypeStruct 12(fvec2) 13(fvec3) 6(int)
+              75:             TypePointer Output 74(PSInput)
+76(OutputStream):     75(ptr) Variable Output
          4(main):           2 Function None 3
                5:             Label
            64(v):      9(ptr) Variable Function
@@ -225,6 +237,8 @@ output primitive = triangle_strip
               70:           8 Load 64(v)
                               Store 69(param) 70
               72:           2 FunctionCall 19(@main(u1[1];struct-PSInput-vf4-vf2-vf3-u11;) 69(param) 71(param)
+              73: 14(PSInput) Load 71(param)
+                              Store 68(OutputStream) 73
                               Return
                               FunctionEnd
 19(@main(u1[1];struct-PSInput-vf4-vf2-vf3-u11;):           2 Function None 16
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.assign.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.assign.frag.out
index ad185ee..2228a8a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.assign.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.assign.frag.out
@@ -1,5 +1,5 @@
 hlsl.struct.split.assign.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: @main(i1;struct-S-f1-vf41[3]; ( temp 4-component vector of float)
@@ -118,7 +118,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:7  Function Definition: @main(i1;struct-S-f1-vf41[3]; ( temp 4-component vector of float)
@@ -242,6 +242,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 32 39 48 67
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 10  "S"
                               MemberName 10(S) 0  "f"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.call.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.call.vert.out
index 9dc7b85..647f9b4 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.call.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.call.vert.out
@@ -1,5 +1,5 @@
 hlsl.struct.split.call.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:17  Function Definition: Fn1(struct-VS_INPUT-i1-vf4-i11;struct-VS_OUTPUT-i1-vf4-i11; ( temp void)
 0:17    Function Parameters: 
@@ -112,7 +112,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:17  Function Definition: Fn1(struct-VS_INPUT-i1-vf4-i11;struct-VS_OUTPUT-i1-vf4-i11; ( temp void)
 0:17    Function Parameters: 
@@ -229,6 +229,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 52 56 59 68 74
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "VS_INPUT"
                               MemberName 9(VS_INPUT) 0  "x0_in"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.nested.geom.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.nested.geom.out
index 42046da..2f010f5 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.nested.geom.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.nested.geom.out
@@ -1,5 +1,5 @@
 hlsl.struct.split.nested.geom
-Shader version: 450
+Shader version: 500
 invocations = -1
 max_vertices = 3
 input primitive = triangles
@@ -35,9 +35,40 @@ output primitive = triangle_strip
 0:?           5.000000
 0:?           6.000000
 0:30      Sequence
-0:30        move second child to first child ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
-0:30          'ts' ( out structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
-0:30          'o' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30        Sequence
+0:30          move second child to first child ( temp 4-component vector of float)
+0:?             'ts_psIn_pos' ( out 4-component vector of float Position)
+0:30            pos: direct index for structure ( temp 4-component vector of float)
+0:30              psIn: direct index for structure ( temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc})
+0:30                'o' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30                Constant:
+0:30                  0 (const int)
+0:30              Constant:
+0:30                0 (const int)
+0:30          move second child to first child ( temp 2-component vector of float)
+0:30            tc: direct index for structure ( temp 2-component vector of float)
+0:30              psIn: direct index for structure ( temp structure{ temp 2-component vector of float tc})
+0:30                'ts' (layout( location=0) out structure{ temp structure{ temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30                Constant:
+0:30                  0 (const int)
+0:30              Constant:
+0:30                0 (const int)
+0:30            tc: direct index for structure ( temp 2-component vector of float)
+0:30              psIn: direct index for structure ( temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc})
+0:30                'o' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30                Constant:
+0:30                  0 (const int)
+0:30              Constant:
+0:30                1 (const int)
+0:30          move second child to first child ( temp structure{ temp 2-element array of float m0_array,  temp int m1})
+0:30            contains_no_builtin_io: direct index for structure ( temp structure{ temp 2-element array of float m0_array,  temp int m1})
+0:30              'ts' (layout( location=0) out structure{ temp structure{ temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30              Constant:
+0:30                1 (const int)
+0:30            contains_no_builtin_io: direct index for structure ( temp structure{ temp 2-element array of float m0_array,  temp int m1})
+0:30              'o' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30              Constant:
+0:30                1 (const int)
 0:30        EmitVertex ( temp void)
 0:24  Function Definition: main( ( temp void)
 0:24    Function Parameters: 
@@ -129,12 +160,13 @@ output primitive = triangle_strip
 0:?         'ts' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
 0:?   Linker Objects
 0:?     'tin' (layout( location=0) in 3-element array of structure{ temp 2-component vector of float tc})
+0:?     'ts' (layout( location=0) out structure{ temp structure{ temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
 
 
 Linked geometry stage:
 
 
-Shader version: 450
+Shader version: 500
 invocations = 1
 max_vertices = 3
 input primitive = triangles
@@ -170,9 +202,40 @@ output primitive = triangle_strip
 0:?           5.000000
 0:?           6.000000
 0:30      Sequence
-0:30        move second child to first child ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
-0:30          'ts' ( out structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
-0:30          'o' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30        Sequence
+0:30          move second child to first child ( temp 4-component vector of float)
+0:?             'ts_psIn_pos' ( out 4-component vector of float Position)
+0:30            pos: direct index for structure ( temp 4-component vector of float)
+0:30              psIn: direct index for structure ( temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc})
+0:30                'o' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30                Constant:
+0:30                  0 (const int)
+0:30              Constant:
+0:30                0 (const int)
+0:30          move second child to first child ( temp 2-component vector of float)
+0:30            tc: direct index for structure ( temp 2-component vector of float)
+0:30              psIn: direct index for structure ( temp structure{ temp 2-component vector of float tc})
+0:30                'ts' (layout( location=0) out structure{ temp structure{ temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30                Constant:
+0:30                  0 (const int)
+0:30              Constant:
+0:30                0 (const int)
+0:30            tc: direct index for structure ( temp 2-component vector of float)
+0:30              psIn: direct index for structure ( temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc})
+0:30                'o' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30                Constant:
+0:30                  0 (const int)
+0:30              Constant:
+0:30                1 (const int)
+0:30          move second child to first child ( temp structure{ temp 2-element array of float m0_array,  temp int m1})
+0:30            contains_no_builtin_io: direct index for structure ( temp structure{ temp 2-element array of float m0_array,  temp int m1})
+0:30              'ts' (layout( location=0) out structure{ temp structure{ temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30              Constant:
+0:30                1 (const int)
+0:30            contains_no_builtin_io: direct index for structure ( temp structure{ temp 2-element array of float m0_array,  temp int m1})
+0:30              'o' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
+0:30              Constant:
+0:30                1 (const int)
 0:30        EmitVertex ( temp void)
 0:24  Function Definition: main( ( temp void)
 0:24    Function Parameters: 
@@ -264,19 +327,21 @@ output primitive = triangle_strip
 0:?         'ts' ( temp structure{ temp structure{ temp 4-component vector of float pos,  temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
 0:?   Linker Objects
 0:?     'tin' (layout( location=0) in 3-element array of structure{ temp 2-component vector of float tc})
+0:?     'ts' (layout( location=0) out structure{ temp structure{ temp 2-component vector of float tc} psIn,  temp structure{ temp 2-element array of float m0_array,  temp int m1} contains_no_builtin_io})
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 75
+// Id's are bound by 99
 
                               Capability Geometry
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Geometry 4  "main" 44 52
+                              EntryPoint Geometry 4  "main" 41 48 67 75
                               ExecutionMode 4 Triangles
                               ExecutionMode 4 Invocations 1
                               ExecutionMode 4 OutputTriangleStrip
                               ExecutionMode 4 OutputVertices 3
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "PS_IN"
                               MemberName 9(PS_IN) 0  "pos"
@@ -291,16 +356,28 @@ output primitive = triangle_strip
                               Name 21  "tin"
                               Name 22  "ts"
                               Name 25  "o"
-                              Name 41  "tin"
-                              Name 44  "tin_pos"
-                              Name 49  "PS_IN"
-                              MemberName 49(PS_IN) 0  "tc"
-                              Name 52  "tin"
-                              Name 70  "ts"
-                              Name 71  "param"
-                              Name 73  "param"
-                              Decorate 44(tin_pos) BuiltIn Position
-                              Decorate 52(tin) Location 0
+                              Name 41  "ts_psIn_pos"
+                              Name 44  "PS_IN"
+                              MemberName 44(PS_IN) 0  "tc"
+                              Name 45  "STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO"
+                              MemberName 45(STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO) 0  "m0_array"
+                              MemberName 45(STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO) 1  "m1"
+                              Name 46  "GS_OUT"
+                              MemberName 46(GS_OUT) 0  "psIn"
+                              MemberName 46(GS_OUT) 1  "contains_no_builtin_io"
+                              Name 48  "ts"
+                              Name 64  "tin"
+                              Name 67  "tin_pos"
+                              Name 72  "PS_IN"
+                              MemberName 72(PS_IN) 0  "tc"
+                              Name 75  "tin"
+                              Name 93  "ts"
+                              Name 94  "param"
+                              Name 96  "param"
+                              Decorate 41(ts_psIn_pos) BuiltIn Position
+                              Decorate 48(ts) Location 0
+                              Decorate 67(tin_pos) BuiltIn Position
+                              Decorate 75(tin) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -330,49 +407,63 @@ output primitive = triangle_strip
               36:    6(float) Constant 1086324736
               37:    8(fvec2) ConstantComposite 35 36
               38:             TypePointer Function 8(fvec2)
-              42:             TypeArray 7(fvec4) 11
-              43:             TypePointer Input 42
-     44(tin_pos):     43(ptr) Variable Input
-              45:             TypePointer Input 7(fvec4)
-       49(PS_IN):             TypeStruct 8(fvec2)
-              50:             TypeArray 49(PS_IN) 11
-              51:             TypePointer Input 50
-         52(tin):     51(ptr) Variable Input
-              53:             TypePointer Input 8(fvec2)
-              63:     16(int) Constant 2
+              40:             TypePointer Output 7(fvec4)
+ 41(ts_psIn_pos):     40(ptr) Variable Output
+       44(PS_IN):             TypeStruct 8(fvec2)
+45(STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO):             TypeStruct 15 16(int)
+      46(GS_OUT):             TypeStruct 44(PS_IN) 45(STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO)
+              47:             TypePointer Output 46(GS_OUT)
+          48(ts):     47(ptr) Variable Output
+              51:             TypePointer Output 8(fvec2)
+              53:             TypePointer Function 17(STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO)
+              56:             TypePointer Output 45(STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO)
+              59:             TypePointer Output 15
+              62:             TypePointer Output 16(int)
+              65:             TypeArray 7(fvec4) 11
+              66:             TypePointer Input 65
+     67(tin_pos):     66(ptr) Variable Input
+              68:             TypePointer Input 7(fvec4)
+       72(PS_IN):             TypeStruct 8(fvec2)
+              73:             TypeArray 72(PS_IN) 11
+              74:             TypePointer Input 73
+         75(tin):     74(ptr) Variable Input
+              76:             TypePointer Input 8(fvec2)
+              86:     16(int) Constant 2
          4(main):           2 Function None 3
                5:             Label
-         41(tin):     13(ptr) Variable Function
-          70(ts):     19(ptr) Variable Function
-       71(param):     13(ptr) Variable Function
-       73(param):     19(ptr) Variable Function
-              46:     45(ptr) AccessChain 44(tin_pos) 26
-              47:    7(fvec4) Load 46
-              48:     32(ptr) AccessChain 41(tin) 26 26
-                              Store 48 47
-              54:     53(ptr) AccessChain 52(tin) 26 26
-              55:    8(fvec2) Load 54
-              56:     38(ptr) AccessChain 41(tin) 26 34
-                              Store 56 55
-              57:     45(ptr) AccessChain 44(tin_pos) 34
-              58:    7(fvec4) Load 57
-              59:     32(ptr) AccessChain 41(tin) 34 26
-                              Store 59 58
-              60:     53(ptr) AccessChain 52(tin) 34 26
-              61:    8(fvec2) Load 60
-              62:     38(ptr) AccessChain 41(tin) 34 34
-                              Store 62 61
-              64:     45(ptr) AccessChain 44(tin_pos) 63
-              65:    7(fvec4) Load 64
-              66:     32(ptr) AccessChain 41(tin) 63 26
-                              Store 66 65
-              67:     53(ptr) AccessChain 52(tin) 63 26
-              68:    8(fvec2) Load 67
-              69:     38(ptr) AccessChain 41(tin) 63 34
-                              Store 69 68
-              72:          12 Load 41(tin)
-                              Store 71(param) 72
-              74:           2 FunctionCall 23(@main(struct-PS_IN-vf4-vf21[3];struct-GS_OUT-struct-PS_IN-vf4-vf21-struct-STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO-f1[2]-i111;) 71(param) 73(param)
+         64(tin):     13(ptr) Variable Function
+          93(ts):     19(ptr) Variable Function
+       94(param):     13(ptr) Variable Function
+       96(param):     19(ptr) Variable Function
+              69:     68(ptr) AccessChain 67(tin_pos) 26
+              70:    7(fvec4) Load 69
+              71:     32(ptr) AccessChain 64(tin) 26 26
+                              Store 71 70
+              77:     76(ptr) AccessChain 75(tin) 26 26
+              78:    8(fvec2) Load 77
+              79:     38(ptr) AccessChain 64(tin) 26 34
+                              Store 79 78
+              80:     68(ptr) AccessChain 67(tin_pos) 34
+              81:    7(fvec4) Load 80
+              82:     32(ptr) AccessChain 64(tin) 34 26
+                              Store 82 81
+              83:     76(ptr) AccessChain 75(tin) 34 26
+              84:    8(fvec2) Load 83
+              85:     38(ptr) AccessChain 64(tin) 34 34
+                              Store 85 84
+              87:     68(ptr) AccessChain 67(tin_pos) 86
+              88:    7(fvec4) Load 87
+              89:     32(ptr) AccessChain 64(tin) 86 26
+                              Store 89 88
+              90:     76(ptr) AccessChain 75(tin) 86 26
+              91:    8(fvec2) Load 90
+              92:     38(ptr) AccessChain 64(tin) 86 34
+                              Store 92 91
+              95:          12 Load 64(tin)
+                              Store 94(param) 95
+              97:           2 FunctionCall 23(@main(struct-PS_IN-vf4-vf21[3];struct-GS_OUT-struct-PS_IN-vf4-vf21-struct-STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO-f1[2]-i111;) 94(param) 96(param)
+              98:  18(GS_OUT) Load 96(param)
+                              Store 93(ts) 98
                               Return
                               FunctionEnd
 23(@main(struct-PS_IN-vf4-vf21[3];struct-GS_OUT-struct-PS_IN-vf4-vf21-struct-STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO-f1[2]-i111;):           2 Function None 20
@@ -384,8 +475,22 @@ output primitive = triangle_strip
                               Store 33 31
               39:     38(ptr) AccessChain 25(o) 26 34
                               Store 39 37
-              40:  18(GS_OUT) Load 25(o)
-                              Store 22(ts) 40
+              42:     32(ptr) AccessChain 25(o) 26 26
+              43:    7(fvec4) Load 42
+                              Store 41(ts_psIn_pos) 43
+              49:     38(ptr) AccessChain 25(o) 26 34
+              50:    8(fvec2) Load 49
+              52:     51(ptr) AccessChain 48(ts) 26 26
+                              Store 52 50
+              54:     53(ptr) AccessChain 25(o) 34
+              55:17(STRUCT_WITH_NO_BUILTIN_INTERSTAGE_IO) Load 54
+              57:     56(ptr) AccessChain 48(ts) 34
+              58:          15 CompositeExtract 55 0
+              60:     59(ptr) AccessChain 57 26
+                              Store 60 58
+              61:     16(int) CompositeExtract 55 1
+              63:     62(ptr) AccessChain 57 34
+                              Store 63 61
                               EmitVertex
                               Return
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.trivial.geom.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.trivial.geom.out
index 29818f1..c552675 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.trivial.geom.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.trivial.geom.out
@@ -1,5 +1,5 @@
 hlsl.struct.split.trivial.geom
-Shader version: 450
+Shader version: 500
 invocations = -1
 max_vertices = 3
 input primitive = triangles
@@ -35,9 +35,13 @@ output primitive = triangle_strip
 0:18                Constant:
 0:18                  0 (const int)
 0:19            Sequence
-0:19              move second child to first child ( temp structure{ temp 4-component vector of float pos})
-0:19                'ts' ( out structure{ temp 4-component vector of float pos})
-0:19                'o' ( temp structure{ temp 4-component vector of float pos})
+0:19              Sequence
+0:19                move second child to first child ( temp 4-component vector of float)
+0:?                   'ts_pos' ( out 4-component vector of float Position)
+0:19                  pos: direct index for structure ( temp 4-component vector of float)
+0:19                    'o' ( temp structure{ temp 4-component vector of float pos})
+0:19                    Constant:
+0:19                      0 (const int)
 0:19              EmitVertex ( temp void)
 0:17          Loop Terminal Expression
 0:17          Pre-Increment ( temp int)
@@ -87,12 +91,13 @@ output primitive = triangle_strip
 0:?         'ts' ( temp structure{ temp 4-component vector of float pos})
 0:?   Linker Objects
 0:?     'i' (layout( location=0) in 3-element array of structure{})
+0:?     'ts' (layout( location=0) out structure{})
 
 
 Linked geometry stage:
 
 
-Shader version: 450
+Shader version: 500
 invocations = 1
 max_vertices = 3
 input primitive = triangles
@@ -128,9 +133,13 @@ output primitive = triangle_strip
 0:18                Constant:
 0:18                  0 (const int)
 0:19            Sequence
-0:19              move second child to first child ( temp structure{ temp 4-component vector of float pos})
-0:19                'ts' ( out structure{ temp 4-component vector of float pos})
-0:19                'o' ( temp structure{ temp 4-component vector of float pos})
+0:19              Sequence
+0:19                move second child to first child ( temp 4-component vector of float)
+0:?                   'ts_pos' ( out 4-component vector of float Position)
+0:19                  pos: direct index for structure ( temp 4-component vector of float)
+0:19                    'o' ( temp structure{ temp 4-component vector of float pos})
+0:19                    Constant:
+0:19                      0 (const int)
 0:19              EmitVertex ( temp void)
 0:17          Loop Terminal Expression
 0:17          Pre-Increment ( temp int)
@@ -180,19 +189,21 @@ output primitive = triangle_strip
 0:?         'ts' ( temp structure{ temp 4-component vector of float pos})
 0:?   Linker Objects
 0:?     'i' (layout( location=0) in 3-element array of structure{})
+0:?     'ts' (layout( location=0) out structure{})
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 67
+// Id's are bound by 74
 
                               Capability Geometry
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Geometry 4  "main" 46 66
+                              EntryPoint Geometry 4  "main" 40 49 70 73
                               ExecutionMode 4 Triangles
                               ExecutionMode 4 Invocations 1
                               ExecutionMode 4 OutputTriangleStrip
                               ExecutionMode 4 OutputVertices 3
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_IN"
                               MemberName 8(PS_IN) 0  "pos"
@@ -203,15 +214,20 @@ output primitive = triangle_strip
                               Name 17  "ts"
                               Name 22  "x"
                               Name 33  "o"
-                              Name 43  "i"
-                              Name 46  "i_pos"
-                              Name 58  "ts"
-                              Name 59  "param"
-                              Name 61  "param"
-                              Name 63  "PS_IN"
-                              Name 66  "i"
-                              Decorate 46(i_pos) BuiltIn Position
-                              Decorate 66(i) Location 0
+                              Name 40  "ts_pos"
+                              Name 46  "i"
+                              Name 49  "i_pos"
+                              Name 61  "ts"
+                              Name 62  "param"
+                              Name 64  "param"
+                              Name 67  "PS_IN"
+                              Name 70  "i"
+                              Name 71  "GS_OUT"
+                              Name 73  "ts"
+                              Decorate 40(ts_pos) BuiltIn Position
+                              Decorate 49(i_pos) BuiltIn Position
+                              Decorate 70(i) Location 0
+                              Decorate 73(ts) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -230,37 +246,44 @@ output primitive = triangle_strip
               30:     20(int) Constant 3
               31:             TypeBool
               35:             TypePointer Function 7(fvec4)
-              41:     20(int) Constant 1
-              44:             TypeArray 7(fvec4) 10
-              45:             TypePointer Input 44
-       46(i_pos):     45(ptr) Variable Input
-              47:             TypePointer Input 7(fvec4)
-              54:     20(int) Constant 2
-       63(PS_IN):             TypeStruct
-              64:             TypeArray 63(PS_IN) 10
-              65:             TypePointer Input 64
-           66(i):     65(ptr) Variable Input
+              39:             TypePointer Output 7(fvec4)
+      40(ts_pos):     39(ptr) Variable Output
+              44:     20(int) Constant 1
+              47:             TypeArray 7(fvec4) 10
+              48:             TypePointer Input 47
+       49(i_pos):     48(ptr) Variable Input
+              50:             TypePointer Input 7(fvec4)
+              57:     20(int) Constant 2
+       67(PS_IN):             TypeStruct
+              68:             TypeArray 67(PS_IN) 10
+              69:             TypePointer Input 68
+           70(i):     69(ptr) Variable Input
+      71(GS_OUT):             TypeStruct
+              72:             TypePointer Output 71(GS_OUT)
+          73(ts):     72(ptr) Variable Output
          4(main):           2 Function None 3
                5:             Label
-           43(i):     12(ptr) Variable Function
-          58(ts):     14(ptr) Variable Function
-       59(param):     12(ptr) Variable Function
-       61(param):     14(ptr) Variable Function
-              48:     47(ptr) AccessChain 46(i_pos) 23
-              49:    7(fvec4) Load 48
-              50:     35(ptr) AccessChain 43(i) 23 23
-                              Store 50 49
-              51:     47(ptr) AccessChain 46(i_pos) 41
+           46(i):     12(ptr) Variable Function
+          61(ts):     14(ptr) Variable Function
+       62(param):     12(ptr) Variable Function
+       64(param):     14(ptr) Variable Function
+              51:     50(ptr) AccessChain 49(i_pos) 23
               52:    7(fvec4) Load 51
-              53:     35(ptr) AccessChain 43(i) 41 23
+              53:     35(ptr) AccessChain 46(i) 23 23
                               Store 53 52
-              55:     47(ptr) AccessChain 46(i_pos) 54
-              56:    7(fvec4) Load 55
-              57:     35(ptr) AccessChain 43(i) 54 23
-                              Store 57 56
-              60:          11 Load 43(i)
-                              Store 59(param) 60
-              62:           2 FunctionCall 18(@main(struct-PS_IN-vf41[3];struct-GS_OUT-vf41;) 59(param) 61(param)
+              54:     50(ptr) AccessChain 49(i_pos) 44
+              55:    7(fvec4) Load 54
+              56:     35(ptr) AccessChain 46(i) 44 23
+                              Store 56 55
+              58:     50(ptr) AccessChain 49(i_pos) 57
+              59:    7(fvec4) Load 58
+              60:     35(ptr) AccessChain 46(i) 57 23
+                              Store 60 59
+              63:          11 Load 46(i)
+                              Store 62(param) 63
+              65:           2 FunctionCall 18(@main(struct-PS_IN-vf41[3];struct-GS_OUT-vf41;) 62(param) 64(param)
+              66:  13(GS_OUT) Load 64(param)
+                              Store 61(ts) 66
                               Return
                               FunctionEnd
 18(@main(struct-PS_IN-vf41[3];struct-GS_OUT-vf41;):           2 Function None 15
@@ -284,14 +307,15 @@ output primitive = triangle_strip
               37:    7(fvec4)   Load 36
               38:     35(ptr)   AccessChain 33(o) 23
                                 Store 38 37
-              39:  13(GS_OUT)   Load 33(o)
-                                Store 17(ts) 39
+              41:     35(ptr)   AccessChain 33(o) 23
+              42:    7(fvec4)   Load 41
+                                Store 40(ts_pos) 42
                                 EmitVertex
                                 Branch 27
               27:               Label
-              40:     20(int)   Load 22(x)
-              42:     20(int)   IAdd 40 41
-                                Store 22(x) 42
+              43:     20(int)   Load 22(x)
+              45:     20(int)   IAdd 43 44
+                                Store 22(x) 45
                                 Branch 24
               26:             Label
                               Return
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.trivial.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.trivial.vert.out
index 822f819..05dcb26 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.trivial.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.struct.split.trivial.vert.out
@@ -1,5 +1,5 @@
 hlsl.struct.split.trivial.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:16  Function Definition: @main(struct-VS_INPUT-vf41;vf4; ( temp structure{ temp 4-component vector of float Pos})
 0:16    Function Parameters: 
@@ -50,7 +50,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:16  Function Definition: @main(struct-VS_INPUT-vf41;vf4; ( temp structure{ temp 4-component vector of float Pos})
 0:16    Function Parameters: 
@@ -105,6 +105,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 31 35 38 47
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "VS_INPUT"
                               MemberName 8(VS_INPUT) 0  "Pos_in"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structIoFourWay.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structIoFourWay.frag.out
index 64c60ed..cb26bb0 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structIoFourWay.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structIoFourWay.frag.out
@@ -1,5 +1,5 @@
 hlsl.structIoFourWay.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 using depth_greater
 0:? Sequence
@@ -57,7 +57,7 @@ using depth_greater
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 using depth_greater
 0:? Sequence
@@ -121,6 +121,7 @@ using depth_greater
                               EntryPoint Fragment 4  "main" 21 43 46 49 53
                               ExecutionMode 4 OriginUpperLeft
                               ExecutionMode 4 DepthGreater
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "T"
                               MemberName 8(T) 0  "f"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structStructName.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structStructName.frag.out
index 0fc4032..3b28805 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structStructName.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structStructName.frag.out
@@ -1,5 +1,5 @@
 hlsl.structStructName.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: @main( ( temp int)
@@ -23,7 +23,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: @main( ( temp int)
@@ -52,6 +52,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 20
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "@main("
                               Name 10  "S"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structarray.flatten.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structarray.flatten.frag.out
index 7c84d37..d70af80 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structarray.flatten.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structarray.flatten.frag.out
@@ -1,5 +1,5 @@
 hlsl.structarray.flatten.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:23  Function Definition: @main(struct-PS_OUTPUT-vf41; ( temp void)
@@ -64,7 +64,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:23  Function Definition: @main(struct-PS_OUTPUT-vf41; ( temp void)
@@ -135,6 +135,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 51
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structarray.flatten.geom.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structarray.flatten.geom.out
index 1af304d..702f06e 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structarray.flatten.geom.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structarray.flatten.geom.out
@@ -1,5 +1,5 @@
 hlsl.structarray.flatten.geom
-Shader version: 450
+Shader version: 500
 invocations = -1
 max_vertices = 4
 input primitive = lines
@@ -47,9 +47,31 @@ output primitive = triangle_strip
 0:21          Constant:
 0:21            0 (const int)
 0:22      Sequence
-0:22        move second child to first child ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
-0:22          'outStream' ( out structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
-0:22          'vout' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22        Sequence
+0:22          move second child to first child ( temp 4-component vector of float)
+0:?             'outStream_position' ( out 4-component vector of float Position)
+0:22            position: direct index for structure ( temp 4-component vector of float)
+0:22              'vout' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                0 (const int)
+0:22          move second child to first child ( temp 4-component vector of float)
+0:22            color: direct index for structure ( temp 4-component vector of float)
+0:22              'outStream' (layout( location=0) out structure{ temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                0 (const int)
+0:22            color: direct index for structure ( temp 4-component vector of float)
+0:22              'vout' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                1 (const int)
+0:22          move second child to first child ( temp 2-component vector of float)
+0:22            uv: direct index for structure ( temp 2-component vector of float)
+0:22              'outStream' (layout( location=0) out structure{ temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                1 (const int)
+0:22            uv: direct index for structure ( temp 2-component vector of float)
+0:22              'vout' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                2 (const int)
 0:22        EmitVertex ( temp void)
 0:16  Function Definition: main( ( temp void)
 0:16    Function Parameters: 
@@ -62,12 +84,13 @@ output primitive = triangle_strip
 0:?         'outStream' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
 0:?   Linker Objects
 0:?     'vin' (layout( location=0) in 2-element array of structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:?     'outStream' (layout( location=0) out structure{ temp 4-component vector of float color,  temp 2-component vector of float uv})
 
 
 Linked geometry stage:
 
 
-Shader version: 450
+Shader version: 500
 invocations = 1
 max_vertices = 4
 input primitive = lines
@@ -115,9 +138,31 @@ output primitive = triangle_strip
 0:21          Constant:
 0:21            0 (const int)
 0:22      Sequence
-0:22        move second child to first child ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
-0:22          'outStream' ( out structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
-0:22          'vout' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22        Sequence
+0:22          move second child to first child ( temp 4-component vector of float)
+0:?             'outStream_position' ( out 4-component vector of float Position)
+0:22            position: direct index for structure ( temp 4-component vector of float)
+0:22              'vout' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                0 (const int)
+0:22          move second child to first child ( temp 4-component vector of float)
+0:22            color: direct index for structure ( temp 4-component vector of float)
+0:22              'outStream' (layout( location=0) out structure{ temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                0 (const int)
+0:22            color: direct index for structure ( temp 4-component vector of float)
+0:22              'vout' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                1 (const int)
+0:22          move second child to first child ( temp 2-component vector of float)
+0:22            uv: direct index for structure ( temp 2-component vector of float)
+0:22              'outStream' (layout( location=0) out structure{ temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                1 (const int)
+0:22            uv: direct index for structure ( temp 2-component vector of float)
+0:22              'vout' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:22              Constant:
+0:22                2 (const int)
 0:22        EmitVertex ( temp void)
 0:16  Function Definition: main( ( temp void)
 0:16    Function Parameters: 
@@ -130,19 +175,21 @@ output primitive = triangle_strip
 0:?         'outStream' ( temp structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
 0:?   Linker Objects
 0:?     'vin' (layout( location=0) in 2-element array of structure{ temp 4-component vector of float position,  temp 4-component vector of float color,  temp 2-component vector of float uv})
+0:?     'outStream' (layout( location=0) out structure{ temp 4-component vector of float color,  temp 2-component vector of float uv})
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 47
+// Id's are bound by 61
 
                               Capability Geometry
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Geometry 4  "main" 40
+                              EntryPoint Geometry 4  "main" 38 43 53
                               ExecutionMode 4 InputLines
                               ExecutionMode 4 Invocations 1
                               ExecutionMode 4 OutputTriangleStrip
                               ExecutionMode 4 OutputVertices 4
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "VertexData"
                               MemberName 9(VertexData) 0  "position"
@@ -156,12 +203,19 @@ output primitive = triangle_strip
                               Name 17  "vin"
                               Name 18  "outStream"
                               Name 21  "vout"
-                              Name 38  "vin"
-                              Name 40  "vin"
-                              Name 42  "outStream"
-                              Name 43  "param"
-                              Name 45  "param"
-                              Decorate 40(vin) Location 0
+                              Name 38  "outStream_position"
+                              Name 41  "PS_IN"
+                              MemberName 41(PS_IN) 0  "color"
+                              MemberName 41(PS_IN) 1  "uv"
+                              Name 43  "outStream"
+                              Name 51  "vin"
+                              Name 53  "vin"
+                              Name 55  "outStream"
+                              Name 56  "param"
+                              Name 58  "param"
+                              Decorate 38(outStream_position) BuiltIn Position
+                              Decorate 43(outStream) Location 0
+                              Decorate 53(vin) Location 0
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeFloat 32
@@ -181,19 +235,27 @@ output primitive = triangle_strip
               28:     22(int) Constant 2
               29:             TypePointer Function 8(fvec2)
               33:     22(int) Constant 0
-              39:             TypePointer Input 12
-         40(vin):     39(ptr) Variable Input
+              37:             TypePointer Output 7(fvec4)
+38(outStream_position):     37(ptr) Variable Output
+       41(PS_IN):             TypeStruct 7(fvec4) 8(fvec2)
+              42:             TypePointer Output 41(PS_IN)
+   43(outStream):     42(ptr) Variable Output
+              49:             TypePointer Output 8(fvec2)
+              52:             TypePointer Input 12
+         53(vin):     52(ptr) Variable Input
          4(main):           2 Function None 3
                5:             Label
-         38(vin):     13(ptr) Variable Function
-   42(outStream):     15(ptr) Variable Function
-       43(param):     13(ptr) Variable Function
-       45(param):     15(ptr) Variable Function
-              41:          12 Load 40(vin)
-                              Store 38(vin) 41
-              44:          12 Load 38(vin)
-                              Store 43(param) 44
-              46:           2 FunctionCall 19(@main(struct-VertexData-vf4-vf4-vf21[2];struct-PS_IN-vf4-vf4-vf21;) 43(param) 45(param)
+         51(vin):     13(ptr) Variable Function
+   55(outStream):     15(ptr) Variable Function
+       56(param):     13(ptr) Variable Function
+       58(param):     15(ptr) Variable Function
+              54:          12 Load 53(vin)
+                              Store 51(vin) 54
+              57:          12 Load 51(vin)
+                              Store 56(param) 57
+              59:           2 FunctionCall 19(@main(struct-VertexData-vf4-vf4-vf21[2];struct-PS_IN-vf4-vf4-vf21;) 56(param) 58(param)
+              60:   14(PS_IN) Load 58(param)
+                              Store 55(outStream) 60
                               Return
                               FunctionEnd
 19(@main(struct-VertexData-vf4-vf4-vf21[2];struct-PS_IN-vf4-vf4-vf21;):           2 Function None 16
@@ -213,8 +275,17 @@ output primitive = triangle_strip
               35:    7(fvec4) Load 34
               36:     24(ptr) AccessChain 21(vout) 33
                               Store 36 35
-              37:   14(PS_IN) Load 21(vout)
-                              Store 18(outStream) 37
+              39:     24(ptr) AccessChain 21(vout) 33
+              40:    7(fvec4) Load 39
+                              Store 38(outStream_position) 40
+              44:     24(ptr) AccessChain 21(vout) 23
+              45:    7(fvec4) Load 44
+              46:     37(ptr) AccessChain 43(outStream) 33
+                              Store 46 45
+              47:     29(ptr) AccessChain 21(vout) 28
+              48:    8(fvec2) Load 47
+              50:     49(ptr) AccessChain 43(outStream) 23
+                              Store 50 48
                               EmitVertex
                               Return
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.atomics.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.atomics.frag.out
index a463a88..c3a3f87 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.atomics.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.atomics.frag.out
@@ -1,5 +1,5 @@
 hlsl.structbuffer.atomics.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -238,7 +238,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -482,6 +482,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 80 83
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "@main(u1;"
                               Name 11  "pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.byte.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.byte.frag.out
index f388f87..6541946 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.byte.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.byte.frag.out
@@ -1,5 +1,5 @@
 hlsl.structbuffer.byte.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -163,7 +163,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -332,6 +332,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 107 110
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "@main(u1;"
                               Name 11  "pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.coherent.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.coherent.frag.out
index f5210c0..fab95f7 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.coherent.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.coherent.frag.out
@@ -1,5 +1,5 @@
 hlsl.structbuffer.coherent.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -89,7 +89,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -184,6 +184,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 71 74
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "@main(u1;"
                               Name 11  "pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.fn.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.fn.frag.out
index 8ed27f6..762ea80 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.fn.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.fn.frag.out
@@ -1,5 +1,5 @@
 hlsl.structbuffer.fn.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: get(block--vu4[0]1;u1; ( temp 4-component vector of uint)
@@ -67,7 +67,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: get(block--vu4[0]1;u1; ( temp 4-component vector of uint)
@@ -140,6 +140,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 59 62
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  ""
                               MemberName 9 0  "@data"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.frag.out
index 3c8b114..634c1b8 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.frag.out
@@ -1,5 +1,5 @@
 hlsl.structbuffer.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -95,7 +95,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -196,6 +196,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 87 90
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "@main(u1;"
                               Name 11  "pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.rw.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.rw.frag.out
index bfe4c39..b1bba58 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.rw.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.rw.frag.out
@@ -1,5 +1,5 @@
 hlsl.structbuffer.rw.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -89,7 +89,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:12  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -184,6 +184,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 71 74
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "@main(u1;"
                               Name 11  "pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.rwbyte.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.rwbyte.frag.out
index e99bc43..14c2eb6 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.rwbyte.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structbuffer.rwbyte.frag.out
@@ -1,5 +1,5 @@
 hlsl.structbuffer.rwbyte.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -504,7 +504,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:5  Function Definition: @main(u1; ( temp 4-component vector of float)
@@ -1014,6 +1014,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 233 236
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 12  "@main(u1;"
                               Name 11  "pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structin.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structin.vert.out
index c621941..8378de2 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structin.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.structin.vert.out
@@ -1,5 +1,5 @@
 hlsl.structin.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:8  Function Definition: @main(vf4;struct-VI-vf4[2]-vu2-vf41;vf4; ( temp structure{ temp 2-element array of 4-component vector of float m,  temp 2-component vector of uint coord,  temp 4-component vector of float b})
 0:8    Function Parameters: 
@@ -132,7 +132,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:8  Function Definition: @main(vf4;struct-VI-vf4[2]-vu2-vf41;vf4; ( temp structure{ temp 2-element array of 4-component vector of float m,  temp 2-component vector of uint coord,  temp 4-component vector of float b})
 0:8    Function Parameters: 
@@ -269,6 +269,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 48 51 54 58 62 66 78
+                              Source HLSL 500
                               Name 4  "main"
                               Name 13  "VI"
                               MemberName 13(VI) 0  "m"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.switch.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.switch.frag.out
index 8ee9d7c..c77da93 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.switch.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.switch.frag.out
@@ -1,5 +1,5 @@
 hlsl.switch.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4;i1;i1; ( temp 4-component vector of float)
@@ -149,7 +149,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4;i1;i1; ( temp 4-component vector of float)
@@ -304,6 +304,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 88 92 95 98
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 15  "@PixelShaderFunction(vf4;i1;i1;"
                               Name 12  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.swizzle.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.swizzle.frag.out
index 4674b76..59b432b 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.swizzle.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.swizzle.frag.out
@@ -1,5 +1,5 @@
 hlsl.swizzle.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -40,7 +40,7 @@ Linked fragment stage:
 
 WARNING: Linking fragment stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -85,6 +85,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "ShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.templatetypes.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.templatetypes.frag.out
index 11ea394..9d0cced 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.templatetypes.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.templatetypes.frag.out
@@ -1,5 +1,5 @@
 hlsl.templatetypes.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: @PixelShaderFunction( ( temp float)
@@ -255,7 +255,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: @PixelShaderFunction( ( temp float)
@@ -517,6 +517,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 151
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 8  "@PixelShaderFunction("
                               Name 12  "r00"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.this.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.this.frag.out
index 8619468..b35e267 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.this.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.this.frag.out
@@ -1,5 +1,5 @@
 hlsl.this.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -121,7 +121,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Sequence
@@ -248,6 +248,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 96
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "type1"
                               MemberName 9(type1) 0  "bar"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.tx.bracket.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.tx.bracket.frag.out
index 9e1db24..12d5f17 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.tx.bracket.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.tx.bracket.frag.out
@@ -1,5 +1,5 @@
 hlsl.tx.bracket.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: Fn1(vi4; ( temp 4-component vector of int)
@@ -212,7 +212,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:38  Function Definition: Fn1(vi4; ( temp 4-component vector of int)
@@ -431,6 +431,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 164
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "Fn1(vi4;"
                               Name 10  "x"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.type.half.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.type.half.frag.out
index 104c739..889d79b 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.type.half.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.type.half.frag.out
@@ -1,5 +1,5 @@
 hlsl.type.half.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: @main( ( temp 4-component vector of float)
@@ -55,7 +55,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:3  Function Definition: @main( ( temp 4-component vector of float)
@@ -116,6 +116,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 34
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 9  "@main("
                               Name 12  "h0"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.type.identifier.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.type.identifier.frag.out
index e8a763c..1977340 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.type.identifier.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.type.identifier.frag.out
@@ -1,5 +1,5 @@
 hlsl.type.identifier.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:6  Function Definition: fn(f1; ( temp float)
@@ -115,7 +115,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:6  Function Definition: fn(f1; ( temp float)
@@ -236,6 +236,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 95
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 10  "fn(f1;"
                               Name 9  "float"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.typeGraphCopy.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.typeGraphCopy.vert.out
index 7d55c44..217e7ec 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.typeGraphCopy.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.typeGraphCopy.vert.out
@@ -1,5 +1,5 @@
 hlsl.typeGraphCopy.vert
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:22  Function Definition: @main( ( temp float)
 0:22    Function Parameters: 
@@ -32,7 +32,7 @@ Shader version: 450
 Linked vertex stage:
 
 
-Shader version: 450
+Shader version: 500
 0:? Sequence
 0:22  Function Definition: @main( ( temp float)
 0:22    Function Parameters: 
@@ -69,6 +69,7 @@ Shader version: 450
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
                               EntryPoint Vertex 4  "main" 26
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "@main("
                               Name 11  "N1"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.typedef.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.typedef.frag.out
index 6edb191..1c1a351 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.typedef.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.typedef.frag.out
@@ -1,5 +1,5 @@
 hlsl.typedef.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: ShaderFunction(vf4;i1; ( temp 4-component vector of float)
@@ -41,7 +41,7 @@ Linked fragment stage:
 
 WARNING: Linking fragment stage: Entry point not found
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:4  Function Definition: ShaderFunction(vf4;i1; ( temp 4-component vector of float)
@@ -87,6 +87,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction"
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 14  "ShaderFunction(vf4;i1;"
                               Name 12  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.void.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.void.frag.out
index 584f378..1df35a6 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.void.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.void.frag.out
@@ -1,5 +1,5 @@
 hlsl.void.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Function Definition: foo1( ( temp void)
@@ -28,7 +28,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:1  Function Definition: foo1( ( temp void)
@@ -62,6 +62,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 22
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 6  "foo1("
                               Name 8  "foo2("
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.whileLoop.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.whileLoop.frag.out
index 2f68da1..cd47dc7 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.whileLoop.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/hlsl.whileLoop.frag.out
@@ -1,5 +1,5 @@
 hlsl.whileLoop.frag
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -49,7 +49,7 @@ gl_FragCoord origin is upper left
 Linked fragment stage:
 
 
-Shader version: 450
+Shader version: 500
 gl_FragCoord origin is upper left
 0:? Sequence
 0:2  Function Definition: @PixelShaderFunction(vf4; ( temp 4-component vector of float)
@@ -104,6 +104,7 @@ gl_FragCoord origin is upper left
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "PixelShaderFunction" 45 48
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "PixelShaderFunction"
                               Name 11  "@PixelShaderFunction(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/remap.hlsl.sample.basic.none.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/remap.hlsl.sample.basic.none.frag.out
index 5af75db..af309a1 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/remap.hlsl.sample.basic.none.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/remap.hlsl.sample.basic.none.frag.out
@@ -11,6 +11,7 @@ WARNING: 0:4: 'immediate sampler state' : unimplemented
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 188 192
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/remap.hlsl.templatetypes.none.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/remap.hlsl.templatetypes.none.frag.out
index c5d8a01..dd2917e 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/remap.hlsl.templatetypes.none.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/remap.hlsl.templatetypes.none.frag.out
@@ -9,6 +9,7 @@ remap.hlsl.templatetypes.none.frag
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 153 156
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "@main(vf4;"
                               Name 10  "input"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.bool.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.bool.vert.out
index 2810ff1..1e87c42 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.bool.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.bool.vert.out
@@ -3,7 +3,7 @@ Warning, version 450 is not yet complete; most version-specific features are pre
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 44
+// Id's are bound by 46
 
                               Capability Shader
                1:             ExtInstImport  "GLSL.std.450"
@@ -56,7 +56,8 @@ Warning, version 450 is not yet complete; most version-specific features are pre
               38:   18(fvec4) ConstantComposite 37 37 37 37
               39:   17(float) Constant 1065353216
               40:   18(fvec4) ConstantComposite 39 39 39 39
-              42:             TypePointer Output 18(fvec4)
+              41:             TypeVector 6(bool) 4
+              44:             TypePointer Output 18(fvec4)
          4(main):           2 Function None 3
                5:             Label
        30(param):      7(ptr) Variable Function
@@ -65,9 +66,10 @@ Warning, version 450 is not yet complete; most version-specific features are pre
               35:     6(bool) INotEqual 33 34
                               Store 30(param) 35
               36:     6(bool) FunctionCall 10(foo(b1;) 30(param)
-              41:   18(fvec4) Select 36 38 40
-              43:     42(ptr) AccessChain 24 26
-                              Store 43 41
+              42:   41(bvec4) CompositeConstruct 36 36 36 36
+              43:   18(fvec4) Select 42 38 40
+              45:     44(ptr) AccessChain 24 26
+                              Store 45 43
                               Return
                               FunctionEnd
      10(foo(b1;):     6(bool) Function None 8
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.buffer.autoassign.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.buffer.autoassign.frag.out
index 8fb6215..2a316ee 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.buffer.autoassign.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.buffer.autoassign.frag.out
@@ -8,6 +8,7 @@ spv.buffer.autoassign.frag
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 47
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.deepRvalue.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.deepRvalue.frag.out
index b6613ef..b8f4d06 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.deepRvalue.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.deepRvalue.frag.out
@@ -1,12 +1,12 @@
 spv.deepRvalue.frag
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 150
+// Id's are bound by 152
 
                               Capability Shader
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Fragment 4  "main" 144
+                              EntryPoint Fragment 4  "main" 146
                               ExecutionMode 4 OriginUpperLeft
                               Source GLSL 330
                               Name 4  "main"
@@ -21,12 +21,12 @@ spv.deepRvalue.frag
                               Name 106  "h"
                               Name 107  "i"
                               Name 111  "samp2D"
-                              Name 129  "str"
-                              MemberName 129(str) 0  "a"
-                              MemberName 129(str) 1  "b"
-                              MemberName 129(str) 2  "c"
-                              Name 131  "t"
-                              Name 144  "gl_FragColor"
+                              Name 131  "str"
+                              MemberName 131(str) 0  "a"
+                              MemberName 131(str) 1  "b"
+                              MemberName 131(str) 2  "c"
+                              Name 133  "t"
+                              Name 146  "gl_FragColor"
                               Decorate 111(samp2D) DescriptorSet 0
                2:             TypeVoid
                3:             TypeFunction 2
@@ -77,19 +77,20 @@ spv.deepRvalue.frag
              115:  113(fvec2) ConstantComposite 114 114
              119:    6(float) Constant 1036831949
              120:             TypeBool
-             128:             TypeArray 113(fvec2) 84
-        129(str):             TypeStruct 81(int) 128 120(bool)
-             130:             TypePointer Function 129(str)
-             132:  113(fvec2) ConstantComposite 10 11
-             133:    6(float) Constant 1082130432
-             134:  113(fvec2) ConstantComposite 133 12
-             135:    6(float) Constant 1086324736
-             136:  113(fvec2) ConstantComposite 135 13
-             137:         128 ConstantComposite 132 134 136
-             138:   120(bool) ConstantTrue
-             139:    129(str) ConstantComposite 82 137 138
-             143:             TypePointer Output 7(fvec4)
-144(gl_FragColor):    143(ptr) Variable Output
+             124:             TypeVector 120(bool) 4
+             130:             TypeArray 113(fvec2) 84
+        131(str):             TypeStruct 81(int) 130 120(bool)
+             132:             TypePointer Function 131(str)
+             134:  113(fvec2) ConstantComposite 10 11
+             135:    6(float) Constant 1082130432
+             136:  113(fvec2) ConstantComposite 135 12
+             137:    6(float) Constant 1086324736
+             138:  113(fvec2) ConstantComposite 137 13
+             139:         130 ConstantComposite 134 136 138
+             140:   120(bool) ConstantTrue
+             141:    131(str) ConstantComposite 82 139 140
+             145:             TypePointer Output 7(fvec4)
+146(gl_FragColor):    145(ptr) Variable Output
          4(main):           2 Function None 3
                5:             Label
            35(m):     34(ptr) Variable Function
@@ -98,7 +99,7 @@ spv.deepRvalue.frag
            87(g):     79(ptr) Variable Function
           106(h):     79(ptr) Variable Function
           107(i):     79(ptr) Variable Function
-          131(t):    130(ptr) Variable Function
+          133(t):    132(ptr) Variable Function
                               Store 9(v1) 14
                               Store 15(v2) 20
                               Store 21(v3) 26
@@ -174,21 +175,22 @@ spv.deepRvalue.frag
              121:   120(bool) FOrdGreaterThan 118 119
              122:    7(fvec4) Load 9(v1)
              123:    7(fvec4) Load 15(v2)
-             124:    7(fvec4) Select 121 122 123
-             125:    6(float) CompositeExtract 124 3
-             126:    6(float) Load 107(i)
-             127:    6(float) FAdd 126 125
-                              Store 107(i) 127
-                              Store 131(t) 139
-             140:    6(float) CompositeExtract 139 1 2 1
-             141:    6(float) Load 107(i)
-             142:    6(float) FAdd 141 140
-                              Store 107(i) 142
-             145:    6(float) Load 80(f)
-             146:    6(float) Load 87(g)
-             147:    6(float) Load 106(h)
-             148:    6(float) Load 107(i)
-             149:    7(fvec4) CompositeConstruct 145 146 147 148
-                              Store 144(gl_FragColor) 149
+             125:  124(bvec4) CompositeConstruct 121 121 121 121
+             126:    7(fvec4) Select 125 122 123
+             127:    6(float) CompositeExtract 126 3
+             128:    6(float) Load 107(i)
+             129:    6(float) FAdd 128 127
+                              Store 107(i) 129
+                              Store 133(t) 141
+             142:    6(float) CompositeExtract 141 1 2 1
+             143:    6(float) Load 107(i)
+             144:    6(float) FAdd 143 142
+                              Store 107(i) 144
+             147:    6(float) Load 80(f)
+             148:    6(float) Load 87(g)
+             149:    6(float) Load 106(h)
+             150:    6(float) Load 107(i)
+             151:    7(fvec4) CompositeConstruct 147 148 149 150
+                              Store 146(gl_FragColor) 151
                               Return
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.image.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.image.frag.out
index ee29baf..b4f673b 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.image.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.image.frag.out
@@ -3,7 +3,7 @@ Warning, version 450 is not yet complete; most version-specific features are pre
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 374
+// Id's are bound by 376
 
                               Capability Shader
                               Capability SampledRect
@@ -16,7 +16,7 @@ Warning, version 450 is not yet complete; most version-specific features are pre
                               Capability StorageImageWriteWithoutFormat
                1:             ExtInstImport  "GLSL.std.450"
                               MemoryModel Logical GLSL450
-                              EntryPoint Fragment 4  "main" 132 142 152 248 362 373
+                              EntryPoint Fragment 4  "main" 132 142 152 248 362 375
                               ExecutionMode 4 OriginUpperLeft
                               Source GLSL 450
                               Name 4  "main"
@@ -42,7 +42,7 @@ Warning, version 450 is not yet complete; most version-specific features are pre
                               Name 248  "value"
                               Name 357  "wo2D"
                               Name 362  "fragData"
-                              Name 373  "ic4D"
+                              Name 375  "ic4D"
                               Decorate 15(i1D) DescriptorSet 0
                               Decorate 15(i1D) Binding 0
                               Decorate 27(i2D) DescriptorSet 0
@@ -76,7 +76,7 @@ Warning, version 450 is not yet complete; most version-specific features are pre
                               Decorate 357(wo2D) DescriptorSet 0
                               Decorate 357(wo2D) Binding 1
                               Decorate 357(wo2D) NonReadable
-                              Decorate 373(ic4D) Flat
+                              Decorate 375(ic4D) Flat
                2:             TypeVoid
                3:             TypeFunction 2
                6:             TypeInt 32 1
@@ -164,9 +164,10 @@ Warning, version 450 is not yet complete; most version-specific features are pre
              361:             TypePointer Output 125(fvec4)
    362(fragData):    361(ptr) Variable Output
              367:             TypeBool
-             371:             TypeVector 6(int) 4
-             372:             TypePointer Input 371(ivec4)
-       373(ic4D):    372(ptr) Variable Input
+             370:             TypeVector 367(bool) 4
+             373:             TypeVector 6(int) 4
+             374:             TypePointer Input 373(ivec4)
+       375(ic4D):    374(ptr) Variable Input
          4(main):           2 Function None 3
                5:             Label
            9(iv):      8(ptr) Variable Function
@@ -503,7 +504,8 @@ Warning, version 450 is not yet complete; most version-specific features are pre
              366:     18(int) Bitcast 365
              368:   367(bool) INotEqual 363 366
              369:  125(fvec4) Load 127(v)
-             370:  125(fvec4) Select 368 369 129
-                              Store 362(fragData) 370
+             371:  370(bvec4) CompositeConstruct 368 368 368 368
+             372:  125(fvec4) Select 371 369 129
+                              Store 362(fragData) 372
                               Return
                               FunctionEnd
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.autoassign-2.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.autoassign-2.frag.out
index c8273dd..f09a468 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.autoassign-2.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.autoassign-2.frag.out
@@ -8,6 +8,7 @@ spv.register.autoassign-2.frag
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 44
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.autoassign.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.autoassign.frag.out
index ae048ce..4874d90 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.autoassign.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.autoassign.frag.out
@@ -9,6 +9,7 @@ spv.register.autoassign.frag
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main_ep" 151
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main_ep"
                               Name 9  "Func1("
                               Name 11  "Func2("
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.noautoassign.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.noautoassign.frag.out
index afadc48..71c0b37 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.noautoassign.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.register.noautoassign.frag.out
@@ -9,6 +9,7 @@ spv.register.noautoassign.frag
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main_ep" 151
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main_ep"
                               Name 9  "Func1("
                               Name 11  "Func2("
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.rw.autoassign.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.rw.autoassign.frag.out
index e2b544a..9069666 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.rw.autoassign.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.rw.autoassign.frag.out
@@ -10,6 +10,7 @@ spv.rw.autoassign.frag
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 39
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 8  "PS_OUTPUT"
                               MemberName 8(PS_OUTPUT) 0  "Color"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.specConstantOperations.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.specConstantOperations.vert.out
index 597820b..ab83e61 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.specConstantOperations.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.specConstantOperations.vert.out
@@ -3,7 +3,7 @@ Warning, version 450 is not yet complete; most version-specific features are pre
 
 // Module Version 10000
 // Generated by (magic number): 80001
-// Id's are bound by 160
+// Id's are bound by 162
 
                               Capability Shader
                               Capability Float64
@@ -168,7 +168,9 @@ Warning, version 450 is not yet complete; most version-specific features are pre
              156:  154(fvec2) ConstantComposite 155 155
              157:   39(float) Constant 1073741824
              158:  154(fvec2) ConstantComposite 157 157
-             159:  154(fvec2) SpecConstantOp 169 153 156 158
+             159:             TypeVector 22(bool) 2
+             160:  159(bvec2) SpecConstantComposite 153 153
+             161:  154(fvec2) SpecConstantOp 169 160 156 158
          4(main):           2 Function None 3
                5:             Label
                               Return
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.ssbo.autoassign.frag.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.ssbo.autoassign.frag.out
index 8d10b6b..7ec2a30 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.ssbo.autoassign.frag.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/spv.ssbo.autoassign.frag.out
@@ -8,6 +8,7 @@ spv.ssbo.autoassign.frag
                               MemoryModel Logical GLSL450
                               EntryPoint Fragment 4  "main" 88 91
                               ExecutionMode 4 OriginUpperLeft
+                              Source HLSL 500
                               Name 4  "main"
                               Name 11  "@main(vf4;"
                               Name 10  "pos"
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/tokenPaste.vert.out b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/tokenPaste.vert.out
index e544d9e..acc2ced 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/tokenPaste.vert.out
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/baseResults/tokenPaste.vert.out
@@ -8,7 +8,8 @@ ERROR: 0:69: '##' : combined token is invalid
 ERROR: 0:82: 'macro expansion' : Too few args in Macro rec
 ERROR: 0:82: '##' : unexpected location 
 ERROR: 0:82: '##' : unexpected location 
-ERROR: 8 compilation errors.  No code generated.
+ERROR: 0:86: '##' : unexpected location; end of argument 
+ERROR: 9 compilation errors.  No code generated.
 
 
 Shader version: 450
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/hlsl.intrinsics.frag b/3rdparty/bgfx/3rdparty/glslang/Test/hlsl.intrinsics.frag
index 15db637..029b156 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/hlsl.intrinsics.frag
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/hlsl.intrinsics.frag
@@ -52,7 +52,6 @@ float PixelShaderFunctionS(float inF0, float inF1, float inF2, uint inU0, uint i
     // TODO: fma(inD0, inD1, inD2);
     float r033 = fmod(inF0, inF1);
     float r034 = frac(inF0);
-    float r035 = frexp(inF0, inF1);
     float r036 = fwidth(inF0);
     bool r037 = isinf(inF0);
     bool r038 = isnan(inF0);
@@ -136,7 +135,6 @@ float2 PixelShaderFunction2(float2 inF0, float2 inF1, float2 inF2, uint2 inU0, u
     // TODO: fma(inD0, inD1, inD2);
     float2 r035 = fmod(inF0, inF1);
     float2 r036 = frac(inF0);
-    float2 r037 = frexp(inF0, inF1);
     float2 r038 = fwidth(inF0);
     bool2 r039 = isinf(inF0);
     bool2 r040 = isnan(inF0);
@@ -217,7 +215,6 @@ float3 PixelShaderFunction3(float3 inF0, float3 inF1, float3 inF2, uint3 inU0, u
     // TODO: fma(inD0, inD1, inD2);
     float3 r036 = fmod(inF0, inF1);
     float3 r037 = frac(inF0);
-    float3 r038 = frexp(inF0, inF1);
     float3 r039 = fwidth(inF0);
     bool3 r040 = isinf(inF0);
     bool3 r041 = isnan(inF0);
@@ -299,7 +296,6 @@ float4 PixelShaderFunction(float4 inF0, float4 inF1, float4 inF2, uint4 inU0, ui
     // TODO: fma(inD0, inD1, inD2);
     float4 r036 = fmod(inF0, inF1);
     float4 r037 = frac(inF0);
-    float4 r038 = frexp(inF0, inF1);
     float4 r039 = fwidth(inF0);
     bool4 r040 = isinf(inF0);
     bool4 r041 = isnan(inF0);
@@ -369,7 +365,6 @@ float4 PixelShaderFunction(float4 inF0, float4 inF1, float4 inF2, uint4 inU0, ui
     MT r021 = floor(inF0);                  \
     MT r022 = fmod(inF0, inF1);             \
     MT r023 = frac(inF0);                   \
-    MT r024 = frexp(inF0, inF1);            \
     MT r025 = fwidth(inF0);                 \
     MT r026 = ldexp(inF0, inF1);            \
     MT r026a = lerp(inF0, inF1, inF2);      \
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/hlsl.intrinsics.vert b/3rdparty/bgfx/3rdparty/glslang/Test/hlsl.intrinsics.vert
index de9476c..c442f16 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/hlsl.intrinsics.vert
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/hlsl.intrinsics.vert
@@ -28,7 +28,6 @@ float VertexShaderFunctionS(float inF0, float inF1, float inF2, uint inU0, uint
     // TODO: fma(inD0, inD1, inD2);
     fmod(inF0, inF1);
     frac(inF0);
-    frexp(inF0, inF1);
     isinf(inF0);
     isnan(inF0);
     ldexp(inF0, inF1);
@@ -99,7 +98,6 @@ float2 VertexShaderFunction2(float2 inF0, float2 inF1, float2 inF2, uint2 inU0,
     // TODO: fma(inD0, inD1, inD2);
     fmod(inF0, inF1);
     frac(inF0);
-    frexp(inF0, inF1);
     isinf(inF0);
     isnan(inF0);
     ldexp(inF0, inF1);
@@ -170,7 +168,6 @@ float3 VertexShaderFunction3(float3 inF0, float3 inF1, float3 inF2, uint3 inU0,
     // TODO: fma(inD0, inD1, inD2);
     fmod(inF0, inF1);
     frac(inF0);
-    frexp(inF0, inF1);
     isinf(inF0);
     isnan(inF0);
     ldexp(inF0, inF1);
@@ -241,7 +238,6 @@ float4 VertexShaderFunction4(float4 inF0, float4 inF1, float4 inF2, uint4 inU0,
     // TODO: fma(inD0, inD1, inD2);
     fmod(inF0, inF1);
     frac(inF0);
-    frexp(inF0, inF1);
     isinf(inF0);
     isnan(inF0);
     ldexp(inF0, inF1);
@@ -305,7 +301,6 @@ float4 VertexShaderFunction4(float4 inF0, float4 inF1, float4 inF2, uint4 inU0,
     floor(inF0); \
     fmod(inF0, inF1); \
     frac(inF0); \
-    frexp(inF0, inF1); \
     ldexp(inF0, inF1); \
     lerp(inF0, inF1, inF2); \
     log(inF0); \
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/runtests b/3rdparty/bgfx/3rdparty/glslang/Test/runtests
index 86ffef3..efc449d 100755
--- a/3rdparty/bgfx/3rdparty/glslang/Test/runtests
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/runtests
@@ -73,6 +73,20 @@ echo Running explicit stage test
 $EXE -i -S vert nosuffix > $TARGETDIR/nosuffix.out
 diff -b $BASEDIR/nosuffix.out $TARGETDIR/nosuffix.out || HASERROR=1
 
+#
+# Testing --hlsl-offsets
+#
+echo Running hlsl offsets
+$EXE -i --hlsl-offsets -H spv.hlslOffsets.vert > $TARGETDIR/spv.hlslOffsets.vert.out
+diff -b $BASEDIR/spv.hlslOffsets.vert.out $TARGETDIR/spv.hlslOffsets.vert.out || HASERROR=1
+
+echo Running hlsl offsets
+$EXE -i  --hlsl-offsets -D -e main -H hlsl.hlslOffset.vert > $TARGETDIR/hlsl.hlslOffset.vert.out
+diff -b $BASEDIR/hlsl.hlslOffset.vert.out $TARGETDIR/hlsl.hlslOffset.vert.out || HASERROR=1
+
+#
+# Final checking
+#
 if [ $HASERROR -eq 0 ]
 then
     echo Tests Succeeded.
diff --git a/3rdparty/bgfx/3rdparty/glslang/Test/tokenPaste.vert b/3rdparty/bgfx/3rdparty/glslang/Test/tokenPaste.vert
index 7aa113a..40de6f9 100644
--- a/3rdparty/bgfx/3rdparty/glslang/Test/tokenPaste.vert
+++ b/3rdparty/bgfx/3rdparty/glslang/Test/tokenPaste.vert
@@ -79,4 +79,8 @@ uniform M_OUTER(argPaste);
 uniform M_OUTER2(argPaste);
 
 #define rec(x)##
-rec(rec())
\ No newline at end of file
+rec(rec())
+
+#define bax(bay)
+#define baz bax(/##)
+baz
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/Include/BaseTypes.h b/3rdparty/bgfx/3rdparty/glslang/glslang/Include/BaseTypes.h
index 230a6b9..a805a06 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/Include/BaseTypes.h
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/Include/BaseTypes.h
@@ -219,6 +219,9 @@ enum TBuiltInVariable {
     EbvFragDepthGreater,
     EbvFragDepthLesser,
     EbvStencilRef,
+    EbvGsOutputStream,
+    EbvOutputPatch,
+    EbvInputPatch,
 
     EbvLast
 };
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/Include/Types.h b/3rdparty/bgfx/3rdparty/glslang/glslang/Include/Types.h
index 2207d0b..6f58a52 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/Include/Types.h
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/Include/Types.h
@@ -43,6 +43,8 @@
 #include "../Public/ShaderLang.h"
 #include "arrays.h"
 
+#include <algorithm>
+
 namespace glslang {
 
 const int GlslangMaxTypeLength = 200;  // TODO: need to print block/struct one member per line, so this can stay bounded
@@ -1382,127 +1384,80 @@ public:
         return !isPerVertexAndBuiltIn(language);
     }
     
+    // return true if this type contains any subtype which satisfies the given predicate.
+    template <typename P> 
+    bool contains(P predicate) const
+    {
+        if (predicate(this))
+            return true;
+
+        const auto hasa = [predicate](const TTypeLoc& tl) { return tl.type->contains(predicate); };
+
+        return structure && std::any_of(structure->begin(), structure->end(), hasa);
+    }
+
     // Recursively checks if the type contains the given basic type
     virtual bool containsBasicType(TBasicType checkType) const
     {
-        if (basicType == checkType)
-            return true;
-        if (! structure)
-            return false;
-        for (unsigned int i = 0; i < structure->size(); ++i) {
-            if ((*structure)[i].type->containsBasicType(checkType))
-                return true;
-        }
-        return false;
+        return contains([checkType](const TType* t) { return t->basicType == checkType; } );
     }
 
     // Recursively check the structure for any arrays, needed for some error checks
     virtual bool containsArray() const
     {
-        if (isArray())
-            return true;
-        if (structure == nullptr)
-            return false;
-        for (unsigned int i = 0; i < structure->size(); ++i) {
-            if ((*structure)[i].type->containsArray())
-                return true;
-        }
-        return false;
+        return contains([](const TType* t) { return t->isArray(); } );
     }
 
     // Check the structure for any structures, needed for some error checks
     virtual bool containsStructure() const
     {
-        if (structure == nullptr)
-            return false;
-        for (unsigned int i = 0; i < structure->size(); ++i) {
-            if ((*structure)[i].type->structure)
-                return true;
-        }
-        return false;
+        return contains([this](const TType* t) { return t != this && t->isStruct(); } );
     }
 
     // Recursively check the structure for any implicitly-sized arrays, needed for triggering a copyUp().
     virtual bool containsImplicitlySizedArray() const
     {
-        if (isImplicitlySizedArray())
-            return true;
-        if (structure == nullptr)
-            return false;
-        for (unsigned int i = 0; i < structure->size(); ++i) {
-            if ((*structure)[i].type->containsImplicitlySizedArray())
-                return true;
-        }
-        return false;
+        return contains([](const TType* t) { return t->isImplicitlySizedArray(); } );
     }
 
     virtual bool containsOpaque() const
     {
-        if (isOpaque())
-            return true;
-        if (! structure)
-            return false;
-        for (unsigned int i = 0; i < structure->size(); ++i) {
-            if ((*structure)[i].type->containsOpaque())
-                return true;
-        }
-        return false;
+        return contains([](const TType* t) { return t->isOpaque(); } );
     }
 
     // Recursively checks if the type contains an interstage IO builtin
     virtual bool containsBuiltInInterstageIO(EShLanguage language) const
     {
-        if (isBuiltInInterstageIO(language))
-            return true;
-
-        if (! structure)
-            return false;
-        for (unsigned int i = 0; i < structure->size(); ++i) {
-            if ((*structure)[i].type->containsBuiltInInterstageIO(language))
-                return true;
-        }
-        return false;
+        return contains([language](const TType* t) { return t->isBuiltInInterstageIO(language); } );
     }
 
     virtual bool containsNonOpaque() const
     {
-        // list all non-opaque types
-        switch (basicType) {
-        case EbtVoid:
-        case EbtFloat:
-        case EbtDouble:
+        const auto nonOpaque = [](const TType* t) {
+            switch (t->basicType) {
+            case EbtVoid:
+            case EbtFloat:
+            case EbtDouble:
 #ifdef AMD_EXTENSIONS
-        case EbtFloat16:
+            case EbtFloat16:
 #endif
-        case EbtInt:
-        case EbtUint:
-        case EbtInt64:
-        case EbtUint64:
-        case EbtBool:
+            case EbtInt:
+            case EbtUint:
+            case EbtInt64:
+            case EbtUint64:
+            case EbtBool:
             return true;
-        default:
-            break;
-        }
-        if (! structure)
+            default:
             return false;
-        for (unsigned int i = 0; i < structure->size(); ++i) {
-            if ((*structure)[i].type->containsNonOpaque())
-                return true;
-        }
-        return false;
+            }
+        };
+
+        return contains(nonOpaque);
     }
 
     virtual bool containsSpecializationSize() const
     {
-        if (isArray() && arraySizes->containsNode())
-            return true;
-        if (! structure)
-            return false;
-        for (unsigned int i = 0; i < structure->size(); ++i) {
-            if ((*structure)[i].type->containsSpecializationSize())
-                return true;
-        }
-        return false;
+        return contains([](const TType* t) { return t->isArray() && t->arraySizes->containsNode(); } );
     }
 
     // Array editing methods.  Array descriptors can be shared across
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/Include/revision.h b/3rdparty/bgfx/3rdparty/glslang/glslang/Include/revision.h
index d442909..31b5fee 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/Include/revision.h
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/Include/revision.h
@@ -2,5 +2,5 @@
 // For the version, it uses the latest git tag followed by the number of commits.
 // For the date, it uses the current date (when then script is run).
 
-#define GLSLANG_REVISION "Overload400-PrecQual.1937"
-#define GLSLANG_DATE "24-Mar-2017"
+#define GLSLANG_REVISION "Overload400-PrecQual.1985"
+#define GLSLANG_DATE "07-Apr-2017"
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/Intermediate.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/Intermediate.cpp
index 8abd9e3..85c0151 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/Intermediate.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/Intermediate.cpp
@@ -356,12 +356,12 @@ TIntermTyped* TIntermediate::addUnaryMath(TOperator op, TIntermTyped* child, TSo
     node->updatePrecision();
 
     // If it's a (non-specialization) constant, it must be folded.
-    if (child->getAsConstantUnion())
-        return child->getAsConstantUnion()->fold(op, node->getType());
+    if (node->getOperand()->getAsConstantUnion())
+        return node->getOperand()->getAsConstantUnion()->fold(op, node->getType());
 
     // If it's a specialization constant, the result is too,
     // if the operation is allowed for specialization constants.
-    if (child->getType().getQualifier().isSpecConstant() && isSpecializationOperation(*node))
+    if (node->getOperand()->getType().getQualifier().isSpecConstant() && isSpecializationOperation(*node))
         node->getWritableType().getQualifier().makeSpecConstant();
 
     return node;
@@ -920,7 +920,7 @@ bool TIntermediate::canImplicitlyPromote(TBasicType from, TBasicType to, TOperat
     case EbtUint:
         switch (from) {
         case EbtInt:
-            return version >= 400;
+            return version >= 400 || (source == EShSourceHlsl);
         case EbtUint:
             return true;
         case EbtBool:
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseContextBase.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseContextBase.cpp
index d2b6b26..44fc0b4 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseContextBase.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseContextBase.cpp
@@ -531,7 +531,7 @@ void TParseContextBase::parseSwizzleSelector(const TSourceLoc& loc, const TStrin
 // Make the passed-in variable information become a member of the
 // global uniform block.  If this doesn't exist yet, make it.
 //
-void TParseContextBase::growGlobalUniformBlock(TSourceLoc& loc, TType& memberType, TString& memberName, TTypeList* typeList)
+void TParseContextBase::growGlobalUniformBlock(const TSourceLoc& loc, TType& memberType, const TString& memberName, TTypeList* typeList)
 {
     // Make the global block, if not yet made.
     if (globalUniformBlock == nullptr) {
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseHelper.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseHelper.cpp
index 78f459a..bc43986 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseHelper.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseHelper.cpp
@@ -342,10 +342,8 @@ TIntermTyped* TParseContext::handleBracketDereference(const TSourceLoc& loc, TIn
     TIntermTyped* result = nullptr;
 
     int indexValue = 0;
-    if (index->getQualifier().isFrontEndConstant()) {
+    if (index->getQualifier().isFrontEndConstant())
         indexValue = index->getAsConstantUnion()->getConstArray()[0].getIConst();
-        checkIndex(loc, base->getType(), indexValue);
-    }
 
     variableCheck(base);
     if (! base->isArray() && ! base->isMatrix() && ! base->isVector()) {
@@ -353,10 +351,12 @@ TIntermTyped* TParseContext::handleBracketDereference(const TSourceLoc& loc, TIn
             error(loc, " left of '[' is not of type array, matrix, or vector ", base->getAsSymbolNode()->getName().c_str(), "");
         else
             error(loc, " left of '[' is not of type array, matrix, or vector ", "expression", "");
-    } else if (base->getType().getQualifier().isFrontEndConstant() && index->getQualifier().isFrontEndConstant())
+    } else if (base->getType().getQualifier().isFrontEndConstant() && index->getQualifier().isFrontEndConstant()) {
+        // both base and index are front-end constants
+        checkIndex(loc, base->getType(), indexValue);
         return intermediate.foldDereference(base, indexValue, loc);
-    else {
-        // at least one of base and index is variable...
+    } else {
+        // at least one of base and index is not a front-end constant...
 
         if (base->getAsSymbolNode() && isIoResizeArray(base->getType()))
             handleIoResizeArrayAccess(loc, base);
@@ -364,6 +364,8 @@ TIntermTyped* TParseContext::handleBracketDereference(const TSourceLoc& loc, TIn
         if (index->getQualifier().isFrontEndConstant()) {
             if (base->getType().isImplicitlySizedArray())
                 updateImplicitArraySize(loc, base, indexValue);
+            else
+                checkIndex(loc, base->getType(), indexValue);
             result = intermediate.addIndex(EOpIndexDirect, base, index, loc);
         } else {
             if (base->getType().isImplicitlySizedArray()) {
@@ -2990,7 +2992,7 @@ void TParseContext::arrayDimMerge(TType& type, const TArraySizes* sizes)
 // Do all the semantic checking for declaring or redeclaring an array, with and
 // without a size, and make the right changes to the symbol table.
 //
-void TParseContext::declareArray(const TSourceLoc& loc, TString& identifier, const TType& type, TSymbol*& symbol)
+void TParseContext::declareArray(const TSourceLoc& loc, const TString& identifier, const TType& type, TSymbol*& symbol)
 {
     if (symbol == nullptr) {
         bool currentScope;
@@ -5053,7 +5055,7 @@ TVariable* TParseContext::makeInternalVariable(const char* name, const TType& ty
 //
 // Return the successfully declared variable.
 //
-TVariable* TParseContext::declareNonArray(const TSourceLoc& loc, TString& identifier, TType& type)
+TVariable* TParseContext::declareNonArray(const TSourceLoc& loc, const TString& identifier, const TType& type)
 {
     // make a new variable
     TVariable* variable = new TVariable(&identifier, type);
@@ -5770,7 +5772,7 @@ void TParseContext::blockStageIoCheck(const TSourceLoc& loc, const TQualifier& q
 }
 
 // Do all block-declaration checking regarding its qualifiers.
-void TParseContext::blockQualifierCheck(const TSourceLoc& loc, const TQualifier& qualifier, bool instanceName)
+void TParseContext::blockQualifierCheck(const TSourceLoc& loc, const TQualifier& qualifier, bool /*instanceName*/)
 {
     // The 4.5 specification says:
     //
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseHelper.h b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseHelper.h
index 61db6fe..dc9dc6a 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseHelper.h
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ParseHelper.h
@@ -136,7 +136,7 @@ public:
     TSymbolTable& symbolTable;   // symbol table that goes with the current language, version, and profile
 
     // Manage the global uniform block (default uniforms in GLSL, $Global in HLSL)
-    virtual void growGlobalUniformBlock(TSourceLoc&, TType&, TString& memberName, TTypeList* typeList = nullptr);
+    virtual void growGlobalUniformBlock(const TSourceLoc&, TType&, const TString& memberName, TTypeList* typeList = nullptr);
 
     virtual bool lValueErrorCheck(const TSourceLoc&, const char* op, TIntermTyped*);
     virtual void rValueErrorCheck(const TSourceLoc&, const char* op, TIntermTyped*);
@@ -174,7 +174,7 @@ protected:
     int firstNewMember;              // the index of the first member not yet inserted into the symbol table
     // override this to set the language-specific name
     virtual const char* getGlobalUniformBlockName() const { return ""; }
-    virtual void setUniformBlockDefaults(TType& block) const { }
+    virtual void setUniformBlockDefaults(TType&) const { }
     virtual void finalizeGlobalUniformBlockLayout(TVariable&) { }
     virtual void outputMessage(const TSourceLoc&, const char* szReason, const char* szToken,
                                const char* szExtraInfoFormat, TPrefixType prefix,
@@ -371,8 +371,8 @@ protected:
     void nonInitConstCheck(const TSourceLoc&, TString& identifier, TType& type);
     void inheritGlobalDefaults(TQualifier& dst) const;
     TVariable* makeInternalVariable(const char* name, const TType&) const;
-    TVariable* declareNonArray(const TSourceLoc&, TString& identifier, TType&);
-    void declareArray(const TSourceLoc&, TString& identifier, const TType&, TSymbol*&);
+    TVariable* declareNonArray(const TSourceLoc&, const TString& identifier, const TType&);
+    void declareArray(const TSourceLoc&, const TString& identifier, const TType&, TSymbol*&);
     TIntermNode* executeInitializer(const TSourceLoc&, TIntermTyped* initializer, TVariable* variable);
     TIntermTyped* convertInitializerList(const TSourceLoc&, const TType&, TIntermTyped* initializer);
     void finish() override;
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/Scan.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/Scan.cpp
index 8b3a306..f61439f 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/Scan.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/Scan.cpp
@@ -175,7 +175,7 @@ bool TInputScanner::scanVersion(int& version, EProfile& profile, bool& notFirstT
 
     bool versionNotFirst = false;  // means not first WRT comments and white space, nothing more
     notFirstToken = false;         // means not first WRT to real tokens
-    version = 0;  // means not found
+    version = 0;                   // means not found
     profile = ENoProfile;
 
     bool foundNonSpaceTab = false;
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ShaderLang.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ShaderLang.cpp
index 5a8f679..14f2bde 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ShaderLang.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/ShaderLang.cpp
@@ -442,7 +442,7 @@ bool DeduceVersionProfile(TInfoSink& infoSink, EShLanguage stage, bool versionNo
     bool correct = true;
 
     if (source == EShSourceHlsl) {
-        version = 450;          // TODO: GLSL parser is still used for builtins.
+        version = 500;          // shader model; currently a characteristic of glslang, not the input
         profile = ECoreProfile; // allow doubles in prototype parsing
         return correct;
     }
@@ -679,13 +679,13 @@ bool ProcessDeferred(
     // First, without using the preprocessor or parser, find the #version, so we know what
     // symbol tables, processing rules, etc. to set up.  This does not need the extra strings
     // outlined above, just the user shader.
-    int version;
-    EProfile profile;
     glslang::TInputScanner userInput(numStrings, &strings[numPre], &lengths[numPre]);  // no preamble
-    bool versionNotFirstToken;
-    bool versionNotFirst = userInput.scanVersion(version, profile, versionNotFirstToken);
+    int version = 0;
+    EProfile profile = ENoProfile;
+    bool versionNotFirstToken = false;
+    bool versionNotFirst = (messages & EShMsgReadHlsl) ? true : userInput.scanVersion(version, profile, versionNotFirstToken);
     bool versionNotFound = version == 0;
-    if (forceDefaultVersionAndProfile) {
+    if (forceDefaultVersionAndProfile && (messages & EShMsgReadHlsl) == 0) {
         if (! (messages & EShMsgSuppressWarnings) && ! versionNotFound &&
             (version != defaultVersion || profile != defaultProfile)) {
             compiler->infoSink.info << "Warning, (version, profile) forced to be ("
@@ -726,6 +726,8 @@ bool ProcessDeferred(
     intermediate.setSpv(spvVersion);
     if (spvVersion.vulkan >= 100)
         intermediate.setOriginUpperLeft();
+    if (messages & EShMsgHlslOffsets) // source-language independent
+        intermediate.setHlslOffsets();
     SetupBuiltinSymbolTable(version, profile, spvVersion, source);
 
     TSymbolTable* cachedTable = SharedSymbolTables[MapVersionToIndex(version)]
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/intermOut.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/intermOut.cpp
index b8a6969..c0c60c5 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/intermOut.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/intermOut.cpp
@@ -884,6 +884,13 @@ void TIntermediate::output(TInfoSink& infoSink, bool tree)
 
     case EShLangTessControl:
         infoSink.debug << "vertices = " << vertices << "\n";
+
+        if (inputPrimitive != ElgNone)
+            infoSink.debug << "input primitive = " << TQualifier::getGeometryString(inputPrimitive) << "\n";
+        if (vertexSpacing != EvsNone)
+            infoSink.debug << "vertex spacing = " << TQualifier::getVertexSpacingString(vertexSpacing) << "\n";
+        if (vertexOrder != EvoNone)
+            infoSink.debug << "triangle order = " << TQualifier::getVertexOrderString(vertexOrder) << "\n";
         break;
 
     case EShLangTessEvaluation:
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/iomapper.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/iomapper.cpp
index bebf775..9a6613e 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/iomapper.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/iomapper.cpp
@@ -445,19 +445,19 @@ struct TDefaultIoResolver : public glslang::TIoMapResolver
         return 0;
     }
 
-    bool validateInOut(EShLanguage stage, const char* name, const TType& type, bool is_live) override
+    bool validateInOut(EShLanguage /*stage*/, const char* /*name*/, const TType& /*type*/, bool /*is_live*/) override
     {
         return true;
     }
-    int resolveInOutLocation(EShLanguage stage, const char* name, const TType& type, bool is_live) override
+    int resolveInOutLocation(EShLanguage /*stage*/, const char* /*name*/, const TType& /*type*/, bool /*is_live*/) override
     {
         return -1;
     }
-    int resolveInOutComponent(EShLanguage stage, const char* name, const TType& type, bool is_live) override
+    int resolveInOutComponent(EShLanguage /*stage*/, const char* /*name*/, const TType& /*type*/, bool /*is_live*/) override
     {
         return -1;
     }
-    int resolveInOutIndex(EShLanguage stage, const char* name, const TType& type, bool is_live) override
+    int resolveInOutIndex(EShLanguage /*stage*/, const char* /*name*/, const TType& /*type*/, bool /*is_live*/) override
     {
         return -1;
     }
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/linkValidate.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/linkValidate.cpp
index 761cb41..4bb2951 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/linkValidate.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/linkValidate.cpp
@@ -461,10 +461,12 @@ void TIntermediate::finalCheck(TInfoSink& infoSink, bool keepUncalled)
     case EShLangTessEvaluation:
         if (inputPrimitive == ElgNone)
             error(infoSink, "At least one shader must specify an input layout primitive");
-        if (vertexSpacing == EvsNone)
-            vertexSpacing = EvsEqual;
-        if (vertexOrder == EvoNone)
-            vertexOrder = EvoCcw;
+        if (source == EShSourceGlsl) {
+            if (vertexSpacing == EvsNone)
+                vertexSpacing = EvsEqual;
+            if (vertexOrder == EvoNone)
+                vertexOrder = EvoCcw;
+        }
         break;
     case EShLangGeometry:
         if (inputPrimitive == ElgNone)
@@ -1045,9 +1047,9 @@ unsigned int TIntermediate::computeTypeXfbSize(const TType& type, bool& contains
 
 const int baseAlignmentVec4Std140 = 16;
 
-// Return the size and alignment of a scalar.
+// Return the size and alignment of a component of the given type.
 // The size is returned in the 'size' parameter
-// Return value is the alignment of the type.
+// Return value is the alignment.
 int TIntermediate::getBaseAlignmentScalar(const TType& type, int& size)
 {
     switch (type.getBasicType()) {
@@ -1217,4 +1219,14 @@ int TIntermediate::getBaseAlignment(const TType& type, int& size, int& stride, b
     return baseAlignmentVec4Std140;
 }
 
+// To aid the basic HLSL rule about crossing vec4 boundaries.
+bool TIntermediate::improperStraddle(const TType& type, int size, int offset)
+{
+    if (! type.isVector() || type.isArray())
+        return false;
+
+    return size <= 16 ? offset / 16 != (offset + size - 1) / 16
+                      : offset % 16 != 0;
+}
+
 } // end namespace glslang
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/localintermediate.h b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/localintermediate.h
index 5460dd5..2cd912b 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/localintermediate.h
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/localintermediate.h
@@ -177,7 +177,8 @@ public:
         shiftSsboBinding(0),
         autoMapBindings(false),
         flattenUniformArrays(false),
-        useUnknownFormat(false)
+        useUnknownFormat(false),
+        hlslOffsets(false)
     {
         localSize[0] = 1;
         localSize[1] = 1;
@@ -216,6 +217,8 @@ public:
     bool getFlattenUniformArrays()        const { return flattenUniformArrays; }
     void setNoStorageFormat(bool b)             { useUnknownFormat = b; }
     bool getNoStorageFormat()             const { return useUnknownFormat; }
+    void setHlslOffsets()         { hlslOffsets = true; }
+    bool usingHlslOFfsets() const { return hlslOffsets; }
 
     void setVersion(int v) { version = v; }
     int getVersion() const { return version; }
@@ -413,7 +416,9 @@ public:
     }
     int addXfbBufferOffset(const TType&);
     unsigned int computeTypeXfbSize(const TType&, bool& containsDouble) const;
+    static int getBaseAlignmentScalar(const TType&, int& size);
     static int getBaseAlignment(const TType&, int& size, int& stride, bool std140, bool rowMajor);
+    static bool improperStraddle(const TType& type, int size, int offset);
     bool promote(TIntermOperator*);
 
 #ifdef NV_EXTENSIONS
@@ -443,7 +448,6 @@ protected:
     void inOutLocationCheck(TInfoSink&);
     TIntermSequence& findLinkerObjects() const;
     bool userOutputUsed() const;
-    static int getBaseAlignmentScalar(const TType&, int& size);
     bool isSpecializationOperation(const TIntermOperator&) const;
     bool promoteUnary(TIntermUnary&);
     bool promoteBinary(TIntermBinary&);
@@ -499,6 +503,7 @@ protected:
     bool autoMapBindings;
     bool flattenUniformArrays;
     bool useUnknownFormat;
+    bool hlslOffsets;
 
     typedef std::list<TCall> TGraph;
     TGraph callGraph;
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/preprocessor/PpScanner.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/preprocessor/PpScanner.cpp
index b9e7a5b..dd16269 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/preprocessor/PpScanner.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/MachineIndependent/preprocessor/PpScanner.cpp
@@ -799,6 +799,7 @@ int TPpContext::tokenPaste(int token, TPpToken& ppToken)
         token = scanToken(&pastedPpToken);
         assert(token == PpAtomPaste);
 
+        // This covers end of macro expansion
         if (endOfReplacementList()) {
             parseContext.ppError(ppToken.loc, "unexpected location; end of replacement list", "##", "");
             break;
@@ -807,6 +808,12 @@ int TPpContext::tokenPaste(int token, TPpToken& ppToken)
         // get the token after the ##
         token = scanToken(&pastedPpToken);
 
+        // This covers end of argument expansion
+        if (token == tMarkerInput::marker) {
+            parseContext.ppError(ppToken.loc, "unexpected location; end of argument", "##", "");
+            break;
+        }
+
         // get the token text
         switch (resultToken) {
         case PpAtomIdentifier:
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/Unix/ossource.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/Unix/ossource.cpp
index 4f8098b..24b77e1 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/Unix/ossource.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/Unix/ossource.cpp
@@ -184,20 +184,6 @@ void ReleaseGlobalLock()
   pthread_mutex_unlock(&gMutex);
 }
 
-// TODO: non-windows: if we need these on linux, flesh them out
-void* OS_CreateThread(TThreadEntrypoint /*entry*/)
-{
-    return 0;
-}
-
-void OS_WaitForAllThreads(void* /*threads*/, int /*numThreads*/)
-{
-}
-
-void OS_Sleep(int /*milliseconds*/)
-{
-}
-
 void OS_DumpMemoryCounters()
 {
 }
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/Windows/ossource.cpp b/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/Windows/ossource.cpp
index 73ae0ca..870840c 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/Windows/ossource.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/Windows/ossource.cpp
@@ -131,21 +131,6 @@ unsigned int __stdcall EnterGenericThread (void* entry)
     return ((TThreadEntrypoint)entry)(0);
 }
 
-void* OS_CreateThread(TThreadEntrypoint entry)
-{
-    return (void*)_beginthreadex(0, 0, EnterGenericThread, (void*)entry, 0, 0);
-}
-
-void OS_WaitForAllThreads(void* threads, int numThreads)
-{
-    WaitForMultipleObjects(numThreads, (HANDLE*)threads, true, INFINITE);
-}
-
-void OS_Sleep(int milliseconds)
-{
-    Sleep(milliseconds);
-}
-
 //#define DUMP_COUNTERS
 
 void OS_DumpMemoryCounters()
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/osinclude.h b/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/osinclude.h
index e832526..218abe4 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/osinclude.h
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/OSDependent/osinclude.h
@@ -53,11 +53,8 @@ void GetGlobalLock();
 void ReleaseGlobalLock();
 
 typedef unsigned int (*TThreadEntrypoint)(void*);
-void* OS_CreateThread(TThreadEntrypoint);
-void OS_WaitForAllThreads(void* threads, int numThreads);
 
 void OS_CleanupThreadData(void);
-void OS_Sleep(int milliseconds);
 
 void OS_DumpMemoryCounters();
 
diff --git a/3rdparty/bgfx/3rdparty/glslang/glslang/Public/ShaderLang.h b/3rdparty/bgfx/3rdparty/glslang/glslang/Public/ShaderLang.h
index 8d8abd7..7ea9446 100644
--- a/3rdparty/bgfx/3rdparty/glslang/glslang/Public/ShaderLang.h
+++ b/3rdparty/bgfx/3rdparty/glslang/glslang/Public/ShaderLang.h
@@ -147,6 +147,7 @@ enum EShMessages {
     EShMsgReadHlsl         = (1 << 6),  // use HLSL parsing rules and semantics
     EShMsgCascadingErrors  = (1 << 7),  // get cascading errors; risks error-recovery issues, instead of an early exit
     EShMsgKeepUncalled     = (1 << 8),  // for testing, don't eliminate uncalled functions
+    EShMsgHlslOffsets      = (1 << 9),  // allow block offsets to follow HLSL rules instead of GLSL rules
 };
 
 //
@@ -466,25 +467,25 @@ class TIoMapResolver
 public:
   virtual ~TIoMapResolver() {}
 
-  // Should return true if the resulting/current binding would be ok.
+  // Should return true if the resulting/current binding would be okay.
   // Basic idea is to do aliasing binding checks with this.
   virtual bool validateBinding(EShLanguage stage, const char* name, const TType& type, bool is_live) = 0;
   // Should return a value >= 0 if the current binding should be overridden.
   // Return -1 if the current binding (including no binding) should be kept.
   virtual int resolveBinding(EShLanguage stage, const char* name, const TType& type, bool is_live) = 0;
-  // Should return a value >= 0 if the current set should be overriden.
+  // Should return a value >= 0 if the current set should be overridden.
   // Return -1 if the current set (including no set) should be kept.
   virtual int resolveSet(EShLanguage stage, const char* name, const TType& type, bool is_live) = 0;
-  // Should return true if the resuling/current setup would be ok.
+  // Should return true if the resulting/current setup would be okay.
   // Basic idea is to do aliasing checks and reject invalid semantic names.
   virtual bool validateInOut(EShLanguage stage, const char* name, const TType& type, bool is_live) = 0;
-  // Should return a value >= 0 if the current location should be overriden.
+  // Should return a value >= 0 if the current location should be overridden.
   // Return -1 if the current location (including no location) should be kept.
   virtual int resolveInOutLocation(EShLanguage stage, const char* name, const TType& type, bool is_live) = 0;
-  // Should return a value >= 0 if the current component index should be overriden.
+  // Should return a value >= 0 if the current component index should be overridden.
   // Return -1 if the current component index (including no index) should be kept.
   virtual int resolveInOutComponent(EShLanguage stage, const char* name, const TType& type, bool is_live) = 0;
-  // Should return a value >= 0 if the current color index should be overriden.
+  // Should return a value >= 0 if the current color index should be overridden.
   // Return -1 if the current color index (including no index) should be kept.
   virtual int resolveInOutIndex(EShLanguage stage, const char* name, const TType& type, bool is_live) = 0;
 };
diff --git a/3rdparty/bgfx/3rdparty/glslang/gtests/Hlsl.FromFile.cpp b/3rdparty/bgfx/3rdparty/glslang/gtests/Hlsl.FromFile.cpp
index 0b68aea..6bbc16b 100644
--- a/3rdparty/bgfx/3rdparty/glslang/gtests/Hlsl.FromFile.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/gtests/Hlsl.FromFile.cpp
@@ -93,6 +93,7 @@ INSTANTIATE_TEST_CASE_P(
         {"hlsl.calculatelod.dx10.frag", "main"},
         {"hlsl.calculatelodunclamped.dx10.frag", "main"},
         {"hlsl.cast.frag", "PixelShaderFunction"},
+        {"hlsl.clip.frag", "main"},
         {"hlsl.comparison.vec.frag", "main"},
         {"hlsl.conditional.frag", "PixelShaderFunction"},
         {"hlsl.constructexpr.frag", "main"},
@@ -122,14 +123,21 @@ INSTANTIATE_TEST_CASE_P(
         {"hlsl.getdimensions.rw.dx10.frag", "main"},
         {"hlsl.getdimensions.dx10.vert", "main"},
         {"hlsl.getsampleposition.dx10.frag", "main"},
+        {"hlsl.domain.1.tese", "main"},
+        {"hlsl.domain.2.tese", "main"},
+        {"hlsl.domain.3.tese", "main"},
         {"hlsl.hull.1.tesc", "main"},
         {"hlsl.hull.2.tesc", "main"},
         {"hlsl.hull.void.tesc", "main"},
+        {"hlsl.hull.ctrlpt-1.tesc", "main"},
+        {"hlsl.hull.ctrlpt-2.tesc", "main"},
         {"hlsl.identifier.sample.frag", "main"},
         {"hlsl.if.frag", "PixelShaderFunction"},
+        {"hlsl.implicitBool.frag", "main"},
         {"hlsl.inoutquals.frag", "main"},
         {"hlsl.init.frag", "ShaderFunction"},
         {"hlsl.init2.frag", "main"},
+        {"hlsl.isfinite.frag", "main"},
         {"hlsl.intrinsics.barriers.comp", "ComputeShaderFunction"},
         {"hlsl.intrinsics.comp", "ComputeShaderFunction"},
         {"hlsl.intrinsics.evalfns.frag", "main"},
@@ -138,6 +146,7 @@ INSTANTIATE_TEST_CASE_P(
         {"hlsl.intrinsics.f1632.frag", "main"},
         {"hlsl.intrinsics.f3216.frag", "main"},
         {"hlsl.intrinsics.frag", "main"},
+        {"hlsl.intrinsic.frexp.frag", "main"},
         {"hlsl.intrinsics.lit.frag", "PixelShaderFunction"},
         {"hlsl.intrinsics.negative.comp", "ComputeShaderFunction"},
         {"hlsl.intrinsics.negative.frag", "PixelShaderFunction"},
@@ -157,9 +166,12 @@ INSTANTIATE_TEST_CASE_P(
         {"hlsl.load.rwtexture.array.dx10.frag", "main"},
         {"hlsl.load.offset.dx10.frag", "main"},
         {"hlsl.load.offsetarray.dx10.frag", "main"},
-        {"hlsl.logical.unary.frag", "main"},
         {"hlsl.logical.binary.frag", "main"},
         {"hlsl.logical.binary.vec.frag", "main"},
+        {"hlsl.logicalConvert.frag", "main"},
+        {"hlsl.logical.unary.frag", "main"},
+        {"hlsl.namespace.frag", "main"},
+        {"hlsl.nonint-index.frag", "main"},
         {"hlsl.matNx1.frag", "main"},
         {"hlsl.matrixSwizzle.vert", "ShaderFunction"},
         {"hlsl.mintypes.frag", "main"},
@@ -212,6 +224,8 @@ INSTANTIATE_TEST_CASE_P(
         {"hlsl.samplelevel.offset.dx10.frag", "main"},
         {"hlsl.samplelevel.offsetarray.dx10.frag", "main"},
         {"hlsl.sample.sub-vec4.dx10.frag", "main"},
+        {"hlsl.scalar-length.frag", "main"},
+        {"hlsl.scalarCast.vert", "main"},
         {"hlsl.semicolons.frag", "main"},
         {"hlsl.shapeConv.frag", "main"},
         {"hlsl.shapeConvRet.frag", "main"},
@@ -239,6 +253,7 @@ INSTANTIATE_TEST_CASE_P(
         {"hlsl.structStructName.frag", "main"},
         {"hlsl.this.frag", "main"},
         {"hlsl.intrinsics.vert", "VertexShaderFunction"},
+        {"hlsl.intrinsic.frexp.vert", "VertexShaderFunction"},
         {"hlsl.matType.frag", "PixelShaderFunction"},
         {"hlsl.matType.bool.frag", "main"},
         {"hlsl.matType.int.frag", "main"},
diff --git a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslGrammar.cpp b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslGrammar.cpp
index 4f6f803..b8932ce 100755
--- a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslGrammar.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslGrammar.cpp
@@ -134,22 +134,17 @@ bool HlslGrammar::acceptIdentifier(HlslToken& idToken)
 }
 
 // compilationUnit
-//      : list of externalDeclaration
-//      |   SEMICOLONS
+//      : declaration_list EOF
 //
 bool HlslGrammar::acceptCompilationUnit()
 {
     TIntermNode* unitNode = nullptr;
 
-    while (! peekTokenClass(EHTokNone)) {
-        // HLSL allows semicolons between global declarations, e.g, between functions.
-        if (acceptTokenClass(EHTokSemicolon))
-            continue;
+    if (! acceptDeclarationList(unitNode))
+        return false;
 
-        // externalDeclaration
-        if (! acceptDeclaration(unitNode))
-            return false;
-    }
+    if (! peekTokenClass(EHTokNone))
+        return false;
 
     // set root of AST
     if (unitNode && !unitNode->getAsAggregate())
@@ -159,6 +154,34 @@ bool HlslGrammar::acceptCompilationUnit()
     return true;
 }
 
+// Recognize the following, but with the extra condition that it can be
+// successfully terminated by EOF or '}'.
+//
+// declaration_list
+//      : list of declaration_or_semicolon followed by EOF or RIGHT_BRACE
+//
+// declaration_or_semicolon
+//      : declaration
+//      | SEMICOLON
+//
+bool HlslGrammar::acceptDeclarationList(TIntermNode*& nodeList)
+{
+    do {
+        // HLSL allows extra semicolons between declarations; consume any run of them
+        do { } while (acceptTokenClass(EHTokSemicolon));
+
+        // EOF or RIGHT_BRACE: only peeked here, the caller checks which terminator it got
+        if (peekTokenClass(EHTokNone) || peekTokenClass(EHTokRightBrace))
+            return true;
+
+        // declaration: a failed declaration fails the whole list
+        if (! acceptDeclaration(nodeList))
+            return false;
+    } while (true);
+
+    return true; // not reached: the loop above exits only through its return statements
+}
+
 // sampler_state
 //      : LEFT_BRACE [sampler_state_assignment ... ] RIGHT_BRACE
 //
@@ -289,6 +312,7 @@ bool HlslGrammar::acceptSamplerDeclarationDX9(TType& /*type*/)
 //      | fully_specified_type identifier function_parameters post_decls compound_statement  // function definition
 //      | fully_specified_type identifier sampler_state post_decls compound_statement        // sampler definition
 //      | typedef declaration
+//      | NAMESPACE IDENTIFIER LEFT_BRACE declaration_list RIGHT_BRACE
 //
 // declarator_list
 //      : declarator COMMA declarator COMMA declarator...  // zero or more declarators
@@ -314,6 +338,30 @@ bool HlslGrammar::acceptSamplerDeclarationDX9(TType& /*type*/)
 //
 bool HlslGrammar::acceptDeclaration(TIntermNode*& nodeList)
 {
+    // NAMESPACE IDENTIFIER LEFT_BRACE declaration_list RIGHT_BRACE
+    if (acceptTokenClass(EHTokNamespace)) {
+        HlslToken namespaceToken;
+        if (!acceptIdentifier(namespaceToken)) {
+            expected("namespace name");
+            return false;
+        }
+        parseContext.pushNamespace(*namespaceToken.string);
+        if (!acceptTokenClass(EHTokLeftBrace)) {
+            expected("{");
+            return false;
+        }
+        if (!acceptDeclarationList(nodeList)) {
+            expected("declaration list");
+            return false;
+        }
+        if (!acceptTokenClass(EHTokRightBrace)) {
+            expected("}");
+            return false;
+        }
+        parseContext.popNamespace();
+        return true;
+    }
+
     bool declarator_list = false; // true when processing comma separation
 
     // attributes
@@ -342,15 +390,17 @@ bool HlslGrammar::acceptDeclaration(TIntermNode*& nodeList)
     HlslToken idToken;
     TIntermAggregate* initializers = nullptr;
     while (acceptIdentifier(idToken)) {
+        const TString *fullName = idToken.string;
+        if (parseContext.symbolTable.atGlobalLevel())
+            parseContext.getFullNamespaceName(fullName);
         if (peekTokenClass(EHTokLeftParen)) {
             // looks like function parameters
-            TString* fnName = idToken.string;
 
             // Potentially rename shader entry point function.  No-op most of the time.
-            parseContext.renameShaderFunction(fnName);
+            parseContext.renameShaderFunction(fullName);
 
             // function_parameters
-            declarator.function = new TFunction(fnName, declaredType);
+            declarator.function = new TFunction(fullName, declaredType);
             if (!acceptFunctionParameters(*declarator.function)) {
                 expected("function parameter list");
                 return false;
@@ -424,19 +474,19 @@ bool HlslGrammar::acceptDeclaration(TIntermNode*& nodeList)
             // TODO: strings are not yet handled.
             if (variableType.getBasicType() != EbtString && parseContext.getAnnotationNestingLevel() == 0) {
                 if (typedefDecl)
-                    parseContext.declareTypedef(idToken.loc, *idToken.string, variableType);
+                    parseContext.declareTypedef(idToken.loc, *fullName, variableType);
                 else if (variableType.getBasicType() == EbtBlock)
-                    parseContext.declareBlock(idToken.loc, variableType, idToken.string);
+                    parseContext.declareBlock(idToken.loc, variableType, fullName);
                 else {
                     if (variableType.getQualifier().storage == EvqUniform && ! variableType.containsOpaque()) {
                         // this isn't really an individual variable, but a member of the $Global buffer
-                        parseContext.growGlobalUniformBlock(idToken.loc, variableType, *idToken.string);
+                        parseContext.growGlobalUniformBlock(idToken.loc, variableType, *fullName);
                     } else {
                         // Declare the variable and add any initializer code to the AST.
                         // The top-level node is always made into an aggregate, as that's
                         // historically how the AST has been.
                         initializers = intermediate.growAggregate(initializers,
-                            parseContext.declareVariable(idToken.loc, *idToken.string, variableType, expressionNode),
+                            parseContext.declareVariable(idToken.loc, *fullName, variableType, expressionNode),
                             idToken.loc);
                     }
                 }
@@ -561,16 +611,15 @@ bool HlslGrammar::acceptFullySpecifiedType(TType& type, TIntermNode*& nodeList)
         qualifier.layoutFormat = type.getQualifier().layoutFormat;
         qualifier.precision    = type.getQualifier().precision;
 
-        // Propagate sampler readonly qualifier for buffers
-        if (type.getBasicType() == EbtSampler)
-            qualifier.readonly = type.getQualifier().readonly;
-
-        if (type.getQualifier().storage == EvqVaryingOut ||
+        if (type.getQualifier().storage == EvqOut ||
             type.getQualifier().storage == EvqBuffer) {
             qualifier.storage      = type.getQualifier().storage;
             qualifier.readonly     = type.getQualifier().readonly;
         }
 
+        if (type.getQualifier().builtIn != EbvNone)
+            qualifier.builtIn = type.getQualifier().builtIn;
+
         type.getQualifier()    = qualifier;
     }
 
@@ -914,14 +963,14 @@ bool HlslGrammar::acceptOutputPrimitiveGeometry(TLayoutGeometry& geometry)
 //      : INPUTPATCH
 //      | OUTPUTPATCH
 //
-bool HlslGrammar::acceptTessellationDeclType()
+bool HlslGrammar::acceptTessellationDeclType(TBuiltInVariable& patchType)
 {
     // read geometry type
     const EHlslTokenClass tessType = peek();
 
     switch (tessType) {
-    case EHTokInputPatch:    break;
-    case EHTokOutputPatch:   break;
+    case EHTokInputPatch:    patchType = EbvInputPatch;  break;
+    case EHTokOutputPatch:   patchType = EbvOutputPatch; break;
     default:
         return false;  // not a tessellation decl
     }
@@ -935,7 +984,9 @@ bool HlslGrammar::acceptTessellationDeclType()
 //
 bool HlslGrammar::acceptTessellationPatchTemplateType(TType& type)
 {
-    if (! acceptTessellationDeclType())
+    TBuiltInVariable patchType;
+
+    if (! acceptTessellationDeclType(patchType))
         return false;
     
     if (! acceptTokenClass(EHTokLeftAngle))
@@ -962,6 +1013,7 @@ bool HlslGrammar::acceptTessellationPatchTemplateType(TType& type)
     TArraySizes* arraySizes = new TArraySizes;
     arraySizes->addInnerSize(size->getAsConstantUnion()->getConstArray()[0].getIConst());
     type.newArraySizes(*arraySizes);
+    type.getQualifier().builtIn = patchType;
 
     if (! acceptTokenClass(EHTokRightAngle)) {
         expected("right angle bracket");
@@ -989,7 +1041,8 @@ bool HlslGrammar::acceptStreamOutTemplateType(TType& type, TLayoutGeometry& geom
         return false;
     }
 
-    type.getQualifier().storage = EvqVaryingOut;
+    type.getQualifier().storage = EvqOut;
+    type.getQualifier().builtIn = EbvGsOutputStream;
 
     if (! acceptTokenClass(EHTokRightAngle)) {
         expected("right angle bracket");
@@ -1099,25 +1152,25 @@ bool HlslGrammar::acceptTextureType(TType& type)
     bool array = false;
     bool ms    = false;
     bool image = false;
-    bool readonly = false;
+    bool combined = true;
 
     switch (textureType) {
-    case EHTokTexture1d:         dim = Esd1D;                                break;
-    case EHTokTexture1darray:    dim = Esd1D; array = true;                  break;
-    case EHTokTexture2d:         dim = Esd2D;                                break;
-    case EHTokTexture2darray:    dim = Esd2D; array = true;                  break;
-    case EHTokTexture3d:         dim = Esd3D;                                break;
-    case EHTokTextureCube:       dim = EsdCube;                              break;
-    case EHTokTextureCubearray:  dim = EsdCube; array = true;                break;
-    case EHTokTexture2DMS:       dim = Esd2D; ms = true;                     break;
-    case EHTokTexture2DMSarray:  dim = Esd2D; array = true; ms = true;       break;
-    case EHTokBuffer:            dim = EsdBuffer; readonly=true; image=true; break;
-    case EHTokRWBuffer:          dim = EsdBuffer; image=true;                break;
-    case EHTokRWTexture1d:       dim = Esd1D; array=false; image=true;       break;
-    case EHTokRWTexture1darray:  dim = Esd1D; array=true;  image=true;       break;
-    case EHTokRWTexture2d:       dim = Esd2D; array=false; image=true;       break;
-    case EHTokRWTexture2darray:  dim = Esd2D; array=true;  image=true;       break;
-    case EHTokRWTexture3d:       dim = Esd3D; array=false; image=true;       break;
+    case EHTokBuffer:            dim = EsdBuffer; combined = false;    break;
+    case EHTokTexture1d:         dim = Esd1D;                          break;
+    case EHTokTexture1darray:    dim = Esd1D; array = true;            break;
+    case EHTokTexture2d:         dim = Esd2D;                          break;
+    case EHTokTexture2darray:    dim = Esd2D; array = true;            break;
+    case EHTokTexture3d:         dim = Esd3D;                          break;
+    case EHTokTextureCube:       dim = EsdCube;                        break;
+    case EHTokTextureCubearray:  dim = EsdCube; array = true;          break;
+    case EHTokTexture2DMS:       dim = Esd2D; ms = true;               break;
+    case EHTokTexture2DMSarray:  dim = Esd2D; array = true; ms = true; break;
+    case EHTokRWBuffer:          dim = EsdBuffer; image=true;          break;
+    case EHTokRWTexture1d:       dim = Esd1D; array=false; image=true; break;
+    case EHTokRWTexture1darray:  dim = Esd1D; array=true;  image=true; break;
+    case EHTokRWTexture2d:       dim = Esd2D; array=false; image=true; break;
+    case EHTokRWTexture2darray:  dim = Esd2D; array=true;  image=true; break;
+    case EHTokRWTexture3d:       dim = Esd3D; array=false; image=true; break;
     default:
         return false;  // not a texture declaration
     }
@@ -1177,7 +1230,7 @@ bool HlslGrammar::acceptTextureType(TType& type)
     } else if (ms) {
         expected("texture type for multisample");
         return false;
-    } else if (image && !readonly) {
+    } else if (image) {
         expected("type for RWTexture/RWBuffer");
         return false;
     }
@@ -1207,10 +1260,12 @@ bool HlslGrammar::acceptTextureType(TType& type)
     // Remember the declared vector size.
     sampler.vectorSize = txType.getVectorSize();
 
-    type.shallowCopy(TType(sampler, EvqUniform, arraySizes));
+    // Force uncombined, if necessary
+    if (!combined)
+        sampler.combined = false;
 
+    type.shallowCopy(TType(sampler, EvqUniform, arraySizes));
     type.getQualifier().layoutFormat = format;
-    type.getQualifier().readonly = readonly;
 
     return true;
 }
@@ -1829,7 +1884,7 @@ bool HlslGrammar::acceptStruct(TType& type, TIntermNode*& nodeList)
     TVector<TFunctionDeclarator> functionDeclarators;
 
     parseContext.pushNamespace(structName);
-    bool acceptedList = acceptStructDeclarationList(typeList, nodeList, structName, functionDeclarators);
+    bool acceptedList = acceptStructDeclarationList(typeList, nodeList, functionDeclarators);
     parseContext.popNamespace();
 
     if (! acceptedList) {
@@ -1988,7 +2043,7 @@ bool HlslGrammar::acceptStructBufferType(TType& type)
 //      | IDENTIFIER array_specifier post_decls
 //      | IDENTIFIER function_parameters post_decls                                         // member-function prototype
 //
-bool HlslGrammar::acceptStructDeclarationList(TTypeList*& typeList, TIntermNode*& nodeList, const TString& typeName,
+bool HlslGrammar::acceptStructDeclarationList(TTypeList*& typeList, TIntermNode*& nodeList,
                                               TVector<TFunctionDeclarator>& declarators)
 {
     typeList = new TTypeList();
@@ -2090,7 +2145,8 @@ bool HlslGrammar::acceptMemberFunctionDefinition(TIntermNode*& nodeList, const T
 {
     bool accepted = false;
 
-    TString* functionName = parseContext.getFullNamespaceName(memberName);
+    const TString* functionName = &memberName;
+    parseContext.getFullNamespaceName(functionName);
     declarator.function = new TFunction(functionName, type);
     if (type.getQualifier().storage == EvqTemporary)
         declarator.function->setImplicitThis();
@@ -2163,7 +2219,7 @@ bool HlslGrammar::acceptDefaultParameterDeclaration(const TType& type, TIntermTy
 
         // For initializer lists, we have to const-fold into a constructor for the type, so build
         // that.
-        TFunction* constructor = parseContext.handleConstructorCall(token.loc, type);
+        TFunction* constructor = parseContext.makeConstructorCall(token.loc, type);
         if (constructor == nullptr)  // cannot construct
             return false;
 
@@ -2476,6 +2532,10 @@ bool HlslGrammar::acceptConditionalExpression(TIntermTyped*& node)
     if (! acceptTokenClass(EHTokQuestion))
         return true;
 
+    node = parseContext.convertConditionalExpression(token.loc, node);
+    if (node == nullptr)
+        return false;
+
     TIntermTyped* trueNode = nullptr;
     if (! acceptExpression(trueNode)) {
         expected("expression after ?");
@@ -2568,7 +2628,7 @@ bool HlslGrammar::acceptUnaryExpression(TIntermTyped*& node)
                     return false;
 
                 // Hook it up like a constructor
-                TFunction* constructorFunction = parseContext.handleConstructorCall(loc, castType);
+                TFunction* constructorFunction = parseContext.makeConstructorCall(loc, castType);
                 if (constructorFunction == nullptr) {
                     expected("type that can be constructed");
                     return false;
@@ -2623,12 +2683,12 @@ bool HlslGrammar::acceptUnaryExpression(TIntermTyped*& node)
 //      : LEFT_PAREN expression RIGHT_PAREN
 //      | literal
 //      | constructor
-//      | identifier
+//      | IDENTIFIER [ COLONCOLON IDENTIFIER [ COLONCOLON IDENTIFIER ... ] ]
 //      | function_call
 //      | postfix_expression LEFT_BRACKET integer_expression RIGHT_BRACKET
 //      | postfix_expression DOT IDENTIFIER
 //      | postfix_expression DOT IDENTIFIER arguments
-//      | postfix_expression COLONCOLON IDENTIFIER arguments
+//      | postfix_expression arguments
 //      | postfix_expression INC_OP
 //      | postfix_expression DEC_OP
 //
@@ -2640,9 +2700,6 @@ bool HlslGrammar::acceptPostfixExpression(TIntermTyped*& node)
     // idToken will pick up either a variable or a function name in a function call
     HlslToken idToken;
 
-    // scopeBase will pick up the type symbol on the left of '::'
-    TSymbol* scope = nullptr;
-
     // Find something before the postfix operations, as they can't operate
     // on nothing.  So, no "return true", they fall through, only "return false".
     if (acceptTokenClass(EHTokLeftParen)) {
@@ -2656,21 +2713,26 @@ bool HlslGrammar::acceptPostfixExpression(TIntermTyped*& node)
             return false;
         }
     } else if (acceptLiteral(node)) {
-        // literal (nothing else to do yet), go on to the
+        // literal (nothing else to do yet)
     } else if (acceptConstructor(node)) {
         // constructor (nothing else to do yet)
     } else if (acceptIdentifier(idToken)) {
-        // user-type, identifier, or function name
-        if (peekTokenClass(EHTokColonColon)) {
-            TType type;
-            scope = parseContext.lookupUserType(*idToken.string, type);
-            if (scope == nullptr) {
-                expected("type left of ::");
+        // user-type, namespace name, variable, or function name
+        TString* fullName = idToken.string;
+        while (acceptTokenClass(EHTokColonColon)) {
+            // user-type or namespace name
+            fullName = NewPoolTString(fullName->c_str());
+            fullName->append(parseContext.scopeMangler);
+            if (acceptIdentifier(idToken))
+                fullName->append(*idToken.string);
+            else {
+                expected("identifier after ::");
                 return false;
             }
-        } else if (! peekTokenClass(EHTokLeftParen)) {
-            node = parseContext.handleVariable(idToken.loc, idToken.string);
-        } else if (acceptFunctionCall(idToken, node)) {
+        }
+        if (! peekTokenClass(EHTokLeftParen)) {
+            node = parseContext.handleVariable(idToken.loc, fullName);
+        } else if (acceptFunctionCall(idToken.loc, *fullName, node, nullptr)) {
             // function_call (nothing else to do yet)
         } else {
             expected("function call arguments");
@@ -2734,7 +2796,7 @@ bool HlslGrammar::acceptPostfixExpression(TIntermTyped*& node)
                 TIntermTyped* thisNode = node;
 
                 // arguments
-                if (! acceptFunctionCall(field, node, thisNode, scope)) {
+                if (! acceptFunctionCall(field.loc, *field.string, node, thisNode)) {
                     expected("function parameters");
                     return false;
                 }
@@ -2754,6 +2816,8 @@ bool HlslGrammar::acceptPostfixExpression(TIntermTyped*& node)
             }
             advanceToken();
             node = parseContext.handleBracketDereference(indexNode->getLoc(), node, indexNode);
+            if (node == nullptr)
+                return false;
             break;
         }
         case EOpPostIncrement:
@@ -2779,7 +2843,7 @@ bool HlslGrammar::acceptConstructor(TIntermTyped*& node)
     // type
     TType type;
     if (acceptType(type)) {
-        TFunction* constructorFunction = parseContext.handleConstructorCall(token.loc, type);
+        TFunction* constructorFunction = parseContext.makeConstructorCall(token.loc, type);
         if (constructorFunction == nullptr)
             return false;
 
@@ -2806,26 +2870,26 @@ bool HlslGrammar::acceptConstructor(TIntermTyped*& node)
 // function_call
 //      : [idToken] arguments
 //
-bool HlslGrammar::acceptFunctionCall(HlslToken callToken, TIntermTyped*& node, TIntermTyped* baseObject,
-                                     const TSymbol* scope)
+bool HlslGrammar::acceptFunctionCall(const TSourceLoc& loc, TString& name, TIntermTyped*& node, TIntermTyped* baseObject)
 {
     // name
     TString* functionName = nullptr;
-    if ((baseObject == nullptr && scope == nullptr)) {
-        functionName = callToken.string;
-    } else if (parseContext.isBuiltInMethod(callToken.loc, baseObject, *callToken.string)) {
+    if (baseObject == nullptr) {
+        functionName = &name;
+    } else if (parseContext.isBuiltInMethod(loc, baseObject, name)) {
         // Built-in methods are not in the symbol table as methods, but as global functions
         // taking an explicit 'this' as the first argument.
         functionName = NewPoolTString(BUILTIN_PREFIX);
-        functionName->append(*callToken.string);
+        functionName->append(name);
     } else {
+        if (! baseObject->getType().isStruct()) {
+            expected("structure");
+            return false;
+        }
         functionName = NewPoolTString("");
-        if (baseObject != nullptr)
-            functionName->append(baseObject->getType().getTypeName());
-        else if (scope != nullptr)
-            functionName->append(scope->getType().getTypeName());
+        functionName->append(baseObject->getType().getTypeName());
         parseContext.addScopeMangler(*functionName);
-        functionName->append(*callToken.string);
+        functionName->append(name);
     }
 
     // function
@@ -2841,7 +2905,7 @@ bool HlslGrammar::acceptFunctionCall(HlslToken callToken, TIntermTyped*& node, T
         return false;
 
     // call
-    node = parseContext.handleFunctionCall(callToken.loc, function, arguments);
+    node = parseContext.handleFunctionCall(loc, function, arguments);
 
     return true;
 }
@@ -3144,6 +3208,9 @@ bool HlslGrammar::acceptSelectionStatement(TIntermNode*& statement)
     TIntermTyped* condition;
     if (! acceptParenExpression(condition))
         return false;
+    condition = parseContext.convertConditionalExpression(loc, condition);
+    if (condition == nullptr)
+        return false;
 
     // create the child statements
     TIntermNodePair thenElse = { nullptr, nullptr };
@@ -3227,6 +3294,9 @@ bool HlslGrammar::acceptIterationStatement(TIntermNode*& statement)
         // LEFT_PAREN condition RIGHT_PAREN
         if (! acceptParenExpression(condition))
             return false;
+        condition = parseContext.convertConditionalExpression(loc, condition);
+        if (condition == nullptr)
+            return false;
 
         // statement
         if (! acceptScopedStatement(statement)) {
@@ -3266,6 +3336,9 @@ bool HlslGrammar::acceptIterationStatement(TIntermNode*& statement)
         TIntermTyped* condition;
         if (! acceptParenExpression(condition))
             return false;
+        condition = parseContext.convertConditionalExpression(loc, condition);
+        if (condition == nullptr)
+            return false;
 
         if (! acceptTokenClass(EHTokSemicolon))
             expected(";");
@@ -3303,6 +3376,11 @@ bool HlslGrammar::acceptIterationStatement(TIntermNode*& statement)
         acceptExpression(condition);
         if (! acceptTokenClass(EHTokSemicolon))
             expected(";");
+        if (condition != nullptr) {
+            condition = parseContext.convertConditionalExpression(loc, condition);
+            if (condition == nullptr)
+                return false;
+        }
 
         // iterator SEMI_COLON
         TIntermTyped* iterator = nullptr;
diff --git a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslGrammar.h b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslGrammar.h
index 1a3abf1..6d8ed8f 100755
--- a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslGrammar.h
+++ b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslGrammar.h
@@ -65,6 +65,7 @@ namespace glslang {
         void unimplemented(const char*);
         bool acceptIdentifier(HlslToken&);
         bool acceptCompilationUnit();
+        bool acceptDeclarationList(TIntermNode*&);
         bool acceptDeclaration(TIntermNode*&);
         bool acceptControlDeclaration(TIntermNode*& node);
         bool acceptSamplerDeclarationDX9(TType&);
@@ -78,7 +79,7 @@ namespace glslang {
         bool acceptTemplateVecMatBasicType(TBasicType&);
         bool acceptVectorTemplateType(TType&);
         bool acceptMatrixTemplateType(TType&);
-        bool acceptTessellationDeclType();
+        bool acceptTessellationDeclType(TBuiltInVariable&);
         bool acceptTessellationPatchTemplateType(TType&);
         bool acceptStreamOutTemplateType(TType&, TLayoutGeometry&);
         bool acceptOutputPrimitiveGeometry(TLayoutGeometry&);
@@ -87,8 +88,7 @@ namespace glslang {
         bool acceptTextureType(TType&);
         bool acceptStructBufferType(TType&);
         bool acceptStruct(TType&, TIntermNode*& nodeList);
-        bool acceptStructDeclarationList(TTypeList*&, TIntermNode*& nodeList, const TString& typeName,
-                                         TVector<TFunctionDeclarator>&);
+        bool acceptStructDeclarationList(TTypeList*&, TIntermNode*& nodeList, TVector<TFunctionDeclarator>&);
         bool acceptMemberFunctionDefinition(TIntermNode*& nodeList, const TType&, const TString& memberName,
                                             TFunctionDeclarator&);
         bool acceptFunctionParameters(TFunction&);
@@ -104,8 +104,7 @@ namespace glslang {
         bool acceptUnaryExpression(TIntermTyped*&);
         bool acceptPostfixExpression(TIntermTyped*&);
         bool acceptConstructor(TIntermTyped*&);
-        bool acceptFunctionCall(HlslToken, TIntermTyped*&, TIntermTyped* objectBase = nullptr,
-                                const TSymbol* scope = nullptr);
+        bool acceptFunctionCall(const TSourceLoc&, TString& name, TIntermTyped*&, TIntermTyped* objectBase);
         bool acceptArguments(TFunction*, TIntermTyped*&);
         bool acceptLiteral(TIntermTyped*&);
         bool acceptCompoundStatement(TIntermNode*&);
diff --git a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseHelper.cpp b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseHelper.cpp
index 48b94b6..745ee40 100755
--- a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseHelper.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseHelper.cpp
@@ -66,7 +66,9 @@ HlslParseContext::HlslParseContext(TSymbolTable& symbolTable, TIntermediate& int
     nextInLocation(0), nextOutLocation(0),
     sourceEntryPointName(sourceEntryPointName),
     entryPointFunction(nullptr),
-    entryPointFunctionBody(nullptr)
+    entryPointFunctionBody(nullptr),
+    gsStreamOutput(nullptr),
+    inputPatch(nullptr)
 {
     globalUniformDefaults.clear();
     globalUniformDefaults.layoutMatrix = ElmRowMajor;
@@ -162,7 +164,7 @@ bool HlslParseContext::shouldConvertLValue(const TIntermNode* node) const
     return false;
 }
 
-void HlslParseContext::growGlobalUniformBlock(TSourceLoc& loc, TType& memberType, TString& memberName, TTypeList* newTypeList)
+void HlslParseContext::growGlobalUniformBlock(const TSourceLoc& loc, TType& memberType, const TString& memberName, TTypeList* newTypeList)
 {
     newTypeList = nullptr;
     correctUniform(memberType.getQualifier());
@@ -333,13 +335,6 @@ TIntermTyped* HlslParseContext::handleLvalue(const TSourceLoc& loc, const char*
                            [](bool isSet) { return isSet; } );
     };
 
-    // helper to create a temporary variable
-    const auto addTmpVar = [&](const char* name, const TType& derefType) -> TIntermSymbol* {
-        TVariable* tmpVar = makeInternalVariable(name, derefType);
-        tmpVar->getWritableType().getQualifier().makeTemporary();
-        return intermediate.addSymbol(*tmpVar, loc);
-    };
-
     // Create swizzle matching input swizzle
     const auto addSwizzle = [&](TIntermSymbol* var, TIntermBinary* swizzle) -> TIntermTyped* {
         if (swizzle)
@@ -417,7 +412,7 @@ TIntermTyped* HlslParseContext::handleLvalue(const TSourceLoc& loc, const char*
                 TIntermTyped* coordTmp = coord;
 
                 if (rhsTmp == nullptr || isModifyOp || lhsIsSwizzle) {
-                    rhsTmp = addTmpVar("storeTemp", objDerefType);
+                    rhsTmp = makeInternalVariableNode(loc, "storeTemp", objDerefType);
 
                     // Partial updates not yet supported
                     if (!writesAllComponents(rhsTmp, lhsAsBinary)) {
@@ -427,7 +422,7 @@ TIntermTyped* HlslParseContext::handleLvalue(const TSourceLoc& loc, const char*
                     // Assign storeTemp = rhs
                     if (isModifyOp) {
                         // We have to make a temp var for the coordinate, to avoid evaluating it twice.
-                        coordTmp = addTmpVar("coordTemp", coord->getType());
+                        coordTmp = makeInternalVariableNode(loc, "coordTemp", coord->getType());
                         makeBinary(EOpAssign, coordTmp, coord); // coordtmp = load[param1]
                         makeLoad(rhsTmp, object, coordTmp, objDerefType); // rhsTmp = OpImageLoad(object, coordTmp)
                     }
@@ -460,8 +455,8 @@ TIntermTyped* HlslParseContext::handleLvalue(const TSourceLoc& loc, const char*
                 //      OpImageStore(object, coordTmp, rhsTmp)
                 //      rhsTmp
 
-                TIntermSymbol* rhsTmp = addTmpVar("storeTemp", objDerefType);
-                TIntermTyped* coordTmp = addTmpVar("coordTemp", coord->getType());
+                TIntermSymbol* rhsTmp = makeInternalVariableNode(loc, "storeTemp", objDerefType);
+                TIntermTyped* coordTmp = makeInternalVariableNode(loc, "coordTemp", coord->getType());
 
                 makeBinary(EOpAssign, coordTmp, coord);           // coordtmp = load[param1]
                 makeLoad(rhsTmp, object, coordTmp, objDerefType); // rhsTmp = OpImageLoad(object, coordTmp)
@@ -481,9 +476,9 @@ TIntermTyped* HlslParseContext::handleLvalue(const TSourceLoc& loc, const char*
                 //      rhsTmp2 op
                 //      OpImageStore(object, coordTmp, rhsTmp2)
                 //      rhsTmp1 (pre-op value)
-                TIntermSymbol* rhsTmp1 = addTmpVar("storeTempPre",  objDerefType);
-                TIntermSymbol* rhsTmp2 = addTmpVar("storeTempPost", objDerefType);
-                TIntermTyped* coordTmp = addTmpVar("coordTemp", coord->getType());
+                TIntermSymbol* rhsTmp1 = makeInternalVariableNode(loc, "storeTempPre",  objDerefType);
+                TIntermSymbol* rhsTmp2 = makeInternalVariableNode(loc, "storeTempPost", objDerefType);
+                TIntermTyped* coordTmp = makeInternalVariableNode(loc, "coordTemp", coord->getType());
 
                 makeBinary(EOpAssign, coordTmp, coord);            // coordtmp = load[param1]
                 makeLoad(rhsTmp1, object, coordTmp, objDerefType); // rhsTmp1 = OpImageLoad(object, coordTmp)
@@ -727,6 +722,18 @@ TIntermTyped* HlslParseContext::handleBracketDereference(const TSourceLoc& loc,
     if (result != nullptr)
         return result;  // it was handled as an operator[]
 
+    const TBasicType indexBasicType = index->getType().getBasicType();
+
+    // Cast index to unsigned integer if it isn't one.
+    if (indexBasicType != EbtInt && indexBasicType != EbtUint &&
+        indexBasicType != EbtInt64 && indexBasicType != EbtUint64)
+        index = intermediate.addConversion(EOpConstructUint, TType(EbtUint), index);
+
+    if (index == nullptr) {
+        error(loc, " unknown undex type ", "", "");
+        return nullptr;
+    }
+
     bool flattened = false;
     int indexValue = 0;
     if (index->getQualifier().storage == EvqConst) {
@@ -1047,6 +1054,8 @@ TType& HlslParseContext::split(TType& type, TString name, const TType* outerStru
             if (arraySizes)
                 ioVar->getWritableType().newArraySizes(*arraySizes);
 
+            fixBuiltInIoType(ioVar->getWritableType());
+
             interstageBuiltInIo[tInterstageIoData(memberType, *outerStructType)] = ioVar;
 
             // Merge qualifier from the user structure
@@ -1374,6 +1383,7 @@ TIntermTyped* HlslParseContext::splitAccessStruct(const TSourceLoc& loc, TInterm
 void HlslParseContext::trackLinkage(TSymbol& symbol)
 {
     TBuiltInVariable biType = symbol.getType().getQualifier().builtIn;
+
     if (biType != EbvNone)
         builtInLinkageSymbols[biType] = symbol.clone();
 
@@ -1381,6 +1391,50 @@ void HlslParseContext::trackLinkage(TSymbol& symbol)
 }
 
 
+// Some types require fixed array sizes in SPIR-V, but can be scalars or
+// arrays of sizes SPIR-V doesn't allow.  For example, tessellation factors.
+// This creates the right size.  A conversion is performed when the internal
+// type is copied to or from the external type.  This corrects the externally
+// facing input or output type to abide by downstream semantics.
+void HlslParseContext::fixBuiltInIoType(TType& type)
+{
+    int requiredArraySize = 0;
+
+    switch (type.getQualifier().builtIn) {
+    case EbvTessLevelOuter: requiredArraySize = 4; break;
+    case EbvTessLevelInner: requiredArraySize = 2; break;
+    case EbvClipDistance:   // TODO: ...
+    case EbvCullDistance:   // TODO: ...
+        return;
+    case EbvTessCoord:
+        {
+            // tesscoord is always a vec3 for the IO variable, no matter the shader's
+            // declared vector size.
+            TType tessCoordType(type.getBasicType(), type.getQualifier().storage, 3);
+
+            tessCoordType.getQualifier() = type.getQualifier();
+            type.shallowCopy(tessCoordType);
+
+            break;
+        }
+    default:
+        return;
+    }
+
+    // Alter or set array size as needed.
+    if (requiredArraySize > 0) {
+        if (type.isArray()) {
+            // Already an array.  Fix the size.
+            type.changeOuterArraySize(requiredArraySize);
+        } else {
+            // it wasn't an array, but needs to be.
+            TArraySizes arraySizes;
+            arraySizes.addInnerSize(requiredArraySize);
+            type.newArraySizes(arraySizes);
+        }
+    }
+}
+
 // Variables that correspond to the user-interface in and out of a stage
 // (not the built-in interface) are assigned locations and
 // registered as a linkage node (part of the stage's external interface).
@@ -1389,15 +1443,24 @@ void HlslParseContext::trackLinkage(TSymbol& symbol)
 void HlslParseContext::assignLocations(TVariable& variable)
 {
     const auto assignLocation = [&](TVariable& variable) {
-        const TQualifier& qualifier = variable.getType().getQualifier();
+        const TType& type = variable.getType();
+        const TQualifier& qualifier = type.getQualifier();
         if (qualifier.storage == EvqVaryingIn || qualifier.storage == EvqVaryingOut) {
             if (qualifier.builtIn == EbvNone) {
+                // Strip off the outer array dimension for those having an extra one.
+                int size;
+                if (type.isArray() && qualifier.isArrayedIo(language)) {
+                    TType elementType(type, 0);
+                    size = intermediate.computeTypeLocationSize(elementType);
+                } else
+                    size = intermediate.computeTypeLocationSize(type);
+
                 if (qualifier.storage == EvqVaryingIn) {
                     variable.getWritableType().getQualifier().layoutLocation = nextInLocation;
-                    nextInLocation += intermediate.computeTypeLocationSize(variable.getType());
+                    nextInLocation += size;
                 } else {
                     variable.getWritableType().getQualifier().layoutLocation = nextOutLocation;
-                    nextOutLocation += intermediate.computeTypeLocationSize(variable.getType());
+                    nextOutLocation += size;
                 }
             }
 
@@ -1559,48 +1622,10 @@ TIntermAggregate* HlslParseContext::handleFunctionDefinition(const TSourceLoc& l
     return paramNodes;
 }
 
-//
-// Do all special handling for the entry point, including wrapping
-// the shader's entry point with the official entry point that will call it.
-//
-// The following:
-//
-//    retType shaderEntryPoint(args...) // shader declared entry point
-//    { body }
-//
-// Becomes
-//
-//    out retType ret;
-//    in iargs<that are input>...;
-//    out oargs<that are output> ...;
-//
-//    void shaderEntryPoint()    // synthesized, but official, entry point
-//    {
-//        args<that are input> = iargs...;
-//        ret = @shaderEntryPoint(args...);
-//        oargs = args<that are output>...;
-//    }
-//
-// The symbol table will still map the original entry point name to the
-// the modified function and it's new name:
-//
-//    symbol table:  shaderEntryPoint  ->   @shaderEntryPoint
-//
-// Returns nullptr if no entry-point tree was built, otherwise, returns
-// a subtree that creates the entry point.
-//
-TIntermNode* HlslParseContext::transformEntryPoint(const TSourceLoc& loc, TFunction& userFunction, const TAttributeMap& attributes)
+
+// Handle all [attrib] attributes for the shader entry point
+void HlslParseContext::handleEntryPointAttributes(const TSourceLoc& loc, const TAttributeMap& attributes)
 {
-    // if we aren't in the entry point, fix the IO as such and exit
-    if (userFunction.getName().compare(intermediate.getEntryPointName().c_str()) != 0) {
-        remapNonEntryPointIO(userFunction);
-        return nullptr;
-    }
-
-    entryPointFunction = &userFunction; // needed in finish()
-
-    // entry point logic...
-
     // Handle entry-point function attributes
     const TIntermAggregate* numThreads = attributes[EatNumThreads];
     if (numThreads != nullptr) {
@@ -1652,8 +1677,12 @@ TIntermNode* HlslParseContext::transformEntryPoint(const TSourceLoc& loc, TFunct
                 error(loc, "unsupported domain type", domainStr.c_str(), "");
             }
 
-            if (! intermediate.setInputPrimitive(domain)) {
-                error(loc, "cannot change previously set domain", TQualifier::getGeometryString(domain), "");
+            if (language == EShLangTessEvaluation) {
+                if (! intermediate.setInputPrimitive(domain))
+                    error(loc, "cannot change previously set domain", TQualifier::getGeometryString(domain), "");
+            } else {
+                if (! intermediate.setOutputPrimitive(domain))
+                    error(loc, "cannot change previously set domain", TQualifier::getGeometryString(domain), "");
             }
         }
     }
@@ -1731,6 +1760,61 @@ TIntermNode* HlslParseContext::transformEntryPoint(const TSourceLoc& loc, TFunct
             }
         }
     }
+}
+
+//
+// Do all special handling for the entry point, including wrapping
+// the shader's entry point with the official entry point that will call it.
+//
+// The following:
+//
+//    retType shaderEntryPoint(args...) // shader declared entry point
+//    { body }
+//
+// Becomes
+//
+//    out retType ret;
+//    in iargs<that are input>...;
+//    out oargs<that are output> ...;
+//
+//    void shaderEntryPoint()    // synthesized, but official, entry point
+//    {
+//        args<that are input> = iargs...;
+//        ret = @shaderEntryPoint(args...);
+//        oargs = args<that are output>...;
+//    }
+//
+// The symbol table will still map the original entry point name to
+// the modified function and its new name:
+//
+//    symbol table:  shaderEntryPoint  ->   @shaderEntryPoint
+//
+// Returns nullptr if no entry-point tree was built, otherwise, returns
+// a subtree that creates the entry point.
+//
+TIntermNode* HlslParseContext::transformEntryPoint(const TSourceLoc& loc, TFunction& userFunction, const TAttributeMap& attributes)
+{
+    // Return true if this is a tessellation patch constant function input to a domain shader.
+    const auto isDsPcfInput = [this](const TType& type) {
+        return language == EShLangTessEvaluation &&
+        type.contains([](const TType* t) {
+                return t->getQualifier().builtIn == EbvTessLevelOuter ||
+                t->getQualifier().builtIn == EbvTessLevelInner;
+            });
+    };
+
+    // if we aren't in the entry point, fix the IO as such and exit
+    if (userFunction.getName().compare(intermediate.getEntryPointName().c_str()) != 0) {
+        remapNonEntryPointIO(userFunction);
+        return nullptr;
+    }
+
+    entryPointFunction = &userFunction; // needed in finish()
+
+    // Handle entry point attributes
+    handleEntryPointAttributes(loc, attributes);
+
+    // entry point logic...
 
     // Move parameters and return value to shader in/out
     TVariable* entryPointOutput; // gets created in remapEntryPointIO
@@ -1750,15 +1834,28 @@ TIntermNode* HlslParseContext::transformEntryPoint(const TSourceLoc& loc, TFunct
             else if (variable.getType().containsBuiltInInterstageIO(language))
                 split(variable);
         }
+
         assignLocations(variable);
     };
     if (entryPointOutput)
         makeVariableInOut(*entryPointOutput);
     for (auto it = inputs.begin(); it != inputs.end(); ++it)
-        makeVariableInOut(*(*it));
+        if (!isDsPcfInput((*it)->getType()))  // skip domain shader PCF input (see comment below)
+            makeVariableInOut(*(*it));
     for (auto it = outputs.begin(); it != outputs.end(); ++it)
         makeVariableInOut(*(*it));
 
+    // In the domain shader, PCF input must be at the end of the linkage.  That's because in the
+    // hull shader there is no ordering: the output comes from the separate PCF, which does not
+    // participate in the argument list.  That is always put at the end of the HS linkage, so the
+    // input side of the DS must match.  The argument may be in any position in the DS argument list
+    // however, so this ensures the linkage is built in the correct order regardless of argument order.
+    if (language == EShLangTessEvaluation) {
+        for (auto it = inputs.begin(); it != inputs.end(); ++it)
+            if (isDsPcfInput((*it)->getType()))  // now add the deferred domain shader PCF input (see comment above)
+                makeVariableInOut(*(*it));
+    }
+
     // Synthesize the call
 
     pushScope(); // matches the one in handleFunctionBody()
@@ -1780,11 +1877,15 @@ TIntermNode* HlslParseContext::transformEntryPoint(const TSourceLoc& loc, TFunct
     TIntermAggregate* synthBody = new TIntermAggregate();
     auto inputIt = inputs.begin();
     TIntermTyped* callingArgs = nullptr;
+
     for (int i = 0; i < userFunction.getParamCount(); i++) {
         TParameter& param = userFunction[i];
         argVars.push_back(makeInternalVariable(*param.name, *param.type));
+
         argVars.back()->getWritableType().getQualifier().makeTemporary();
+
         TIntermSymbol* arg = intermediate.addSymbol(*argVars.back());
+
         handleFunctionArgument(&callee, callingArgs, arg);
         if (param.type->getQualifier().isParamInput()) {
             intermediate.growAggregate(synthBody, handleAssign(loc, EOpAssign, arg,
@@ -1799,20 +1900,60 @@ TIntermNode* HlslParseContext::transformEntryPoint(const TSourceLoc& loc, TFunct
     currentCaller = userFunction.getMangledName();
 
     // Return value
-    if (entryPointOutput)
-        intermediate.growAggregate(synthBody, handleAssign(loc, EOpAssign,
-                                                           intermediate.addSymbol(*entryPointOutput), callReturn));
-    else
+    if (entryPointOutput) {
+        TIntermTyped* returnAssign;
+
+        // For hull shaders, the wrapped entry point return value is written to
+        // an array element as indexed by invocation ID, which we might have to make up.
+        // This is required to match SPIR-V semantics.
+        if (language == EShLangTessControl) {
+            TIntermSymbol* invocationIdSym = findLinkageSymbol(EbvInvocationId);
+
+            // If there is no user declared invocation ID, we must make one.
+            if (invocationIdSym == nullptr) {
+                TType invocationIdType(EbtUint, EvqIn, 1);
+                TString* invocationIdName = NewPoolTString("InvocationId");
+                invocationIdType.getQualifier().builtIn = EbvInvocationId;
+
+                TVariable* variable = makeInternalVariable(*invocationIdName, invocationIdType);
+
+                globalQualifierFix(loc, variable->getWritableType().getQualifier());
+                trackLinkage(*variable);
+
+                invocationIdSym = intermediate.addSymbol(*variable);
+            }
+
+            TIntermTyped* element = intermediate.addIndex(EOpIndexIndirect, intermediate.addSymbol(*entryPointOutput),
+                                                          invocationIdSym, loc);
+            element->setType(callReturn->getType());
+
+            returnAssign = handleAssign(loc, EOpAssign, element, callReturn);
+        } else {
+            returnAssign = handleAssign(loc, EOpAssign, intermediate.addSymbol(*entryPointOutput), callReturn);
+        }
+        
+        intermediate.growAggregate(synthBody, returnAssign);
+    } else
         intermediate.growAggregate(synthBody, callReturn);
 
     // Output copies
     auto outputIt = outputs.begin();
     for (int i = 0; i < userFunction.getParamCount(); i++) {
         TParameter& param = userFunction[i];
+
+        // GS outputs are via emit, so we do not copy them here.
         if (param.type->getQualifier().isParamOutput()) {
-            intermediate.growAggregate(synthBody, handleAssign(loc, EOpAssign,
-                                                               intermediate.addSymbol(**outputIt),
-                                                               intermediate.addSymbol(*argVars[i])));
+            if (param.declaredBuiltIn == EbvGsOutputStream) {
+                // GS output stream does not assign outputs here: it's the Append() method
+                // which writes to the output, probably multiple times separated by Emit.
+                // We merely remember the output to use, here.
+                gsStreamOutput = *outputIt;
+            } else {
+                intermediate.growAggregate(synthBody, handleAssign(loc, EOpAssign,
+                                                                   intermediate.addSymbol(**outputIt),
+                                                                   intermediate.addSymbol(*argVars[i])));
+            }
+
             outputIt++;
         }
     }
@@ -1862,19 +2003,45 @@ void HlslParseContext::remapEntryPointIO(TFunction& function, TVariable*& return
                     ioVariable->getWritableType().setStruct(newLists->second.output);
             }
         }
-        if (storage == EvqVaryingIn)
+        if (storage == EvqVaryingIn) {
             correctInput(ioVariable->getWritableType().getQualifier());
-        else
+            if (language == EShLangTessEvaluation)
+                if (!ioVariable->getType().isArray())
+                    ioVariable->getWritableType().getQualifier().patch = true;
+        } else {
             correctOutput(ioVariable->getWritableType().getQualifier());
+        }
         ioVariable->getWritableType().getQualifier().storage = storage;
+
+        fixBuiltInIoType(ioVariable->getWritableType());
+
         return ioVariable;
     };
 
     // return value is actually a shader-scoped output (out)
-    if (function.getType().getBasicType() == EbtVoid)
+    if (function.getType().getBasicType() == EbtVoid) {
         returnValue = nullptr;
-    else
-        returnValue = makeIoVariable("@entryPointOutput", function.getWritableType(), EvqVaryingOut);
+    } else {
+        if (language == EShLangTessControl) {
+            // tessellation control (hull shader) in HLSL writes a per-ctrl-pt value, but it needs to be an
+            // array in SPIR-V semantics.  We'll write to it indexed by invocation ID.
+
+            returnValue = makeIoVariable("@entryPointOutput", function.getWritableType(), EvqVaryingOut);
+
+            TType outputType;
+            outputType.shallowCopy(function.getType());
+
+            // vertices has necessarily already been set when handling entry point attributes.
+            TArraySizes arraySizes;
+            arraySizes.addInnerSize(intermediate.getVertices());
+            outputType.newArraySizes(arraySizes);
+
+            clearUniformInputOutput(function.getWritableType().getQualifier());
+            returnValue = makeIoVariable("@entryPointOutput", outputType, EvqVaryingOut);
+        } else {
+            returnValue = makeIoVariable("@entryPointOutput", function.getWritableType(), EvqVaryingOut);
+        }
+    }
 
     // parameters are actually shader-scoped inputs and outputs (in or out)
     for (int i = 0; i < function.getParamCount(); i++) {
@@ -1882,6 +2049,9 @@ void HlslParseContext::remapEntryPointIO(TFunction& function, TVariable*& return
         if (paramType.getQualifier().isParamInput()) {
             TVariable* argAsGlobal = makeIoVariable(function[i].name->c_str(), paramType, EvqVaryingIn);
             inputs.push_back(argAsGlobal);
+
+            if (function[i].declaredBuiltIn == EbvInputPatch)
+                inputPatch = argAsGlobal;
         }
         if (paramType.getQualifier().isParamOutput()) {
             TVariable* argAsGlobal = makeIoVariable(function[i].name->c_str(), paramType, EvqVaryingOut);
@@ -2031,7 +2201,11 @@ TIntermTyped* HlslParseContext::handleAssign(const TSourceLoc& loc, TOperator op
         const bool split          = isLeft ? isSplitLeft : isSplitRight;
         const TIntermTyped* outer = isLeft ? outerLeft   : outerRight;
         const TVector<TVariable*>& flatVariables      = isLeft ? *leftVariables : *rightVariables;
-        const TOperator op = node->getType().isArray() ? EOpIndexDirect : EOpIndexDirectStruct;
+
+        // Index operator if it's an aggregate, else EOpNull
+        const TOperator op = node->getType().isArray()  ? EOpIndexDirect : 
+                             node->getType().isStruct() ? EOpIndexDirectStruct : EOpNull;
+
         const TType derefType(node->getType(), member);
 
         if (split && derefType.isBuiltInInterstageIO(language)) {
@@ -2047,10 +2221,14 @@ TIntermTyped* HlslParseContext::handleAssign(const TSourceLoc& loc, TOperator op
         } else if (flattened && isFinalFlattening(derefType)) {
             subTree = intermediate.addSymbol(*flatVariables[memberIdx++]);
         } else {
-            const TType splitDerefType(splitNode->getType(), splitMember);
+            if (op == EOpNull) {
+                subTree = splitNode;
+            } else {
+                const TType splitDerefType(splitNode->getType(), splitMember);
 
-            subTree = intermediate.addIndex(op, splitNode, intermediate.addConstantUnion(splitMember, loc), loc);
-            subTree->setType(splitDerefType);
+                subTree = intermediate.addIndex(op, splitNode, intermediate.addConstantUnion(splitMember, loc), loc);
+                subTree->setType(splitDerefType);
+            }
         }
 
         return subTree;
@@ -2069,11 +2247,15 @@ TIntermTyped* HlslParseContext::handleAssign(const TSourceLoc& loc, TOperator op
         // If we get here, we are assigning to or from a whole array or struct that must be
         // flattened, so have to do member-by-member assignment:
 
-        if (left->getType().isArray()) {
-            const TType dereferencedType(left->getType(), 0);
+        if (left->getType().isArray() || right->getType().isArray()) {
+            const int elementsL = left->getType().isArray() ? left->getType().getOuterArraySize() : 1;
+            const int elementsR = right->getType().isArray() ? right->getType().getOuterArraySize() : 1;
+
+            // The arrays may not be the same size, e.g., if the size has been forced for EbvTessLevelInner or Outer.
+            const int elementsToCopy = std::min(elementsL, elementsR);
 
             // array case
-            for (int element=0; element < left->getType().getOuterArraySize(); ++element) {
+            for (int element=0; element < elementsToCopy; ++element) {
                 arrayElement.push_back(element);
 
                 // Add a new AST symbol node if we have a temp variable holding a complex RHS.
@@ -2083,10 +2265,7 @@ TIntermTyped* HlslParseContext::handleAssign(const TSourceLoc& loc, TOperator op
                 TIntermTyped* subSplitLeft =  isSplitLeft  ? getMember(true,  left,  element, splitLeft, element) : subLeft;
                 TIntermTyped* subSplitRight = isSplitRight ? getMember(false, right, element, splitRight, element) : subRight; 
 
-                if (isFinalFlattening(dereferencedType))
-                    assignList = intermediate.growAggregate(assignList, intermediate.addAssign(op, subLeft, subRight, loc), loc);
-                else
-                    traverse(subLeft, subRight, subSplitLeft, subSplitRight);
+                traverse(subLeft, subRight, subSplitLeft, subSplitRight);
 
                 arrayElement.pop_back();
             }
@@ -2120,8 +2299,8 @@ TIntermTyped* HlslParseContext::handleAssign(const TSourceLoc& loc, TOperator op
                 // subtree here IFF it does not itself contain any interstage built-in IO variables, so we only have to
                 // recurse into it if there's something for splitting to do.  That can save a lot of AST verbosity for
                 // a bunch of memberwise copies.
-                if (isFinalFlattening(typeL) || (!isFlattenLeft && !isFlattenRight &&
-                                                 !typeL.containsBuiltInInterstageIO(language) && !typeR.containsBuiltInInterstageIO(language))) {
+                if ((!isFlattenLeft && !isFlattenRight &&
+                     !typeL.containsBuiltInInterstageIO(language) && !typeR.containsBuiltInInterstageIO(language))) {
                     assignList = intermediate.growAggregate(assignList, intermediate.addAssign(op, subSplitLeft, subSplitRight, loc), loc);
                 } else {
                     traverse(subLeft, subRight, subSplitLeft, subSplitRight);
@@ -2131,8 +2310,8 @@ TIntermTyped* HlslParseContext::handleAssign(const TSourceLoc& loc, TOperator op
                 memberR += (typeR.isBuiltInInterstageIO(language) ? 0 : 1);
             }
         } else {
-            assert(0);  // we should never be called on a non-flattenable thing, because
-                        // that case bails out above to a simple copy.
+            // Member copy
+            assignList = intermediate.growAggregate(assignList, intermediate.addAssign(op, left, right, loc), loc);
         }
 
     };
@@ -2272,6 +2451,9 @@ void HlslParseContext::decomposeStructBufferMethods(const TSourceLoc& loc, TInte
     if (argAggregate == nullptr)
         return;
 
+    if (argAggregate->getSequence().empty())
+        return;
+
     // Buffer is the object upon which method is called, so always arg 0
     TIntermTyped* bufferObj = argAggregate->getSequence()[0]->getAsTyped();
 
@@ -3164,9 +3346,15 @@ void HlslParseContext::decomposeGeometryMethods(const TSourceLoc& loc, TIntermTy
             emit->setLoc(loc);
             emit->setType(TType(EbtVoid));
 
+            // find the matching output
+            if (gsStreamOutput == nullptr) {
+                error(loc, "unable to find output symbol for Append()", "", "");
+                return;
+            }
+
             sequence = intermediate.growAggregate(sequence,
                                                   handleAssign(loc, EOpAssign,
-                                                               argAggregate->getSequence()[0]->getAsTyped(),
+                                                               intermediate.addSymbol(*gsStreamOutput, loc),
                                                                argAggregate->getSequence()[1]->getAsTyped()),
                                                   loc);
 
@@ -3702,6 +3890,48 @@ void HlslParseContext::decomposeIntrinsic(const TSourceLoc& loc, TIntermTyped*&
             break;
         }
 
+    case EOpIsFinite:
+        {
+            // Since OpIsFinite in SPIR-V is only supported with the Kernel capability, we translate
+            // it to !isnan && !isinf
+
+            TIntermTyped* arg0 = node->getAsUnaryNode()->getOperand();
+
+            // We'll make a temporary in case the RHS is complex
+            TVariable* tempArg = makeInternalVariable("@finitetmp", arg0->getType());
+            tempArg->getWritableType().getQualifier().makeTemporary();
+
+            TIntermTyped* tmpArgAssign = intermediate.addAssign(EOpAssign,
+                                                                intermediate.addSymbol(*tempArg, loc),
+                                                                arg0, loc);
+
+            TIntermAggregate* compoundStatement = intermediate.makeAggregate(tmpArgAssign, loc);
+
+            TIntermTyped* isnan = handleUnaryMath(loc, "isnan", EOpIsNan, intermediate.addSymbol(*tempArg, loc));
+            isnan->setType(TType(EbtBool));
+
+            TIntermTyped* notnan = handleUnaryMath(loc, "!", EOpLogicalNot, isnan);
+            notnan->setType(TType(EbtBool));
+
+            TIntermTyped* isinf = handleUnaryMath(loc, "isinf", EOpIsInf, intermediate.addSymbol(*tempArg, loc));
+            isinf->setType(TType(EbtBool));
+
+            TIntermTyped* notinf = handleUnaryMath(loc, "!", EOpLogicalNot, isinf);
+            notinf->setType(TType(EbtBool));
+            
+            TIntermTyped* andNode = handleBinaryMath(loc, "and", EOpLogicalAnd, notnan, notinf);
+            andNode->setType(TType(EbtBool));
+
+            compoundStatement = intermediate.growAggregate(compoundStatement, andNode);
+            compoundStatement->setOperator(EOpSequence);
+            compoundStatement->setLoc(loc);
+            compoundStatement->setType(TType(EbtVoid));
+
+            node = compoundStatement;
+
+            break;
+        }
+        
     default:
         break; // most pass through unchanged
     }
@@ -3732,7 +3962,7 @@ TIntermTyped* HlslParseContext::handleFunctionCall(const TSourceLoc& loc, TFunct
             //
             // It's a constructor, of type 'type'.
             //
-            result = addConstructor(loc, arguments, type);
+            result = handleConstructor(loc, arguments, type);
             if (result == nullptr)
                 error(loc, "cannot construct with these arguments", type.getCompleteString().c_str(), "");
         }
@@ -3747,7 +3977,9 @@ TIntermTyped* HlslParseContext::handleFunctionCall(const TSourceLoc& loc, TFunct
         // the symbol table for an arbitrary type.  This is a temporary hack until that ability exists.
         // It will have false positives, since it doesn't check arg counts or types.
         if (arguments && arguments->getAsAggregate()) {
-            if (isStructBufferType(arguments->getAsAggregate()->getSequence()[0]->getAsTyped()->getType())) {
+            const TIntermSequence& sequence = arguments->getAsAggregate()->getSequence();
+
+            if (!sequence.empty() && isStructBufferType(sequence[0]->getAsTyped()->getType())) {
                 static const int methodPrefixSize = sizeof(BUILTIN_PREFIX)-1;
 
                 if (function->getName().length() > methodPrefixSize &&
@@ -4141,9 +4373,13 @@ void HlslParseContext::builtInOpCheck(const TSourceLoc& loc, const TFunction& fn
 }
 
 //
-// Handle seeing a built-in constructor in a grammar production.
+// Handle seeing something in a grammar production that can be done by calling
+// a constructor.
 //
-TFunction* HlslParseContext::handleConstructorCall(const TSourceLoc& loc, const TType& type)
+// The constructor still must be "handled" by handleFunctionCall(), which will
+// then call handleConstructor().
+//
+TFunction* HlslParseContext::makeConstructorCall(const TSourceLoc& loc, const TType& type)
 {
     TOperator op = intermediate.mapTypeToConstructorOp(type);
 
@@ -4173,6 +4409,10 @@ void HlslParseContext::handleSemantic(TSourceLoc loc, TQualifier& qualifier, TBu
     case EbvStencilRef:
         error(loc, "unimplemented; need ARB_shader_stencil_export", "SV_STENCILREF", "");
         break;
+    case EbvTessLevelInner:
+    case EbvTessLevelOuter:
+        qualifier.patch = true;
+        break;
     default:
         break;
     }
@@ -4285,6 +4525,18 @@ void HlslParseContext::handleRegister(const TSourceLoc& loc, TQualifier& qualifi
     }
 }
 
+// Convert to a scalar boolean, or if not allowed by HLSL semantics,
+// report an error and return nullptr.
+TIntermTyped* HlslParseContext::convertConditionalExpression(const TSourceLoc& loc, TIntermTyped* condition)
+{
+    if (!condition->getType().isScalarOrVec1()) {
+        error(loc, "requires a scalar", "conditional expression", "");
+        return nullptr;
+    }
+
+    return intermediate.addConversion(EOpConstructBool, TType(EbtBool), condition);
+}
+
 //
 // Same error message for all places assignments don't work.
 //
@@ -4510,7 +4762,7 @@ bool HlslParseContext::constructorError(const TSourceLoc& loc, TIntermNode* node
         return true;
     }
 
-    if (op == EOpConstructStruct && ! type.isArray() && isZeroConstructor(node))
+    if (op == EOpConstructStruct && ! type.isArray() && isScalarConstructor(node))
         return false;
 
     if (op == EOpConstructStruct && ! type.isArray() && (int)type.getStruct()->size() != function.getParamCount()) {
@@ -4527,10 +4779,21 @@ bool HlslParseContext::constructorError(const TSourceLoc& loc, TIntermNode* node
     return false;
 }
 
-bool HlslParseContext::isZeroConstructor(const TIntermNode* node)
+// See if 'node', in the context of constructing aggregates, is a scalar argument
+// to a constructor.
+//
+bool HlslParseContext::isScalarConstructor(const TIntermNode* node)
 {
-    return node->getAsTyped()->isScalar() && node->getAsConstantUnion() &&
-           node->getAsConstantUnion()->getConstArray()[0].getIConst() == 0;
+    // Obviously, it must be a scalar, but an aggregate node might not be fully
+    // completed yet: holding a sequence of initializers under an aggregate
+    // would not yet be typed, so don't check its type.  This corresponds to
+    // the aggregate operator also not being set yet. (An aggregate operation
+    // that legitimately yields a scalar will have a getOp() of that operator,
+    // not EOpNull.)
+
+    return node->getAsTyped() != nullptr &&
+           node->getAsTyped()->isScalar() &&
+           (node->getAsAggregate() == nullptr || node->getAsAggregate()->getOp() != EOpNull);
 }
 
 // Verify all the correct semantics for constructing a combined texture/sampler.
@@ -4607,13 +4870,6 @@ bool HlslParseContext::voidErrorCheck(const TSourceLoc& loc, const TString& iden
     return false;
 }
 
-// Checks to see if the node (for the expression) contains a scalar boolean expression or not
-void HlslParseContext::boolCheck(const TSourceLoc& loc, const TIntermTyped* type)
-{
-    if (type->getBasicType() != EbtBool || type->isArray() || type->isMatrix() || type->isVector())
-        error(loc, "boolean expression expected", "", "");
-}
-
 //
 // Fix just a full qualifier (no variables or types yet, but qualifier is complete) at global level.
 //
@@ -4759,7 +5015,7 @@ void HlslParseContext::arrayDimMerge(TType& type, const TArraySizes* sizes)
 // Do all the semantic checking for declaring or redeclaring an array, with and
 // without a size, and make the right changes to the symbol table.
 //
-void HlslParseContext::declareArray(const TSourceLoc& loc, TString& identifier, const TType& type, TSymbol*& symbol, bool track)
+void HlslParseContext::declareArray(const TSourceLoc& loc, const TString& identifier, const TType& type, TSymbol*& symbol, bool track)
 {
     if (! symbol) {
         bool currentScope;
@@ -4857,7 +5113,7 @@ void HlslParseContext::updateImplicitArraySize(const TSourceLoc& loc, TIntermNod
 //
 // Enforce non-initializer type/qualifier rules.
 //
-void HlslParseContext::fixConstInit(const TSourceLoc& loc, TString& identifier, TType& type, TIntermTyped*& initializer)
+void HlslParseContext::fixConstInit(const TSourceLoc& loc, const TString& identifier, TType& type, TIntermTyped*& initializer)
 {
     //
     // Make the qualifier make sense, given that there is an initializer.
@@ -5846,7 +6102,7 @@ const TFunction* HlslParseContext::findFunction(const TSourceLoc& loc, TFunction
 // 'parseType' is the type part of the declaration (to the left)
 // 'arraySizes' is the arrayness tagged on the identifier (to the right)
 //
-void HlslParseContext::declareTypedef(const TSourceLoc& loc, TString& identifier, const TType& parseType)
+void HlslParseContext::declareTypedef(const TSourceLoc& loc, const TString& identifier, const TType& parseType)
 {
     TVariable* typeSymbol = new TVariable(&identifier, parseType, true);
     if (! symbolTable.insert(*typeSymbol))
@@ -5977,7 +6233,7 @@ TSymbol* HlslParseContext::lookupUserType(const TString& typeName, TType& type)
 // 'parseType' is the type part of the declaration (to the left)
 // 'arraySizes' is the arrayness tagged on the identifier (to the right)
 //
-TIntermNode* HlslParseContext::declareVariable(const TSourceLoc& loc, TString& identifier, TType& type, TIntermTyped* initializer)
+TIntermNode* HlslParseContext::declareVariable(const TSourceLoc& loc, const TString& identifier, TType& type, TIntermTyped* initializer)
 {
     if (voidErrorCheck(loc, identifier, type.getBasicType()))
         return nullptr;
@@ -6073,13 +6329,22 @@ TVariable* HlslParseContext::makeInternalVariable(const char* name, const TType&
     return variable;
 }
 
+// Make a symbol node holding a new internal temporary variable.
+TIntermSymbol* HlslParseContext::makeInternalVariableNode(const TSourceLoc& loc, const char* name, const TType& type) const
+{
+    TVariable* tmpVar = makeInternalVariable(name, type);
+    tmpVar->getWritableType().getQualifier().makeTemporary();
+
+    return intermediate.addSymbol(*tmpVar, loc);
+}
+
 //
 // Declare a non-array variable, the main point being there is no redeclaration
 // for resizing allowed.
 //
 // Return the successfully declared variable.
 //
-TVariable* HlslParseContext::declareNonArray(const TSourceLoc& loc, TString& identifier, TType& type, bool track)
+TVariable* HlslParseContext::declareNonArray(const TSourceLoc& loc, const TString& identifier, const TType& type, bool track)
 {
     // make a new variable
     TVariable* variable = new TVariable(&identifier, type);
@@ -6123,7 +6388,7 @@ TIntermNode* HlslParseContext::executeInitializer(const TSourceLoc& loc, TInterm
     skeletalType.shallowCopy(variable->getType());
     skeletalType.getQualifier().makeTemporary();
     if (initializer->getAsAggregate() && initializer->getAsAggregate()->getOp() == EOpNull)
-        initializer = convertInitializerList(loc, skeletalType, initializer);
+        initializer = convertInitializerList(loc, skeletalType, initializer, nullptr);
     if (! initializer) {
         // error recovery; don't leave const without constant values
         if (qualifier == EvqConst)
@@ -6159,7 +6424,7 @@ TIntermNode* HlslParseContext::executeInitializer(const TSourceLoc& loc, TInterm
         return nullptr;
     }
 
-    // Const variables require a constant initializer, depending on version
+    // Const variables require a constant initializer
     if (qualifier == EvqConst) {
         if (initializer->getType().getQualifier().storage != EvqConst) {
             variable->getWritableType().getQualifier().storage = EvqConstReadOnly;
@@ -6205,7 +6470,8 @@ TIntermNode* HlslParseContext::executeInitializer(const TSourceLoc& loc, TInterm
 //
 // Returns nullptr if there is an error.
 //
-TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, const TType& type, TIntermTyped* initializer)
+TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, const TType& type,
+                                                       TIntermTyped* initializer, TIntermTyped* scalarInit)
 {
     // Will operate recursively.  Once a subtree is found that is constructor style,
     // everything below it is already good: Only the "top part" of the initializer
@@ -6251,12 +6517,12 @@ TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, co
         }
 
         // lengthen list to be long enough
-        lengthenList(loc, initList->getSequence(), arrayType.getOuterArraySize());
+        lengthenList(loc, initList->getSequence(), arrayType.getOuterArraySize(), scalarInit);
 
         // recursively process each element
         TType elementType(arrayType, 0); // dereferenced type
         for (int i = 0; i < arrayType.getOuterArraySize(); ++i) {
-            initList->getSequence()[i] = convertInitializerList(loc, elementType, initList->getSequence()[i]->getAsTyped());
+            initList->getSequence()[i] = convertInitializerList(loc, elementType, initList->getSequence()[i]->getAsTyped(), scalarInit);
             if (initList->getSequence()[i] == nullptr)
                 return nullptr;
         }
@@ -6264,14 +6530,14 @@ TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, co
         return addConstructor(loc, initList, arrayType);
     } else if (type.isStruct()) {
         // lengthen list to be long enough
-        lengthenList(loc, initList->getSequence(), static_cast<int>(type.getStruct()->size()));
+        lengthenList(loc, initList->getSequence(), static_cast<int>(type.getStruct()->size()), scalarInit);
 
         if (type.getStruct()->size() != initList->getSequence().size()) {
             error(loc, "wrong number of structure members", "initializer list", "");
             return nullptr;
         }
         for (size_t i = 0; i < type.getStruct()->size(); ++i) {
-            initList->getSequence()[i] = convertInitializerList(loc, *(*type.getStruct())[i].type, initList->getSequence()[i]->getAsTyped());
+            initList->getSequence()[i] = convertInitializerList(loc, *(*type.getStruct())[i].type, initList->getSequence()[i]->getAsTyped(), scalarInit);
             if (initList->getSequence()[i] == nullptr)
                 return nullptr;
         }
@@ -6282,7 +6548,7 @@ TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, co
             // a constructor; no further processing needed.
         } else {
             // lengthen list to be long enough
-            lengthenList(loc, initList->getSequence(), type.getMatrixCols());
+            lengthenList(loc, initList->getSequence(), type.getMatrixCols(), scalarInit);
 
             if (type.getMatrixCols() != (int)initList->getSequence().size()) {
                 error(loc, "wrong number of matrix columns:", "initializer list", type.getCompleteString().c_str());
@@ -6290,14 +6556,14 @@ TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, co
             }
             TType vectorType(type, 0); // dereferenced type
             for (int i = 0; i < type.getMatrixCols(); ++i) {
-                initList->getSequence()[i] = convertInitializerList(loc, vectorType, initList->getSequence()[i]->getAsTyped());
+                initList->getSequence()[i] = convertInitializerList(loc, vectorType, initList->getSequence()[i]->getAsTyped(), scalarInit);
                 if (initList->getSequence()[i] == nullptr)
                     return nullptr;
             }
         }
     } else if (type.isVector()) {
         // lengthen list to be long enough
-        lengthenList(loc, initList->getSequence(), type.getVectorSize());
+        lengthenList(loc, initList->getSequence(), type.getVectorSize(), scalarInit);
 
         // error check; we're at bottom, so work is finished below
         if (type.getVectorSize() != (int)initList->getSequence().size()) {
@@ -6306,7 +6572,7 @@ TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, co
         }
     } else if (type.isScalar()) {
         // lengthen list to be long enough
-        lengthenList(loc, initList->getSequence(), 1);
+        lengthenList(loc, initList->getSequence(), 1, scalarInit);
 
         if ((int)initList->getSequence().size() != 1) {
             error(loc, "scalar expected one element:", "initializer list", type.getCompleteString().c_str());
@@ -6319,9 +6585,9 @@ TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, co
 
     // Now that the subtree is processed, process this node as if the
     // initializer list is a set of arguments to a constructor.
-    TIntermNode* emulatedConstructorArguments;
+    TIntermTyped* emulatedConstructorArguments;
     if (initList->getSequence().size() == 1)
-        emulatedConstructorArguments = initList->getSequence()[0];
+        emulatedConstructorArguments = initList->getSequence()[0]->getAsTyped();
     else
         emulatedConstructorArguments = initList;
 
@@ -6331,10 +6597,21 @@ TIntermTyped* HlslParseContext::convertInitializerList(const TSourceLoc& loc, co
 // Lengthen list to be long enough to cover any gap from the current list size
 // to 'size'. If the list is longer, do nothing.
 // The value to lengthen with is the default for short lists.
-void HlslParseContext::lengthenList(const TSourceLoc& loc, TIntermSequence& list, int size)
+//
+// By default, lists that are too short due to lack of initializers initialize to zero.
+// Alternatively, it could be a scalar initializer for a structure. Both cases are handled,
+// based on whether something is passed in as 'scalarInit'.
+//
+// 'scalarInit' must be safe to use each time this is called (no replication of side effects).
+//
+void HlslParseContext::lengthenList(const TSourceLoc& loc, TIntermSequence& list, int size, TIntermTyped* scalarInit)
 {
-    for (int c = (int)list.size(); c < size; ++c)
-        list.push_back(intermediate.addConstantUnion(0, loc));
+    for (int c = (int)list.size(); c < size; ++c) {
+        if (scalarInit == nullptr)
+            list.push_back(intermediate.addConstantUnion(0, loc));
+        else
+            list.push_back(scalarInit);
+    }
 }
 
 //
@@ -6343,15 +6620,38 @@ void HlslParseContext::lengthenList(const TSourceLoc& loc, TIntermSequence& list
 //
 // Returns nullptr for an error or the constructed node (aggregate or typed) for no error.
 //
-TIntermTyped* HlslParseContext::addConstructor(const TSourceLoc& loc, TIntermNode* node, const TType& type)
+TIntermTyped* HlslParseContext::handleConstructor(const TSourceLoc& loc, TIntermTyped* node, const TType& type)
 {
-    if (node == nullptr || node->getAsTyped() == nullptr)
+    if (node == nullptr)
         return nullptr;
 
-    // Handle the idiom "(struct type)0"
-    if (type.isStruct() && isZeroConstructor(node))
-        return convertInitializerList(loc, type, intermediate.makeAggregate(loc));
+    // Handle the idiom "(struct type)<scalar value>"
+    if (type.isStruct() && isScalarConstructor(node)) {
+        // 'node' will almost always get used multiple times, so it should not be used directly;
+        // it would create a DAG instead of a tree, which might be okay (would
+        // like to formalize that for constants and symbols), but if it has
+        // side effects, they would get executed multiple times, which is not okay.
+        if (node->getAsConstantUnion() == nullptr && node->getAsSymbolNode() == nullptr) {
+            TIntermAggregate* seq = intermediate.makeAggregate(loc);
+            TIntermSymbol* copy = makeInternalVariableNode(loc, "scalarCopy", node->getType());
+            seq = intermediate.growAggregate(seq, intermediate.addBinaryNode(EOpAssign, copy, node, loc));
+            seq = intermediate.growAggregate(seq, convertInitializerList(loc, type, intermediate.makeAggregate(loc), copy));
+            seq->setOp(EOpComma);
+            seq->setType(type);
+            return seq;
+        } else
+            return convertInitializerList(loc, type, intermediate.makeAggregate(loc), node);
+    }
 
+    return addConstructor(loc, node, type);
+}
+
+// Add a constructor, either from the grammar, or other programmatic reasons.
+//
+// Return nullptr if it can't be done.
+//
+TIntermTyped* HlslParseContext::addConstructor(const TSourceLoc& loc, TIntermTyped* node, const TType& type)
+{
     TIntermAggregate* aggrNode = node->getAsAggregate();
     TOperator op = intermediate.mapTypeToConstructorOp(type);
 
@@ -6389,7 +6689,7 @@ TIntermTyped* HlslParseContext::addConstructor(const TSourceLoc& loc, TIntermNod
         else if (op == EOpConstructStruct)
             newNode = constructAggregate(node, *(*memberTypes).type, 1, node->getLoc());
         else
-            newNode = constructBuiltIn(type, op, node->getAsTyped(), node->getLoc(), false);
+            newNode = constructBuiltIn(type, op, node, node->getLoc(), false);
 
         if (newNode && (type.isArray() || op == EOpConstructStruct))
             newNode = intermediate.setAggregateOperator(newNode, EOpConstructStruct, type, loc);
@@ -7132,15 +7432,15 @@ void HlslParseContext::popNamespace()
 
 // Use the class/struct nesting string to create a global name for
 // a member of a class/struct.
-TString* HlslParseContext::getFullNamespaceName(const TString& localName) const
+void HlslParseContext::getFullNamespaceName(const TString*& name) const
 {
-    TString* name = NewPoolTString("");
-    if (currentTypePrefix.size() > 0)
-        name->append(currentTypePrefix.back());
-    name->append(scopeMangler);
-    name->append(localName);
+    if (currentTypePrefix.size() == 0)
+        return;
 
-    return name;
+    TString* fullName = NewPoolTString(currentTypePrefix.back().c_str());
+    fullName->append(scopeMangler);
+    fullName->append(*name);
+    name = fullName;
 }
 
 // Helper function to add the namespace scope mangling syntax to a string.
@@ -7150,7 +7450,7 @@ void HlslParseContext::addScopeMangler(TString& name)
 }
 
 // Potentially rename shader entry point function
-void HlslParseContext::renameShaderFunction(TString*& name) const
+void HlslParseContext::renameShaderFunction(const TString*& name) const
 {
     // Replace the entry point name given in the shader with the real entry point name,
     // if there is a substitution.
@@ -7215,6 +7515,8 @@ bool HlslParseContext::isInputBuiltIn(const TQualifier& qualifier) const
     case EbvTessLevelInner:
     case EbvTessLevelOuter:
         return language == EShLangTessEvaluation;
+    case EbvTessCoord:
+        return language == EShLangTessEvaluation;
     default:
         return false;
     }
@@ -7352,6 +7654,17 @@ void HlslParseContext::clearUniformInputOutput(TQualifier& qualifier)
     correctUniform(qualifier);
 }
 
+
+// Return a symbol for the linkage variable of the given TBuiltInVariable type
+TIntermSymbol* HlslParseContext::findLinkageSymbol(TBuiltInVariable biType) const
+{
+    const auto it = builtInLinkageSymbols.find(biType);
+    if (it == builtInLinkageSymbols.end())  // if it wasn't declared by the user, return nullptr
+        return nullptr;
+
+    return intermediate.addSymbol(*it->second->getAsVariable());
+}
+
 // Add patch constant function invocation
 void HlslParseContext::addPatchConstantInvocation()
 {
@@ -7392,7 +7705,10 @@ void HlslParseContext::addPatchConstantInvocation()
     // Look for builtin variables in a function's parameter list.
     const auto findBuiltIns = [&](const TFunction& function, std::set<tInterstageIoData>& builtIns) {
         for (int p=0; p<function.getParamCount(); ++p) {
-            const TStorageQualifier storage = function[p].type->getQualifier().storage;
+            TStorageQualifier storage = function[p].type->getQualifier().storage;
+
+            if (storage == EvqConstReadOnly) // treated identically to input
+                storage = EvqIn;
 
             if (function[p].declaredBuiltIn != EbvNone)
                 builtIns.insert(HlslParseContext::tInterstageIoData(function[p].declaredBuiltIn, storage));
@@ -7423,13 +7739,11 @@ void HlslParseContext::addPatchConstantInvocation()
         }
     };
 
-    // Return a symbol for the linkage variable of the given TBuiltInVariable type
-    const auto findLinkageSymbol = [this](TBuiltInVariable biType) -> TIntermSymbol* {
-        const auto it = builtInLinkageSymbols.find(biType);
-        if (it == builtInLinkageSymbols.end())  // if it wasn't declared by the user, return nullptr
-            return nullptr;
+    const auto isOutputPatch = [this](TFunction& patchConstantFunction, int param) {
+        const TType& type = *patchConstantFunction[param].type;
+        const TBuiltInVariable biType = patchConstantFunction[param].declaredBuiltIn;
 
-        return intermediate.addSymbol(*it->second->getAsVariable());
+        return type.isArray() && !type.isRuntimeSizedArray() && biType == EbvOutputPatch;
     };
     
     // We will perform these steps.  Each is in a scoped block for separation: they could
@@ -7441,21 +7755,25 @@ void HlslParseContext::addPatchConstantInvocation()
     // 2. Synthesizes a call to the patchconstfunction using builtin variables from either main,
     //    or the ones we created.  Matching is based on builtin type.  We may use synthesized
     //    variables from (1) above.
+    // 
+    // 2B: Synthesize per control point invocations of wrapped entry point if the PCF requires them.
     //
     // 3. Create a return sequence: copy the return value (if any) from the PCF to a
     //    (non-sanitized) output variable.  In case this may involve multiple copies, such as for
     //    an arrayed variable, a temporary copy of the PCF output is created to avoid multiple
     //    indirections into a complex R-value coming from the call to the PCF.
-    //
-    // 4. Add a barrier to the end of the entry point body
-    //
-    // 5. Call the PCF inside an if test for (invocation id == 0).
+    // 
+    // 4. Create a barrier.
+    // 
+    // 5/5B. Call the PCF inside an if test for (invocation id == 0).
 
     TFunction& patchConstantFunction = const_cast<TFunction&>(*candidateList[0]);
     const int pcfParamCount = patchConstantFunction.getParamCount();
     TIntermSymbol* invocationIdSym = findLinkageSymbol(EbvInvocationId);
     TIntermSequence& epBodySeq = entryPointFunctionBody->getAsAggregate()->getSequence();
 
+    int outPatchParam = -1; // -1 means there isn't one.
+
     // ================ Step 1A: Union Interfaces ================
     // Our patch constant function.
     {
@@ -7468,16 +7786,6 @@ void HlslParseContext::addPatchConstantInvocation()
         findBuiltIns(patchConstantFunction, pcfBuiltIns);
         findBuiltIns(*entryPointFunction,   epfBuiltIns);
 
-        // Patchconstantfunction can contain only builtin qualified variables.  (Technically, only HS inputs,
-        // but this test is less assertive than that).
-
-        for (auto bi = pcfBuiltIns.begin(); bi != pcfBuiltIns.end(); ++bi) {
-            if (bi->builtIn == EbvNone) {
-                error(loc, "patch constant function invalid parameter", "", "");
-                return;
-            }
-        }
-
         // Find the set of builtins in the PCF that are not present in the entry point.
         std::set<tInterstageIoData> notInEntryPoint;
 
@@ -7489,15 +7797,39 @@ void HlslParseContext::addPatchConstantInvocation()
 
         // Now we'll add those to the entry and to the linkage.
         for (int p=0; p<pcfParamCount; ++p) {
-            TType* paramType = patchConstantFunction[p].type->clone();
             const TBuiltInVariable biType   = patchConstantFunction[p].declaredBuiltIn;
-            const TStorageQualifier storage = patchConstantFunction[p].type->getQualifier().storage;
+            TStorageQualifier storage = patchConstantFunction[p].type->getQualifier().storage;
 
-            // Use the original declaration type for the linkage
-            paramType->getQualifier().builtIn = biType;
+            // Track whether there is an output patch param
+            if (isOutputPatch(patchConstantFunction, p)) {
+                if (outPatchParam >= 0) {
+                    // Presently we only support one output patch parameter per patch constant function.
+                    error(loc, "unimplemented: multiple output patches in patch constant function", "", "");
+                    return;
+                }
+                outPatchParam = p;
+            }
 
-            if (notInEntryPoint.count(tInterstageIoData(biType, storage)) == 1)
-                addToLinkage(*paramType, patchConstantFunction[p].name, nullptr);
+            if (biType != EbvNone) {
+                TType* paramType = patchConstantFunction[p].type->clone();
+
+                if (storage == EvqConstReadOnly) // treated identically to input
+                    storage = EvqIn;
+
+                // Presently, the only non-builtin we support is InputPatch, which is treated as
+                // a pseudo-builtin.
+                if (biType == EbvInputPatch) {
+                    builtInLinkageSymbols[biType] = inputPatch;
+                } else if (biType == EbvOutputPatch) {
+                    // Nothing...
+                } else {
+                    // Use the original declaration type for the linkage
+                    paramType->getQualifier().builtIn = biType;
+
+                    if (notInEntryPoint.count(tInterstageIoData(biType, storage)) == 1)
+                        addToLinkage(*paramType, patchConstantFunction[p].name, nullptr);
+                }
+            }
         }
 
         // If we didn't find it because the shader made one, add our own.
@@ -7512,36 +7844,44 @@ void HlslParseContext::addPatchConstantInvocation()
     }
 
     TIntermTyped* pcfArguments = nullptr;
+    TVariable* perCtrlPtVar = nullptr;
 
     // ================ Step 1B: Argument synthesis ================
     // Create pcfArguments for synthesis of patchconstantfunction invocation
     // TODO: handle struct or array inputs
     {
         for (int p=0; p<pcfParamCount; ++p) {
-            if (patchConstantFunction[p].type->isArray() ||
-                patchConstantFunction[p].type->isStruct()) {
-                error(loc, "unimplemented array or variable in patch constant function signature", "", "");
-                return;
-            }
-        
-            // find which builtin it is
-            const TBuiltInVariable biType = patchConstantFunction[p].declaredBuiltIn;
+            TIntermSymbol* inputArg = nullptr;
 
-            TIntermSymbol* builtIn = findLinkageSymbol(biType);
-        
-            if (builtIn == nullptr) {
-                error(loc, "unable to find patch constant function builtin variable", "", "");
-                return;
+            if (p == outPatchParam) {
+                if (perCtrlPtVar == nullptr) {
+                    perCtrlPtVar = makeInternalVariable(*patchConstantFunction[outPatchParam].name,
+                                                        *patchConstantFunction[outPatchParam].type);
+
+                    perCtrlPtVar->getWritableType().getQualifier().makeTemporary();
+                }
+                inputArg = intermediate.addSymbol(*perCtrlPtVar, loc);
+            } else {
+                // find which builtin it is
+                const TBuiltInVariable biType = patchConstantFunction[p].declaredBuiltIn;
+                
+                inputArg = findLinkageSymbol(biType);
+
+                if (inputArg == nullptr) {
+                    error(loc, "unable to find patch constant function builtin variable", "", "");
+                    return;
+                }
             }
 
             if (pcfParamCount == 1)
-                pcfArguments = builtIn;
+                pcfArguments = inputArg;
             else
-                pcfArguments = intermediate.growAggregate(pcfArguments, builtIn);
+                pcfArguments = intermediate.growAggregate(pcfArguments, inputArg);
         }
     }
 
     // ================ Step 2: Synthesize call to PCF ================
+    TIntermAggregate* pcfCallSequence = nullptr;
     TIntermTyped* pcfCall = nullptr;
 
     {
@@ -7553,7 +7893,8 @@ void HlslParseContext::addPatchConstantInvocation()
         pcfCall = intermediate.setAggregateOperator(pcfArguments, EOpFunctionCall, patchConstantFunction.getType(), loc);
         pcfCall->getAsAggregate()->setUserDefined();
         pcfCall->getAsAggregate()->setName(patchConstantFunction.getMangledName());
-        intermediate.addToCallGraph(infoSink, entryPointFunction->getMangledName(), patchConstantFunction.getMangledName());
+        intermediate.addToCallGraph(infoSink, intermediate.getEntryPointMangledName().c_str(),
+                                    patchConstantFunction.getMangledName());
 
         if (pcfCall->getAsAggregate()) {
             TQualifierList& qualifierList = pcfCall->getAsAggregate()->getQualifierList();
@@ -7565,6 +7906,71 @@ void HlslParseContext::addPatchConstantInvocation()
         }
     }
 
+    // ================ Step 2B: Per Control Point synthesis ================
+    // If there is per control point data, we must either emulate that with multiple
+    // invocations of the entry point to build up an array, or (TODO:) use a yet
+    // unavailable extension to look across the SIMD lanes.  This is the former
+    // as a placeholder for the latter.
+    if (outPatchParam >= 0) {
+        // We must introduce a local temp variable of the type wanted by the PCF input.
+        const int arraySize = patchConstantFunction[outPatchParam].type->getOuterArraySize();
+
+        if (entryPointFunction->getType().getBasicType() == EbtVoid) {
+            error(loc, "entry point must return a value for use with patch constant function", "", "");
+            return;
+        }
+
+        // Create calls to wrapped main to fill in the array.  We will substitute fixed values
+        // of invocation ID when calling the wrapped main.
+
+        // This is the type of each member of the per control point array.
+        const TType derefType(perCtrlPtVar->getType(), 0);
+
+        for (int cpt = 0; cpt < arraySize; ++cpt) {
+            // TODO: improve.  substr(1) here is to avoid the '@' that was grafted on but isn't in the symtab
+            // for this function.
+            const TString origName = entryPointFunction->getName().substr(1);
+            TFunction callee(&origName, TType(EbtVoid));
+            TIntermTyped* callingArgs = nullptr;
+
+            for (int i = 0; i < entryPointFunction->getParamCount(); i++) {
+                TParameter& param = (*entryPointFunction)[i];
+                TType& paramType = *param.type;
+
+                if (paramType.getQualifier().isParamOutput()) {
+                    error(loc, "unimplemented: entry point outputs in patch constant function invocation", "", "");
+                    return;
+                }
+
+                if (paramType.getQualifier().isParamInput())  {
+                    TIntermTyped* arg = nullptr;
+                    if ((*entryPointFunction)[i].declaredBuiltIn == EbvInvocationId) {
+                        // substitute invocation ID with the array element ID
+                        arg = intermediate.addConstantUnion(cpt, loc);
+                    } else {
+                        TVariable* argVar = makeInternalVariable(*param.name, *param.type);
+                        argVar->getWritableType().getQualifier().makeTemporary();
+                        arg = intermediate.addSymbol(*argVar);
+                    }
+
+                    handleFunctionArgument(&callee, callingArgs, arg);
+                }
+            }
+
+            // Call and assign to per ctrl point variable
+            currentCaller = intermediate.getEntryPointMangledName().c_str();
+            TIntermTyped* callReturn = handleFunctionCall(loc, &callee, callingArgs);
+            TIntermTyped* index = intermediate.addConstantUnion(cpt, loc);
+            TIntermSymbol* perCtrlPtSym = intermediate.addSymbol(*perCtrlPtVar, loc);
+            TIntermTyped* element = intermediate.addIndex(EOpIndexDirect, perCtrlPtSym, index, loc);
+            element->setType(derefType);
+            element->setLoc(loc);
+
+            pcfCallSequence = intermediate.growAggregate(pcfCallSequence, 
+                                                         handleAssign(loc, EOpAssign, element, callReturn));
+        }
+    }
+
     // ================ Step 3: Create return Sequence ================
     // Return sequence: copy PCF result to a temporary, then to shader output variable.
     if (pcfCall->getBasicType() != EbtVoid) {
@@ -7581,30 +7987,31 @@ void HlslParseContext::addPatchConstantInvocation()
         if (patchConstantFunction.getDeclaredBuiltInType() != EbvNone)
             outType.getQualifier().builtIn = patchConstantFunction.getDeclaredBuiltInType();
 
+        outType.getQualifier().patch = true; // make it a per-patch variable
+
         TVariable* pcfOutput = makeInternalVariable("@patchConstantOutput", outType);
         pcfOutput->getWritableType().getQualifier().storage = EvqVaryingOut;
 
         if (pcfOutput->getType().containsBuiltInInterstageIO(language))
             split(*pcfOutput);
 
+        assignLocations(*pcfOutput);
+
         TIntermSymbol* pcfOutputSym = intermediate.addSymbol(*pcfOutput, loc);
 
         // The call to the PCF is a complex R-value: we want to store it in a temp to avoid
         // repeated calls to the PCF:
         TVariable* pcfCallResult = makeInternalVariable("@patchConstantResult", *retType);
         pcfCallResult->getWritableType().getQualifier().makeTemporary();
-        TIntermSymbol* pcfResultVar = intermediate.addSymbol(*pcfCallResult, loc);
-        // sanitizeType(&pcfCall->getWritableType());
-        TIntermNode* pcfResultAssign = intermediate.addAssign(EOpAssign, pcfResultVar, pcfCall, loc);
 
+        TIntermSymbol* pcfResultVar = intermediate.addSymbol(*pcfCallResult, loc);
+        TIntermNode* pcfResultAssign = handleAssign(loc, EOpAssign, pcfResultVar, pcfCall);
         TIntermNode* pcfResultToOut = handleAssign(loc, EOpAssign, pcfOutputSym, intermediate.addSymbol(*pcfCallResult, loc));
 
-        TIntermTyped* pcfAggregate = nullptr;
-        pcfAggregate = intermediate.growAggregate(pcfAggregate, pcfResultAssign);
-        pcfAggregate = intermediate.growAggregate(pcfAggregate, pcfResultToOut);
-        pcfAggregate = intermediate.setAggregateOperator(pcfAggregate, EOpSequence, *retType, loc);
-
-        pcfCall = pcfAggregate;
+        pcfCallSequence = intermediate.growAggregate(pcfCallSequence, pcfResultAssign);
+        pcfCallSequence = intermediate.growAggregate(pcfCallSequence, pcfResultToOut);
+    } else {
+        pcfCallSequence = intermediate.growAggregate(pcfCallSequence, pcfCall);
     }
 
     // ================ Step 4: Barrier ================    
@@ -7613,12 +8020,14 @@ void HlslParseContext::addPatchConstantInvocation()
     barrier->setType(TType(EbtVoid));
     epBodySeq.insert(epBodySeq.end(), barrier);
 
-    // ================ Step 5: Test on invocation ID ================    
+    // ================ Step 5: Test on invocation ID ================
     TIntermTyped* zero = intermediate.addConstantUnion(0, loc, true);
     TIntermTyped* cmp =  intermediate.addBinaryNode(EOpEqual, invocationIdSym, zero, loc, TType(EbtBool));
 
-    // Create if statement
-    TIntermTyped* invocationIdTest = new TIntermSelection(cmp, pcfCall, nullptr);
+
+    // ================ Step 5B: Create if statement on Invocation ID == 0 ================
+    intermediate.setAggregateOperator(pcfCallSequence, EOpSequence, TType(EbtVoid), loc);
+    TIntermTyped* invocationIdTest = new TIntermSelection(cmp, pcfCallSequence, nullptr);
     invocationIdTest->setLoc(loc);
 
     // add our test sequence before the return.
diff --git a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseHelper.h b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseHelper.h
index 2d19681..14c9809 100755
--- a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseHelper.h
+++ b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseHelper.h
@@ -80,6 +80,7 @@ public:
     void handleFunctionDeclarator(const TSourceLoc&, TFunction& function, bool prototype);
     TIntermAggregate* handleFunctionDefinition(const TSourceLoc&, TFunction&, const TAttributeMap&, TIntermNode*& entryPointTree);
     TIntermNode* transformEntryPoint(const TSourceLoc&, TFunction&, const TAttributeMap&);
+    void handleEntryPointAttributes(const TSourceLoc&, const TAttributeMap&);
     void handleFunctionBody(const TSourceLoc&, TFunction&, TIntermNode* functionBody, TIntermNode*& node);
     void remapEntryPointIO(TFunction& function, TVariable*& returnValue, TVector<TVariable*>& inputs, TVector<TVariable*>& outputs);
     void remapNonEntryPointIO(TFunction& function);
@@ -95,13 +96,13 @@ public:
     void addInputArgumentConversions(const TFunction&, TIntermTyped*&);
     TIntermTyped* addOutputArgumentConversions(const TFunction&, TIntermOperator&);
     void builtInOpCheck(const TSourceLoc&, const TFunction&, TIntermOperator&);
-    TFunction* handleConstructorCall(const TSourceLoc&, const TType&);
+    TFunction* makeConstructorCall(const TSourceLoc&, const TType&);
     void handleSemantic(TSourceLoc, TQualifier&, TBuiltInVariable, const TString& upperCase);
     void handlePackOffset(const TSourceLoc&, TQualifier&, const glslang::TString& location,
                           const glslang::TString* component);
     void handleRegister(const TSourceLoc&, TQualifier&, const glslang::TString* profile, const glslang::TString& desc,
                         int subComponent, const glslang::TString*);
-
+    TIntermTyped* convertConditionalExpression(const TSourceLoc&, TIntermTyped*);
     TIntermAggregate* handleSamplerTextureCombine(const TSourceLoc& loc, TIntermTyped* argTex, TIntermTyped* argSampler);
 
     bool parseMatrixSwizzleSelector(const TSourceLoc&, const TString&, int cols, int rows, TSwizzleSelectors<TMatrixSelector>&);
@@ -120,7 +121,6 @@ public:
     void structArrayCheck(const TSourceLoc&, const TType& structure);
     void arrayDimMerge(TType& type, const TArraySizes* sizes);
     bool voidErrorCheck(const TSourceLoc&, const TString&, TBasicType);
-    void boolCheck(const TSourceLoc&, const TIntermTyped*);
     void globalQualifierFix(const TSourceLoc&, TQualifier&);
     bool structQualifierErrorCheck(const TSourceLoc&, const TPublicType& pType);
     void mergeQualifiers(TQualifier& dst, const TQualifier& src);
@@ -136,12 +136,13 @@ public:
     void checkNoShaderLayouts(const TSourceLoc&, const TShaderQualifiers&);
 
     const TFunction* findFunction(const TSourceLoc& loc, TFunction& call, bool& builtIn, TIntermTyped*& args);
-    void declareTypedef(const TSourceLoc&, TString& identifier, const TType&);
+    void declareTypedef(const TSourceLoc&, const TString& identifier, const TType&);
     void declareStruct(const TSourceLoc&, TString& structName, TType&);
     TSymbol* lookupUserType(const TString&, TType&);
-    TIntermNode* declareVariable(const TSourceLoc&, TString& identifier, TType&, TIntermTyped* initializer = 0);
-    void lengthenList(const TSourceLoc&, TIntermSequence& list, int size);
-    TIntermTyped* addConstructor(const TSourceLoc&, TIntermNode*, const TType&);
+    TIntermNode* declareVariable(const TSourceLoc&, const TString& identifier, TType&, TIntermTyped* initializer = 0);
+    void lengthenList(const TSourceLoc&, TIntermSequence& list, int size, TIntermTyped* scalarInit);
+    TIntermTyped* handleConstructor(const TSourceLoc&, TIntermTyped*, const TType&);
+    TIntermTyped* addConstructor(const TSourceLoc&, TIntermTyped*, const TType&);
     TIntermTyped* constructAggregate(TIntermNode*, const TType&, int, const TSourceLoc&);
     TIntermTyped* constructBuiltIn(const TType&, TOperator, TIntermTyped*, const TSourceLoc&, bool subset);
     void declareBlock(const TSourceLoc&, TType&, const TString* instanceName = 0, TArraySizes* arraySizes = 0);
@@ -173,13 +174,13 @@ public:
 
     void pushNamespace(const TString& name);
     void popNamespace();
-    TString* getFullNamespaceName(const TString& localName) const;
+    void getFullNamespaceName(const TString*&) const;
     void addScopeMangler(TString&);
 
     void pushSwitchSequence(TIntermSequence* sequence) { switchSequenceStack.push_back(sequence); }
     void popSwitchSequence() { switchSequenceStack.pop_back(); }
 
-    virtual void growGlobalUniformBlock(TSourceLoc&, TType&, TString& memberName, TTypeList* typeList = nullptr) override;
+    virtual void growGlobalUniformBlock(const TSourceLoc&, TType&, const TString& memberName, TTypeList* typeList = nullptr) override;
 
     // Apply L-value conversions.  E.g, turning a write to a RWTexture into an ImageStore.
     TIntermTyped* handleLvalue(const TSourceLoc&, const char* op, TIntermTyped* node);
@@ -191,7 +192,7 @@ public:
     bool handleInputGeometry(const TSourceLoc&, const TLayoutGeometry& geometry);
 
     // Potentially rename shader entry point function
-    void renameShaderFunction(TString*& name) const;
+    void renameShaderFunction(const TString*& name) const;
 
     // Reset data for incrementally built referencing of flattened composite structures
     void initFlattening() { flattenLevel.push_back(0); flattenOffset.push_back(0); }
@@ -210,17 +211,18 @@ protected:
         int                 nextBinding; // next binding to use.
     };
 
-    void fixConstInit(const TSourceLoc&, TString& identifier, TType& type, TIntermTyped*& initializer);
+    void fixConstInit(const TSourceLoc&, const TString& identifier, TType& type, TIntermTyped*& initializer);
     void inheritGlobalDefaults(TQualifier& dst) const;
     TVariable* makeInternalVariable(const char* name, const TType&) const;
     TVariable* makeInternalVariable(const TString& name, const TType& type) const {
         return makeInternalVariable(name.c_str(), type);
     }
-    TVariable* declareNonArray(const TSourceLoc&, TString& identifier, TType&, bool track);
-    void declareArray(const TSourceLoc&, TString& identifier, const TType&, TSymbol*&, bool track);
+    TIntermSymbol* makeInternalVariableNode(const TSourceLoc&, const char* name, const TType&) const;
+    TVariable* declareNonArray(const TSourceLoc&, const TString& identifier, const TType&, bool track);
+    void declareArray(const TSourceLoc&, const TString& identifier, const TType&, TSymbol*&, bool track);
     TIntermNode* executeInitializer(const TSourceLoc&, TIntermTyped* initializer, TVariable* variable);
-    TIntermTyped* convertInitializerList(const TSourceLoc&, const TType&, TIntermTyped* initializer);
-    bool isZeroConstructor(const TIntermNode*);
+    TIntermTyped* convertInitializerList(const TSourceLoc&, const TType&, TIntermTyped* initializer, TIntermTyped* scalarInit);
+    bool isScalarConstructor(const TIntermNode*);
     TOperator mapAtomicOp(const TSourceLoc& loc, TOperator op, bool isImage);
 
     // Return true if this node requires L-value conversion (e.g, to an imageStore).
@@ -248,6 +250,8 @@ protected:
     void addInterstageIoToLinkage();
     void addPatchConstantInvocation();
 
+    void fixBuiltInIoType(TType&);
+
     void flatten(const TSourceLoc& loc, const TVariable& variable);
     int flatten(const TSourceLoc& loc, const TVariable& variable, const TType&, TFlattenData&, TString name);
     int flattenStruct(const TSourceLoc& loc, const TVariable& variable, const TType&, TFlattenData&, TString name);
@@ -281,6 +285,9 @@ protected:
 
     void finish() override; // post-processing
 
+    // Linkage symbol helpers
+    TIntermSymbol* findLinkageSymbol(TBuiltInVariable biType) const;
+
     // Current state of parsing
     struct TPragma contextPragma;
     int loopNestingLevel;        // 0 if outside all loops
@@ -381,6 +388,7 @@ protected:
     };
 
     TMap<tInterstageIoData, TVariable*> interstageBuiltInIo; // individual builtin interstage IO vars, indexed by builtin type.
+    TVariable* inputPatch;
 
     // We have to move array references to structs containing builtin interstage IO to the split variables.
     // This is only handled for one level.  This stores the index, because we'll need it in the future, since
@@ -400,6 +408,8 @@ protected:
 
     TVector<TString> currentTypePrefix;      // current scoping prefix for nested structures
     TVector<TVariable*> implicitThisStack;   // currently active 'this' variables for nested structures
+
+    TVariable* gsStreamOutput;               // geometry shader stream outputs, for emit (Append method)
 };
 
 // This is the prefix we use for builtin methods to avoid namespace collisions with
diff --git a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseables.cpp b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseables.cpp
index 1a47a0b..c77b541 100755
--- a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseables.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslParseables.cpp
@@ -359,7 +359,7 @@ inline bool IsValid(const char* cname, char retOrder, char retType, char argOrde
     const std::string name(cname);
 
     // these do not have vec1 versions
-    if (dim0 == 1 && (name == "length" || name == "normalize" || name == "reflect" || name == "refract"))
+    if (dim0 == 1 && (name == "normalize" || name == "reflect" || name == "refract"))
         return false;
 
     if (!IsTextureType(argOrder) && (isVec && dim0 == 1)) // avoid vec1
@@ -625,7 +625,7 @@ void TBuiltInParseablesHlsl::initialize(int /*version*/, EProfile /*profile*/, c
         { "isinf",                            nullptr, "B" ,      "SVM",            "F",             EShLangAll,    false },
         { "isnan",                            nullptr, "B" ,      "SVM",            "F",             EShLangAll,    false },
         { "ldexp",                            nullptr, nullptr,   "SVM,",           "F,",            EShLangAll,    false },
-        { "length",                           "S",     "F",       "V",              "F",             EShLangAll,    false },
+        { "length",                           "S",     "F",       "SV",             "F",             EShLangAll,    false },
         { "lerp",                             nullptr, nullptr,   "VM,,",           "F,,",           EShLangAll,    false },
         { "lerp",                             nullptr, nullptr,   "SVM,,S",         "F,,",           EShLangAll,    false },
         { "lit",                              "V4",    "F",       "S,,",            "F,,",           EShLangAll,    false },
diff --git a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslScanContext.cpp b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslScanContext.cpp
index 7b0365f..55304a0 100755
--- a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslScanContext.cpp
+++ b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslScanContext.cpp
@@ -334,6 +334,7 @@ void HlslScanContext::fillInKeywordMap()
     (*KeywordMap)["tbuffer"] =                 EHTokTBuffer;
     (*KeywordMap)["typedef"] =                 EHTokTypedef;
     (*KeywordMap)["this"] =                    EHTokThis;
+    (*KeywordMap)["namespace"] =               EHTokNamespace;
 
     (*KeywordMap)["true"] =                    EHTokBoolConstant;
     (*KeywordMap)["false"] =                   EHTokBoolConstant;
@@ -828,6 +829,7 @@ EHlslTokenClass HlslScanContext::tokenizeIdentifier()
     case EHTokCBuffer:
     case EHTokTBuffer:
     case EHTokThis:
+    case EHTokNamespace:
         return keyword;
 
     case EHTokBoolConstant:
@@ -876,28 +878,4 @@ EHlslTokenClass HlslScanContext::reservedWord()
     return EHTokNone;
 }
 
-EHlslTokenClass HlslScanContext::identifierOrReserved(bool reserved)
-{
-    if (reserved) {
-        reservedWord();
-
-        return EHTokNone;
-    }
-
-    if (parseContext.forwardCompatible)
-        parseContext.warn(loc, "using future reserved keyword", tokenText, "");
-
-    return identifierOrType();
-}
-
-// For a keyword that was never reserved, until it suddenly
-// showed up.
-EHlslTokenClass HlslScanContext::nonreservedKeyword(int version)
-{
-    if (parseContext.version < version)
-        return identifierOrType();
-
-    return keyword;
-}
-
 } // end namespace glslang
diff --git a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslTokens.h b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslTokens.h
index 9f91906..cba0b96 100755
--- a/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslTokens.h
+++ b/3rdparty/bgfx/3rdparty/glslang/hlsl/hlslTokens.h
@@ -274,6 +274,7 @@ enum EHlslTokenClass {
     EHTokTBuffer,
     EHTokTypedef,
     EHTokThis,
+    EHTokNamespace,
 
     // constant
     EHTokFloatConstant,
diff --git a/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.cpp b/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.cpp
index db8ce64..04ba7e3 100644
--- a/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.cpp
+++ b/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.cpp
@@ -4126,12 +4126,15 @@ bool ImGui::Begin(const char* name, bool* p_open, const ImVec2& size_on_first_us
         }
         else if (flags & ImGuiWindowFlags_ChildMenu)
         {
+            // Child menus typically request _any_ position within the parent menu item, and then our FindBestPopupWindowPos() function will move the new menu outside the parent bounds.
+            // This is how we end up with child menus appearing (most-commonly) on the right of the parent menu.
             IM_ASSERT(window_pos_set_by_api);
+            float horizontal_overlap = style.ItemSpacing.x; // We want some overlap to convey the relative depth of each popup (the amount of overlap is currently hard-coded to style.ItemSpacing.x; we may need to introduce another style value).
             ImRect rect_to_avoid;
             if (parent_window->DC.MenuBarAppending)
                 rect_to_avoid = ImRect(-FLT_MAX, parent_window->Pos.y + parent_window->TitleBarHeight(), FLT_MAX, parent_window->Pos.y + parent_window->TitleBarHeight() + parent_window->MenuBarHeight());
             else
-                rect_to_avoid = ImRect(parent_window->Pos.x + style.ItemSpacing.x, -FLT_MAX, parent_window->Pos.x + parent_window->Size.x - style.ItemSpacing.x - parent_window->ScrollbarSizes.x, FLT_MAX); // We want some overlap to convey the relative depth of each popup (here hard-coded to 4)
+                rect_to_avoid = ImRect(parent_window->Pos.x + horizontal_overlap, -FLT_MAX, parent_window->Pos.x + parent_window->Size.x - horizontal_overlap - parent_window->ScrollbarSizes.x, FLT_MAX);
             window->PosFloat = FindBestPopupWindowPos(window->PosFloat, window->Size, &window->AutoPosLastDirection, rect_to_avoid);
         }
         else if ((flags & ImGuiWindowFlags_Popup) != 0 && !window_pos_set_by_api && window_appearing_after_being_hidden)
@@ -8867,6 +8870,7 @@ bool ImGui::BeginMenu(const char* label, bool enabled)
     if (menuset_is_open)
         g.FocusedWindow = window;
 
+    // The reference position stored in popup_pos will be used by Begin() to find a suitable position for the child menu (using FindBestPopupWindowPos).
     ImVec2 popup_pos, pos = window->DC.CursorPos;
     if (window->DC.LayoutType == ImGuiLayoutType_Horizontal)
     {
@@ -8979,7 +8983,7 @@ bool ImGui::ColorButton(const ImVec4& col, bool small_height, bool outline_borde
     RenderFrame(bb.Min, bb.Max, GetColorU32(col), outline_border, style.FrameRounding);
 
     if (hovered)
-        SetTooltip("Color:\n(%.2f,%.2f,%.2f,%.2f)\n#%02X%02X%02X%02X", col.x, col.y, col.z, col.w, IM_F32_TO_INT8_SAT(col.x), IM_F32_TO_INT8_SAT(col.y), IM_F32_TO_INT8_SAT(col.z), IM_F32_TO_INT8_SAT(col.z));
+        SetTooltip("Color:\n(%.2f,%.2f,%.2f,%.2f)\n#%02X%02X%02X%02X", col.x, col.y, col.z, col.w, IM_F32_TO_INT8_SAT(col.x), IM_F32_TO_INT8_SAT(col.y), IM_F32_TO_INT8_SAT(col.z), IM_F32_TO_INT8_SAT(col.w));
 
     return pressed;
 }
diff --git a/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.h b/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.h
index f698f0b..aa477dc 100644
--- a/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.h
+++ b/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui.h
@@ -118,7 +118,7 @@ namespace ImGui
     IMGUI_API void          ShowUserGuide();                            // help block
     IMGUI_API void          ShowStyleEditor(ImGuiStyle* ref = NULL);    // style editor block. you can pass in a reference ImGuiStyle structure to compare to, revert to and save to (else it uses the default style)
     IMGUI_API void          ShowTestWindow(bool* p_open = NULL);        // test window demonstrating ImGui features
-    IMGUI_API void          ShowMetricsWindow(bool* p_open = NULL);     // metrics window for debugging ImGui
+    IMGUI_API void          ShowMetricsWindow(bool* p_open = NULL);     // metrics window for debugging ImGui (browse draw commands, individual vertices, window list, etc.)
 
     // Window
     IMGUI_API bool          Begin(const char* name, bool* p_open = NULL, ImGuiWindowFlags flags = 0);                                                   // push window to the stack and start appending to it. see .cpp for details. return false when window is collapsed, so you can early out in your code. 'bool* p_open' creates a widget on the upper-right to close the window (which sets your bool to false).
@@ -521,7 +521,7 @@ enum ImGuiInputTextFlags_
     ImGuiInputTextFlags_CallbackAlways      = 1 << 8,   // Call user function every time. User code may query cursor position, modify text buffer.
     ImGuiInputTextFlags_CallbackCharFilter  = 1 << 9,   // Call user function to filter character. Modify data->EventChar to replace/filter input, or return 1 to discard character.
     ImGuiInputTextFlags_AllowTabInput       = 1 << 10,  // Pressing TAB input a '\t' character into the text field
-    ImGuiInputTextFlags_CtrlEnterForNewLine = 1 << 11,  // In multi-line mode, allow exiting edition by pressing Enter. Ctrl+Enter to add new line (by default adds new lines with Enter).
+    ImGuiInputTextFlags_CtrlEnterForNewLine = 1 << 11,  // In multi-line mode, unfocus with Enter, add new line with Ctrl+Enter (default is opposite: unfocus with Ctrl+Enter, add line with Enter).
     ImGuiInputTextFlags_NoHorizontalScroll  = 1 << 12,  // Disable following the cursor horizontally
     ImGuiInputTextFlags_AlwaysInsertMode    = 1 << 13,  // Insert mode
     ImGuiInputTextFlags_ReadOnly            = 1 << 14,  // Read-only mode
@@ -1023,8 +1023,8 @@ struct ImGuiTextEditCallbackData
     int                 SelectionEnd;   //                                      // Read-write
 
     // NB: Helper functions for text manipulation. Calling those function loses selection.
-    void    DeleteChars(int pos, int bytes_count);
-    void    InsertChars(int pos, const char* text, const char* text_end = NULL);
+    IMGUI_API void    DeleteChars(int pos, int bytes_count);
+    IMGUI_API void    InsertChars(int pos, const char* text, const char* text_end = NULL);
     bool    HasSelection() const { return SelectionStart != SelectionEnd; }
 };
 
@@ -1371,6 +1371,7 @@ struct ImFont
     ImFontConfig*               ConfigData;         //              // Pointer within ContainerAtlas->ConfigData
     ImFontAtlas*                ContainerAtlas;     //              // What we has been loaded into
     float                       Ascent, Descent;    //              // Ascent: distance from top to bottom of e.g. 'A' [0..FontSize]
+    int                         MetricsTotalSurface;//              // Total surface in pixels to get an idea of the font rasterization/texture cost (not exact, we approximate the cost of padding between glyphs)
 
     // Methods
     IMGUI_API ImFont();
diff --git a/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui_demo.cpp b/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui_demo.cpp
index c08de0c..d58c6ae 100644
--- a/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui_demo.cpp
+++ b/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui_demo.cpp
@@ -1750,10 +1750,11 @@ void ImGui::ShowStyleEditor(ImGuiStyle* ref)
                 ImGui::SameLine(); ShowHelpMarker("Note than the default embedded font is NOT meant to be scaled.\n\nFont are currently rendered into bitmaps at a given size at the time of building the atlas. You may oversample them to get some flexibility with scaling. You can also render at multiple sizes and select which one to use at runtime.\n\n(Glimmer of hope: the atlas system should hopefully be rewritten in the future to make scaling more natural and automatic.)");
                 ImGui::Text("Ascent: %f, Descent: %f, Height: %f", font->Ascent, font->Descent, font->Ascent - font->Descent);
                 ImGui::Text("Fallback character: '%c' (%d)", font->FallbackChar, font->FallbackChar);
+                ImGui::Text("Texture surface: %d pixels (approx)", font->MetricsTotalSurface);
                 for (int config_i = 0; config_i < font->ConfigDataCount; config_i++)
                 {
                     ImFontConfig* cfg = &font->ConfigData[config_i];
-                    ImGui::BulletText("Input %d: \'%s\'\nOversample: (%d,%d), PixelSnapH: %d", config_i, cfg->Name, cfg->OversampleH, cfg->OversampleV, cfg->PixelSnapH);
+                    ImGui::BulletText("Input %d: \'%s\', Oversample: (%d,%d), PixelSnapH: %d", config_i, cfg->Name, cfg->OversampleH, cfg->OversampleV, cfg->PixelSnapH);
                 }
                 if (ImGui::TreeNode("Glyphs", "Glyphs (%d)", font->Glyphs.Size))
                 {
diff --git a/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui_draw.cpp b/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui_draw.cpp
index 8b3b2ec..8d1de8c 100644
--- a/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui_draw.cpp
+++ b/3rdparty/bgfx/3rdparty/ocornut-imgui/imgui_draw.cpp
@@ -1388,7 +1388,7 @@ bool    ImFontAtlas::Build()
     {
         ImFontConfig& cfg = ConfigData[input_i];
         ImFontTempBuildData& tmp = tmp_array[input_i];
-        ImFont* dst_font = cfg.DstFont;
+        ImFont* dst_font = cfg.DstFont; // We can have multiple input fonts writing into the same destination font (when using MergeMode=true)
 
         float font_scale = stbtt_ScaleForPixelHeight(&tmp.FontInfo, cfg.SizePixels);
         int unscaled_ascent, unscaled_descent, unscaled_line_gap;
@@ -1405,6 +1405,7 @@ bool    ImFontAtlas::Build()
             dst_font->Ascent = ascent;
             dst_font->Descent = descent;
             dst_font->Glyphs.resize(0);
+            dst_font->MetricsTotalSurface = 0;
         }
         dst_font->ConfigDataCount++;
         float off_y = (cfg.MergeMode && cfg.MergeGlyphCenterV) ? (ascent - dst_font->Ascent) * 0.5f : 0.0f;
@@ -1437,6 +1438,7 @@ bool    ImFontAtlas::Build()
                 glyph.XAdvance = (pc.xadvance + cfg.GlyphExtraSpacing.x);  // Bake spacing into XAdvance
                 if (cfg.PixelSnapH)
                     glyph.XAdvance = (float)(int)(glyph.XAdvance + 0.5f);
+                dst_font->MetricsTotalSurface += (int)(glyph.X1 - glyph.X0 + 1.99f) * (int)(glyph.Y1 - glyph.Y0 + 1.99f); // +1 to account for average padding, +0.99 to round up before the (int) truncation
             }
         }
         cfg.DstFont->BuildLookupTable();
@@ -1696,15 +1698,16 @@ void    ImFont::Clear()
 {
     FontSize = 0.0f;
     DisplayOffset = ImVec2(0.0f, 1.0f);
-    ConfigData = NULL;
-    ConfigDataCount = 0;
-    Ascent = Descent = 0.0f;
-    ContainerAtlas = NULL;
     Glyphs.clear();
-    FallbackGlyph = NULL;
-    FallbackXAdvance = 0.0f;
     IndexXAdvance.clear();
     IndexLookup.clear();
+    FallbackGlyph = NULL;
+    FallbackXAdvance = 0.0f;
+    ConfigDataCount = 0;
+    ConfigData = NULL;
+    ContainerAtlas = NULL;
+    Ascent = Descent = 0.0f;
+    MetricsTotalSurface = 0;
 }
 
 void ImFont::BuildLookupTable()
diff --git a/3rdparty/bgfx/3rdparty/ocornut-imgui/widgets/gizmo.h b/3rdparty/bgfx/3rdparty/ocornut-imgui/widgets/gizmo.h
index caac986..b929150 100644
--- a/3rdparty/bgfx/3rdparty/ocornut-imgui/widgets/gizmo.h
+++ b/3rdparty/bgfx/3rdparty/ocornut-imgui/widgets/gizmo.h
@@ -96,7 +96,8 @@ void EditTransform(const Camera& camera, matrix_t& matrix)
 		ImGui::InputFloat("Scale Snap", &snap.x);
 		break;
 	}
-
+	ImGuiIO& io = ImGui::GetIO();
+	ImGuizmo::SetRect(0, 0, io.DisplaySize.x, io.DisplaySize.y);
 	ImGuizmo::Manipulate(camera.mView.m16, camera.mProjection.m16, mCurrentGizmoOperation, mCurrentGizmoMode, matrix.m16, NULL, useSnap ? &snap.x : NULL);
 }
 #endif
@@ -132,6 +133,8 @@ namespace ImGuizmo
 	void DecomposeMatrixToComponents(const float *matrix, float *translation, float *rotation, float *scale);
 	void RecomposeMatrixFromComponents(const float *translation, const float *rotation, const float *scale, float *matrix);
 
+	void SetRect(float x, float y, float width, float height);
+
 	// Render a cube with face color corresponding to face normal. Usefull for debug/tests
 	void DrawCube(const float *view, const float *projection, float *matrix);
 
diff --git a/3rdparty/bgfx/3rdparty/ocornut-imgui/widgets/gizmo.inl b/3rdparty/bgfx/3rdparty/ocornut-imgui/widgets/gizmo.inl
index d9342bb..53dbc15 100644
--- a/3rdparty/bgfx/3rdparty/ocornut-imgui/widgets/gizmo.inl
+++ b/3rdparty/bgfx/3rdparty/ocornut-imgui/widgets/gizmo.inl
@@ -20,12 +20,19 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
+#define IMGUI_DEFINE_MATH_OPERATORS
+
+// includes patches for multiview from
+// https://github.com/CedricGuillemet/ImGuizmo/issues/15
+
 namespace ImGuizmo
 {
    static const float ZPI = 3.14159265358979323846f;
    static const float RAD2DEG = (180.f / ZPI);
    static const float DEG2RAD = (ZPI / 180.f);
 
+   const float screenRotateSize = 0.06f;
+
    ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    // utility and math
 
@@ -551,9 +558,13 @@ namespace ImGuizmo
 	  bool mbUsingBounds;
 	  matrix_t mBoundsMatrix;
 
-
       //
       int mCurrentOperation;
+
+	  float mX = 0.f;
+	  float mY = 0.f;
+	  float mWidth = 0.f;
+	  float mHeight = 0.f;
    };
 
    static Context gContext;
@@ -584,16 +595,16 @@ namespace ImGuizmo
 
    static ImVec2 worldToPos(const vec_t& worldPos, const matrix_t& mat)
    {
-      ImGuiIO& io = ImGui::GetIO();
-
       vec_t trans;
       trans.TransformPoint(worldPos, mat);
       trans *= 0.5f / trans.w;
       trans += makeVect(0.5f, 0.5f);
       trans.y = 1.f - trans.y;
-      trans.x *= io.DisplaySize.x;
-      trans.y *= io.DisplaySize.y;
-      return ImVec2(trans.x, trans.y);
+	  trans.x *= gContext.mWidth;
+	  trans.y *= gContext.mHeight;
+	  trans.x += gContext.mX;
+	  trans.y += gContext.mY;
+	  return ImVec2(trans.x, trans.y);
    }
 
    static void ComputeCameraRay(vec_t &rayOrigin, vec_t &rayDir)
@@ -603,9 +614,9 @@ namespace ImGuizmo
       matrix_t mViewProjInverse;
       mViewProjInverse.Inverse(gContext.mViewMat * gContext.mProjectionMat);
 
-      float mox = (io.MousePos.x / io.DisplaySize.x) * 2.f - 1.f;
-      float moy = (1.f - (io.MousePos.y / io.DisplaySize.y)) * 2.f - 1.f;
-
+	  float mox = ((io.MousePos.x - gContext.mX) / gContext.mWidth) * 2.f - 1.f;
+	  float moy = (1.f - ((io.MousePos.y - gContext.mY) / gContext.mHeight)) * 2.f - 1.f;
+	  
       rayOrigin.Transform(makeVect(mox, moy, 0.f, 1.f), mViewProjInverse);
       rayOrigin *= 1.f / rayOrigin.w;
       vec_t rayEnd;
@@ -625,12 +636,21 @@ namespace ImGuizmo
       return -(numer / denom);
    }
 
+   void SetRect(float x, float y, float width, float height) // Stores the screen rectangle that worldToPos() and ComputeCameraRay() read from gContext.
+   {
+	   gContext.mX = x; // added to projected x in worldToPos(); subtracted from the mouse x in ComputeCameraRay()
+	   gContext.mY = y; // added to projected y in worldToPos(); subtracted from the mouse y in ComputeCameraRay()
+	   gContext.mWidth = width; // scales projected x; also a divisor in ComputeCameraRay(), and the member defaults to 0.f, so pass a non-zero width
+	   gContext.mHeight = height; // scales projected y and the screen-rotation ring radius; also a divisor in ComputeCameraRay() (default 0.f)
+   }
+
    void BeginFrame()
    {
       ImGuiIO& io = ImGui::GetIO();
 
       ImGui::Begin("gizmo", NULL, io.DisplaySize, 0, ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar | ImGuiWindowFlags_NoInputs | ImGuiWindowFlags_NoSavedSettings | ImGuiWindowFlags_NoFocusOnAppearing | ImGuiWindowFlags_NoBringToFrontOnFocus);
-      gContext.mDrawList = ImGui::GetWindowDrawList();
+
+	  gContext.mDrawList = ImGui::GetWindowDrawList();
 
       ImGui::End();
    }
@@ -726,6 +746,8 @@ namespace ImGuizmo
             for (int i = 0; i < 3; i++)
                colors[i + 1] = (type == (int)(SCALE_X + i)) ? selectionColor : directionColor[i];
             break;
+		 default:
+			 break;
          }
       }
       else
@@ -821,7 +843,6 @@ namespace ImGuizmo
    static void DrawRotationGizmo(int type)
    {
       ImDrawList* drawList = gContext.mDrawList;
-      ImGuiIO& io = ImGui::GetIO();
 
       // colors
       ImU32 colors[7];
@@ -845,7 +866,7 @@ namespace ImGuizmo
          }
          drawList->AddPolyline(circlePos, halfCircleSegmentCount, colors[3 - axis], false, 2, true);
       }
-      drawList->AddCircle(worldToPos(gContext.mModel.v.position, gContext.mViewProjection), 0.06f * io.DisplaySize.x, colors[0], 64);
+      drawList->AddCircle(worldToPos(gContext.mModel.v.position, gContext.mViewProjection), screenRotateSize * gContext.mHeight, colors[0], 64);
 
       if (gContext.mbUsing)
       {
@@ -951,6 +972,8 @@ namespace ImGuizmo
    static void DrawTranslationGizmo(int type)
    {
       ImDrawList* drawList = gContext.mDrawList;
+	  if (!drawList)
+		  return;
 
       // colors
       ImU32 colors[7];
@@ -1226,7 +1249,7 @@ namespace ImGuizmo
 
       vec_t deltaScreen = { io.MousePos.x - gContext.mScreenSquareCenter.x, io.MousePos.y - gContext.mScreenSquareCenter.y, 0.f, 0.f };
       float dist = deltaScreen.Length();
-      if (dist >= 0.058f * io.DisplaySize.x && dist < 0.062f * io.DisplaySize.x)
+	  if (dist >= (screenRotateSize - 0.002f) * gContext.mHeight && dist < (screenRotateSize + 0.002f) * gContext.mHeight)
          type = ROTATE_SCREEN;
 
       const vec_t planNormals[] = { gContext.mModel.v.right, gContext.mModel.v.up, gContext.mModel.v.dir};
@@ -1688,3 +1711,4 @@ namespace ImGuizmo
       }
    }
 };
+
diff --git a/3rdparty/bgfx/CMakeLists.txt b/3rdparty/bgfx/CMakeLists.txt
index 4855aea..0498d2e 100644
--- a/3rdparty/bgfx/CMakeLists.txt
+++ b/3rdparty/bgfx/CMakeLists.txt
@@ -13,6 +13,7 @@ ADD_DEFINITIONS ("-D__STDC_LIMIT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_CONSTANT
 
 INCLUDE_DIRECTORIES (
 	${CMAKE_CURRENT_SOURCE_DIR}/bx/include
+	${CMAKE_CURRENT_SOURCE_DIR}/../bimg/include
 	${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/khronos
 	${CMAKE_CURRENT_SOURCE_DIR}/include
 	${CMAKE_CURRENT_SOURCE_DIR}/src
diff --git a/3rdparty/bgfx/LICENSE b/3rdparty/bgfx/LICENSE
index af0e7b0..dd17ed4 100644
--- a/3rdparty/bgfx/LICENSE
+++ b/3rdparty/bgfx/LICENSE
@@ -1,7 +1,5 @@
 Copyright 2010-2017 Branimir Karadzic. All rights reserved.
 
-https://github.com/bkaradzic/bgfx
-
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
 
@@ -22,5 +20,3 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY
 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 OF THE POSSIBILITY OF SUCH DAMAGE.
-
-https://github.com/bkaradzic/bgfx/blob/master/LICENSE
diff --git a/3rdparty/bgfx/README.md b/3rdparty/bgfx/README.md
index bd56e14..5d918d3 100644
--- a/3rdparty/bgfx/README.md
+++ b/3rdparty/bgfx/README.md
@@ -237,6 +237,11 @@ developed using the Nim programming language, and is currently in pre-alpha
 status. The immediate development focus for FRAG is supporting the creation of
 2D desktop games.
 
+## vg-renderer
+
+https://github.com/jdryg/vg-renderer#vg-renderer - A vector graphics renderer
+for bgfx, based on ideas from both NanoVG and ImDrawList (Dear ImGui)
+
 [License (BSD 2-clause)](https://bkaradzic.github.io/bgfx/license.html)
 -----------------------------------------------------------------------
 
diff --git a/3rdparty/bgfx/examples/common/bgfx_utils.cpp b/3rdparty/bgfx/examples/common/bgfx_utils.cpp
index edea94c..6a92d1c 100644
--- a/3rdparty/bgfx/examples/common/bgfx_utils.cpp
+++ b/3rdparty/bgfx/examples/common/bgfx_utils.cpp
@@ -21,6 +21,8 @@ namespace stl = tinystl;
 
 #include "bgfx_utils.h"
 
+#include <bimg/decode.h>
+
 void* load(bx::FileReaderI* _reader, bx::AllocatorI* _allocator, const char* _filePath, uint32_t* _size)
 {
 	if (bx::open(_reader, _filePath) )
@@ -149,8 +151,8 @@ bgfx::ProgramHandle loadProgram(const char* _vsName, const char* _fsName)
 static void imageReleaseCb(void* _ptr, void* _userData)
 {
 	BX_UNUSED(_ptr);
-	bgfx::ImageContainer* imageContainer = (bgfx::ImageContainer*)_userData;
-	bgfx::imageFree(imageContainer);
+	bimg::ImageContainer* imageContainer = (bimg::ImageContainer*)_userData;
+	bimg::imageFree(imageContainer);
 }
 
 bgfx::TextureHandle loadTexture(bx::FileReaderI* _reader, const char* _filePath, uint32_t _flags, uint8_t _skip, bgfx::TextureInfo* _info)
@@ -162,7 +164,7 @@ bgfx::TextureHandle loadTexture(bx::FileReaderI* _reader, const char* _filePath,
 	void* data = load(_reader, entry::getAllocator(), _filePath, &size);
 	if (NULL != data)
 	{
-		bgfx::ImageContainer* imageContainer = bgfx::imageParse(entry::getAllocator(), data, size);
+		bimg::ImageContainer* imageContainer = bimg::imageParse(entry::getAllocator(), data, size);
 
 		if (NULL != imageContainer)
 		{
@@ -180,7 +182,7 @@ bgfx::TextureHandle loadTexture(bx::FileReaderI* _reader, const char* _filePath,
 					  uint16_t(imageContainer->m_width)
 					, 1 < imageContainer->m_numMips
 					, imageContainer->m_numLayers
-					, imageContainer->m_format
+					, bgfx::TextureFormat::Enum(imageContainer->m_format)
 					, _flags
 					, mem
 					);
@@ -192,7 +194,7 @@ bgfx::TextureHandle loadTexture(bx::FileReaderI* _reader, const char* _filePath,
 					, uint16_t(imageContainer->m_height)
 					, 1 < imageContainer->m_numMips
 					, imageContainer->m_numLayers
-					, imageContainer->m_format
+					, bgfx::TextureFormat::Enum(imageContainer->m_format)
 					, _flags
 					, mem
 					);
@@ -208,7 +210,7 @@ bgfx::TextureHandle loadTexture(bx::FileReaderI* _reader, const char* _filePath,
 					, false
 					, false
 					, 1
-					, imageContainer->m_format
+					, bgfx::TextureFormat::Enum(imageContainer->m_format)
 					);
 			}
 		}
@@ -222,12 +224,12 @@ bgfx::TextureHandle loadTexture(const char* _name, uint32_t _flags, uint8_t _ski
 	return loadTexture(entry::getFileReader(), _name, _flags, _skip, _info);
 }
 
-bgfx::ImageContainer* imageLoad(const char* _filePath, bgfx::TextureFormat::Enum _dstFormat)
+bimg::ImageContainer* imageLoad(const char* _filePath, bgfx::TextureFormat::Enum _dstFormat)
 {
 	uint32_t size = 0;
 	void* data = loadMem(entry::getFileReader(), entry::getAllocator(), _filePath, &size);
 
-	return bgfx::imageParse(entry::getAllocator(), data, size, _dstFormat);
+	return bimg::imageParse(entry::getAllocator(), data, size, bimg::TextureFormat::Enum(_dstFormat) );
 }
 
 void calcTangents(void* _vertices, uint16_t _numVertices, bgfx::VertexDecl _decl, const uint16_t* _indices, uint32_t _numIndices)
diff --git a/3rdparty/bgfx/examples/common/bgfx_utils.h b/3rdparty/bgfx/examples/common/bgfx_utils.h
index f948d1b..e81510f 100644
--- a/3rdparty/bgfx/examples/common/bgfx_utils.h
+++ b/3rdparty/bgfx/examples/common/bgfx_utils.h
@@ -6,8 +6,9 @@
 #ifndef BGFX_UTILS_H_HEADER_GUARD
 #define BGFX_UTILS_H_HEADER_GUARD
 
+#include <bx/pixelformat.h>
 #include <bgfx/bgfx.h>
-#include "image.h"
+#include <bimg/bimg.h>
 
 ///
 void* load(const char* _filePath, uint32_t* _size = NULL);
@@ -25,7 +26,7 @@ bgfx::ProgramHandle loadProgram(const char* _vsName, const char* _fsName);
 bgfx::TextureHandle loadTexture(const char* _name, uint32_t _flags = BGFX_TEXTURE_NONE, uint8_t _skip = 0, bgfx::TextureInfo* _info = NULL);
 
 ///
-bgfx::ImageContainer* imageLoad(const char* _filePath, bgfx::TextureFormat::Enum _dstFormat);
+bimg::ImageContainer* imageLoad(const char* _filePath, bgfx::TextureFormat::Enum _dstFormat);
 
 ///
 void calcTangents(void* _vertices, uint16_t _numVertices, bgfx::VertexDecl _decl, const uint16_t* _indices, uint32_t _numIndices);
@@ -44,6 +45,21 @@ inline bool checkAvailTransientBuffers(uint32_t _numVertices, const bgfx::Vertex
 		;
 }
 
+/// Remaps each component c to c*0.5 + 0.5 and packs the four results into one uint32_t via bx::packRgba8.
+inline uint32_t encodeNormalRgba8(float _x, float _y = 0.0f, float _z = 0.0f, float _w = 0.0f)
+{
+	const float src[] =
+	{
+		_x * 0.5f + 0.5f, // -1 maps to 0, +1 maps to 1
+		_y * 0.5f + 0.5f, // omitted components default to 0.0f, which remaps to 0.5
+		_z * 0.5f + 0.5f,
+		_w * 0.5f + 0.5f,
+	};
+	uint32_t dst;
+	bx::packRgba8(&dst, src); // NOTE(review): presumably expects src in [0, 1], 8 bits per channel -- confirm in bx/pixelformat.h
+	return dst;
+}
+
 ///
 struct MeshState
 {
diff --git a/3rdparty/bgfx/examples/common/imgui/imgui.cpp b/3rdparty/bgfx/examples/common/imgui/imgui.cpp
index 235a914..af1ff08 100644
--- a/3rdparty/bgfx/examples/common/imgui/imgui.cpp
+++ b/3rdparty/bgfx/examples/common/imgui/imgui.cpp
@@ -99,6 +99,8 @@ void  imguiFree(void* _ptr, void*);
 BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4505); // error C4505: '' : unreferenced local function has been removed
 BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wunused-function"); // warning: ‘int rect_width_compare(const void*, const void*)’ defined but not used
 BX_PRAGMA_DIAGNOSTIC_PUSH();
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG("-Wunknown-pragmas")
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wunused-but-set-variable"); // warning: variable ‘L1’ set but not used
 BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wtype-limits"); // warning: comparison is always true due to limited range of data type
 #define STBTT_malloc(_size, _userData) imguiMalloc(_size, _userData)
 #define STBTT_free(_ptr, _userData) imguiFree(_ptr, _userData)
diff --git a/3rdparty/bgfx/examples/common/nanovg/nanovg.cpp b/3rdparty/bgfx/examples/common/nanovg/nanovg.cpp
index 36a87ff..c768728 100644
--- a/3rdparty/bgfx/examples/common/nanovg/nanovg.cpp
+++ b/3rdparty/bgfx/examples/common/nanovg/nanovg.cpp
@@ -23,6 +23,10 @@
 
 #include "nanovg.h"
 
+#ifndef NANOVG_HAS_STB_IMAGE
+#	define NANOVG_HAS_STB_IMAGE 0 // default: nvgCreateImage/nvgCreateImageMem skip stb_image and return 0
+#endif // NANOVG_HAS_STB_IMAGE
+
 #include <bx/macros.h>
 
 BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4701) // error C4701: potentially uninitialized local variable 'cint' used
@@ -37,31 +41,14 @@ BX_PRAGMA_DIAGNOSTIC_IGNORED_GCC("-Wunused-result");
 #include "fontstash.h"
 BX_PRAGMA_DIAGNOSTIC_POP();
 
-BX_PRAGMA_DIAGNOSTIC_PUSH();
-BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4127) // warning C4127: conditional expression is constant
+#if NANOVG_HAS_STB_IMAGE
 #define LODEPNG_NO_COMPILE_ENCODER
 #define LODEPNG_NO_COMPILE_DISK
 #define LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
 #define LODEPNG_NO_COMPILE_ERROR_TEXT
 #define LODEPNG_NO_COMPILE_ALLOCATORS
 #define LODEPNG_NO_COMPILE_CPP
-#include <lodepng/lodepng.cpp>
-BX_PRAGMA_DIAGNOSTIC_POP();
-
-void* lodepng_malloc(size_t _size)
-{
-	return ::malloc(_size);
-}
-
-void* lodepng_realloc(void* _ptr, size_t _size)
-{
-	return ::realloc(_ptr, _size);
-}
-
-void lodepng_free(void* _ptr)
-{
-	::free(_ptr);
-}
+#include <lodepng/lodepng.h>
 
 BX_PRAGMA_DIAGNOSTIC_PUSH();
 BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wmissing-field-initializers");
@@ -71,12 +58,10 @@ BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wint-to-pointer-cast")
 BX_PRAGMA_DIAGNOSTIC_IGNORED_GCC("-Wmisleading-indentation");
 BX_PRAGMA_DIAGNOSTIC_IGNORED_GCC("-Wshift-negative-value");
 #endif // BX_COMPILER_GCC >= 60000_
-#define STBI_MALLOC(_size)        lodepng_malloc(_size)
-#define STBI_REALLOC(_ptr, _size) lodepng_realloc(_ptr, _size)
-#define STBI_FREE(_ptr)           lodepng_free(_ptr)
-#define STB_IMAGE_IMPLEMENTATION
+
 #include <stb/stb_image.c>
 BX_PRAGMA_DIAGNOSTIC_POP();
+#endif // NANOVG_HAS_STB_IMAGE
 
 #ifdef _MSC_VER
 #pragma warning(disable: 4100)  // unreferenced formal parameter
@@ -833,6 +818,7 @@ void nvgFillPaint(NVGcontext* ctx, NVGpaint paint)
 
 int nvgCreateImage(NVGcontext* ctx, const char* filename, int imageFlags)
 {
+#if NANOVG_HAS_STB_IMAGE
 	int w, h, n, image;
 	unsigned char* img;
 	stbi_set_unpremultiply_on_load(1);
@@ -845,10 +831,15 @@ int nvgCreateImage(NVGcontext* ctx, const char* filename, int imageFlags)
 	image = nvgCreateImageRGBA(ctx, w, h, imageFlags, img);
 	stbi_image_free(img);
 	return image;
+#else
+	BX_UNUSED(ctx, filename, imageFlags);
+	return 0;
+#endif // NANOVG_HAS_STB_IMAGE
 }
 
 int nvgCreateImageMem(NVGcontext* ctx, int imageFlags, unsigned char* data, int ndata)
 {
+#if NANOVG_HAS_STB_IMAGE
 	int w, h, n, image;
 	unsigned char* img = stbi_load_from_memory(data, ndata, &w, &h, &n, 4);
 	if (img == NULL) {
@@ -858,6 +849,10 @@ int nvgCreateImageMem(NVGcontext* ctx, int imageFlags, unsigned char* data, int
 	image = nvgCreateImageRGBA(ctx, w, h, imageFlags, img);
 	stbi_image_free(img);
 	return image;
+#else
+	BX_UNUSED(ctx, imageFlags, data, ndata);
+	return 0;
+#endif // NANOVG_HAS_STB_IMAGE
 }
 
 int nvgCreateImageRGBA(NVGcontext* ctx, int w, int h, int imageFlags, const unsigned char* data)
diff --git a/3rdparty/bgfx/examples/common/shaderlib.sh b/3rdparty/bgfx/examples/common/shaderlib.sh
index 5871aaf..a890edb 100644
--- a/3rdparty/bgfx/examples/common/shaderlib.sh
+++ b/3rdparty/bgfx/examples/common/shaderlib.sh
@@ -383,4 +383,13 @@ vec3 fixCubeLookup(vec3 _v, float _lod, float _topLevelCubeSize)
 	return _v;
 }
 
+vec2 texture2DBc5(sampler2D _sampler, vec2 _uv)
+{ // Samples _sampler at _uv and returns two of the sampled channels as a vec2.
+#if BGFX_SHADER_LANGUAGE_HLSL && BGFX_SHADER_LANGUAGE_HLSL <= 3
+	return texture2D(_sampler, _uv).yx; // NOTE(review): swap presumably compensates for BC5 channel order on HLSL <= 3 -- confirm
+#else
+	return texture2D(_sampler, _uv).xy;
+#endif // BGFX_SHADER_LANGUAGE_HLSL && BGFX_SHADER_LANGUAGE_HLSL <= 3
+}
+
 #endif // __SHADERLIB_SH__
diff --git a/3rdparty/bgfx/examples/makefile b/3rdparty/bgfx/examples/makefile
index f39a5d4..0df96c2 100644
--- a/3rdparty/bgfx/examples/makefile
+++ b/3rdparty/bgfx/examples/makefile
@@ -33,6 +33,7 @@ rebuild:
 	@make -s --no-print-directory rebuild -C 28-wireframe
 	@make -s --no-print-directory rebuild -C 30-picking
 	@make -s --no-print-directory rebuild -C 31-rsm
+	@make -s --no-print-directory rebuild -C 33-pom
 	@make -s --no-print-directory rebuild -C common/debugdraw
 	@make -s --no-print-directory rebuild -C common/font
 	@make -s --no-print-directory rebuild -C common/imgui
diff --git a/3rdparty/bgfx/include/bgfx/bgfx.h b/3rdparty/bgfx/include/bgfx/bgfx.h
index 32db928..b39c74b 100644
--- a/3rdparty/bgfx/include/bgfx/bgfx.h
+++ b/3rdparty/bgfx/include/bgfx/bgfx.h
@@ -10,7 +10,7 @@
 #include <stdint.h> // uint32_t
 #include <stdlib.h> // NULL
 
-#include <bgfx/defines.h>
+#include "defines.h"
 
 ///
 #define BGFX_HANDLE(_name) \
@@ -691,16 +691,16 @@ namespace bgfx
 			float translation[3];       //!< Eye translation.
 			float fov[4];               //!< Field of view (up, down, left, right).
 			float viewOffset[3];        //!< Eye view matrix translation adjustment.
-			float projection[16];       //!< Eye projection matrix
+			float projection[16];       //!< Eye projection matrix.
 			float pixelsPerTanAngle[2]; //!< Number of pixels that fit in tan(angle) = 1.
 		};
 
 		Eye eye[2];
-		uint16_t width;        //!< Framebuffer width.
-		uint16_t height;       //!< Framebuffer width.
-		uint32_t deviceWidth;  //!< Device resolution width
-		uint32_t deviceHeight; //!< Device resolution height
-		uint8_t flags;         //!< Status flags
+		uint16_t width;        //!< Frame buffer width.
+		uint16_t height;       //!< Frame buffer height.
+		uint32_t deviceWidth;  //!< Device resolution width.
+		uint32_t deviceHeight; //!< Device resolution height.
+		uint8_t  flags;        //!< Status flags.
 	};
 
 	/// Renderer statistics data.
@@ -933,44 +933,6 @@ namespace bgfx
 		, bool _index32
 		);
 
-	/// Swizzle RGBA8 image to BGRA8.
-	///
-	/// @param[in] _dst Destination image. Must be the same size as input image.
-	///   _dst might be pointer to the same memory as _src.
-	/// @param[in] _width Width of input image (pixels).
-	/// @param[in] _height Height of input image (pixels).
-	/// @param[in] _pitch Pitch of input image (bytes).
-	/// @param[in] _src Source image.
-	///
-	/// @attention C99 equivalent is `bgfx_image_swizzle_bgra8`.
-	///
-	void imageSwizzleBgra8(
-		  void* _dst
-		, uint32_t _width
-		, uint32_t _height
-		, uint32_t _pitch
-		, const void* _src
-		);
-
-	/// Downsample RGBA8 image with 2x2 pixel average filter.
-	///
-	/// @param[in] _dst Destination image. Must be at least quarter size of
-	///   input image. _dst might be pointer to the same memory as _src.
-	/// @param[in] _width Width of input image (pixels).
-	/// @param[in] _height Height of input image (pixels).
-	/// @param[in] _pitch Pitch of input image (bytes).
-	/// @param[in] _src Source image.
-	///
-	/// @attention C99 equivalent is `bgfx_image_rgba8_downsample_2x2`.
-	///
-	void imageRgba8Downsample2x2(
-		  void* _dst
-		, uint32_t _width
-		, uint32_t _height
-		, uint32_t _pitch
-		, const void* _src
-		);
-
 	/// Returns supported backend API renderers.
 	///
 	/// @param[in] _max Maximum number of elements in _enum array.
diff --git a/3rdparty/bgfx/include/bgfx/bgfxdefines.h b/3rdparty/bgfx/include/bgfx/bgfxdefines.h
deleted file mode 100644
index f42fd25..0000000
--- a/3rdparty/bgfx/include/bgfx/bgfxdefines.h
+++ /dev/null
@@ -1,444 +0,0 @@
-/*
- * Copyright 2011-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
- */
-
-#ifndef BGFX_DEFINES_H_HEADER_GUARD
-#define BGFX_DEFINES_H_HEADER_GUARD
-
-#define BGFX_API_VERSION UINT32_C(29)
-
-///
-#define BGFX_STATE_RGB_WRITE               UINT64_C(0x0000000000000001) //!< Enable RGB write.
-#define BGFX_STATE_ALPHA_WRITE             UINT64_C(0x0000000000000002) //!< Enable alpha write.
-#define BGFX_STATE_DEPTH_WRITE             UINT64_C(0x0000000000000004) //!< Enable depth write.
-
-#define BGFX_STATE_DEPTH_TEST_LESS         UINT64_C(0x0000000000000010) //!< Enable depth test, less.
-#define BGFX_STATE_DEPTH_TEST_LEQUAL       UINT64_C(0x0000000000000020) //!< Enable depth test, less equal.
-#define BGFX_STATE_DEPTH_TEST_EQUAL        UINT64_C(0x0000000000000030) //!< Enable depth test, equal.
-#define BGFX_STATE_DEPTH_TEST_GEQUAL       UINT64_C(0x0000000000000040) //!< Enable depth test, greater equal.
-#define BGFX_STATE_DEPTH_TEST_GREATER      UINT64_C(0x0000000000000050) //!< Enable depth test, greater.
-#define BGFX_STATE_DEPTH_TEST_NOTEQUAL     UINT64_C(0x0000000000000060) //!< Enable depth test, not equal.
-#define BGFX_STATE_DEPTH_TEST_NEVER        UINT64_C(0x0000000000000070) //!< Enable depth test, never.
-#define BGFX_STATE_DEPTH_TEST_ALWAYS       UINT64_C(0x0000000000000080) //!< Enable depth test, always.
-#define BGFX_STATE_DEPTH_TEST_SHIFT        4                            //!< Depth test state bit shift.
-#define BGFX_STATE_DEPTH_TEST_MASK         UINT64_C(0x00000000000000f0) //!< Depth test state bit mask.
-
-#define BGFX_STATE_BLEND_ZERO              UINT64_C(0x0000000000001000) //!<
-#define BGFX_STATE_BLEND_ONE               UINT64_C(0x0000000000002000) //!<
-#define BGFX_STATE_BLEND_SRC_COLOR         UINT64_C(0x0000000000003000) //!<
-#define BGFX_STATE_BLEND_INV_SRC_COLOR     UINT64_C(0x0000000000004000) //!<
-#define BGFX_STATE_BLEND_SRC_ALPHA         UINT64_C(0x0000000000005000) //!<
-#define BGFX_STATE_BLEND_INV_SRC_ALPHA     UINT64_C(0x0000000000006000) //!<
-#define BGFX_STATE_BLEND_DST_ALPHA         UINT64_C(0x0000000000007000) //!<
-#define BGFX_STATE_BLEND_INV_DST_ALPHA     UINT64_C(0x0000000000008000) //!<
-#define BGFX_STATE_BLEND_DST_COLOR         UINT64_C(0x0000000000009000) //!<
-#define BGFX_STATE_BLEND_INV_DST_COLOR     UINT64_C(0x000000000000a000) //!<
-#define BGFX_STATE_BLEND_SRC_ALPHA_SAT     UINT64_C(0x000000000000b000) //!<
-#define BGFX_STATE_BLEND_FACTOR            UINT64_C(0x000000000000c000) //!<
-#define BGFX_STATE_BLEND_INV_FACTOR        UINT64_C(0x000000000000d000) //!<
-#define BGFX_STATE_BLEND_SHIFT             12                           //!< Blend state bit shift.
-#define BGFX_STATE_BLEND_MASK              UINT64_C(0x000000000ffff000) //!< Blend state bit mask.
-
-#define BGFX_STATE_BLEND_EQUATION_ADD      UINT64_C(0x0000000000000000) //!<
-#define BGFX_STATE_BLEND_EQUATION_SUB      UINT64_C(0x0000000010000000) //!<
-#define BGFX_STATE_BLEND_EQUATION_REVSUB   UINT64_C(0x0000000020000000) //!<
-#define BGFX_STATE_BLEND_EQUATION_MIN      UINT64_C(0x0000000030000000) //!<
-#define BGFX_STATE_BLEND_EQUATION_MAX      UINT64_C(0x0000000040000000) //!<
-#define BGFX_STATE_BLEND_EQUATION_SHIFT    28                           //!< Blend equation bit shift.
-#define BGFX_STATE_BLEND_EQUATION_MASK     UINT64_C(0x00000003f0000000) //!< Blend equation bit mask.
-
-#define BGFX_STATE_BLEND_INDEPENDENT       UINT64_C(0x0000000400000000) //!< Enable blend independent.
-#define BGFX_STATE_BLEND_ALPHA_TO_COVERAGE UINT64_C(0x0000000800000000) //!< Enable alpha to coverage.
-
-#define BGFX_STATE_CULL_CW                 UINT64_C(0x0000001000000000) //!< Cull clockwise triangles.
-#define BGFX_STATE_CULL_CCW                UINT64_C(0x0000002000000000) //!< Cull counter-clockwise triangles.
-#define BGFX_STATE_CULL_SHIFT              36                           //!< Culling mode bit shift.
-#define BGFX_STATE_CULL_MASK               UINT64_C(0x0000003000000000) //!< Culling mode bit mask.
-
-/// See BGFX_STATE_ALPHA_REF(_ref) helper macro.
-#define BGFX_STATE_ALPHA_REF_SHIFT         40                           //!< Alpha reference bit shift.
-#define BGFX_STATE_ALPHA_REF_MASK          UINT64_C(0x0000ff0000000000) //!< Alpha reference bit mask.
-
-#define BGFX_STATE_PT_TRISTRIP             UINT64_C(0x0001000000000000) //!< Tristrip.
-#define BGFX_STATE_PT_LINES                UINT64_C(0x0002000000000000) //!< Lines.
-#define BGFX_STATE_PT_LINESTRIP            UINT64_C(0x0003000000000000) //!< Line strip.
-#define BGFX_STATE_PT_POINTS               UINT64_C(0x0004000000000000) //!< Points.
-#define BGFX_STATE_PT_SHIFT                48                           //!< Primitive type bit shift.
-#define BGFX_STATE_PT_MASK                 UINT64_C(0x0007000000000000) //!< Primitive type bit mask.
-
-#define BGFX_STATE_POINT_SIZE_SHIFT        52                           //!< Point size bit shift.
-#define BGFX_STATE_POINT_SIZE_MASK         UINT64_C(0x00f0000000000000) //!< Point size bit mask.
-
-/// Enable MSAA write when writing into MSAA frame buffer. This flag is ignored when not writing into
-/// MSAA frame buffer.
-#define BGFX_STATE_MSAA                    UINT64_C(0x0100000000000000) //!< Enable MSAA rasterization.
-#define BGFX_STATE_LINEAA                  UINT64_C(0x0200000000000000) //!< Enable line AA rasterization.
-#define BGFX_STATE_CONSERVATIVE_RASTER     UINT64_C(0x0400000000000000) //!< Enable conservative rasterization.
-
-/// Do not use!
-#define BGFX_STATE_RESERVED_SHIFT          61                           //!< Internal bits shift.
-#define BGFX_STATE_RESERVED_MASK           UINT64_C(0xe000000000000000) //!< Internal bits mask.
-
-/// See BGFX_STATE_POINT_SIZE(_size) helper macro.
-#define BGFX_STATE_NONE                    UINT64_C(0x0000000000000000) //!< No state.
-#define BGFX_STATE_MASK                    UINT64_C(0xffffffffffffffff) //!< State mask.
-
-/// Default state is write to RGB, alpha, and depth with depth test less enabled, with clockwise
-/// culling and MSAA (when writing into MSAA frame buffer, otherwise this flag is ignored).
-#define BGFX_STATE_DEFAULT (0 \
-					| BGFX_STATE_RGB_WRITE \
-					| BGFX_STATE_ALPHA_WRITE \
-					| BGFX_STATE_DEPTH_TEST_LESS \
-					| BGFX_STATE_DEPTH_WRITE \
-					| BGFX_STATE_CULL_CW \
-					| BGFX_STATE_MSAA \
-					)
-
-#define BGFX_STATE_ALPHA_REF(_ref)   ( ( (uint64_t)(_ref )<<BGFX_STATE_ALPHA_REF_SHIFT )&BGFX_STATE_ALPHA_REF_MASK)
-#define BGFX_STATE_POINT_SIZE(_size) ( ( (uint64_t)(_size)<<BGFX_STATE_POINT_SIZE_SHIFT)&BGFX_STATE_POINT_SIZE_MASK)
-
-///
-#define BGFX_STATE_BLEND_FUNC_SEPARATE(_srcRGB, _dstRGB, _srcA, _dstA) (UINT64_C(0) \
-					| ( ( (uint64_t)(_srcRGB)|( (uint64_t)(_dstRGB)<<4) )   ) \
-					| ( ( (uint64_t)(_srcA  )|( (uint64_t)(_dstA  )<<4) )<<8) \
-					)
-
-#define BGFX_STATE_BLEND_EQUATION_SEPARATE(_rgb, _a) ( (uint64_t)(_rgb)|( (uint64_t)(_a)<<3) )
-
-///
-#define BGFX_STATE_BLEND_FUNC(_src, _dst)    BGFX_STATE_BLEND_FUNC_SEPARATE(_src, _dst, _src, _dst)
-#define BGFX_STATE_BLEND_EQUATION(_equation) BGFX_STATE_BLEND_EQUATION_SEPARATE(_equation, _equation)
-
-#define BGFX_STATE_BLEND_ADD         (BGFX_STATE_BLEND_FUNC(BGFX_STATE_BLEND_ONE,       BGFX_STATE_BLEND_ONE          ) )
-#define BGFX_STATE_BLEND_ALPHA       (BGFX_STATE_BLEND_FUNC(BGFX_STATE_BLEND_SRC_ALPHA, BGFX_STATE_BLEND_INV_SRC_ALPHA) )
-#define BGFX_STATE_BLEND_DARKEN      (BGFX_STATE_BLEND_FUNC(BGFX_STATE_BLEND_ONE,       BGFX_STATE_BLEND_ONE          ) | BGFX_STATE_BLEND_EQUATION(BGFX_STATE_BLEND_EQUATION_MIN) )
-#define BGFX_STATE_BLEND_LIGHTEN     (BGFX_STATE_BLEND_FUNC(BGFX_STATE_BLEND_ONE,       BGFX_STATE_BLEND_ONE          ) | BGFX_STATE_BLEND_EQUATION(BGFX_STATE_BLEND_EQUATION_MAX) )
-#define BGFX_STATE_BLEND_MULTIPLY    (BGFX_STATE_BLEND_FUNC(BGFX_STATE_BLEND_DST_COLOR, BGFX_STATE_BLEND_ZERO         ) )
-#define BGFX_STATE_BLEND_NORMAL      (BGFX_STATE_BLEND_FUNC(BGFX_STATE_BLEND_ONE,       BGFX_STATE_BLEND_INV_SRC_ALPHA) )
-#define BGFX_STATE_BLEND_SCREEN      (BGFX_STATE_BLEND_FUNC(BGFX_STATE_BLEND_ONE,       BGFX_STATE_BLEND_INV_SRC_COLOR) )
-#define BGFX_STATE_BLEND_LINEAR_BURN (BGFX_STATE_BLEND_FUNC(BGFX_STATE_BLEND_DST_COLOR, BGFX_STATE_BLEND_INV_DST_COLOR) | BGFX_STATE_BLEND_EQUATION(BGFX_STATE_BLEND_EQUATION_SUB) )
-
-///
-#define BGFX_STATE_BLEND_FUNC_RT_x(_src, _dst) (0 \
-					| ( uint32_t( (_src)>>BGFX_STATE_BLEND_SHIFT) \
-					| ( uint32_t( (_dst)>>BGFX_STATE_BLEND_SHIFT)<<4) ) \
-					)
-
-#define BGFX_STATE_BLEND_FUNC_RT_xE(_src, _dst, _equation) (0 \
-					| BGFX_STATE_BLEND_FUNC_RT_x(_src, _dst) \
-					| ( uint32_t( (_equation)>>BGFX_STATE_BLEND_EQUATION_SHIFT)<<8) \
-					)
-
-#define BGFX_STATE_BLEND_FUNC_RT_1(_src, _dst)  (BGFX_STATE_BLEND_FUNC_RT_x(_src, _dst)<< 0)
-#define BGFX_STATE_BLEND_FUNC_RT_2(_src, _dst)  (BGFX_STATE_BLEND_FUNC_RT_x(_src, _dst)<<11)
-#define BGFX_STATE_BLEND_FUNC_RT_3(_src, _dst)  (BGFX_STATE_BLEND_FUNC_RT_x(_src, _dst)<<22)
-
-#define BGFX_STATE_BLEND_FUNC_RT_1E(_src, _dst, _equation) (BGFX_STATE_BLEND_FUNC_RT_xE(_src, _dst, _equation)<< 0)
-#define BGFX_STATE_BLEND_FUNC_RT_2E(_src, _dst, _equation) (BGFX_STATE_BLEND_FUNC_RT_xE(_src, _dst, _equation)<<11)
-#define BGFX_STATE_BLEND_FUNC_RT_3E(_src, _dst, _equation) (BGFX_STATE_BLEND_FUNC_RT_xE(_src, _dst, _equation)<<22)
-
-///
-#define BGFX_STENCIL_FUNC_REF_SHIFT      0                    //!<
-#define BGFX_STENCIL_FUNC_REF_MASK       UINT32_C(0x000000ff) //!<
-#define BGFX_STENCIL_FUNC_RMASK_SHIFT    8                    //!<
-#define BGFX_STENCIL_FUNC_RMASK_MASK     UINT32_C(0x0000ff00) //!<
-
-#define BGFX_STENCIL_TEST_LESS           UINT32_C(0x00010000) //!< Enable stencil test, less.
-#define BGFX_STENCIL_TEST_LEQUAL         UINT32_C(0x00020000) //!<
-#define BGFX_STENCIL_TEST_EQUAL          UINT32_C(0x00030000) //!<
-#define BGFX_STENCIL_TEST_GEQUAL         UINT32_C(0x00040000) //!<
-#define BGFX_STENCIL_TEST_GREATER        UINT32_C(0x00050000) //!<
-#define BGFX_STENCIL_TEST_NOTEQUAL       UINT32_C(0x00060000) //!<
-#define BGFX_STENCIL_TEST_NEVER          UINT32_C(0x00070000) //!<
-#define BGFX_STENCIL_TEST_ALWAYS         UINT32_C(0x00080000) //!<
-#define BGFX_STENCIL_TEST_SHIFT          16                   //!< Stencil test bit shift.
-#define BGFX_STENCIL_TEST_MASK           UINT32_C(0x000f0000) //!< Stencil test bit mask.
-
-#define BGFX_STENCIL_OP_FAIL_S_ZERO      UINT32_C(0x00000000) //!< Zero.
-#define BGFX_STENCIL_OP_FAIL_S_KEEP      UINT32_C(0x00100000) //!< Keep.
-#define BGFX_STENCIL_OP_FAIL_S_REPLACE   UINT32_C(0x00200000) //!< Replace.
-#define BGFX_STENCIL_OP_FAIL_S_INCR      UINT32_C(0x00300000) //!< Increment and wrap.
-#define BGFX_STENCIL_OP_FAIL_S_INCRSAT   UINT32_C(0x00400000) //!< Increment and clamp.
-#define BGFX_STENCIL_OP_FAIL_S_DECR      UINT32_C(0x00500000) //!< Decrement and wrap.
-#define BGFX_STENCIL_OP_FAIL_S_DECRSAT   UINT32_C(0x00600000) //!< Decrement and clamp.
-#define BGFX_STENCIL_OP_FAIL_S_INVERT    UINT32_C(0x00700000) //!< Invert.
-#define BGFX_STENCIL_OP_FAIL_S_SHIFT     20                   //!< Stencil operation fail bit shift.
-#define BGFX_STENCIL_OP_FAIL_S_MASK      UINT32_C(0x00f00000) //!< Stencil operation fail bit mask.
-
-#define BGFX_STENCIL_OP_FAIL_Z_ZERO      UINT32_C(0x00000000) //!<
-#define BGFX_STENCIL_OP_FAIL_Z_KEEP      UINT32_C(0x01000000) //!<
-#define BGFX_STENCIL_OP_FAIL_Z_REPLACE   UINT32_C(0x02000000) //!<
-#define BGFX_STENCIL_OP_FAIL_Z_INCR      UINT32_C(0x03000000) //!<
-#define BGFX_STENCIL_OP_FAIL_Z_INCRSAT   UINT32_C(0x04000000) //!<
-#define BGFX_STENCIL_OP_FAIL_Z_DECR      UINT32_C(0x05000000) //!<
-#define BGFX_STENCIL_OP_FAIL_Z_DECRSAT   UINT32_C(0x06000000) //!<
-#define BGFX_STENCIL_OP_FAIL_Z_INVERT    UINT32_C(0x07000000) //!<
-#define BGFX_STENCIL_OP_FAIL_Z_SHIFT     24                   //!< Stencil operation fail depth bit shift.
-#define BGFX_STENCIL_OP_FAIL_Z_MASK      UINT32_C(0x0f000000) //!< Stencil operation fail depth bit mask.
-
-#define BGFX_STENCIL_OP_PASS_Z_ZERO      UINT32_C(0x00000000) //!<
-#define BGFX_STENCIL_OP_PASS_Z_KEEP      UINT32_C(0x10000000) //!<
-#define BGFX_STENCIL_OP_PASS_Z_REPLACE   UINT32_C(0x20000000) //!<
-#define BGFX_STENCIL_OP_PASS_Z_INCR      UINT32_C(0x30000000) //!<
-#define BGFX_STENCIL_OP_PASS_Z_INCRSAT   UINT32_C(0x40000000) //!<
-#define BGFX_STENCIL_OP_PASS_Z_DECR      UINT32_C(0x50000000) //!<
-#define BGFX_STENCIL_OP_PASS_Z_DECRSAT   UINT32_C(0x60000000) //!<
-#define BGFX_STENCIL_OP_PASS_Z_INVERT    UINT32_C(0x70000000) //!<
-#define BGFX_STENCIL_OP_PASS_Z_SHIFT     28                   //!< Stencil operation pass depth bit shift.
-#define BGFX_STENCIL_OP_PASS_Z_MASK      UINT32_C(0xf0000000) //!< Stencil operation pass depth bit mask.
-
-#define BGFX_STENCIL_NONE                UINT32_C(0x00000000) //!<
-#define BGFX_STENCIL_MASK                UINT32_C(0xffffffff) //!<
-#define BGFX_STENCIL_DEFAULT             UINT32_C(0x00000000) //!<
-
-/// Set stencil ref value.
-#define BGFX_STENCIL_FUNC_REF(_ref) ( (uint32_t(_ref)<<BGFX_STENCIL_FUNC_REF_SHIFT)&BGFX_STENCIL_FUNC_REF_MASK)
-
-/// Set stencil rmask value.
-#define BGFX_STENCIL_FUNC_RMASK(_mask) ( (uint32_t(_mask)<<BGFX_STENCIL_FUNC_RMASK_SHIFT)&BGFX_STENCIL_FUNC_RMASK_MASK)
-
-///
-#define BGFX_CLEAR_NONE                  UINT16_C(0x0000) //!< No clear flags.
-#define BGFX_CLEAR_COLOR                 UINT16_C(0x0001) //!< Clear color.
-#define BGFX_CLEAR_DEPTH                 UINT16_C(0x0002) //!< Clear depth.
-#define BGFX_CLEAR_STENCIL               UINT16_C(0x0004) //!< Clear stencil.
-#define BGFX_CLEAR_DISCARD_COLOR_0       UINT16_C(0x0008) //!< Discard frame buffer attachment 0.
-#define BGFX_CLEAR_DISCARD_COLOR_1       UINT16_C(0x0010) //!< Discard frame buffer attachment 1.
-#define BGFX_CLEAR_DISCARD_COLOR_2       UINT16_C(0x0020) //!< Discard frame buffer attachment 2.
-#define BGFX_CLEAR_DISCARD_COLOR_3       UINT16_C(0x0040) //!< Discard frame buffer attachment 3.
-#define BGFX_CLEAR_DISCARD_COLOR_4       UINT16_C(0x0080) //!< Discard frame buffer attachment 4.
-#define BGFX_CLEAR_DISCARD_COLOR_5       UINT16_C(0x0100) //!< Discard frame buffer attachment 5.
-#define BGFX_CLEAR_DISCARD_COLOR_6       UINT16_C(0x0200) //!< Discard frame buffer attachment 6.
-#define BGFX_CLEAR_DISCARD_COLOR_7       UINT16_C(0x0400) //!< Discard frame buffer attachment 7.
-#define BGFX_CLEAR_DISCARD_DEPTH         UINT16_C(0x0800) //!< Discard frame buffer depth attachment.
-#define BGFX_CLEAR_DISCARD_STENCIL       UINT16_C(0x1000) //!< Discard frame buffer stencil attachment.
-
-#define BGFX_CLEAR_DISCARD_COLOR_MASK (0 \
-			| BGFX_CLEAR_DISCARD_COLOR_0 \
-			| BGFX_CLEAR_DISCARD_COLOR_1 \
-			| BGFX_CLEAR_DISCARD_COLOR_2 \
-			| BGFX_CLEAR_DISCARD_COLOR_3 \
-			| BGFX_CLEAR_DISCARD_COLOR_4 \
-			| BGFX_CLEAR_DISCARD_COLOR_5 \
-			| BGFX_CLEAR_DISCARD_COLOR_6 \
-			| BGFX_CLEAR_DISCARD_COLOR_7 \
-			)
-#define BGFX_CLEAR_DISCARD_MASK (0 \
-			| BGFX_CLEAR_DISCARD_COLOR_MASK \
-			| BGFX_CLEAR_DISCARD_DEPTH \
-			| BGFX_CLEAR_DISCARD_STENCIL \
-			)
-
-#define BGFX_DEBUG_NONE                  UINT32_C(0x00000000) //!< No debug.
-#define BGFX_DEBUG_WIREFRAME             UINT32_C(0x00000001) //!< Enable wireframe for all primitives.
-#define BGFX_DEBUG_IFH                   UINT32_C(0x00000002) //!< Enable infinitely fast hardware test. No draw calls will be submitted to driver. It’s useful when profiling to quickly assess bottleneck between CPU and GPU.
-#define BGFX_DEBUG_STATS                 UINT32_C(0x00000004) //!< Enable statistics display.
-#define BGFX_DEBUG_TEXT                  UINT32_C(0x00000008) //!< Enable debug text display.
-
-///
-#define BGFX_BUFFER_NONE                 UINT16_C(0x0000) //!<
-
-#define BGFX_BUFFER_COMPUTE_FORMAT_8x1   UINT16_C(0x0001) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_8x2   UINT16_C(0x0002) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_8x4   UINT16_C(0x0003) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_16x1  UINT16_C(0x0004) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_16x2  UINT16_C(0x0005) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_16x4  UINT16_C(0x0006) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_32x1  UINT16_C(0x0007) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_32x2  UINT16_C(0x0008) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_32x4  UINT16_C(0x0009) //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_SHIFT 0                //!<
-#define BGFX_BUFFER_COMPUTE_FORMAT_MASK  UINT16_C(0x000f) //!<
-
-#define BGFX_BUFFER_COMPUTE_TYPE_UINT    UINT16_C(0x0010) //!<
-#define BGFX_BUFFER_COMPUTE_TYPE_INT     UINT16_C(0x0020) //!<
-#define BGFX_BUFFER_COMPUTE_TYPE_FLOAT   UINT16_C(0x0030) //!<
-#define BGFX_BUFFER_COMPUTE_TYPE_SHIFT   4                //!<
-#define BGFX_BUFFER_COMPUTE_TYPE_MASK    UINT16_C(0x0030) //!<
-
-#define BGFX_BUFFER_COMPUTE_READ         UINT16_C(0x0100) //!< Buffer will be read by shader.
-#define BGFX_BUFFER_COMPUTE_WRITE        UINT16_C(0x0200) //!< Buffer will be used for writing.
-#define BGFX_BUFFER_DRAW_INDIRECT        UINT16_C(0x0400) //!< Buffer will be used for storing draw indirect commands.
-#define BGFX_BUFFER_ALLOW_RESIZE         UINT16_C(0x0800) //!<
-#define BGFX_BUFFER_INDEX32              UINT16_C(0x1000) //!<
-
-#define BGFX_BUFFER_COMPUTE_READ_WRITE (0 \
-			| BGFX_BUFFER_COMPUTE_READ \
-			| BGFX_BUFFER_COMPUTE_WRITE \
-			)
-
-///
-#define BGFX_TEXTURE_NONE                UINT32_C(0x00000000) //!<
-#define BGFX_TEXTURE_U_MIRROR            UINT32_C(0x00000001) //!<
-#define BGFX_TEXTURE_U_CLAMP             UINT32_C(0x00000002) //!<
-#define BGFX_TEXTURE_U_BORDER            UINT32_C(0x00000003) //!<
-#define BGFX_TEXTURE_U_SHIFT             0                    //!<
-#define BGFX_TEXTURE_U_MASK              UINT32_C(0x00000003) //!<
-#define BGFX_TEXTURE_V_MIRROR            UINT32_C(0x00000004) //!<
-#define BGFX_TEXTURE_V_CLAMP             UINT32_C(0x00000008) //!<
-#define BGFX_TEXTURE_V_BORDER            UINT32_C(0x0000000c) //!<
-#define BGFX_TEXTURE_V_SHIFT             2                    //!<
-#define BGFX_TEXTURE_V_MASK              UINT32_C(0x0000000c) //!<
-#define BGFX_TEXTURE_W_MIRROR            UINT32_C(0x00000010) //!<
-#define BGFX_TEXTURE_W_CLAMP             UINT32_C(0x00000020) //!<
-#define BGFX_TEXTURE_W_BORDER            UINT32_C(0x00000030) //!<
-#define BGFX_TEXTURE_W_SHIFT             4                    //!<
-#define BGFX_TEXTURE_W_MASK              UINT32_C(0x00000030) //!<
-#define BGFX_TEXTURE_MIN_POINT           UINT32_C(0x00000040) //!<
-#define BGFX_TEXTURE_MIN_ANISOTROPIC     UINT32_C(0x00000080) //!<
-#define BGFX_TEXTURE_MIN_SHIFT           6                    //!<
-#define BGFX_TEXTURE_MIN_MASK            UINT32_C(0x000000c0) //!<
-#define BGFX_TEXTURE_MAG_POINT           UINT32_C(0x00000100) //!<
-#define BGFX_TEXTURE_MAG_ANISOTROPIC     UINT32_C(0x00000200) //!<
-#define BGFX_TEXTURE_MAG_SHIFT           8                    //!<
-#define BGFX_TEXTURE_MAG_MASK            UINT32_C(0x00000300) //!<
-#define BGFX_TEXTURE_MIP_POINT           UINT32_C(0x00000400) //!<
-#define BGFX_TEXTURE_MIP_SHIFT           10                   //!<
-#define BGFX_TEXTURE_MIP_MASK            UINT32_C(0x00000400) //!<
-#define BGFX_TEXTURE_MSAA_SAMPLE         UINT32_C(0x00000800) //!<
-#define BGFX_TEXTURE_RT                  UINT32_C(0x00001000) //!<
-#define BGFX_TEXTURE_RT_MSAA_X2          UINT32_C(0x00002000) //!<
-#define BGFX_TEXTURE_RT_MSAA_X4          UINT32_C(0x00003000) //!<
-#define BGFX_TEXTURE_RT_MSAA_X8          UINT32_C(0x00004000) //!<
-#define BGFX_TEXTURE_RT_MSAA_X16         UINT32_C(0x00005000) //!<
-#define BGFX_TEXTURE_RT_MSAA_SHIFT       12                   //!<
-#define BGFX_TEXTURE_RT_MSAA_MASK        UINT32_C(0x00007000) //!<
-#define BGFX_TEXTURE_RT_WRITE_ONLY       UINT32_C(0x00008000) //!<
-#define BGFX_TEXTURE_RT_MASK             UINT32_C(0x0000f000) //!<
-#define BGFX_TEXTURE_COMPARE_LESS        UINT32_C(0x00010000) //!<
-#define BGFX_TEXTURE_COMPARE_LEQUAL      UINT32_C(0x00020000) //!<
-#define BGFX_TEXTURE_COMPARE_EQUAL       UINT32_C(0x00030000) //!<
-#define BGFX_TEXTURE_COMPARE_GEQUAL      UINT32_C(0x00040000) //!<
-#define BGFX_TEXTURE_COMPARE_GREATER     UINT32_C(0x00050000) //!<
-#define BGFX_TEXTURE_COMPARE_NOTEQUAL    UINT32_C(0x00060000) //!<
-#define BGFX_TEXTURE_COMPARE_NEVER       UINT32_C(0x00070000) //!<
-#define BGFX_TEXTURE_COMPARE_ALWAYS      UINT32_C(0x00080000) //!<
-#define BGFX_TEXTURE_COMPARE_SHIFT       16                   //!<
-#define BGFX_TEXTURE_COMPARE_MASK        UINT32_C(0x000f0000) //!<
-#define BGFX_TEXTURE_COMPUTE_WRITE       UINT32_C(0x00100000) //!<
-#define BGFX_TEXTURE_SRGB                UINT32_C(0x00200000) //!<
-#define BGFX_TEXTURE_BLIT_DST            UINT32_C(0x00400000) //!<
-#define BGFX_TEXTURE_READ_BACK           UINT32_C(0x00800000) //!<
-#define BGFX_TEXTURE_BORDER_COLOR_SHIFT  24                   //!<
-#define BGFX_TEXTURE_BORDER_COLOR_MASK   UINT32_C(0x0f000000) //!<
-#define BGFX_TEXTURE_RESERVED_SHIFT      28                   //!<
-#define BGFX_TEXTURE_RESERVED_MASK       UINT32_C(0xf0000000) //!<
-
-#define BGFX_TEXTURE_BORDER_COLOR(_index) ( (_index << BGFX_TEXTURE_BORDER_COLOR_SHIFT) & BGFX_TEXTURE_BORDER_COLOR_MASK)
-
-#define BGFX_TEXTURE_SAMPLER_BITS_MASK (0 \
-			| BGFX_TEXTURE_U_MASK \
-			| BGFX_TEXTURE_V_MASK \
-			| BGFX_TEXTURE_W_MASK \
-			| BGFX_TEXTURE_MIN_MASK \
-			| BGFX_TEXTURE_MAG_MASK \
-			| BGFX_TEXTURE_MIP_MASK \
-			| BGFX_TEXTURE_COMPARE_MASK \
-			)
-
-///
-#define BGFX_RESET_NONE                  UINT32_C(0x00000000) //!< No reset flags.
-#define BGFX_RESET_FULLSCREEN            UINT32_C(0x00000001) //!< Not supported yet.
-#define BGFX_RESET_FULLSCREEN_SHIFT      0                    //!< Fullscreen bit shift.
-#define BGFX_RESET_FULLSCREEN_MASK       UINT32_C(0x00000001) //!< Fullscreen bit mask.
-#define BGFX_RESET_MSAA_X2               UINT32_C(0x00000010) //!< Enable 2x MSAA.
-#define BGFX_RESET_MSAA_X4               UINT32_C(0x00000020) //!< Enable 4x MSAA.
-#define BGFX_RESET_MSAA_X8               UINT32_C(0x00000030) //!< Enable 8x MSAA.
-#define BGFX_RESET_MSAA_X16              UINT32_C(0x00000040) //!< Enable 16x MSAA.
-#define BGFX_RESET_MSAA_SHIFT            4                    //!< MSAA mode bit shift.
-#define BGFX_RESET_MSAA_MASK             UINT32_C(0x00000070) //!< MSAA mode bit mask.
-#define BGFX_RESET_VSYNC                 UINT32_C(0x00000080) //!< Enable V-Sync.
-#define BGFX_RESET_MAXANISOTROPY         UINT32_C(0x00000100) //!< Turn on/off max anisotropy.
-#define BGFX_RESET_CAPTURE               UINT32_C(0x00000200) //!< Begin screen capture.
-#define BGFX_RESET_HMD                   UINT32_C(0x00000400) //!< HMD stereo rendering.
-#define BGFX_RESET_HMD_DEBUG             UINT32_C(0x00000800) //!< HMD stereo rendering debug mode.
-#define BGFX_RESET_HMD_RECENTER          UINT32_C(0x00001000) //!< HMD calibration.
-#define BGFX_RESET_FLUSH_AFTER_RENDER    UINT32_C(0x00002000) //!< Flush rendering after submitting to GPU.
-#define BGFX_RESET_FLIP_AFTER_RENDER     UINT32_C(0x00004000) //!< This flag  specifies where flip occurs. Default behavior is that flip occurs before rendering new frame. This flag only has effect when `BGFX_CONFIG_MULTITHREADED=0`.
-#define BGFX_RESET_SRGB_BACKBUFFER       UINT32_C(0x00008000) //!< Enable sRGB backbuffer.
-#define BGFX_RESET_HIDPI                 UINT32_C(0x00010000) //!< Enable HiDPI rendering.
-#define BGFX_RESET_DEPTH_CLAMP           UINT32_C(0x00020000) //!< Enable depth clamp.
-#define BGFX_RESET_SUSPEND               UINT32_C(0x00040000) //!< Suspend rendering.
-
-#define BGFX_RESET_RESERVED_SHIFT        31                   //!< Internal bits shift.
-#define BGFX_RESET_RESERVED_MASK         UINT32_C(0x80000000) //!< Internal bits mask.
-
-///
-#define BGFX_CAPS_ALPHA_TO_COVERAGE      UINT64_C(0x0000000000000001) //!< Alpha to coverage is supported.
-#define BGFX_CAPS_BLEND_INDEPENDENT      UINT64_C(0x0000000000000002) //!< Blend independent is supported.
-#define BGFX_CAPS_COMPUTE                UINT64_C(0x0000000000000004) //!< Compute shaders are supported.
-#define BGFX_CAPS_CONSERVATIVE_RASTER    UINT64_C(0x0000000000000008) //!< Conservative rasterization is supported.
-#define BGFX_CAPS_DRAW_INDIRECT          UINT64_C(0x0000000000000010) //!< Draw indirect is supported.
-#define BGFX_CAPS_FRAGMENT_DEPTH         UINT64_C(0x0000000000000020) //!< Fragment depth is accessible in fragment shader.
-#define BGFX_CAPS_FRAGMENT_ORDERING      UINT64_C(0x0000000000000040) //!< Fragment ordering is available in fragment shader.
-#define BGFX_CAPS_GRAPHICS_DEBUGGER      UINT64_C(0x0000000000000080) //!< Graphics debugger is present.
-#define BGFX_CAPS_HIDPI                  UINT64_C(0x0000000000000100) //!< HiDPI rendering is supported.
-#define BGFX_CAPS_HMD                    UINT64_C(0x0000000000000200) //!< Head Mounted Display is available.
-#define BGFX_CAPS_INDEX32                UINT64_C(0x0000000000000400) //!< 32-bit indices are supported.
-#define BGFX_CAPS_INSTANCING             UINT64_C(0x0000000000000800) //!< Instancing is supported.
-#define BGFX_CAPS_OCCLUSION_QUERY        UINT64_C(0x0000000000001000) //!< Occlusion query is supported.
-#define BGFX_CAPS_RENDERER_MULTITHREADED UINT64_C(0x0000000000002000) //!< Renderer is on separate thread.
-#define BGFX_CAPS_SWAP_CHAIN             UINT64_C(0x0000000000004000) //!< Multiple windows are supported.
-#define BGFX_CAPS_TEXTURE_2D_ARRAY       UINT64_C(0x0000000000008000) //!< 2D texture array is supported.
-#define BGFX_CAPS_TEXTURE_3D             UINT64_C(0x0000000000010000) //!< 3D textures are supported.
-#define BGFX_CAPS_TEXTURE_BLIT           UINT64_C(0x0000000000020000) //!< Texture blit is supported.
-#define BGFX_CAPS_TEXTURE_COMPARE_ALL    UINT64_C(0x00000000000c0000) //!< All texture compare modes are supported.
-#define BGFX_CAPS_TEXTURE_COMPARE_LEQUAL UINT64_C(0x0000000000080000) //!< Texture compare less equal mode is supported.
-#define BGFX_CAPS_TEXTURE_CUBE_ARRAY     UINT64_C(0x0000000000100000) //!< Cubemap texture array is supported.
-#define BGFX_CAPS_TEXTURE_READ_BACK      UINT64_C(0x0000000000200000) //!< Read-back texture is supported.
-#define BGFX_CAPS_VERTEX_ATTRIB_HALF     UINT64_C(0x0000000000400000) //!< Vertex attribute half-float is supported.
-#define BGFX_CAPS_VERTEX_ATTRIB_UINT10   UINT64_C(0x0000000000800000) //!< Vertex attribute 10_10_10_2 is supported.
-
-///
-#define BGFX_CAPS_FORMAT_TEXTURE_NONE             UINT16_C(0x0000) //!< Texture format is not supported.
-#define BGFX_CAPS_FORMAT_TEXTURE_2D               UINT16_C(0x0001) //!< Texture format is supported.
-#define BGFX_CAPS_FORMAT_TEXTURE_2D_SRGB          UINT16_C(0x0002) //!< Texture as sRGB format is supported.
-#define BGFX_CAPS_FORMAT_TEXTURE_2D_EMULATED      UINT16_C(0x0004) //!< Texture format is emulated.
-#define BGFX_CAPS_FORMAT_TEXTURE_3D               UINT16_C(0x0008) //!< Texture format is supported.
-#define BGFX_CAPS_FORMAT_TEXTURE_3D_SRGB          UINT16_C(0x0010) //!< Texture as sRGB format is supported.
-#define BGFX_CAPS_FORMAT_TEXTURE_3D_EMULATED      UINT16_C(0x0020) //!< Texture format is emulated.
-#define BGFX_CAPS_FORMAT_TEXTURE_CUBE             UINT16_C(0x0040) //!< Texture format is supported.
-#define BGFX_CAPS_FORMAT_TEXTURE_CUBE_SRGB        UINT16_C(0x0080) //!< Texture as sRGB format is supported.
-#define BGFX_CAPS_FORMAT_TEXTURE_CUBE_EMULATED    UINT16_C(0x0100) //!< Texture format is emulated.
-#define BGFX_CAPS_FORMAT_TEXTURE_VERTEX           UINT16_C(0x0200) //!< Texture format can be used from vertex shader.
-#define BGFX_CAPS_FORMAT_TEXTURE_IMAGE            UINT16_C(0x0400) //!< Texture format can be used as image from compute shader.
-#define BGFX_CAPS_FORMAT_TEXTURE_FRAMEBUFFER      UINT16_C(0x0800) //!< Texture format can be used as frame buffer.
-#define BGFX_CAPS_FORMAT_TEXTURE_FRAMEBUFFER_MSAA UINT16_C(0x1000) //!< Texture format can be used as MSAA frame buffer.
-#define BGFX_CAPS_FORMAT_TEXTURE_MSAA             UINT16_C(0x2000) //!< Texture can be sampled as MSAA.
-#define BGFX_CAPS_FORMAT_TEXTURE_MIP_AUTOGEN      UINT16_C(0x4000) //!< Texture format supports auto-generated mips.
-
-///
-#define BGFX_VIEW_NONE   UINT8_C(0x00) //!<
-#define BGFX_VIEW_STEREO UINT8_C(0x01) //!< View will be rendered in stereo mode.
-
-///
-#define BGFX_SUBMIT_EYE_LEFT       UINT8_C(0x01) //!< Submit to left eye.
-#define BGFX_SUBMIT_EYE_RIGHT      UINT8_C(0x02) //!< Submit to right eye.
-#define BGFX_SUBMIT_EYE_MASK       UINT8_C(0x03) //!<
-#define BGFX_SUBMIT_EYE_FIRST      BGFX_SUBMIT_EYE_LEFT
-
-#define BGFX_SUBMIT_RESERVED_SHIFT 7             //!< Internal bits shift.
-#define BGFX_SUBMIT_RESERVED_MASK  UINT8_C(0x80) //!< Internal bits mask.
-
-///
-#define BGFX_PCI_ID_NONE                UINT16_C(0x0000) //!< Autoselect adapter.
-#define BGFX_PCI_ID_SOFTWARE_RASTERIZER UINT16_C(0x0001) //!< Software rasterizer.
-#define BGFX_PCI_ID_AMD                 UINT16_C(0x1002) //!< AMD adapter.
-#define BGFX_PCI_ID_INTEL               UINT16_C(0x8086) //!< Intel adapter.
-#define BGFX_PCI_ID_NVIDIA              UINT16_C(0x10de) //!< nVidia adapter.
-
-///
-#define BGFX_HMD_NONE              UINT8_C(0x00) //!< None.
-#define BGFX_HMD_DEVICE_RESOLUTION UINT8_C(0x01) //!< Has HMD native resolution.
-#define BGFX_HMD_RENDERING         UINT8_C(0x02) //!< Rendering to HMD.
-
-///
-#define BGFX_CUBE_MAP_POSITIVE_X UINT8_C(0x00) //!< Cubemap +x.
-#define BGFX_CUBE_MAP_NEGATIVE_X UINT8_C(0x01) //!< Cubemap -x.
-#define BGFX_CUBE_MAP_POSITIVE_Y UINT8_C(0x02) //!< Cubemap +y.
-#define BGFX_CUBE_MAP_NEGATIVE_Y UINT8_C(0x03) //!< Cubemap -y.
-#define BGFX_CUBE_MAP_POSITIVE_Z UINT8_C(0x04) //!< Cubemap +z.
-#define BGFX_CUBE_MAP_NEGATIVE_Z UINT8_C(0x05) //!< Cubemap -z.
-
-#endif // BGFX_DEFINES_H_HEADER_GUARD
diff --git a/3rdparty/bgfx/include/bgfx/bgfxplatform.h b/3rdparty/bgfx/include/bgfx/bgfxplatform.h
deleted file mode 100644
index 09962b6..0000000
--- a/3rdparty/bgfx/include/bgfx/bgfxplatform.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright 2011-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bgfx/blob/master/LICENSE
- */
-
-#ifndef BGFX_PLATFORM_H_HEADER_GUARD
-#define BGFX_PLATFORM_H_HEADER_GUARD
-
-// NOTICE:
-// This header file contains platform specific interfaces. It is only
-// necessary to use this header in conjunction with creating windows.
-
-#include <bx/platform.h>
-#include <bgfx/bgfx.h>
-
-namespace bgfx
-{
-	/// Render frame enum.
-	///
-	/// @attention C99 equivalent is `bgfx_render_frame_t`.
-	///
-	struct RenderFrame
-	{
-		enum Enum
-		{
-			NoContext,
-			Render,
-			Exiting,
-
-			Count
-		};
-	};
-
-	/// Render frame.
-	///
-	/// @returns Current renderer state. See: `bgfx::RenderFrame`.
-	///
-	/// @warning This call should be only used on platforms that don't
-	///   allow creating separate rendering thread. If it is called before
-	///   to bgfx::init, render thread won't be created by bgfx::init call.
-	RenderFrame::Enum renderFrame();
-
-	/// Platform data.
-	///
-	/// @attention C99 equivalent is `bgfx_platform_data_t`.
-	///
-	struct PlatformData
-	{
-		void* ndt;          //!< Native display type.
-		void* nwh;          //!< Native window handle.
-		void* context;      //!< GL context, or D3D device.
-		void* backBuffer;   //!< GL backbuffer, or D3D render target view.
-		void* backBufferDS; //!< Backbuffer depth/stencil.
-		void* session;      //!< ovrSession, for Oculus SDK
-	};
-
-	/// Set platform data.
-	///
-	/// @warning Must be called before `bgfx::init`.
-	///
-	/// @attention C99 equivalent is `bgfx_set_platform_data`.
-	///
-	void setPlatformData(const PlatformData& _data);
-
-	/// Internal data.
-	///
-	/// @attention C99 equivalent is `bgfx_internal_data_t`.
-	///
-	struct InternalData
-	{
-		const struct Caps* caps; //!< Renderer capabilities.
-		void* context;           //!< GL context, or D3D device.
-	};
-
-	/// Get internal data for interop.
-	///
-	/// @attention It's expected you understand some bgfx internals before you
-	///   use this call.
-	///
-	/// @warning Must be called only on render thread.
-	///
-	/// @attention C99 equivalent is `bgfx_get_internal_data`.
-	///
-	const InternalData* getInternalData();
-
-	/// Override internal texture with externally created texture. Previously
-	/// created internal texture will released.
-	///
-	/// @attention It's expected you understand some bgfx internals before you
-	///   use this call.
-	///
-	/// @param[in] _handle Texture handle.
-	/// @param[in] _ptr Native API pointer to texture.
-	///
-	/// @returns Native API pointer to texture. If result is 0, texture is not created yet from the
-	///   main thread.
-	///
-	/// @warning Must be called only on render thread.
-	///
-	/// @attention C99 equivalent is `bgfx_override_internal_texture_ptr`.
-	///
-	uintptr_t overrideInternal(TextureHandle _handle, uintptr_t _ptr);
-
-	/// Override internal texture by creating new texture. Previously created
-	/// internal texture will released.
-	///
-	/// @attention It's expected you understand some bgfx internals before you
-	///   use this call.
-	///
-	/// @param[in] _handle Texture handle.
-	/// @param[in] _width Width.
-	/// @param[in] _height Height.
-	/// @param[in] _numMips Number of mip-maps.
-	/// @param[in] _format Texture format. See: `TextureFormat::Enum`.
-	/// @param[in] _flags Default texture sampling mode is linear, and wrap mode
-	///   is repeat.
-	///   - `BGFX_TEXTURE_[U/V/W]_[MIRROR/CLAMP]` - Mirror or clamp to edge wrap
-	///     mode.
-	///   - `BGFX_TEXTURE_[MIN/MAG/MIP]_[POINT/ANISOTROPIC]` - Point or anisotropic
-	///     sampling.
-	///
-	/// @returns Native API pointer to texture. If result is 0, texture is not created yet from the
-	///   main thread.
-	///
-	/// @warning Must be called only on render thread.
-	///
-	/// @attention C99 equivalent is `bgfx_override_internal_texture`.
-	///
-	uintptr_t overrideInternal(TextureHandle _handle, uint16_t _width, uint16_t _height, uint8_t _numMips, TextureFormat::Enum _format, uint32_t _flags = BGFX_TEXTURE_NONE);
-
-} // namespace bgfx
-
-#if BX_PLATFORM_NACL
-#	include <ppapi/c/ppb_graphics_3d.h>
-#	include <ppapi/c/ppb_instance.h>
-
-namespace bgfx
-{
-	typedef void (*PostSwapBuffersFn)(uint32_t _width, uint32_t _height);
-
-	///
-	bool naclSetInterfaces(::PP_Instance, const ::PPB_Instance*, const ::PPB_Graphics3D*, PostSwapBuffersFn);
-
-} // namespace bgfx
-
-#endif // BX_PLATFORM_
-
-#endif // BGFX_PLATFORM_H_HEADER_GUARD
diff --git a/3rdparty/bgfx/include/bgfx/c99/bgfx.h b/3rdparty/bgfx/include/bgfx/c99/bgfx.h
index d329437..58e109e 100644
--- a/3rdparty/bgfx/include/bgfx/c99/bgfx.h
+++ b/3rdparty/bgfx/include/bgfx/c99/bgfx.h
@@ -39,7 +39,7 @@
 #   define BGFX_C_API BGFX_SHARED_LIB_API
 #endif // defined(__cplusplus)
 
-#include <bgfx/defines.h>
+#include "../defines.h"
 
 typedef enum bgfx_renderer_type
 {
@@ -548,12 +548,6 @@ BGFX_C_API uint32_t bgfx_topology_convert(bgfx_topology_convert_t _conversion, v
 /**/
 BGFX_C_API void bgfx_topology_sort_tri_list(bgfx_topology_sort_t _sort, void* _dst, uint32_t _dstSize, const float _dir[3], const float _pos[3], const void* _vertices, uint32_t _stride, const void* _indices, uint32_t _numIndices, bool _index32);
 
-/**/
-BGFX_C_API void bgfx_image_swizzle_bgra8(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-
-/**/
-BGFX_C_API void bgfx_image_rgba8_downsample_2x2(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-
 /**/
 BGFX_C_API uint8_t bgfx_get_supported_renderers(uint8_t _max, bgfx_renderer_type_t* _enum);
 
diff --git a/3rdparty/bgfx/include/bgfx/c99/bgfxplatform.h b/3rdparty/bgfx/include/bgfx/c99/bgfxplatform.h
deleted file mode 100644
index 56b4d8a..0000000
--- a/3rdparty/bgfx/include/bgfx/c99/bgfxplatform.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Copyright 2011-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bgfx/blob/master/LICENSE
- *
- * vim: set tabstop=4 expandtab:
- */
-
-#ifndef BGFX_PLATFORM_C99_H_HEADER_GUARD
-#define BGFX_PLATFORM_C99_H_HEADER_GUARD
-
-// NOTICE:
-// This header file contains platform specific interfaces. It is only
-// necessary to use this header in conjunction with creating windows.
-
-#include <bx/platform.h>
-#include <bgfx/c99/bgfx.h>
-
-typedef enum bgfx_render_frame
-{
-    BGFX_RENDER_FRAME_NO_CONTEXT,
-    BGFX_RENDER_FRAME_RENDER,
-    BGFX_RENDER_FRAME_EXITING,
-
-    BGFX_RENDER_FRAME_COUNT
-
-} bgfx_render_frame_t;
-
-/**
- * WARNING: This call should be only used on platforms that don't
- * allow creating separate rendering thread. If it is called before
- * to bgfx_init, render thread won't be created by bgfx_init call.
- */
-BGFX_C_API bgfx_render_frame_t bgfx_render_frame();
-
-typedef struct bgfx_platform_data
-{
-    void* ndt;
-    void* nwh;
-    void* context;
-    void* backBuffer;
-    void* backBufferDS;
-    void* session;
-
-} bgfx_platform_data_t;
-
-/**/
-BGFX_C_API void bgfx_set_platform_data(const bgfx_platform_data_t* _data);
-
-typedef struct bgfx_internal_data
-{
-    const struct bgfx_caps* caps;
-    void* context;
-
-} bgfx_internal_data_t;
-
-/**/
-BGFX_C_API const bgfx_internal_data_t* bgfx_get_internal_data();
-
-/**/
-BGFX_C_API uintptr_t bgfx_override_internal_texture_ptr(bgfx_texture_handle_t _handle, uintptr_t _ptr);
-
-/**/
-BGFX_C_API uintptr_t bgfx_override_internal_texture(bgfx_texture_handle_t _handle, uint16_t _width, uint16_t _height, uint8_t _numMips, bgfx_texture_format_t _format, uint32_t _flags);
-
-/**/
-typedef struct bgfx_interface_vtbl
-{
-    bgfx_render_frame_t (*render_frame)();
-    void (*set_platform_data)(const bgfx_platform_data_t* _data);
-    const bgfx_internal_data_t* (*get_internal_data)();
-    uintptr_t (*override_internal_texture_ptr)(bgfx_texture_handle_t _handle, uintptr_t _ptr);
-    uintptr_t (*override_internal_texture)(bgfx_texture_handle_t _handle, uint16_t _width, uint16_t _height, uint8_t _numMips, bgfx_texture_format_t _format, uint32_t _flags);
-    void (*vertex_decl_begin)(bgfx_vertex_decl_t* _decl, bgfx_renderer_type_t _renderer);
-    void (*vertex_decl_add)(bgfx_vertex_decl_t* _decl, bgfx_attrib_t _attrib, uint8_t _num, bgfx_attrib_type_t _type, bool _normalized, bool _asInt);
-    void (*vertex_decl_skip)(bgfx_vertex_decl_t* _decl, uint8_t _num);
-    void (*vertex_decl_end)(bgfx_vertex_decl_t* _decl);
-    void (*vertex_pack)(const float _input[4], bool _inputNormalized, bgfx_attrib_t _attr, const bgfx_vertex_decl_t* _decl, void* _data, uint32_t _index);
-    void (*vertex_unpack)(float _output[4], bgfx_attrib_t _attr, const bgfx_vertex_decl_t* _decl, const void* _data, uint32_t _index);
-    void (*vertex_convert)(const bgfx_vertex_decl_t* _destDecl, void* _destData, const bgfx_vertex_decl_t* _srcDecl, const void* _srcData, uint32_t _num);
-    uint16_t (*weld_vertices)(uint16_t* _output, const bgfx_vertex_decl_t* _decl, const void* _data, uint16_t _num, float _epsilon);
-    uint32_t (*topology_convert)(bgfx_topology_convert_t _conversion, void* _dst, uint32_t _dstSize, const void* _indices, uint32_t _numIndices, bool _index32);
-    void (*topology_sort_tri_list)(bgfx_topology_sort_t _sort, void* _dst, uint32_t _dstSize, const float _dir[3], const float _pos[3], const void* _vertices, uint32_t _stride, const void* _indices, uint32_t _numIndices, bool _index32);
-    void (*image_swizzle_bgra8)(uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src, void* _dst);
-    void (*image_rgba8_downsample_2x2)(uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src, void* _dst);
-    uint8_t (*get_supported_renderers)(uint8_t _max, bgfx_renderer_type_t* _enum);
-    const char* (*get_renderer_name)(bgfx_renderer_type_t _type);
-    bool (*init)(bgfx_renderer_type_t _type, uint16_t _vendorId, uint16_t _deviceId, bgfx_callback_interface_t* _callback, bgfx_allocator_interface_t* _allocator);
-    void (*shutdown)();
-    void (*reset)(uint32_t _width, uint32_t _height, uint32_t _flags);
-    uint32_t (*frame)(bool _capture);
-    bgfx_renderer_type_t (*get_renderer_type)();
-    const bgfx_caps_t* (*get_caps)();
-    const bgfx_hmd_t* (*get_hmd)();
-    const bgfx_stats_t* (*get_stats)();
-    const bgfx_memory_t* (*alloc)(uint32_t _size);
-    const bgfx_memory_t* (*copy)(const void* _data, uint32_t _size);
-    const bgfx_memory_t* (*make_ref)(const void* _data, uint32_t _size);
-    const bgfx_memory_t* (*make_ref_release)(const void* _data, uint32_t _size, bgfx_release_fn_t _releaseFn, void* _userData);
-    void (*set_debug)(uint32_t _debug);
-    void (*dbg_text_clear)(uint8_t _attr, bool _small);
-    void (*dbg_text_printf)(uint16_t _x, uint16_t _y, uint8_t _attr, const char* _format, ...);
-    void (*dbg_text_vprintf)(uint16_t _x, uint16_t _y, uint8_t _attr, const char* _format, va_list _argList);
-    void (*dbg_text_image)(uint16_t _x, uint16_t _y, uint16_t _width, uint16_t _height, const void* _data, uint16_t _pitch);
-    bgfx_index_buffer_handle_t (*create_index_buffer)(const bgfx_memory_t* _mem, uint16_t _flags);
-    void (*destroy_index_buffer)(bgfx_index_buffer_handle_t _handle);
-    bgfx_vertex_buffer_handle_t (*create_vertex_buffer)(const bgfx_memory_t* _mem, const bgfx_vertex_decl_t* _decl, uint16_t _flags);
-    void (*destroy_vertex_buffer)(bgfx_vertex_buffer_handle_t _handle);
-    bgfx_dynamic_index_buffer_handle_t (*create_dynamic_index_buffer)(uint32_t _num, uint16_t _flags);
-    bgfx_dynamic_index_buffer_handle_t (*create_dynamic_index_buffer_mem)(const bgfx_memory_t* _mem, uint16_t _flags);
-    void (*update_dynamic_index_buffer)(bgfx_dynamic_index_buffer_handle_t _handle, uint32_t _startIndex, const bgfx_memory_t* _mem);
-    void (*destroy_dynamic_index_buffer)(bgfx_dynamic_index_buffer_handle_t _handle);
-    bgfx_dynamic_vertex_buffer_handle_t (*create_dynamic_vertex_buffer)(uint32_t _num, const bgfx_vertex_decl_t* _decl, uint16_t _flags);
-    bgfx_dynamic_vertex_buffer_handle_t (*create_dynamic_vertex_buffer_mem)(const bgfx_memory_t* _mem, const bgfx_vertex_decl_t* _decl, uint16_t _flags);
-    void (*update_dynamic_vertex_buffer)(bgfx_dynamic_vertex_buffer_handle_t _handle, uint32_t _startVertex, const bgfx_memory_t* _mem);
-    void (*destroy_dynamic_vertex_buffer)(bgfx_dynamic_vertex_buffer_handle_t _handle);
-    bool (*check_avail_transient_index_buffer)(uint32_t _num);
-    bool (*check_avail_transient_vertex_buffer)(uint32_t _num, const bgfx_vertex_decl_t* _decl);
-    bool (*check_avail_instance_data_buffer)(uint32_t _num, uint16_t _stride);
-    bool (*check_avail_transient_buffers)(uint32_t _numVertices, const bgfx_vertex_decl_t* _decl, uint32_t _numIndices);
-    void (*alloc_transient_index_buffer)(bgfx_transient_index_buffer_t* _tib, uint32_t _num);
-    void (*alloc_transient_vertex_buffer)(bgfx_transient_vertex_buffer_t* _tvb, uint32_t _num, const bgfx_vertex_decl_t* _decl);
-    bool (*alloc_transient_buffers)(bgfx_transient_vertex_buffer_t* _tvb, const bgfx_vertex_decl_t* _decl, uint32_t _numVertices, bgfx_transient_index_buffer_t* _tib, uint32_t _numIndices);
-    const bgfx_instance_data_buffer_t* (*alloc_instance_data_buffer)(uint32_t _num, uint16_t _stride);
-    bgfx_indirect_buffer_handle_t (*create_indirect_buffer)(uint32_t _num);
-    void (*destroy_indirect_buffer)(bgfx_indirect_buffer_handle_t _handle);
-    bgfx_shader_handle_t (*create_shader)(const bgfx_memory_t* _mem);
-    uint16_t (*get_shader_uniforms)(bgfx_shader_handle_t _handle, bgfx_uniform_handle_t* _uniforms, uint16_t _max);
-    void (*destroy_shader)(bgfx_shader_handle_t _handle);
-    bgfx_program_handle_t (*create_program)(bgfx_shader_handle_t _vsh, bgfx_shader_handle_t _fsh, bool _destroyShaders);
-    bgfx_program_handle_t (*create_compute_program)(bgfx_shader_handle_t _csh, bool _destroyShaders);
-    void (*destroy_program)(bgfx_program_handle_t _handle);
-    void (*calc_texture_size)(bgfx_texture_info_t* _info, uint16_t _width, uint16_t _height, uint16_t _depth, bool _cubeMap, bool _hasMips, uint16_t _numLayers, bgfx_texture_format_t _format);
-    bgfx_texture_handle_t (*create_texture)(const bgfx_memory_t* _mem, uint32_t _flags, uint8_t _skip, bgfx_texture_info_t* _info);
-    bgfx_texture_handle_t (*create_texture_2d)(uint16_t _width, uint16_t _height, bool _hasMips, uint16_t _numLayers, bgfx_texture_format_t _format, uint32_t _flags, const bgfx_memory_t* _mem);
-    bgfx_texture_handle_t (*create_texture_2d_scaled)(bgfx_backbuffer_ratio_t _ratio, bool _hasMips, uint16_t _numLayers, bgfx_texture_format_t _format, uint32_t _flags);
-    bgfx_texture_handle_t (*create_texture_3d)(uint16_t _width, uint16_t _height, uint16_t _depth, bool _hasMips, bgfx_texture_format_t _format, uint32_t _flags, const bgfx_memory_t* _mem);
-    bgfx_texture_handle_t (*create_texture_cube)(uint16_t _size, bool _hasMips, uint16_t _numLayers, bgfx_texture_format_t _format, uint32_t _flags, const bgfx_memory_t* _mem);
-    void (*update_texture_2d)(bgfx_texture_handle_t _handle, uint16_t _layer, uint8_t _mip, uint16_t _x, uint16_t _y, uint16_t _width, uint16_t _height, const bgfx_memory_t* _mem, uint16_t _pitch);
-    void (*update_texture_3d)(bgfx_texture_handle_t _handle, uint8_t _mip, uint16_t _x, uint16_t _y, uint16_t _z, uint16_t _width, uint16_t _height, uint16_t _depth, const bgfx_memory_t* _mem);
-    void (*update_texture_cube)(bgfx_texture_handle_t _handle, uint16_t _layer, uint8_t _side, uint8_t _mip, uint16_t _x, uint16_t _y, uint16_t _width, uint16_t _height, const bgfx_memory_t* _mem, uint16_t _pitch);
-    uint32_t (*read_texture)(bgfx_texture_handle_t _handle, void* _data, uint8_t _mip);
-    uint32_t (*read_frame_buffer)(bgfx_frame_buffer_handle_t _handle, uint8_t _attachment, void* _data);
-    void (*destroy_texture)(bgfx_texture_handle_t _handle);
-    bgfx_frame_buffer_handle_t (*create_frame_buffer)(uint16_t _width, uint16_t _height, bgfx_texture_format_t _format, uint32_t _textureFlags);
-    bgfx_frame_buffer_handle_t (*create_frame_buffer_scaled)(bgfx_backbuffer_ratio_t _ratio, bgfx_texture_format_t _format, uint32_t _textureFlags);
-    bgfx_frame_buffer_handle_t (*create_frame_buffer_from_attachment)(uint8_t _num, const bgfx_attachment_t* _attachment, bool _destroyTextures);
-    bgfx_frame_buffer_handle_t (*create_frame_buffer_from_nwh)(void* _nwh, uint16_t _width, uint16_t _height, bgfx_texture_format_t _depthFormat);
-    void (*destroy_frame_buffer)(bgfx_frame_buffer_handle_t _handle);
-    bgfx_uniform_handle_t (*create_uniform)(const char* _name, bgfx_uniform_type_t _type, uint16_t _num);
-    void (*get_uniform_info)(bgfx_uniform_handle_t _handle, bgfx_uniform_info_t* _info);
-    void (*destroy_uniform)(bgfx_uniform_handle_t _handle);
-    bgfx_occlusion_query_handle_t (*create_occlusion_query)();
-    bgfx_occlusion_query_result_t (*get_result)(bgfx_occlusion_query_handle_t _handle);
-    void (*destroy_occlusion_query)(bgfx_occlusion_query_handle_t _handle);
-    void (*set_palette_color)(uint8_t _index, const float _rgba[4]);
-    void (*set_view_name)(uint8_t _id, const char* _name);
-    void (*set_view_rect)(uint8_t _id, uint16_t _x, uint16_t _y, uint16_t _width, uint16_t _height);
-    void (*set_view_scissor)(uint8_t _id, uint16_t _x, uint16_t _y, uint16_t _width, uint16_t _height);
-    void (*set_view_clear)(uint8_t _id, uint16_t _flags, uint32_t _rgba, float _depth, uint8_t _stencil);
-    void (*set_view_clear_mrt)(uint8_t _id, uint16_t _flags, float _depth, uint8_t _stencil, uint8_t _0, uint8_t _1, uint8_t _2, uint8_t _3, uint8_t _4, uint8_t _5, uint8_t _6, uint8_t _7);
-    void (*set_view_seq)(uint8_t _id, bool _enabled);
-    void (*set_view_frame_buffer)(uint8_t _id, bgfx_frame_buffer_handle_t _handle);
-    void (*set_view_transform)(uint8_t _id, const void* _view, const void* _proj);
-    void (*set_view_transform_stereo)(uint8_t _id, const void* _view, const void* _projL, uint8_t _flags, const void* _projR);
-    void (*set_view_remap)(uint8_t _id, uint8_t _num, const void* _remap);
-    void (*set_marker)(const char* _marker);
-    void (*set_state)(uint64_t _state, uint32_t _rgba);
-    void (*set_condition)(bgfx_occlusion_query_handle_t _handle, bool _visible);
-    void (*set_stencil)(uint32_t _fstencil, uint32_t _bstencil);
-    uint16_t (*set_scissor)(uint16_t _x, uint16_t _y, uint16_t _width, uint16_t _height);
-    void (*set_scissor_cached)(uint16_t _cache);
-    uint32_t (*set_transform)(const void* _mtx, uint16_t _num);
-    uint32_t (*alloc_transform)(bgfx_transform_t* _transform, uint16_t _num);
-    void (*set_transform_cached)(uint32_t _cache, uint16_t _num);
-    void (*set_uniform)(bgfx_uniform_handle_t _handle, const void* _value, uint16_t _num);
-    void (*set_index_buffer)(bgfx_index_buffer_handle_t _handle, uint32_t _firstIndex, uint32_t _numIndices);
-    void (*set_dynamic_index_buffer)(bgfx_dynamic_index_buffer_handle_t _handle, uint32_t _firstIndex, uint32_t _numIndices);
-    void (*set_transient_index_buffer)(const bgfx_transient_index_buffer_t* _tib, uint32_t _firstIndex, uint32_t _numIndices);
-    void (*set_vertex_buffer)(bgfx_vertex_buffer_handle_t _handle, uint32_t _startVertex, uint32_t _numVertices);
-    void (*set_dynamic_vertex_buffer)(bgfx_dynamic_vertex_buffer_handle_t _handle, uint32_t _startVertex, uint32_t _numVertices);
-    void (*set_transient_vertex_buffer)(const bgfx_transient_vertex_buffer_t* _tvb, uint32_t _startVertex, uint32_t _numVertices);
-    void (*set_instance_data_buffer)(const bgfx_instance_data_buffer_t* _idb, uint32_t _num);
-    void (*set_instance_data_from_vertex_buffer)(bgfx_vertex_buffer_handle_t _handle, uint32_t _startVertex, uint32_t _num);
-    void (*set_instance_data_from_dynamic_vertex_buffer)(bgfx_dynamic_vertex_buffer_handle_t _handle, uint32_t _startVertex, uint32_t _num);
-    void (*set_texture)(uint8_t _stage, bgfx_uniform_handle_t _sampler, bgfx_texture_handle_t _handle, uint32_t _flags);
-    void (*set_texture_from_frame_buffer)(uint8_t _stage, bgfx_uniform_handle_t _sampler, bgfx_frame_buffer_handle_t _handle, uint8_t _attachment, uint32_t _flags);
-    uint32_t (*touch)(uint8_t _id);
-    uint32_t (*submit)(uint8_t _id, bgfx_program_handle_t _handle, int32_t _depth, bool _preserveState);
-    uint32_t (*submit_occlusion_query)(uint8_t _id, bgfx_program_handle_t _program, bgfx_occlusion_query_handle_t _occlusionQuery, int32_t _depth, bool _preserveState);
-    uint32_t (*submit_indirect)(uint8_t _id, bgfx_program_handle_t _handle, bgfx_indirect_buffer_handle_t _indirectHandle, uint16_t _start, uint16_t _num, int32_t _depth, bool _preserveState);
-    void (*set_image)(uint8_t _stage, bgfx_uniform_handle_t _sampler, bgfx_texture_handle_t _handle, uint8_t _mip, bgfx_access_t _access, bgfx_texture_format_t _format);
-    void (*set_image_from_frame_buffer)(uint8_t _stage, bgfx_uniform_handle_t _sampler, bgfx_frame_buffer_handle_t _handle, uint8_t _attachment, bgfx_access_t _access, bgfx_texture_format_t _format);
-    void (*set_compute_index_buffer)(uint8_t _stage, bgfx_index_buffer_handle_t _handle, bgfx_access_t _access);
-    void (*set_compute_vertex_buffer)(uint8_t _stage, bgfx_vertex_buffer_handle_t _handle, bgfx_access_t _access);
-    void (*set_compute_dynamic_index_buffer)(uint8_t _stage, bgfx_dynamic_index_buffer_handle_t _handle, bgfx_access_t _access);
-    void (*set_compute_dynamic_vertex_buffer)(uint8_t _stage, bgfx_dynamic_vertex_buffer_handle_t _handle, bgfx_access_t _access);
-    void (*set_compute_indirect_buffer)(uint8_t _stage, bgfx_indirect_buffer_handle_t _handle, bgfx_access_t _access);
-    uint32_t (*dispatch)(uint8_t _id, bgfx_program_handle_t _handle, uint16_t _numX, uint16_t _numY, uint16_t _numZ, uint8_t _flags);
-    uint32_t (*dispatch_indirect)(uint8_t _id, bgfx_program_handle_t _handle, bgfx_indirect_buffer_handle_t _indirectHandle, uint16_t _start, uint16_t _num, uint8_t _flags);
-    void (*discard)();
-    void (*blit)(uint8_t _id, bgfx_texture_handle_t _dst, uint8_t _dstMip, uint16_t _dstX, uint16_t _dstY, uint16_t _dstZ, bgfx_texture_handle_t _src, uint8_t _srcMip, uint16_t _srcX, uint16_t _srcY, uint16_t _srcZ, uint16_t _width, uint16_t _height, uint16_t _depth);
-    void (*save_screen_shot)(const char* _filePath);
-
-} bgfx_interface_vtbl_t;
-
-typedef bgfx_interface_vtbl_t* (*PFN_BGFX_GET_INTERFACE)(uint32_t _version);
-
-#endif // BGFX_PLATFORM_C99_H_HEADER_GUARD
diff --git a/3rdparty/bgfx/include/bgfx/c99/platform.h b/3rdparty/bgfx/include/bgfx/c99/platform.h
index 78b7791..e734fdf 100644
--- a/3rdparty/bgfx/include/bgfx/c99/platform.h
+++ b/3rdparty/bgfx/include/bgfx/c99/platform.h
@@ -13,7 +13,7 @@
 // necessary to use this header in conjunction with creating windows.
 
 #include <bx/platform.h>
-#include <bgfx/c99/bgfx.h>
+#include "bgfx.h"
 
 typedef enum bgfx_render_frame
 {
@@ -81,8 +81,6 @@ typedef struct bgfx_interface_vtbl
     uint16_t (*weld_vertices)(uint16_t* _output, const bgfx_vertex_decl_t* _decl, const void* _data, uint16_t _num, float _epsilon);
     uint32_t (*topology_convert)(bgfx_topology_convert_t _conversion, void* _dst, uint32_t _dstSize, const void* _indices, uint32_t _numIndices, bool _index32);
     void (*topology_sort_tri_list)(bgfx_topology_sort_t _sort, void* _dst, uint32_t _dstSize, const float _dir[3], const float _pos[3], const void* _vertices, uint32_t _stride, const void* _indices, uint32_t _numIndices, bool _index32);
-    void (*image_swizzle_bgra8)(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-    void (*image_rgba8_downsample_2x2)(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
     uint8_t (*get_supported_renderers)(uint8_t _max, bgfx_renderer_type_t* _enum);
     const char* (*get_renderer_name)(bgfx_renderer_type_t _type);
     bool (*init)(bgfx_renderer_type_t _type, uint16_t _vendorId, uint16_t _deviceId, bgfx_callback_interface_t* _callback, bgfx_allocator_interface_t* _allocator);
diff --git a/3rdparty/bgfx/include/bgfx/defines.h b/3rdparty/bgfx/include/bgfx/defines.h
index 98b9ac8..ae459cf 100644
--- a/3rdparty/bgfx/include/bgfx/defines.h
+++ b/3rdparty/bgfx/include/bgfx/defines.h
@@ -6,7 +6,7 @@
 #ifndef BGFX_DEFINES_H_HEADER_GUARD
 #define BGFX_DEFINES_H_HEADER_GUARD
 
-#define BGFX_API_VERSION UINT32_C(39)
+#define BGFX_API_VERSION UINT32_C(40)
 
 ///
 #define BGFX_STATE_RGB_WRITE               UINT64_C(0x0000000000000001) //!< Enable RGB write.
diff --git a/3rdparty/bgfx/include/bgfx/platform.h b/3rdparty/bgfx/include/bgfx/platform.h
index 1c9b3c7..7ace99c 100644
--- a/3rdparty/bgfx/include/bgfx/platform.h
+++ b/3rdparty/bgfx/include/bgfx/platform.h
@@ -11,7 +11,7 @@
 // necessary to use this header in conjunction with creating windows.
 
 #include <bx/platform.h>
-#include <bgfx/bgfx.h>
+#include "bgfx.h"
 
 namespace bgfx
 {
diff --git a/3rdparty/bgfx/scripts/bgfx.lua b/3rdparty/bgfx/scripts/bgfx.lua
index f95e99f..3c529d8 100644
--- a/3rdparty/bgfx/scripts/bgfx.lua
+++ b/3rdparty/bgfx/scripts/bgfx.lua
@@ -47,6 +47,7 @@ function bgfxProject(_name, _kind, _defines)
 			}
 
 			links {
+				"bimg",
 				"bx",
 			}
 
@@ -73,6 +74,7 @@ function bgfxProject(_name, _kind, _defines)
 			path.join(BGFX_DIR, "3rdparty"),
 			path.join(BGFX_DIR, "3rdparty/dxsdk/include"),
 			path.join(BX_DIR,   "include"),
+			path.join(BIMG_DIR, "include"),
 		}
 
 		defines {
diff --git a/3rdparty/bgfx/scripts/build.ninja b/3rdparty/bgfx/scripts/build.ninja
index 54b099e..f36701e 100644
--- a/3rdparty/bgfx/scripts/build.ninja
+++ b/3rdparty/bgfx/scripts/build.ninja
@@ -29,6 +29,15 @@ rule texturec_etc1
 rule texturec_etc2
     command = texturec -f $in -o $out -t etc2 -m
 
+rule texturec_diffuse
+    command = texturec -f $in -o $out -t bc2 -m
+
+rule texturec_normal
+    command = texturec -f $in -o $out -t bc5 -m -n
+
+rule texturec_height
+    command = texturec -f $in -o $out -t r8
+
 pwd = ../examples/assets/meshes
 subninja ../examples/assets/meshes/meshes.ninja
 
diff --git a/3rdparty/bgfx/scripts/example-common.lua b/3rdparty/bgfx/scripts/example-common.lua
index f87849f..37e0f41 100644
--- a/3rdparty/bgfx/scripts/example-common.lua
+++ b/3rdparty/bgfx/scripts/example-common.lua
@@ -8,7 +8,8 @@ project ("example-common")
 	kind "StaticLib"
 
 	includedirs {
-		path.join(BX_DIR, "include"),
+		path.join(BX_DIR,   "include"),
+		path.join(BIMG_DIR, "include"),
 		path.join(BGFX_DIR, "include"),
 		path.join(BGFX_DIR, "3rdparty"),
 	}
diff --git a/3rdparty/bgfx/scripts/genie.lua b/3rdparty/bgfx/scripts/genie.lua
index a01d7fe..84c50db 100644
--- a/3rdparty/bgfx/scripts/genie.lua
+++ b/3rdparty/bgfx/scripts/genie.lua
@@ -72,6 +72,7 @@ solution "bgfx"
 
 MODULE_DIR = path.getabsolute("../")
 BGFX_DIR   = path.getabsolute("..")
+BIMG_DIR   = path.getabsolute(path.join(BGFX_DIR, "../bimg"))
 BX_DIR     = os.getenv("BX_DIR")
 
 local BGFX_BUILD_DIR = path.join(BGFX_DIR, ".build")
@@ -122,6 +123,7 @@ function exampleProject(_name)
 
 	includedirs {
 		path.join(BX_DIR,   "include"),
+		path.join(BIMG_DIR, "include"),
 		path.join(BGFX_DIR, "include"),
 		path.join(BGFX_DIR, "3rdparty"),
 		path.join(BGFX_DIR, "examples/common"),
@@ -144,6 +146,8 @@ function exampleProject(_name)
 	links {
 		"example-common",
 		"bgfx",
+		"bimg_decode",
+		"bimg",
 		"bx",
 	}
 
@@ -364,7 +368,10 @@ dofile "bgfx.lua"
 group "libs"
 bgfxProject("", "StaticLib", {})
 
-dofile(path.join(BX_DIR, "scripts/bx.lua"))
+dofile(path.join(BX_DIR,   "scripts/bx.lua"))
+dofile(path.join(BIMG_DIR, "scripts/bimg.lua"))
+dofile(path.join(BIMG_DIR, "scripts/bimg_decode.lua"))
+dofile(path.join(BIMG_DIR, "scripts/bimg_encode.lua"))
 
 if _OPTIONS["with-examples"] or _OPTIONS["with-tools"] then
 	group "examples"
@@ -405,6 +412,7 @@ if _OPTIONS["with-examples"] then
 	exampleProject("30-picking")
 	exampleProject("31-rsm")
 	exampleProject("32-particles")
+	exampleProject("33-pom")
 
 	-- C99 source doesn't compile under WinRT settings
 	if not premake.vstudio.iswinrt() then
diff --git a/3rdparty/bgfx/scripts/shader.mk b/3rdparty/bgfx/scripts/shader.mk
index 897b280..4a695d0 100644
--- a/3rdparty/bgfx/scripts/shader.mk
+++ b/3rdparty/bgfx/scripts/shader.mk
@@ -154,6 +154,10 @@ all: dirs $(BIN)
 clean:
 	@echo Cleaning...
 	@-rm -vf $(BIN)
+
+.PHONY: cleanall
+cleanall:
+	@echo Cleaning...
 	@-$(call CMD_RMDIR,$(BUILD_INTERMEDIATE_DIR))
 
 .PHONY: dirs
diff --git a/3rdparty/bgfx/scripts/shaderc.lua b/3rdparty/bgfx/scripts/shaderc.lua
index 300c4c6..18d9988 100644
--- a/3rdparty/bgfx/scripts/shaderc.lua
+++ b/3rdparty/bgfx/scripts/shaderc.lua
@@ -219,7 +219,8 @@ project "shaderc"
 	kind "ConsoleApp"
 
 	includedirs {
-		path.join(BX_DIR, "include"),
+		path.join(BX_DIR,   "include"),
+		path.join(BIMG_DIR, "include"),
 		path.join(BGFX_DIR, "include"),
 
 		path.join(BGFX_DIR, "3rdparty/dxsdk/include"),
diff --git a/3rdparty/bgfx/scripts/texturec.lua b/3rdparty/bgfx/scripts/texturec.lua
index 07184ed..9cff76e 100644
--- a/3rdparty/bgfx/scripts/texturec.lua
+++ b/3rdparty/bgfx/scripts/texturec.lua
@@ -8,36 +8,20 @@ project "texturec"
 	kind "ConsoleApp"
 
 	includedirs {
-		path.join(BX_DIR, "include"),
+		path.join(BX_DIR,   "include"),
+		path.join(BIMG_DIR, "include"),
 		path.join(BGFX_DIR, "include"),
-		path.join(BGFX_DIR, "src"),
-		path.join(BGFX_DIR, "3rdparty"),
-		path.join(BGFX_DIR, "3rdparty/nvtt"),
-		path.join(BGFX_DIR, "3rdparty/iqa/include"),
+		path.join(BIMG_DIR, "3rdparty/iqa/include"),
 	}
 
 	files {
-		path.join(BGFX_DIR, "src/image.*"),
-		path.join(BGFX_DIR, "3rdparty/libsquish/**.cpp"),
-		path.join(BGFX_DIR, "3rdparty/libsquish/**.h"),
-		path.join(BGFX_DIR, "3rdparty/edtaa3/**.cpp"),
-		path.join(BGFX_DIR, "3rdparty/edtaa3/**.h"),
-		path.join(BGFX_DIR, "3rdparty/etc1/**.cpp"),
-		path.join(BGFX_DIR, "3rdparty/etc1/**.h"),
-		path.join(BGFX_DIR, "3rdparty/etc2/**.cpp"),
-		path.join(BGFX_DIR, "3rdparty/etc2/**.hpp"),
-		path.join(BGFX_DIR, "3rdparty/nvtt/**.cpp"),
-		path.join(BGFX_DIR, "3rdparty/nvtt/**.h"),
-		path.join(BGFX_DIR, "3rdparty/pvrtc/**.cpp"),
-		path.join(BGFX_DIR, "3rdparty/pvrtc/**.h"),
-		path.join(BGFX_DIR, "3rdparty/tinyexr/**.h"),
-		path.join(BGFX_DIR, "3rdparty/iqa/include/**.h"),
-		path.join(BGFX_DIR, "3rdparty/iqa/source/**.c"),
-		path.join(BGFX_DIR, "tools/texturec/**.cpp"),
-		path.join(BGFX_DIR, "tools/texturec/**.h"),
+		path.join(BGFX_DIR, "tools/texturec/texturec.cpp"),
 	}
 
 	links {
+		"bimg_decode",
+		"bimg_encode",
+		"bimg",
 		"bx",
 	}
 
diff --git a/3rdparty/bgfx/scripts/texturev.lua b/3rdparty/bgfx/scripts/texturev.lua
index fe08f59..4443391 100644
--- a/3rdparty/bgfx/scripts/texturev.lua
+++ b/3rdparty/bgfx/scripts/texturev.lua
@@ -6,6 +6,7 @@ project ("texturev")
 
 	includedirs {
 		path.join(BX_DIR,   "include"),
+		path.join(BIMG_DIR, "include"),
 		path.join(BGFX_DIR, "include"),
 		path.join(BGFX_DIR, "3rdparty"),
 		path.join(BGFX_DIR, "examples/common"),
@@ -19,6 +20,8 @@ project ("texturev")
 
 	links {
 		"example-common",
+		"bimg_decode",
+		"bimg",
 		"bgfx",
 		"bx",
 	}
diff --git a/3rdparty/bgfx/src/amalgamated.cpp b/3rdparty/bgfx/src/amalgamated.cpp
index 860f448..0ea731d 100644
--- a/3rdparty/bgfx/src/amalgamated.cpp
+++ b/3rdparty/bgfx/src/amalgamated.cpp
@@ -8,7 +8,6 @@
 #include "glcontext_glx.cpp"
 #include "glcontext_ppapi.cpp"
 #include "glcontext_wgl.cpp"
-#include "image.cpp"
 #include "hmd.cpp"
 #include "hmd_ovr.cpp"
 #include "hmd_openvr.cpp"
diff --git a/3rdparty/bgfx/src/bgfx.cpp b/3rdparty/bgfx/src/bgfx.cpp
index 5886e84..0a77f8d 100644
--- a/3rdparty/bgfx/src/bgfx.cpp
+++ b/3rdparty/bgfx/src/bgfx.cpp
@@ -126,7 +126,7 @@ namespace bgfx
 			bx::CrtFileWriter writer;
 			if (bx::open(&writer, filePath) )
 			{
-				imageWriteTga(&writer, _width, _height, _pitch, _data, false, _yflip);
+				bimg::imageWriteTga(&writer, _width, _height, _pitch, _data, false, _yflip);
 				bx::close(&writer);
 			}
 #endif // BX_CONFIG_CRT_FILE_READER_WRITER
@@ -910,11 +910,13 @@ namespace bgfx
 		}
 
 		m_renderItem[m_numRenderItems].draw = m_draw;
+		m_renderItemBind[m_numRenderItems]  = m_bind;
 		++m_numRenderItems;
 
 		if (!_preserveState)
 		{
 			m_draw.clear();
+			m_bind.clear();
 			m_uniformBegin = m_uniformEnd;
 			m_stateFlags = BGFX_STATE_NONE;
 		}
@@ -959,9 +961,11 @@ namespace bgfx
 		m_compute.m_constBegin = m_uniformBegin;
 		m_compute.m_constEnd   = m_uniformEnd;
 		m_renderItem[m_numRenderItems].compute = m_compute;
+		m_renderItemBind[m_numRenderItems]     = m_bind;
 		++m_numRenderItems;
 
 		m_compute.clear();
+		m_bind.clear();
 		m_uniformBegin = m_uniformEnd;
 
 		return m_num;
@@ -1246,7 +1250,7 @@ namespace bgfx
 		BX_TRACE("");
 	}
 
-	TextureFormat::Enum getViableTextureFormat(const ImageContainer& _imageContainer)
+	TextureFormat::Enum getViableTextureFormat(const bimg::ImageContainer& _imageContainer)
 	{
 		const uint32_t formatCaps = g_caps.formats[_imageContainer.m_format];
 		bool convert = 0 == formatCaps;
@@ -1275,7 +1279,12 @@ namespace bgfx
 			return TextureFormat::BGRA8;
 		}
 
-		return _imageContainer.m_format;
+		return TextureFormat::Enum(_imageContainer.m_format);
+	}
+
+	const char* getName(TextureFormat::Enum _fmt) // Format name as reported by bimg for the same enumerator.
+	{
+		return bimg::getName(bimg::TextureFormat::Enum(_fmt)); // value is cast as-is to the bimg enum
+	}
 
 	static TextureFormat::Enum s_emulatedFormats[] =
@@ -1389,7 +1398,7 @@ namespace bgfx
 
 		for (uint32_t ii = 0; ii < TextureFormat::UnknownDepth; ++ii)
 		{
-			bool convertable = imageConvert(TextureFormat::BGRA8, TextureFormat::Enum(ii) );
+			bool convertable = bimg::imageConvert(bimg::TextureFormat::BGRA8, bimg::TextureFormat::Enum(ii) );
 			g_caps.formats[ii] |= 0 == (g_caps.formats[ii] & BGFX_CAPS_FORMAT_TEXTURE_2D  ) && convertable ? BGFX_CAPS_FORMAT_TEXTURE_2D_EMULATED   : 0;
 			g_caps.formats[ii] |= 0 == (g_caps.formats[ii] & BGFX_CAPS_FORMAT_TEXTURE_3D  ) && convertable ? BGFX_CAPS_FORMAT_TEXTURE_3D_EMULATED   : 0;
 			g_caps.formats[ii] |= 0 == (g_caps.formats[ii] & BGFX_CAPS_FORMAT_TEXTURE_CUBE) && convertable ? BGFX_CAPS_FORMAT_TEXTURE_CUBE_EMULATED : 0;
@@ -1650,16 +1659,35 @@ namespace bgfx
 		return m_uniformRef[_handle.idx].m_name.getPtr();
 	}
 
-	RenderFrame::Enum Context::renderFrame(int32_t _msecs)
-	{
-		BGFX_PROFILER_SCOPE(bgfx, render_frame, 0xff2040ff);
+	RendererContextI* rendererCreate(RendererType::Enum _type);
+	void rendererDestroy(RendererContextI* _renderCtx);
 
+	void Context::flip() // Calls m_renderCtx->flip() unless no renderer is initialized or m_flipped is already set.
+	{
 		if (m_rendererInitialized
-		&& !m_flipAfterRender
 		&& !m_flipped)
 		{
 			m_renderCtx->flip(m_render->m_hmd);
 			m_flipped = true;
+
+			if (m_renderCtx->isDeviceRemoved() ) // checked right after the flip call
+			{
+				// Something went horribly wrong; fall back to the noop renderer.
+				rendererDestroy(m_renderCtx);
+
+				m_renderCtx = rendererCreate(RendererType::Noop); // replace the destroyed context
+				g_caps.rendererType = RendererType::Noop; // keep the reported caps in step with the replacement
+			}
+		}
+	}
+
+	RenderFrame::Enum Context::renderFrame(int32_t _msecs)
+	{
+		BGFX_PROFILER_SCOPE(bgfx, render_frame, 0xff2040ff);
+
+		if (!m_flipAfterRender)
+		{
+			flip();
 		}
 
 		if (apiSemWait(_msecs) )
@@ -1675,11 +1703,9 @@ namespace bgfx
 
 			renderSemPost();
 
-			if (m_rendererInitialized
-			&&  m_flipAfterRender)
+			if (m_flipAfterRender)
 			{
-				m_renderCtx->flip(m_render->m_hmd);
-				m_flipped = true;
+				flip();
 			}
 		}
 		else
@@ -1825,7 +1851,7 @@ namespace bgfx
 
 	static RendererCreator s_rendererCreator[] =
 	{
-		{ noop::rendererCreate,  noop::rendererDestroy,  BGFX_RENDERER_NOOP_NAME,       !!BGFX_CONFIG_RENDERER_NOOP       }, // Noop
+		{ noop::rendererCreate,  noop::rendererDestroy,  BGFX_RENDERER_NOOP_NAME,       true                              }, // Noop
 		{ d3d9::rendererCreate,  d3d9::rendererDestroy,  BGFX_RENDERER_DIRECT3D9_NAME,  !!BGFX_CONFIG_RENDERER_DIRECT3D9  }, // Direct3D9
 		{ d3d11::rendererCreate, d3d11::rendererDestroy, BGFX_RENDERER_DIRECT3D11_NAME, !!BGFX_CONFIG_RENDERER_DIRECT3D11 }, // Direct3D11
 		{ d3d12::rendererCreate, d3d12::rendererDestroy, BGFX_RENDERER_DIRECT3D12_NAME, !!BGFX_CONFIG_RENDERER_DIRECT3D12 }, // Direct3D12
@@ -1841,8 +1867,6 @@ namespace bgfx
 	};
 	BX_STATIC_ASSERT(BX_COUNTOF(s_rendererCreator) == RendererType::Count);
 
-	static RendererDestroyFn s_rendererDestroyFn;
-
 	struct Condition
 	{
 		enum Enum
@@ -1965,7 +1989,6 @@ namespace bgfx
 			renderCtx = s_rendererCreator[renderer].createFn();
 			if (NULL != renderCtx)
 			{
-				s_rendererDestroyFn = s_rendererCreator[renderer].destroyFn;
 				break;
 			}
 
@@ -1975,9 +1998,12 @@ namespace bgfx
 		return renderCtx;
 	}
 
-	void rendererDestroy()
+	void rendererDestroy(RendererContextI* _renderCtx) // Runs the destroyFn registered for _renderCtx's renderer type; NULL is ignored.
 	{
-		s_rendererDestroyFn();
+		if (NULL != _renderCtx)
+		{
+			s_rendererCreator[_renderCtx->getRendererType()].destroyFn(); // table is indexed by renderer type
+		}
 	}
 
 	void Context::rendererExecCommands(CommandBuffer& _cmdbuf)
@@ -2012,6 +2038,7 @@ namespace bgfx
 					_cmdbuf.read(type);
 
 					m_renderCtx = rendererCreate(type);
+
 					m_rendererInitialized = NULL != m_renderCtx;
 
 					if (!m_rendererInitialized)
@@ -2044,8 +2071,10 @@ namespace bgfx
 			case CommandBuffer::RendererShutdownEnd:
 				{
 					BX_CHECK(!m_rendererInitialized && !m_exit, "This shouldn't happen! Bad synchronization?");
-					rendererDestroy();
+
+					rendererDestroy(m_renderCtx);
 					m_renderCtx = NULL;
+
 					m_exit = true;
 				}
 				// fall through
@@ -2345,7 +2374,7 @@ namespace bgfx
 					uint8_t mip;
 					_cmdbuf.read(mip);
 
-					m_renderCtx->readTexture(handle, data,mip);
+					m_renderCtx->readTexture(handle, data, mip);
 				}
 				break;
 
@@ -3136,7 +3165,7 @@ error:
 
 	void calcTextureSize(TextureInfo& _info, uint16_t _width, uint16_t _height, uint16_t _depth, bool _cubeMap, bool _hasMips, uint16_t _numLayers, TextureFormat::Enum _format)
 	{
-		imageGetSize(&_info, _width, _height, _depth, _cubeMap, _hasMips, _numLayers, _format);
+		bimg::imageGetSize( (bimg::TextureInfo*)&_info, _width, _height, _depth, _cubeMap, _hasMips, _numLayers, bimg::TextureFormat::Enum(_format) ); // NOTE(review): the pointer cast assumes bgfx::TextureInfo and bimg::TextureInfo share a layout - confirm
 	}
 
 	TextureHandle createTexture(const Memory* _mem, uint32_t _flags, uint8_t _skip, TextureInfo* _info)
@@ -3867,6 +3896,89 @@ error:
 	}
 } // namespace bgfx
 
+#define BGFX_TEXTURE_FORMAT_BIMG(_fmt) /* compile-time check: the bgfx and bimg enumerators named _fmt have the same value */ \
+			BX_STATIC_ASSERT(uint32_t(bgfx::TextureFormat::_fmt) == uint32_t(bimg::TextureFormat::_fmt) )
+
+BGFX_TEXTURE_FORMAT_BIMG(BC1);
+BGFX_TEXTURE_FORMAT_BIMG(BC2);
+BGFX_TEXTURE_FORMAT_BIMG(BC3);
+BGFX_TEXTURE_FORMAT_BIMG(BC4);
+BGFX_TEXTURE_FORMAT_BIMG(BC5);
+BGFX_TEXTURE_FORMAT_BIMG(BC6H);
+BGFX_TEXTURE_FORMAT_BIMG(BC7);
+BGFX_TEXTURE_FORMAT_BIMG(ETC1);
+BGFX_TEXTURE_FORMAT_BIMG(ETC2);
+BGFX_TEXTURE_FORMAT_BIMG(ETC2A);
+BGFX_TEXTURE_FORMAT_BIMG(ETC2A1);
+BGFX_TEXTURE_FORMAT_BIMG(PTC12);
+BGFX_TEXTURE_FORMAT_BIMG(PTC14);
+BGFX_TEXTURE_FORMAT_BIMG(PTC12A);
+BGFX_TEXTURE_FORMAT_BIMG(PTC14A);
+BGFX_TEXTURE_FORMAT_BIMG(PTC22);
+BGFX_TEXTURE_FORMAT_BIMG(PTC24);
+BGFX_TEXTURE_FORMAT_BIMG(Unknown);
+BGFX_TEXTURE_FORMAT_BIMG(R1);
+BGFX_TEXTURE_FORMAT_BIMG(A8);
+BGFX_TEXTURE_FORMAT_BIMG(R8);
+BGFX_TEXTURE_FORMAT_BIMG(R8I);
+BGFX_TEXTURE_FORMAT_BIMG(R8U);
+BGFX_TEXTURE_FORMAT_BIMG(R8S);
+BGFX_TEXTURE_FORMAT_BIMG(R16);
+BGFX_TEXTURE_FORMAT_BIMG(R16I);
+BGFX_TEXTURE_FORMAT_BIMG(R16U);
+BGFX_TEXTURE_FORMAT_BIMG(R16F);
+BGFX_TEXTURE_FORMAT_BIMG(R16S);
+BGFX_TEXTURE_FORMAT_BIMG(R32I);
+BGFX_TEXTURE_FORMAT_BIMG(R32U);
+BGFX_TEXTURE_FORMAT_BIMG(R32F);
+BGFX_TEXTURE_FORMAT_BIMG(RG8);
+BGFX_TEXTURE_FORMAT_BIMG(RG8I);
+BGFX_TEXTURE_FORMAT_BIMG(RG8U);
+BGFX_TEXTURE_FORMAT_BIMG(RG8S);
+BGFX_TEXTURE_FORMAT_BIMG(RG16);
+BGFX_TEXTURE_FORMAT_BIMG(RG16I);
+BGFX_TEXTURE_FORMAT_BIMG(RG16U);
+BGFX_TEXTURE_FORMAT_BIMG(RG16F);
+BGFX_TEXTURE_FORMAT_BIMG(RG16S);
+BGFX_TEXTURE_FORMAT_BIMG(RG32I);
+BGFX_TEXTURE_FORMAT_BIMG(RG32U);
+BGFX_TEXTURE_FORMAT_BIMG(RG32F);
+BGFX_TEXTURE_FORMAT_BIMG(RGB8);
+BGFX_TEXTURE_FORMAT_BIMG(RGB8I);
+BGFX_TEXTURE_FORMAT_BIMG(RGB8U);
+BGFX_TEXTURE_FORMAT_BIMG(RGB8S);
+BGFX_TEXTURE_FORMAT_BIMG(RGB9E5F);
+BGFX_TEXTURE_FORMAT_BIMG(BGRA8);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA8);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA8I);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA8U);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA8S);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA16);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA16I);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA16U);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA16F);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA16S);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA32I);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA32U);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA32F);
+BGFX_TEXTURE_FORMAT_BIMG(R5G6B5);
+BGFX_TEXTURE_FORMAT_BIMG(RGBA4);
+BGFX_TEXTURE_FORMAT_BIMG(RGB5A1);
+BGFX_TEXTURE_FORMAT_BIMG(RGB10A2);
+BGFX_TEXTURE_FORMAT_BIMG(R11G11B10F);
+BGFX_TEXTURE_FORMAT_BIMG(UnknownDepth);
+BGFX_TEXTURE_FORMAT_BIMG(D16);
+BGFX_TEXTURE_FORMAT_BIMG(D24);
+BGFX_TEXTURE_FORMAT_BIMG(D24S8);
+BGFX_TEXTURE_FORMAT_BIMG(D32);
+BGFX_TEXTURE_FORMAT_BIMG(D16F);
+BGFX_TEXTURE_FORMAT_BIMG(D24F);
+BGFX_TEXTURE_FORMAT_BIMG(D32F);
+BGFX_TEXTURE_FORMAT_BIMG(D0S8);
+BGFX_TEXTURE_FORMAT_BIMG(Count);
+
+#undef BGFX_TEXTURE_FORMAT_BIMG
+
 #include <bgfx/c99/bgfx.h>
 #include <bgfx/c99/platform.h>
 
@@ -4070,16 +4182,6 @@ void bgfx_topology_sort_tri_list(bgfx_topology_sort_t _sort, void* _dst, uint32_
 	bgfx::topologySortTriList(bgfx::TopologySort::Enum(_sort), _dst, _dstSize, _dir, _pos, _vertices, _stride, _indices, _numIndices, _index32);
 }
 
-BGFX_C_API void bgfx_image_swizzle_bgra8(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src)
-{
-	bgfx::imageSwizzleBgra8(_dst, _width, _height, _pitch, _src);
-}
-
-BGFX_C_API void bgfx_image_rgba8_downsample_2x2(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src)
-{
-	bgfx::imageRgba8Downsample2x2(_dst, _width, _height, _pitch, _src);
-}
-
 BGFX_C_API uint8_t bgfx_get_supported_renderers(uint8_t _max, bgfx_renderer_type_t* _enum)
 {
 	return bgfx::getSupportedRenderers(_max, (bgfx::RendererType::Enum*)_enum);
@@ -4844,8 +4946,6 @@ BGFX_C_API bgfx_interface_vtbl_t* bgfx_get_interface(uint32_t _version)
 	BGFX_IMPORT_FUNC(weld_vertices) \
 	BGFX_IMPORT_FUNC(topology_convert) \
 	BGFX_IMPORT_FUNC(topology_sort_tri_list) \
-	BGFX_IMPORT_FUNC(image_swizzle_bgra8) \
-	BGFX_IMPORT_FUNC(image_rgba8_downsample_2x2) \
 	BGFX_IMPORT_FUNC(get_supported_renderers) \
 	BGFX_IMPORT_FUNC(get_renderer_name) \
 	BGFX_IMPORT_FUNC(init) \
diff --git a/3rdparty/bgfx/src/bgfx_compute.sh b/3rdparty/bgfx/src/bgfx_compute.sh
index 641dda8..a58ae1d 100644
--- a/3rdparty/bgfx/src/bgfx_compute.sh
+++ b/3rdparty/bgfx/src/bgfx_compute.sh
@@ -25,22 +25,22 @@
 #define UIMAGE2D_RO(_name, _format, _reg) Texture2D<_format>   _name : register(t[_reg])
 #define IMAGE2D_WR( _name, _format, _reg) RWTexture2D<_format> _name : register(u[_reg])
 #define UIMAGE2D_WR(_name, _format, _reg) RWTexture2D<_format> _name : register(u[_reg])
-#define IMAGE2D_RW( _name, _reg) RWTexture2D<float> _name : register(u[_reg])
-#define UIMAGE2D_RW(_name, _reg) RWTexture2D<uint>  _name : register(u[_reg])
+#define IMAGE2D_RW( _name,          _reg) RWTexture2D<float>   _name : register(u[_reg])
+#define UIMAGE2D_RW(_name,          _reg) RWTexture2D<uint>    _name : register(u[_reg])
 
 #define IMAGE2D_ARRAY_RO( _name, _format, _reg) Texture2DArray<_format>   _name : register(t[_reg])
 #define UIMAGE2D_ARRAY_RO(_name, _format, _reg) Texture2DArray<_format>   _name : register(t[_reg])
 #define IMAGE2D_ARRAY_WR( _name, _format, _reg) RWTexture2DArray<_format> _name : register(u[_reg])
 #define UIMAGE2D_ARRAY_WR(_name, _format, _reg) RWTexture2DArray<_format> _name : register(u[_reg])
-#define IMAGE2D_ARRAY_RW( _name, _reg) RWTexture2DArray<float> _name : register(u[_reg])
-#define UIMAGE2D_ARRAY_RW(_name, _reg) RWTexture2DArray<uint>  _name : register(u[_reg])
+#define IMAGE2D_ARRAY_RW( _name,          _reg) RWTexture2DArray<float>   _name : register(u[_reg])
+#define UIMAGE2D_ARRAY_RW(_name,          _reg) RWTexture2DArray<uint>    _name : register(u[_reg])
 
 #define IMAGE3D_RO( _name, _format, _reg) Texture3D<_format>   _name : register(t[_reg])
 #define UIMAGE3D_RO(_name, _format, _reg) Texture3D<_format>   _name : register(t[_reg])
 #define IMAGE3D_WR( _name, _format, _reg) RWTexture3D<_format> _name : register(u[_reg])
 #define UIMAGE3D_WR(_name, _format, _reg) RWTexture3D<_format> _name : register(u[_reg])
-#define IMAGE3D_RW( _name, _reg) RWTexture3D<float> _name : register(u[_reg])
-#define UIMAGE3D_RW(_name, _reg) RWTexture3D<uint>  _name : register(u[_reg])
+#define IMAGE3D_RW( _name,          _reg) RWTexture3D<float>   _name : register(u[_reg])
+#define UIMAGE3D_RW(_name,          _reg) RWTexture3D<uint>    _name : register(u[_reg])
 
 #define BUFFER_RO(_name, _struct, _reg) Buffer<_struct>   _name : register(t[_reg])
 #define BUFFER_RW(_name, _struct, _reg) RWBuffer<_struct> _name : register(u[_reg])
@@ -48,9 +48,9 @@
 
 #define NUM_THREADS(_x, _y, _z) [numthreads(_x, _y, _z)]
 
-#define __IMAGE_IMPL(_textureType, _storeComponents, _type, _loadComponents) \
+#define __IMAGE_IMPL(_textureType, _storeComponents, _type, _loadComponents)                                                     \
 	_type imageLoad(       Texture2D<_textureType> _image, ivec2 _uv)                { return _image[_uv ]._loadComponents;    } \
-	_type imageLoad(RWTexture2DArray<_textureType> _image, ivec3 _uvw)                { return _image[_uvw ]._loadComponents;  } \
+	_type imageLoad(RWTexture2DArray<_textureType> _image, ivec3 _uvw)               { return _image[_uvw ]._loadComponents;   } \
 	_type imageLoad(       Texture3D<_textureType> _image, ivec3 _uvw)               { return _image[_uvw]._loadComponents;    } \
 	_type imageLoad(     RWTexture2D<_textureType> _image, ivec2 _uv)                { return _image[_uv ]._loadComponents;    } \
 	_type imageLoad(RWTexture2DArray<_textureType> _image, ivec3 _uvw, _type _value) { return _image[_uvw]._loadComponents;    } \
@@ -100,15 +100,15 @@ ivec2 imageSize(RWTexture2D<uint> _image)
 	return result;
 }
 
-#define __ATOMIC_IMPL_TYPE(_genType, _glFunc, _dxFunc) \
+#define __ATOMIC_IMPL_TYPE(_genType, _glFunc, _dxFunc)      \
 			_genType _glFunc(_genType _mem, _genType _data) \
-			{ \
-				_genType result; \
-				_dxFunc(_mem, _data, result); \
-				return result; \
+			{                                               \
+				_genType result;                            \
+				_dxFunc(_mem, _data, result);               \
+				return result;                              \
 			}
 
-#define __ATOMIC_IMPL(_glFunc, _dxFunc) \
+#define __ATOMIC_IMPL(_glFunc, _dxFunc)                \
 			__ATOMIC_IMPL_TYPE(int,  _glFunc, _dxFunc) \
 			__ATOMIC_IMPL_TYPE(uint, _glFunc, _dxFunc)
 
@@ -156,27 +156,27 @@ uint atomicCompSwap(uint _mem, uint _compare, uint _data)
 #define UIMAGE2D_RO(_name, _format, _reg) __IMAGE_XX(_name, _format, _reg, uimage2D, readonly)
 #define IMAGE2D_WR( _name, _format, _reg) __IMAGE_XX(_name, _format, _reg, image2D,  writeonly)
 #define UIMAGE2D_WR(_name, _format, _reg) __IMAGE_XX(_name, _format, _reg, uimage2D, writeonly)
-#define IMAGE2D_RW( _name, _reg) __IMAGE_XX(_name, r32f,  _reg, image2D,  readwrite)
-#define UIMAGE2D_RW(_name, _reg) __IMAGE_XX(_name, r32ui, _reg, uimage2D, readwrite)
+#define IMAGE2D_RW( _name,          _reg) __IMAGE_XX(_name, r32f,    _reg, image2D,  readwrite)
+#define UIMAGE2D_RW(_name,          _reg) __IMAGE_XX(_name, r32ui,   _reg, uimage2D, readwrite)
 
 #define IMAGE2D_ARRAY_RO( _name, _format, _reg) __IMAGE_XX(_name, _format, _reg, image2DArray,  readonly)
 #define UIMAGE2D_ARRAY_RO(_name, _format, _reg) __IMAGE_XX(_name, _format, _reg, uimage2DArray, readonly)
 #define IMAGE2D_ARRAY_WR( _name, _format, _reg) __IMAGE_XX(_name, _format, _reg, image2DArray,  writeonly)
 #define UIMAGE2D_ARRAY_WR(_name, _format, _reg) __IMAGE_XX(_name, _format, _reg, uimage2DArray, writeonly)
-#define IMAGE2D_ARRAY_RW( _name, _reg) __IMAGE_XX(_name, r32f,  _reg, image2DArray,  readwrite)
-#define UIMAGE2D_ARRAY_RW(_name, _reg) __IMAGE_XX(_name, r32ui, _reg, uimage2DArray, readwrite)
+#define IMAGE2D_ARRAY_RW( _name,          _reg) __IMAGE_XX(_name, r32f,    _reg, image2DArray,  readwrite)
+#define UIMAGE2D_ARRAY_RW(_name,          _reg) __IMAGE_XX(_name, r32ui,   _reg, uimage2DArray, readwrite)
 
 #define IMAGE3D_RO( _name, _format, _reg) __IMAGE_XX(_name, _format, _reg, image3D,  readonly)
 #define UIMAGE3D_RO(_name, _format, _reg) __IMAGE_XX(_name, _format, _reg, uimage3D, readonly)
 #define IMAGE3D_WR( _name, _format, _reg) __IMAGE_XX(_name, _format, _reg, image3D,  writeonly)
 #define UIMAGE3D_WR(_name, _format, _reg) __IMAGE_XX(_name, _format, _reg, uimage3D, writeonly)
-#define IMAGE3D_RW( _name, _reg) __IMAGE_XX(_name, r32f,  _reg, image2D,  readwrite)
-#define UIMAGE3D_RW(_name, _reg) __IMAGE_XX(_name, r32ui, _reg, uimage2D, readwrite)
+#define IMAGE3D_RW( _name,          _reg) __IMAGE_XX(_name, r32f,    _reg, image3D,  readwrite)
+#define UIMAGE3D_RW(_name,          _reg) __IMAGE_XX(_name, r32ui,   _reg, uimage3D, readwrite)
 
-#define __BUFFER_XX(_name, _type, _reg, _access) \
+#define __BUFFER_XX(_name, _type, _reg, _access)                        \
 			layout(std430, binding=_reg) _access buffer _name ## Buffer \
-			{ \
-				_type _name[]; \
+			{                                                           \
+				_type _name[];                                          \
 			}
 
 #define BUFFER_RO(_name, _type, _reg) __BUFFER_XX(_name, _type, _reg, readonly)
diff --git a/3rdparty/bgfx/src/bgfx_p.h b/3rdparty/bgfx/src/bgfx_p.h
index 1ac2e15..9936cf8 100644
--- a/3rdparty/bgfx/src/bgfx_p.h
+++ b/3rdparty/bgfx/src/bgfx_p.h
@@ -132,7 +132,7 @@ namespace bgfx
 #include <bx/maputil.h>
 
 #include <bgfx/platform.h>
-#include "image.h"
+#include <bimg/bimg.h>
 #include "shader.h"
 
 #define BGFX_CHUNK_MAGIC_CSH BX_MAKEFOURCC('C', 'S', 'H', 0x2)
@@ -360,7 +360,8 @@ namespace bgfx
 	void release(const Memory* _mem);
 	const char* getAttribName(Attrib::Enum _attr);
 	void getTextureSizeFromRatio(BackbufferRatio::Enum _ratio, uint16_t& _width, uint16_t& _height);
-	TextureFormat::Enum getViableTextureFormat(const ImageContainer& _imageContainer);
+	TextureFormat::Enum getViableTextureFormat(const bimg::ImageContainer& _imageContainer);
+	const char* getName(TextureFormat::Enum _fmt);
 
 	inline uint32_t castfu(float _value)
 	{
@@ -1236,6 +1237,22 @@ namespace bgfx
 		VertexDeclHandle   m_decl;
 	};
 
+	struct RenderBind
+	{
+		void clear()
+		{
+			for (uint32_t ii = 0; ii < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++ii)
+			{
+				Binding& bind = m_bind[ii];
+				bind.m_idx = invalidHandle;
+				bind.m_type = 0;
+				bind.m_un.m_draw.m_textureFlags = 0;
+			}
+		};
+
+		Binding m_bind[BGFX_CONFIG_MAX_TEXTURE_SAMPLERS];
+	};
+
 	struct RenderDraw
 	{
 		void clear()
@@ -1263,14 +1280,6 @@ namespace bgfx
 			m_instanceDataBuffer.idx = invalidHandle;
 			m_indirectBuffer.idx     = invalidHandle;
 			m_occlusionQuery.idx     = invalidHandle;
-
-			for (uint32_t ii = 0; ii < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++ii)
-			{
-				Binding& bind = m_bind[ii];
-				bind.m_idx  = invalidHandle;
-				bind.m_type = 0;
-				bind.m_un.m_draw.m_textureFlags = 0;
-			}
 		}
 
 		bool setStreamBit(uint8_t _stream, VertexBufferHandle _handle)
@@ -1282,7 +1291,6 @@ namespace bgfx
 			return 0 != tmp;
 		}
 
-		Binding  m_bind[BGFX_CONFIG_MAX_TEXTURE_SAMPLERS];
 		Stream   m_stream[BGFX_CONFIG_MAX_VERTEX_STREAMS];
 		uint64_t m_stateFlags;
 		uint64_t m_stencil;
@@ -1325,14 +1333,8 @@ namespace bgfx
 			m_indirectBuffer.idx = invalidHandle;
 			m_startIndirect      = 0;
 			m_numIndirect        = UINT16_MAX;
-
-			for (uint32_t ii = 0; ii < BGFX_MAX_COMPUTE_BINDINGS; ++ii)
-			{
-				m_bind[ii].m_idx = invalidHandle;
-			}
 		}
 
-		Binding  m_bind[BGFX_MAX_COMPUTE_BINDINGS];
 		uint32_t m_constBegin;
 		uint32_t m_constEnd;
 		uint32_t m_matrix;
@@ -1460,6 +1462,7 @@ namespace bgfx
 			m_uniformEnd   = 0;
 			m_draw.clear();
 			m_compute.clear();
+			m_bind.clear();
 			m_matrixCache.reset();
 			m_rectCache.reset();
 			m_key.reset();
@@ -1641,7 +1644,7 @@ namespace bgfx
 
 		void setTexture(uint8_t _stage, UniformHandle _sampler, TextureHandle _handle, uint32_t _flags)
 		{
-			Binding& bind = m_draw.m_bind[_stage];
+			Binding& bind = m_bind.m_bind[_stage];
 			bind.m_idx    = _handle.idx;
 			bind.m_type   = uint8_t(Binding::Texture);
 			bind.m_un.m_draw.m_textureFlags = (_flags&BGFX_TEXTURE_INTERNAL_DEFAULT_SAMPLER)
@@ -1658,7 +1661,7 @@ namespace bgfx
 
 		void setBuffer(uint8_t _stage, IndexBufferHandle _handle, Access::Enum _access)
 		{
-			Binding& bind = m_compute.m_bind[_stage];
+			Binding& bind = m_bind.m_bind[_stage];
 			bind.m_idx    = _handle.idx;
 			bind.m_type   = uint8_t(Binding::IndexBuffer);
 			bind.m_un.m_compute.m_format = 0;
@@ -1668,7 +1671,7 @@ namespace bgfx
 
 		void setBuffer(uint8_t _stage, VertexBufferHandle _handle, Access::Enum _access)
 		{
-			Binding& bind = m_compute.m_bind[_stage];
+			Binding& bind = m_bind.m_bind[_stage];
 			bind.m_idx    = _handle.idx;
 			bind.m_type   = uint8_t(Binding::VertexBuffer);
 			bind.m_un.m_compute.m_format = 0;
@@ -1678,7 +1681,7 @@ namespace bgfx
 
 		void setImage(uint8_t _stage, UniformHandle _sampler, TextureHandle _handle, uint8_t _mip, Access::Enum _access, TextureFormat::Enum _format)
 		{
-			Binding& bind = m_compute.m_bind[_stage];
+			Binding& bind = m_bind.m_bind[_stage];
 			bind.m_idx    = _handle.idx;
 			bind.m_type   = uint8_t(Binding::Image);
 			bind.m_un.m_compute.m_format = uint8_t(_format);
@@ -1837,8 +1840,12 @@ namespace bgfx
 		uint64_t m_sortKeys[BGFX_CONFIG_MAX_DRAW_CALLS+1];
 		RenderItemCount m_sortValues[BGFX_CONFIG_MAX_DRAW_CALLS+1];
 		RenderItem m_renderItem[BGFX_CONFIG_MAX_DRAW_CALLS+1];
-		RenderDraw m_draw;
+		RenderBind m_renderItemBind[BGFX_CONFIG_MAX_DRAW_CALLS + 1];
+
+		RenderDraw    m_draw;
 		RenderCompute m_compute;
+		RenderBind    m_bind;
+
 		uint32_t m_numVertices[BGFX_CONFIG_MAX_VERTEX_STREAMS];
 		uint32_t m_blitKeys[BGFX_CONFIG_MAX_BLIT_ITEMS+1];
 		BlitItem m_blitItem[BGFX_CONFIG_MAX_BLIT_ITEMS+1];
@@ -1955,14 +1962,18 @@ namespace bgfx
 		{
 			bx::memSet(m_vertexDeclRef, 0, sizeof(m_vertexDeclRef) );
 			bx::memSet(m_vertexBufferRef, 0xff, sizeof(m_vertexBufferRef) );
+			bx::memSet(m_dynamicVertexBufferRef, 0xff, sizeof(m_dynamicVertexBufferRef) );
 		}
 
 		template <uint16_t MaxHandlesT>
 		void shutdown(bx::HandleAllocT<MaxHandlesT>& _handleAlloc)
 		{
-			for (VertexDeclMap::Iterator it = m_vertexDeclMap.first(); m_vertexDeclMap.next(it); )
+			for (uint16_t ii = 0, num = _handleAlloc.getNumHandles(); ii < num; ++ii)
 			{
-				_handleAlloc.free(it.handle);
+				VertexDeclHandle handle = { _handleAlloc.getHandleAt(ii) };
+				m_vertexDeclRef[handle.idx] = 0;
+				m_vertexDeclMap.removeByHandle(handle.idx);
+				_handleAlloc.free(handle.idx);
 			}
 
 			m_vertexDeclMap.reset();
@@ -1974,28 +1985,59 @@ namespace bgfx
 			return handle;
 		}
 
+		void add(VertexDeclHandle _declHandle, uint32_t _hash)
+		{
+			m_vertexDeclRef[_declHandle.idx]++;
+			m_vertexDeclMap.insert(_hash, _declHandle.idx);
+		}
+
 		void add(VertexBufferHandle _handle, VertexDeclHandle _declHandle, uint32_t _hash)
 		{
+			BX_CHECK(m_vertexBufferRef[_handle.idx].idx == invalidHandle, "");
 			m_vertexBufferRef[_handle.idx] = _declHandle;
 			m_vertexDeclRef[_declHandle.idx]++;
 			m_vertexDeclMap.insert(_hash, _declHandle.idx);
 		}
 
-		VertexDeclHandle release(VertexBufferHandle _handle)
+		void add(DynamicVertexBufferHandle _handle, VertexDeclHandle _declHandle, uint32_t _hash)
 		{
-			VertexDeclHandle declHandle = m_vertexBufferRef[_handle.idx];
-			if (isValid(declHandle) )
-			{
-				m_vertexDeclRef[declHandle.idx]--;
+			BX_CHECK(m_dynamicVertexBufferRef[_handle.idx].idx == invalidHandle, "");
+			m_dynamicVertexBufferRef[_handle.idx] = _declHandle;
+			m_vertexDeclRef[_declHandle.idx]++;
+			m_vertexDeclMap.insert(_hash, _declHandle.idx);
+		}
 
-				if (0 != m_vertexDeclRef[declHandle.idx])
+		VertexDeclHandle release(VertexDeclHandle _declHandle)
+		{
+			if (isValid(_declHandle) )
+			{
+				m_vertexDeclRef[_declHandle.idx]--;
+
+				if (0 == m_vertexDeclRef[_declHandle.idx])
 				{
-					VertexDeclHandle invalid = BGFX_INVALID_HANDLE;
-					return invalid;
+					m_vertexDeclMap.removeByHandle(_declHandle.idx);
+					return _declHandle;
 				}
 			}
 
-			m_vertexDeclMap.removeByHandle(declHandle.idx);
+			VertexDeclHandle invalid = BGFX_INVALID_HANDLE;
+			return invalid;
+		}
+
+		VertexDeclHandle release(VertexBufferHandle _handle)
+		{
+			VertexDeclHandle declHandle = m_vertexBufferRef[_handle.idx];
+			declHandle = release(declHandle);
+			m_vertexBufferRef[_handle.idx].idx = invalidHandle;
+
+			return declHandle;
+		}
+
+		VertexDeclHandle release(DynamicVertexBufferHandle _handle)
+		{
+			VertexDeclHandle declHandle = m_dynamicVertexBufferRef[_handle.idx];
+			declHandle = release(declHandle);
+			m_dynamicVertexBufferRef[_handle.idx].idx = invalidHandle;
 
 			return declHandle;
 		}
@@ -2005,6 +2047,7 @@ namespace bgfx
 
 		uint16_t m_vertexDeclRef[BGFX_CONFIG_MAX_VERTEX_DECLS];
 		VertexDeclHandle m_vertexBufferRef[BGFX_CONFIG_MAX_VERTEX_BUFFERS];
+		VertexDeclHandle m_dynamicVertexBufferRef[BGFX_CONFIG_MAX_DYNAMIC_VERTEX_BUFFERS];
 	};
 
 	// First-fit non-local allocator.
@@ -2135,6 +2178,7 @@ namespace bgfx
 		virtual ~RendererContextI() = 0;
 		virtual RendererType::Enum getRendererType() const = 0;
 		virtual const char* getRendererName() const = 0;
+		virtual bool isDeviceRemoved() = 0;
 		virtual void flip(HMD& _hmd) = 0;
 		virtual void createIndexBuffer(IndexBufferHandle _handle, Memory* _mem, uint16_t _flags) = 0;
 		virtual void destroyIndexBuffer(IndexBufferHandle _handle) = 0;
@@ -2201,6 +2245,8 @@ namespace bgfx
 			, m_frames(0)
 			, m_debug(BGFX_DEBUG_NONE)
 			, m_renderCtx(NULL)
+			, m_renderMain(NULL)
+			, m_renderNoop(NULL)
 			, m_rendererInitialized(false)
 			, m_exit(false)
 			, m_flipAfterRender(false)
@@ -2326,6 +2372,10 @@ namespace bgfx
 				cmdbuf.write(_mem);
 				cmdbuf.write(_flags);
 			}
+			else
+			{
+				release(_mem);
+			}
 
 			return handle;
 		}
@@ -2374,6 +2424,10 @@ namespace bgfx
 				cmdbuf.write(declHandle);
 				cmdbuf.write(_flags);
 			}
+			else
+			{
+				release(_mem);
+			}
 
 			return handle;
 		}
@@ -2480,10 +2534,16 @@ namespace bgfx
 			BX_CHECK(0 == (_flags &  BGFX_BUFFER_COMPUTE_READ_WRITE), "Cannot initialize compute buffer from CPU.");
 			const uint32_t indexSize = 0 == (_flags & BGFX_BUFFER_INDEX32) ? 2 : 4;
 			DynamicIndexBufferHandle handle = createDynamicIndexBuffer(_mem->size/indexSize, _flags);
+
 			if (isValid(handle) )
 			{
 				updateDynamicIndexBuffer(handle, 0, _mem);
 			}
+			else
+			{
+				release(_mem);
+			}
+
 			return handle;
 		}
 
@@ -2588,7 +2648,7 @@ namespace bgfx
 			uint32_t size = bx::strideAlign16(_num*_decl.m_stride, _decl.m_stride);
 
 			uint64_t ptr = 0;
-			if (0 != (_flags & BGFX_BUFFER_COMPUTE_WRITE) )
+			if (0 != (_flags & BGFX_BUFFER_COMPUTE_READ_WRITE) )
 			{
 				VertexBufferHandle vertexBufferHandle = { m_vertexBufferHandle.alloc() };
 				if (!isValid(vertexBufferHandle) )
@@ -2624,7 +2684,7 @@ namespace bgfx
 			dvb.m_stride      = _decl.m_stride;
 			dvb.m_decl        = declHandle;
 			dvb.m_flags       = _flags;
-			m_declRef.add(dvb.m_handle, declHandle, _decl.m_hash);
+			m_declRef.add(handle, declHandle, _decl.m_hash);
 
 			return handle;
 		}
@@ -2633,10 +2693,16 @@ namespace bgfx
 		{
 			uint32_t numVertices = _mem->size/_decl.m_stride;
 			DynamicVertexBufferHandle handle = createDynamicVertexBuffer(numVertices, _decl, _flags);
+
 			if (isValid(handle) )
 			{
 				updateDynamicVertexBuffer(handle, 0, _mem);
 			}
+			else
+			{
+				release(_mem);
+			}
+
 			return handle;
 		}
 
@@ -2645,7 +2711,7 @@ namespace bgfx
 			BGFX_CHECK_HANDLE("updateDynamicVertexBuffer", m_dynamicVertexBufferHandle, _handle);
 
 			DynamicVertexBuffer& dvb = m_dynamicVertexBuffers[_handle.idx];
-			BX_CHECK(0 == (dvb.m_flags &  BGFX_BUFFER_COMPUTE_READ_WRITE), "Can't update GPU buffer from CPU.");
+			BX_CHECK(0 == (dvb.m_flags &  BGFX_BUFFER_COMPUTE_WRITE), "Can't update GPU write buffer from CPU.");
 
 			if (dvb.m_size < _mem->size
 			&&  0 != (dvb.m_flags & BGFX_BUFFER_ALLOW_RESIZE) )
@@ -2687,9 +2753,9 @@ namespace bgfx
 
 		void destroyDynamicVertexBufferInternal(DynamicVertexBufferHandle _handle)
 		{
-			DynamicVertexBuffer& dvb = m_dynamicVertexBuffers[_handle.idx];
+			VertexDeclHandle declHandle = m_declRef.release(_handle);
+			BGFX_CHECK_HANDLE_INVALID_OK("destroyDynamicVertexBufferInternal", m_vertexDeclHandle, declHandle);
 
-			VertexDeclHandle declHandle = m_declRef.release(dvb.m_handle);
 			if (isValid(declHandle) )
 			{
 				CommandBuffer& cmdbuf = getCommandBuffer(CommandBuffer::DestroyVertexDecl);
@@ -2697,6 +2763,8 @@ namespace bgfx
 				m_render->free(declHandle);
 			}
 
+			DynamicVertexBuffer& dvb = m_dynamicVertexBuffers[_handle.idx];
+
 			if (0 != (dvb.m_flags & BGFX_BUFFER_COMPUTE_WRITE) )
 			{
 				destroyVertexBuffer(dvb.m_handle);
@@ -2833,7 +2901,7 @@ namespace bgfx
 				CommandBuffer& cmdbuf = getCommandBuffer(CommandBuffer::CreateVertexDecl);
 				cmdbuf.write(declHandle);
 				cmdbuf.write(_decl);
-				m_declRef.add(dvb.handle, declHandle, _decl.m_hash);
+				m_declRef.add(declHandle, _decl.m_hash);
 			}
 
 			uint32_t offset = m_submit->allocTransientVertexBuffer(_num, _decl.m_stride);
@@ -2907,6 +2975,7 @@ namespace bgfx
 			if (!err.isOk() )
 			{
 				ShaderHandle invalid = BGFX_INVALID_HANDLE;
+				release(_mem);
 				return invalid;
 			}
 
@@ -2921,6 +2990,7 @@ namespace bgfx
 					, ( (uint8_t*)&magic)[3]
 					);
 				ShaderHandle invalid = BGFX_INVALID_HANDLE;
+				release(_mem);
 				return invalid;
 			}
 
@@ -2985,6 +3055,10 @@ namespace bgfx
 				cmdbuf.write(handle);
 				cmdbuf.write(_mem);
 			}
+			else
+			{
+				release(_mem);
+			}
 
 			return handle;
 		}
@@ -3205,8 +3279,8 @@ namespace bgfx
 				_info = &ti;
 			}
 
-			ImageContainer imageContainer;
-			if (imageParse(imageContainer, _mem->data, _mem->size) )
+			bimg::ImageContainer imageContainer;
+			if (bimg::imageParse(imageContainer, _mem->data, _mem->size) )
 			{
 				calcTextureSize(*_info
 					, (uint16_t)imageContainer.m_width
@@ -3247,6 +3321,10 @@ namespace bgfx
 				cmdbuf.write(_flags);
 				cmdbuf.write(_skip);
 			}
+			else
+			{
+				release(_mem);
+			}
 
 			return handle;
 		}
@@ -3290,7 +3368,7 @@ namespace bgfx
 				, _handle.idx
 				, _width
 				, _height
-				, bgfx::getName(TextureFormat::Enum(textureRef.m_format) )
+				, bimg::getName(bimg::TextureFormat::Enum(textureRef.m_format) )
 				);
 
 			CommandBuffer& cmdbuf = getCommandBuffer(CommandBuffer::ResizeTexture);
@@ -3368,7 +3446,7 @@ namespace bgfx
 			for (uint32_t ii = 0; ii < _num; ++ii)
 			{
 				TextureHandle texHandle = _attachment[ii].handle;
-				if (isDepth(TextureFormat::Enum(m_textureRef[texHandle.idx].m_format)))
+				if (bimg::isDepth(bimg::TextureFormat::Enum(m_textureRef[texHandle.idx].m_format)))
 				{
 					++depth;
 				}
@@ -3500,6 +3578,8 @@ namespace bgfx
 				return handle;
 			}
 
+			_num  = bx::uint16_max(1, _num);
+
 			uint16_t idx = m_uniformHashMap.find(bx::hashMurmur2A(_name) );
 			if (UniformHashMap::invalid != idx)
 			{
@@ -4047,8 +4127,8 @@ namespace bgfx
 			const TextureRef& dst = m_textureRef[_dst.idx];
 			BX_CHECK(src.m_format == dst.m_format
 				, "Texture format must match (src %s, dst %s)."
-				, bgfx::getName(TextureFormat::Enum(src.m_format) )
-				, bgfx::getName(TextureFormat::Enum(dst.m_format) )
+				, bimg::getName(bimg::TextureFormat::Enum(src.m_format) )
+				, bimg::getName(bimg::TextureFormat::Enum(dst.m_format) )
 				);
 			BX_UNUSED(src, dst);
 			m_submit->blit(_id, _dst, _dstMip, _dstX, _dstY, _dstZ, _src, _srcMip, _srcX, _srcY, _srcZ, _width, _height, _depth);
@@ -4064,6 +4144,7 @@ namespace bgfx
 		const char* getName(UniformHandle _handle) const;
 
 		// render thread
+		void flip();
 		RenderFrame::Enum renderFrame(int32_t _msecs = -1);
 		void flushTextureUpdateBatch(CommandBuffer& _cmdbuf);
 		void rendererExecCommands(CommandBuffer& _cmdbuf);
@@ -4261,6 +4342,8 @@ namespace bgfx
 		ClearQuad m_clearQuad;
 
 		RendererContextI* m_renderCtx;
+		RendererContextI* m_renderMain;
+		RendererContextI* m_renderNoop;
 
 		bool m_rendererInitialized;
 		bool m_exit;
diff --git a/3rdparty/bgfx/src/config.h b/3rdparty/bgfx/src/config.h
index 6ee6822..9f5ca58 100644
--- a/3rdparty/bgfx/src/config.h
+++ b/3rdparty/bgfx/src/config.h
@@ -19,8 +19,7 @@
 	&& !defined(BGFX_CONFIG_RENDERER_OPENGL) \
 	&& !defined(BGFX_CONFIG_RENDERER_OPENGLES) \
 	&& !defined(BGFX_CONFIG_RENDERER_VULKAN) \
-	&& !defined(BGFX_CONFIG_RENDERER_GNM) \
-	&& !defined(BGFX_CONFIG_RENDERER_NOOP)
+	&& !defined(BGFX_CONFIG_RENDERER_GNM)
 
 #	ifndef BGFX_CONFIG_RENDERER_DIRECT3D9
 #		define BGFX_CONFIG_RENDERER_DIRECT3D9 (0 \
@@ -86,18 +85,6 @@
 					? 1 : 0)
 #	endif // BGFX_CONFIG_RENDERER_GNM
 
-#	ifndef BGFX_CONFIG_RENDERER_NOOP
-#		define BGFX_CONFIG_RENDERER_NOOP (!(0 \
-					|| BGFX_CONFIG_RENDERER_DIRECT3D9 \
-					|| BGFX_CONFIG_RENDERER_DIRECT3D11 \
-					|| BGFX_CONFIG_RENDERER_DIRECT3D12 \
-					|| BGFX_CONFIG_RENDERER_METAL \
-					|| BGFX_CONFIG_RENDERER_OPENGL \
-					|| BGFX_CONFIG_RENDERER_OPENGLES \
-					|| BGFX_CONFIG_RENDERER_VULKAN \
-					|| BGFX_CONFIG_RENDERER_GNM \
-					? 1 : 0) )
-#	endif // BGFX_CONFIG_RENDERER_NOOP
 #else
 #	ifndef BGFX_CONFIG_RENDERER_DIRECT3D9
 #		define BGFX_CONFIG_RENDERER_DIRECT3D9 0
@@ -130,10 +117,6 @@
 #	ifndef BGFX_CONFIG_RENDERER_GNM
 #		define BGFX_CONFIG_RENDERER_GNM 0
 #	endif // BGFX_CONFIG_RENDERER_GNM
-
-#	ifndef BGFX_CONFIG_RENDERER_NOOP
-#		define BGFX_CONFIG_RENDERER_NOOP 0
-#	endif // BGFX_CONFIG_RENDERER_NOOP
 #endif // !defined...
 
 #if BGFX_CONFIG_RENDERER_OPENGL && BGFX_CONFIG_RENDERER_OPENGL < 21
diff --git a/3rdparty/bgfx/src/glcontext_eagl.mm b/3rdparty/bgfx/src/glcontext_eagl.mm
index ff54674..d4e1156 100644
--- a/3rdparty/bgfx/src/glcontext_eagl.mm
+++ b/3rdparty/bgfx/src/glcontext_eagl.mm
@@ -20,21 +20,25 @@ namespace bgfx { namespace gl
 	struct SwapChainGL
 	{
 		SwapChainGL(EAGLContext *_context, CAEAGLLayer *_layer)
-		: m_context(_context)
-		, m_fbo(0)
-		, m_colorRbo(0)
-		, m_depthStencilRbo(0)
+			: m_context(_context)
+			, m_fbo(0)
+			, m_colorRbo(0)
+			, m_depthStencilRbo(0)
 		{
 			_layer.contentsScale = [UIScreen mainScreen].scale;
 
-			_layer.opaque = [_layer.style valueForKey:@"opaque"] == nil ? true : [[_layer.style valueForKey:@"opaque"] boolValue];
+			_layer.opaque = [_layer.style valueForKey:@"opaque"] == nil
+				? true
+				: [[_layer.style valueForKey:@"opaque"] boolValue]
+				;
 
 			_layer.drawableProperties = [NSDictionary dictionaryWithObjectsAndKeys
-											: [NSNumber numberWithBool:false]
-											, kEAGLDrawablePropertyRetainedBacking
-											, kEAGLColorFormatRGBA8
-											, kEAGLDrawablePropertyColorFormat
-											, nil];
+				: [NSNumber numberWithBool:false]
+				, kEAGLDrawablePropertyRetainedBacking
+				, kEAGLColorFormatRGBA8
+				, kEAGLDrawablePropertyColorFormat
+				, nil
+				];
 
 			[EAGLContext setCurrentContext:_context];
 
@@ -72,20 +76,20 @@ namespace bgfx { namespace gl
 			GL_CHECK(glBindRenderbuffer(GL_RENDERBUFFER, 0) );
 			if (0 != m_fbo)
 			{
-			    GL_CHECK(glDeleteFramebuffers(1, &m_fbo) );
-			    m_fbo = 0;
+				GL_CHECK(glDeleteFramebuffers(1, &m_fbo) );
+				m_fbo = 0;
 			}
 
 			if (0 != m_colorRbo)
 			{
-			    GL_CHECK(glDeleteRenderbuffers(1, &m_colorRbo) );
-			    m_colorRbo = 0;
+				GL_CHECK(glDeleteRenderbuffers(1, &m_colorRbo) );
+				m_colorRbo = 0;
 			}
 
 			if (0 != m_depthStencilRbo)
 			{
-			    GL_CHECK(glDeleteRenderbuffers(1, &m_depthStencilRbo) );
-			    m_depthStencilRbo = 0;
+				GL_CHECK(glDeleteRenderbuffers(1, &m_depthStencilRbo) );
+				m_depthStencilRbo = 0;
 			}
 		}
 
@@ -93,20 +97,23 @@ namespace bgfx { namespace gl
 		{
 			GL_CHECK(glGenRenderbuffers(1, &m_depthStencilRbo) );
 			GL_CHECK(glBindRenderbuffer(GL_RENDERBUFFER, m_depthStencilRbo) );
-			GL_CHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH24_STENCIL8_OES, _width, _height) ); // from OES_packed_depth_stencil
+			GL_CHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH24_STENCIL8, _width, _height) );
 			GL_CHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, m_depthStencilRbo) );
 			GL_CHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_RENDERBUFFER, m_depthStencilRbo) );
 
-			BX_CHECK(GL_FRAMEBUFFER_COMPLETE ==  glCheckFramebufferStatus(GL_FRAMEBUFFER)
-						, "glCheckFramebufferStatus failed 0x%08x"
-						, glCheckFramebufferStatus(GL_FRAMEBUFFER)
-						);
+			GLenum err = glCheckFramebufferStatus(GL_FRAMEBUFFER);
+			BX_CHECK(GL_FRAMEBUFFER_COMPLETE == err, "glCheckFramebufferStatus failed 0x%08x", err);
+			BX_UNUSED(err);
 
 			makeCurrent();
+
 			GL_CHECK(glClearColor(0.0f, 0.0f, 0.0f, 0.0f) );
 			GL_CHECK(glClear(GL_COLOR_BUFFER_BIT) );
+
 			swapBuffers();
+
 			GL_CHECK(glClear(GL_COLOR_BUFFER_BIT) );
+
 			swapBuffers();
 		}
 
@@ -162,12 +169,12 @@ namespace bgfx { namespace gl
 		layer.opaque = [layer.style valueForKey:@"opaque"] == nil ? true : [[layer.style valueForKey:@"opaque"] boolValue];
 
 		layer.drawableProperties = [NSDictionary dictionaryWithObjectsAndKeys
-										: [NSNumber numberWithBool:false]
-										, kEAGLDrawablePropertyRetainedBacking
-										, kEAGLColorFormatRGBA8
-										, kEAGLDrawablePropertyColorFormat
-										, nil
-										];
+			: [NSNumber numberWithBool:false]
+			, kEAGLDrawablePropertyRetainedBacking
+			, kEAGLColorFormatRGBA8
+			, kEAGLDrawablePropertyColorFormat
+			, nil
+			];
 
 		EAGLContext* context = [ [EAGLContext alloc] initWithAPI:kEAGLRenderingAPIOpenGLES2];
 		BX_CHECK(NULL != context, "Failed to create kEAGLRenderingAPIOpenGLES2 context.");
@@ -191,7 +198,7 @@ namespace bgfx { namespace gl
 
 		GL_CHECK(glGenRenderbuffers(1, &m_depthStencilRbo) );
 		GL_CHECK(glBindRenderbuffer(GL_RENDERBUFFER, m_depthStencilRbo) );
-		GL_CHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH24_STENCIL8_OES, width, height) ); // from OES_packed_depth_stencil
+		GL_CHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH24_STENCIL8, width, height) );
 		GL_CHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, m_depthStencilRbo) );
 		GL_CHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_RENDERBUFFER, m_depthStencilRbo) );
 
@@ -278,7 +285,7 @@ namespace bgfx { namespace gl
 
 		GL_CHECK(glGenRenderbuffers(1, &m_depthStencilRbo) );
 		GL_CHECK(glBindRenderbuffer(GL_RENDERBUFFER, m_depthStencilRbo) );
-		GL_CHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH24_STENCIL8_OES, width, height) ); // from OES_packed_depth_stencil
+		GL_CHECK(glRenderbufferStorage(GL_RENDERBUFFER, GL_DEPTH24_STENCIL8, width, height) );
 		GL_CHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_RENDERBUFFER, m_depthStencilRbo) );
 		GL_CHECK(glFramebufferRenderbuffer(GL_FRAMEBUFFER, GL_STENCIL_ATTACHMENT, GL_RENDERBUFFER, m_depthStencilRbo) );
 
@@ -340,18 +347,20 @@ namespace bgfx { namespace gl
 	void GlContext::import()
 	{
 		BX_TRACE("Import:");
-#	define GL_EXTENSION(_optional, _proto, _func, _import) \
-		{ \
-			if (_func == NULL) \
-			{ \
-				_func = (_proto)bx::dlsym(s_opengles, #_import); \
-				BX_TRACE("%p " #_func " (" #_import ")", _func); \
-			} \
-			BGFX_FATAL(_optional || NULL != _func, Fatal::UnableToInitialize, "Failed to create OpenGLES context. EAGLGetProcAddress(\"%s\")", #_import); \
+#	define GL_EXTENSION(_optional, _proto, _func, _import)                        \
+		{                                                                         \
+			if (_func == NULL)                                                    \
+			{                                                                     \
+				_func = (_proto)bx::dlsym(s_opengles, #_import);                  \
+				BX_TRACE("%p " #_func " (" #_import ")", _func);                  \
+			}                                                                     \
+			BGFX_FATAL(_optional || NULL != _func, Fatal::UnableToInitialize      \
+				, "Failed to create OpenGLES context. EAGLGetProcAddress(\"%s\")" \
+				, #_import);                                                      \
 		}
 #	include "glimports.h"
 	}
 
 } /* namespace gl */ } // namespace bgfx
 
-#endif // BX_PLATFORM_IOS && (BGFX_CONFIG_RENDERER_OPENGLES2|BGFX_CONFIG_RENDERER_OPENGLES3|BGFX_CONFIG_RENDERER_OPENGL)
+#endif // BX_PLATFORM_IOS && (BGFX_CONFIG_RENDERER_OPENGLES|BGFX_CONFIG_RENDERER_OPENGL)
diff --git a/3rdparty/bgfx/src/image.h b/3rdparty/bgfx/src/image.h
deleted file mode 100644
index ef05a1f..0000000
--- a/3rdparty/bgfx/src/image.h
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
- */
-
-#ifndef BGFX_IMAGE_H_HEADER_GUARD
-#define BGFX_IMAGE_H_HEADER_GUARD
-
-#include <bx/pixelformat.h>
-
-namespace bgfx
-{
-	struct ImageContainer
-	{
-		bx::AllocatorI* m_allocator;
-		void*           m_data;
-
-		TextureFormat::Enum m_format;
-
-		uint32_t m_size;
-		uint32_t m_offset;
-		uint32_t m_width;
-		uint32_t m_height;
-		uint32_t m_depth;
-		uint16_t m_numLayers;
-		uint8_t  m_numMips;
-		bool     m_hasAlpha;
-		bool     m_cubeMap;
-		bool     m_ktx;
-		bool     m_ktxLE;
-		bool     m_srgb;
-	};
-
-	struct ImageMip
-	{
-		TextureFormat::Enum m_format;
-		uint32_t m_width;
-		uint32_t m_height;
-		uint32_t m_blockSize;
-		uint32_t m_size;
-		uint8_t  m_bpp;
-		bool     m_hasAlpha;
-		const uint8_t* m_data;
-	};
-
-	struct ImageBlockInfo
-	{
-		uint8_t bitsPerPixel;
-		uint8_t blockWidth;
-		uint8_t blockHeight;
-		uint8_t blockSize;
-		uint8_t minBlockX;
-		uint8_t minBlockY;
-		uint8_t depthBits;
-		uint8_t stencilBits;
-		uint8_t rBits;
-		uint8_t gBits;
-		uint8_t bBits;
-		uint8_t aBits;
-		uint8_t encoding;
-	};
-
-	/// Returns true if texture format is compressed.
-	bool isCompressed(TextureFormat::Enum _format);
-
-	/// Returns true if texture format is uncompressed.
-	bool isColor(TextureFormat::Enum _format);
-
-	/// Returns true if texture format is depth.
-	bool isDepth(TextureFormat::Enum _format);
-
-	/// Returns true if texture format is valid.
-	bool isValid(TextureFormat::Enum _format);
-
-	/// Returns bits per pixel.
-	uint8_t getBitsPerPixel(TextureFormat::Enum _format);
-
-	/// Returns texture block info.
-	const ImageBlockInfo& getBlockInfo(TextureFormat::Enum _format);
-
-	/// Converts format to string.
-	const char* getName(TextureFormat::Enum _format);
-
-	/// Converts string to format.
-	TextureFormat::Enum getFormat(const char* _name);
-
-	/// Returns number of mip-maps required for complete mip-map chain.
-	uint8_t imageGetNumMips(
-		  TextureFormat::Enum _format
-		, uint16_t _width
-		, uint16_t _height
-		, uint16_t _depth = 0
-		);
-
-	/// Returns image size.
-	uint32_t imageGetSize(
-		  TextureInfo* _info
-		, uint16_t _width
-		, uint16_t _height
-		, uint16_t _depth
-		, bool _cubeMap
-		, bool _hasMips
-		, uint16_t _numLayers
-		, TextureFormat::Enum _format
-		);
-
-	///
-	void imageSolid(void* _dst, uint32_t _width, uint32_t _height, uint32_t _solid);
-
-	///
-	void imageCheckerboard(void* _dst, uint32_t _width, uint32_t _height, uint32_t _step, uint32_t _0, uint32_t _1);
-
-	///
-	void imageRgba8Downsample2x2(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-
-	///
-	void imageRgba32fToLinear(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-
-	///
-	void imageRgba32fToGamma(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-
-	///
-	void imageRgba32fLinearDownsample2x2(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-
-	///
-	void imageRgba32fDownsample2x2NormalMap(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-
-	///
-	void imageSwizzleBgra8(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
-
-	///
-	void imageCopy(void* _dst, uint32_t _height, uint32_t _srcPitch, const void* _src, uint32_t _dstPitch);
-
-	///
-	void imageCopy(void* _dst, uint32_t _width, uint32_t _height, uint32_t _bpp, uint32_t _pitch, const void* _src);
-
-	///
-	bool imageConvert(TextureFormat::Enum _dstFormat, TextureFormat::Enum _srcFormat);
-
-	///
-	void imageConvert(
-		  void* _dst
-		, uint32_t _bpp
-		, bx::PackFn _pack
-		, const void* _src
-		, bx::UnpackFn _unpack
-		, uint32_t _size
-		);
-
-	///
-	void imageConvert(
-		  void* _dst
-		, uint32_t _dstBpp
-		, bx::PackFn _pack
-		, const void* _src
-		, uint32_t _srcBpp
-		, bx::UnpackFn _unpack
-		, uint32_t _width
-		, uint32_t _height
-		, uint32_t _srcPitch
-		);
-
-	///
-	bool imageConvert(
-		  void* _dst
-		, TextureFormat::Enum _dstFormat
-		, const void* _src
-		, TextureFormat::Enum _srcFormat
-		, uint32_t _width
-		, uint32_t _height
-		);
-
-	///
-	ImageContainer* imageConvert(
-		  bx::AllocatorI* _allocator
-		, TextureFormat::Enum _dstFormat
-		, const void* _src
-		, uint32_t _size
-		);
-
-	///
-	ImageContainer* imageAlloc(
-		  bx::AllocatorI* _allocator
-		, TextureFormat::Enum _format
-		, uint16_t _width
-		, uint16_t _height
-		, uint16_t _depth
-		, uint16_t _numLayers
-		, bool _cubeMap
-		, bool _hasMips
-		, const void* _data = NULL
-		);
-
-	///
-	void imageFree(ImageContainer* _imageContainer);
-
-	///
-	void imageWriteTga(
-		  bx::WriterI* _writer
-		, uint32_t _width
-		, uint32_t _height
-		, uint32_t _pitch
-		, const void* _src
-		, bool _grayscale
-		, bool _yflip
-		, bx::Error* _err = NULL
-		);
-
-	///
-	void imageWriteKtx(
-		  bx::WriterI* _writer
-		, TextureFormat::Enum _format
-		, bool _cubeMap
-		, uint32_t _width
-		, uint32_t _height
-		, uint32_t _depth
-		, uint8_t _numMips
-		, const void* _src
-		, bx::Error* _err = NULL
-		);
-
-	///
-	void imageWriteKtx(
-		  bx::WriterI* _writer
-		, ImageContainer& _imageContainer
-		, const void* _data
-		, uint32_t _size
-		, bx::Error* _err = NULL
-		);
-
-	///
-	bool imageParse(ImageContainer& _imageContainer, bx::ReaderSeekerI* _reader);
-
-	///
-	bool imageParse(ImageContainer& _imageContainer, const void* _data, uint32_t _size);
-
-	///
-	void imageDecodeToBgra8(void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _pitch, TextureFormat::Enum _format);
-
-	///
-	void imageDecodeToRgba8(void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _pitch, TextureFormat::Enum _format);
-
-	///
-	void imageDecodeToRgba32f(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint32_t _pitch, TextureFormat::Enum _format);
-
-	///
-	bool imageGetRawData(const ImageContainer& _imageContainer, uint16_t _side, uint8_t _lod, const void* _data, uint32_t _size, ImageMip& _mip);
-
-} // namespace bgfx
-
-#endif // BGFX_IMAGE_H_HEADER_GUARD
diff --git a/3rdparty/bgfx/src/renderer_d3d11.cpp b/3rdparty/bgfx/src/renderer_d3d11.cpp
index 2a65c2a..b4b203b 100644
--- a/3rdparty/bgfx/src/renderer_d3d11.cpp
+++ b/3rdparty/bgfx/src/renderer_d3d11.cpp
@@ -455,6 +455,35 @@ namespace bgfx { namespace d3d11
 			;
 	}
 
+	static const char* getLostReason(HRESULT _hr)
+	{
+		switch (_hr)
+		{
+		// The GPU device instance has been suspended. Use GetDeviceRemovedReason to determine the appropriate action.
+		case DXGI_ERROR_DEVICE_REMOVED: return "DXGI_ERROR_DEVICE_REMOVED";
+
+		// The GPU will not respond to more commands, most likely because of an invalid command passed by the calling application.
+		case DXGI_ERROR_DEVICE_HUNG: return "DXGI_ERROR_DEVICE_HUNG";
+
+		// The GPU will not respond to more commands, most likely because some other application submitted invalid commands.
+		// The calling application should re-create the device and continue.
+		case DXGI_ERROR_DEVICE_RESET: return "DXGI_ERROR_DEVICE_RESET";
+
+		// An internal issue prevented the driver from carrying out the specified operation. The driver's state is probably
+		// suspect, and the application should not continue.
+		case DXGI_ERROR_DRIVER_INTERNAL_ERROR: return "DXGI_ERROR_DRIVER_INTERNAL_ERROR";
+
+		// A resource is not available at the time of the call, but may become available later.
+		case DXGI_ERROR_NOT_CURRENTLY_AVAILABLE: return "DXGI_ERROR_NOT_CURRENTLY_AVAILABLE";
+
+		case S_OK: return "S_OK";
+
+		default: break;
+		}
+
+		return "Unknown HRESULT?";
+	}
+
 	template <typename Ty>
 	static BX_NO_INLINE void setDebugObjectName(Ty* _interface, const char* _format, ...)
 	{
@@ -654,7 +683,7 @@ namespace bgfx { namespace d3d11
 			, m_adapter(NULL)
 			, m_factory(NULL)
 			, m_swapChain(NULL)
-			, m_lost(0)
+			, m_lost(false)
 			, m_numWindows(0)
 			, m_device(NULL)
 			, m_deviceCtx(NULL)
@@ -945,7 +974,7 @@ namespace bgfx { namespace d3d11
 								}
 
 								if (BX_ENABLED(BGFX_CONFIG_DEBUG_PERFHUD)
-								&&  0 != strstr(description, "PerfHUD") )
+								&&  0 != bx::strnstr(description, "PerfHUD") )
 								{
 									m_adapter = adapter;
 									m_driverType = D3D_DRIVER_TYPE_REFERENCE;
@@ -1386,7 +1415,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 				{
 					uint16_t support = BGFX_CAPS_FORMAT_TEXTURE_NONE;
 
-					const DXGI_FORMAT fmt = isDepth(TextureFormat::Enum(ii) )
+					const DXGI_FORMAT fmt = bimg::isDepth(bimg::TextureFormat::Enum(ii) )
 						? s_textureFormat[ii].m_fmtDsv
 						: s_textureFormat[ii].m_fmt
 						;
@@ -1861,7 +1890,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 			uint8_t* src       = (uint8_t*)mapped.pData;
 			uint32_t srcPitch  = mapped.RowPitch;
 
-			const uint8_t bpp = getBitsPerPixel(TextureFormat::Enum(texture.m_textureFormat) );
+			const uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(texture.m_textureFormat) );
 			uint8_t* dst      = (uint8_t*)_data;
 			uint32_t dstPitch = srcWidth*bpp/8;
 
@@ -2026,7 +2055,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 
 				D3D11_MAPPED_SUBRESOURCE mapped;
 				DX_CHECK(m_deviceCtx->Map(texture, 0, D3D11_MAP_READ, 0, &mapped) );
-				imageSwizzleBgra8(
+				bimg::imageSwizzleBgra8(
 					  mapped.pData
 					, backBufferDesc.Width
 					, backBufferDesc.Height
@@ -2263,9 +2292,15 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 			capturePostReset();
 		}
 
+		bool isDeviceRemoved() BX_OVERRIDE
+		{
+			return m_lost; // m_lost is assigned from isLost(hr) in flip(); submit() returns early while it is true
+		}
+
 		void flip(HMD& _hmd) BX_OVERRIDE
 		{
-			if (NULL != m_swapChain)
+			if (NULL != m_swapChain
+			&&  !m_lost)
 			{
 				HRESULT hr = S_OK;
 				uint32_t syncInterval = BX_ENABLED(!BX_PLATFORM_WINDOWS)
@@ -2298,15 +2333,14 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 					}
 				}
 
-				if (isLost(hr) )
-				{
-					++m_lost;
-					BGFX_FATAL(10 > m_lost, bgfx::Fatal::DeviceLost, "Device is lost. FAILED 0x%08x", hr);
-				}
-				else
-				{
-					m_lost = 0;
-				}
+				m_lost = isLost(hr);
+				BGFX_FATAL(!m_lost
+					, bgfx::Fatal::DeviceLost
+					, "Device is lost. FAILED 0x%08x %s (%s)"
+					, hr
+					, getLostReason(hr)
+					, DXGI_ERROR_DEVICE_REMOVED == hr ? getLostReason(m_device->GetDeviceRemovedReason() ) : "no info"
+					);
 			}
 		}
 
@@ -3294,7 +3328,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 				D3D11_MAPPED_SUBRESOURCE mapped;
 				DX_CHECK(m_deviceCtx->Map(m_captureTexture, 0, D3D11_MAP_READ, 0, &mapped) );
 
-				imageSwizzleBgra8(
+				bimg::imageSwizzleBgra8(
 					  mapped.pData
 					, getBufferWidth()
 					, getBufferHeight()
@@ -3551,7 +3585,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 #endif // BX_PLATFORM_WINDOWS
 
 		bool m_needPresent;
-		uint16_t m_lost;
+		bool m_lost;
 		uint16_t m_numWindows;
 		FrameBufferHandle m_windows[BGFX_CONFIG_MAX_FRAME_BUFFERS];
 
@@ -4319,14 +4353,14 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 
 	void TextureD3D11::create(const Memory* _mem, uint32_t _flags, uint8_t _skip)
 	{
-		ImageContainer imageContainer;
+		bimg::ImageContainer imageContainer;
 
-		if (imageParse(imageContainer, _mem->data, _mem->size) )
+		if (bimg::imageParse(imageContainer, _mem->data, _mem->size) )
 		{
 			uint8_t numMips = imageContainer.m_numMips;
 			const uint8_t startLod = uint8_t(bx::uint32_min(_skip, numMips-1) );
 			numMips -= startLod;
-			const ImageBlockInfo& blockInfo = getBlockInfo(TextureFormat::Enum(imageContainer.m_format) );
+			const bimg::ImageBlockInfo& blockInfo = bimg::getBlockInfo(bimg::TextureFormat::Enum(imageContainer.m_format) );
 			const uint32_t textureWidth  = bx::uint32_max(blockInfo.blockWidth,  imageContainer.m_width >>startLod);
 			const uint32_t textureHeight = bx::uint32_max(blockInfo.blockHeight, imageContainer.m_height>>startLod);
 			const uint16_t numLayers     = imageContainer.m_numLayers;
@@ -4338,7 +4372,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 			m_requestedFormat  = uint8_t(imageContainer.m_format);
 			m_textureFormat    = uint8_t(getViableTextureFormat(imageContainer) );
 			const bool convert = m_textureFormat != m_requestedFormat;
-			const uint8_t bpp = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+			const uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 
 			if (imageContainer.m_cubeMap)
 			{
@@ -4361,7 +4395,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 
 			uint32_t kk = 0;
 
-			const bool compressed = isCompressed(TextureFormat::Enum(m_textureFormat) );
+			const bool compressed = bimg::isCompressed(bimg::TextureFormat::Enum(m_textureFormat) );
 			const bool swizzle    = TextureFormat::BGRA8 == m_textureFormat && 0 != (m_flags&BGFX_TEXTURE_COMPUTE_WRITE);
 
 			BX_TRACE("Texture %3d: %s (requested: %s), layers %d, %dx%d%s%s%s."
@@ -4388,8 +4422,8 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 					height = bx::uint32_max(1, height);
 					depth  = bx::uint32_max(1, depth);
 
-					ImageMip mip;
-					if (imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
+					bimg::ImageMip mip;
+					if (bimg::imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
 					{
 						srd[kk].pSysMem = mip.m_data;
 
@@ -4397,7 +4431,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 						{
 							uint32_t srcpitch = mip.m_width*bpp/8;
 							uint8_t* temp = (uint8_t*)BX_ALLOC(g_allocator, mip.m_width*mip.m_height*bpp/8);
-							imageDecodeToBgra8(temp, mip.m_data, mip.m_width, mip.m_height, srcpitch, mip.m_format);
+							bimg::imageDecodeToBgra8(temp, mip.m_data, mip.m_width, mip.m_height, srcpitch, mip.m_format);
 
 							srd[kk].pSysMem = temp;
 							srd[kk].SysMemPitch = srcpitch;
@@ -4486,7 +4520,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 					desc.CPUAccessFlags = 0;
 					desc.MiscFlags      = 0;
 
-					if (isDepth( (TextureFormat::Enum)m_textureFormat) )
+					if (bimg::isDepth(bimg::TextureFormat::Enum(m_textureFormat) ) )
 					{
 						desc.BindFlags |= D3D11_BIND_DEPTH_STENCIL;
 						desc.Usage = D3D11_USAGE_DEFAULT;
@@ -4684,7 +4718,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 		}
 
 		const uint32_t subres = _mip + ( (layer + _side) * m_numMips);
-		const uint32_t bpp    = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+		const uint32_t bpp    = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 		const uint32_t rectpitch  = _rect.m_width*bpp/8;
 		const uint32_t srcpitch   = UINT16_MAX == _pitch ? rectpitch : _pitch;
 		const uint32_t slicepitch = rectpitch*_rect.m_height;
@@ -4697,7 +4731,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 		if (convert)
 		{
 			temp = (uint8_t*)BX_ALLOC(g_allocator, slicepitch);
-			imageDecodeToBgra8(temp, data, _rect.m_width, _rect.m_height, srcpitch, TextureFormat::Enum(m_requestedFormat) );
+			bimg::imageDecodeToBgra8(temp, data, _rect.m_width, _rect.m_height, srcpitch, bimg::TextureFormat::Enum(m_requestedFormat) );
 			data = temp;
 		}
 
@@ -4793,7 +4827,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 		DX_CHECK(device->CreateRenderTargetView(ptr, NULL, &m_rtv[0]) );
 		DX_RELEASE(ptr, 0);
 
-		DXGI_FORMAT fmtDsv = isDepth(_depthFormat)
+		DXGI_FORMAT fmtDsv = bimg::isDepth(bimg::TextureFormat::Enum(_depthFormat) )
 			? s_textureFormat[_depthFormat].m_fmtDsv
 			: DXGI_FORMAT_D24_UNORM_S8_UINT
 			;
@@ -4893,7 +4927,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 					const uint32_t msaaQuality = bx::uint32_satsub( (texture.m_flags&BGFX_TEXTURE_RT_MSAA_MASK)>>BGFX_TEXTURE_RT_MSAA_SHIFT, 1);
 					const DXGI_SAMPLE_DESC& msaa = s_msaa[msaaQuality];
 
-					if (isDepth( (TextureFormat::Enum)texture.m_textureFormat) )
+					if (bimg::isDepth(bimg::TextureFormat::Enum(texture.m_textureFormat) ) )
 					{
 						BX_CHECK(NULL == m_dsv, "Frame buffer already has depth-stencil attached.");
 
@@ -5273,7 +5307,8 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 
 	void RendererContextD3D11::submit(Frame* _render, ClearQuad& _clearQuad, TextVideoMemBlitter& _textVideoMemBlitter)
 	{
-		if (updateResolution(_render->m_resolution) )
+		if (m_lost
+		||  updateResolution(_render->m_resolution) )
 		{
 			return;
 		}
@@ -5315,6 +5350,9 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 		currentState.m_stateFlags = BGFX_STATE_NONE;
 		currentState.m_stencil = packStencil(BGFX_STENCIL_NONE, BGFX_STENCIL_NONE);
 
+		RenderBind currentBind;
+		currentBind.clear();
+
 		_render->m_hmdInitialized = m_ovr.isInitialized();
 
 		const bool hmdEnabled = m_ovr.isEnabled();
@@ -5378,7 +5416,9 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 					|| item == numItems
 					;
 
-				const RenderItem& renderItem = _render->m_renderItem[_render->m_sortValues[item] ];
+				const uint32_t itemIdx       = _render->m_sortValues[item];
+				const RenderItem& renderItem = _render->m_renderItem[itemIdx];
+				const RenderBind& renderBind = _render->m_renderItemBind[itemIdx];
 				++item;
 
 				if (viewChanged)
@@ -5525,7 +5565,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 						}
 						else
 						{
-							bool depthStencil = isDepth(TextureFormat::Enum(src.m_textureFormat) );
+							bool depthStencil = bimg::isDepth(bimg::TextureFormat::Enum(src.m_textureFormat) );
 							BX_CHECK(!depthStencil
 								||  (width == src.m_width && height == src.m_height)
 								, "When blitting depthstencil surface, source resolution must match destination."
@@ -5639,7 +5679,7 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 
 					for (uint32_t ii = 0; ii < BGFX_MAX_COMPUTE_BINDINGS; ++ii)
 					{
-						const Binding& bind = compute.m_bind[ii];
+						const Binding& bind = renderBind.m_bind[ii];
 						if (invalidHandle != bind.m_idx)
 						{
 							switch (bind.m_type)
@@ -5764,6 +5804,8 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 					currentState.m_stateFlags = newFlags;
 					currentState.m_stencil    = newStencil;
 
+					currentBind.clear();
+
 					setBlendState(newFlags);
 					setDepthStencilState(newFlags, packStencil(BGFX_STENCIL_DEFAULT, BGFX_STENCIL_DEFAULT) );
 
@@ -5935,16 +5977,36 @@ BX_PRAGMA_DIAGNOSTIC_POP();
 					uint32_t changes = 0;
 					for (uint8_t stage = 0; stage < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++stage)
 					{
-						const Binding& bind = draw.m_bind[stage];
-						Binding& current = currentState.m_bind[stage];
+						const Binding& bind = renderBind.m_bind[stage];
+						Binding& current = currentBind.m_bind[stage];
 						if (current.m_idx != bind.m_idx
+						||  current.m_type != bind.m_type
 						||  current.m_un.m_draw.m_textureFlags != bind.m_un.m_draw.m_textureFlags
 						||  programChanged)
 						{
 							if (invalidHandle != bind.m_idx)
 							{
-								TextureD3D11& texture = m_textures[bind.m_idx];
-								texture.commit(stage, bind.m_un.m_draw.m_textureFlags, _render->m_colorPalette);
+								switch (bind.m_type)
+								{
+								case Binding::Texture:
+									{
+										TextureD3D11& texture = m_textures[bind.m_idx];
+										texture.commit(stage, bind.m_un.m_draw.m_textureFlags, _render->m_colorPalette);
+									}
+									break;
+
+								case Binding::IndexBuffer:
+								case Binding::VertexBuffer:
+									{
+										const BufferD3D11& buffer = Binding::IndexBuffer == bind.m_type
+											? m_indexBuffers[bind.m_idx]
+											: m_vertexBuffers[bind.m_idx]
+											;
+										m_textureStage.m_srv[stage] = buffer.m_srv;
+										m_textureStage.m_sampler[stage] = NULL;
+									}
+									break;
+								}
 							}
 							else
 							{
diff --git a/3rdparty/bgfx/src/renderer_d3d12.cpp b/3rdparty/bgfx/src/renderer_d3d12.cpp
index b5c30cb..c7f7cca 100644
--- a/3rdparty/bgfx/src/renderer_d3d12.cpp
+++ b/3rdparty/bgfx/src/renderer_d3d12.cpp
@@ -432,6 +432,46 @@ namespace bgfx { namespace d3d12
 		return createCommittedResource(_device, _heapProperty, &resourceDesc, NULL);
 	}
 
+	inline bool isLost(HRESULT _hr) // true when _hr equals one of the five DXGI error codes listed below
+	{
+		return false // leading 'false' only lets every real condition start with '||'
+			|| _hr == DXGI_ERROR_DEVICE_REMOVED
+			|| _hr == DXGI_ERROR_DEVICE_HUNG
+			|| _hr == DXGI_ERROR_DEVICE_RESET
+			|| _hr == DXGI_ERROR_DRIVER_INTERNAL_ERROR
+			|| _hr == DXGI_ERROR_NOT_CURRENTLY_AVAILABLE
+			;
+	}
+
+	static const char* getLostReason(HRESULT _hr) // maps _hr to its symbolic name as a string literal (used in the "Device is lost" message)
+	{
+		switch (_hr)
+		{
+		// The GPU device instance has been suspended. Use GetDeviceRemovedReason to determine the appropriate action.
+		case DXGI_ERROR_DEVICE_REMOVED: return "DXGI_ERROR_DEVICE_REMOVED";
+
+		// The GPU will not respond to more commands, most likely because of an invalid command passed by the calling application.
+		case DXGI_ERROR_DEVICE_HUNG: return "DXGI_ERROR_DEVICE_HUNG";
+
+		// The GPU will not respond to more commands, most likely because some other application submitted invalid commands.
+		// The calling application should re-create the device and continue.
+		case DXGI_ERROR_DEVICE_RESET: return "DXGI_ERROR_DEVICE_RESET";
+
+		// An internal issue prevented the driver from carrying out the specified operation. The driver's state is probably
+		// suspect, and the application should not continue.
+		case DXGI_ERROR_DRIVER_INTERNAL_ERROR: return "DXGI_ERROR_DRIVER_INTERNAL_ERROR";
+
+		// A resource is not available at the time of the call, but may become available later.
+		case DXGI_ERROR_NOT_CURRENTLY_AVAILABLE: return "DXGI_ERROR_NOT_CURRENTLY_AVAILABLE";
+
+		case S_OK: return "S_OK"; // success code, named here rather than reported as unknown
+
+		default: break; // every other value falls through to the generic string below
+		}
+
+		return "Unknown HRESULT?"; // _hr matched none of the cases above
+	}
+
 	BX_NO_INLINE void setDebugObjectName(ID3D12Object* _object, const char* _format, ...)
 	{
 		if (BX_ENABLED(BGFX_CONFIG_DEBUG_OBJECT_NAME) )
@@ -467,6 +507,7 @@ namespace bgfx { namespace d3d12
 			, m_renderdocdll(NULL)
 			, m_featureLevel(D3D_FEATURE_LEVEL(0) )
 			, m_wireframe(false)
+			, m_lost(false)
 			, m_maxAnisotropy(1)
 			, m_depthClamp(false)
 			, m_fsChanges(0)
@@ -629,7 +670,7 @@ namespace bgfx { namespace d3d12
 						}
 
 						if (BX_ENABLED(BGFX_CONFIG_DEBUG_PERFHUD)
-						&&  0 != strstr(description, "PerfHUD") )
+						&&  0 != bx::strnstr(description, "PerfHUD") )
 						{
 							m_adapter = adapter;
 							m_driverType = D3D_DRIVER_TYPE_REFERENCE;
@@ -1014,7 +1055,7 @@ namespace bgfx { namespace d3d12
 				{
 					uint16_t support = BGFX_CAPS_FORMAT_TEXTURE_NONE;
 
-					const DXGI_FORMAT fmt = isDepth(TextureFormat::Enum(ii) )
+					const DXGI_FORMAT fmt = bimg::isDepth(bimg::TextureFormat::Enum(ii) )
 						? s_textureFormat[ii].m_fmtDsv
 						: s_textureFormat[ii].m_fmt
 						;
@@ -1260,19 +1301,15 @@ namespace bgfx { namespace d3d12
 			return BGFX_RENDERER_DIRECT3D12_NAME;
 		}
 
-		static bool isLost(HRESULT _hr)
+		bool isDeviceRemoved() BX_OVERRIDE
 		{
-			return DXGI_ERROR_DEVICE_REMOVED == _hr
-				|| DXGI_ERROR_DEVICE_HUNG == _hr
-				|| DXGI_ERROR_DEVICE_RESET == _hr
-				|| DXGI_ERROR_DRIVER_INTERNAL_ERROR == _hr
-				|| DXGI_ERROR_NOT_CURRENTLY_AVAILABLE == _hr
-				;
+			return m_lost; // m_lost is assigned from isLost(hr) in flip(); submit() returns early while it is true
 		}
 
 		void flip(HMD& /*_hmd*/) BX_OVERRIDE
 		{
-			if (NULL != m_swapChain)
+			if (NULL != m_swapChain
+			&&  !m_lost)
 			{
 				int64_t start = bx::getHPCounter();
 
@@ -1293,16 +1330,14 @@ namespace bgfx { namespace d3d12
 				int64_t now = bx::getHPCounter();
 				m_presentElapsed = now - start;
 
-				if (FAILED(hr)
-				&&  isLost(hr) )
-				{
-					++m_lost;
-					BGFX_FATAL(10 > m_lost, bgfx::Fatal::DeviceLost, "Device is lost. FAILED 0x%08x", hr);
-				}
-				else
-				{
-					m_lost = 0;
-				}
+				m_lost = isLost(hr);
+				BGFX_FATAL(!m_lost
+					, bgfx::Fatal::DeviceLost
+					, "Device is lost. FAILED 0x%08x %s (%s)"
+					, hr
+					, getLostReason(hr)
+					, DXGI_ERROR_DEVICE_REMOVED == hr ? getLostReason(m_device->GetDeviceRemovedReason() ) : "no info"
+					);
 			}
 		}
 
@@ -1448,7 +1483,7 @@ namespace bgfx { namespace d3d12
 			uint8_t* src;
 			readback->Map(0, NULL, (void**)&src);
 
-			const uint8_t bpp = getBitsPerPixel(TextureFormat::Enum(texture.m_textureFormat) );
+			const uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(texture.m_textureFormat) );
 			uint8_t* dst      = (uint8_t*)_data;
 			uint32_t dstPitch = srcWidth*bpp/8;
 
@@ -1606,7 +1641,7 @@ namespace bgfx { namespace d3d12
 
 			void* data;
 			readback->Map(0, NULL, (void**)&data);
-			imageSwizzleBgra8(
+			bimg::imageSwizzleBgra8(
 				  data
 				, width
 				, height
@@ -1864,7 +1899,7 @@ data.NumQualityLevels = 0;
 			}
 		}
 
-		void updateResolution(const Resolution& _resolution)
+		bool updateResolution(const Resolution& _resolution)
 		{
 			if (!!(_resolution.m_flags & BGFX_RESET_MAXANISOTROPY) )
 			{
@@ -1946,6 +1981,8 @@ data.NumQualityLevels = 0;
 
 				postReset();
 			}
+
+			return false;
 		}
 
 		void setShaderUniform(uint8_t _flags, uint32_t _regIndex, const void* _val, uint32_t _numRegs)
@@ -2831,7 +2868,6 @@ data.NumQualityLevels = 0;
 #endif // BX_PLATFORM_WINDOWS
 
 		int64_t m_presentElapsed;
-		uint16_t m_lost;
 		uint16_t m_numWindows;
 		FrameBufferHandle m_windows[BGFX_CONFIG_MAX_FRAME_BUFFERS];
 
@@ -2860,6 +2896,7 @@ data.NumQualityLevels = 0;
 
 		Resolution m_resolution;
 		bool m_wireframe;
+		bool m_lost;
 
 #if BX_PLATFORM_WINDOWS
 		DXGI_SWAP_CHAIN_DESC m_scd;
@@ -3968,14 +4005,14 @@ data.NumQualityLevels = 0;
 
 	void TextureD3D12::create(const Memory* _mem, uint32_t _flags, uint8_t _skip)
 	{
-		ImageContainer imageContainer;
+		bimg::ImageContainer imageContainer;
 
-		if (imageParse(imageContainer, _mem->data, _mem->size) )
+		if (bimg::imageParse(imageContainer, _mem->data, _mem->size) )
 		{
 			uint8_t numMips = imageContainer.m_numMips;
 			const uint8_t startLod = uint8_t(bx::uint32_min(_skip, numMips-1) );
 			numMips -= startLod;
-			const ImageBlockInfo& blockInfo = getBlockInfo(TextureFormat::Enum(imageContainer.m_format) );
+			const bimg::ImageBlockInfo& blockInfo = bimg::getBlockInfo(imageContainer.m_format);
 			const uint32_t textureWidth  = bx::uint32_max(blockInfo.blockWidth,  imageContainer.m_width >>startLod);
 			const uint32_t textureHeight = bx::uint32_max(blockInfo.blockHeight, imageContainer.m_height>>startLod);
 			const uint16_t numLayers     = imageContainer.m_numLayers;
@@ -3987,7 +4024,7 @@ data.NumQualityLevels = 0;
 			m_requestedFormat  = uint8_t(imageContainer.m_format);
 			m_textureFormat    = uint8_t(getViableTextureFormat(imageContainer) );
 			const bool convert = m_textureFormat != m_requestedFormat;
-			const uint8_t bpp = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+			const uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 
 			if (imageContainer.m_cubeMap)
 			{
@@ -4009,7 +4046,7 @@ data.NumQualityLevels = 0;
 
 			uint32_t kk = 0;
 
-			const bool compressed = isCompressed(TextureFormat::Enum(m_textureFormat) );
+			const bool compressed = bimg::isCompressed(bimg::TextureFormat::Enum(m_textureFormat) );
 			const bool swizzle    = TextureFormat::BGRA8 == m_textureFormat && 0 != (m_flags&BGFX_TEXTURE_COMPUTE_WRITE);
 			uint32_t blockWidth   = 1;
 			uint32_t blockHeight  = 1;
@@ -4051,8 +4088,8 @@ data.NumQualityLevels = 0;
 					height = bx::uint32_max(blockHeight, height);
 					depth  = bx::uint32_max(1, depth);
 
-					ImageMip mip;
-					if (imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
+					bimg::ImageMip mip;
+					if (bimg::imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
 					{
 						if (convert)
 						{
@@ -4060,7 +4097,7 @@ data.NumQualityLevels = 0;
 							const uint32_t slice = bx::strideAlign(pitch * height, D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT);
 
 							uint8_t* temp = (uint8_t*)BX_ALLOC(g_allocator, slice);
-							imageDecodeToBgra8(temp
+							bimg::imageDecodeToBgra8(temp
 									, mip.m_data
 									, mip.m_width
 									, mip.m_height
@@ -4078,7 +4115,7 @@ data.NumQualityLevels = 0;
 							uint32_t slice = bx::strideAlign( (mip.m_height/blockInfo.blockHeight)*pitch,           D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT);
 
 							uint8_t* temp = (uint8_t*)BX_ALLOC(g_allocator, slice);
-							imageCopy(temp
+							bimg::imageCopy(temp
 									, mip.m_height/blockInfo.blockHeight
 									, (mip.m_width /blockInfo.blockWidth )*mip.m_blockSize
 									, mip.m_data
@@ -4096,7 +4133,7 @@ data.NumQualityLevels = 0;
 							const uint32_t slice = bx::strideAlign(pitch * mip.m_height,      D3D12_TEXTURE_DATA_PLACEMENT_ALIGNMENT);
 
 							uint8_t* temp = (uint8_t*)BX_ALLOC(g_allocator, slice);
-							imageCopy(temp
+							bimg::imageCopy(temp
 									, mip.m_height
 									, mip.m_width*mip.m_bpp / 8
 									, mip.m_data
@@ -4164,7 +4201,7 @@ data.NumQualityLevels = 0;
 			D3D12_RESOURCE_STATES state = D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE;
 
 			D3D12_CLEAR_VALUE* clearValue = NULL;
-			if (isDepth(TextureFormat::Enum(m_textureFormat) ) )
+			if (bimg::isDepth(bimg::TextureFormat::Enum(m_textureFormat) ) )
 			{
 				resourceDesc.Format = s_textureFormat[m_textureFormat].m_fmt;
 				resourceDesc.Flags |= D3D12_RESOURCE_FLAG_ALLOW_DEPTH_STENCIL;
@@ -4271,7 +4308,7 @@ data.NumQualityLevels = 0;
 
 			case Texture3D:
 				resourceDesc.Dimension        = D3D12_RESOURCE_DIMENSION_TEXTURE3D;
-				resourceDesc.DepthOrArraySize = m_depth;
+				resourceDesc.DepthOrArraySize = uint16_t(m_depth);
 				m_srvd.ViewDimension                 = D3D12_SRV_DIMENSION_TEXTURE3D;
 				m_srvd.Texture3D.MostDetailedMip     = 0;
 				m_srvd.Texture3D.MipLevels           = numMips;
@@ -4364,7 +4401,7 @@ data.NumQualityLevels = 0;
 		setState(_commandList, D3D12_RESOURCE_STATE_COPY_DEST);
 
 		const uint32_t subres = _mip + (_side * m_numMips);
-		const uint32_t bpp    = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+		const uint32_t bpp    = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 		const uint32_t rectpitch = _rect.m_width*bpp/8;
 		const uint32_t srcpitch  = UINT16_MAX == _pitch ? rectpitch : _pitch;
 
@@ -4481,7 +4518,7 @@ data.NumQualityLevels = 0;
 						m_height = uint32_t(desc.Height);
 					}
 
-					if (isDepth( (TextureFormat::Enum)texture.m_textureFormat) )
+					if (bimg::isDepth(bimg::TextureFormat::Enum(texture.m_textureFormat) ) )
 					{
 						BX_CHECK(!isValid(m_depth), "");
 						m_depth = handle;
@@ -4489,7 +4526,7 @@ data.NumQualityLevels = 0;
 						uint32_t dsvDescriptorSize = device->GetDescriptorHandleIncrementSize(D3D12_DESCRIPTOR_HEAP_TYPE_DSV);
 						dsvDescriptor.ptr += (1 + fbhIdx) * dsvDescriptorSize;
 
-						const ImageBlockInfo& blockInfo = getBlockInfo(TextureFormat::Enum(texture.m_textureFormat) );
+						const bimg::ImageBlockInfo& blockInfo = bimg::getBlockInfo(bimg::TextureFormat::Enum(texture.m_textureFormat) );
 						BX_UNUSED(blockInfo);
 
 						D3D12_DEPTH_STENCIL_VIEW_DESC dsvDesc;
@@ -4779,7 +4816,11 @@ data.NumQualityLevels = 0;
 	{
 //		PIX_BEGINEVENT(D3DCOLOR_FRAME, L"rendererSubmit");
 
-		updateResolution(_render->m_resolution);
+		if (m_lost
+		||  updateResolution(_render->m_resolution) )
+		{
+			return;
+		}
 
 		int64_t elapsed = -bx::getHPCounter();
 		int64_t captureElapsed = 0;
@@ -4805,6 +4846,9 @@ data.NumQualityLevels = 0;
 		currentState.m_stateFlags = BGFX_STATE_NONE;
 		currentState.m_stencil    = packStencil(BGFX_STENCIL_NONE, BGFX_STENCIL_NONE);
 
+		RenderBind currentBind;
+		currentBind.clear();
+
 		_render->m_hmdInitialized = false;
 
 		const bool hmdEnabled = false;
@@ -4890,7 +4934,9 @@ data.NumQualityLevels = 0;
 					|| item == numItems
 					;
 
-				const RenderItem& renderItem = _render->m_renderItem[_render->m_sortValues[item] ];
+				const uint32_t itemIdx       = _render->m_sortValues[item];
+				const RenderItem& renderItem = _render->m_renderItem[itemIdx];
+				const RenderBind& renderBind = _render->m_renderItemBind[itemIdx];
 				++item;
 
 				if (viewChanged)
@@ -5007,7 +5053,7 @@ data.NumQualityLevels = 0;
 							srcLocation.Type      = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
 							srcLocation.SubresourceIndex = srcZ*src.m_numMips+blit.m_srcMip;
 
-							bool depthStencil = isDepth(TextureFormat::Enum(src.m_textureFormat) );
+							bool depthStencil = bimg::isDepth(bimg::TextureFormat::Enum(src.m_textureFormat) );
 							m_commandList->CopyTextureRegion(&dstLocation
 								, blit.m_dstX
 								, blit.m_dstY
@@ -5043,7 +5089,7 @@ data.NumQualityLevels = 0;
 						currentBindHash = 0;
 					}
 
-					uint32_t bindHash = bx::hashMurmur2A(compute.m_bind, sizeof(compute.m_bind) );
+					uint32_t bindHash = bx::hashMurmur2A(renderBind.m_bind, sizeof(renderBind.m_bind) );
 					if (currentBindHash != bindHash)
 					{
 						currentBindHash  = bindHash;
@@ -5056,7 +5102,7 @@ data.NumQualityLevels = 0;
 
 							for (uint32_t ii = 0; ii < BGFX_MAX_COMPUTE_BINDINGS; ++ii)
 							{
-								const Binding& bind = compute.m_bind[ii];
+								const Binding& bind = renderBind.m_bind[ii];
 								if (invalidHandle != bind.m_idx)
 								{
 									switch (bind.m_type)
@@ -5243,6 +5289,8 @@ data.NumQualityLevels = 0;
 					currentState.m_stateFlags = newFlags;
 					currentState.m_stencil    = newStencil;
 
+					currentBind.clear();
+
 					const uint64_t pt = newFlags&BGFX_STATE_PT_MASK;
 					primIndex = uint8_t(pt>>BGFX_STATE_PT_SHIFT);
 				}
@@ -5271,7 +5319,7 @@ data.NumQualityLevels = 0;
 							);
 
 					uint16_t scissor = draw.m_scissor;
-					uint32_t bindHash = bx::hashMurmur2A(draw.m_bind, sizeof(draw.m_bind) );
+					uint32_t bindHash = bx::hashMurmur2A(renderBind.m_bind, sizeof(renderBind.m_bind) );
 					if (currentBindHash != bindHash
 					||  0 != changedStencil
 					|| (hasFactor && blendFactor != draw.m_rgba)
@@ -5297,17 +5345,47 @@ data.NumQualityLevels = 0;
 								srvHandle[0].ptr = 0;
 								for (uint32_t stage = 0; stage < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++stage)
 								{
-									const Binding& bind = draw.m_bind[stage];
+									const Binding& bind = renderBind.m_bind[stage];
 									if (invalidHandle != bind.m_idx)
 									{
-										TextureD3D12& texture = m_textures[bind.m_idx];
-										texture.setState(m_commandList, D3D12_RESOURCE_STATE_GENERIC_READ);
-										scratchBuffer.allocSrv(srvHandle[stage], texture);
-										samplerFlags[stage] = (0 == (BGFX_TEXTURE_INTERNAL_DEFAULT_SAMPLER & bind.m_un.m_draw.m_textureFlags)
-											? bind.m_un.m_draw.m_textureFlags
-											: texture.m_flags
-											) & (BGFX_TEXTURE_SAMPLER_BITS_MASK|BGFX_TEXTURE_BORDER_COLOR_MASK)
-											;
+										switch (bind.m_type)
+										{
+										case Binding::Texture:
+											{
+												TextureD3D12& texture = m_textures[bind.m_idx];
+												texture.setState(m_commandList, D3D12_RESOURCE_STATE_GENERIC_READ);
+												scratchBuffer.allocSrv(srvHandle[stage], texture);
+												samplerFlags[stage] = (0 == (BGFX_TEXTURE_INTERNAL_DEFAULT_SAMPLER & bind.m_un.m_draw.m_textureFlags)
+													? bind.m_un.m_draw.m_textureFlags
+													: texture.m_flags
+													) & (BGFX_TEXTURE_SAMPLER_BITS_MASK | BGFX_TEXTURE_BORDER_COLOR_MASK)
+													;
+											}
+											break;
+
+										case Binding::IndexBuffer:
+										case Binding::VertexBuffer:
+											{
+												BufferD3D12& buffer = Binding::IndexBuffer == bind.m_type
+													? m_indexBuffers[bind.m_idx]
+													: m_vertexBuffers[bind.m_idx]
+													;
+
+												if (Access::Read != bind.m_un.m_compute.m_access)
+												{
+													// The api functions prevent binding with Access::Write,
+													// but might as well allow it in here for future-proofing
+													buffer.setState(m_commandList, D3D12_RESOURCE_STATE_UNORDERED_ACCESS);
+													scratchBuffer.allocUav(srvHandle[stage], buffer);
+												}
+												else
+												{
+													buffer.setState(m_commandList, D3D12_RESOURCE_STATE_GENERIC_READ);
+													scratchBuffer.allocSrv(srvHandle[stage], buffer);
+												}
+											}
+											break;
+										}
 									}
 									else
 									{
diff --git a/3rdparty/bgfx/src/renderer_d3d9.cpp b/3rdparty/bgfx/src/renderer_d3d9.cpp
index 4f4267b..7a1c8c7 100644
--- a/3rdparty/bgfx/src/renderer_d3d9.cpp
+++ b/3rdparty/bgfx/src/renderer_d3d9.cpp
@@ -7,6 +7,7 @@
 
 #if BGFX_CONFIG_RENDERER_DIRECT3D9
 #	include "renderer_d3d9.h"
+#	include <bx/pixelformat.h>
 
 namespace bgfx { namespace d3d9
 {
@@ -463,7 +464,7 @@ namespace bgfx { namespace d3d9
 							}
 
 							if (BX_ENABLED(BGFX_CONFIG_DEBUG_PERFHUD)
-							&&  0 != strstr(desc.Description, "PerfHUD") )
+							&&  0 != bx::strnstr(desc.Description, "PerfHUD") )
 							{
 								m_adapter = ii;
 								m_deviceType = D3DDEVTYPE_REF;
@@ -700,7 +701,7 @@ namespace bgfx { namespace d3d9
 				support |= SUCCEEDED(m_d3d9->CheckDeviceFormat(m_adapter
 					, m_deviceType
 					, adapterFormat
-					, isDepth(TextureFormat::Enum(ii) ) ? D3DUSAGE_DEPTHSTENCIL : D3DUSAGE_RENDERTARGET
+					, bimg::isDepth(bimg::TextureFormat::Enum(ii) ) ? D3DUSAGE_DEPTHSTENCIL : D3DUSAGE_RENDERTARGET
 					, D3DRTYPE_TEXTURE
 					, s_textureFormat[ii].m_fmt
 					) ) ? BGFX_CAPS_FORMAT_TEXTURE_FRAMEBUFFER : BGFX_CAPS_FORMAT_TEXTURE_NONE;
@@ -716,7 +717,7 @@ namespace bgfx { namespace d3d9
 				support |= SUCCEEDED(m_d3d9->CheckDeviceFormat(m_adapter
 					, m_deviceType
 					, adapterFormat
-					, isDepth(TextureFormat::Enum(ii) ) ? D3DUSAGE_DEPTHSTENCIL : D3DUSAGE_RENDERTARGET
+					, bimg::isDepth(bimg::TextureFormat::Enum(ii) ) ? D3DUSAGE_DEPTHSTENCIL : D3DUSAGE_RENDERTARGET
 					, D3DRTYPE_TEXTURE
 					, s_textureFormat[ii].m_fmt
 					) ) ? BGFX_CAPS_FORMAT_TEXTURE_MIP_AUTOGEN : BGFX_CAPS_FORMAT_TEXTURE_NONE;
@@ -1015,7 +1016,7 @@ namespace bgfx { namespace d3d9
 			uint32_t srcPitch  = lockedRect.Pitch;
 			uint8_t* src       = (uint8_t*)lockedRect.pBits;
 
-			const uint8_t bpp = getBitsPerPixel(TextureFormat::Enum(texture.m_textureFormat) );
+			const uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(texture.m_textureFormat) );
 			uint8_t* dst      = (uint8_t*)_data;
 			uint32_t dstPitch = srcWidth*bpp/8;
 
@@ -1458,6 +1459,11 @@ namespace bgfx { namespace d3d9
 			m_flushQuery->GetData(NULL, 0, D3DGETDATA_FLUSH);
 		}
 
+		bool isDeviceRemoved() BX_OVERRIDE
+		{
+			return false; // unconditionally false: this override consults no device state
+		}
+
 		void flip(HMD& /*_hmd*/) BX_OVERRIDE
 		{
 			if (NULL != m_swapChain)
@@ -2504,7 +2510,7 @@ namespace bgfx { namespace d3d9
 	void TextureD3D9::createTexture(uint32_t _width, uint32_t _height, uint8_t _numMips)
 	{
 		m_type = Texture2D;
-		const TextureFormat::Enum fmt = (TextureFormat::Enum)m_textureFormat;
+		const bimg::TextureFormat::Enum fmt = (bimg::TextureFormat::Enum)m_textureFormat;
 
 		DWORD usage = 0;
 		D3DPOOL pool = D3DPOOL_DEFAULT;
@@ -2512,7 +2518,7 @@ namespace bgfx { namespace d3d9
 		const bool renderTarget = 0 != (m_flags&BGFX_TEXTURE_RT_MASK);
 		const bool blit         = 0 != (m_flags&BGFX_TEXTURE_BLIT_DST);
 		const bool readBack     = 0 != (m_flags&BGFX_TEXTURE_READ_BACK);
-		if (isDepth(fmt) )
+		if (bimg::isDepth(fmt) )
 		{
 			usage = D3DUSAGE_DEPTHSTENCIL;
 		}
@@ -2543,7 +2549,7 @@ namespace bgfx { namespace d3d9
 			{
 				const Msaa& msaa = s_msaa[msaaQuality];
 
-				if (isDepth(fmt) )
+				if (bimg::isDepth(fmt) )
 				{
 					DX_CHECK(device->CreateDepthStencilSurface(
 						  m_width
@@ -2615,7 +2621,7 @@ namespace bgfx { namespace d3d9
 			, _width
 			, _height
 			, _numMips
-			, getName(fmt)
+			, bimg::getName(fmt)
 			);
 	}
 
@@ -2667,13 +2673,13 @@ namespace bgfx { namespace d3d9
 	void TextureD3D9::createCubeTexture(uint32_t _width, uint8_t _numMips)
 	{
 		m_type = TextureCube;
-		const TextureFormat::Enum fmt = (TextureFormat::Enum)m_textureFormat;
+		const bimg::TextureFormat::Enum fmt = (bimg::TextureFormat::Enum)m_textureFormat;
 
 		DWORD usage = 0;
 
 		const bool renderTarget = 0 != (m_flags&BGFX_TEXTURE_RT_MASK);
 		const bool blit         = 0 != (m_flags&BGFX_TEXTURE_BLIT_DST);
-		if (isDepth(fmt) )
+		if (bimg::isDepth(fmt) )
 		{
 			usage = D3DUSAGE_DEPTHSTENCIL;
 		}
@@ -2891,14 +2897,14 @@ namespace bgfx { namespace d3d9
 
 	void TextureD3D9::create(const Memory* _mem, uint32_t _flags, uint8_t _skip)
 	{
-		ImageContainer imageContainer;
+		bimg::ImageContainer imageContainer;
 
-		if (imageParse(imageContainer, _mem->data, _mem->size) )
+		if (bimg::imageParse(imageContainer, _mem->data, _mem->size) )
 		{
 			uint8_t numMips = imageContainer.m_numMips;
 			const uint8_t startLod = uint8_t(bx::uint32_min(_skip, numMips-1) );
 			numMips -= startLod;
-			const ImageBlockInfo& blockInfo = getBlockInfo(TextureFormat::Enum(imageContainer.m_format) );
+			const bimg::ImageBlockInfo& blockInfo = bimg::getBlockInfo(bimg::TextureFormat::Enum(imageContainer.m_format) );
 			const uint32_t textureWidth  = bx::uint32_max(blockInfo.blockWidth,  imageContainer.m_width >>startLod);
 			const uint32_t textureHeight = bx::uint32_max(blockInfo.blockHeight, imageContainer.m_height>>startLod);
 
@@ -2911,7 +2917,7 @@ namespace bgfx { namespace d3d9
 			m_textureFormat   = uint8_t(getViableTextureFormat(imageContainer) );
 			const bool convert = m_textureFormat != m_requestedFormat;
 
-			uint8_t bpp = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+			uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 
 			if (imageContainer.m_cubeMap)
 			{
@@ -2950,8 +2956,8 @@ namespace bgfx { namespace d3d9
 			// bytes. If actual mip size is used it causes memory corruption.
 			// http://www.aras-p.info/texts/D3D9GPUHacks.html#3dc
 			const bool useMipSize = true
-							&& imageContainer.m_format != TextureFormat::BC4
-							&& imageContainer.m_format != TextureFormat::BC5
+							&& imageContainer.m_format != bimg::TextureFormat::BC4
+							&& imageContainer.m_format != bimg::TextureFormat::BC5
 							;
 
 			for (uint8_t side = 0, numSides = imageContainer.m_cubeMap ? 6 : 1; side < numSides; ++side)
@@ -2971,8 +2977,8 @@ namespace bgfx { namespace d3d9
 					mipHeight = bx::uint32_max(blockInfo.blockHeight, mipHeight);
 					uint32_t mipSize = width*height*depth*bpp/8;
 
-					ImageMip mip;
-					if (imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
+					bimg::ImageMip mip;
+					if (bimg::imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
 					{
 						uint32_t pitch;
 						uint32_t slicePitch;
@@ -2986,7 +2992,7 @@ namespace bgfx { namespace d3d9
 								uint32_t srcpitch = mipWidth*bpp/8;
 
 								uint8_t* temp = (uint8_t*)BX_ALLOC(g_allocator, srcpitch*mipHeight);
-								imageDecodeToBgra8(temp
+								bimg::imageDecodeToBgra8(temp
 										, mip.m_data
 										, mip.m_width
 										, mip.m_height
@@ -3000,7 +3006,7 @@ namespace bgfx { namespace d3d9
 							}
 							else
 							{
-								imageDecodeToBgra8(bits, mip.m_data, mip.m_width, mip.m_height, pitch, mip.m_format);
+								bimg::imageDecodeToBgra8(bits, mip.m_data, mip.m_width, mip.m_height, pitch, mip.m_format);
 							}
 						}
 						else
@@ -3009,11 +3015,11 @@ namespace bgfx { namespace d3d9
 							switch (m_textureFormat)
 							{
 							case TextureFormat::RGB5A1:
-								imageConvert(bits, 16, bx::packBgr5a1, mip.m_data, bx::unpackRgb5a1, size);
+								bimg::imageConvert(bits, 16, bx::packBgr5a1, mip.m_data, bx::unpackRgb5a1, size);
 								break;
 
 							case TextureFormat::RGBA4:
-								imageConvert(bits, 16, bx::packBgra4, mip.m_data, bx::unpackRgba4, size);
+								bimg::imageConvert(bits, 16, bx::packBgra4, mip.m_data, bx::unpackRgba4, size);
 								break;
 
 							default:
@@ -3045,7 +3051,7 @@ namespace bgfx { namespace d3d9
 
 	void TextureD3D9::update(uint8_t _side, uint8_t _mip, const Rect& _rect, uint16_t _z, uint16_t _depth, uint16_t _pitch, const Memory* _mem)
 	{
-		const uint32_t bpp = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+		const uint32_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 		const uint32_t rectpitch = _rect.m_width*bpp/8;
 		const uint32_t srcpitch  = UINT16_MAX == _pitch ? rectpitch : _pitch;
 		const uint32_t dstpitch  = s_renderD3D9->m_updateTexturePitch;
@@ -3059,7 +3065,7 @@ namespace bgfx { namespace d3d9
 		if (convert)
 		{
 			temp = (uint8_t*)BX_ALLOC(g_allocator, rectpitch*_rect.m_height);
-			imageDecodeToBgra8(temp, data, _rect.m_width, _rect.m_height, srcpitch, TextureFormat::Enum(m_requestedFormat) );
+			bimg::imageDecodeToBgra8(temp, data, _rect.m_width, _rect.m_height, srcpitch, bimg::TextureFormat::Enum(m_requestedFormat) );
 			data = temp;
 		}
 
@@ -3071,11 +3077,11 @@ namespace bgfx { namespace d3d9
 				switch (m_textureFormat)
 				{
 				case TextureFormat::RGB5A1:
-					imageConvert(dst, 16, bx::packBgr5a1, src, bx::unpackRgb5a1, rectpitch);
+					bimg::imageConvert(dst, 16, bx::packBgr5a1, src, bx::unpackRgb5a1, rectpitch);
 					break;
 
 				case TextureFormat::RGBA4:
-					imageConvert(dst, 16, bx::packBgra4, src, bx::unpackRgba4, rectpitch);
+					bimg::imageConvert(dst, 16, bx::packBgra4, src, bx::unpackRgba4, rectpitch);
 					break;
 
 				default:
@@ -3211,7 +3217,7 @@ namespace bgfx { namespace d3d9
 					m_height = texture.m_height;
 				}
 
-				if (isDepth( (TextureFormat::Enum)texture.m_textureFormat) )
+				if (bimg::isDepth(bimg::TextureFormat::Enum(texture.m_textureFormat) ) )
 				{
 					m_dsIdx = uint8_t(ii);
 				}
@@ -3640,6 +3646,9 @@ namespace bgfx { namespace d3d9
 		currentState.m_stateFlags = BGFX_STATE_NONE;
 		currentState.m_stencil    = packStencil(BGFX_STENCIL_NONE, BGFX_STENCIL_NONE);
 
+		RenderBind currentBind;
+		currentBind.clear();
+
 		ViewState viewState(_render, false);
 
 		DX_CHECK(device->SetRenderState(D3DRS_FILLMODE, _render->m_debug&BGFX_DEBUG_WIREFRAME ? D3DFILL_WIREFRAME : D3DFILL_SOLID) );
@@ -3692,7 +3701,9 @@ namespace bgfx { namespace d3d9
 					continue;
 				}
 
-				const RenderDraw& draw = _render->m_renderItem[_render->m_sortValues[item] ].draw;
+				const uint32_t itemIdx = _render->m_sortValues[item];
+				const RenderDraw& draw = _render->m_renderItem[itemIdx].draw;
+				const RenderBind& renderBind = _render->m_renderItemBind[itemIdx];
 
 				const bool hasOcclusionQuery = 0 != (draw.m_stateFlags & BGFX_STATE_INTERNAL_OCCLUSION_QUERY);
 				if (isValid(draw.m_occlusionQuery)
@@ -3801,7 +3812,7 @@ namespace bgfx { namespace d3d9
 						//
 						// GetRenderTargetData (dst must be SYSTEMMEM)
 
-						bool depth = isDepth(TextureFormat::Enum(src.m_textureFormat) );
+						bool depth = bimg::isDepth(bimg::TextureFormat::Enum(src.m_textureFormat) );
 						HRESULT hr = m_device->StretchRect(srcSurface
 							, depth ? NULL : &srcRect
 							, dstSurface
@@ -4092,8 +4103,9 @@ namespace bgfx { namespace d3d9
 				{
 					for (uint8_t stage = 0; stage < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++stage)
 					{
-						const Binding& bind = draw.m_bind[stage];
-						Binding& current = currentState.m_bind[stage];
+						const Binding& bind = renderBind.m_bind[stage];
+						Binding& current = currentBind.m_bind[stage];
+
 						if (current.m_idx != bind.m_idx
 						||  current.m_un.m_draw.m_textureFlags != bind.m_un.m_draw.m_textureFlags
 						||  programChanged)
diff --git a/3rdparty/bgfx/src/renderer_gl.cpp b/3rdparty/bgfx/src/renderer_gl.cpp
index 8f4903a..da123ff 100644
--- a/3rdparty/bgfx/src/renderer_gl.cpp
+++ b/3rdparty/bgfx/src/renderer_gl.cpp
@@ -1153,10 +1153,10 @@ namespace bgfx { namespace gl
 			: tfi.m_internalFmt
 			;
 
-		GLsizei size = (16*16*getBitsPerPixel(_format) )/8;
+		GLsizei size = (16*16*bimg::getBitsPerPixel(bimg::TextureFormat::Enum(_format) ) )/8;
 		void* data = NULL;
 
-		if (isDepth(_format) )
+		if (bimg::isDepth(bimg::TextureFormat::Enum(_format) ) )
 		{
 			_srgb    = false;
 			_mipmaps = false;
@@ -1169,7 +1169,7 @@ namespace bgfx { namespace gl
 		flushGlError();
 		GLenum err = 0;
 
-		if (isCompressed(_format) )
+		if (bimg::isCompressed(bimg::TextureFormat::Enum(_format) ) )
 		{
 			glCompressedTexImage2D(GL_TEXTURE_2D, 0, internalFmt, 16, 16, 0, size, data);
 			err |= glGetError();
@@ -1294,9 +1294,9 @@ namespace bgfx { namespace gl
 		GLenum err = initTestTexture(_format, _srgb, false);
 
 		GLenum attachment;
-		if (isDepth(_format) )
+		if (bimg::isDepth(bimg::TextureFormat::Enum(_format) ) )
 		{
-			const ImageBlockInfo& info = getBlockInfo(_format);
+			const bimg::ImageBlockInfo& info = bimg::getBlockInfo(bimg::TextureFormat::Enum(_format) );
 			if (0 == info.depthBits)
 			{
 				attachment = GL_STENCIL_ATTACHMENT;
@@ -1412,6 +1412,16 @@ namespace bgfx { namespace gl
 		{ "Intel",                        BGFX_PCI_ID_INTEL  },
 	};
 
+	struct Workaround
+	{
+		void reset()
+		{
+			m_detachShader = true;
+		}
+
+		bool m_detachShader;
+	};
+
 	struct RendererContextGL : public RendererContextI
 	{
 		RendererContextGL()
@@ -1482,6 +1492,8 @@ namespace bgfx { namespace gl
 				}
 			}
 
+			m_workaround.reset();
+
 			GLint numCmpFormats = 0;
 			GL_CHECK(glGetIntegerv(GL_NUM_COMPRESSED_TEXTURE_FORMATS, &numCmpFormats) );
 			BX_TRACE("GL_NUM_COMPRESSED_TEXTURE_FORMATS %d", numCmpFormats);
@@ -2277,6 +2289,11 @@ namespace bgfx { namespace gl
 			return BGFX_RENDERER_OPENGL_NAME;
 		}
 
+		bool isDeviceRemoved() BX_OVERRIDE
+		{
+			return false;
+		}
+
 		void flip(HMD& _hmd)
 		{
 			if (m_flip)
@@ -2413,7 +2430,7 @@ namespace bgfx { namespace gl
 			if (m_readBackSupported)
 			{
 				const TextureGL& texture = m_textures[_handle.idx];
-				const bool compressed    = isCompressed(TextureFormat::Enum(texture.m_textureFormat) );
+				const bool compressed    = bimg::isCompressed(bimg::TextureFormat::Enum(texture.m_textureFormat) );
 
 				GL_CHECK(glBindTexture(texture.m_target, texture.m_id) );
 
@@ -2558,7 +2575,7 @@ namespace bgfx { namespace gl
 
 			if (GL_RGBA == m_readPixelsFmt)
 			{
-				imageSwizzleBgra8(data, width, height, width*4, data);
+				bimg::imageSwizzleBgra8(data, width, height, width*4, data);
 			}
 
 			g_callback->screenShot(_filePath
@@ -3127,7 +3144,7 @@ namespace bgfx { namespace gl
 
 				if (GL_RGBA == m_readPixelsFmt)
 				{
-					imageSwizzleBgra8(m_capture, m_resolution.m_width, m_resolution.m_height, m_resolution.m_width*4, m_capture);
+					bimg::imageSwizzleBgra8(m_capture, m_resolution.m_width, m_resolution.m_height, m_resolution.m_width*4, m_capture);
 				}
 
 				g_callback->captureFrame(m_capture, m_captureSize);
@@ -3520,6 +3537,8 @@ namespace bgfx { namespace gl
 		const char* m_version;
 		const char* m_glslVersion;
 
+		Workaround m_workaround;
+
 		GLuint m_currentFbo;
 
 		VR m_ovr;
@@ -4018,7 +4037,8 @@ namespace bgfx { namespace gl
 
 		init();
 
-		if (!cached)
+		if (!cached
+		&&  s_renderGL->m_workaround.m_detachShader)
 		{
 			// Must be after init, otherwise init might fail to lookup shader
 			// info (NVIDIA Tegra 3 OpenGL ES 2.0 14.01003).
@@ -4810,9 +4830,9 @@ namespace bgfx { namespace gl
 
 	void TextureGL::create(const Memory* _mem, uint32_t _flags, uint8_t _skip)
 	{
-		ImageContainer imageContainer;
+		bimg::ImageContainer imageContainer;
 
-		if (imageParse(imageContainer, _mem->data, _mem->size) )
+		if (bimg::imageParse(imageContainer, _mem->data, _mem->size) )
 		{
 			uint8_t numMips = imageContainer.m_numMips;
 			const uint8_t startLod = uint8_t(bx::uint32_min(_skip, numMips-1) );
@@ -4822,7 +4842,7 @@ namespace bgfx { namespace gl
 			uint32_t textureHeight;
 			uint32_t textureDepth;
 			{
-				const ImageBlockInfo& ibi = getBlockInfo(TextureFormat::Enum(imageContainer.m_format) );
+				const bimg::ImageBlockInfo& ibi = bimg::getBlockInfo(bimg::TextureFormat::Enum(imageContainer.m_format) );
 				textureWidth  = bx::uint32_max(ibi.blockWidth,  imageContainer.m_width >>startLod);
 				textureHeight = bx::uint32_max(ibi.blockHeight, imageContainer.m_height>>startLod);
 				textureDepth  = 1 < imageContainer.m_depth
@@ -4888,7 +4908,7 @@ namespace bgfx { namespace gl
 				&& !s_textureFormat[m_requestedFormat].m_supported
 				&& !s_renderGL->m_textureSwizzleSupport
 				;
-			const bool compressed = isCompressed(TextureFormat::Enum(m_requestedFormat) );
+			const bool compressed = bimg::isCompressed(bimg::TextureFormat::Enum(m_requestedFormat) );
 			const bool convert    = false
 				|| m_textureFormat != m_requestedFormat
 				|| swizzle
@@ -4941,8 +4961,8 @@ namespace bgfx { namespace gl
 						: side
 						;
 
-					ImageMip mip;
-					if (imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
+					bimg::ImageMip mip;
+					if (bimg::imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
 					{
 						if (compressed
 						&& !convert)
@@ -4994,7 +5014,7 @@ namespace bgfx { namespace gl
 						{
 							uint32_t size = bx::uint32_max(1, (width  + 3)>>2)
 										  * bx::uint32_max(1, (height + 3)>>2)
-										  * 4*4*getBitsPerPixel(TextureFormat::Enum(m_textureFormat) )/8
+										  * 4*4* bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) )/8
 										  ;
 
 							compressedTexImage(imageTarget
@@ -5066,7 +5086,7 @@ namespace bgfx { namespace gl
 
 	void TextureGL::update(uint8_t _side, uint8_t _mip, const Rect& _rect, uint16_t _z, uint16_t _depth, uint16_t _pitch, const Memory* _mem)
 	{
-		const uint32_t bpp = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+		const uint32_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 		const uint32_t rectpitch = _rect.m_width*bpp/8;
 		uint32_t srcpitch  = UINT16_MAX == _pitch ? rectpitch : _pitch;
 
@@ -5084,7 +5104,7 @@ namespace bgfx { namespace gl
 			&& !s_renderGL->m_textureSwizzleSupport
 			;
 		const bool unpackRowLength = BX_IGNORE_C4127(!!BGFX_CONFIG_RENDERER_OPENGL || s_extension[Extension::EXT_unpack_subimage].m_supported);
-		const bool compressed      = isCompressed(TextureFormat::Enum(m_requestedFormat) );
+		const bool compressed      = bimg::isCompressed(bimg::TextureFormat::Enum(m_requestedFormat) );
 		const bool convert         = false
 			|| (compressed && m_textureFormat != m_requestedFormat)
 			|| swizzle
@@ -5110,7 +5130,7 @@ namespace bgfx { namespace gl
 
 			if (!unpackRowLength)
 			{
-				imageCopy(temp, width, height, bpp, srcpitch, data);
+				bimg::imageCopy(temp, width, height, bpp, srcpitch, data);
 				data = temp;
 			}
 
@@ -5133,7 +5153,7 @@ namespace bgfx { namespace gl
 
 			if (convert)
 			{
-				imageDecodeToRgba8(temp, data, width, height, srcpitch, TextureFormat::Enum(m_requestedFormat) );
+				bimg::imageDecodeToRgba8(temp, data, width, height, srcpitch, bimg::TextureFormat::Enum(m_requestedFormat) );
 				data = temp;
 				srcpitch = rectpitch;
 			}
@@ -5141,7 +5161,7 @@ namespace bgfx { namespace gl
 			if (!unpackRowLength
 			&&  !convert)
 			{
-				imageCopy(temp, width, height, bpp, srcpitch, data);
+				bimg::imageCopy(temp, width, height, bpp, srcpitch, data);
 				data = temp;
 			}
 
@@ -5883,10 +5903,10 @@ namespace bgfx { namespace gl
 					}
 
 					GLenum attachment = GL_COLOR_ATTACHMENT0 + colorIdx;
-					TextureFormat::Enum format = (TextureFormat::Enum)texture.m_textureFormat;
-					if (isDepth(format) )
+					bimg::TextureFormat::Enum format = bimg::TextureFormat::Enum(texture.m_textureFormat);
+					if (bimg::isDepth(format) )
 					{
-						const ImageBlockInfo& info = getBlockInfo(format);
+						const bimg::ImageBlockInfo& info = bimg::getBlockInfo(format);
 						if (0 < info.stencilBits)
 						{
 							attachment = GL_DEPTH_STENCIL_ATTACHMENT;
@@ -5973,7 +5993,7 @@ namespace bgfx { namespace gl
 						if (0 != texture.m_id)
 						{
 							GLenum attachment = GL_COLOR_ATTACHMENT0 + colorIdx;
-							if (!isDepth( (TextureFormat::Enum)texture.m_textureFormat) )
+							if (!bimg::isDepth(bimg::TextureFormat::Enum(texture.m_textureFormat) ) )
 							{
 								++colorIdx;
 
@@ -6244,6 +6264,9 @@ namespace bgfx { namespace gl
 		currentState.m_stateFlags = BGFX_STATE_NONE;
 		currentState.m_stencil    = packStencil(BGFX_STENCIL_NONE, BGFX_STENCIL_NONE);
 
+		RenderBind currentBind;
+		currentBind.clear();
+
 		_render->m_hmdInitialized = m_ovr.isInitialized();
 
 		const bool hmdEnabled = m_ovr.isEnabled();
@@ -6324,7 +6347,9 @@ namespace bgfx { namespace gl
 					|| item == numItems
 					;
 
-				const RenderItem& renderItem = _render->m_renderItem[_render->m_sortValues[item] ];
+				const uint32_t itemIdx       = _render->m_sortValues[item];
+				const RenderItem& renderItem = _render->m_renderItem[itemIdx];
+				const RenderBind& renderBind = _render->m_renderItemBind[itemIdx];
 				++item;
 
 				if (viewChanged)
@@ -6499,7 +6524,7 @@ namespace bgfx { namespace gl
 						GLbitfield barrier = 0;
 						for (uint32_t ii = 0; ii < BGFX_MAX_COMPUTE_BINDINGS; ++ii)
 						{
-							const Binding& bind = compute.m_bind[ii];
+							const Binding& bind = renderBind.m_bind[ii];
 							if (invalidHandle != bind.m_idx)
 							{
 								switch (bind.m_type)
@@ -6630,6 +6655,8 @@ namespace bgfx { namespace gl
 					changedStencil = packStencil(BGFX_STENCIL_MASK, BGFX_STENCIL_MASK);
 					currentState.m_stateFlags = newFlags;
 					currentState.m_stencil    = newStencil;
+
+					currentBind.clear();
 				}
 
 				uint16_t scissor = draw.m_scissor;
@@ -6985,16 +7012,40 @@ namespace bgfx { namespace gl
 					{
 						for (uint32_t stage = 0; stage < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++stage)
 						{
-							const Binding& bind = draw.m_bind[stage];
-							Binding& current = currentState.m_bind[stage];
+							const Binding& bind = renderBind.m_bind[stage];
+							Binding& current = currentBind.m_bind[stage];
 							if (current.m_idx != bind.m_idx
+							||  current.m_type != bind.m_type
 							||  current.m_un.m_draw.m_textureFlags != bind.m_un.m_draw.m_textureFlags
 							||  programChanged)
 							{
 								if (invalidHandle != bind.m_idx)
 								{
-									TextureGL& texture = m_textures[bind.m_idx];
-									texture.commit(stage, bind.m_un.m_draw.m_textureFlags, _render->m_colorPalette);
+									switch (bind.m_type)
+									{
+									case Binding::Texture:
+										{
+											TextureGL& texture = m_textures[bind.m_idx];
+											texture.commit(stage, bind.m_un.m_draw.m_textureFlags, _render->m_colorPalette);
+										}
+										break;
+
+									case Binding::IndexBuffer:
+										{
+											const IndexBufferGL& buffer = m_indexBuffers[bind.m_idx];
+											GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, stage, buffer.m_id) );
+											// TODO: barriers?
+										}
+										break;
+
+									case Binding::VertexBuffer:
+										{
+											const VertexBufferGL& buffer = m_vertexBuffers[bind.m_idx];
+											GL_CHECK(glBindBufferBase(GL_SHADER_STORAGE_BUFFER, stage, buffer.m_id) );
+											// TODO: barriers?
+										}
+										break;
+									}
 								}
 							}
 
diff --git a/3rdparty/bgfx/src/renderer_mtl.mm b/3rdparty/bgfx/src/renderer_mtl.mm
index fbfcdf2..b822dd9 100644
--- a/3rdparty/bgfx/src/renderer_mtl.mm
+++ b/3rdparty/bgfx/src/renderer_mtl.mm
@@ -557,7 +557,7 @@ namespace bgfx { namespace mtl
 					: BGFX_CAPS_FORMAT_TEXTURE_NONE
 					;
 
-				if (!isCompressed((TextureFormat::Enum)(ii)))
+				if (!bimg::isCompressed(bimg::TextureFormat::Enum(ii) ) )
 				{
 					support |= BGFX_CAPS_FORMAT_TEXTURE_FRAMEBUFFER
 						| BGFX_CAPS_FORMAT_TEXTURE_FRAMEBUFFER_MSAA;
@@ -803,7 +803,7 @@ namespace bgfx { namespace mtl
 
 			uint32_t srcWidth  = bx::uint32_max(1, texture.m_ptr.width()  >> _mip);
 			uint32_t srcHeight = bx::uint32_max(1, texture.m_ptr.height() >> _mip);
-			const uint8_t bpp = getBitsPerPixel(TextureFormat::Enum(texture.m_textureFormat) );
+			const uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(texture.m_textureFormat) );
 
 			MTLRegion region = { { 0, 0, 0 }, { srcWidth, srcHeight, 1 } };
 
@@ -1069,6 +1069,11 @@ namespace bgfx { namespace mtl
 			}
 		}
 
+		bool isDeviceRemoved() BX_OVERRIDE
+		{
+			return false;
+		}
+
 		void flip(HMD& /*_hmd*/) BX_OVERRIDE
 		{
 			if (NULL == m_commandBuffer)
@@ -1249,7 +1254,7 @@ namespace bgfx { namespace mtl
 
 				if (m_screenshotTarget.pixelFormat() == MTLPixelFormatRGBA8Uint)
 				{
-					imageSwizzleBgra8(
+					bimg::imageSwizzleBgra8(
 						  m_capture
 						, m_resolution.m_width
 						, m_resolution.m_height
@@ -2380,14 +2385,14 @@ namespace bgfx { namespace mtl
 	{
 		m_sampler = s_renderMtl->getSamplerState(_flags);
 
-		ImageContainer imageContainer;
+		bimg::ImageContainer imageContainer;
 
-		if (imageParse(imageContainer, _mem->data, _mem->size) )
+		if (bimg::imageParse(imageContainer, _mem->data, _mem->size) )
 		{
 			uint8_t numMips = imageContainer.m_numMips;
 			const uint8_t startLod = uint8_t(bx::uint32_min(_skip, numMips-1) );
 			numMips -= startLod;
-			const ImageBlockInfo& blockInfo = getBlockInfo(TextureFormat::Enum(imageContainer.m_format) );
+			const bimg::ImageBlockInfo& blockInfo = getBlockInfo(bimg::TextureFormat::Enum(imageContainer.m_format) );
 			const uint32_t textureWidth  = bx::uint32_max(blockInfo.blockWidth,  imageContainer.m_width >>startLod);
 			const uint32_t textureHeight = bx::uint32_max(blockInfo.blockHeight, imageContainer.m_height>>startLod);
 			const uint16_t numLayers     = imageContainer.m_numLayers;
@@ -2399,7 +2404,7 @@ namespace bgfx { namespace mtl
 			m_requestedFormat  = uint8_t(imageContainer.m_format);
 			m_textureFormat    = uint8_t(getViableTextureFormat(imageContainer) );
 			const bool convert = m_textureFormat != m_requestedFormat;
-			const uint8_t bpp = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+			const uint8_t bpp = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 
 			TextureDescriptor desc = s_renderMtl->m_textureDescriptor;
 
@@ -2437,7 +2442,7 @@ namespace bgfx { namespace mtl
 			m_numMips = numMips;
 			const uint16_t numSides = numLayers * (imageContainer.m_cubeMap ? 6 : 1);
 
-			const bool compressed   = isCompressed(TextureFormat::Enum(m_textureFormat) );
+			const bool compressed   = bimg::isCompressed(bimg::TextureFormat::Enum(m_textureFormat) );
 			const bool writeOnly    = 0 != (_flags&BGFX_TEXTURE_RT_WRITE_ONLY);
 			const bool computeWrite = 0 != (_flags&BGFX_TEXTURE_COMPUTE_WRITE);
 			const bool renderTarget = 0 != (_flags&BGFX_TEXTURE_RT_MASK);
@@ -2487,16 +2492,25 @@ namespace bgfx { namespace mtl
 			{
 				desc.cpuCacheMode = MTLCPUCacheModeDefaultCache;
 
-				desc.storageMode = (MTLStorageMode)(writeOnly||isDepth(TextureFormat::Enum(m_textureFormat))
-													? 2 /*MTLStorageModePrivate*/
-													: ((BX_ENABLED(BX_PLATFORM_IOS)) ? 0 /* MTLStorageModeShared */ :  1 /*MTLStorageModeManaged*/)
-													);
+				desc.storageMode = (MTLStorageMode)(false
+					|| writeOnly
+					|| bimg::isDepth(bimg::TextureFormat::Enum(m_textureFormat) )
+					? 2 /*MTLStorageModePrivate*/
+					: (BX_ENABLED(BX_PLATFORM_IOS)
+						? 0 /* MTLStorageModeShared */
+						:  1 /*MTLStorageModeManaged*/
+					) );
 
 				desc.usage = MTLTextureUsageShaderRead;
 				if (computeWrite)
+				{
 					desc.usage |= MTLTextureUsageShaderWrite;
+				}
+
 				if (renderTarget)
+				{
 					desc.usage |= MTLTextureUsageRenderTarget;
+				}
 			}
 
 			m_ptr = s_renderMtl->m_device.newTextureWithDescriptor(desc);
@@ -2535,14 +2549,14 @@ namespace bgfx { namespace mtl
 					height = bx::uint32_max(1, height);
 					depth  = bx::uint32_max(1, depth);
 
-					ImageMip mip;
-					if (imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
+					bimg::ImageMip mip;
+					if (bimg::imageGetRawData(imageContainer, side, lod+startLod, _mem->data, _mem->size, mip) )
 					{
 						const uint8_t* data = mip.m_data;
 
 						if (convert)
 						{
-							imageDecodeToBgra8(temp
+							bimg::imageDecodeToBgra8(temp
 								, mip.m_data
 								, mip.m_width
 								, mip.m_height
@@ -2601,7 +2615,7 @@ namespace bgfx { namespace mtl
 
 	void TextureMtl::update(uint8_t _side, uint8_t _mip, const Rect& _rect, uint16_t _z, uint16_t _depth, uint16_t _pitch, const Memory* _mem)
 	{
-		const uint32_t bpp       = getBitsPerPixel(TextureFormat::Enum(m_textureFormat) );
+		const uint32_t bpp       = bimg::getBitsPerPixel(bimg::TextureFormat::Enum(m_textureFormat) );
 		const uint32_t rectpitch = _rect.m_width*bpp/8;
 		const uint32_t srcpitch  = UINT16_MAX == _pitch ? rectpitch : _pitch;
 		const uint32_t slice = ((m_type == Texture3D) ? 0 : _side + _z * (m_type == TextureCube ? 6 : 1));
@@ -2615,12 +2629,12 @@ namespace bgfx { namespace mtl
 		if (convert)
 		{
 			temp = (uint8_t*)BX_ALLOC(g_allocator, rectpitch*_rect.m_height);
-			imageDecodeToBgra8(temp
+			bimg::imageDecodeToBgra8(temp
 				, data
 				, _rect.m_width
 				, _rect.m_height
 				, srcpitch
-				, TextureFormat::Enum(m_requestedFormat)
+				, bimg::TextureFormat::Enum(m_requestedFormat)
 				);
 			data = temp;
 		}
@@ -2713,7 +2727,7 @@ namespace bgfx { namespace mtl
 					m_height = texture.m_height;
 				}
 
-				if (isDepth( (TextureFormat::Enum)texture.m_textureFormat) )
+				if (bimg::isDepth(bimg::TextureFormat::Enum(texture.m_textureFormat) ) )
 				{
 					m_depthHandle = handle;
 				}
@@ -3048,6 +3062,9 @@ namespace bgfx { namespace mtl
 		currentState.m_stateFlags = BGFX_STATE_NONE;
 		currentState.m_stencil    = packStencil(BGFX_STENCIL_NONE, BGFX_STENCIL_NONE);
 
+		RenderBind currentBind;
+		currentBind.clear();
+
 		_render->m_hmdInitialized = false;
 
 		const bool hmdEnabled = false;
@@ -3106,7 +3123,10 @@ namespace bgfx { namespace mtl
 					|| key.m_view != view
 					|| item == numItems
 					;
-				const RenderItem& renderItem = _render->m_renderItem[_render->m_sortValues[item] ];
+
+				const uint32_t itemIdx       = _render->m_sortValues[item];
+				const RenderItem& renderItem = _render->m_renderItem[itemIdx];
+				const RenderBind& renderBind = _render->m_renderItemBind[itemIdx];
 				++item;
 
 				if (viewChanged)
@@ -3406,6 +3426,8 @@ namespace bgfx { namespace mtl
 					currentState.m_stateFlags = newFlags;
 					currentState.m_stencil    = newStencil;
 
+					currentBind.clear();
+
 					programIdx = invalidHandle;
 					setDepthStencilState(newFlags, packStencil(BGFX_STENCIL_DEFAULT, BGFX_STENCIL_DEFAULT));
 
@@ -3612,22 +3634,25 @@ namespace bgfx { namespace mtl
 
 					for (uint8_t stage = 0; stage < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++stage)
 					{
-						const Binding& sampler = draw.m_bind[stage];
-						Binding& current = currentState.m_bind[stage];
-						if (current.m_idx != sampler.m_idx
-						||  current.m_un.m_draw.m_textureFlags != sampler.m_un.m_draw.m_textureFlags
+						const Binding& bind = renderBind.m_bind[stage];
+						Binding& current = currentBind.m_bind[stage];
+
+						if (current.m_idx                      != bind.m_idx
+						||  current.m_un.m_draw.m_textureFlags != bind.m_un.m_draw.m_textureFlags
 						||  programChanged)
 						{
-							if (invalidHandle != sampler.m_idx)
+							if (invalidHandle != bind.m_idx)
 							{
-								TextureMtl& texture = m_textures[sampler.m_idx];
-								texture.commit(stage, (usedVertexSamplerStages&(1<<stage))!=0,
-											   (usedFragmentSamplerStages&(1<<stage))!=0,
-												sampler.m_un.m_draw.m_textureFlags);
+								TextureMtl& texture = m_textures[bind.m_idx];
+								texture.commit(stage
+									, 0 != (usedVertexSamplerStages   & (1<<stage) )
+									, 0 != (usedFragmentSamplerStages & (1<<stage) )
+									, bind.m_un.m_draw.m_textureFlags
+									);
 							}
 						}
 
-						current = sampler;
+						current = bind;
 					}
 				}
 
diff --git a/3rdparty/bgfx/src/renderer_noop.cpp b/3rdparty/bgfx/src/renderer_noop.cpp
index 24fbfb4..3f24756 100644
--- a/3rdparty/bgfx/src/renderer_noop.cpp
+++ b/3rdparty/bgfx/src/renderer_noop.cpp
@@ -5,36 +5,38 @@
 
 #include "bgfx_p.h"
 
-#if BGFX_CONFIG_RENDERER_NOOP
-
 namespace bgfx { namespace noop
 {
 	struct RendererContextNOOP : public RendererContextI
 	{
 		RendererContextNOOP()
 		{
-			// Pretend all features that are not returning results to CPU
-			// are available.
+			// Pretend all features are available.
 			g_caps.supported = 0
-				| BGFX_CAPS_TEXTURE_COMPARE_LEQUAL
-				| BGFX_CAPS_TEXTURE_COMPARE_ALL
-				| BGFX_CAPS_TEXTURE_3D
-				| BGFX_CAPS_VERTEX_ATTRIB_HALF
-				| BGFX_CAPS_VERTEX_ATTRIB_UINT10
-				| BGFX_CAPS_INSTANCING
-				| BGFX_CAPS_FRAGMENT_DEPTH
+				| BGFX_CAPS_ALPHA_TO_COVERAGE
 				| BGFX_CAPS_BLEND_INDEPENDENT
 				| BGFX_CAPS_COMPUTE
-				| BGFX_CAPS_FRAGMENT_ORDERING
-				| BGFX_CAPS_SWAP_CHAIN
-				| BGFX_CAPS_INDEX32
-				| BGFX_CAPS_DRAW_INDIRECT
-				| BGFX_CAPS_HIDPI
-				| BGFX_CAPS_TEXTURE_BLIT
-				| BGFX_CAPS_ALPHA_TO_COVERAGE
 				| BGFX_CAPS_CONSERVATIVE_RASTER
+				| BGFX_CAPS_DRAW_INDIRECT
+				| BGFX_CAPS_FRAGMENT_DEPTH
+				| BGFX_CAPS_FRAGMENT_ORDERING
+				| BGFX_CAPS_GRAPHICS_DEBUGGER
+				| BGFX_CAPS_HIDPI
+				| BGFX_CAPS_HMD
+				| BGFX_CAPS_INDEX32
+				| BGFX_CAPS_INSTANCING
+				| BGFX_CAPS_OCCLUSION_QUERY
+				| BGFX_CAPS_RENDERER_MULTITHREADED
+				| BGFX_CAPS_SWAP_CHAIN
 				| BGFX_CAPS_TEXTURE_2D_ARRAY
+				| BGFX_CAPS_TEXTURE_3D
+				| BGFX_CAPS_TEXTURE_BLIT
+				| BGFX_CAPS_TEXTURE_COMPARE_ALL
+				| BGFX_CAPS_TEXTURE_COMPARE_LEQUAL
 				| BGFX_CAPS_TEXTURE_CUBE_ARRAY
+				| BGFX_CAPS_TEXTURE_READ_BACK
+				| BGFX_CAPS_VERTEX_ATTRIB_HALF
+				| BGFX_CAPS_VERTEX_ATTRIB_UINT10
 				;
 		}
 
@@ -52,6 +54,11 @@ namespace bgfx { namespace noop
 			return BGFX_RENDERER_NOOP_NAME;
 		}
 
+		bool isDeviceRemoved() BX_OVERRIDE
+		{
+			return false;
+		}
+
 		void flip(HMD& /*_hmd*/) BX_OVERRIDE
 		{
 		}
@@ -224,19 +231,3 @@ namespace bgfx { namespace noop
 		s_renderNOOP = NULL;
 	}
 } /* namespace noop */ } // namespace bgfx
-
-#else
-
-namespace bgfx { namespace noop
-{
-	RendererContextI* rendererCreate()
-	{
-		return NULL;
-	}
-
-	void rendererDestroy()
-	{
-	}
-} /* namespace noop */ } // namespace bgfx
-
-#endif // BGFX_CONFIG_RENDERER_NOOP
diff --git a/3rdparty/bgfx/src/renderer_null.cpp b/3rdparty/bgfx/src/renderer_null.cpp
deleted file mode 100644
index 1dcd621..0000000
--- a/3rdparty/bgfx/src/renderer_null.cpp
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
- * Copyright 2011-2016 Branimir Karadzic. All rights reserved.
- * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
- */
-
-#include "bgfx_p.h"
-
-#if BGFX_CONFIG_RENDERER_NULL
-
-namespace bgfx { namespace noop
-{
-	struct RendererContextNULL : public RendererContextI
-	{
-		RendererContextNULL()
-		{
-		}
-
-		~RendererContextNULL()
-		{
-		}
-
-		RendererType::Enum getRendererType() const BX_OVERRIDE
-		{
-			return RendererType::Null;
-		}
-
-		const char* getRendererName() const BX_OVERRIDE
-		{
-			return BGFX_RENDERER_NULL_NAME;
-		}
-
-		void flip(HMD& /*_hmd*/) BX_OVERRIDE
-		{
-		}
-
-		void createIndexBuffer(IndexBufferHandle /*_handle*/, Memory* /*_mem*/, uint16_t /*_flags*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyIndexBuffer(IndexBufferHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createVertexDecl(VertexDeclHandle /*_handle*/, const VertexDecl& /*_decl*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyVertexDecl(VertexDeclHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createVertexBuffer(VertexBufferHandle /*_handle*/, Memory* /*_mem*/, VertexDeclHandle /*_declHandle*/, uint16_t /*_flags*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyVertexBuffer(VertexBufferHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createDynamicIndexBuffer(IndexBufferHandle /*_handle*/, uint32_t /*_size*/, uint16_t /*_flags*/) BX_OVERRIDE
-		{
-		}
-
-		void updateDynamicIndexBuffer(IndexBufferHandle /*_handle*/, uint32_t /*_offset*/, uint32_t /*_size*/, Memory* /*_mem*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyDynamicIndexBuffer(IndexBufferHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createDynamicVertexBuffer(VertexBufferHandle /*_handle*/, uint32_t /*_size*/, uint16_t /*_flags*/) BX_OVERRIDE
-		{
-		}
-
-		void updateDynamicVertexBuffer(VertexBufferHandle /*_handle*/, uint32_t /*_offset*/, uint32_t /*_size*/, Memory* /*_mem*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyDynamicVertexBuffer(VertexBufferHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createShader(ShaderHandle /*_handle*/, Memory* /*_mem*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyShader(ShaderHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createProgram(ProgramHandle /*_handle*/, ShaderHandle /*_vsh*/, ShaderHandle /*_fsh*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyProgram(ProgramHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createTexture(TextureHandle /*_handle*/, Memory* /*_mem*/, uint32_t /*_flags*/, uint8_t /*_skip*/) BX_OVERRIDE
-		{
-		}
-
-		void updateTextureBegin(TextureHandle /*_handle*/, uint8_t /*_side*/, uint8_t /*_mip*/) BX_OVERRIDE
-		{
-		}
-
-		void updateTexture(TextureHandle /*_handle*/, uint8_t /*_side*/, uint8_t /*_mip*/, const Rect& /*_rect*/, uint16_t /*_z*/, uint16_t /*_depth*/, uint16_t /*_pitch*/, const Memory* /*_mem*/) BX_OVERRIDE
-		{
-		}
-
-		void updateTextureEnd() BX_OVERRIDE
-		{
-		}
-
-		void readTexture(TextureHandle /*_handle*/, void* /*_data*/) BX_OVERRIDE
-		{
-		}
-
-		void resizeTexture(TextureHandle /*_handle*/, uint16_t /*_width*/, uint16_t /*_height*/, uint8_t /*_numMips*/) BX_OVERRIDE
-		{
-		}
-
-		void overrideInternal(TextureHandle /*_handle*/, uintptr_t /*_ptr*/) BX_OVERRIDE
-		{
-		}
-
-		uintptr_t getInternal(TextureHandle /*_handle*/) BX_OVERRIDE
-		{
-			return 0;
-		}
-
-		void destroyTexture(TextureHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createFrameBuffer(FrameBufferHandle /*_handle*/, uint8_t /*_num*/, const Attachment* /*_attachment*/) BX_OVERRIDE
-		{
-		}
-
-		void createFrameBuffer(FrameBufferHandle /*_handle*/, void* /*_nwh*/, uint32_t /*_width*/, uint32_t /*_height*/, TextureFormat::Enum /*_depthFormat*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyFrameBuffer(FrameBufferHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void createUniform(UniformHandle /*_handle*/, UniformType::Enum /*_type*/, uint16_t /*_num*/, const char* /*_name*/) BX_OVERRIDE
-		{
-		}
-
-		void destroyUniform(UniformHandle /*_handle*/) BX_OVERRIDE
-		{
-		}
-
-		void saveScreenShot(const char* /*_filePath*/) BX_OVERRIDE
-		{
-		}
-
-		void updateViewName(uint8_t /*_id*/, const char* /*_name*/) BX_OVERRIDE
-		{
-		}
-
-		void updateUniform(uint16_t /*_loc*/, const void* /*_data*/, uint32_t /*_size*/) BX_OVERRIDE
-		{
-		}
-
-		void setMarker(const char* /*_marker*/, uint32_t /*_size*/) BX_OVERRIDE
-		{
-		}
-
-		void submit(Frame* /*_render*/, ClearQuad& /*_clearQuad*/, TextVideoMemBlitter& /*_textVideoMemBlitter*/) BX_OVERRIDE
-		{
-		}
-
-		void blitSetup(TextVideoMemBlitter& /*_blitter*/) BX_OVERRIDE
-		{
-		}
-
-		void blitRender(TextVideoMemBlitter& /*_blitter*/, uint32_t /*_numIndices*/) BX_OVERRIDE
-		{
-		}
-	};
-
-	static RendererContextNULL* s_renderNULL;
-
-	RendererContextI* rendererCreate()
-	{
-		s_renderNULL = BX_NEW(g_allocator, RendererContextNULL);
-		return s_renderNULL;
-	}
-
-	void rendererDestroy()
-	{
-		BX_DELETE(g_allocator, s_renderNULL);
-		s_renderNULL = NULL;
-	}
-} /* namespace noop */ } // namespace bgfx
-
-#else
-
-namespace bgfx { namespace noop
-{
-	RendererContextI* rendererCreate()
-	{
-		return NULL;
-	}
-
-	void rendererDestroy()
-	{
-	}
-} /* namespace noop */ } // namespace bgfx
-
-#endif // BGFX_CONFIG_RENDERER_NULL
diff --git a/3rdparty/bgfx/src/renderer_vk.cpp b/3rdparty/bgfx/src/renderer_vk.cpp
index 8e2f66c..28e2146 100644
--- a/3rdparty/bgfx/src/renderer_vk.cpp
+++ b/3rdparty/bgfx/src/renderer_vk.cpp
@@ -986,7 +986,7 @@ VK_IMPORT_INSTANCE
 					{
 						uint8_t support = BGFX_CAPS_FORMAT_TEXTURE_NONE;
 
-						const bool depth = isDepth(TextureFormat::Enum(ii) );
+						const bool depth = bimg::isDepth(bimg::TextureFormat::Enum(ii) );
 						VkFormat fmt = depth
 							? s_textureFormat[ii].m_fmtDsv
 							: s_textureFormat[ii].m_fmt
@@ -1919,6 +1919,11 @@ VK_IMPORT_DEVICE
 			return BGFX_RENDERER_VULKAN_NAME;
 		}
 
+		bool isDeviceRemoved() BX_OVERRIDE
+		{
+			return false;
+		}
+
 		void flip(HMD& /*_hmd*/) BX_OVERRIDE
 		{
 			if (VK_NULL_HANDLE != m_swapchain)
@@ -3745,7 +3750,9 @@ VK_DESTROY
 					|| item == numItems
 					;
 
-				const RenderItem& renderItem = _render->m_renderItem[_render->m_sortValues[item] ];
+				const uint32_t itemIdx       = _render->m_sortValues[item];
+				const RenderItem& renderItem = _render->m_renderItem[itemIdx];
+				const RenderBind& renderBind = _render->m_renderItemBind[itemIdx];
 				++item;
 
 				if (viewChanged)
@@ -3912,7 +3919,7 @@ BX_UNUSED(currentSamplerStateIdx);
 						currentBindHash = 0;
 					}
 
-//					uint32_t bindHash = bx::hashMurmur2A(compute.m_bind, sizeof(compute.m_bind) );
+//					uint32_t bindHash = bx::hashMurmur2A(renderBind.m_bind, sizeof(renderBind.m_bind) );
 //					if (currentBindHash != bindHash)
 //					{
 //						currentBindHash  = bindHash;
@@ -3925,7 +3932,7 @@ BX_UNUSED(currentSamplerStateIdx);
 //
 //							for (uint32_t ii = 0; ii < BGFX_MAX_COMPUTE_BINDINGS; ++ii)
 //							{
-//								const Binding& bind = compute.m_bind[ii];
+//								const Binding& bind = renderBind.m_bind[ii];
 //								if (invalidHandle != bind.m_idx)
 //								{
 //									switch (bind.m_type)
@@ -4138,7 +4145,7 @@ BX_UNUSED(currentSamplerStateIdx);
 							);
 
 					uint16_t scissor = draw.m_scissor;
-					uint32_t bindHash = bx::hashMurmur2A(draw.m_bind, sizeof(draw.m_bind) );
+					uint32_t bindHash = bx::hashMurmur2A(renderBind.m_bind, sizeof(renderBind.m_bind) );
 					if (currentBindHash != bindHash
 					||  0 != changedStencil
 					|| (hasFactor && blendFactor != draw.m_rgba)
@@ -4164,7 +4171,7 @@ BX_UNUSED(currentSamplerStateIdx);
 //								srvHandle[0].ptr = 0;
 //								for (uint32_t stage = 0; stage < BGFX_CONFIG_MAX_TEXTURE_SAMPLERS; ++stage)
 //								{
-//									const Binding& bind = draw.m_bind[stage];
+//									const Binding& bind = renderBind.m_bind[stage];
 //									if (invalidHandle != bind.m_idx)
 //									{
 //										TextureD3D12& texture = m_textures[bind.m_idx];
diff --git a/3rdparty/bgfx/src/shader_dxbc.cpp b/3rdparty/bgfx/src/shader_dxbc.cpp
index b73b215..f0a76bd 100644
--- a/3rdparty/bgfx/src/shader_dxbc.cpp
+++ b/3rdparty/bgfx/src/shader_dxbc.cpp
@@ -94,7 +94,7 @@ namespace bgfx
 		{ 2, 0 }, // SQRT
 		{ 1, 0 }, // SWITCH
 		{ 3, 0 }, // SINCOS
-		{ 3, 0 }, // UDIV
+		{ 4, 0 }, // UDIV
 		{ 3, 0 }, // ULT
 		{ 3, 0 }, // UGE
 		{ 4, 0 }, // UMUL
diff --git a/3rdparty/bgfx/tools/geometryc/geometryc.cpp b/3rdparty/bgfx/tools/geometryc/geometryc.cpp
index c18a397..14275ba 100644
--- a/3rdparty/bgfx/tools/geometryc/geometryc.cpp
+++ b/3rdparty/bgfx/tools/geometryc/geometryc.cpp
@@ -524,13 +524,13 @@ int main(int _argc, const char* _argv[])
 						index.m_vbc = 0;
 					}
 
-					char* vertex   = argv[edge+1];
-					char* texcoord = strchr(vertex, '/');
+					const char* vertex   = argv[edge+1];
+					char* texcoord = const_cast<char*>(bx::strnchr(vertex, '/') );
 					if (NULL != texcoord)
 					{
 						*texcoord++ = '\0';
 
-						char* normal = strchr(texcoord, '/');
+						char* normal = const_cast<char*>(bx::strnchr(texcoord, '/') );
 						if (NULL != normal)
 						{
 							*normal++ = '\0';
@@ -860,7 +860,7 @@ int main(int _argc, const char* _argv[])
 
 				if (hasTangent)
 				{
-					calcTangents(vertexData, numVertices, decl, indexData, numIndices);
+					calcTangents(vertexData, uint16_t(numVertices), decl, indexData, numIndices);
 				}
 
 				bx::MemoryWriter memWriter(&memBlock);
@@ -877,7 +877,7 @@ int main(int _argc, const char* _argv[])
 							, prim1.m_numIndices
 							, vertexData + prim1.m_startVertex
 							, numVertices
-							, stride
+							, uint16_t(stride)
 							);
 					}
 				}
@@ -991,7 +991,7 @@ int main(int _argc, const char* _argv[])
 	{
 		if (hasTangent)
 		{
-			calcTangents(vertexData, numVertices, decl, indexData, numIndices);
+			calcTangents(vertexData, uint16_t(numVertices), decl, indexData, numIndices);
 		}
 
 		bx::MemoryWriter memWriter(&memBlock);
@@ -1008,7 +1008,7 @@ int main(int _argc, const char* _argv[])
 					, prim1.m_numIndices
 					, vertexData + prim1.m_startVertex
 					, numVertices
-					, stride
+					, uint16_t(stride)
 					);
 			}
 		}
diff --git a/3rdparty/bgfx/tools/shaderc/shaderc.cpp b/3rdparty/bgfx/tools/shaderc/shaderc.cpp
index 57934b5..b444bb3 100644
--- a/3rdparty/bgfx/tools/shaderc/shaderc.cpp
+++ b/3rdparty/bgfx/tools/shaderc/shaderc.cpp
@@ -141,11 +141,11 @@ namespace bgfx
 
 	const char* interpolationDx11(const char* _glsl)
 	{
-		if (0 == strcmp(_glsl, "smooth") )
+		if (0 == bx::strncmp(_glsl, "smooth") )
 		{
 			return "linear";
 		}
-		else if (0 == strcmp(_glsl, "flat") )
+		else if (0 == bx::strncmp(_glsl, "flat") )
 		{
 			return "nointerpolation";
 		}
@@ -169,7 +169,7 @@ namespace bgfx
 		for (uint32_t ii = 0; ii < UniformType::Count*2; ++ii)
 		{
 			if (NULL != s_uniformTypeName[ii]
-			&&  0 == strcmp(_name, s_uniformTypeName[ii]) )
+			&&  0 == bx::strncmp(_name, s_uniformTypeName[ii]) )
 			{
 				return UniformType::Enum(ii/2);
 			}
@@ -517,7 +517,10 @@ namespace bgfx
 		{
 			char* start = scratch(_includeDir);
 
-			for (char* split = strchr(start, ';'); NULL != split; split = strchr(start, ';') )
+			for (char* split = const_cast<char*>(bx::strnchr(start, ';') )
+				; NULL != split
+				; split = const_cast<char*>(bx::strnchr(start, ';') )
+				)
 			{
 				*split = '\0';
 				m_tagptr->tag = FPPTAG_INCLUDE_DIR;
@@ -811,32 +814,32 @@ namespace bgfx
 		const char* profile = cmdLine.findOption('p', "profile");
 		if (NULL != profile)
 		{
-			if (0 == strncmp(&profile[1], "s_4_0_level", 11) )
+			if (0 == bx::strncmp(&profile[1], "s_4_0_level", 11) )
 			{
 				hlsl = 2;
 			}
-			else if (0 == strncmp(&profile[1], "s_3", 3) )
+			else if (0 == bx::strncmp(&profile[1], "s_3", 3) )
 			{
 				hlsl = 3;
 				d3d  = 9;
 			}
-			else if (0 == strncmp(&profile[1], "s_4", 3) )
+			else if (0 == bx::strncmp(&profile[1], "s_4", 3) )
 			{
 				hlsl = 4;
 			}
-			else if (0 == strncmp(&profile[1], "s_5", 3) )
+			else if (0 == bx::strncmp(&profile[1], "s_5", 3) )
 			{
 				hlsl = 5;
 			}
-			else if (0 == strcmp(profile, "metal") )
+			else if (0 == bx::strncmp(profile, "metal") )
 			{
 				metal = 1;
 			}
-			else if (0 == strcmp(profile, "pssl") )
+			else if (0 == bx::strncmp(profile, "pssl") )
 			{
 				pssl = 1;
 			}
-			else if (0 == strcmp(profile, "spirv") )
+			else if (0 == bx::strncmp(profile, "spirv") )
 			{
 				spirv = 1;
 			}
@@ -909,7 +912,7 @@ namespace bgfx
 		&&    '\0'  != *defines)
 		{
 			defines = bx::strws(defines);
-			const char* eol = strchr(defines, ';');
+			const char* eol = bx::strnchr(defines, ';');
 			if (NULL == eol)
 			{
 				eol = defines + strlen(defines);
@@ -1057,7 +1060,7 @@ namespace bgfx
 			   &&  *parse != '\0')
 			{
 				parse = bx::strws(parse);
-				const char* eol = strchr(parse, ';');
+				const char* eol = bx::strnchr(parse, ';');
 				if (NULL == eol)
 				{
 					eol = bx::streol(parse);
@@ -1069,18 +1072,18 @@ namespace bgfx
 					const char* interpolation = NULL;
 					const char* typen = parse;
 
-					if (0 == strncmp(typen, "lowp", 4)
-					||  0 == strncmp(typen, "mediump", 7)
-					||  0 == strncmp(typen, "highp", 5) )
+					if (0 == bx::strncmp(typen, "lowp", 4)
+					||  0 == bx::strncmp(typen, "mediump", 7)
+					||  0 == bx::strncmp(typen, "highp", 5) )
 					{
 						precision = typen;
 						typen = parse = bx::strws(bx::strword(parse) );
 					}
 
-					if (0 == strncmp(typen, "flat", 4)
-					||  0 == strncmp(typen, "smooth", 6)
-					||  0 == strncmp(typen, "noperspective", 13)
-					||  0 == strncmp(typen, "centroid", 8) )
+					if (0 == bx::strncmp(typen, "flat", 4)
+					||  0 == bx::strncmp(typen, "smooth", 6)
+					||  0 == bx::strncmp(typen, "noperspective", 13)
+					||  0 == bx::strncmp(typen, "centroid", 8) )
 					{
 						interpolation = typen;
 						typen = parse = bx::strws(bx::strword(parse) );
@@ -1184,21 +1187,21 @@ namespace bgfx
 					const char* nl  = bx::strnl(eol);
 					input = const_cast<char*>(nl);
 
-					if (0 == strncmp(str, "input", 5) )
+					if (0 == bx::strncmp(str, "input", 5) )
 					{
 						str += 5;
-						const char* comment = strstr(str, "//");
+						const char* comment = bx::strnstr(str, "//");
 						eol = NULL != comment && comment < eol ? comment : eol;
 						inputHash = parseInOut(shaderInputs, str, eol);
 					}
-					else if (0 == strncmp(str, "output", 6) )
+					else if (0 == bx::strncmp(str, "output", 6) )
 					{
 						str += 6;
-						const char* comment = strstr(str, "//");
+						const char* comment = bx::strnstr(str, "//");
 						eol = NULL != comment && comment < eol ? comment : eol;
 						outputHash = parseInOut(shaderOutputs, str, eol);
 					}
-					else if (0 == strncmp(str, "raw", 3) )
+					else if (0 == bx::strncmp(str, "raw", 3) )
 					{
 						raw = true;
 						str += 3;
@@ -1268,7 +1271,7 @@ namespace bgfx
 			}
 			else if ('c' == shaderType) // Compute
 			{
-				char* entry = strstr(input, "void main()");
+				char* entry = const_cast<char*>(bx::strnstr(input, "void main()") );
 				if (NULL == entry)
 				{
 					fprintf(stderr, "Shader entry point 'void main()' is not found.\n");
@@ -1307,10 +1310,10 @@ namespace bgfx
 
 						uint32_t arg = 0;
 
-						const bool hasLocalInvocationID    = NULL != strstr(input, "gl_LocalInvocationID");
-						const bool hasLocalInvocationIndex = NULL != strstr(input, "gl_LocalInvocationIndex");
-						const bool hasGlobalInvocationID   = NULL != strstr(input, "gl_GlobalInvocationID");
-						const bool hasWorkGroupID          = NULL != strstr(input, "gl_WorkGroupID");
+						const bool hasLocalInvocationID    = NULL != bx::strnstr(input, "gl_LocalInvocationID");
+						const bool hasLocalInvocationIndex = NULL != bx::strnstr(input, "gl_LocalInvocationIndex");
+						const bool hasGlobalInvocationID   = NULL != bx::strnstr(input, "gl_GlobalInvocationID");
+						const bool hasWorkGroupID          = NULL != bx::strnstr(input, "gl_WorkGroupID");
 
 						if (hasLocalInvocationID)
 						{
@@ -1455,7 +1458,7 @@ namespace bgfx
 			}
 			else // Vertex/Fragment
 			{
-				char* entry = strstr(input, "void main()");
+				char* entry = const_cast<char*>(bx::strnstr(input, "void main()") );
 				if (NULL == entry)
 				{
 					fprintf(stderr, "Shader entry point 'void main()' is not found.\n");
@@ -1484,8 +1487,8 @@ namespace bgfx
 								const Varying& var = varyingIt->second;
 								const char* name = var.m_name.c_str();
 
-								if (0 == strncmp(name, "a_", 2)
-								||  0 == strncmp(name, "i_", 2) )
+								if (0 == bx::strncmp(name, "a_", 2)
+								||  0 == bx::strncmp(name, "i_", 2) )
 								{
 									preprocessor.writef("attribute %s %s %s %s;\n"
 											, var.m_precision.c_str()
@@ -1555,17 +1558,17 @@ namespace bgfx
 
 						if ('f' == shaderType)
 						{
-							const char* insert = strstr(entry, "{");
+							const char* insert = bx::strnstr(entry, "{");
 							if (NULL != insert)
 							{
 								insert = strInsert(const_cast<char*>(insert+1), "\nvec4 bgfx_VoidFrag = vec4_splat(0.0);\n");
 							}
 
-							const bool hasFragColor   = NULL != strstr(input, "gl_FragColor");
-							const bool hasFragCoord   = NULL != strstr(input, "gl_FragCoord") || hlsl > 3 || hlsl == 2;
-							const bool hasFragDepth   = NULL != strstr(input, "gl_FragDepth");
-							const bool hasFrontFacing = NULL != strstr(input, "gl_FrontFacing");
-							const bool hasPrimitiveId = NULL != strstr(input, "gl_PrimitiveID");
+							const bool hasFragColor   = NULL != bx::strnstr(input, "gl_FragColor");
+							const bool hasFragCoord   = NULL != bx::strnstr(input, "gl_FragCoord") || hlsl > 3 || hlsl == 2;
+							const bool hasFragDepth   = NULL != bx::strnstr(input, "gl_FragDepth");
+							const bool hasFrontFacing = NULL != bx::strnstr(input, "gl_FrontFacing");
+							const bool hasPrimitiveId = NULL != bx::strnstr(input, "gl_PrimitiveID");
 
 							bool hasFragData[8] = {};
 							uint32_t numFragData = 0;
@@ -1573,7 +1576,7 @@ namespace bgfx
 							{
 								char temp[32];
 								bx::snprintf(temp, BX_COUNTOF(temp), "gl_FragData[%d]", ii);
-								hasFragData[ii] = NULL != strstr(input, temp);
+								hasFragData[ii] = NULL != bx::strnstr(input, temp);
 								numFragData += hasFragData[ii];
 							}
 
@@ -1695,7 +1698,7 @@ namespace bgfx
 						}
 						else if ('v' == shaderType)
 						{
-							const char* brace = strstr(entry, "{");
+							const char* brace = bx::strnstr(entry, "{");
 							if (NULL != brace)
 							{
 								const char* end = bx::strmb(brace, '{', '}');
@@ -1874,6 +1877,7 @@ namespace bgfx
 									else
 									{
 										bx::stringPrintf(code, "#version %s\n", need130 ? "130" : profile);
+										glsl = 130;
 									}
 
 									if (usesGpuShader5)
@@ -1895,8 +1899,7 @@ namespace bgfx
 
 									if (usesTextureLod)
 									{
-										if ( (0 != metal || 130 > glsl)
-										&&  'f' == shaderType)
+										if ('f' == shaderType)
 										{
 											ARB_shader_texture_lod = true;
 											bx::stringPrintf(code
diff --git a/3rdparty/bgfx/tools/shaderc/shaderc_glsl.cpp b/3rdparty/bgfx/tools/shaderc/shaderc_glsl.cpp
index 10ab449..0b42a2f 100644
--- a/3rdparty/bgfx/tools/shaderc/shaderc_glsl.cpp
+++ b/3rdparty/bgfx/tools/shaderc/shaderc_glsl.cpp
@@ -50,6 +50,7 @@ namespace bgfx { namespace glsl
 
 			bool found = false
 				|| 3 == sscanf(log, "%u:%u(%u):", &source, &line, &column)
+				|| 2 == sscanf(log, "(%u,%u):", &line, &column)
 				;
 
 			if (found
@@ -108,21 +109,24 @@ namespace bgfx { namespace glsl
 				&&  *parse != '\0')
 			{
 				parse = bx::strws(parse);
-				const char* eol = strchr(parse, ';');
+				const char* eol = bx::strnchr(parse, ';');
 				if (NULL != eol)
 				{
 					const char* qualifier = parse;
 					parse = bx::strws(bx::strword(parse) );
 
-					if (0 == strncmp(qualifier, "attribute", 9)
-					||  0 == strncmp(qualifier, "varying", 7) )
+					if (0 == bx::strncmp(qualifier, "attribute", 9)
+					||  0 == bx::strncmp(qualifier, "varying",   7)
+					||  0 == bx::strncmp(qualifier, "in",        2)
+					||  0 == bx::strncmp(qualifier, "out",       3)
+					   )
 					{
 						// skip attributes and varyings.
 						parse = eol + 1;
 						continue;
 					}
 
-					if (0 != strncmp(qualifier, "uniform", 7) )
+					if (0 != bx::strncmp(qualifier, "uniform", 7) )
 					{
 						// end if there is no uniform keyword.
 						parse = NULL;
@@ -132,9 +136,9 @@ namespace bgfx { namespace glsl
 					const char* precision = NULL;
 					const char* typen = parse;
 
-					if (0 == strncmp(typen, "lowp", 4)
-					||  0 == strncmp(typen, "mediump", 7)
-					||  0 == strncmp(typen, "highp", 5) )
+					if (0 == bx::strncmp(typen, "lowp", 4)
+					||  0 == bx::strncmp(typen, "mediump", 7)
+					||  0 == bx::strncmp(typen, "highp", 5) )
 					{
 						precision = typen;
 						typen = parse = bx::strws(bx::strword(parse) );
@@ -145,32 +149,32 @@ namespace bgfx { namespace glsl
 					char uniformType[256];
 					parse = bx::strword(parse);
 
-					if (0 == strncmp(typen, "sampler", 7) )
+					if (0 == bx::strncmp(typen, "sampler", 7) )
 					{
-						strcpy(uniformType, "int");
+						bx::strlncpy(uniformType, BX_COUNTOF(uniformType), "int");
 					}
 					else
 					{
-						bx::strlcpy(uniformType, typen, parse-typen+1);
+						bx::strlcpy(uniformType, typen, int32_t(parse-typen+1) );
 					}
 
 					const char* name = parse = bx::strws(parse);
 
 					char uniformName[256];
 					uint8_t num = 1;
-					const char* array = bx::strnstr(name, "[", eol-parse);
+					const char* array = bx::strnstr(name, "[", int32_t(eol-parse) );
 					if (NULL != array)
 					{
-						bx::strlcpy(uniformName, name, array-name+1);
+						bx::strlcpy(uniformName, name, int32_t(array-name+1) );
 
 						char arraySize[32];
-						const char* end = bx::strnstr(array, "]", eol-array);
-						bx::strlcpy(arraySize, array+1, end-array);
+						const char* end = bx::strnstr(array, "]", int32_t(eol-array) );
+						bx::strlcpy(arraySize, array+1, int32_t(end-array) );
 						num = uint8_t(atoi(arraySize) );
 					}
 					else
 					{
-						bx::strlcpy(uniformName, name, eol-name+1);
+						bx::strlcpy(uniformName, name, int32_t(eol-name+1) );
 					}
 
 					Uniform un;
@@ -193,43 +197,43 @@ namespace bgfx { namespace glsl
 		}
 		else
 		{
-			const char* parse = strstr(optimizedShader, "struct xlatMtlShaderUniform {");
+			const char* parse = bx::strnstr(optimizedShader, "struct xlatMtlShaderUniform {");
 			const char* end   = parse;
 			if (NULL != parse)
 			{
-				parse += strlen("struct xlatMtlShaderUniform {");
-				end   = strstr(parse, "};");
+				parse += bx::strnlen("struct xlatMtlShaderUniform {");
+				end   =  bx::strnstr(parse, "};");
 			}
 
 			while ( parse < end
 			&&     *parse != '\0')
 			{
 				parse = bx::strws(parse);
-				const char* eol = strchr(parse, ';');
+				const char* eol = bx::strnchr(parse, ';');
 				if (NULL != eol)
 				{
 					const char* typen = parse;
 
 					char uniformType[256];
 					parse = bx::strword(parse);
-					bx::strlcpy(uniformType, typen, parse-typen+1);
+					bx::strlcpy(uniformType, typen, int32_t(parse-typen+1) );
 					const char* name = parse = bx::strws(parse);
 
 					char uniformName[256];
 					uint8_t num = 1;
-					const char* array = bx::strnstr(name, "[", eol-parse);
+					const char* array = bx::strnstr(name, "[", int32_t(eol-parse) );
 					if (NULL != array)
 					{
-						bx::strlcpy(uniformName, name, array-name+1);
+						bx::strlcpy(uniformName, name, int32_t(array-name+1) );
 
 						char arraySize[32];
-						const char* arrayEnd = bx::strnstr(array, "]", eol-array);
-						bx::strlcpy(arraySize, array+1, arrayEnd-array);
+						const char* arrayEnd = bx::strnstr(array, "]", int32_t(eol-array) );
+						bx::strlcpy(arraySize, array+1, int32_t(arrayEnd-array) );
 						num = uint8_t(atoi(arraySize) );
 					}
 					else
 					{
-						bx::strlcpy(uniformName, name, eol-name+1);
+						bx::strlcpy(uniformName, name, int32_t(eol-name+1) );
 					}
 
 					Uniform un;
diff --git a/3rdparty/bgfx/tools/shaderc/shaderc_hlsl.cpp b/3rdparty/bgfx/tools/shaderc/shaderc_hlsl.cpp
index 24bf9dc..fe74d58 100644
--- a/3rdparty/bgfx/tools/shaderc/shaderc_hlsl.cpp
+++ b/3rdparty/bgfx/tools/shaderc/shaderc_hlsl.cpp
@@ -520,7 +520,7 @@ namespace bgfx { namespace hlsl
 						, bindDesc.BindCount
 						);
 
-					const char * end = strstr(bindDesc.Name, "Sampler");
+					const char * end = bx::strnstr(bindDesc.Name, "Sampler");
 					if (NULL != end)
 					{
 						Uniform un;
diff --git a/3rdparty/bgfx/tools/shaderc/shaderc_spirv.cpp b/3rdparty/bgfx/tools/shaderc/shaderc_spirv.cpp
index 517192e..97a9029 100644
--- a/3rdparty/bgfx/tools/shaderc/shaderc_spirv.cpp
+++ b/3rdparty/bgfx/tools/shaderc/shaderc_spirv.cpp
@@ -495,7 +495,7 @@ namespace bgfx { namespace spirv
 				const SpvReflection::Id& id = it->second;
 				uint32_t num = uint32_t(id.members.size() );
 				if (0 < num
-				&&  0 != strcmp(id.var.name.c_str(), "gl_PerVertex") )
+				&&  0 != bx::strncmp(id.var.name.c_str(), "gl_PerVertex") )
 				{
 					printf("%3d: %s %d %s\n"
 						, it->first
@@ -606,7 +606,7 @@ namespace bgfx { namespace spirv
 				int32_t start   = 0;
 				int32_t end     = INT32_MAX;
 
-				const char* err = strstr(log, "ERROR:");
+				const char* err = bx::strnstr(log, "ERROR:");
 
 				bool found = false;
 
diff --git a/3rdparty/bgfx/tools/texturec/texturec.cpp b/3rdparty/bgfx/tools/texturec/texturec.cpp
index 3dd106a..b7e629f 100644
--- a/3rdparty/bgfx/tools/texturec/texturec.cpp
+++ b/3rdparty/bgfx/tools/texturec/texturec.cpp
@@ -3,69 +3,13 @@
  * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
  */
 
+#include <stdio.h>
 #include <bx/allocator.h>
 #include <bx/readerwriter.h>
 #include <bx/endian.h>
 
-#include <bgfx/bgfx.h>
-
-#include "image.h"
-
-#include <libsquish/squish.h>
-#include <etc1/etc1.h>
-#include <etc2/ProcessRGB.hpp>
-#include <nvtt/nvtt.h>
-#include <pvrtc/PvrTcEncoder.h>
-
-#include <edtaa3/edtaa3func.h>
-
-extern "C" {
-#include <iqa.h>
-}
-
-#define LODEPNG_NO_COMPILE_ENCODER
-#define LODEPNG_NO_COMPILE_DISK
-#define LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
-#define LODEPNG_NO_COMPILE_ERROR_TEXT
-#define LODEPNG_NO_COMPILE_ALLOCATORS
-#define LODEPNG_NO_COMPILE_CPP
-#include <lodepng/lodepng.cpp>
-
-void* lodepng_malloc(size_t _size)
-{
-	return ::malloc(_size);
-}
-
-void* lodepng_realloc(void* _ptr, size_t _size)
-{
-	return ::realloc(_ptr, _size);
-}
-
-void lodepng_free(void* _ptr)
-{
-	::free(_ptr);
-}
-
-BX_PRAGMA_DIAGNOSTIC_PUSH();
-BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wmissing-field-initializers");
-BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wshadow");
-BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wint-to-pointer-cast")
-#define STBI_MALLOC(_size)        lodepng_malloc(_size)
-#define STBI_REALLOC(_ptr, _size) lodepng_realloc(_ptr, _size)
-#define STBI_FREE(_ptr)           lodepng_free(_ptr)
-#define STB_IMAGE_IMPLEMENTATION
-#include <stb/stb_image.c>
-BX_PRAGMA_DIAGNOSTIC_POP();
-
-BX_PRAGMA_DIAGNOSTIC_PUSH()
-BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wtype-limits")
-BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wunused-parameter")
-BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wunused-value")
-BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4100) // error C4100: '' : unreferenced formal parameter
-#define MINIZ_NO_STDIO
-#define TINYEXR_IMPLEMENTATION
-#include <tinyexr/tinyexr.h>
-BX_PRAGMA_DIAGNOSTIC_POP()
+#include <bimg/decode.h>
+#include <bimg/encode.h>
 
 #if 0
 #	define BX_TRACE(_format, ...) fprintf(stderr, "" _format "\n", ##__VA_ARGS__)
@@ -76,403 +20,9 @@ BX_PRAGMA_DIAGNOSTIC_POP()
 #include <bx/crtimpl.h>
 #include <bx/uint32_t.h>
 
-namespace bgfx
-{
-	bool imageParse(ImageContainer& _imageContainer, const void* _data, uint32_t _size, void** _out)
-	{
-		*_out = NULL;
-		bool loaded = imageParse(_imageContainer, _data, _size);
-		if (!loaded)
-		{
-			bgfx::TextureFormat::Enum format = bgfx::TextureFormat::RGBA8;
-			uint32_t bpp = 32;
-
-			uint32_t width  = 0;
-			uint32_t height = 0;
-
-			uint8_t* out = NULL;
-			static uint8_t pngMagic[] = { 0x89, 0x50, 0x4E, 0x47, 0x0d, 0x0a };
-			if (0 == memcmp(_data, pngMagic, sizeof(pngMagic) ) )
-			{
-				unsigned error;
-				LodePNGState state;
-				lodepng_state_init(&state);
-				state.decoder.color_convert = 0;
-				error = lodepng_decode(&out, &width, &height, &state, (uint8_t*)_data, _size);
-
-				if (0 == error)
-				{
-					*_out = out;
-
-					switch (state.info_raw.bitdepth)
-					{
-					case 8:
-						switch (state.info_raw.colortype)
-						{
-						case LCT_GREY:
-							format = bgfx::TextureFormat::R8;
-							bpp    = 8;
-							break;
-
-						case LCT_GREY_ALPHA:
-							format = bgfx::TextureFormat::RG8;
-							bpp    = 16;
-							break;
-
-						case LCT_RGB:
-							format = bgfx::TextureFormat::RGB8;
-							bpp    = 24;
-							break;
-
-						case LCT_RGBA:
-							format = bgfx::TextureFormat::RGBA8;
-							bpp    = 32;
-							break;
-
-						case LCT_PALETTE:
-							break;
-						}
-						break;
-
-					case 16:
-						switch (state.info_raw.colortype)
-						{
-						case LCT_GREY:
-							for (uint32_t ii = 0, num = width*height; ii < num; ++ii)
-							{
-								uint16_t* rgba = (uint16_t*)out + ii;
-								rgba[0] = bx::toHostEndian(rgba[0], false);
-							}
-							format = bgfx::TextureFormat::R16;
-							bpp    = 16;
-							break;
-
-						case LCT_GREY_ALPHA:
-							for (uint32_t ii = 0, num = width*height; ii < num; ++ii)
-							{
-								uint16_t* rgba = (uint16_t*)out + ii*2;
-								rgba[0] = bx::toHostEndian(rgba[0], false);
-								rgba[1] = bx::toHostEndian(rgba[1], false);
-							}
-							format = bgfx::TextureFormat::RG16;
-							bpp    = 32;
-							break;
-
-						case LCT_RGBA:
-							for (uint32_t ii = 0, num = width*height; ii < num; ++ii)
-							{
-								uint16_t* rgba = (uint16_t*)out + ii*4;
-								rgba[0] = bx::toHostEndian(rgba[0], false);
-								rgba[1] = bx::toHostEndian(rgba[1], false);
-								rgba[2] = bx::toHostEndian(rgba[2], false);
-								rgba[3] = bx::toHostEndian(rgba[3], false);
-							}
-							format = bgfx::TextureFormat::RGBA16;
-							bpp    = 64;
-							break;
-
-						case LCT_RGB:
-						case LCT_PALETTE:
-							break;
-						}
-						break;
-
-					default:
-						break;
-					}
-				}
-
-				lodepng_state_cleanup(&state);
-			}
-			else
-			{
-				int comp = 0;
-				*_out = stbi_load_from_memory( (uint8_t*)_data, _size, (int*)&width, (int*)&height, &comp, 4);
-			}
-
-			loaded = NULL != *_out;
-
-			if (loaded)
-			{
-				_imageContainer.m_data      = *_out;
-				_imageContainer.m_size      = width*height*bpp/8;
-				_imageContainer.m_offset    = 0;
-				_imageContainer.m_width     = width;
-				_imageContainer.m_height    = height;
-				_imageContainer.m_depth     = 1;
-				_imageContainer.m_numLayers = 1;
-				_imageContainer.m_format    = format;
-				_imageContainer.m_numMips   = 1;
-				_imageContainer.m_hasAlpha  = true;
-				_imageContainer.m_cubeMap   = false;
-				_imageContainer.m_ktx       = false;
-				_imageContainer.m_ktxLE     = false;
-				_imageContainer.m_srgb      = false;
-			}
-		}
-
-		return loaded;
-	}
-
-	bool imageEncodeFromRgba8(void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint8_t _format)
-	{
-		TextureFormat::Enum format = TextureFormat::Enum(_format);
-
-		switch (format)
-		{
-		case TextureFormat::BC1:
-		case TextureFormat::BC2:
-		case TextureFormat::BC3:
-		case TextureFormat::BC4:
-		case TextureFormat::BC5:
-			squish::CompressImage( (const uint8_t*)_src, _width, _height, _dst
-				, format == TextureFormat::BC2 ? squish::kDxt3
-				: format == TextureFormat::BC3 ? squish::kDxt5
-				: format == TextureFormat::BC4 ? squish::kBc4
-				: format == TextureFormat::BC5 ? squish::kBc5
-				:                                squish::kDxt1
-				);
-			return true;
-
-		case TextureFormat::BC6H:
-			nvtt::compressBC6H( (const uint8_t*)_src, _width, _height, 4, _dst);
-			return true;
-
-		case TextureFormat::BC7:
-			nvtt::compressBC7( (const uint8_t*)_src, _width, _height, 4, _dst);
-			return true;
-
-		case TextureFormat::ETC1:
-			etc1_encode_image( (const uint8_t*)_src, _width, _height, 4, _width*4, (uint8_t*)_dst);
-			return true;
-
-		case TextureFormat::ETC2:
-			{
-				const uint32_t blockWidth  = (_width +3)/4;
-				const uint32_t blockHeight = (_height+3)/4;
-				const uint32_t pitch = _width*4;
-				const uint8_t* src = (const uint8_t*)_src;
-				uint64_t* dst = (uint64_t*)_dst;
-				for (uint32_t yy = 0; yy < blockHeight; ++yy)
-				{
-					for (uint32_t xx = 0; xx < blockWidth; ++xx)
-					{
-						uint8_t block[4*4*4];
-						const uint8_t* ptr = &src[(yy*pitch+xx*4)*4];
-
-						for (uint32_t ii = 0; ii < 16; ++ii)
-						{ // BGRx
-							memcpy(&block[ii*4], &ptr[(ii%4)*pitch + (ii&~3)], 4);
-							bx::xchg(block[ii*4+0], block[ii*4+2]);
-						}
-
-						*dst++ = ProcessRGB_ETC2(block);
-					}
-				}
-			}
-			return true;
-
-		case TextureFormat::PTC14:
-			{
-				using namespace Javelin;
-				RgbaBitmap bmp;
-				bmp.width  = _width;
-				bmp.height = _height;
-				bmp.data   = (uint8_t*)const_cast<void*>(_src);
-				PvrTcEncoder::EncodeRgb4Bpp(_dst, bmp);
-				bmp.data = NULL;
-			}
-			return true;
-
-		case TextureFormat::PTC14A:
-			{
-				using namespace Javelin;
-				RgbaBitmap bmp;
-				bmp.width  = _width;
-				bmp.height = _height;
-				bmp.data   = (uint8_t*)const_cast<void*>(_src);
-				PvrTcEncoder::EncodeRgba4Bpp(_dst, bmp);
-				bmp.data = NULL;
-			}
-			return true;
-
-		case TextureFormat::BGRA8:
-			imageSwizzleBgra8(_dst, _width, _height, _width*4, _src);
-			return true;
-
-		case TextureFormat::RGBA8:
-			memcpy(_dst, _src, _width*_height*4);
-			return true;
-
-		default:
-			return imageConvert(_dst, format, _src, TextureFormat::RGBA8, _width, _height);
-		}
-
-		return false;
-	}
-
-	bool imageEncodeFromRgba32f(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, uint8_t _format)
-	{
-		TextureFormat::Enum format = TextureFormat::Enum(_format);
-
-		const uint8_t* src = (const uint8_t*)_src;
-
-		switch (format)
-		{
-		case TextureFormat::RGBA8:
-			{
-				uint8_t* dst = (uint8_t*)_dst;
-				for (uint32_t yy = 0; yy < _height; ++yy)
-				{
-					for (uint32_t xx = 0; xx < _width; ++xx)
-					{
-						const uint32_t offset = yy*_width + xx;
-						const float* input = (const float*)&src[offset * 16];
-						uint8_t* output    = &dst[offset * 4];
-						output[0] = uint8_t(input[0]*255.0f + 0.5f);
-						output[1] = uint8_t(input[1]*255.0f + 0.5f);
-						output[2] = uint8_t(input[2]*255.0f + 0.5f);
-						output[3] = uint8_t(input[3]*255.0f + 0.5f);
-					}
-				}
-			}
-			return true;
-
-		case TextureFormat::BC5:
-			{
-				uint8_t* temp = (uint8_t*)BX_ALLOC(_allocator, _width*_height*4);
-				for (uint32_t yy = 0; yy < _height; ++yy)
-				{
-					for (uint32_t xx = 0; xx < _width; ++xx)
-					{
-						const uint32_t offset = yy*_width + xx;
-						const float* input = (const float*)&src[offset * 16];
-						uint8_t* output    = &temp[offset * 4];
-						output[0] = uint8_t(input[0]*255.0f + 0.5f);
-						output[1] = uint8_t(input[1]*255.0f + 0.5f);
-						output[2] = uint8_t(input[2]*255.0f + 0.5f);
-						output[3] = uint8_t(input[3]*255.0f + 0.5f);
-					}
-				}
-
-				imageEncodeFromRgba8(_dst, temp, _width, _height, _format);
-				BX_FREE(_allocator, temp);
-			}
-			return true;
-
-		default:
-			return imageConvert(_dst, format, _src, TextureFormat::RGBA32F, _width, _height);
-		}
-
-		return false;
-	}
-
-	void imageRgba32f11to01(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src)
-	{
-		const uint8_t* src = (const uint8_t*)_src;
-		uint8_t* dst = (uint8_t*)_dst;
-
-		for (uint32_t yy = 0; yy < _height; ++yy)
-		{
-			for (uint32_t xx = 0; xx < _width; ++xx)
-			{
-				const uint32_t offset = yy*_pitch + xx * 16;
-				const float* input = (const float*)&src[offset];
-				float* output = (float*)&dst[offset];
-				output[0] = input[0]*0.5f + 0.5f;
-				output[1] = input[1]*0.5f + 0.5f;
-				output[2] = input[2]*0.5f + 0.5f;
-				output[3] = input[3]*0.5f + 0.5f;
-			}
-		}
-	}
-
-	static void edtaa3(bx::AllocatorI* _allocator, double* _dst, uint32_t _width, uint32_t _height, double* _src)
-	{
-		const uint32_t numPixels = _width*_height;
-
-		short* xdist = (short *)BX_ALLOC(_allocator, numPixels*sizeof(short) );
-		short* ydist = (short *)BX_ALLOC(_allocator, numPixels*sizeof(short) );
-		double* gx   = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
-		double* gy   = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
-
-		::computegradient(_src, _width, _height, gx, gy);
-		::edtaa3(_src, gx, gy, _width, _height, xdist, ydist, _dst);
-
-		for (uint32_t ii = 0; ii < numPixels; ++ii)
-		{
-			if (_dst[ii] < 0.0)
-			{
-				_dst[ii] = 0.0;
-			}
-		}
-
-		BX_FREE(_allocator, xdist);
-		BX_FREE(_allocator, ydist);
-		BX_FREE(_allocator, gx);
-		BX_FREE(_allocator, gy);
-	}
-
-	inline double min(double _a, double _b)
-	{
-		return _a > _b ? _b : _a;
-	}
-
-	inline double max(double _a, double _b)
-	{
-		return _a > _b ? _a : _b;
-	}
-
-	inline double clamp(double _val, double _min, double _max)
-	{
-		return max(min(_val, _max), _min);
-	}
-
-	void imageMakeDist(bx::AllocatorI* _allocator, void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, float _edge, const void* _src)
-	{
-		const uint32_t numPixels = _width*_height;
-
-		double* imgIn   = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
-		double* outside = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
-		double* inside  = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
-
-		for (uint32_t yy = 0; yy < _height; ++yy)
-		{
-			const uint8_t* src = (const uint8_t*)_src + yy*_pitch;
-			double* dst = &imgIn[yy*_width];
-			for (uint32_t xx = 0; xx < _width; ++xx)
-			{
-				dst[xx] = double(src[xx])/255.0;
-			}
-		}
-
-		edtaa3(_allocator, outside, _width, _height, imgIn);
-
-		for (uint32_t ii = 0; ii < numPixels; ++ii)
-		{
-			imgIn[ii] = 1.0 - imgIn[ii];
-		}
-
-		edtaa3(_allocator, inside, _width, _height, imgIn);
-
-		BX_FREE(_allocator, imgIn);
-
-		uint8_t* dst = (uint8_t*)_dst;
-
-		double edgeOffset = _edge*0.5;
-		double invEdge = 1.0/_edge;
-
-		for (uint32_t ii = 0; ii < numPixels; ++ii)
-		{
-			double dist = clamp( ( (outside[ii] - inside[ii])+edgeOffset) * invEdge, 0.0, 1.0);
-			dst[ii] = 255-uint8_t(dist * 255.0);
-		}
-
-		BX_FREE(_allocator, inside);
-		BX_FREE(_allocator, outside);
-	}
-
-} // namespace bgfx
+extern "C" {
+#include <iqa.h>
+}
 
 void help(const char* _error = NULL)
 {
@@ -567,28 +117,20 @@ int main(int _argc, const char* _argv[])
 	bx::close(&reader);
 
 	{
-		using namespace bgfx;
+		using namespace bimg;
 
-		uint8_t* decodedImage = NULL;
-		ImageContainer input;
+		ImageContainer* input = imageParse(&allocator, inputData, inputSize);
 
-		bool loaded = imageParse(input, inputData, inputSize, (void**)&decodedImage);
-		if (NULL != decodedImage)
+		if (NULL != input)
 		{
 			BX_FREE(&allocator, inputData);
 
-			inputData = (uint8_t*)input.m_data;
-			inputSize = input.m_size;
-		}
-
-		if (loaded)
-		{
 			const char* type = cmdLine.findOption('t');
-			bgfx::TextureFormat::Enum format = input.m_format;
+			bimg::TextureFormat::Enum format = input->m_format;
 
 			if (NULL != type)
 			{
-				format = bgfx::getFormat(type);
+				format = bimg::getFormat(type);
 
 				if (!isValid(format) )
 				{
@@ -600,10 +142,10 @@ int main(int _argc, const char* _argv[])
 			ImageContainer* output = NULL;
 
 			ImageMip mip;
-			if (imageGetRawData(input, 0, 0, inputData, inputSize, mip) )
+			if (imageGetRawData(*input, 0, 0, input->m_data, input->m_size, mip) )
 			{
 				uint8_t numMips = mips
-					? imageGetNumMips(format, mip.m_width, mip.m_height)
+					? imageGetNumMips(format, uint16_t(mip.m_width), uint16_t(mip.m_height) )
 					: 1
 					;
 
@@ -611,7 +153,7 @@ int main(int _argc, const char* _argv[])
 
 				if (normalMap)
 				{
-					output = imageAlloc(&allocator, format, mip.m_width, mip.m_height, 0, 1, false, mips);
+					output = imageAlloc(&allocator, format, uint16_t(mip.m_width), uint16_t(mip.m_height), 0, 1, false, mips);
 
 					ImageMip dstMip;
 					imageGetRawData(*output, 0, 0, NULL, 0, dstMip);
@@ -631,8 +173,8 @@ int main(int _argc, const char* _argv[])
 
 					uint32_t size = imageGetSize(
 						  NULL
-						, dstMip.m_width
-						, dstMip.m_height
+						, uint16_t(dstMip.m_width)
+						, uint16_t(dstMip.m_height)
 						, 0
 						, false
 						, false
@@ -682,9 +224,9 @@ int main(int _argc, const char* _argv[])
 
 					BX_FREE(&allocator, rgbaDst);
 				}
-				else if (8 != getBlockInfo(input.m_format).rBits)
+				else if (8 != getBlockInfo(input->m_format).rBits)
 				{
-					output = imageAlloc(&allocator, format, mip.m_width, mip.m_height, 0, 1, false, mips);
+					output = imageAlloc(&allocator, format, uint16_t(mip.m_width), uint16_t(mip.m_height), 0, 1, false, mips);
 
 					ImageMip dstMip;
 					imageGetRawData(*output, 0, 0, NULL, 0, dstMip);
@@ -704,8 +246,8 @@ int main(int _argc, const char* _argv[])
 
 					uint32_t size = imageGetSize(
 						  NULL
-						, dstMip.m_width
-						, dstMip.m_height
+						, uint16_t(dstMip.m_width)
+						, uint16_t(dstMip.m_height)
 						, 0
 						, false
 						, false
@@ -753,7 +295,7 @@ int main(int _argc, const char* _argv[])
 				}
 				else
 				{
-					output = imageAlloc(&allocator, format, mip.m_width, mip.m_height, 0, 1, false, mips);
+					output = imageAlloc(&allocator, format, uint16_t(mip.m_width), uint16_t(mip.m_height), 0, 1, false, mips);
 
 					ImageMip dstMip;
 					imageGetRawData(*output, 0, 0, NULL, 0, dstMip);
@@ -773,8 +315,8 @@ int main(int _argc, const char* _argv[])
 
 					uint32_t size = imageGetSize(
 						  NULL
-						, dstMip.m_width
-						, dstMip.m_height
+						, uint16_t(dstMip.m_width)
+						, uint16_t(dstMip.m_height)
 						, 0
 						, false
 						, false
@@ -782,7 +324,7 @@ int main(int _argc, const char* _argv[])
 						, TextureFormat::RGBA8
 						);
 					temp = BX_ALLOC(&allocator, size);
-					memset(temp, 0, size);
+					bx::memSet(temp, 0, size);
 					uint8_t* rgba = (uint8_t*)temp;
 
 					imageDecodeToRgba8(rgba
@@ -797,7 +339,7 @@ int main(int _argc, const char* _argv[])
 					if (iqa)
 					{
 						ref = BX_ALLOC(&allocator, size);
-						memcpy(ref, rgba, size);
+						bx::memCopy(ref, rgba, size);
 					}
 
 					imageEncodeFromRgba8(output->m_data, rgba, dstMip.m_width, dstMip.m_height, format);
@@ -879,8 +421,6 @@ int main(int _argc, const char* _argv[])
 			help("Failed to load input file.");
 			return EXIT_FAILURE;
 		}
-
-		BX_FREE(&allocator, inputData);
 	}
 
 	return EXIT_SUCCESS;
diff --git a/3rdparty/bgfx/tools/texturev/texturev.cpp b/3rdparty/bgfx/tools/texturev/texturev.cpp
index bd582c8..182f56f 100644
--- a/3rdparty/bgfx/tools/texturev/texturev.cpp
+++ b/3rdparty/bgfx/tools/texturev/texturev.cpp
@@ -24,7 +24,7 @@
 #include <string>
 namespace stl = tinystl;
 
-#include "image.h"
+#include <bimg/decode.h>
 
 #include <bgfx/embedded_shader.h>
 
@@ -312,7 +312,7 @@ struct View
 			{
 				if (0 == (item->d_type & DT_DIR) )
 				{
-					const char* ext = strrchr(item->d_name, '.');
+					const char* ext = bx::strnrchr(item->d_name, '.');
 					if (NULL != ext)
 					{
 						ext += 1;
@@ -856,13 +856,20 @@ int _main_(int _argc, char** _argv)
 						);
 
 				std::string title;
-				bx::stringPrintf(title, "%s (%d x %d%s, %s)"
-					, filePath
-					, view.m_info.width
-					, view.m_info.height
-					, view.m_info.cubeMap ? " CubeMap" : ""
-					, bgfx::getName(view.m_info.format)
-					);
+				if (isValid(texture) )
+				{
+					bx::stringPrintf(title, "%s (%d x %d%s, %s)"
+						, filePath
+						, view.m_info.width
+						, view.m_info.height
+						, view.m_info.cubeMap ? " CubeMap" : ""
+						, bimg::getName(bimg::TextureFormat::Enum(view.m_info.format) )
+						);
+				}
+				else
+				{
+					bx::stringPrintf(title, "Failed to load %s!", filePath);
+				}
 				entry::WindowHandle handle = { 0 };
 				entry::setWindowTitle(handle, title.c_str() );
 			}
diff --git a/3rdparty/bimg/3rdparty/edtaa3/LICENSE.md b/3rdparty/bimg/3rdparty/edtaa3/LICENSE.md
new file mode 100644
index 0000000..93e6a94
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/edtaa3/LICENSE.md
@@ -0,0 +1,34 @@
+https://github.com/OpenGLInsights/OpenGLInsightsCode/blob/master/Chapter%2012%202D%20Shape%20Rendering%20by%20Distance%20Fields/LICENSE.txt
+
+The C code and the GLSL code for the OpenGL demo is public
+domain code. The distance transform code in the console
+application to create distance field textures, located in
+the file "edtaa3func.c", is MIT licensed, and free to use
+under the following conditions.
+
+https://github.com/OpenGLInsights/OpenGLInsightsCode/issues/6#issuecomment-67829157
+
+----
+
+Copyright (C) 2011 by Stefan Gustavson
+(stefan.gustavson@liu.se)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+----
diff --git a/3rdparty/bimg/3rdparty/edtaa3/edtaa3func.cpp b/3rdparty/bimg/3rdparty/edtaa3/edtaa3func.cpp
new file mode 100644
index 0000000..e209529
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/edtaa3/edtaa3func.cpp
@@ -0,0 +1,580 @@
+/*
+ * edtaa3()
+ *
+ * Sweep-and-update Euclidean distance transform of an
+ * image. Positive pixels are treated as object pixels,
+ * zero or negative pixels are treated as background.
+ * An attempt is made to treat antialiased edges correctly.
+ * The input image must have pixels in the range [0,1],
+ * and the antialiased image should be a box-filter
+ * sampling of the ideal, crisp edge.
+ * If the antialias region is more than 1 pixel wide,
+ * the result from this transform will be inaccurate.
+ *
+ * By Stefan Gustavson (stefan.gustavson@gmail.com).
+ *
+ * Originally written in 1994, based on a verbal
+ * description of Per-Erik Danielsson's SSED8 algorithm
+ * as presented in the PhD dissertation of Ingemar
+ * Ragnemalm. This is Per-Erik Danielsson's scanline
+ * scheme from 1979 - I only implemented it in C.
+ *
+ * Updated in 2004 to treat border pixels correctly,
+ * and cleaned up the code to improve readability.
+ *
+ * Updated in 2009 to handle anti-aliased edges,
+ * as published in the article "Anti-aliased Euclidean
+ * distance transform" by Stefan Gustavson and Robin Strand,
+ * Pattern Recognition Letters 32 (2011) 252-257.
+ *
+ * Updated in 2011 to avoid a corner case causing an
+ * infinite loop for some input data.
+ *
+*/
+
+/*
+
+Copyright (C) 2011 by Stefan Gustavson
+
+(stefan.gustavson@liu.se)
+
+This code is distributed under the permissive "MIT license":
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
+*/
+
+#include <math.h>
+
+/*
+ * Compute the local gradient at edge pixels using 3x3 convolution filters.
+ * Only interior pixels with 0 < img < 1 are written (normalized to unit length
+ * when non-zero); every other gx/gy entry is left as the caller provided it.
+ */
+void computegradient(double *img, int w, int h, double *gx, double *gy)
+{
+    int i,j,k; // i: row, j: column, k: linear index i*w + j
+    double glength; // Squared gradient length first, then the length itself
+#define SQRT2 1.4142136
+    for(i = 1; i < h-1; i++) { // Avoid edges where the kernels would spill over
+        for(j = 1; j < w-1; j++) {
+            k = i*w + j;
+            if((img[k]>0.0) && (img[k]<1.0)) { // Compute gradient for edge pixels only
+                gx[k] = -img[k-w-1] - SQRT2*img[k-1] - img[k+w-1] + img[k-w+1] + SQRT2*img[k+1] + img[k+w+1];
+                gy[k] = -img[k-w-1] - SQRT2*img[k-w] - img[k-w+1] + img[k+w-1] + SQRT2*img[k+w] + img[k+w+1];
+                glength = gx[k]*gx[k] + gy[k]*gy[k];
+                if(glength > 0.0) { // Avoid division by zero
+                    glength = sqrt(glength);
+                    gx[k]=gx[k]/glength;
+                    gy[k]=gy[k]/glength;
+                }
+            }
+        }
+    }
+    // TODO: Compute reasonable values for gx, gy also around the image edges.
+    // (They keep the caller's initial values - presumably zero, TODO confirm - which
+	// reduces accuracy in a 1-pixel wide region at the image edge.) 2x2 kernels would suit.
+}
+
+/*
+ * A somewhat tricky function to approximate the distance to an edge in a
+ * certain pixel, given the pixel greyscale value a and a direction vector:
+ * either the local gradient (gx,gy) or the direction to the pixel (dx,dy).
+ * The latter alternative, using (dx,dy), is the metric used by edtaa2().
+ * The local gradient yields much better accuracy at and near edges, and
+ * reduces the error even at distant pixels if its direction is accurate.
+ * The vector is normalized here. Returns 0.5-a when either component is zero.
+ */
+double edgedf(double gx, double gy, double a)
+{
+    double df, glength, temp, a1;
+
+    if ((gx == 0) || (gy == 0)) { // Either A) gx or gy is zero, or B) both are
+        df = 0.5-a;  // Linear approximation is A) correct or B) a fair guess
+    } else {
+        glength = sqrt(gx*gx + gy*gy);
+        if(glength>0) {
+            gx = gx/glength;
+            gy = gy/glength;
+        }
+        /* Everything is symmetric wrt sign and transposition,
+         * so move to first octant (gx>=0, gy>=0, gx>=gy) to
+         * avoid handling all possible edge directions.
+         */
+        gx = fabs(gx);
+        gy = fabs(gy);
+        if(gx<gy) {
+            temp = gx;
+            gx = gy;
+            gy = temp;
+        }
+        a1 = 0.5*gy/gx; // gx >= gy after the swap, so a1 <= 0.5
+        if (a < a1) { // a < a1 (callers pass a >= 0)
+            df = 0.5*(gx + gy) - sqrt(2.0*gx*gy*a);
+        } else if (a < (1.0-a1)) { // a1 <= a < 1-a1
+            df = (0.5-a)*gx;
+        } else { // a >= 1-a1 (callers pass a <= 1)
+            df = -0.5*(gx + gy) + sqrt(2.0*gx*gy*(1.0-a));
+        }
+    }    
+    return df;
+}
+// Distance estimate for integer offset (xi,yi) to the edge pixel at index c-xc-yc*w; returns 1000000.0 if that pixel's value is <= 0.
+double distaa3(double *img, double *gximg, double *gyimg, int w, int c, int xc, int yc, int xi, int yi)
+{
+  double di, df, dx, dy, gx, gy, a; // di: integer-vector length, df: sub-pixel term from edgedf()
+  int closest; // Linear index of the edge pixel that (xc,yc) refers to
+  
+  closest = c-xc-yc*w; // Index to the edge pixel pointed to from c
+  a = img[closest];    // Grayscale value at the edge pixel
+  gx = gximg[closest]; // X gradient component at the edge pixel
+  gy = gyimg[closest]; // Y gradient component at the edge pixel
+  
+  if(a > 1.0) a = 1.0;
+  if(a < 0.0) a = 0.0; // Clip grayscale values outside the range [0,1]
+  if(a == 0.0) return 1000000.0; // Not an object pixel, return "very far" ("don't know yet")
+
+  dx = (double)xi;
+  dy = (double)yi;
+  di = sqrt(dx*dx + dy*dy); // Length of integer vector, like a traditional EDT
+  if(di==0) { // Use local gradient only at edges
+      // Estimate based on local gradient only
+      df = edgedf(gx, gy, a);
+  } else {
+      // Estimate gradient based on direction to edge (accurate for large di)
+      df = edgedf(dx, dy, a);
+  }
+  return di + df; // Same metric as edtaa2, except at edges (where di=0)
+}
+
+// Shorthand macro: add ubiquitous parameters img, gx, gy and w and call distaa3()
+#define DISTAA(c,xc,yc,xi,yi) (distaa3(img, gx, gy, w, c, xc, yc, xi, yi))
+// Repeat forward and backward raster sweeps over the w-by-h image until no pixel changes, filling dist[] and the offsets distx[]/disty[].
+void edtaa3(double *img, double *gx, double *gy, int w, int h, short *distx, short *disty, double *dist)
+{
+  int x, y, i, c; // x,y: column/row counters, i: current pixel index, c: candidate neighbor index
+  int offset_u, offset_ur, offset_r, offset_rd,
+  offset_d, offset_dl, offset_l, offset_lu;
+  double olddist, newdist;
+  int cdistx, cdisty, newdistx, newdisty;
+  int changed; // Set to 1 whenever a sweep updates at least one pixel
+  double epsilon = 1e-3; // Safeguard against errors due to limited precision
+
+  /* Initialize index offsets for the current image width */
+  offset_u = -w;
+  offset_ur = -w+1;
+  offset_r = 1;
+  offset_rd = w+1;
+  offset_d = w;
+  offset_dl = w-1;
+  offset_l = -1;
+  offset_lu = -w-1;
+
+  /* Initialize the distance images */
+  for(i=0; i<w*h; i++) {
+    distx[i] = 0; // At first, all pixels point to
+    disty[i] = 0; // themselves as the closest known.
+    if(img[i] <= 0.0)
+      {
+	dist[i]= 1000000.0; // Big value, means "not set yet"
+      }
+    else if (img[i]<1.0) {
+      dist[i] = edgedf(gx[i], gy[i], img[i]); // Gradient-assisted estimate
+    }
+    else {
+      dist[i]= 0.0; // Inside the object
+    }
+  }
+
+  /* Perform the transformation */
+  do
+    {
+      changed = 0;
+
+      /* Scan rows, except first row */
+      for(y=1; y<h; y++)
+        {
+
+          /* move index to leftmost pixel of current row */
+          i = y*w;
+
+          /* scan right, propagate distances from above & left */
+
+          /* Leftmost pixel is special, has no left neighbors */
+          olddist = dist[i];
+          if(olddist > 0) // If non-zero distance or not set yet
+            {
+	      c = i + offset_u; // Index of candidate for testing
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx;
+              newdisty = cdisty+1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon) // Accept only an improvement larger than epsilon
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_ur;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx-1;
+              newdisty = cdisty+1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  changed = 1;
+                }
+            }
+          i++;
+
+          /* Middle pixels have all neighbors */
+          for(x=1; x<w-1; x++, i++)
+            {
+              olddist = dist[i];
+              if(olddist <= 0) continue; // No need to update further
+
+	      c = i+offset_l;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx+1;
+              newdisty = cdisty;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_lu;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx+1;
+              newdisty = cdisty+1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_u;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx;
+              newdisty = cdisty+1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_ur;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx-1;
+              newdisty = cdisty+1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  changed = 1;
+                }
+            }
+
+          /* Rightmost pixel of row is special, has no right neighbors */
+          olddist = dist[i];
+          if(olddist > 0) // If not already zero distance
+            {
+	      c = i+offset_l;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx+1;
+              newdisty = cdisty;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_lu;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx+1;
+              newdisty = cdisty+1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_u;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx;
+              newdisty = cdisty+1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  changed = 1;
+                }
+            }
+
+          /* Move index to second rightmost pixel of current row. */
+          /* Rightmost pixel is skipped, it has no right neighbor. */
+          i = y*w + w-2;
+
+          /* scan left, propagate distance from right */
+          for(x=w-2; x>=0; x--, i--)
+            {
+              olddist = dist[i];
+              if(olddist <= 0) continue; // Already zero distance
+
+	      c = i+offset_r;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx-1;
+              newdisty = cdisty;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  changed = 1;
+                }
+            }
+        }
+      
+      /* Scan rows in reverse order, except last row */
+      for(y=h-2; y>=0; y--)
+        {
+          /* move index to rightmost pixel of current row */
+          i = y*w + w-1;
+
+          /* Scan left, propagate distances from below & right */
+
+          /* Rightmost pixel is special, has no right neighbors */
+          olddist = dist[i];
+          if(olddist > 0) // If not already zero distance
+            {
+	      c = i+offset_d;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx;
+              newdisty = cdisty-1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_dl;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx+1;
+              newdisty = cdisty-1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  changed = 1;
+                }
+            }
+          i--;
+
+          /* Middle pixels have all neighbors */
+          for(x=w-2; x>0; x--, i--)
+            {
+              olddist = dist[i];
+              if(olddist <= 0) continue; // Already zero distance
+
+	      c = i+offset_r;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx-1;
+              newdisty = cdisty;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_rd;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx-1;
+              newdisty = cdisty-1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_d;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx;
+              newdisty = cdisty-1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_dl;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx+1;
+              newdisty = cdisty-1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  changed = 1;
+                }
+            }
+          /* Leftmost pixel is special, has no left neighbors */
+          olddist = dist[i];
+          if(olddist > 0) // If not already zero distance
+            {
+	      c = i+offset_r;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx-1;
+              newdisty = cdisty;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_rd;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx-1;
+              newdisty = cdisty-1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  olddist=newdist;
+                  changed = 1;
+                }
+
+	      c = i+offset_d;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx;
+              newdisty = cdisty-1;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  changed = 1;
+                }
+            }
+
+          /* Move index to second leftmost pixel of current row. */
+          /* Leftmost pixel is skipped, it has no left neighbor. */
+          i = y*w + 1;
+          for(x=1; x<w; x++, i++)
+            {
+              /* scan right, propagate distance from left */
+              olddist = dist[i];
+              if(olddist <= 0) continue; // Already zero distance
+
+	      c = i+offset_l;
+	      cdistx = distx[c];
+	      cdisty = disty[c];
+              newdistx = cdistx+1;
+              newdisty = cdisty;
+              newdist = DISTAA(c, cdistx, cdisty, newdistx, newdisty);
+              if(newdist < olddist-epsilon)
+                {
+                  distx[i]=newdistx;
+                  disty[i]=newdisty;
+                  dist[i]=newdist;
+                  changed = 1;
+                }
+            }
+        }
+    }
+  while(changed); // Sweep until no more updates are made
+
+  /* The transformation is completed. */
+
+}
diff --git a/3rdparty/bimg/3rdparty/edtaa3/edtaa3func.h b/3rdparty/bimg/3rdparty/edtaa3/edtaa3func.h
new file mode 100644
index 0000000..6052aa4
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/edtaa3/edtaa3func.h
@@ -0,0 +1,7 @@
+#ifndef EDTAA3_H_HEADER_GUARD
+#define EDTAA3_H_HEADER_GUARD
+
+extern void computegradient(double *img, int w, int h, double *gx, double *gy);
+extern void edtaa3(double *img, double *gx, double *gy, int w, int h, short *distx, short *disty, double *dist);
+
+#endif // EDTAA3_H_HEADER_GUARD
diff --git a/3rdparty/bimg/3rdparty/etc1/LICENSE b/3rdparty/bimg/3rdparty/etc1/LICENSE
new file mode 100644
index 0000000..64635a4
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc1/LICENSE
@@ -0,0 +1,161 @@
+Apache License
+
+Version 2.0, January 2004
+
+http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+"License" shall mean the terms and conditions for use, reproduction, and
+distribution as defined by Sections 1 through 9 of this document.
+
+"Licensor" shall mean the copyright owner or entity authorized by the
+copyright owner that is granting the License.
+
+"Legal Entity" shall mean the union of the acting entity and all other
+entities that control, are controlled by, or are under common control with
+that entity. For the purposes of this definition, "control" means (i) the
+power, direct or indirect, to cause the direction or management of such 
+entity, whether by contract or otherwise, or (ii) ownership of fifty 
+percent (50%) or more of the outstanding shares, or (iii) beneficial 
+ownership of such entity.
+
+"You" (or "Your") shall mean an individual or Legal Entity exercising 
+permissions granted by this License.
+
+"Source" form shall mean the preferred form for making modifications, 
+including but not limited to software source code, documentation 
+source, and configuration files.
+
+"Object" form shall mean any form resulting from mechanical transformation 
+or translation of a Source form, including but not limited to compiled 
+object code, generated documentation, and conversions to other media types.
+
+"Work" shall mean the work of authorship, whether in Source or Object 
+form, made available under the License, as indicated by a copyright 
+notice that is included in or attached to the work (an example is 
+provided in the Appendix below).
+
+"Derivative Works" shall mean any work, whether in Source or Object 
+form, that is based on (or derived from) the Work and for which the 
+editorial revisions, annotations, elaborations, or other modifications 
+represent, as a whole, an original work of authorship. For the purposes 
+of this License, Derivative Works shall not include works that remain 
+separable from, or merely link (or bind by name) to the interfaces of, 
+the Work and Derivative Works thereof.
+
+"Contribution" shall mean any work of authorship, including the original 
+version of the Work and any modifications or additions to that Work or 
+Derivative Works thereof, that is intentionally submitted to Licensor 
+for inclusion in the Work by the copyright owner or by an individual or 
+Legal Entity authorized to submit on behalf of the copyright owner. For 
+the purposes of this definition, "submitted" means any form of electronic, 
+verbal, or written communication sent to the Licensor or its 
+representatives, including but not limited to communication on electronic 
+mailing lists, source code control systems, and issue tracking systems that 
+are managed by, or on behalf of, the Licensor for the purpose of discussing 
+and improving the Work, but excluding communication that is conspicuously 
+marked or otherwise designated in writing by the copyright owner as "Not 
+a Contribution."
+
+"Contributor" shall mean Licensor and any individual or Legal Entity on 
+behalf of whom a Contribution has been received by Licensor and subsequently 
+incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of this 
+License, each Contributor hereby grants to You a perpetual, worldwide, 
+non-exclusive, no-charge, royalty-free, irrevocable copyright license to 
+reproduce, prepare Derivative Works of, publicly display, publicly perform, 
+sublicense, and distribute the Work and such Derivative Works in Source or 
+Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of this 
+License, each Contributor hereby grants to You a perpetual, worldwide, 
+non-exclusive, no-charge, royalty-free, irrevocable (except as stated in 
+this section) patent license to make, have made, use, offer to sell, sell, 
+import, and otherwise transfer the Work, where such license applies only to 
+those patent claims licensable by such Contributor that are necessarily 
+infringed by their Contribution(s) alone or by combination of their 
+Contribution(s) with the Work to which such Contribution(s) was submitted. 
+If You institute patent litigation against any entity (including a cross-claim
+or counterclaim in a lawsuit) alleging that the Work or a Contribution 
+incorporated within the Work constitutes direct or contributory patent 
+infringement, then any patent licenses granted to You under this License 
+for that Work shall terminate as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the Work or 
+Derivative Works thereof in any medium, with or without modifications, and 
+in Source or Object form, provided that You meet the following conditions:
+
+You must give any other recipients of the Work or Derivative Works a copy of 
+this License; and
+You must cause any modified files to carry prominent notices stating that 
+You changed the files; and
+You must retain, in the Source form of any Derivative Works that You 
+distribute, all copyright, patent, trademark, and attribution notices 
+from the Source form of the Work, excluding those notices that do not 
+pertain to any part of the Derivative Works; and
+If the Work includes a "NOTICE" text file as part of its distribution, 
+then any Derivative Works that You distribute must include a readable 
+copy of the attribution notices contained within such NOTICE file, excluding
+those notices that do not pertain to any part of the Derivative Works, in
+at least one of the following places: within a NOTICE text file distributed 
+as part of the Derivative Works; within the Source form or documentation, if 
+provided along with the Derivative Works; or, within a display generated by 
+the Derivative Works, if and wherever such third-party notices normally 
+appear. The contents of the NOTICE file are for informational purposes 
+only and do not modify the License. You may add Your own attribution 
+notices within Derivative Works that You distribute, alongside or as 
+an addendum to the NOTICE text from the Work, provided that such additional 
+attribution notices cannot be construed as modifying the License. 
+
+You may add Your own copyright statement to Your modifications and may provide
+additional or different license terms and conditions for use, reproduction, or
+distribution of Your modifications, or for any such Derivative Works as a 
+whole, provided Your use, reproduction, and distribution of the Work otherwise 
+complies with the conditions stated in this License.
+5. Submission of Contributions. Unless You explicitly state otherwise, any 
+Contribution intentionally submitted for inclusion in the Work by You to the 
+Licensor shall be under the terms and conditions of this License, without any 
+additional terms or conditions. Notwithstanding the above, nothing herein 
+shall supersede or modify the terms of any separate license agreement you 
+may have executed with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade names, 
+trademarks, service marks, or product names of the Licensor, except as 
+required for reasonable and customary use in describing the origin of the 
+Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or agreed to 
+in writing, Licensor provides the Work (and each Contributor provides its 
+Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 
+ANY KIND, either express or implied, including, without limitation, any 
+warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or 
+FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining 
+the appropriateness of using or redistributing the Work and assume any risks 
+associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory, whether in
+tort (including negligence), contract, or otherwise, unless required by 
+applicable law (such as deliberate and grossly negligent acts) or agreed to 
+in writing, shall any Contributor be liable to You for damages, including 
+any direct, indirect, special, incidental, or consequential damages of any 
+character arising as a result of this License or out of the use or inability 
+to use the Work (including but not limited to damages for loss of goodwill, 
+work stoppage, computer failure or malfunction, or any and all other 
+commercial damages or losses), even if such Contributor has been advised 
+of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing the 
+Work or Derivative Works thereof, You may choose to offer, and charge a 
+fee for, acceptance of support, warranty, indemnity, or other liability 
+obligations and/or rights consistent with this License. However, in accepting
+such obligations, You may act only on Your own behalf and on Your sole 
+responsibility, not on behalf of any other Contributor, and only if You
+agree to indemnify, defend, and hold each Contributor harmless for any 
+liability incurred by, or claims asserted against, such Contributor by 
+reason of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
\ No newline at end of file
diff --git a/3rdparty/bimg/3rdparty/etc1/etc1.cpp b/3rdparty/bimg/3rdparty/etc1/etc1.cpp
new file mode 100644
index 0000000..0953f98
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc1/etc1.cpp
@@ -0,0 +1,686 @@
+// Copyright 2009 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+// This is a fork of the AOSP project ETC1 codec. The original code can be found
+// at the following web site:
+// https://android.googlesource.com/platform/frameworks/native/+/master/opengl/include/ETC1/
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+#include "etc1.h"
+
+#include <cstring>
+
+/* From http://www.khronos.org/registry/gles/extensions/OES/OES_compressed_ETC1_RGB8_texture.txt
+
+ The number of bits that represent a 4x4 texel block is 64 bits if
+ <internalformat> is given by ETC1_RGB8_OES.
+
+ The data for a block is a number of bytes,
+
+ {q0, q1, q2, q3, q4, q5, q6, q7}
+
+ where byte q0 is located at the lowest memory address and q7 at
+ the highest. The 64 bits specifying the block is then represented
+ by the following 64 bit integer:
+
+ int64bit = 256*(256*(256*(256*(256*(256*(256*q0+q1)+q2)+q3)+q4)+q5)+q6)+q7;
+
+ ETC1_RGB8_OES:
+
+ a) bit layout in bits 63 through 32 if diffbit = 0
+
+ 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48
+ -----------------------------------------------
+ | base col1 | base col2 | base col1 | base col2 |
+ | R1 (4bits)| R2 (4bits)| G1 (4bits)| G2 (4bits)|
+ -----------------------------------------------
+
+ 47 46 45 44 43 42 41 40 39 38 37 36 35 34  33  32
+ ---------------------------------------------------
+ | base col1 | base col2 | table  | table  |diff|flip|
+ | B1 (4bits)| B2 (4bits)| cw 1   | cw 2   |bit |bit |
+ ---------------------------------------------------
+
+
+ b) bit layout in bits 63 through 32 if diffbit = 1
+
+ 63 62 61 60 59 58 57 56 55 54 53 52 51 50 49 48
+ -----------------------------------------------
+ | base col1    | dcol 2 | base col1    | dcol 2 |
+ | R1' (5 bits) | dR2    | G1' (5 bits) | dG2    |
+ -----------------------------------------------
+
+ 47 46 45 44 43 42 41 40 39 38 37 36 35 34  33  32
+ ---------------------------------------------------
+ | base col 1   | dcol 2 | table  | table  |diff|flip|
+ | B1' (5 bits) | dB2    | cw 1   | cw 2   |bit |bit |
+ ---------------------------------------------------
+
+
+ c) bit layout in bits 31 through 0 (in both cases)
+
+ 31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16
+ -----------------------------------------------
+ |       most significant pixel index bits       |
+ | p| o| n| m| l| k| j| i| h| g| f| e| d| c| b| a|
+ -----------------------------------------------
+
+ 15 14 13 12 11 10  9  8  7  6  5  4  3   2   1  0
+ --------------------------------------------------
+ |         least significant pixel index bits       |
+ | p| o| n| m| l| k| j| i| h| g| f| e| d| c | b | a |
+ --------------------------------------------------
+
+
+ Add table 3.17.2: Intensity modifier sets for ETC1 compressed textures:
+
+ table codeword                modifier table
+ ------------------        ----------------------
+ 0                     -8  -2  2   8
+ 1                    -17  -5  5  17
+ 2                    -29  -9  9  29
+ 3                    -42 -13 13  42
+ 4                    -60 -18 18  60
+ 5                    -80 -24 24  80
+ 6                   -106 -33 33 106
+ 7                   -183 -47 47 183
+
+
+ Add table 3.17.3 Mapping from pixel index values to modifier values for
+ ETC1 compressed textures:
+
+ pixel index value
+ ---------------
+ msb     lsb           resulting modifier value
+ -----   -----          -------------------------
+ 1       1            -b (large negative value)
+ 1       0            -a (small negative value)
+ 0       0             a (small positive value)
+ 0       1             b (large positive value)
+
+
+ */
+
+static const int kModifierTable[] = { // 8 rows (table codeword 0-7) of 4 modifiers ordered a, b, -a, -b, i.e. indexed by (msb << 1) | lsb of the pixel index
+/* 0 */2, 8, -2, -8,
+/* 1 */5, 17, -5, -17,
+/* 2 */9, 29, -9, -29,
+/* 3 */13, 42, -13, -42,
+/* 4 */18, 60, -18, -60,
+/* 5 */24, 80, -24, -80,
+/* 6 */33, 106, -33, -106,
+/* 7 */47, 183, -47, -183 };
+
+static const int kLookup[8] = { 0, 1, 2, 3, -4, -3, -2, -1 }; // maps a 3-bit two's-complement field to its signed value
+
+static inline etc1_byte clamp(int x) {
+    return (etc1_byte) (x < 0 ? 0 : (x > 255 ? 255 : x)); // saturate to the byte range [0, 255]
+}
+
+static
+inline int convert4To8(int b) {
+    const int nibble = b & 0xf; // only the low 4 bits are significant
+    return nibble | (nibble << 4); // replicate the nibble so 0xF expands to 0xFF
+}
+
+static
+inline int convert5To8(int b) {
+    const int v5 = b & 0x1f; // only the low 5 bits are significant
+    return (v5 >> 2) | (v5 << 3); // top 3 bits refill the low bits so 31 expands to 255
+}
+
+static
+inline int convert6To8(int b) {
+    const int v6 = b & 0x3f; // only the low 6 bits are significant
+    return (v6 >> 4) | (v6 << 2); // top 2 bits refill the low bits so 63 expands to 255
+}
+
+static inline int divideBy255(int d) {
+    const int biased = d + 128 + (d >> 8); // shift-only stand-in for a rounded d / 255
+    return biased >> 8;
+}
+
+static
+inline int convert8To4(int b) {
+    const int byteValue = b & 0xff; // only the low 8 bits are significant
+    return divideBy255(15 * byteValue); // rescale 0..255 onto 0..15
+}
+
+static
+inline int convert8To5(int b) {
+    const int byteValue = b & 0xff; // only the low 8 bits are significant
+    return divideBy255(31 * byteValue); // rescale 0..255 onto 0..31
+}
+
+static inline int convertDiff(int base, int diff) {
+    const int delta = kLookup[diff & 0x7]; // signed value of the 3-bit difference field
+    return convert5To8((base & 0x1f) + delta); // 5-bit base plus delta, widened to 8 bits
+}
+
+static
+void decode_subblock(etc1_byte* pOut, int r, int g, int b, const int* table,
+        etc1_uint32 low, bool second, bool flipped) { // writes the 8 RGB texels of one half-block into the 4x4 pOut
+    int baseX = 0;
+    int baseY = 0;
+    if (second) {
+        if (flipped) {
+            baseY = 2; // flipped: the second half is rows 2-3
+        } else {
+            baseX = 2; // not flipped: the second half is columns 2-3
+        }
+    }
+    for (int i = 0; i < 8; i++) {
+        int x, y;
+        if (flipped) {
+            x = baseX + (i >> 1); // 4 columns wide
+            y = baseY + (i & 1); // 2 rows tall
+        } else {
+            x = baseX + (i >> 2); // 2 columns wide
+            y = baseY + (i & 3); // 4 rows tall
+        }
+        int k = y + (x * 4); // bit position of this texel within each 16-bit index plane
+        int offset = ((low >> k) & 1) | ((low >> (k + 15)) & 2); // bit k is the lsb, bit k + 16 the msb
+        int delta = table[offset];
+        etc1_byte* q = pOut + 3 * (x + 4 * y); // output texels are row-major, 3 bytes each
+        *q++ = clamp(r + delta);
+        *q++ = clamp(g + delta);
+        *q++ = clamp(b + delta);
+    }
+}
+
+// Input is an 8-byte ETC1 compressed block.
+// Output is a 4 x 4 square of 3-byte pixels in form R, G, B (row-major).
+// The top byte of each word is widened before << 24 so it cannot overflow int.
+void etc1_decode_block(const etc1_byte* pIn, etc1_byte* pOut) {
+    etc1_uint32 high = ((etc1_uint32) pIn[0] << 24) | (pIn[1] << 16) | (pIn[2] << 8) | pIn[3];
+    etc1_uint32 low = ((etc1_uint32) pIn[4] << 24) | (pIn[5] << 16) | (pIn[6] << 8) | pIn[7];
+    int r1, r2, g1, g2, b1, b2;
+    if (high & 2) {
+        // differential: 5-bit base for the first half, 3-bit signed delta for the second
+        int rBase = high >> 27;
+        int gBase = high >> 19;
+        int bBase = high >> 11;
+        r1 = convert5To8(rBase);
+        r2 = convertDiff(rBase, high >> 24);
+        g1 = convert5To8(gBase);
+        g2 = convertDiff(gBase, high >> 16);
+        b1 = convert5To8(bBase);
+        b2 = convertDiff(bBase, high >> 8);
+    } else {
+        // not differential: two independent 4-bit base colors
+        r1 = convert4To8(high >> 28);
+        r2 = convert4To8(high >> 24);
+        g1 = convert4To8(high >> 20);
+        g2 = convert4To8(high >> 16);
+        b1 = convert4To8(high >> 12);
+        b2 = convert4To8(high >> 8);
+    }
+    int tableIndexA = 7 & (high >> 5); // table codeword of the first half
+    int tableIndexB = 7 & (high >> 2); // table codeword of the second half
+    const int* tableA = kModifierTable + tableIndexA * 4;
+    const int* tableB = kModifierTable + tableIndexB * 4;
+    bool flipped = (high & 1) != 0; // flip bit selects 4x2 instead of 2x4 halves
+    decode_subblock(pOut, r1, g1, b1, tableA, low, false, flipped);
+    decode_subblock(pOut, r2, g2, b2, tableB, low, true, flipped);
+}
+
+typedef struct { // one candidate encoding of a block together with its error score
+    etc1_uint32 high; // written as block bytes 0-3: base colors, table codewords, diff and flip bits
+    etc1_uint32 low; // written as block bytes 4-7: per-pixel index bits
+    etc1_uint32 score; // Lower is more accurate
+} etc_compressed;
+
+static inline void take_best(etc_compressed* a, const etc_compressed* b) {
+    // Replace *a with *b only when b scores strictly lower; ties keep *a.
+    if (b->score < a->score) {
+        *a = *b;
+    }
+}
+
+static
+void etc_average_colors_subblock(const etc1_byte* pIn, etc1_uint32 inMask,
+        etc1_byte* pColors, bool flipped, bool second) {
+    // Average the valid pixels of one half of a 4x4 RGB block.
+    //
+    // pIn      16 row-major pixels, 3 bytes (R, G, B) each.
+    // inMask   bit (x + 4 * y) set when pixel (x, y) is valid.
+    // pColors  receives the rounded mean R, G, B (3 bytes).
+    // flipped  true: halves are 4x2 (top/bottom); false: 2x4 (left/right).
+    // second   selects the bottom/right half instead of the top/left one.
+    //
+    // The sums are divided by the number of valid pixels rather than by a
+    // fixed 8, so masked-out pixels of partial edge blocks do not drag the
+    // average towards black. A full half still computes (sum + 4) / 8.
+    const int cols = flipped ? 4 : 2;
+    const int rows = flipped ? 2 : 4;
+    const int bx = (second && !flipped) ? 2 : 0;
+    const int by = (second && flipped) ? 2 : 0;
+
+    int r = 0;
+    int g = 0;
+    int b = 0;
+    int count = 0;
+    for (int y = 0; y < rows; y++) {
+        for (int x = 0; x < cols; x++) {
+            int i = (bx + x) + 4 * (by + y);
+            if (inMask & (1u << i)) {
+                const etc1_byte* p = pIn + i * 3;
+                r += p[0];
+                g += p[1];
+                b += p[2];
+                count++;
+            }
+        }
+    }
+
+    if (count == 0) {
+        // Nothing to average in this half: report black.
+        pColors[0] = pColors[1] = pColors[2] = 0;
+        return;
+    }
+    const int half = count / 2; // rounding term
+    pColors[0] = (etc1_byte)((r + half) / count);
+    pColors[1] = (etc1_byte)((g + half) / count);
+    pColors[2] = (etc1_byte)((b + half) / count);
+}
+
+static inline int square(int x) {
+    const int squared = x * x; // x times itself
+    return squared;
+}
+
+static etc1_uint32 chooseModifier(const etc1_byte* pBaseColors,
+        const etc1_byte* pIn, etc1_uint32 *pLow, int bitIndex,
+        const int* pModifierTable) {
+    // Pick the entry of the 4-value modifier row that, added to the base
+    // color and clamped, is closest to the pixel at pIn (error weighted
+    // 6:3:1 for G:R:B). ORs the winning 2-bit index into *pLow at bitIndex
+    // (lsb) and bitIndex + 16 (msb) and returns the winning error.
+    etc1_uint32 bestScore = ~(etc1_uint32) 0;
+    etc1_uint32 bestIndex = 0;
+    const int pixelR = pIn[0], pixelG = pIn[1], pixelB = pIn[2];
+    const int r = pBaseColors[0], g = pBaseColors[1], b = pBaseColors[2];
+    for (int i = 0; i < 4; i++) {
+        int modifier = pModifierTable[i];
+        int decodedG = clamp(g + modifier);
+        etc1_uint32 score = (etc1_uint32) (6 * square(decodedG - pixelG));
+        if (score >= bestScore) {
+            continue; // green alone is already no better than the best so far
+        }
+        int decodedR = clamp(r + modifier);
+        score += (etc1_uint32) (3 * square(decodedR - pixelR));
+        if (score >= bestScore) {
+            continue; // green + red is already no better than the best so far
+        }
+        int decodedB = clamp(b + modifier);
+        score += (etc1_uint32) square(decodedB - pixelB);
+        if (score < bestScore) {
+            bestScore = score;
+            bestIndex = (etc1_uint32) i;
+        }
+    }
+    // Unsigned arithmetic: index 2 or 3 at bitIndex 15 sets bit 31, which
+    // would overflow a signed int shift.
+    *pLow |= (((bestIndex >> 1) << 16) | (bestIndex & 1)) << bitIndex;
+    return bestScore;
+}
+
+static
+void etc_encode_subblock_helper(const etc1_byte* pIn, etc1_uint32 inMask,
+        etc_compressed* pCompressed, bool flipped, bool second,
+        const etc1_byte* pBaseColors, const int* pModifierTable) {
+    // Encode the pixel indices of one half-block against a single modifier
+    // row, accumulating into pCompressed.
+    //
+    // pIn             16 row-major pixels, 3 bytes (R, G, B) each.
+    // inMask          bit (x + 4 * y) set when pixel (x, y) is valid; other
+    //                 pixels are skipped and add nothing to the score.
+    // pCompressed     low gains the chosen index bits, score the summed error.
+    // flipped/second  select the half: 4x2 top/bottom or 2x4 left/right.
+    // pBaseColors     3-byte base color of this half.
+    // pModifierTable  the 4 modifiers of the table row being tried.
+    const int cols = flipped ? 4 : 2;
+    const int rows = flipped ? 2 : 4;
+    const int bx = (second && !flipped) ? 2 : 0;
+    const int by = (second && flipped) ? 2 : 0;
+
+    // Unsigned like the field it mirrors and like chooseModifier's result,
+    // so no signed/unsigned round trips happen while accumulating.
+    etc1_uint32 score = pCompressed->score;
+    for (int y = 0; y < rows; y++) {
+        const int yy = by + y;
+        for (int x = 0; x < cols; x++) {
+            const int xx = bx + x;
+            const int i = xx + 4 * yy;
+            if (!(inMask & (1u << i))) {
+                continue;
+            }
+            // Index bits are stored column-major (yy + 4 * xx), unlike the
+            // row-major pixel data.
+            score += chooseModifier(pBaseColors, pIn + i * 3,
+                    &pCompressed->low, yy + xx * 4, pModifierTable);
+        }
+    }
+    pCompressed->score = score;
+}
+
+static bool inRange4bitSigned(int color) {
+    return !(color < -4 || color > 3); // true for -4..3, the values a 3-bit two's-complement field can hold
+}
+
+static void etc_encodeBaseColors(etc1_byte* pBaseColors,
+        const etc1_byte* pColors, etc_compressed* pCompressed) {
+    // Quantize the two averaged colors in pColors (R, G, B twice), OR the
+    // result into pCompressed->high, and store the 8-bit colors a decoder
+    // will reconstruct in pBaseColors. Differential mode is used when it fits.
+    int r1, g1, b1, r2, g2, b2; // 8 bit base colors for sub-blocks
+    int r51 = convert8To5(pColors[0]);
+    int g51 = convert8To5(pColors[1]);
+    int b51 = convert8To5(pColors[2]);
+    int r52 = convert8To5(pColors[3]);
+    int g52 = convert8To5(pColors[4]);
+    int b52 = convert8To5(pColors[5]);
+
+    r1 = convert5To8(r51);
+    g1 = convert5To8(g51);
+    b1 = convert5To8(b51);
+
+    int dr = r52 - r51;
+    int dg = g52 - g51;
+    int db = b52 - b51;
+
+    const bool differential = inRange4bitSigned(dr) && inRange4bitSigned(dg)
+            && inRange4bitSigned(db); // every delta must fit its 3-bit field
+    if (differential) {
+        r2 = convert5To8(r51 + dr);
+        g2 = convert5To8(g51 + dg);
+        b2 = convert5To8(b51 + db);
+        pCompressed->high |= ((etc1_uint32) r51 << 27) | ((7 & dr) << 24) | (g51 << 19)
+                | ((7 & dg) << 16) | (b51 << 11) | ((7 & db) << 8) | 2; // red reaches bit 31, so shift it unsigned; 2 is the diff bit
+    }
+
+    if (!differential) {
+        int r41 = convert8To4(pColors[0]);
+        int g41 = convert8To4(pColors[1]);
+        int b41 = convert8To4(pColors[2]);
+        int r42 = convert8To4(pColors[3]);
+        int g42 = convert8To4(pColors[4]);
+        int b42 = convert8To4(pColors[5]);
+        r1 = convert4To8(r41);
+        g1 = convert4To8(g41);
+        b1 = convert4To8(b41);
+        r2 = convert4To8(r42);
+        g2 = convert4To8(g42);
+        b2 = convert4To8(b42);
+        pCompressed->high |= ((etc1_uint32) r41 << 28) | (r42 << 24) | (g41 << 20) | (g42
+                << 16) | (b41 << 12) | (b42 << 8); // red reaches bit 31, so shift it unsigned
+    }
+    pBaseColors[0] = r1;
+    pBaseColors[1] = g1;
+    pBaseColors[2] = b1;
+    pBaseColors[3] = r2;
+    pBaseColors[4] = g2;
+    pBaseColors[5] = b2;
+}
+
+static
+void etc_encode_block_helper(const etc1_byte* pIn, etc1_uint32 inMask,
+        const etc1_byte* pColors, etc_compressed* pCompressed, bool flipped) {
+    // Encode the block for one split orientation: fix the base colors from
+    // pColors, then pick for each half the modifier row with the lowest score.
+    pCompressed->score = ~0;
+    pCompressed->high = (flipped ? 1 : 0); // bit 0 of high is the flip bit
+    pCompressed->low = 0;
+
+    etc1_byte pBaseColors[6];
+    etc_encodeBaseColors(pBaseColors, pColors, pCompressed);
+
+    etc1_uint32 originalHigh = pCompressed->high; // unsigned: bit 31 may be set by the red base color
+    const int* pModifierTable = kModifierTable;
+    for (int i = 0; i < 8; i++, pModifierTable += 4) {
+        etc_compressed temp;
+        temp.score = 0;
+        temp.high = originalHigh | (i << 5); // try table codeword i for the first half
+        temp.low = 0;
+        etc_encode_subblock_helper(pIn, inMask, &temp, flipped, false,
+                pBaseColors, pModifierTable);
+        take_best(pCompressed, &temp);
+    }
+    pModifierTable = kModifierTable;
+    etc_compressed firstHalf = *pCompressed;
+    for (int i = 0; i < 8; i++, pModifierTable += 4) {
+        etc_compressed temp;
+        temp.score = firstHalf.score; // second-half error is added on top of the first half's
+        temp.high = firstHalf.high | (i << 2); // try table codeword i for the second half
+        temp.low = firstHalf.low;
+        etc_encode_subblock_helper(pIn, inMask, &temp, flipped, true,
+                pBaseColors + 3, pModifierTable);
+        if (i == 0) {
+            *pCompressed = temp; // first candidate always replaces the first-half-only result
+        } else {
+            take_best(pCompressed, &temp);
+        }
+    }
+}
+
+static void writeBigEndian(etc1_byte* pOut, etc1_uint32 d) {
+    // Store the 32-bit value most-significant byte first.
+    for (int i = 0, shift = 24; i < 4; i++, shift -= 8) {
+        pOut[i] = (etc1_byte)(d >> shift);
+    }
+}
+
+// Input is a 4 x 4 square of 3-byte pixels in form R, G, B
+// inMask is a 16-bit mask where bit (1 << (x + y * 4)) tells whether the corresponding (x,y)
+// pixel is valid or not. Invalid pixel color values are ignored when compressing.
+// Output is an ETC1 compressed version of the data.
+
+void etc1_encode_block(const etc1_byte* pIn, etc1_uint32 inMask,
+        etc1_byte* pOut) {
+    etc1_byte sideBySide[6]; // average colors of the left and right 2x4 halves
+    etc1_byte stacked[6]; // average colors of the top and bottom 4x2 halves
+    etc_average_colors_subblock(pIn, inMask, sideBySide, false, false);
+    etc_average_colors_subblock(pIn, inMask, sideBySide + 3, false, true);
+    etc_average_colors_subblock(pIn, inMask, stacked, true, false);
+    etc_average_colors_subblock(pIn, inMask, stacked + 3, true, true);
+
+    etc_compressed best, flippedCandidate; // encode both orientations, keep the lower score
+    etc_encode_block_helper(pIn, inMask, sideBySide, &best, false);
+    etc_encode_block_helper(pIn, inMask, stacked, &flippedCandidate, true);
+    take_best(&best, &flippedCandidate);
+    writeBigEndian(pOut, best.high);
+    writeBigEndian(pOut + 4, best.low);
+}
+
+// Return the size of the encoded image data (does not include size of PKM header).
+
+etc1_uint32 etc1_get_encoded_data_size(etc1_uint32 width, etc1_uint32 height) {
+    return (((width + 3) & ~3) * ((height + 3) & ~3)) / 2; // dimensions rounded up to multiples of 4, half a byte per pixel
+}
+
+// Encode an entire image.
+// pIn - pointer to the image data. Formatted such that the Red component of
+//       pixel (x,y) is at pIn + pixelSize * x + stride * y + redOffset;
+// pOut - pointer to encoded data. Must be large enough to store entire encoded image.
+
+int etc1_encode_image(const etc1_byte* pIn, etc1_uint32 width, etc1_uint32 height,
+        etc1_uint32 pixelSize, etc1_uint32 stride, etc1_byte* pOut) { // returns 0 on success, -1 for an unsupported pixelSize
+    if (pixelSize < 2 || pixelSize > 4) {
+        return -1; // only 2-, 3- and 4-byte pixels are handled
+    }
+    static const unsigned short kYMask[] = { 0x0, 0xf, 0xff, 0xfff, 0xffff }; // valid-pixel bits for the first N rows of a block
+    static const unsigned short kXMask[] = { 0x0, 0x1111, 0x3333, 0x7777,
+            0xffff }; // valid-pixel bits for the first N columns of a block
+    etc1_byte block[ETC1_DECODED_BLOCK_SIZE]; // staging buffer passed to etc1_encode_block
+    etc1_byte encoded[ETC1_ENCODED_BLOCK_SIZE];
+
+    etc1_uint32 encodedWidth = (width + 3) & ~3; // dimensions rounded up to a multiple of 4
+    etc1_uint32 encodedHeight = (height + 3) & ~3;
+
+    for (etc1_uint32 y = 0; y < encodedHeight; y += 4) {
+        etc1_uint32 yEnd = height - y; // rows of this block that lie inside the image
+        if (yEnd > 4) {
+            yEnd = 4;
+        }
+        int ymask = kYMask[yEnd];
+        for (etc1_uint32 x = 0; x < encodedWidth; x += 4) {
+            etc1_uint32 xEnd = width - x; // columns of this block that lie inside the image
+            if (xEnd > 4) {
+                xEnd = 4;
+            }
+            int mask = ymask & kXMask[xEnd]; // bit (x + 4 * y) set only for pixels inside the image
+            for (etc1_uint32 cy = 0; cy < yEnd; cy++) {
+                etc1_byte* q = block + (cy * 4) * 3; // start of staging row cy
+                const etc1_byte* p = pIn + pixelSize * x + stride * (y + cy);
+                if (pixelSize >= 3) {
+                    for (etc1_uint32 cx = 0; cx < xEnd; cx++) {
+                        memcpy(q, p, 3); // first three bytes of each pixel are copied as-is
+                        q += 3;
+                        p += pixelSize;
+                    }
+                } else {
+                    for (etc1_uint32 cx = 0; cx < xEnd; cx++) {
+                        int pixel = (p[1] << 8) | p[0]; // 2-byte pixel, low byte first
+                        *q++ = convert5To8(pixel >> 11); // bits 15-11
+                        *q++ = convert6To8(pixel >> 5); // bits 10-5
+                        *q++ = convert5To8(pixel); // bits 4-0
+                        p += pixelSize;
+                    }
+                }
+            }
+            etc1_encode_block(block, mask, encoded); // staging pixels outside the mask are not refreshed here
+            memcpy(pOut, encoded, sizeof(encoded));
+            pOut += sizeof(encoded);
+        }
+    }
+    return 0;
+}
+
+// Decode an entire image.
+// pIn - pointer to encoded data.
+// pOut - pointer to the image data. Will be written such that the Red component of
+//       pixel (x,y) is at pOut + pixelSize * x + stride * y + redOffset. Must be
+//        large enough to store entire image.
+
+
+int etc1_decode_image(const etc1_byte* pIn, etc1_byte* pOut,
+        etc1_uint32 width, etc1_uint32 height,
+        etc1_uint32 pixelSize, etc1_uint32 stride) { // returns 0 on success, -1 for an unsupported pixelSize
+    if (pixelSize < 2 || pixelSize > 4) {
+        return -1; // only 2-, 3- and 4-byte pixels are handled
+    }
+    etc1_byte block[ETC1_DECODED_BLOCK_SIZE]; // one decoded 4x4 RGB block
+
+    etc1_uint32 encodedWidth = (width + 3) & ~3; // dimensions rounded up to a multiple of 4
+    etc1_uint32 encodedHeight = (height + 3) & ~3;
+
+    for (etc1_uint32 y = 0; y < encodedHeight; y += 4) {
+        etc1_uint32 yEnd = height - y; // rows of this block that lie inside the image
+        if (yEnd > 4) {
+            yEnd = 4;
+        }
+        for (etc1_uint32 x = 0; x < encodedWidth; x += 4) {
+            etc1_uint32 xEnd = width - x; // columns of this block that lie inside the image
+            if (xEnd > 4) {
+                xEnd = 4;
+            }
+            etc1_decode_block(pIn, block);
+            pIn += ETC1_ENCODED_BLOCK_SIZE;
+            for (etc1_uint32 cy = 0; cy < yEnd; cy++) {
+                const etc1_byte* q = block + (cy * 4) * 3; // start of decoded row cy
+                etc1_byte* p = pOut + pixelSize * x + stride * (y + cy);
+                if (pixelSize >= 3) {
+                    for (etc1_uint32 cx = 0; cx < xEnd; cx++) {
+                        memcpy(p, q, 3); // only R, G, B are written; a 4th byte per pixel is left untouched
+                        q += 3;
+                        p += pixelSize;
+                    }
+                } else {
+                    for (etc1_uint32 cx = 0; cx < xEnd; cx++) {
+                        etc1_byte r = *q++;
+                        etc1_byte g = *q++;
+                        etc1_byte b = *q++;
+                        etc1_uint32 pixel = ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3); // pack to 5-6-5 bits
+                        *p++ = (etc1_byte) pixel; // low byte first
+                        *p++ = (etc1_byte) (pixel >> 8);
+                    }
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+static const char kMagic[] = { 'P', 'K', 'M', ' ', '1', '0' }; // 6-byte signature at the start of a PKM header
+
+static const etc1_uint32 ETC1_PKM_FORMAT_OFFSET = 6; // byte offsets of the big-endian 16-bit header fields
+static const etc1_uint32 ETC1_PKM_ENCODED_WIDTH_OFFSET = 8; // width rounded up to a multiple of 4
+static const etc1_uint32 ETC1_PKM_ENCODED_HEIGHT_OFFSET = 10; // height rounded up to a multiple of 4
+static const etc1_uint32 ETC1_PKM_WIDTH_OFFSET = 12; // width as passed to etc1_pkm_format_header
+static const etc1_uint32 ETC1_PKM_HEIGHT_OFFSET = 14; // height as passed to etc1_pkm_format_header
+
+static const etc1_uint32 ETC1_RGB_NO_MIPMAPS = 0; // the only format value etc1_pkm_is_valid accepts
+
+static void writeBEUint16(etc1_byte* pOut, etc1_uint32 data) {
+    pOut[1] = (etc1_byte) (data & 0xff); // low byte goes second
+    pOut[0] = (etc1_byte) ((data >> 8) & 0xff); // high byte goes first; bits above 15 are dropped
+}
+
+static etc1_uint32 readBEUint16(const etc1_byte* pIn) {
+    return pIn[1] | (pIn[0] << 8); // the first byte is the most significant
+}
+
+// Format a PKM header
+
+void etc1_pkm_format_header(etc1_byte* pHeader, etc1_uint32 width, etc1_uint32 height) {
+    const etc1_uint32 paddedWidth = (width + 3) & ~3; // rounded up to a multiple of 4
+    const etc1_uint32 paddedHeight = (height + 3) & ~3;
+    memcpy(pHeader, kMagic, sizeof(kMagic)); // signature occupies the first bytes
+    writeBEUint16(pHeader + ETC1_PKM_FORMAT_OFFSET, ETC1_RGB_NO_MIPMAPS);
+    writeBEUint16(pHeader + ETC1_PKM_ENCODED_WIDTH_OFFSET, paddedWidth);
+    writeBEUint16(pHeader + ETC1_PKM_ENCODED_HEIGHT_OFFSET, paddedHeight);
+    writeBEUint16(pHeader + ETC1_PKM_WIDTH_OFFSET, width);
+    writeBEUint16(pHeader + ETC1_PKM_HEIGHT_OFFSET, height);
+}
+
+// Check if a PKM header is correctly formatted.
+
+etc1_bool etc1_pkm_is_valid(const etc1_byte* pHeader) {
+    if (memcmp(pHeader, kMagic, sizeof(kMagic)) != 0) {
+        return false; // signature mismatch
+    }
+    const etc1_uint32 format = readBEUint16(pHeader + ETC1_PKM_FORMAT_OFFSET);
+    const etc1_uint32 paddedWidth = readBEUint16(pHeader + ETC1_PKM_ENCODED_WIDTH_OFFSET);
+    const etc1_uint32 paddedHeight = readBEUint16(pHeader + ETC1_PKM_ENCODED_HEIGHT_OFFSET);
+    const etc1_uint32 width = readBEUint16(pHeader + ETC1_PKM_WIDTH_OFFSET);
+    const etc1_uint32 height = readBEUint16(pHeader + ETC1_PKM_HEIGHT_OFFSET);
+    const bool widthOk = paddedWidth >= width && paddedWidth - width < 4; // padding is 0-3 pixels
+    const bool heightOk = paddedHeight >= height && paddedHeight - height < 4;
+    return format == ETC1_RGB_NO_MIPMAPS && widthOk && heightOk;
+}
+
+// Read the image width from a PKM header
+
+etc1_uint32 etc1_pkm_get_width(const etc1_byte* pHeader) {
+    return readBEUint16(&pHeader[ETC1_PKM_WIDTH_OFFSET]); // big-endian 16-bit width field
+}
+
+// Read the image height from a PKM header
+
+etc1_uint32 etc1_pkm_get_height(const etc1_byte* pHeader) {
+    return readBEUint16(&pHeader[ETC1_PKM_HEIGHT_OFFSET]); // big-endian 16-bit height field
+}
diff --git a/3rdparty/bimg/3rdparty/etc1/etc1.h b/3rdparty/bimg/3rdparty/etc1/etc1.h
new file mode 100644
index 0000000..d66ca9d
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc1/etc1.h
@@ -0,0 +1,114 @@
+// Copyright 2009 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+// This is a fork of the AOSP project ETC1 codec. The original code can be found
+// at the following web site:
+// https://android.googlesource.com/platform/frameworks/native/+/master/opengl/libs/ETC1/
+
+//////////////////////////////////////////////////////////////////////////////////////////
+
+#ifndef __etc1_h__
+#define __etc1_h__
+
+#define ETC1_ENCODED_BLOCK_SIZE 8
+#define ETC1_DECODED_BLOCK_SIZE 48
+
+#ifndef ETC1_RGB8_OES
+#define ETC1_RGB8_OES 0x8D64
+#endif
+
+typedef unsigned char etc1_byte;
+typedef int etc1_bool;
+typedef unsigned int etc1_uint32;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Encode a block of pixels.
+//
+// pIn is a pointer to an ETC1_DECODED_BLOCK_SIZE array of bytes that represent a
+// 4 x 4 square of 3-byte pixels in form R, G, B. Byte (3 * (x + 4 * y)) is the R
+// value of pixel (x, y).
+//
+// validPixelMask is a 16-bit mask where bit (1 << (x + y * 4)) indicates whether
+// the corresponding (x,y) pixel is valid. Invalid pixel color values are ignored when compressing.
+//
+// pOut is an ETC1 compressed version of the data.
+
+void etc1_encode_block(const etc1_byte* pIn, etc1_uint32 validPixelMask, etc1_byte* pOut);
+
+// Decode a block of pixels.
+//
+// pIn is an ETC1 compressed version of the data.
+//
+// pOut is a pointer to an ETC1_DECODED_BLOCK_SIZE array of bytes that represent a
+// 4 x 4 square of 3-byte pixels in form R, G, B. Byte (3 * (x + 4 * y)) is the R
+// value of pixel (x, y).
+
+void etc1_decode_block(const etc1_byte* pIn, etc1_byte* pOut);
+
+// Return the size of the encoded image data (does not include size of PKM header).
+
+etc1_uint32 etc1_get_encoded_data_size(etc1_uint32 width, etc1_uint32 height);
+
+// Encode an entire image.
+// pIn - pointer to the image data. Formatted such that
+//       pixel (x,y) is at pIn + pixelSize * x + stride * y;
+// pOut - pointer to encoded data. Must be large enough to store entire encoded image.
+// pixelSize can be 2 or 3. 2 is a GL_UNSIGNED_SHORT_5_6_5 image, 3 is a GL_BYTE RGB image.
+// returns non-zero if there is an error.
+
+int etc1_encode_image(const etc1_byte* pIn, etc1_uint32 width, etc1_uint32 height,
+        etc1_uint32 pixelSize, etc1_uint32 stride, etc1_byte* pOut);
+
+// Decode an entire image.
+// pIn - pointer to encoded data.
+// pOut - pointer to the image data. Will be written such that
+//        pixel (x,y) is at pOut + pixelSize * x + stride * y. Must be
+//        large enough to store entire image.
+// pixelSize can be 2 or 3. 2 is a GL_UNSIGNED_SHORT_5_6_5 image, 3 is a GL_BYTE RGB image.
+// returns non-zero if there is an error.
+
+int etc1_decode_image(const etc1_byte* pIn, etc1_byte* pOut,
+        etc1_uint32 width, etc1_uint32 height,
+        etc1_uint32 pixelSize, etc1_uint32 stride);
+
+// Size of a PKM header, in bytes.
+
+#define ETC_PKM_HEADER_SIZE 16
+
+// Format a PKM header
+
+void etc1_pkm_format_header(etc1_byte* pHeader, etc1_uint32 width, etc1_uint32 height);
+
+// Check if a PKM header is correctly formatted.
+
+etc1_bool etc1_pkm_is_valid(const etc1_byte* pHeader);
+
+// Read the image width from a PKM header
+
+etc1_uint32 etc1_pkm_get_width(const etc1_byte* pHeader);
+
+// Read the image height from a PKM header
+
+etc1_uint32 etc1_pkm_get_height(const etc1_byte* pHeader);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/etc2/LICENSE.txt b/3rdparty/bimg/3rdparty/etc2/LICENSE.txt
new file mode 100644
index 0000000..2254f9e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/LICENSE.txt
@@ -0,0 +1,24 @@
+Copyright (c) 2013, Bartosz Taudul <wolf.pld@gmail.com>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/3rdparty/bimg/3rdparty/etc2/Math.hpp b/3rdparty/bimg/3rdparty/etc2/Math.hpp
new file mode 100644
index 0000000..3a92a2e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/Math.hpp
@@ -0,0 +1,90 @@
+#ifndef __DARKRL__MATH_HPP__
+#define __DARKRL__MATH_HPP__
+
+#include <algorithm>
+#include <math.h>
+
+#include "Types.hpp"
+
+template<typename T>
+inline T AlignPOT( T val )
+{
+    // Rounds val up to the next power of two (a power of two maps to itself, 0 maps to 1).
+    // NOTE(review): the shift trick presumably expects an unsigned T -- confirm at call sites.
+    if( val == 0 ) return 1;
+    T smeared = val;
+    smeared--;
+    for( unsigned int shift=1; shift<sizeof( T ) * 8; shift <<= 1 ) smeared |= smeared >> shift;
+    return smeared + 1;
+}
+
+inline int CountSetBits( uint32 val )
+{
+    // Parallel bit count: add neighbouring fields of width 1, 2 and 4, then fold the bytes.
+    uint32 pairs = val - ( ( val >> 1 ) & 0x55555555 );
+    uint32 nibbles = ( pairs & 0x33333333 ) + ( ( pairs >> 2 ) & 0x33333333 );
+    uint32 bytes = ( nibbles + ( nibbles >> 4 ) ) & 0x0f0f0f0f;
+    bytes += bytes >> 8;
+    return ( bytes + ( bytes >> 16 ) ) & 0x0000003f;
+}
+
+inline int CountLeadingZeros( uint32 val )
+{
+    // Smear the highest set bit into all lower bits; 32 minus the set-bit count is the answer.
+    for( int shift=1; shift<=16; shift *= 2 )
+    {
+        val |= val >> shift;
+    }
+    return 32 - CountSetBits( val );
+}
+
+inline float sRGB2linear( float v )
+{
+    // Piecewise sRGB decoding: linear segment for small v, 2.4 power curve otherwise.
+    const float a = 0.055f;
+    const float linearLimit = 0.04045f;
+    const float linearSlope = 12.92f;
+    if( v <= linearLimit )
+    {
+        return v / linearSlope;
+    }
+    return powf( ( v + a ) / ( 1 + a ), 2.4f );
+}
+
+inline float linear2sRGB( float v )
+{
+    // Piecewise sRGB encoding: linear segment for small v, 1/2.4 power curve otherwise.
+    const float a = 0.055f;
+    if( v <= 0.0031308f )
+    {
+        return 12.92f * v;
+    }
+    // powf keeps the computation in single precision, matching sRGB2linear();
+    // plain pow() may promote to double depending on which <math.h> overloads are visible.
+    return ( 1 + a ) * powf( v, 1/2.4f ) - a;
+}
+
+template<class T>
+inline T SmoothStep( T x ) // evaluates 3x^2 - 2x^3; x is not clamped here
+{
+    return x*x*(3-2*x);
+}
+
+inline uint8 clampu8( int32 val )
+{
+    return val < 0 ? 0 : ( val > 255 ? 255 : val ); // saturates val to the range 0..255
+}
+
+template<class T>
+inline T sq( T val ) // square of val; the product is converted back to T on return
+{
+    return val * val;
+}
+
+static inline int mul8bit( int a, int b )
+{
+    const int biased = a*b + 128; // fixed-point a*b/255; the +128 bias rounds to nearest
+    return ( ( biased >> 8 ) + biased ) >> 8;
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/etc2/ProcessCommon.hpp b/3rdparty/bimg/3rdparty/etc2/ProcessCommon.hpp
new file mode 100644
index 0000000..7e6addb
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/ProcessCommon.hpp
@@ -0,0 +1,51 @@
+#ifndef __PROCESSCOMMON_HPP__
+#define __PROCESSCOMMON_HPP__
+
+#include <assert.h>
+#include <stddef.h>
+
+#include "Types.hpp"
+
+template<class T>
+static size_t GetLeastError( const T* err, size_t num )
+{
+    // Index of the smallest entry in err[0..num); the earliest index wins ties.
+    size_t best = 0;
+    size_t i = 1;
+    while( i < num )
+    {
+        if( err[i] < err[best] ) best = i;
+        i++;
+    }
+    return best;
+}
+
+static uint64 FixByteOrder( uint64 d )
+{
+    // Keep the low 32 bits untouched and reverse the byte order of the high 32 bits.
+    const uint64 low = d & 0x00000000FFFFFFFF;
+    const uint64 outer = ( ( d & 0xFF00000000000000 ) >> 24 ) | ( ( d & 0x000000FF00000000 ) << 24 );
+    const uint64 inner = ( ( d & 0x00FF000000000000 ) >> 8 ) | ( ( d & 0x0000FF0000000000 ) << 8 );
+    return low | outer | inner;
+}
+
+template<class T, class S>
+static uint64 EncodeSelectors( uint64 d, const T terr[2][8], const S tsel[16][8], const uint32* id )
+{
+    // For each of the two error rows pick the table with the least accumulated error.
+    const size_t table0 = GetLeastError( terr[0], 8 );
+    const size_t table1 = GetLeastError( terr[1], 8 );
+    const size_t chosen[2] = { table0, table1 };
+
+    d |= table0 << 26;
+    d |= table1 << 29;
+    for( int px=0; px<16; px++ )
+    {
+        const uint64 sel = tsel[px][chosen[id[px]%2]];
+        d |= ( sel & 0x1 ) << ( px + 32 ); // selector bit 0 lands at bit 32+px
+        d |= ( sel & 0x2 ) << ( px + 47 ); // selector bit 1 lands at bit 48+px
+    }
+    return d;
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/etc2/ProcessRGB.cpp b/3rdparty/bimg/3rdparty/etc2/ProcessRGB.cpp
new file mode 100644
index 0000000..29f0f7e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/ProcessRGB.cpp
@@ -0,0 +1,719 @@
+#include <string.h>
+
+#include "Math.hpp"
+#include "ProcessCommon.hpp"
+#include "ProcessRGB.hpp"
+#include "Tables.hpp"
+#include "Types.hpp"
+#include "Vector.hpp"
+
+#include <bx/endian.h>
+
+#ifdef __SSE4_1__
+#  ifdef _MSC_VER
+#    include <intrin.h>
+#    include <Windows.h>
+#  else
+#    include <x86intrin.h>
+#  endif
+#endif
+
+namespace
+{
+
+typedef uint16 v4i[4];
+
+void Average( const uint8* data, v4i* a ) // a[0..3] <- rounded channel means of four 8-pixel halves
+{
+#ifdef __SSE4_1__
+    __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
+    __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
+    // Zero-extend every input byte to a 16-bit lane.
+    __m128i d0l = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
+    __m128i d0h = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
+    __m128i d1l = _mm_unpacklo_epi8(d1, _mm_setzero_si128());
+    __m128i d1h = _mm_unpackhi_epi8(d1, _mm_setzero_si128());
+    __m128i d2l = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
+    __m128i d2h = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
+    __m128i d3l = _mm_unpacklo_epi8(d3, _mm_setzero_si128());
+    __m128i d3h = _mm_unpackhi_epi8(d3, _mm_setzero_si128());
+    // Sum loads 0+1 and loads 2+3 lane by lane.
+    __m128i sum0 = _mm_add_epi16(d0l, d1l);
+    __m128i sum1 = _mm_add_epi16(d0h, d1h);
+    __m128i sum2 = _mm_add_epi16(d2l, d3l);
+    __m128i sum3 = _mm_add_epi16(d2h, d3h);
+    // Zero-extend to 32-bit lanes, then add the two pixels held in each register (b0..b3).
+    __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
+    __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
+    __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
+    __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
+    __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
+    __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
+    __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
+    __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
+
+    __m128i b0 = _mm_add_epi32(sum0l, sum0h);
+    __m128i b1 = _mm_add_epi32(sum1l, sum1h);
+    __m128i b2 = _mm_add_epi32(sum2l, sum2h);
+    __m128i b3 = _mm_add_epi32(sum3l, sum3h);
+    // Combine the four partial sums pairwise (same pairing as the scalar path), then (x + 4) >> 3.
+    __m128i a0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b2, b3), _mm_set1_epi32(4)), 3);
+    __m128i a1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b1), _mm_set1_epi32(4)), 3);
+    __m128i a2 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b1, b3), _mm_set1_epi32(4)), 3);
+    __m128i a3 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(b0, b2), _mm_set1_epi32(4)), 3);
+    // Swap lanes 0 and 2 and narrow to 16 bits; unlike the scalar path, lane 3 is not forced to 0.
+    _mm_storeu_si128((__m128i*)&a[0], _mm_packus_epi32(_mm_shuffle_epi32(a0, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a1, _MM_SHUFFLE(3, 0, 1, 2))));
+    _mm_storeu_si128((__m128i*)&a[2], _mm_packus_epi32(_mm_shuffle_epi32(a2, _MM_SHUFFLE(3, 0, 1, 2)), _mm_shuffle_epi32(a3, _MM_SHUFFLE(3, 0, 1, 2))));
+#else
+    uint32 r[4];
+    uint32 g[4];
+    uint32 b[4];
+
+    memset(r, 0, sizeof(r));
+    memset(g, 0, sizeof(g));
+    memset(b, 0, sizeof(b));
+    // Accumulate bytes 0..2 of each 4-byte pixel into one of four 4-pixel sums; byte 3 is skipped.
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            int index = (j & 2) + (i >> 1);
+            b[index] += *data++;
+            g[index] += *data++;
+            r[index] += *data++;
+            data++;
+        }
+    }
+    // Each output combines two of those sums (8 pixels); adding 4 rounds the division by 8.
+    a[0][0] = uint16( (r[2] + r[3] + 4) / 8 );
+    a[0][1] = uint16( (g[2] + g[3] + 4) / 8 );
+    a[0][2] = uint16( (b[2] + b[3] + 4) / 8 );
+    a[0][3] = 0;
+    a[1][0] = uint16( (r[0] + r[1] + 4) / 8 );
+    a[1][1] = uint16( (g[0] + g[1] + 4) / 8 );
+    a[1][2] = uint16( (b[0] + b[1] + 4) / 8 );
+    a[1][3] = 0;
+    a[2][0] = uint16( (r[1] + r[3] + 4) / 8 );
+    a[2][1] = uint16( (g[1] + g[3] + 4) / 8 );
+    a[2][2] = uint16( (b[1] + b[3] + 4) / 8 );
+    a[2][3] = 0;
+    a[3][0] = uint16( (r[0] + r[2] + 4) / 8 );
+    a[3][1] = uint16( (g[0] + g[2] + 4) / 8 );
+    a[3][2] = uint16( (b[0] + b[2] + 4) / 8 );
+    a[3][3] = 0;
+#endif
+}
+
+void CalcErrorBlock( const uint8* data, uint err[4][4] ) // err[k][c] <- sum of byte c over half k
+{
+#ifdef __SSE4_1__
+    __m128i d0 = _mm_loadu_si128(((__m128i*)data) + 0);
+    __m128i d1 = _mm_loadu_si128(((__m128i*)data) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)data) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)data) + 3);
+    // Clear the fourth byte of every pixel so it does not contribute to the sums.
+    __m128i dm0 = _mm_and_si128(d0, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm1 = _mm_and_si128(d1, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm2 = _mm_and_si128(d2, _mm_set1_epi32(0x00FFFFFF));
+    __m128i dm3 = _mm_and_si128(d3, _mm_set1_epi32(0x00FFFFFF));
+
+    __m128i d0l = _mm_unpacklo_epi8(dm0, _mm_setzero_si128());
+    __m128i d0h = _mm_unpackhi_epi8(dm0, _mm_setzero_si128());
+    __m128i d1l = _mm_unpacklo_epi8(dm1, _mm_setzero_si128());
+    __m128i d1h = _mm_unpackhi_epi8(dm1, _mm_setzero_si128());
+    __m128i d2l = _mm_unpacklo_epi8(dm2, _mm_setzero_si128());
+    __m128i d2h = _mm_unpackhi_epi8(dm2, _mm_setzero_si128());
+    __m128i d3l = _mm_unpacklo_epi8(dm3, _mm_setzero_si128());
+    __m128i d3h = _mm_unpackhi_epi8(dm3, _mm_setzero_si128());
+
+    __m128i sum0 = _mm_add_epi16(d0l, d1l);
+    __m128i sum1 = _mm_add_epi16(d0h, d1h);
+    __m128i sum2 = _mm_add_epi16(d2l, d3l);
+    __m128i sum3 = _mm_add_epi16(d2h, d3h);
+
+    __m128i sum0l = _mm_unpacklo_epi16(sum0, _mm_setzero_si128());
+    __m128i sum0h = _mm_unpackhi_epi16(sum0, _mm_setzero_si128());
+    __m128i sum1l = _mm_unpacklo_epi16(sum1, _mm_setzero_si128());
+    __m128i sum1h = _mm_unpackhi_epi16(sum1, _mm_setzero_si128());
+    __m128i sum2l = _mm_unpacklo_epi16(sum2, _mm_setzero_si128());
+    __m128i sum2h = _mm_unpackhi_epi16(sum2, _mm_setzero_si128());
+    __m128i sum3l = _mm_unpacklo_epi16(sum3, _mm_setzero_si128());
+    __m128i sum3h = _mm_unpackhi_epi16(sum3, _mm_setzero_si128());
+
+    __m128i b0 = _mm_add_epi32(sum0l, sum0h);
+    __m128i b1 = _mm_add_epi32(sum1l, sum1h);
+    __m128i b2 = _mm_add_epi32(sum2l, sum2h);
+    __m128i b3 = _mm_add_epi32(sum3l, sum3h);
+    // Pair the four partial sums into the four 8-pixel halves (same pairing as the scalar path).
+    __m128i a0 = _mm_add_epi32(b2, b3);
+    __m128i a1 = _mm_add_epi32(b0, b1);
+    __m128i a2 = _mm_add_epi32(b1, b3);
+    __m128i a3 = _mm_add_epi32(b0, b2);
+
+    _mm_storeu_si128((__m128i*)&err[0], a0);
+    _mm_storeu_si128((__m128i*)&err[1], a1);
+    _mm_storeu_si128((__m128i*)&err[2], a2);
+    _mm_storeu_si128((__m128i*)&err[3], a3);
+#else
+    uint terr[4][4];
+
+    memset(terr, 0, 16 * sizeof(uint));
+    // Sum bytes 0..2 of each 4-byte pixel into one of four 4-pixel groups; byte 3 is skipped.
+    for( int j=0; j<4; j++ )
+    {
+        for( int i=0; i<4; i++ )
+        {
+            int index = (j & 2) + (i >> 1);
+            uint d = *data++;
+            terr[index][0] += d;
+            d = *data++;
+            terr[index][1] += d;
+            d = *data++;
+            terr[index][2] += d;
+            data++;
+        }
+    }
+    // Pair the groups into the four 8-pixel halves; column 3 of err is zeroed afterwards.
+    for( int i=0; i<3; i++ )
+    {
+        err[0][i] = terr[2][i] + terr[3][i];
+        err[1][i] = terr[0][i] + terr[1][i];
+        err[2][i] = terr[1][i] + terr[3][i];
+        err[3][i] = terr[0][i] + terr[2][i];
+    }
+    for( int i=0; i<4; i++ )
+    {
+        err[i][3] = 0;
+    }
+#endif
+}
+
+uint CalcError( const uint block[4], const v4i& average )
+{
+    // block[] is indexed in byte order and average[] in the reverse channel order, hence 0<->2.
+    const uint cross = block[0] * average[2] + block[1] * average[1] + block[2] * average[0];
+    const uint squares = 8 * ( sq( average[0] ) + sq( average[1] ) + sq( average[2] ) );
+    // Big starting value to prevent negative values, but small enough to prevent overflow;
+    // all arithmetic below is unsigned.
+    return 0x3FFFFFFF - 2 * cross + squares;
+}
+
+void ProcessAverages( v4i* a ) // quantizes a[0..3] in place and fills a[4..7]; a needs 8 entries
+{
+#ifdef __SSE4_1__
+    for( int i=0; i<2; i++ )
+    {
+        __m128i d = _mm_loadu_si128((__m128i*)a[i*2]);
+        // Per 16-bit lane this is mul8bit( d, 31 ): t = d*31 + 128, c = ( t + ( t >> 8 ) ) >> 8.
+        __m128i t = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(31)), _mm_set1_epi16(128));
+
+        __m128i c = _mm_srli_epi16(_mm_add_epi16(t, _mm_srli_epi16(t, 8)), 8);
+        // c1 repeats the upper half (a[i*2+1]); diff = a[i*2] - a[i*2+1], limited to [-4, 3].
+        __m128i c1 = _mm_shuffle_epi32(c, _MM_SHUFFLE(3, 2, 3, 2));
+        __m128i diff = _mm_sub_epi16(c, c1);
+        diff = _mm_max_epi16(diff, _mm_set1_epi16(-4));
+        diff = _mm_min_epi16(diff, _mm_set1_epi16(3));
+
+        __m128i co = _mm_add_epi16(c1, diff);
+        // Lower half takes c1 + diff, upper half keeps the unmodified quantized value.
+        c = _mm_blend_epi16(co, c, 0xF0);
+        // Expand the 5-bit values to 8 bits by replicating their top bits.
+        __m128i a0 = _mm_or_si128(_mm_slli_epi16(c, 3), _mm_srli_epi16(c, 2));
+
+        _mm_storeu_si128((__m128i*)a[4+i*2], a0);
+    }
+
+    for( int i=0; i<2; i++ )
+    {
+        __m128i d = _mm_loadu_si128((__m128i*)a[i*2]);
+        // Per lane mul8bit( d, 15 ), then the nibble is repeated (same values as the g_avg2 lookup).
+        __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(d, _mm_set1_epi16(15)), _mm_set1_epi16(128));
+        __m128i t1 = _mm_srli_epi16(_mm_add_epi16(t0, _mm_srli_epi16(t0, 8)), 8);
+
+        __m128i t2 = _mm_or_si128(t1, _mm_slli_epi16(t1, 4));
+
+        _mm_storeu_si128((__m128i*)a[i*2], t2);
+    }
+#else
+    for( int i=0; i<2; i++ )
+    {
+        for( int j=0; j<3; j++ )
+        {
+            int32 c1 = mul8bit( a[i*2+1][j], 31 );
+            int32 c2 = mul8bit( a[i*2][j], 31 );
+            // Limit the difference c2 - c1 to [-4, 3].
+            int32 diff = c2 - c1;
+            if( diff > 3 ) diff = 3;
+            else if( diff < -4 ) diff = -4;
+
+            int32 co = c1 + diff;
+            // Expand the 5-bit values to 8 bits by replicating their top bits.
+            a[5+i*2][j] = ( c1 << 3 ) | ( c1 >> 2 );
+            a[4+i*2][j] = ( co << 3 ) | ( co >> 2 );
+        }
+    }
+    // Quantize the original averages with mul8bit( x, 15 ) and expand them through g_avg2.
+    for( int i=0; i<4; i++ )
+    {
+        a[i][0] = g_avg2[mul8bit( a[i][0], 15 )];
+        a[i][1] = g_avg2[mul8bit( a[i][1], 15 )];
+        a[i][2] = g_avg2[mul8bit( a[i][2], 15 )];
+    }
+#endif
+}
+
+void EncodeAverages( uint64& _d, const v4i* a, size_t idx )
+{
+    // ORs idx (at bit 24) and the color pair a[2*idx], a[2*idx+1] into _d.
+    const uint16* first = a[( idx << 1 ) + 0];
+    const uint16* second = a[( idx << 1 ) + 1];
+    const bool useDelta = ( idx & 0x2 ) != 0;
+    uint64 packed = _d | ( idx << 24 );
+    for( int ch=0; ch<3; ch++ )
+    {
+        const int shift = ch*8;
+        if( !useDelta )
+        {
+            // Two 4-bit values per byte: first in the low nibble, second in the high nibble.
+            packed |= uint64( first[ch] >> 4 ) << shift;
+            packed |= uint64( second[ch] >> 4 ) << ( shift + 4 );
+        }
+        else
+        {
+            // Top 5 bits of second as the base, plus ( first - second ) / 8 kept as a 3-bit field.
+            const int32 delta = ( ( first[ch] & 0xF8 ) - ( second[ch] & 0xF8 ) ) >> 3;
+            packed |= uint64( second[ch] & 0xF8 ) << shift;
+            packed |= uint64( delta & 0x7 ) << shift;
+        }
+    }
+    _d = packed;
+}
+
+uint64 CheckSolid( const uint8* src ) // nonzero encoded block if all 16 pixels are identical, else 0
+{
+#ifdef __SSE4_1__
+    __m128i d0 = _mm_loadu_si128(((__m128i*)src) + 0);
+    __m128i d1 = _mm_loadu_si128(((__m128i*)src) + 1);
+    __m128i d2 = _mm_loadu_si128(((__m128i*)src) + 2);
+    __m128i d3 = _mm_loadu_si128(((__m128i*)src) + 3);
+    // Broadcast the first pixel to all four lanes and compare every load against it.
+    __m128i c = _mm_shuffle_epi32(d0, _MM_SHUFFLE(0, 0, 0, 0));
+
+    __m128i c0 = _mm_cmpeq_epi8(d0, c);
+    __m128i c1 = _mm_cmpeq_epi8(d1, c);
+    __m128i c2 = _mm_cmpeq_epi8(d2, c);
+    __m128i c3 = _mm_cmpeq_epi8(d3, c);
+
+    __m128i m0 = _mm_and_si128(c0, c1);
+    __m128i m1 = _mm_and_si128(c2, c3);
+    __m128i m = _mm_and_si128(m0, m1);
+    // m is all ones only when every byte of every pixel matched the first pixel.
+    if (!_mm_testc_si128(m, _mm_set1_epi32(-1)))
+    {
+        return 0;
+    }
+#else
+    const uint8* ptr = src + 4;
+    for( int i=1; i<16; i++ )
+    {
+        if( memcmp( src, ptr, 4 ) != 0 ) // compares all four bytes of the pixel
+        {
+            return 0;
+        }
+        ptr += 4;
+    }
+#endif
+    return 0x02000000 | // bit 25, plus the top 5 bits of bytes 0, 1 and 2 of the first pixel
+        ( uint( src[0] & 0xF8 ) << 16 ) |
+        ( uint( src[1] & 0xF8 ) << 8 ) |
+        ( uint( src[2] & 0xF8 ) );
+}
+
+void PrepareAverages( v4i a[8], const uint8* src, uint err[4] )
+{
+    // Fills a[0..7], then adds CalcError for a[0..3] to err[0..1] and for a[4..7] to err[2..3].
+    Average( src, a );
+    ProcessAverages( a );
+    uint sums[4][4];
+    CalcErrorBlock( src, sums );
+    for( int half=0; half<4; half++ )
+    {
+        const int slot = half >> 1;
+        err[slot] += CalcError( sums[half], a[half] );
+        err[slot+2] += CalcError( sums[half], a[half+4] );
+    }
+}
+
+void FindBestFit( uint64 terr[2][8], uint16 tsel[16][8], v4i a[8], const uint32* id, const uint8* data )
+{
+    for( size_t i=0; i<16; i++ ) // per pixel: best selector for each of the 8 tables, error added to terr
+    {
+        uint16* sel = tsel[i];
+        uint bid = id[i];
+        uint64* ter = terr[bid%2];
+        // Pixel bytes are read as b, g, r; the fourth byte is skipped.
+        uint8 b = *data++;
+        uint8 g = *data++;
+        uint8 r = *data++;
+        data++;
+        // Difference between the base color a[bid] chosen for this pixel and the pixel itself.
+        int dr = a[bid][0] - r;
+        int dg = a[bid][1] - g;
+        int db = a[bid][2] - b;
+
+#ifdef __SSE4_1__
+        // Reference implementation
+
+        __m128i pix = _mm_set1_epi32(dr * 77 + dg * 151 + db * 28);
+        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+        __m128i error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[0]));
+        __m128i error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[1]));
+        __m128i error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[0]));
+        __m128i error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[1]));
+        // Smaller of error0/error1 and its index (0 or 1).
+        __m128i index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
+        __m128i minError0 = _mm_min_epi32(error0, error1);
+        // Smaller of error2/error3 and its index (2 or 3).
+        __m128i index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
+        __m128i minError1 = _mm_min_epi32(error2, error3);
+        // Overall winner among the four candidates.
+        __m128i minIndex0 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
+        __m128i minError = _mm_min_epi32(minError0, minError1);
+
+        // Squaring the minimum error to produce correct values when adding
+        __m128i minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
+        __m128i squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
+        squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
+        _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
+        __m128i minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
+        __m128i squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
+        squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
+        _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
+
+        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+        error0 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[2]));
+        error1 = _mm_abs_epi32(_mm_add_epi32(pix, g_table256_SIMD[3]));
+        error2 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[2]));
+        error3 = _mm_abs_epi32(_mm_sub_epi32(pix, g_table256_SIMD[3]));
+
+        index0 = _mm_and_si128(_mm_cmplt_epi32(error1, error0), _mm_set1_epi32(1));
+        minError0 = _mm_min_epi32(error0, error1);
+
+        index1 = _mm_sub_epi32(_mm_set1_epi32(2), _mm_cmplt_epi32(error3, error2));
+        minError1 = _mm_min_epi32(error2, error3);
+
+        __m128i minIndex1 = _mm_blendv_epi8(index0, index1, _mm_cmplt_epi32(minError1, minError0));
+        minError = _mm_min_epi32(minError0, minError1);
+
+        // Squaring the minimum error to produce correct values when adding
+        minErrorLow = _mm_shuffle_epi32(minError, _MM_SHUFFLE(1, 1, 0, 0));
+        squareErrorLow = _mm_mul_epi32(minErrorLow, minErrorLow);
+        squareErrorLow = _mm_add_epi64(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 2));
+        _mm_storeu_si128(((__m128i*)ter) + 2, squareErrorLow);
+        minErrorHigh = _mm_shuffle_epi32(minError, _MM_SHUFFLE(3, 3, 2, 2));
+        squareErrorHigh = _mm_mul_epi32(minErrorHigh, minErrorHigh);
+        squareErrorHigh = _mm_add_epi64(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 3));
+        _mm_storeu_si128(((__m128i*)ter) + 3, squareErrorHigh);
+        __m128i minIndex = _mm_packs_epi32(minIndex0, minIndex1);
+        _mm_storeu_si128((__m128i*)sel, minIndex);
+#else
+        int pix = dr * 77 + dg * 151 + db * 28;
+        // For each table keep the entry j that minimizes ( tab[j] + pix )^2.
+        for( int t=0; t<8; t++ )
+        {
+            const int64* tab = g_table256[t];
+            uint idx = 0;
+            uint64 err = sq( tab[0] + pix );
+            for( int j=1; j<4; j++ )
+            {
+                uint64 local = sq( tab[j] + pix );
+                if( local < err )
+                {
+                    err = local;
+                    idx = j;
+                }
+            }
+            *sel++ = idx;
+            *ter++ += err;
+        }
+#endif
+    }
+}
+
+#ifdef __SSE4_1__
+// Non-reference implementation, but faster. Produces same results as the AVX2 version
+void FindBestFit( uint32 terr[2][8], uint16 tsel[16][8], v4i a[8], const uint32* id, const uint8* data )
+{
+    for( size_t i=0; i<16; i++ ) // per pixel: selectors go to tsel[i], squared errors are added to terr
+    {
+        uint16* sel = tsel[i];
+        uint bid = id[i];
+        uint32* ter = terr[bid%2];
+
+        uint8 b = *data++;
+        uint8 g = *data++;
+        uint8 r = *data++;
+        data++;
+
+        int dr = a[bid][0] - r;
+        int dg = a[bid][1] - g;
+        int db = a[bid][2] - b;
+
+        // The scaling values are divided by two and rounded, to allow the differences to be in the range of signed int16
+        // This produces slightly different results, but is significant faster
+        __m128i pixel = _mm_set1_epi16(dr * 38 + dg * 76 + db * 14);
+        __m128i pix = _mm_abs_epi16(pixel);
+
+        // Taking the absolute value is way faster. The values are only used to sort, so the result will be the same.
+        // Since the selector table is symmetrical, we need to calculate the difference only for half of the entries.
+        __m128i error0 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[0]));
+        __m128i error1 = _mm_abs_epi16(_mm_sub_epi16(pix, g_table128_SIMD[1]));
+        // Index 0 or 1 of the smaller of the two errors, per 16-bit lane.
+        __m128i index = _mm_and_si128(_mm_cmplt_epi16(error1, error0), _mm_set1_epi16(1));
+        __m128i minError = _mm_min_epi16(error0, error1);
+
+        // Exploiting symmetry of the selector table and use the sign bit
+        // This produces slightly different results, but is needed to produce same results as AVX2 implementation
+        __m128i indexBit = _mm_andnot_si128(_mm_srli_epi16(pixel, 15), _mm_set1_epi8(-1));
+        __m128i minIndex = _mm_or_si128(index, _mm_add_epi16(indexBit, indexBit));
+
+        // Squaring the minimum error to produce correct values when adding
+        __m128i squareErrorLo = _mm_mullo_epi16(minError, minError);
+        __m128i squareErrorHi = _mm_mulhi_epi16(minError, minError);
+        // Interleave the low and high 16-bit product halves into 32-bit squared errors.
+        __m128i squareErrorLow = _mm_unpacklo_epi16(squareErrorLo, squareErrorHi);
+        __m128i squareErrorHigh = _mm_unpackhi_epi16(squareErrorLo, squareErrorHi);
+
+        squareErrorLow = _mm_add_epi32(squareErrorLow, _mm_loadu_si128(((__m128i*)ter) + 0));
+        _mm_storeu_si128(((__m128i*)ter) + 0, squareErrorLow);
+        squareErrorHigh = _mm_add_epi32(squareErrorHigh, _mm_loadu_si128(((__m128i*)ter) + 1));
+        _mm_storeu_si128(((__m128i*)ter) + 1, squareErrorHigh);
+
+        _mm_storeu_si128((__m128i*)sel, minIndex);
+    }
+}
+#endif
+
+uint8_t convert6(float f) // truncates f, clamps it to [0, 1023] and maps it onto 0..63
+{
+    const int i = (std::max(0, std::min(1023, static_cast<int>(f))) - 15) >> 1;
+    return (i + 11 - ((i + 11) >> 7) - ((i + 4) >> 7)) >> 3;
+}
+
+uint8_t convert7(float f) // truncates f, clamps it to [0, 1023] and maps it onto 0..127
+{
+    const int i = (std::max(0, std::min(1023, static_cast<int>(f))) - 15) >> 1;
+    return (i + 9 - ((i + 9) >> 8) - ((i + 6) >> 8)) >> 2;
+}
+
+std::pair<uint64, uint64> Planar(const uint8* src) // returns (encoded block, its accumulated error)
+{
+    int32 r = 0;
+    int32 g = 0;
+    int32 b = 0;
+    // Channel sums over the 16 pixels; bytes 0, 1, 2 of each 4-byte pixel are read as b, g, r.
+    for (int i = 0; i < 16; ++i)
+    {
+        b += src[i * 4 + 0];
+        g += src[i * 4 + 1];
+        r += src[i * 4 + 2];
+    }
+
+    int32 difRyz = 0;
+    int32 difGyz = 0;
+    int32 difByz = 0;
+    int32 difRxz = 0;
+    int32 difGxz = 0;
+    int32 difBxz = 0;
+
+    const int32 scaling[] = { -255, -85, 85, 255 };
+    // Correlate each pixel's deviation from the mean (16 * value - sum) with its two grid positions.
+    for (int i = 0; i < 16; ++i)
+    {
+        int32 difB = (static_cast<int>(src[i * 4 + 0]) << 4) - b;
+        int32 difG = (static_cast<int>(src[i * 4 + 1]) << 4) - g;
+        int32 difR = (static_cast<int>(src[i * 4 + 2]) << 4) - r;
+
+        difRyz += difR * scaling[i % 4];
+        difGyz += difG * scaling[i % 4];
+        difByz += difB * scaling[i % 4];
+
+        difRxz += difR * scaling[i / 4];
+        difGxz += difG * scaling[i / 4];
+        difBxz += difB * scaling[i / 4];
+    }
+
+    const float scale = -4.0f / ((255 * 255 * 8.0f + 85 * 85 * 8.0f) * 16.0f);
+
+    float aR = difRxz * scale;
+    float aG = difGxz * scale;
+    float aB = difBxz * scale;
+
+    float bR = difRyz * scale;
+    float bG = difGyz * scale;
+    float bB = difByz * scale;
+    // Channel sum times 4/16, i.e. four times the mean value.
+    float dR = r * (4.0f / 16.0f);
+    float dG = g * (4.0f / 16.0f);
+    float dB = b * (4.0f / 16.0f);
+
+    // calculating the three colors RGBO, RGBH, and RGBV.  RGB = df - af * x - bf * y;
+    float cofR = (aR *  255.0f + (bR *  255.0f + dR));
+    float cofG = (aG *  255.0f + (bG *  255.0f + dG));
+    float cofB = (aB *  255.0f + (bB *  255.0f + dB));
+    float chfR = (aR * -425.0f + (bR *  255.0f + dR));
+    float chfG = (aG * -425.0f + (bG *  255.0f + dG));
+    float chfB = (aB * -425.0f + (bB *  255.0f + dB));
+    float cvfR = (aR *  255.0f + (bR * -425.0f + dR));
+    float cvfG = (aG *  255.0f + (bG * -425.0f + dG));
+    float cvfB = (aB *  255.0f + (bB * -425.0f + dB));
+
+    // convert to r6g7b6
+    int32 coR = convert6(cofR);
+    int32 coG = convert7(cofG);
+    int32 coB = convert6(cofB);
+    int32 chR = convert6(chfR);
+    int32 chG = convert7(chfG);
+    int32 chB = convert6(chfB);
+    int32 cvR = convert6(cvfR);
+    int32 cvG = convert7(cvfG);
+    int32 cvB = convert6(cvfB);
+
+    // Error calculation
+    int32 ro0 = coR;
+    int32 go0 = coG;
+    int32 bo0 = coB;
+    int32 ro1 = (ro0 >> 4) | (ro0 << 2);
+    int32 go1 = (go0 >> 6) | (go0 << 1);
+    int32 bo1 = (bo0 >> 4) | (bo0 << 2);
+    int32 ro2 = (ro1 << 2) + 2;
+    int32 go2 = (go1 << 2) + 2;
+    int32 bo2 = (bo1 << 2) + 2;
+
+    int32 rh0 = chR;
+    int32 gh0 = chG;
+    int32 bh0 = chB;
+    int32 rh1 = (rh0 >> 4) | (rh0 << 2);
+    int32 gh1 = (gh0 >> 6) | (gh0 << 1);
+    int32 bh1 = (bh0 >> 4) | (bh0 << 2);
+
+    int32 rh2 = rh1 - ro1;
+    int32 gh2 = gh1 - go1;
+    int32 bh2 = bh1 - bo1;
+
+    int32 rv0 = cvR;
+    int32 gv0 = cvG;
+    int32 bv0 = cvB;
+    int32 rv1 = (rv0 >> 4) | (rv0 << 2);
+    int32 gv1 = (gv0 >> 6) | (gv0 << 1);
+    int32 bv1 = (bv0 >> 4) | (bv0 << 2);
+
+    int32 rv2 = rv1 - ro1;
+    int32 gv2 = gv1 - go1;
+    int32 bv2 = bv1 - bo1;
+
+    uint64 error = 0;
+    // Re-evaluate the quantized plane at every pixel and add up the squared weighted difference.
+    for (int i = 0; i < 16; ++i)
+    {
+        int32 cR = clampu8((rh2 * (i / 4) + rv2 * (i % 4) + ro2) >> 2);
+        int32 cG = clampu8((gh2 * (i / 4) + gv2 * (i % 4) + go2) >> 2);
+        int32 cB = clampu8((bh2 * (i / 4) + bv2 * (i % 4) + bo2) >> 2);
+
+        int32 difB = static_cast<int>(src[i * 4 + 0]) - cB;
+        int32 difG = static_cast<int>(src[i * 4 + 1]) - cG;
+        int32 difR = static_cast<int>(src[i * 4 + 2]) - cR;
+
+        int32 dif = difR * 38 + difG * 76 + difB * 14;
+
+        error += dif * dif;
+    }
+
+    /**/
+    uint32 rgbv = cvB | (cvG << 6) | (cvR << 13);
+    uint32 rgbh = chB | (chG << 6) | (chR << 13);
+    uint32 hi = rgbv | ((rgbh & 0x1FFF) << 19);
+    uint32 lo = (chR & 0x1) | 0x2 | ((chR << 1) & 0x7C);
+    lo |= ((coB & 0x07) <<  7) | ((coB & 0x18) <<  8) | ((coB & 0x20) << 11);
+    lo |= ((coG & 0x3F) << 17) | ((coG & 0x40) << 18);
+    lo |= coR << 25;
+
+    const int32 idx = (coR & 0x20) | ((coG & 0x20) >> 1) | ((coB & 0x1E) >> 1);
+
+    lo |= g_flags[idx];
+    // Byte-swap both 32-bit words; lo fills the low half of the result and hi the high half.
+    uint64 result = static_cast<uint32>(bx::endianSwap(lo));
+    result |= static_cast<uint64>(static_cast<uint32>(bx::endianSwap(hi))) << 32;
+
+    return std::make_pair(result, error);
+}
+
+template<class T, class S>
+uint64 EncodeSelectors( uint64 d, const T terr[2][8], const S tsel[16][8], const uint32* id, const uint64 value, const uint64 error)
+{
+    const size_t table0 = GetLeastError( terr[0], 8 );
+    const size_t table1 = GetLeastError( terr[1], 8 );
+    const size_t chosen[2] = { table0, table1 };
+
+    // Return the alternative encoding unless the table-based one has strictly smaller error.
+    if ((terr[0][table0] + terr[1][table1]) >= error)
+    {
+        return value;
+    }
+
+    d |= table0 << 26;
+    d |= table1 << 29;
+    for( int px=0; px<16; px++ )
+    {
+        const uint64 sel = tsel[px][chosen[id[px]%2]];
+        d |= ( sel & 0x1 ) << ( px + 32 );
+        d |= ( sel & 0x2 ) << ( px + 47 );
+    }
+    return FixByteOrder(d);
+}
+}
+
+uint64 ProcessRGB( const uint8* src ) // encodes 16 pixels of 4 bytes each into one 64-bit block
+{
+    uint64 d = CheckSolid( src );
+    if( d != 0 ) return d; // all pixels identical: CheckSolid's value is returned as is
+    // Otherwise pick the candidate with the least error in err[] and encode its two base colors.
+    v4i a[8];
+    uint err[4] = {};
+    PrepareAverages( a, src, err );
+    size_t idx = GetLeastError( err, 4 );
+    EncodeAverages( d, a, idx );
+    // The element type of terr selects which FindBestFit overload is called below.
+#if defined __SSE4_1__ && !defined REFERENCE_IMPLEMENTATION
+    uint32 terr[2][8] = {};
+#else
+    uint64 terr[2][8] = {};
+#endif
+    uint16 tsel[16][8];
+    const uint32* id = g_id[idx];
+    FindBestFit( terr, tsel, a, id, src );
+
+    return FixByteOrder( EncodeSelectors( d, terr, tsel, id ) );
+}
+
+uint64 ProcessRGB_ETC2( const uint8* src )
+{
+    // Candidate 1: the planar encoding together with its error.
+    const std::pair<uint64, uint64> planar = Planar( src );
+
+    // Candidate 2: the average-based encoding, built with the same steps as ProcessRGB().
+    v4i averages[8];
+    uint modeError[4] = {};
+    PrepareAverages( averages, src, modeError );
+    const size_t mode = GetLeastError( modeError, 4 );
+    uint64 block = 0;
+    EncodeAverages( block, averages, mode );
+    uint64 tableError[2][8] = {};
+    uint16 selectors[16][8];
+    FindBestFit( tableError, selectors, averages, g_id[mode], src );
+
+    // EncodeSelectors hands back the planar block when the table error is not smaller.
+    return EncodeSelectors( block, tableError, selectors, g_id[mode], planar.first, planar.second );
+}
diff --git a/3rdparty/bimg/3rdparty/etc2/ProcessRGB.hpp b/3rdparty/bimg/3rdparty/etc2/ProcessRGB.hpp
new file mode 100644
index 0000000..21434a3
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/ProcessRGB.hpp
@@ -0,0 +1,9 @@
+#ifndef __PROCESSRGB_HPP__
+#define __PROCESSRGB_HPP__
+
+#include "Types.hpp"
+
+uint64 ProcessRGB( const uint8* src );
+uint64 ProcessRGB_ETC2( const uint8* src );
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/etc2/Tables.cpp b/3rdparty/bimg/3rdparty/etc2/Tables.cpp
new file mode 100644
index 0000000..968fbf5
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/Tables.cpp
@@ -0,0 +1,109 @@
+#include "Tables.hpp"
+
+const int32 g_table[8][4] = {
+    {  2,  8,   -2,   -8 },
+    {  5, 17,   -5,  -17 },
+    {  9, 29,   -9,  -29 },
+    { 13, 42,  -13,  -42 },
+    { 18, 60,  -18,  -60 },
+    { 24, 80,  -24,  -80 },
+    { 33, 106, -33, -106 },
+    { 47, 183, -47, -183 }
+};
+
+const int64 g_table256[8][4] = {
+    {  2*256,  8*256,   -2*256,   -8*256 },
+    {  5*256, 17*256,   -5*256,  -17*256 },
+    {  9*256, 29*256,   -9*256,  -29*256 },
+    { 13*256, 42*256,  -13*256,  -42*256 },
+    { 18*256, 60*256,  -18*256,  -60*256 },
+    { 24*256, 80*256,  -24*256,  -80*256 },
+    { 33*256, 106*256, -33*256, -106*256 },
+    { 47*256, 183*256, -47*256, -183*256 }
+};
+
+const uint32 g_id[4][16] = {
+    { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 },
+    { 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2, 3, 3, 2, 2 },
+    { 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4 },
+    { 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6, 7, 7, 6, 6 }
+};
+
+const uint32 g_avg2[16] = {
+    0x00,
+    0x11,
+    0x22,
+    0x33,
+    0x44,
+    0x55,
+    0x66,
+    0x77,
+    0x88,
+    0x99,
+    0xAA,
+    0xBB,
+    0xCC,
+    0xDD,
+    0xEE,
+    0xFF
+};
+
+const uint32 g_flags[64] = {
+    0x80800402, 0x80800402, 0x80800402, 0x80800402,
+    0x80800402, 0x80800402, 0x80800402, 0x8080E002,
+    0x80800402, 0x80800402, 0x8080E002, 0x8080E002,
+    0x80800402, 0x8080E002, 0x8080E002, 0x8080E002,
+    0x80000402, 0x80000402, 0x80000402, 0x80000402,
+    0x80000402, 0x80000402, 0x80000402, 0x8000E002,
+    0x80000402, 0x80000402, 0x8000E002, 0x8000E002,
+    0x80000402, 0x8000E002, 0x8000E002, 0x8000E002,
+    0x00800402, 0x00800402, 0x00800402, 0x00800402,
+    0x00800402, 0x00800402, 0x00800402, 0x0080E002,
+    0x00800402, 0x00800402, 0x0080E002, 0x0080E002,
+    0x00800402, 0x0080E002, 0x0080E002, 0x0080E002,
+    0x00000402, 0x00000402, 0x00000402, 0x00000402,
+    0x00000402, 0x00000402, 0x00000402, 0x0000E002,
+    0x00000402, 0x00000402, 0x0000E002, 0x0000E002,
+    0x00000402, 0x0000E002, 0x0000E002, 0x0000E002
+};
+
+#ifdef __SSE4_1__
+const uint8 g_flags_AVX2[64] =
+{
+    0x63, 0x63, 0x63, 0x63,
+    0x63, 0x63, 0x63, 0x7D,
+    0x63, 0x63, 0x7D, 0x7D,
+    0x63, 0x7D, 0x7D, 0x7D,
+    0x43, 0x43, 0x43, 0x43,
+    0x43, 0x43, 0x43, 0x5D,
+    0x43, 0x43, 0x5D, 0x5D,
+    0x43, 0x5D, 0x5D, 0x5D,
+    0x23, 0x23, 0x23, 0x23,
+    0x23, 0x23, 0x23, 0x3D,
+    0x23, 0x23, 0x3D, 0x3D,
+    0x23, 0x3D, 0x3D, 0x3D,
+    0x03, 0x03, 0x03, 0x03,
+    0x03, 0x03, 0x03, 0x1D,
+    0x03, 0x03, 0x1D, 0x1D,
+    0x03, 0x1D, 0x1D, 0x1D,
+};
+
+const __m128i g_table_SIMD[2] =
+{
+    _mm_setr_epi16(   2,   5,   9,  13,  18,  24,  33,  47),
+    _mm_setr_epi16(   8,  17,  29,  42,  60,  80, 106, 183)
+};
+const __m128i g_table128_SIMD[2] =
+{
+    _mm_setr_epi16(   2*128,   5*128,   9*128,  13*128,  18*128,  24*128,  33*128,  47*128),
+    _mm_setr_epi16(   8*128,  17*128,  29*128,  42*128,  60*128,  80*128, 106*128, 183*128)
+};
+const __m128i g_table256_SIMD[4] =
+{
+    _mm_setr_epi32(  2*256,   5*256,   9*256,  13*256),
+    _mm_setr_epi32(  8*256,  17*256,  29*256,  42*256),
+    _mm_setr_epi32( 18*256,  24*256,  33*256,  47*256),
+    _mm_setr_epi32( 60*256,  80*256, 106*256, 183*256)
+};
+#endif
+
diff --git a/3rdparty/bimg/3rdparty/etc2/Tables.hpp b/3rdparty/bimg/3rdparty/etc2/Tables.hpp
new file mode 100644
index 0000000..b570526
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/Tables.hpp
@@ -0,0 +1,25 @@
+#ifndef __TABLES_HPP__
+#define __TABLES_HPP__
+
+#include "Types.hpp"
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+
+extern const int32 g_table[8][4];
+extern const int64 g_table256[8][4];
+
+extern const uint32 g_id[4][16];
+
+extern const uint32 g_avg2[16];
+
+extern const uint32 g_flags[64];
+
+#ifdef __SSE4_1__
+extern const uint8 g_flags_AVX2[64];
+extern const __m128i g_table_SIMD[2];
+extern const __m128i g_table128_SIMD[2];
+extern const __m128i g_table256_SIMD[4];
+#endif
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/etc2/Types.hpp b/3rdparty/bimg/3rdparty/etc2/Types.hpp
new file mode 100644
index 0000000..b31da22
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/Types.hpp
@@ -0,0 +1,17 @@
+#ifndef __DARKRL__TYPES_HPP__
+#define __DARKRL__TYPES_HPP__
+
+#include <stdint.h>
+
+typedef int8_t      int8;
+typedef uint8_t     uint8;
+typedef int16_t     int16;
+typedef uint16_t    uint16;
+typedef int32_t     int32;
+typedef uint32_t    uint32;
+typedef int64_t     int64;
+typedef uint64_t    uint64;
+
+typedef unsigned int uint;
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/etc2/Vector.hpp b/3rdparty/bimg/3rdparty/etc2/Vector.hpp
new file mode 100644
index 0000000..3005fdc
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/etc2/Vector.hpp
@@ -0,0 +1,222 @@
+#ifndef __DARKRL__VECTOR_HPP__
+#define __DARKRL__VECTOR_HPP__
+
+#include <assert.h>
+#include <algorithm>
+#include <math.h>
+
+#include "Math.hpp"
+#include "Types.hpp"
+
+template<class T>
+struct Vector2
+{
+    Vector2() : x( 0 ), y( 0 ) {}
+    Vector2( T v ) : x( v ), y( v ) {}
+    Vector2( T _x, T _y ) : x( _x ), y( _y ) {}
+
+    bool operator==( const Vector2<T>& rhs ) const { return x == rhs.x && y == rhs.y; }
+    bool operator!=( const Vector2<T>& rhs ) const { return !( *this == rhs ); }
+
+    Vector2<T>& operator+=( const Vector2<T>& rhs )
+    {
+        x += rhs.x;
+        y += rhs.y;
+        return *this;
+    }
+    Vector2<T>& operator-=( const Vector2<T>& rhs )
+    {
+        x -= rhs.x;
+        y -= rhs.y;
+        return *this;
+    }
+    Vector2<T>& operator*=( const Vector2<T>& rhs )
+    {
+        x *= rhs.x;
+        y *= rhs.y;
+        return *this;
+    }
+
+    T x, y;
+};
+
+template<class T>
+Vector2<T> operator+( const Vector2<T>& lhs, const Vector2<T>& rhs )
+{
+    return Vector2<T>( lhs.x + rhs.x, lhs.y + rhs.y );
+}
+
+template<class T>
+Vector2<T> operator-( const Vector2<T>& lhs, const Vector2<T>& rhs )
+{
+    return Vector2<T>( lhs.x - rhs.x, lhs.y - rhs.y );
+}
+
+template<class T>
+Vector2<T> operator*( const Vector2<T>& lhs, const float& rhs )
+{
+    return Vector2<T>( lhs.x * rhs, lhs.y * rhs );
+}
+
+template<class T>
+Vector2<T> operator/( const Vector2<T>& lhs, const T& rhs )
+{
+    return Vector2<T>( lhs.x / rhs, lhs.y / rhs );
+}
+
+
+typedef Vector2<int32> v2i;
+typedef Vector2<float> v2f;
+
+
+template<class T>
+struct Vector3 // three-component vector of T; arithmetic operators act per component
+{
+    Vector3() : x( 0 ), y( 0 ), z( 0 ) {}
+    Vector3( T v ) : x( v ), y( v ), z( v ) {} // broadcasts one value to x, y and z
+    Vector3( T _x, T _y, T _z ) : x( _x ), y( _y ), z( _z ) {}
+    template<class Y>
+    Vector3( const Vector3<Y>& v ) : x( T( v.x ) ), y( T( v.y ) ), z( T( v.z ) ) {} // converting copy from a vector with another element type
+
+    T Luminance() const { return T( x * 0.3f + y * 0.59f + z * 0.11f ); } // weighted sum of x, y, z converted back to T
+    void Clamp() // limits every component to the range [0, 1]
+    {
+        x = std::min( T(1), std::max( T(0), x ) );
+        y = std::min( T(1), std::max( T(0), y ) );
+        z = std::min( T(1), std::max( T(0), z ) );
+    }
+
+    bool operator==( const Vector3<T>& rhs ) const { return x == rhs.x && y == rhs.y && z == rhs.z; }
+    bool operator!=( const Vector3<T>& rhs ) const { return !( *this == rhs ); } // must take Vector3: it forwards to operator== above
+
+    T& operator[]( uint idx ) { assert( idx < 3 ); return ((T*)this)[idx]; } // relies on x, y, z being laid out contiguously from the start of the object
+    const T& operator[]( uint idx ) const { assert( idx < 3 ); return ((const T*)this)[idx]; }
+
+    Vector3<T> operator+=( const Vector3<T>& rhs ) // updates *this, returns a copy (by value)
+    {
+        x += rhs.x;
+        y += rhs.y;
+        z += rhs.z;
+        return *this;
+    }
+
+    Vector3<T> operator*=( const Vector3<T>& rhs ) // component-wise product; returns a copy
+    {
+        x *= rhs.x;
+        y *= rhs.y;
+        z *= rhs.z;
+        return *this;
+    }
+
+    Vector3<T> operator*=( const float& rhs ) // scales every component; returns a copy
+    {
+        x *= rhs;
+        y *= rhs;
+        z *= rhs;
+        return *this;
+    }
+
+    T x, y, z;
+    T padding; // never initialised or read in this struct; brings the member count to four
+};
+
+template<class T>
+Vector3<T> operator+( const Vector3<T>& lhs, const Vector3<T>& rhs )
+{
+    return Vector3<T>( lhs.x + rhs.x, lhs.y + rhs.y, lhs.z + rhs.z );
+}
+
+template<class T>
+Vector3<T> operator-( const Vector3<T>& lhs, const Vector3<T>& rhs )
+{
+    return Vector3<T>( lhs.x - rhs.x, lhs.y - rhs.y, lhs.z - rhs.z );
+}
+
+template<class T>
+Vector3<T> operator*( const Vector3<T>& lhs, const Vector3<T>& rhs )
+{
+    return Vector3<T>( lhs.x * rhs.x, lhs.y * rhs.y, lhs.z * rhs.z );
+}
+
+template<class T>
+Vector3<T> operator*( const Vector3<T>& lhs, const float& rhs )
+{
+    return Vector3<T>( T( lhs.x * rhs ), T( lhs.y * rhs ), T( lhs.z * rhs ) );
+}
+
+template<class T>
+Vector3<T> operator/( const Vector3<T>& lhs, const T& rhs )
+{
+    return Vector3<T>( lhs.x / rhs, lhs.y / rhs, lhs.z / rhs );
+}
+
+template<class T>
+bool operator<( const Vector3<T>& lhs, const Vector3<T>& rhs )
+{
+    return lhs.Luminance() < rhs.Luminance();
+}
+
+typedef Vector3<int32> v3i;
+typedef Vector3<float> v3f;
+typedef Vector3<uint8> v3b;
+
+
+static inline v3b v3f_to_v3b( const v3f& v ) // converts each float component to a byte: clamp to [0, 1], scale by 255, truncate
+{
+    return v3b( uint8( std::max( 0.f, std::min( 1.f, v.x ) ) * 255 ), uint8( std::max( 0.f, std::min( 1.f, v.y ) ) * 255 ), uint8( std::max( 0.f, std::min( 1.f, v.z ) ) * 255 ) ); // lower clamp matters: converting a negative float to uint8 is undefined behaviour
+}
+
+template<class T>
+Vector3<T> Mix( const Vector3<T>& v1, const Vector3<T>& v2, float amount )
+{
+    return v1 + ( v2 - v1 ) * amount;
+}
+
+template<>
+inline v3b Mix( const v3b& v1, const v3b& v2, float amount )
+{
+    return v3b( v3f( v1 ) + ( v3f( v2 ) - v3f( v1 ) ) * amount );
+}
+
+template<class T>
+Vector3<T> Desaturate( const Vector3<T>& v )
+{
+    T l = v.Luminance();
+    return Vector3<T>( l, l, l );
+}
+
+template<class T>
+Vector3<T> Desaturate( const Vector3<T>& v, float mul )
+{
+    T l = T( v.Luminance() * mul );
+    return Vector3<T>( l, l, l );
+}
+
+template<class T>
+Vector3<T> pow( const Vector3<T>& base, float exponent )
+{
+    return Vector3<T>(
+        pow( base.x, exponent ),
+        pow( base.y, exponent ),
+        pow( base.z, exponent ) );
+}
+
+template<class T>
+Vector3<T> sRGB2linear( const Vector3<T>& v )
+{
+    return Vector3<T>(
+        sRGB2linear( v.x ),
+        sRGB2linear( v.y ),
+        sRGB2linear( v.z ) );
+}
+
+template<class T>
+Vector3<T> linear2sRGB( const Vector3<T>& v )
+{
+    return Vector3<T>(
+        linear2sRGB( v.x ),
+        linear2sRGB( v.y ),
+        linear2sRGB( v.z ) );
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/iqa/LICENSE b/3rdparty/bimg/3rdparty/iqa/LICENSE
new file mode 100644
index 0000000..ff67944
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/LICENSE
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
diff --git a/3rdparty/bimg/3rdparty/iqa/README.txt b/3rdparty/bimg/3rdparty/iqa/README.txt
new file mode 100644
index 0000000..2028d46
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/README.txt
@@ -0,0 +1,36 @@
+Doxygen documentation can be found at: http://tdistler.com/iqa
+
+BUILD:
+
+  All build artifacts end up in build/<configuration>, where <configuration> is
+  'debug' or 'release'.
+
+  Windows:
+    - Open iqa.sln, select 'Debug' or 'Release', and build. The output is a 
+      static library 'iqa.lib'.
+    - To run the tests under the debugger, first right-click the 'test' project,
+      select Properties -> Configuration Properties -> Debugging and set
+      'Working Directory' to '$(OutDir)'. Then start the application.
+
+  Linux:
+    - Change directories into the root of the IQA branch you want to build.
+    - Type `make` for a debug build, or `make RELEASE=1` for a release build.
+      The output is a static library 'libiqa.a'.
+    - Type `make test` (or `make test RELEASE=1`) to build the unit tests.
+    - Type `make clean` (or `make clean RELEASE=1`) to delete all build
+      artifacts.
+    - To run the tests, `cd` to the build/<configuration> directory and type
+      `./test`.
+
+
+USE:
+
+  - Include 'iqa.h' in your source file.
+  - Call iqa_* methods.
+  - Link against the IQA library.
+
+
+HELP & SUPPORT:
+
+  Further help can be found at: https://sourceforge.net/projects/iqa/support
+
diff --git a/3rdparty/bimg/3rdparty/iqa/include/convolve.h b/3rdparty/bimg/3rdparty/iqa/include/convolve.h
new file mode 100644
index 0000000..a5e2e71
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/include/convolve.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _CONVOLVE_H_
+#define _CONVOLVE_H_
+
+typedef float (*_iqa_get_pixel)(const float *img, int w, int h, int x, int y, float bnd_const);
+
+/** Out-of-bounds array values are a mirrored reflection of the border values*/
+float KBND_SYMMETRIC(const float *img, int w, int h, int x, int y, float bnd_const);
+/** Out-of-bounds array values are set to the nearest border value */
+float KBND_REPLICATE(const float *img, int w, int h, int x, int y, float bnd_const);
+/** Out-of-bounds array values are set to 'bnd_const' */
+float KBND_CONSTANT(const float *img, int w, int h, int x, int y, float bnd_const);
+
+
+/** Defines a convolution kernel */
+struct _kernel {
+    float *kernel;          /**< Pointer to the kernel values */
+    int w;                  /**< The kernel width */
+    int h;                  /**< The kernel height */
+    int normalized;         /**< 1 if the kernel values add up to 1. 0 otherwise */
+    _iqa_get_pixel bnd_opt; /**< Defines how out-of-bounds image values are handled */
+    float bnd_const;        /**< If 'bnd_opt' is KBND_CONSTANT, this specifies the out-of-bounds value */
+};
+
+/**
+ * @brief Applies the specified kernel to the image.
+ * The kernel will be applied to all areas where it fits completely within
+ * the image. The resulting image will be smaller by half the kernel width 
+ * and height (w - kw/2 and h - kh/2).
+ *
+ * @param img Image to modify
+ * @param w Image width
+ * @param h Image height
+ * @param k The kernel to apply
+ * @param result Buffer to hold the resulting image ((w-kw)*(h-kh), where kw
+ *               and kh are the kernel width and height). If 0, the result
+ *               will be written to the original image buffer.
+ * @param rw Optional. The width of the resulting image will be stored here.
+ * @param rh Optional. The height of the resulting image will be stored here.
+ */
+void _iqa_convolve(float *img, int w, int h, const struct _kernel *k, float *result, int *rw, int *rh);
+
+/**
+ * The same as _iqa_convolve() except the kernel is applied to the entire image.
+ * In other words, the kernel is applied to all areas where the top-left corner
+ * of the kernel is in the image. Out-of-bounds pixel values (off the right and
+ * bottom edges) are chosen based on the 'bnd_opt' and 'bnd_const' members of
+ * the kernel structure. The resulting array is the same size as the input
+ * image.
+ *
+ * @param img Image to modify
+ * @param w Image width
+ * @param h Image height
+ * @param k The kernel to apply
+ * @param result Buffer to hold the resulting image (w*h, the same size as the
+ *               input image). If 0, the result will be written to the
+ *               original image buffer.
+ * @return 0 if successful. Non-zero otherwise.
+ */
+int _iqa_img_filter(float *img, int w, int h, const struct _kernel *k, float *result);
+
+/**
+ * Returns the filtered version of the specified pixel. If no kernel is given,
+ * the raw pixel value is returned.
+ * 
+ * @param img Source image
+ * @param w Image width
+ * @param h Image height
+ * @param x The x location of the pixel to filter
+ * @param y The y location of the pixel to filter
+ * @param k Optional. The convolution kernel to apply to the pixel.
+ * @param kscale The scale of the kernel (for normalization). 1 for normalized
+ *               kernels. Required if 'k' is not null.
+ * @return The filtered pixel value.
+ */
+float _iqa_filter_pixel(const float *img, int w, int h, int x, int y, const struct _kernel *k, const float kscale);
+
+
+#endif /*_CONVOLVE_H_*/
diff --git a/3rdparty/bimg/3rdparty/iqa/include/decimate.h b/3rdparty/bimg/3rdparty/iqa/include/decimate.h
new file mode 100644
index 0000000..40f1a8c
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/include/decimate.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _DECIMATE_H_
+#define _DECIMATE_H_
+
+#include "convolve.h"
+
+/**
+ * @brief Downsamples (decimates) an image.
+ *
+ * @param img Image to modify
+ * @param w Image width
+ * @param h Image height
+ * @param factor Decimation factor
+ * @param k The kernel to apply (e.g. low-pass filter). Can be 0.
+ * @param result Buffer to hold the resulting image (w/factor*h/factor). If 0,
+ *               the result will be written to the original image buffer.
+ * @param rw Optional. The width of the resulting image will be stored here.
+ * @param rh Optional. The height of the resulting image will be stored here.
+ * @return 0 on success.
+ */
+int _iqa_decimate(float *img, int w, int h, int factor, const struct _kernel *k, float *result, int *rw, int *rh);
+
+#endif /*_DECIMATE_H_*/
diff --git a/3rdparty/bimg/3rdparty/iqa/include/iqa.h b/3rdparty/bimg/3rdparty/iqa/include/iqa.h
new file mode 100644
index 0000000..408675e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/include/iqa.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _IQA_H_
+#define _IQA_H_
+
+#include "iqa_os.h"
+
+/**
+ * Allows fine-grain control of the SSIM algorithm.
+ */
+struct iqa_ssim_args {
+    float alpha;    /**< luminance exponent */
+    float beta;     /**< contrast exponent */
+    float gamma;    /**< structure exponent */
+    int L;          /**< dynamic range (2^8 - 1)*/
+    float K1;       /**< stabilization constant 1 */
+    float K2;       /**< stabilization constant 2 */
+    int f;          /**< scale factor. 0=default scaling, 1=no scaling */
+};
+
+/**
+ * Allows fine-grain control of the MS-SSIM algorithm.
+ */
+struct iqa_ms_ssim_args {
+    int wang;             /**< 1=original algorithm by Wang, et al. 0=MS-SSIM* by Rouse/Hemami (default). */
+    int gaussian;         /**< 1=11x11 Gaussian window (default). 0=8x8 linear window. */
+    int scales;           /**< Number of scaled images to use. Default is 5. */
+    const float *alphas;  /**< Pointer to array of alpha values for each scale. Required if 'scales' isn't 5. */
+    const float *betas;   /**< Pointer to array of beta values for each scale. Required if 'scales' isn't 5. */
+    const float *gammas;  /**< Pointer to array of gamma values for each scale. Required if 'scales' isn't 5. */
+};
+
+/**
+ * Calculates the Mean Squared Error between 2 equal-sized 8-bit images.
+ * @note The images must have the same width, height, and stride.
+ * @param ref Original reference image
+ * @param cmp Distorted image
+ * @param w Width of the images
+ * @param h Height of the images
+ * @param stride The length (in bytes) of each horizontal line in the image.
+ *               This may be different from the image width.
+ * @return The MSE.
+ */
+float iqa_mse(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride);
+
+/**
+ * Calculates the Peak Signal-to-Noise-Ratio between 2 equal-sized 8-bit
+ * images.
+ * @note The images must have the same width, height, and stride.
+ * @param ref Original reference image
+ * @param cmp Distorted image
+ * @param w Width of the images
+ * @param h Height of the images
+ * @param stride The length (in bytes) of each horizontal line in the image.
+ *               This may be different from the image width.
+ * @return The PSNR.
+ */
+float iqa_psnr(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride);
+
+/**
+ * Calculates the Structural SIMilarity between 2 equal-sized 8-bit images.
+ *
+ * See https://ece.uwaterloo.ca/~z70wang/publications/ssim.html
+ * @note The images must have the same width, height, and stride.
+ * @param ref Original reference image
+ * @param cmp Distorted image
+ * @param w Width of the images
+ * @param h Height of the images
+ * @param stride The length (in bytes) of each horizontal line in the image.
+ *               This may be different from the image width.
+ * @param gaussian 0 = 8x8 square window, 1 = 11x11 circular-symmetric Gaussian
+ * weighting.
+ * @param args Optional SSIM arguments for fine control of the algorithm. 0 for
+ * defaults. Defaults are a=b=g=1.0, L=255, K1=0.01, K2=0.03
+ * @return The mean SSIM over the entire image (MSSIM), or INFINITY if error.
+ */
+float iqa_ssim(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride, 
+    int gaussian, const struct iqa_ssim_args *args);
+
+/**
+ * Calculates the Multi-Scale Structural SIMilarity between 2 equal-sized 8-bit
+ * images. The default algorithm is MS-SSIM* proposed by Rouse/Hemami 2008.
+ *
+ * See https://ece.uwaterloo.ca/~z70wang/publications/msssim.pdf and
+ * http://foulard.ece.cornell.edu/publications/dmr_hvei2008_paper.pdf
+ *
+ * @note 1. The images must have the same width, height, and stride.
+ * @note 2. The minimum image width or height is 2^(scales-1) * filter, where 'filter' is 11
+ * if a Gaussian window is being used, or 9 otherwise.
+ * @param ref Original reference image
+ * @param cmp Distorted image
+ * @param w Width of the images.
+ * @param h Height of the images.
+ * @param stride The length (in bytes) of each horizontal line in the image.
+ *               This may be different from the image width.
+ * @param args Optional MS-SSIM arguments for fine control of the algorithm. 0
+ * for defaults. Defaults are wang=0, scales=5, gaussian=1.
+ * @return The mean MS-SSIM over the entire image, or INFINITY if error.
+ */
+float iqa_ms_ssim(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride, 
+    const struct iqa_ms_ssim_args *args);
+
+#endif /*_IQA_H_*/
diff --git a/3rdparty/bimg/3rdparty/iqa/include/iqa_os.h b/3rdparty/bimg/3rdparty/iqa/include/iqa_os.h
new file mode 100644
index 0000000..52e0be0
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/include/iqa_os.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _OS_H_
+#define _OS_H_
+
+/* Microsoft tends to implement features early, but they have a high legacy
+ * cost because they won't break existing implementations. As such, certain
+ * features we take for granted on other platforms (like C99) aren't fully
+ * implemented. This file is meant to rectify that.
+ */
+
+#ifdef WIN32
+
+#include <windows.h>
+#include <math.h>
+
+#define IQA_INLINE __inline
+
+#ifndef INFINITY
+    #define INFINITY (float)HUGE_VAL /**< Defined in C99 (Windows is C89) */
+#endif /*INFINITY*/
+
+#ifndef NAN
+    static const unsigned long __nan[2] = {0xffffffff, 0x7fffffff};
+    #define NAN (*(const float *) __nan) /**< Defined in C99 (Windows is C89) */
+#endif
+
+#define IQA_EXPORT __declspec(dllexport)
+
+#else /* !Windows */
+
+#define IQA_INLINE inline
+#define IQA_EXPORT
+
+#endif
+
+#endif /* _OS_H_ */
diff --git a/3rdparty/bimg/3rdparty/iqa/include/math_utils.h b/3rdparty/bimg/3rdparty/iqa/include/math_utils.h
new file mode 100644
index 0000000..674b354
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/include/math_utils.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _MATH_UTILS_H_
+#define _MATH_UTILS_H_
+
+#include "iqa_os.h"
+#include <math.h>
+
+/**
+ * Rounds a float to the nearest integer.
+ */
+IQA_EXPORT int _round(float a);
+
+IQA_EXPORT int _max(int x, int y); /**< Returns the larger of x and y. */
+
+IQA_EXPORT int _min(int x, int y); /**< Returns the smaller of x and y. */
+
+
+/** 
+ * Compares 2 floats to the specified digit of precision.
+ * @return 0 if equal, 1 otherwise.
+ */
+IQA_EXPORT int _cmp_float(float a, float b, int digits);
+
+
+/** 
+ * Compares 2 matrices with the specified precision. 'b' is assumed to be the
+ * same size as 'a' or smaller.
+ * @return 0 if equal, 1 otherwise
+ */
+IQA_EXPORT int _matrix_cmp(const float *a, const float *b, int w, int h, int digits);
+
+#endif /*_MATH_UTILS_H_*/
diff --git a/3rdparty/bimg/3rdparty/iqa/include/ssim.h b/3rdparty/bimg/3rdparty/iqa/include/ssim.h
new file mode 100644
index 0000000..5653afe
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/include/ssim.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SSIM_H_
+#define _SSIM_H_
+
+#include "convolve.h"
+
+/*
+ * Circular-symmetric Gaussian weighting.
+ * h(x,y) = hg(x,y)/SUM(SUM(hg)) , for normalization to 1.0
+ * hg(x,y) = e^( -0.5*( (x^2+y^2)/sigma^2 ) ) , where sigma was 1.5
+ */
+#define GAUSSIAN_LEN 11
+static const float g_gaussian_window[GAUSSIAN_LEN][GAUSSIAN_LEN] = {
+    {0.000001f, 0.000008f, 0.000037f, 0.000112f, 0.000219f, 0.000274f, 0.000219f, 0.000112f, 0.000037f, 0.000008f, 0.000001f},
+    {0.000008f, 0.000058f, 0.000274f, 0.000831f, 0.001619f, 0.002021f, 0.001619f, 0.000831f, 0.000274f, 0.000058f, 0.000008f},
+    {0.000037f, 0.000274f, 0.001296f, 0.003937f, 0.007668f, 0.009577f, 0.007668f, 0.003937f, 0.001296f, 0.000274f, 0.000037f},
+    {0.000112f, 0.000831f, 0.003937f, 0.011960f, 0.023294f, 0.029091f, 0.023294f, 0.011960f, 0.003937f, 0.000831f, 0.000112f},
+    {0.000219f, 0.001619f, 0.007668f, 0.023294f, 0.045371f, 0.056662f, 0.045371f, 0.023294f, 0.007668f, 0.001619f, 0.000219f},
+    {0.000274f, 0.002021f, 0.009577f, 0.029091f, 0.056662f, 0.070762f, 0.056662f, 0.029091f, 0.009577f, 0.002021f, 0.000274f},
+    {0.000219f, 0.001619f, 0.007668f, 0.023294f, 0.045371f, 0.056662f, 0.045371f, 0.023294f, 0.007668f, 0.001619f, 0.000219f},
+    {0.000112f, 0.000831f, 0.003937f, 0.011960f, 0.023294f, 0.029091f, 0.023294f, 0.011960f, 0.003937f, 0.000831f, 0.000112f},
+    {0.000037f, 0.000274f, 0.001296f, 0.003937f, 0.007668f, 0.009577f, 0.007668f, 0.003937f, 0.001296f, 0.000274f, 0.000037f},
+    {0.000008f, 0.000058f, 0.000274f, 0.000831f, 0.001619f, 0.002021f, 0.001619f, 0.000831f, 0.000274f, 0.000058f, 0.000008f},
+    {0.000001f, 0.000008f, 0.000037f, 0.000112f, 0.000219f, 0.000274f, 0.000219f, 0.000112f, 0.000037f, 0.000008f, 0.000001f},
+};
+
+/*
+ * Equal weight square window.
+ * Each pixel is equally weighted (1/64) so that SUM(x) = 1.0
+ */
+#define SQUARE_LEN 8
+static const float g_square_window[SQUARE_LEN][SQUARE_LEN] = {
+    {0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
+    {0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
+    {0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
+    {0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
+    {0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
+    {0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
+    {0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
+    {0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f, 0.015625f},
+};
+
+/* Holds intermediate SSIM values for map-reduce operation. */
+struct _ssim_int {
+    double l;   /* Luminance term */
+    double c;   /* Contrast term */
+    double s;   /* Structure term */
+};
+
+/* Defines the pointers to the map-reduce functions. */
+typedef int (*_map)(const struct _ssim_int *, void *);  /* (per-pixel values, context) */
+typedef float (*_reduce)(int, int, void *);             /* (w, h, context) -> final value */
+
+/* Arguments for map-reduce. The 'context' is user-defined. */
+struct _map_reduce {
+    _map map;       /* Called for every pixel */
+    _reduce reduce; /* Called once at the end */
+    void *context;  /* Caller-defined state handed to the callbacks */
+};
+
+/**
+ * Private method that calculates the SSIM value on a pre-processed image.
+ *
+ * The input images must have stride==width. This method does not scale.
+ *
+ * @note Image buffers are modified.
+ *
+ * Map-reduce is used for doing the final SSIM calculation. The map function is
+ * called for every pixel, and the reduce is called at the end. The context is
+ * caller-defined and *not* modified by this method.
+ *
+ * @param ref Original reference image
+ * @param cmp Distorted image
+ * @param w Width of the images
+ * @param h Height of the images
+ * @param k The kernel used as the window function
+ * @param mr Optional map-reduce functions to use to calculate SSIM. Required
+ *           if 'args' is not null. Ignored if 'args' is null.
+ * @param args Optional SSIM arguments for fine control of the algorithm. 0 for defaults.
+ *             Defaults are a=b=g=1.0, L=255, K1=0.01, K2=0.03
+ * @return The mean SSIM over the entire image (MSSIM), or INFINITY if error.
+ */
+float _iqa_ssim(float *ref, float *cmp, int w, int h, const struct _kernel *k, const struct _map_reduce *mr, const struct iqa_ssim_args *args);
+
+#endif /* _SSIM_H_ */
diff --git a/3rdparty/bimg/3rdparty/iqa/source/convolve.c b/3rdparty/bimg/3rdparty/iqa/source/convolve.c
new file mode 100644
index 0000000..c915907
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/source/convolve.c
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "convolve.h"
+#include <stdlib.h>
+
+float KBND_SYMMETRIC(const float *img, int w, int h, int x, int y, float bnd_const)
+{
+    /* Mirror out-of-range coordinates about the nearest image edge, so that
+     * -1 maps to 0, -2 to 1, w to w-1, w+1 to w-2, and likewise for rows. */
+    const int col = (x < 0) ? (-1 - x) : ((x >= w) ? (w - (x - w)) - 1 : x);
+    const int row = (y < 0) ? (-1 - y) : ((y >= h) ? (h - (y - h)) - 1 : y);
+    (void)bnd_const; /* unused by this boundary mode */
+    return img[row*w + col];
+}
+
+float KBND_REPLICATE(const float *img, int w, int h, int x, int y, float bnd_const)
+{
+    int col = (x < 0) ? 0 : x; /* clamp to the nearest border pixel */
+    int row = (y < 0) ? 0 : y;
+    (void)bnd_const;           /* unused by this boundary mode */
+    if (col >= w) col = w - 1;
+    if (row >= h) row = h - 1;
+    return img[row*w + col];
+}
+
+float KBND_CONSTANT(const float *img, int w, int h, int x, int y, float bnd_const)
+{
+    /* Any coordinate outside [0,w) x [0,h) yields the boundary constant.
+     * Negative coordinates count as out of bounds too; clamping them to 0
+     * would replicate the top/left edge instead of returning bnd_const. */
+    if (x<0 || y<0 || x>=w || y>=h) return bnd_const;
+    return img[y*w + x];
+}
+
+static float _calc_scale(const struct _kernel *k)
+{
+    /* Returns the factor that makes the kernel weights sum to 1. */
+    double total = 0.0;
+    int idx, count;
+
+    /* Kernels flagged as normalized need no correction. */
+    if (k->normalized)
+        return 1.0f;
+
+    count = k->w * k->h;
+    for (idx = 0; idx < count; ++idx)
+        total += k->kernel[idx];
+    /* A zero-sum kernel cannot be normalized; leave it unscaled. */
+    return (total != 0.0) ? (float)(1.0 / total) : 1.0f;
+}
+
+void _iqa_convolve(float *img, int w, int h, const struct _kernel *k, float *result, int *rw, int *rh)
+{   /* Convolves 'img' with 'k'; output size is reported through 'rw'/'rh' when non-NULL. */
+    int x,y,kx,ky,u,v;
+    int uc = k->w/2;                /* Kernel center column */
+    int vc = k->h/2;                /* Kernel center row */
+    int kw_even = (k->w&1)?0:1;     /* 1 when the kernel width is even */
+    int kh_even = (k->h&1)?0:1;     /* 1 when the kernel height is even */
+    int dst_w = w - k->w + 1;       /* Output shrinks by (kernel size - 1) */
+    int dst_h = h - k->h + 1;
+    int img_offset,k_offset;
+    double sum;
+    float scale, *dst=result;
+
+    if (!dst)
+        dst = img; /* Convolve in-place */
+
+    /* Kernel is applied to all positions where the kernel is fully contained
+     * in the image */
+    scale = _calc_scale(k);
+    for (y=0; y < dst_h; ++y) {
+        for (x=0; x < dst_w; ++x) {
+            sum = 0.0;
+            k_offset = 0;
+            ky = y+vc;              /* Image coordinates under the kernel center */
+            kx = x+uc;
+            for (v=-vc; v <= vc-kh_even; ++v) {     /* Even kernels stop one row short */
+                img_offset = (ky+v)*w + kx;
+                for (u=-uc; u <= uc-kw_even; ++u, ++k_offset) {
+                    sum += img[img_offset+u] * k->kernel[k_offset];
+                }
+            }
+            dst[y*dst_w + x] = (float)(sum * scale); /* Output rows are packed with stride dst_w, not w */
+        }
+    }
+
+    if (rw) *rw = dst_w;
+    if (rh) *rh = dst_h;
+}
+
+int _iqa_img_filter(float *img, int w, int h, const struct _kernel *k, float *result)
+{
+    /* Filters all w*h pixels with 'k' into 'result' (or back into 'img' if NULL).
+     * Returns 0 on success, 1 if 'k' or its boundary handler is missing, 2 if out of memory. */
+    int x,y,img_offset;
+    float scale, *dst=result;
+
+    if (!k || !k->bnd_opt)
+        return 1;
+
+    if (!dst) {
+        dst = (float*)malloc((size_t)w*(size_t)h*sizeof(float)); /* size_t math: int w*h could overflow */
+        if (!dst)
+            return 2;
+    }
+
+    scale = _calc_scale(k);
+    /* Kernel is applied to all positions where top-left corner is in the image */
+    for (y=0; y < h; ++y) {
+        for (x=0; x < w; ++x) {
+            dst[y*w + x] = _iqa_filter_pixel(img, w, h, x, y, k, scale);
+        }
+    }
+
+    /* If no result buffer given, copy results to image buffer */
+    if (!result) {
+        for (y=0; y<h; ++y) {
+            img_offset = y*w;
+            for (x=0; x<w; ++x, ++img_offset) {
+                img[img_offset] = dst[img_offset];
+            }
+        }
+        free(dst);
+    }
+    return 0;
+}
+
+float _iqa_filter_pixel(const float *img, int w, int h, int x, int y, const struct _kernel *k, const float kscale)
+{
+    /* Weighted sum of the kernel-sized neighborhood around (x,y), times kscale. */
+    int du,dv,half_w,half_h;
+    int last_u,last_v;
+    int near_border,row_base,ki;
+    double acc;
+
+    /* Without a kernel the pixel is passed through untouched. */
+    if (!k)
+        return img[y*w + x];
+
+    half_w = k->w/2;
+    half_h = k->h/2;
+    /* Even-sized kernels reach one tap less to the right/bottom than to the left/top. */
+    last_u = (k->w&1) ? half_w : half_w-1;
+    last_v = (k->h&1) ? half_h : half_h-1;
+
+    /* Within half a kernel of any border the window may leave the image. */
+    near_border = !(x >= half_w && y >= half_h && x < w-half_w && y < h-half_h);
+
+    acc = 0.0;
+    ki = 0;
+    for (dv=-half_h; dv <= last_v; ++dv) {
+        row_base = (y+dv)*w + x;
+        for (du=-half_w; du <= last_u; ++du, ++ki) {
+            if (near_border) {
+                /* Let the boundary handler resolve out-of-range coordinates. */
+                acc += k->bnd_opt(img, w, h, x+du, y+dv, k->bnd_const) * k->kernel[ki];
+            } else {
+                acc += img[row_base+du] * k->kernel[ki];
+            }
+        }
+    }
+
+    return (float)(acc * kscale);
+}
diff --git a/3rdparty/bimg/3rdparty/iqa/source/decimate.c b/3rdparty/bimg/3rdparty/iqa/source/decimate.c
new file mode 100644
index 0000000..91c6a9b
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/source/decimate.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "decimate.h"
+#include <stdlib.h>
+
+int _iqa_decimate(float *img, int w, int h, int factor, const struct _kernel *k, float *result, int *rw, int *rh)
+{
+    /* Keeps every 'factor'-th pixel of each row/column, filtering each with 'k'.
+     * Output goes to 'result' (or over 'img' if NULL); size via 'rw'/'rh'. Returns 0. */
+    int x,y;
+    /* Ceiling division so every sample x*factor stays below w. The form
+     * 'w/factor + (w&1)' matches this only for factor==2 (w=3, factor=3 reads x=3). */
+    int sw = (w + factor - 1)/factor;
+    int sh = (h + factor - 1)/factor;
+    int dst_offset;
+    float *dst = result ? result : img;
+
+    for (y=0; y<sh; ++y) {
+        dst_offset = y*sw;
+        for (x=0; x<sw; ++x,++dst_offset) {
+            dst[dst_offset] = _iqa_filter_pixel(img, w, h, x*factor, y*factor, k, 1.0f);
+        }
+    }
+
+    if (rw) *rw = sw;
+    if (rh) *rh = sh;
+    return 0;
+}
diff --git a/3rdparty/bimg/3rdparty/iqa/source/math_utils.c b/3rdparty/bimg/3rdparty/iqa/source/math_utils.c
new file mode 100644
index 0000000..83f923d
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/source/math_utils.c
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "math_utils.h"
+#include <math.h>
+
+int _round(float a)
+{
+    int sign_a = a > 0.0f ? 1 : -1; /* direction that rounds away from zero */
+    return fabs(a-(int)a) >= 0.5 ? (int)a + sign_a : (int)a; /* fabs: negative values have a negative fraction */
+}
+
+int _max(int x, int y)
+{
+    return (y > x) ? y : x; /* larger of the two arguments */
+}
+
+int _min(int x, int y)
+{
+    return (y < x) ? y : x; /* smaller of the two arguments */
+}
+
+int _cmp_float(float a, float b, int digits)
+{
+    /* Scale by 10^digits, then round half away from zero (fabs keeps that correct for negatives) */
+    int sign_a = a > 0.0f ? 1 : -1;
+    int sign_b = b > 0.0f ? 1 : -1;
+    double scale = pow(10.0, (double)digits);
+    double ax = a * scale;
+    double bx = b * scale;
+    int ai = fabs(ax-(int)ax) >= 0.5 ? (int)ax + sign_a : (int)ax;
+    int bi = fabs(bx-(int)bx) >= 0.5 ? (int)bx + sign_b : (int)bx;
+
+    /* 0 when both round to the same value, 1 otherwise */
+    return ai == bi ? 0 : 1;
+}
+
+int _matrix_cmp(const float *a, const float *b, int w, int h, int digits)
+{
+    /* Element-wise comparison over the first w*h entries of both matrices. */
+    const int count = w*h;
+    int i;
+
+    for (i=0; i<count; ++i) {
+        /* Stop at the first pair that differs at the requested precision. */
+        if (_cmp_float(a[i], b[i], digits))
+            return 1;
+    }
+
+    return 0;
+}
+
diff --git a/3rdparty/bimg/3rdparty/iqa/source/ms_ssim.c b/3rdparty/bimg/3rdparty/iqa/source/ms_ssim.c
new file mode 100644
index 0000000..91812a0
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/source/ms_ssim.c
@@ -0,0 +1,277 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "iqa.h"
+#include "ssim.h"
+#include "decimate.h"
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* Default number of scales */
+#define SCALES  5
+
+/* Low-pass filter for down-sampling (9/7 biorthogonal wavelet filter) */
+#define LPF_LEN 9
+static const float g_lpf[LPF_LEN][LPF_LEN] = {
+   { 0.000714f,-0.000450f,-0.002090f, 0.007132f, 0.016114f, 0.007132f,-0.002090f,-0.000450f, 0.000714f},
+   {-0.000450f, 0.000283f, 0.001316f,-0.004490f,-0.010146f,-0.004490f, 0.001316f, 0.000283f,-0.000450f},
+   {-0.002090f, 0.001316f, 0.006115f,-0.020867f,-0.047149f,-0.020867f, 0.006115f, 0.001316f,-0.002090f},
+   { 0.007132f,-0.004490f,-0.020867f, 0.071207f, 0.160885f, 0.071207f,-0.020867f,-0.004490f, 0.007132f},
+   { 0.016114f,-0.010146f,-0.047149f, 0.160885f, 0.363505f, 0.160885f,-0.047149f,-0.010146f, 0.016114f},
+   { 0.007132f,-0.004490f,-0.020867f, 0.071207f, 0.160885f, 0.071207f,-0.020867f,-0.004490f, 0.007132f},
+   {-0.002090f, 0.001316f, 0.006115f,-0.020867f,-0.047149f,-0.020867f, 0.006115f, 0.001316f,-0.002090f},
+   {-0.000450f, 0.000283f, 0.001316f,-0.004490f,-0.010146f,-0.004490f, 0.001316f, 0.000283f,-0.000450f},
+   { 0.000714f,-0.000450f,-0.002090f, 0.007132f, 0.016114f, 0.007132f,-0.002090f,-0.000450f, 0.000714f},
+};
+
+/* Alpha, beta, and gamma values for each scale */
+static float g_alphas[] = { 0.0000f, 0.0000f, 0.0000f, 0.0000f, 0.1333f };
+static float g_betas[]  = { 0.0448f, 0.2856f, 0.3001f, 0.2363f, 0.1333f };
+static float g_gammas[] = { 0.0448f, 0.2856f, 0.3001f, 0.2363f, 0.1333f };
+
+
+struct _context {
+    double l;  /* Luminance */
+    double c;  /* Contrast */
+    double s;  /* Structure */
+    float alpha;  /* Exponent applied to l in _ms_ssim_reduce */
+    float beta;   /* Exponent applied to c in _ms_ssim_reduce */
+    float gamma;  /* Exponent applied to s in _ms_ssim_reduce */
+};
+
+/* Map step: called for each pixel to accumulate its SSIM components */
+int _ms_ssim_map(const struct _ssim_int *si, void *ctx)
+{
+    struct _context *acc = (struct _context*)ctx;
+    acc->s += si->s;
+    acc->c += si->c;
+    acc->l += si->l;
+    return 0; /* always reports success */
+}
+
+/* Reduce step: averages the accumulated components over w*h and applies the exponents */
+float _ms_ssim_reduce(int w, int h, void *ctx)
+{
+    struct _context *acc = (struct _context*)ctx;
+    const double pixels = (double)(w*h);
+    acc->l = pow(acc->l / pixels, (double)acc->alpha);
+    acc->c = pow(acc->c / pixels, (double)acc->beta);
+    acc->s = pow(fabs(acc->s / pixels), (double)acc->gamma); /* fabs keeps pow() defined for a negative mean */
+    return (float)(acc->l * acc->c * acc->s);
+}
+
+/* Releases the first 'scales' buffers; the pointer array itself is not freed */
+void _free_buffers(float **buf, int scales)
+{
+    int left;
+    for (left=scales; left>0; --left)
+        free(buf[left-1]);
+}
+
+/* Allocates one buffer per scale, halving each dimension (rounding up) between scales. Returns 0, or 1 after freeing everything on failure */
+int _alloc_buffers(float **buf, int w, int h, int scales)
+{
+    int idx;
+    int cur_w = w;
+    int cur_h = h;
+    for (idx=0; idx<scales; ++idx) {
+        buf[idx] = (float*)malloc((size_t)cur_w*(size_t)cur_h*sizeof(float)); /* size_t math: int w*h could overflow */
+        if (!buf[idx]) {
+            _free_buffers(buf, idx); /* only the idx buffers allocated so far */
+            return 1;
+        }
+        cur_w = cur_w/2 + (cur_w&1);
+        cur_h = cur_h/2 + (cur_h&1);
+    }
+    return 0;
+}
+
+/*
+ * MS_SSIM(X,Y) = Lm(x,y)^aM * MULT[j=1->M]( Cj(x,y)^bj  *  Sj(x,y)^gj )
+ * where,
+ *  L = mean
+ *  C = variance
+ *  S = cross-correlation
+ *
+ *  b1=g1=0.0448, b2=g2=0.2856, b3=g3=0.3001, b4=g4=0.2363, a5=b5=g5=0.1333
+ *
+ * ref, cmp: 8-bit single-channel images whose rows start 'stride' bytes apart.
+ * w, h:     Image size in pixels.
+ * args:     Optional overrides (wang, gaussian, scales, alphas, betas,
+ *           gammas); NULL selects Rouse/Hemami MS-SSIM*, Gaussian window,
+ *           5 scales and the exponents listed above.
+ * Returns the MS-SSIM value, or INFINITY if the image is too small for the
+ * requested scales, memory runs out, or a scale fails to compute.
+ */
+float iqa_ms_ssim(const unsigned char *ref, const unsigned char *cmp, int w, int h, 
+    int stride, const struct iqa_ms_ssim_args *args)
+{
+    int wang=0;
+    int scales=SCALES;
+    int gauss=1;
+    const float *alphas=g_alphas, *betas=g_betas, *gammas=g_gammas;
+    int idx,x,y,cur_w,cur_h;
+    int offset,src_offset;
+    float **ref_imgs, **cmp_imgs; /* Array of pointers to scaled images */
+    float msssim, scale_ssim;
+    struct _kernel lpf, window;
+    struct iqa_ssim_args s_args;
+    struct _map_reduce mr;
+    struct _context ms_ctx;
+
+    if (args) {
+        wang   = args->wang;
+        gauss  = args->gaussian;
+        scales = args->scales;
+        if (args->alphas)
+            alphas = args->alphas;
+        if (args->betas)
+            betas  = args->betas;
+        if (args->gammas)
+            gammas = args->gammas;
+    }
+
+    /* Make sure we won't scale below 1x1 */
+    cur_w = w;
+    cur_h = h;
+    for (idx=0; idx<scales; ++idx) {
+        if ( gauss ? cur_w<GAUSSIAN_LEN || cur_h<GAUSSIAN_LEN : cur_w<LPF_LEN || cur_h<LPF_LEN )
+            return INFINITY;
+        cur_w /= 2;
+        cur_h /= 2;
+    }
+
+    window.kernel = (float*)g_square_window;
+    window.w = window.h = SQUARE_LEN;
+    window.normalized = 1;
+    window.bnd_opt = KBND_SYMMETRIC;
+    window.bnd_const = 0.0f; /* Ignored by KBND_SYMMETRIC, but never hand it over uninitialized */
+    if (gauss) {
+        window.kernel = (float*)g_gaussian_window;
+        window.w = window.h = GAUSSIAN_LEN;
+    }
+
+    mr.map     = _ms_ssim_map;
+    mr.reduce  = _ms_ssim_reduce;
+
+    /* Allocate the scaled image buffers */
+    ref_imgs = (float**)malloc(scales*sizeof(float*));
+    cmp_imgs = (float**)malloc(scales*sizeof(float*));
+    if (!ref_imgs || !cmp_imgs) {
+        if (ref_imgs) free(ref_imgs);
+        if (cmp_imgs) free(cmp_imgs);
+        return INFINITY;
+    }
+    if (_alloc_buffers(ref_imgs, w, h, scales)) {
+        free(ref_imgs);
+        free(cmp_imgs);
+        return INFINITY;
+    }
+    if (_alloc_buffers(cmp_imgs, w, h, scales)) {
+        _free_buffers(ref_imgs, scales);
+        free(ref_imgs);
+        free(cmp_imgs);
+        return INFINITY;
+    }
+
+    /* Copy original images into first scale buffer, forcing stride = width. */
+    for (y=0; y<h; ++y) {
+        src_offset = y*stride;
+        offset = y*w;
+        for (x=0; x<w; ++x, ++offset, ++src_offset) {
+            ref_imgs[0][offset] = (float)ref[src_offset];
+            cmp_imgs[0][offset] = (float)cmp[src_offset];
+        }
+    }
+
+    /* Create scaled versions of the images */
+    cur_w=w;
+    cur_h=h;
+    lpf.kernel = (float*)g_lpf;
+    lpf.w = lpf.h = LPF_LEN;
+    lpf.normalized = 1;
+    lpf.bnd_opt = KBND_SYMMETRIC;
+    lpf.bnd_const = 0.0f; /* Ignored by KBND_SYMMETRIC, but never hand it over uninitialized */
+    for (idx=1; idx<scales; ++idx) {
+        if (_iqa_decimate(ref_imgs[idx-1], cur_w, cur_h, 2, &lpf, ref_imgs[idx], 0, 0) ||
+            _iqa_decimate(cmp_imgs[idx-1], cur_w, cur_h, 2, &lpf, cmp_imgs[idx], &cur_w, &cur_h))
+        {
+            _free_buffers(ref_imgs, scales);
+            _free_buffers(cmp_imgs, scales);
+            free(ref_imgs);
+            free(cmp_imgs);
+            return INFINITY;
+        }
+    }
+
+    cur_w=w;
+    cur_h=h;
+    msssim = 1.0;
+    for (idx=0; idx<scales; ++idx) {
+
+        ms_ctx.l = 0;
+        ms_ctx.c = 0;
+        ms_ctx.s = 0;
+        ms_ctx.alpha = alphas[idx];
+        ms_ctx.beta  = betas[idx];
+        ms_ctx.gamma = gammas[idx];
+
+        /* MS-SSIM (Wang) keeps the usual stabilization constants;
+         * MS-SSIM* (Rouse/Hemami) forces them to 0. */
+        s_args.alpha = 1.0f;
+        s_args.beta  = 1.0f;
+        s_args.gamma = 1.0f;
+        s_args.K1 = wang ? 0.01f : 0.0f;
+        s_args.K2 = wang ? 0.03f : 0.0f;
+        s_args.L  = 255;
+        s_args.f  = 1; /* Don't resize */
+        mr.context = &ms_ctx;
+        scale_ssim = _iqa_ssim(ref_imgs[idx], cmp_imgs[idx], cur_w, cur_h, &window, &mr, &s_args);
+        if (scale_ssim == INFINITY) {
+            /* Report the failure as-is: multiplying it in could give NaN (0*inf) */
+            msssim = INFINITY;
+            break;
+        }
+        msssim *= scale_ssim;
+        cur_w = cur_w/2 + (cur_w&1);
+        cur_h = cur_h/2 + (cur_h&1);
+    }
+
+    _free_buffers(ref_imgs, scales);
+    _free_buffers(cmp_imgs, scales);
+    free(ref_imgs);
+    free(cmp_imgs);
+
+    return msssim;
+}
diff --git a/3rdparty/bimg/3rdparty/iqa/source/mse.c b/3rdparty/bimg/3rdparty/iqa/source/mse.c
new file mode 100644
index 0000000..da0ce77
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/source/mse.c
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "iqa.h"
+
+/* MSE(a,b) = 1/N * SUM((a-b)^2) */
+float iqa_mse(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride)
+{
+    unsigned long long acc=0;   /* running sum of squared differences */
+    int row,col,diff,pos;
+    for (row=0; row<h; ++row) {
+        /* Rows start 'stride' elements apart; only the first 'w' of each are compared */
+        pos = row*stride;
+        for (col=0; col<w; ++col) {
+            diff = ref[pos+col] - cmp[pos+col];
+            acc += diff * diff;
+        }
+    }
+    return (float)( (double)acc / (double)(w*h) );
+}
diff --git a/3rdparty/bimg/3rdparty/iqa/source/psnr.c b/3rdparty/bimg/3rdparty/iqa/source/psnr.c
new file mode 100644
index 0000000..4567071
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/source/psnr.c
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "iqa.h"
+#include <math.h>
+
+/* Peak signal-to-noise ratio in dB for 8-bit samples: 10*log10(255^2 / MSE(ref,cmp)). */
+float iqa_psnr(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride)
+{
+    const float mse = iqa_mse(ref, cmp, w, h, stride);
+    return (float)( 10.0 * log10( (255 * 255) / mse ) );
+}
diff --git a/3rdparty/bimg/3rdparty/iqa/source/ssim.c b/3rdparty/bimg/3rdparty/iqa/source/ssim.c
new file mode 100644
index 0000000..d1acccb
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/iqa/source/ssim.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2011, Tom Distler (http://tdistler.com)
+ * All rights reserved.
+ *
+ * The BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * - Redistributions of source code must retain the above copyright notice, 
+ *   this list of conditions and the following disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above copyright notice,
+ *   this list of conditions and the following disclaimer in the documentation
+ *   and/or other materials provided with the distribution.
+ *
+ * - Neither the name of the tdistler.com nor the names of its contributors may
+ *   be used to endorse or promote products derived from this software without
+ *   specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE 
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "iqa.h"
+#include "convolve.h"
+#include "decimate.h"
+#include "math_utils.h"
+#include "ssim.h"
+#include <stdlib.h>
+#include <math.h>
+
+
+/* Forward declarations. */
+IQA_INLINE static double _calc_luminance(float, float, float, float);
+IQA_INLINE static double _calc_contrast(double, float, float, float, float);
+IQA_INLINE static double _calc_structure(float, double, float, float, float, float);
+static int _ssim_map(const struct _ssim_int *, void *);
+static float _ssim_reduce(int, int, void *);
+
+/*
+ * Mean structural similarity (MSSIM) of two 8-bit images.
+ *
+ * SSIM(x,y) = ((2*ux*uy + C1)*(2*sxy + C2)) / ((ux^2 + uy^2 + C1)*(sx^2 + sy^2 + C2))
+ * where ux = SUM(w*x), sx = (SUM(w*(x-ux)^2))^0.5, sxy = SUM(w*(x-ux)*(y-uy)).
+ *
+ * Returns MSSIM(X,Y) = 1/M * SUM(SSIM(x,y)), or INFINITY if an allocation or the
+ * decimation step fails.
+ */
+float iqa_ssim(const unsigned char *ref, const unsigned char *cmp, int w, int h, int stride,
+    int gaussian, const struct iqa_ssim_args *args)
+{
+    struct _map_reduce mr;
+    struct _kernel window;
+    struct _kernel low_pass;
+    double ssim_sum=0.0;
+    float *ref_f,*cmp_f;
+    float result;
+    int scale;
+    int row,col,src,dst,i;
+
+    /* Window kernel: Gaussian when 'gaussian' is non-zero, square otherwise */
+    window.kernel = gaussian ? (float*)g_gaussian_window : (float*)g_square_window;
+    window.w = window.h = gaussian ? GAUSSIAN_LEN : SQUARE_LEN;
+    window.normalized = 1;
+    window.bnd_opt = KBND_SYMMETRIC;
+
+    /* Default scale comes from the smaller dimension; a non-zero args->f overrides it */
+    scale = _max( 1, _round( (float)_min(w,h) / 256.0f ) );
+    if (args) {
+        if (args->f)
+            scale = args->f;
+        /* with custom args, _iqa_ssim accumulates through these callbacks into ssim_sum */
+        mr.map     = _ssim_map;
+        mr.reduce  = _ssim_reduce;
+        mr.context = (void*)&ssim_sum;
+    }
+
+    /* Convert image values to floats, packing rows tightly (stride becomes w) */
+    ref_f = (float*)malloc(w*h*sizeof(float));
+    cmp_f = (float*)malloc(w*h*sizeof(float));
+    if (!ref_f || !cmp_f) {
+        free(ref_f); /* free(NULL) is a no-op */
+        free(cmp_f);
+        return INFINITY;
+    }
+    for (row=0; row<h; ++row) {
+        src = row*stride;
+        dst = row*w;
+        for (col=0; col<w; ++col) {
+            ref_f[dst+col] = (float)ref[src+col];
+            cmp_f[dst+col] = (float)cmp[src+col];
+        }
+    }
+
+    /* Scale the images down if required */
+    if (scale > 1) {
+        low_pass.kernel = (float*)malloc(scale*scale*sizeof(float));
+        if (!low_pass.kernel) {
+            free(ref_f);
+            free(cmp_f);
+            return INFINITY;
+        }
+        low_pass.w = low_pass.h = scale;
+        low_pass.normalized = 0;
+        low_pass.bnd_opt = KBND_SYMMETRIC;
+        /* simple low-pass filter: every coefficient is 1/(scale*scale) */
+        for (i=0; i<scale*scale; ++i)
+            low_pass.kernel[i] = 1.0f/(scale*scale);
+
+        /* Resample both images; only the second call writes the new size back to w/h */
+        if (_iqa_decimate(ref_f, w, h, scale, &low_pass, 0, 0, 0) ||
+            _iqa_decimate(cmp_f, w, h, scale, &low_pass, 0, &w, &h)) {
+            free(ref_f);
+            free(cmp_f);
+            free(low_pass.kernel);
+            return INFINITY;
+        }
+        free(low_pass.kernel);
+    }
+
+    /* Run the core algorithm on the packed (and possibly reduced) float images */
+    result = _iqa_ssim(ref_f, cmp_f, w, h, &window, &mr, args);
+
+    free(ref_f);
+    free(cmp_f);
+
+    return result;
+}
+
+
+/*
+ * _iqa_ssim: mean SSIM of two w x h float images under window kernel 'k'.
+ * args == NULL selects the default constants and the closed-form expression;
+ * otherwise the parameters come from 'args' and every pixel's l/c/s terms are
+ * passed to mr->map(), with mr->reduce() producing the final value.
+ * Returns INFINITY when args is set but mr is NULL, when an allocation fails,
+ * or when mr->map() returns non-zero; the scratch buffers are freed either way.
+ */
+float _iqa_ssim(float *ref, float *cmp, int w, int h, const struct _kernel *k, const struct _map_reduce *mr, const struct iqa_ssim_args *args)
+{
+    float alpha=1.0f, beta=1.0f, gamma=1.0f;
+    int L=255;
+    float K1=0.01f, K2=0.03f;
+    float C1,C2,C3;
+    int x,y,offset;
+    float *ref_mu,*cmp_mu,*ref_sigma_sqd,*cmp_sigma_sqd,*sigma_both;
+    double ssim_sum=0.0, numerator, denominator;
+    double luminance_comp, contrast_comp, structure_comp, sigma_root;
+    struct _ssim_int sint;
+    float result=INFINITY; /* stays INFINITY on every failure path */
+
+    /* Initialize algorithm parameters */
+    if (args) {
+        if (!mr)
+            return INFINITY;
+        alpha = args->alpha;
+        beta  = args->beta;
+        gamma = args->gamma;
+        L     = args->L;
+        K1    = args->K1;
+        K2    = args->K2;
+    }
+    C1 = (K1*L)*(K1*L);
+    C2 = (K2*L)*(K2*L);
+    C3 = C2 / 2.0f;
+
+    ref_mu = (float*)malloc(w*h*sizeof(float));
+    cmp_mu = (float*)malloc(w*h*sizeof(float));
+    ref_sigma_sqd = (float*)malloc(w*h*sizeof(float));
+    cmp_sigma_sqd = (float*)malloc(w*h*sizeof(float));
+    sigma_both = (float*)malloc(w*h*sizeof(float));
+    if (!ref_mu || !cmp_mu || !ref_sigma_sqd || !cmp_sigma_sqd || !sigma_both)
+        goto cleanup; /* free(NULL) is a no-op, so a partial allocation is fine */
+
+    /* Calculate mean */
+    _iqa_convolve(ref, w, h, k, ref_mu, 0, 0);
+    _iqa_convolve(cmp, w, h, k, cmp_mu, 0, 0);
+
+    for (y=0; y<h; ++y) {
+        offset = y*w;
+        for (x=0; x<w; ++x, ++offset) {
+            ref_sigma_sqd[offset] = ref[offset] * ref[offset];
+            cmp_sigma_sqd[offset] = cmp[offset] * cmp[offset];
+            sigma_both[offset] = ref[offset] * cmp[offset];
+        }
+    }
+
+    /* Calculate sigma */
+    _iqa_convolve(ref_sigma_sqd, w, h, k, 0, 0, 0);
+    _iqa_convolve(cmp_sigma_sqd, w, h, k, 0, 0, 0);
+    _iqa_convolve(sigma_both, w, h, k, 0, &w, &h); /* Update the width and height */
+
+    /* The convolution results are smaller by the kernel width and height */
+    for (y=0; y<h; ++y) {
+        offset = y*w;
+        for (x=0; x<w; ++x, ++offset) {
+            ref_sigma_sqd[offset] -= ref_mu[offset] * ref_mu[offset];
+            cmp_sigma_sqd[offset] -= cmp_mu[offset] * cmp_mu[offset];
+            sigma_both[offset] -= ref_mu[offset] * cmp_mu[offset];
+        }
+    }
+
+    for (y=0; y<h; ++y) {
+        offset = y*w;
+        for (x=0; x<w; ++x, ++offset) {
+            if (!args) {
+                /* The default case */
+                numerator   = (2.0 * ref_mu[offset] * cmp_mu[offset] + C1) * (2.0 * sigma_both[offset] + C2);
+                denominator = (ref_mu[offset]*ref_mu[offset] + cmp_mu[offset]*cmp_mu[offset] + C1) * 
+                    (ref_sigma_sqd[offset] + cmp_sigma_sqd[offset] + C2);
+                ssim_sum += numerator / denominator;
+            }
+            else {
+                /* User tweaked alpha, beta, or gamma */
+                /* passing a negative number to sqrt() causes a domain error */
+                if (ref_sigma_sqd[offset] < 0.0f)
+                    ref_sigma_sqd[offset] = 0.0f;
+                if (cmp_sigma_sqd[offset] < 0.0f)
+                    cmp_sigma_sqd[offset] = 0.0f;
+                sigma_root = sqrt(ref_sigma_sqd[offset] * cmp_sigma_sqd[offset]);
+
+                luminance_comp = _calc_luminance(ref_mu[offset], cmp_mu[offset], C1, alpha);
+                contrast_comp  = _calc_contrast(sigma_root, ref_sigma_sqd[offset], cmp_sigma_sqd[offset], C2, beta);
+                structure_comp = _calc_structure(sigma_both[offset], sigma_root, ref_sigma_sqd[offset], cmp_sigma_sqd[offset], C3, gamma);
+
+                sint.l = luminance_comp;
+                sint.c = contrast_comp;
+                sint.s = structure_comp;
+
+                /* A failing map callback aborts, but via cleanup so nothing leaks */
+                if (mr->map(&sint, mr->context))
+                    goto cleanup;
+            }
+        }
+    }
+
+    result = args ? mr->reduce(w, h, mr->context) : (float)(ssim_sum / (double)(w*h));
+
+cleanup:
+    free(ref_mu);
+    free(cmp_mu);
+    free(ref_sigma_sqd);
+    free(cmp_sigma_sqd);
+    free(sigma_both);
+    return result;
+}
+
+
+/* _ssim_map: map step of the default map/reduce pair.
+ * Adds l*c*s for one pixel to the double accumulator behind ctx; always returns 0. */
+int _ssim_map(const struct _ssim_int *si, void *ctx)
+{
+    *(double*)ctx += si->l * si->c * si->s;
+    return 0;
+}
+
+/* _ssim_reduce: reduce step - divides the accumulated sum behind ctx by w*h. */
+float _ssim_reduce(int w, int h, void *ctx)
+{
+    const double pixel_count = (double)(w*h);
+    return (float)(*(double*)ctx / pixel_count);
+}
+
+
+/* _calc_luminance: luminance term (2*mu1*mu2 + C1)/(mu1^2 + mu2^2 + C1), raised to 'alpha'
+ * with the sign of the ratio preserved. */
+IQA_INLINE static double _calc_luminance(float mu1, float mu2, float C1, float alpha)
+{
+    double ratio, magnitude;
+    /* For MS-SSIM*: with C1 == 0 and both squared means zero the denominator would be zero; use 1 */
+    if (C1 == 0 && mu1*mu1 == 0 && mu2*mu2 == 0)
+        return 1.0;
+    ratio = (2.0 * mu1 * mu2 + C1) / (mu1*mu1 + mu2*mu2 + C1);
+    if (alpha == 1.0f)
+        return ratio;
+    magnitude = pow(fabs(ratio),(double)alpha);
+    return (ratio < 0.0) ? -magnitude : magnitude;
+}
+
+/* _calc_contrast: contrast term (2*sigma_comb_12 + C2)/(sigma1_sqd + sigma2_sqd + C2), raised
+ * to 'beta' with the sign of the ratio preserved. */
+IQA_INLINE static double _calc_contrast(double sigma_comb_12, float sigma1_sqd, float sigma2_sqd, float C2, float beta)
+{
+    double ratio, magnitude;
+    /* For MS-SSIM*: with C2 == 0 and a zero variance sum the denominator would be zero; use 1 */
+    if (C2 == 0 && sigma1_sqd + sigma2_sqd == 0)
+        return 1.0;
+    ratio = (2.0 * sigma_comb_12 + C2) / (sigma1_sqd + sigma2_sqd + C2);
+    if (beta == 1.0f)
+        return ratio;
+    magnitude = pow(fabs(ratio),(double)beta);
+    return (ratio < 0.0) ? -magnitude : magnitude;
+}
+
+/* _calc_structure: structure term (sigma_12 + C3)/(sigma_comb_12 + C3), raised to 'gamma'
+ * with the sign of the ratio preserved. */
+IQA_INLINE static double _calc_structure(float sigma_12, double sigma_comb_12, float sigma1, float sigma2, float C3, float gamma)
+{
+    double ratio, magnitude;
+    /* For MS-SSIM*: a zero denominator is special-cased before dividing */
+    if (C3 == 0 && sigma_comb_12 == 0) {
+        if (sigma1 == 0 && sigma2 == 0)
+            return 1.0;     /* both sigma arguments are zero */
+        if (sigma1 == 0 || sigma2 == 0)
+            return 0.0;     /* exactly one sigma argument is zero */
+    }
+    ratio = (sigma_12 + C3) / (sigma_comb_12 + C3);
+    if (gamma == 1.0f)
+        return ratio;
+    magnitude = pow(fabs(ratio),(double)gamma);
+    return (ratio < 0.0) ? -magnitude : magnitude;
+}
\ No newline at end of file
diff --git a/3rdparty/bimg/3rdparty/libsquish/LICENSE b/3rdparty/bimg/3rdparty/libsquish/LICENSE
new file mode 100644
index 0000000..ed1c78d
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/LICENSE
@@ -0,0 +1,20 @@
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/3rdparty/bimg/3rdparty/libsquish/README b/3rdparty/bimg/3rdparty/libsquish/README
new file mode 100644
index 0000000..d26b72e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/README
@@ -0,0 +1,35 @@
+LICENSE
+-------
+
+The squish library is distributed under the terms and conditions of the MIT
+license. This license is specified at the top of each source file and must be
+preserved in its entirety.
+
+BUILDING AND INSTALLING THE LIBRARY
+-----------------------------------
+
+If you are using Visual Studio 2003 or above under Windows then load the Visual
+Studio 2003 project in the vs7 folder. By default, the library is built using
+SSE2 optimisations. To change this either change or remove the SQUISH_USE_SSE=2
+from the preprocessor symbols.
+
+If you are using a Mac then load the Xcode 2.2 project in the distribution. By
+default, the library is built using Altivec optimisations. To change this
+either change or remove SQUISH_USE_ALTIVEC=1 from the preprocessor symbols. I
+guess I'll have to think about changing this for the new Intel Macs that are
+rolling out...
+
+If you are using unix then first edit the config file in the base directory of
+the distribution, enabling Altivec or SSE with the USE_ALTIVEC or USE_SSE
+variables, and editing the optimisation flags passed to the C++ compiler if
+necessary. Then make can be used to build the library, and make install (from
+the superuser account) can be used to install (into /usr/local by default).
+
+REPORTING BUGS OR FEATURE REQUESTS
+----------------------------------
+
+Feedback can be sent to Simon Brown (the developer) at si@sjbrown.co.uk
+
+New releases are announced on the squish library homepage at
+http://sjbrown.co.uk/?code=squish
+
diff --git a/3rdparty/bimg/3rdparty/libsquish/alpha.cpp b/3rdparty/bimg/3rdparty/libsquish/alpha.cpp
new file mode 100644
index 0000000..0f94e21
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/alpha.cpp
@@ -0,0 +1,350 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "alpha.h"
+
+#include <climits>
+#include <algorithm>
+
+namespace squish {
+
+// Converts `a` to an int by adding one half and truncating, then clamps the
+// result: values below 0 become 0, values above `limit` become `limit`.
+static int FloatToInt( float a, int limit )
+{
+	// the int cast truncates toward zero, so adding 0.5 first rounds to nearest (for a >= 0)
+	int const rounded = ( int )( a + 0.5f );
+
+	// the lower clamp is checked before the upper one
+	if( rounded < 0 )
+		return 0;
+
+	// upper clamp
+	return ( rounded > limit ) ? limit : rounded;
+}
+
+// Packs the alpha bytes of 16 RGBA pixels into 8 bytes of 4-bit values.
+// Pixels whose bit is clear in `mask` are stored with alpha zero.
+void CompressAlphaDxt3( u8 const* rgba, int mask, void* block )
+{
+	u8* packed = reinterpret_cast< u8* >( block );
+	// every output byte carries two neighbouring pixels
+	for( int pair = 0; pair < 8; ++pair )
+	{
+		int const even = 2*pair;
+		int const odd = even + 1;
+
+		// rescale 0..255 to 0..15 with rounding
+		int lowNibble = FloatToInt( ( float )rgba[4*even + 3] * ( 15.0f/255.0f ), 15 );
+		int highNibble = FloatToInt( ( float )rgba[4*odd + 3] * ( 15.0f/255.0f ), 15 );
+
+		// masked-out pixels contribute zero alpha
+		if( ( mask & ( 1 << even ) ) == 0 )
+			lowNibble = 0;
+		if( ( mask & ( 1 << odd ) ) == 0 )
+			highNibble = 0;
+
+		// even pixel goes in bits 0-3, odd pixel in bits 4-7
+		packed[pair] = ( u8 )( lowNibble | ( highNibble << 4 ) );
+	}
+}
+
+// Expands 8 bytes of packed 4-bit alpha into the alpha bytes of 16 RGBA pixels.
+void DecompressAlphaDxt3( u8* rgba, void const* block )
+{
+	u8 const* packed = reinterpret_cast< u8 const* >( block );
+
+	// each input byte yields the alpha of two neighbouring pixels
+	for( int pair = 0; pair < 8; ++pair )
+	{
+		int const byte = packed[pair];
+
+		// split into the two 4-bit values
+		int const low = byte & 0x0f;
+		int const high = byte >> 4;
+
+		// replicate each nibble into both halves of the byte (0x0 -> 0x00, 0xf -> 0xff)
+		rgba[8*pair + 3] = ( u8 )( low | ( low << 4 ) );
+		rgba[8*pair + 7] = ( u8 )( ( high << 4 ) | high );
+	}
+}
+
+// Ensures max - min >= steps where possible: first raise max (capped at 255), then lower min (floored at 0).
+static void FixRange( int& min, int& max, int steps )
+{
+	bool const tooNarrow = ( max - min < steps );
+	max = tooNarrow ? std::min( min + steps, 255 ) : max;
+	min = ( max - min < steps ) ? std::max( 0, max - steps ) : min;
+}
+
+// Maps each pixel's alpha to the nearest entry of the 8-entry codebook `codes`.
+// The chosen entry per pixel is written to `indices`; the return value is the
+// summed squared error. Pixels masked out by `mask` get index 0 and add no error.
+static int FitCodes( u8 const* rgba, int mask, u8 const* codes, u8* indices )
+{
+	// squared error accumulated over all valid pixels
+	int totalError = 0;
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		// masked-out pixels simply take the first code
+		if( ( mask & ( 1 << pixel ) ) == 0 )
+		{
+			indices[pixel] = 0;
+			continue;
+		}
+
+		// alpha of this pixel
+		int const alpha = rgba[4*pixel + 3];
+
+		// linear scan for the closest code; strict '<' keeps the first of equal candidates
+		int bestError = INT_MAX;
+		int bestCode = 0;
+		for( int code = 0; code < 8; ++code )
+		{
+			// squared distance to this code
+			int const delta = alpha - ( int )codes[code];
+			int const squared = delta*delta;
+			if( squared < bestError )
+			{
+				bestError = squared;
+				bestCode = code;
+			}
+		}
+
+		// record the winner and add its error to the total
+		indices[pixel] = ( u8 )bestCode;
+		totalError += bestError;
+	}
+
+	return totalError;
+}
+
+// Writes an 8-byte alpha block: the two endpoint bytes followed by sixteen 3-bit
+// indices, packed as two groups of three bytes. Indices are OR-ed in without
+// masking, so each is expected to fit in 3 bits.
+static void WriteAlphaBlock( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	u8* out = reinterpret_cast< u8* >( block );
+
+	// endpoints first
+	out[0] = ( u8 )alpha0;
+	out[1] = ( u8 )alpha1;
+
+	// then two groups of eight indices, three bytes per group
+	for( int group = 0; group < 2; ++group )
+	{
+		// source indices and destination bytes for this group
+		u8 const* in = indices + 8*group;
+		u8* dst = out + 2 + 3*group;
+
+		// gather eight 3-bit values into one 24-bit word, first index in the low bits
+		int word = 0;
+		for( int k = 0; k < 8; ++k )
+			word |= ( ( int )in[k] << 3*k );
+
+		// emit the word least-significant byte first
+		dst[0] = ( u8 )( word & 0xff );
+		dst[1] = ( u8 )( ( word >> 8 ) & 0xff );
+		dst[2] = ( u8 )( ( word >> 16 ) & 0xff );
+	}
+}
+
+// Writes a block for the 5-alpha codebook, which decoding selects when the first
+// endpoint is <= the second. Endpoints given in the opposite order are exchanged
+// and the indices remapped to match.
+static void WriteAlphaBlock5( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	// endpoints already in the required order: write through unchanged
+	if( alpha0 <= alpha1 )
+	{
+		WriteAlphaBlock( alpha0, alpha1, indices, block );
+		return;
+	}
+
+	// exchanging the endpoints reverses the interpolated codes, so translate each index
+	u8 remapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const index = indices[i];
+
+		if( index <= 1 )
+			remapped[i] = ( u8 )( 1 - index );	// endpoint codes 0 and 1 trade places
+		else if( index <= 5 )
+			remapped[i] = ( u8 )( 7 - index );	// interpolated codes 2..5 reverse
+		else
+			remapped[i] = index;			// codes 6 and 7 are kept as they are
+	}
+
+	// write with the endpoints exchanged
+	WriteAlphaBlock( alpha1, alpha0, remapped, block );
+}
+
+// Writes a block for the 7-alpha codebook, which decoding selects when the first
+// endpoint is > the second. Endpoints given with alpha0 < alpha1 are exchanged and
+// the indices remapped to match.
+static void WriteAlphaBlock7( int alpha0, int alpha1, u8 const* indices, void* block )
+{
+	// endpoints not ascending: write through unchanged
+	if( alpha0 >= alpha1 )
+	{
+		WriteAlphaBlock( alpha0, alpha1, indices, block );
+		return;
+	}
+
+	// exchanging the endpoints reverses the interpolated codes, so translate each index
+	u8 remapped[16];
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 const index = indices[i];
+
+		if( index <= 1 )
+			remapped[i] = ( u8 )( 1 - index );	// endpoint codes 0 and 1 trade places
+		else
+			remapped[i] = ( u8 )( 9 - index );	// interpolated codes 2..7 reverse
+	}
+
+	// write with the endpoints exchanged
+	WriteAlphaBlock( alpha1, alpha0, remapped, block );
+}
+
+// Compresses the alpha of 16 RGBA pixels into an 8-byte block. Both the 5-alpha
+// and the 7-alpha codebooks are tried and the one with the smaller squared error
+// is written. Pixels whose bit is clear in `mask` do not influence the endpoints.
+void CompressAlphaDxt5( u8 const* rgba, int mask, void* block )
+{
+	// alpha range per mode; the 5-alpha range leaves out 0 and 255 because that
+	// codebook carries them as fixed entries
+	int lo5 = 255, hi5 = 0;
+	int lo7 = 255, hi7 = 0;
+
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		// skip pixels that are masked out
+		if( ( mask & ( 1 << pixel ) ) == 0 )
+			continue;
+
+		// fold this alpha into both ranges
+		int const alpha = rgba[4*pixel + 3];
+		lo7 = std::min( lo7, alpha );
+		hi7 = std::max( hi7, alpha );
+		if( alpha != 0 )
+			lo5 = std::min( lo5, alpha );
+		if( alpha != 255 )
+			hi5 = std::max( hi5, alpha );
+	}
+
+	// if no valid range was found (low end above high end), collapse it onto the high end
+	if( lo5 > hi5 )
+		lo5 = hi5;
+	if( lo7 > hi7 )
+		lo7 = hi7;
+
+	// widen each range to the minimum span its mode requires
+	FixRange( lo5, hi5, 5 );
+	FixRange( lo7, hi7, 7 );
+
+	// 5-alpha codebook: endpoints, four interpolated values, then 0 and 255
+	u8 book5[8];
+	book5[0] = ( u8 )lo5;
+	book5[1] = ( u8 )hi5;
+	for( int step = 1; step < 5; ++step )
+		book5[1 + step] = ( u8 )( ( ( 5 - step )*lo5 + step*hi5 )/5 );
+	book5[6] = 0;
+	book5[7] = 255;
+
+	// 7-alpha codebook: endpoints plus six interpolated values
+	u8 book7[8];
+	book7[0] = ( u8 )lo7;
+	book7[1] = ( u8 )hi7;
+	for( int step = 1; step < 7; ++step )
+		book7[1 + step] = ( u8 )( ( ( 7 - step )*lo7 + step*hi7 )/7 );
+
+	// fit the pixels against both codebooks
+	u8 fit5[16];
+	u8 fit7[16];
+	int const error5 = FitCodes( rgba, mask, book5, fit5 );
+	int const error7 = FitCodes( rgba, mask, book7, fit7 );
+
+	// keep the better fit; a tie goes to the 5-alpha encoding
+	if( error5 <= error7 )
+		WriteAlphaBlock5( lo5, hi5, fit5, block );
+	else
+		WriteAlphaBlock7( lo7, hi7, fit7, block );
+}
+
+// Decodes an 8-byte alpha block into the alpha bytes of 16 RGBA pixels.
+// Byte 0 and byte 1 are the endpoints; bytes 2..7 hold sixteen 3-bit codebook
+// indices. Only the alpha byte (offset 3) of each pixel is written.
+void DecompressAlphaDxt5( u8* rgba, void const* block )
+{
+	u8 const* in = reinterpret_cast< u8 const* >( block );
+
+	// read the two endpoints
+	int const first = in[0];
+	int const second = in[1];
+
+	// the endpoint order selects the layout: first <= second means four
+	// interpolated codes plus explicit 0 and 255, otherwise six interpolated codes
+	int const steps = ( first <= second ) ? 5 : 7;
+
+	// build the codebook: both endpoints, then the interpolated values
+	u8 book[8];
+	book[0] = ( u8 )first;
+	book[1] = ( u8 )second;
+	// code 1 + i weights the first endpoint by (steps - i) and the second by i
+	for( int i = 1; i < steps; ++i )
+		book[1 + i] = ( u8 )( ( ( steps - i )*first + i*second )/steps );
+	if( steps == 5 )
+	{
+		// the 5-alpha layout ends with the two fixed codes
+		book[6] = 0;
+		book[7] = 255;
+	}
+
+	// unpack the indices: two groups, each three bytes holding eight 3-bit values
+	u8 picked[16];
+	for( int group = 0; group < 2; ++group )
+	{
+		u8 const* src = in + 2 + 3*group;
+
+		// assemble the 24-bit word, least-significant byte first
+		int const word = src[0]
+			| ( src[1] << 8 )
+			| ( src[2] << 16 );
+
+		// peel off eight 3-bit values, lowest bits first
+		for( int k = 0; k < 8; ++k )
+			picked[8*group + k] = ( u8 )( ( word >> 3*k ) & 0x7 );
+	}
+
+	// look every pixel's index up in the codebook
+	for( int pixel = 0; pixel < 16; ++pixel )
+	{
+		// indices are at most 7, so the lookup stays inside the codebook
+		rgba[4*pixel + 3] = book[picked[pixel]];
+	}
+}
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/alpha.h b/3rdparty/bimg/3rdparty/libsquish/alpha.h
new file mode 100644
index 0000000..e5e7f32
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/alpha.h
@@ -0,0 +1,41 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_ALPHA_H
+#define SQUISH_ALPHA_H
+
+#include "squish.h"
+
+namespace squish {
+
+void CompressAlphaDxt3( u8 const* rgba, int mask, void* block );	// 16 RGBA pixels -> 8 bytes of 4-bit alpha; masked-out pixels store 0
+void CompressAlphaDxt5( u8 const* rgba, int mask, void* block );	// 16 RGBA pixels -> 2 endpoint bytes + sixteen 3-bit indices
+
+void DecompressAlphaDxt3( u8* rgba, void const* block );	// writes only the alpha byte of each of the 16 pixels
+void DecompressAlphaDxt5( u8* rgba, void const* block );	// writes only the alpha byte of each of the 16 pixels
+
+} // namespace squish
+
+#endif // ndef SQUISH_ALPHA_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/clusterfit.cpp b/3rdparty/bimg/3rdparty/libsquish/clusterfit.cpp
new file mode 100644
index 0000000..9670446
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/clusterfit.cpp
@@ -0,0 +1,392 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2007 Ignacio Castano                   icastano@nvidia.com
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "clusterfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+ClusterFit::ClusterFit( ColourSet const* colours, int flags, float* metric ) 
+  : ColourFit( colours, flags )
+{	// Sets up the iteration limit, error metric, best-error start value and principal axis for the fit.
+	// set the iteration count: kMaxIterations when kColourIterativeClusterFit is set in the flags, otherwise 1
+	m_iterationCount = ( m_flags & kColourIterativeClusterFit ) ? kMaxIterations : 1;
+
+	// initialise the metric from metric[0..2], or all ones when metric is null (old perceptual = 0.2126f, 0.7152f, 0.0722f)
+	if( metric )
+		m_metric = Vec4( metric[0], metric[1], metric[2], 1.0f );
+	else
+		m_metric = VEC4_CONST( 1.0f );	
+
+	// initialise the best error to FLT_MAX; Compress3/Compress4 only write a block when they get below it
+	m_besterror = VEC4_CONST( FLT_MAX );
+
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+
+	// get the weighted covariance matrix of the colour points
+	Sym3x3 covariance = ComputeWeightedCovariance( count, values, m_colours->GetWeights() );
+	
+	// compute the principal component (kept in m_principle; it is the axis of the first ordering)
+	m_principle = ComputePrincipleComponent( covariance );
+}
+
+bool ClusterFit::ConstructOrdering( Vec3 const& axis, int iteration )
+{	// Orders the points along `axis` into slot `iteration` of m_order; returns false if an earlier slot already holds that order.
+	// cache some values
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+
+	// build the list of dot products; each iteration owns a 16-entry slice of m_order
+	float dps[16];
+	u8* order = ( u8* )m_order + 16*iteration;
+	for( int i = 0; i < count; ++i )
+	{
+		dps[i] = Dot( values[i], axis );
+		order[i] = ( u8 )i;
+	}
+		
+	// stable insertion sort by dot product (strict < keeps ties in index order), permuting order[] in step
+	for( int i = 0; i < count; ++i )
+	{
+		for( int j = i; j > 0 && dps[j] < dps[j - 1]; --j )
+		{
+			std::swap( dps[j], dps[j - 1] );
+			std::swap( order[j], order[j - 1] );
+		}
+	}
+	
+	// check this ordering is unique among the slots written by earlier iterations
+	for( int it = 0; it < iteration; ++it )
+	{
+		u8 const* prev = ( u8* )m_order + 16*it;
+		bool same = true;
+		for( int i = 0; i < count; ++i )
+		{
+			if( order[i] != prev[i] )
+			{
+				same = false;
+				break;
+			}
+		}
+		if( same )
+			return false;	// m_points_weights and m_xsum_wsum are left as they were
+	}
+	
+	// store the points in sorted order, each multiplied by its weight, and accumulate their sum
+	Vec3 const* unweighted = m_colours->GetPoints();
+	float const* weights = m_colours->GetWeights();
+	m_xsum_wsum = VEC4_CONST( 0.0f );
+	for( int i = 0; i < count; ++i )
+	{
+		int j = order[i];
+		Vec4 p( unweighted[j].X(), unweighted[j].Y(), unweighted[j].Z(), 1.0f );
+		Vec4 w( weights[j] );	// presumably splats the weight to all four lanes, so x.W() below is the weight itself
+		Vec4 x = p*w;
+		m_points_weights[i] = x;
+		m_xsum_wsum += x;
+	}
+	return true;
+}
+
+void ClusterFit::Compress3( void* block )
+{	// Tries every split of the ordered points into three clusters and writes a 3-colour block if it beats m_besterror.
+	// declare variables (all constants use float literals, as in Compress4)
+	int const count = m_colours->GetCount();
+	Vec4 const two = VEC4_CONST( 2.0f );
+	Vec4 const one = VEC4_CONST( 1.0f );
+	Vec4 const half_half2( 0.5f, 0.5f, 0.5f, 0.25f );	// xyz scaled by 1/2, w by 1/4 (its square)
+	Vec4 const zero = VEC4_CONST( 0.0f );
+	Vec4 const half = VEC4_CONST( 0.5f );
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );	// per-channel quantisation steps (31/63/31)
+	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+
+	// prepare an ordering using the principal axis (stored in slot 0 of m_order)
+	ConstructOrdering( m_principle, 0 );
+	
+	// check all possible clusters and iterate on the total order
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = m_besterror;
+	u8 bestindices[16];
+	int bestiteration = 0;
+	int besti = 0, bestj = 0;
+	
+	// loop over iterations (we avoid the case that all points in first or last cluster); iterationIndex is advanced by hand below
+	for( int iterationIndex = 0;; )
+	{
+		// first cluster [0,i) is at the start
+		Vec4 part0 = VEC4_CONST( 0.0f );
+		for( int i = 0; i < count; ++i )
+		{
+			// second cluster [i,j) is half along; for i == 0 start at j = 1 so the last cluster never holds every point
+			Vec4 part1 = ( i == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f );
+			int jmin = ( i == 0 ) ? 1 : i;
+			for( int j = jmin;; )
+			{
+				// last cluster [j,count) is at the end
+				Vec4 part2 = m_xsum_wsum - part1 - part0;
+				
+				// compute least squares terms directly
+				Vec4 alphax_sum = MultiplyAdd( part1, half_half2, part0 );
+				Vec4 alpha2_sum = alphax_sum.SplatW();
+
+				Vec4 betax_sum = MultiplyAdd( part1, half_half2, part2 );
+				Vec4 beta2_sum = betax_sum.SplatW();
+
+				Vec4 alphabeta_sum = ( part1*half_half2 ).SplatW();
+
+				// compute the least-squares optimal points
+				Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) );
+				Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor;
+				Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor;
+
+				// clamp to [0,1], then snap each channel to the nearest grid step
+				a = Min( one, Max( zero, a ) );
+				b = Min( one, Max( zero, b ) );
+				a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
+				b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;
+				
+				// compute the error (we skip the constant xxsum)
+				Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+				Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+				Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
+				Vec4 e4 = MultiplyAdd( two, e3, e1 );
+
+				// apply the metric to the error term and add up the x, y and z lanes
+				Vec4 e5 = e4*m_metric;
+				Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+				
+				// keep the solution if it wins
+				if( CompareAnyLessThan( error, besterror ) )
+				{
+					beststart = a;
+					bestend = b;
+					besti = i;
+					bestj = j;
+					besterror = error;
+					bestiteration = iterationIndex;
+				}
+
+				// advance: point j moves from the last cluster into the middle one
+				if( j == count )
+					break;
+				part1 += m_points_weights[j];
+				++j;
+			}
+
+			// advance: point i joins the first cluster
+			part0 += m_points_weights[i];
+		}
+		
+		// stop if we didn't improve in this iteration
+		if( bestiteration != iterationIndex )
+			break;
+			
+		// advance if possible
+		++iterationIndex;
+		if( iterationIndex == m_iterationCount )
+			break;
+			
+		// stop if a new iteration is an ordering that has already been tried
+		Vec3 axis = ( bestend - beststart ).GetVec3();
+		if( !ConstructOrdering( axis, iterationIndex ) )
+			break;
+	}
+		
+	// save the block if necessary
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+	{
+		// remap the indices: first cluster -> 0, middle -> 2, last -> 1, addressed by original point number
+		u8 const* order = ( u8* )m_order + 16*bestiteration;
+
+		u8 unordered[16];
+		for( int m = 0; m < besti; ++m )
+			unordered[order[m]] = 0;
+		for( int m = besti; m < bestj; ++m )
+			unordered[order[m]] = 2;
+		for( int m = bestj; m < count; ++m )
+			unordered[order[m]] = 1;
+
+		m_colours->RemapIndices( unordered, bestindices );
+		
+		// save the block
+		WriteColourBlock3( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+void ClusterFit::Compress4( void* block )
+{	// Tries every split of the ordered points into four clusters and writes a 4-colour block if it beats m_besterror.
+	// declare variables
+	int const count = m_colours->GetCount();
+	Vec4 const two = VEC4_CONST( 2.0f );
+	Vec4 const one = VEC4_CONST( 1.0f );
+	Vec4 const onethird_onethird2( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );	// xyz scaled by 1/3, w by its square
+	Vec4 const twothirds_twothirds2( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );	// xyz scaled by 2/3, w by its square
+	Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f );
+	Vec4 const zero = VEC4_CONST( 0.0f );
+	Vec4 const half = VEC4_CONST( 0.5f );
+	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );	// per-channel quantisation steps (31/63/31)
+	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
+
+	// prepare an ordering using the principal axis (stored in slot 0 of m_order)
+	ConstructOrdering( m_principle, 0 );
+	
+	// check all possible clusters and iterate on the total order
+	Vec4 beststart = VEC4_CONST( 0.0f );
+	Vec4 bestend = VEC4_CONST( 0.0f );
+	Vec4 besterror = m_besterror;
+	u8 bestindices[16];
+	int bestiteration = 0;
+	int besti = 0, bestj = 0, bestk = 0;
+	
+	// loop over iterations (we avoid the case that all points in first or last cluster); iterationIndex is advanced by hand below
+	for( int iterationIndex = 0;; )
+	{
+		// first cluster [0,i) is at the start
+		Vec4 part0 = VEC4_CONST( 0.0f );
+		for( int i = 0; i < count; ++i )
+		{
+			// second cluster [i,j) is one third along
+			Vec4 part1 = VEC4_CONST( 0.0f );
+			for( int j = i;; )
+			{
+				// third cluster [j,k) is two thirds along; for j == 0 start at k = 1 so the last cluster never holds every point
+				Vec4 part2 = ( j == 0 ) ? m_points_weights[0] : VEC4_CONST( 0.0f );
+				int kmin = ( j == 0 ) ? 1 : j;
+				for( int k = kmin;; )
+				{
+					// last cluster [k,count) is at the end
+					Vec4 part3 = m_xsum_wsum - part2 - part1 - part0;
+
+					// compute least squares terms directly
+					Vec4 const alphax_sum = MultiplyAdd( part2, onethird_onethird2, MultiplyAdd( part1, twothirds_twothirds2, part0 ) );
+					Vec4 const alpha2_sum = alphax_sum.SplatW();
+					
+					Vec4 const betax_sum = MultiplyAdd( part1, onethird_onethird2, MultiplyAdd( part2, twothirds_twothirds2, part3 ) );
+					Vec4 const beta2_sum = betax_sum.SplatW();
+					
+					Vec4 const alphabeta_sum = twonineths*( part1 + part2 ).SplatW();
+
+					// compute the least-squares optimal points
+					Vec4 factor = Reciprocal( NegativeMultiplySubtract( alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum ) );
+					Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum*beta2_sum )*factor;
+					Vec4 b = NegativeMultiplySubtract( alphax_sum, alphabeta_sum, betax_sum*alpha2_sum )*factor;
+
+					// clamp to [0,1], then snap each channel to the nearest grid step
+					a = Min( one, Max( zero, a ) );
+					b = Min( one, Max( zero, b ) );
+					a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
+					b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;
+					
+					// compute the error (we skip the constant xxsum)
+					Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
+					Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
+					Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
+					Vec4 e4 = MultiplyAdd( two, e3, e1 );
+
+					// apply the metric to the error term and add up the x, y and z lanes
+					Vec4 e5 = e4*m_metric;
+					Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();
+
+					// keep the solution if it wins
+					if( CompareAnyLessThan( error, besterror ) )
+					{
+						beststart = a;
+						bestend = b;
+						besterror = error;
+						besti = i;
+						bestj = j;
+						bestk = k;
+						bestiteration = iterationIndex;
+					}
+
+					// advance: point k moves from the last cluster into the third one
+					if( k == count )
+						break;
+					part2 += m_points_weights[k];
+					++k;
+				}
+
+				// advance: point j joins the second cluster
+				if( j == count )
+					break;
+				part1 += m_points_weights[j];
+				++j;
+			}
+
+			// advance: point i joins the first cluster
+			part0 += m_points_weights[i];
+		}
+		
+		// stop if we didn't improve in this iteration
+		if( bestiteration != iterationIndex )
+			break;
+			
+		// advance if possible
+		++iterationIndex;
+		if( iterationIndex == m_iterationCount )
+			break;
+			
+		// stop if a new iteration is an ordering that has already been tried
+		Vec3 axis = ( bestend - beststart ).GetVec3();
+		if( !ConstructOrdering( axis, iterationIndex ) )
+			break;
+	}
+
+	// save the block if necessary
+	if( CompareAnyLessThan( besterror, m_besterror ) )
+	{
+		// remap the indices: clusters in order -> 0, 2, 3, 1, addressed by original point number
+		u8 const* order = ( u8* )m_order + 16*bestiteration;
+
+		u8 unordered[16];
+		for( int m = 0; m < besti; ++m )
+			unordered[order[m]] = 0;
+		for( int m = besti; m < bestj; ++m )
+			unordered[order[m]] = 2;
+		for( int m = bestj; m < bestk; ++m )
+			unordered[order[m]] = 3;
+		for( int m = bestk; m < count; ++m )
+			unordered[order[m]] = 1;
+
+		m_colours->RemapIndices( unordered, bestindices );
+		
+		// save the block
+		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );
+
+		// save the error
+		m_besterror = besterror;
+	}
+}
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/clusterfit.h b/3rdparty/bimg/3rdparty/libsquish/clusterfit.h
new file mode 100644
index 0000000..c882469
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/clusterfit.h
@@ -0,0 +1,61 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+	Copyright (c) 2007 Ignacio Castano                   icastano@nvidia.com
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_CLUSTERFIT_H
+#define SQUISH_CLUSTERFIT_H
+
+#include "squish.h"
+#include "maths.h"
+#include "simd.h"
+#include "colourfit.h"
+
+namespace squish {
+
+class ClusterFit : public ColourFit
+{	// Colour fitter that searches cluster splits of the points ordered along an axis (implemented in clusterfit.cpp).
+public:
+	ClusterFit( ColourSet const* colours, int flags, float* metric );	// metric may be null; uniform weights are used then
+	
+private:
+	bool ConstructOrdering( Vec3 const& axis, int iteration );	// false when the ordering repeats an earlier iteration
+
+	virtual void Compress3( void* block );	// three-cluster search, writes through WriteColourBlock3
+	virtual void Compress4( void* block );	// four-cluster search, writes through WriteColourBlock4
+
+	enum { kMaxIterations = 8 };	// iteration limit in iterative mode; also sizes m_order
+
+	int m_iterationCount;	// kMaxIterations or 1, chosen in the constructor from the flags
+	Vec3 m_principle;	// principal component of the weighted covariance; axis of the first ordering
+	u8 m_order[16*kMaxIterations];	// one 16-entry point ordering per iteration
+	Vec4 m_points_weights[16];	// points in the current order, multiplied by their weights
+	Vec4 m_xsum_wsum;	// sum of all entries of m_points_weights
+	Vec4 m_metric;	// per-channel factors applied to the error
+	Vec4 m_besterror;	// lowest error written so far; starts at FLT_MAX
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_CLUSTERFIT_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/colourblock.cpp b/3rdparty/bimg/3rdparty/libsquish/colourblock.cpp
new file mode 100644
index 0000000..e6a5788
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/colourblock.cpp
@@ -0,0 +1,214 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourblock.h"
+
+namespace squish {
+
+static int FloatToInt( float a, int limit )
+{	// Rounds `a` to the nearest integer and clamps the result to [0, limit].
+	// use ANSI round-to-zero behaviour to get round-to-nearest (adding 0.5 before the truncating cast)
+	int i = ( int )( a + 0.5f );
+
+	// clamp to the limit; anything that came out negative becomes 0
+	if( i < 0 )
+		i = 0;
+	else if( i > limit )
+		i = limit; 
+
+	// done
+	return i;
+}
+
+static int FloatTo565( Vec3::Arg colour )
+{	// Packs a colour into one integer with 5 bits of X, 6 bits of Y and 5 bits of Z.
+	// get the components in the correct range: scaled by 31/63/31, rounded and clamped by FloatToInt
+	int r = FloatToInt( 31.0f*colour.X(), 31 );
+	int g = FloatToInt( 63.0f*colour.Y(), 63 );
+	int b = FloatToInt( 31.0f*colour.Z(), 31 );
+	
+	// pack into a single value: r in bits 11-15, g in bits 5-10, b in bits 0-4
+	return ( r << 11 ) | ( g << 5 ) | b;
+}
+
+static void WriteColourBlock( int a, int b, u8* indices, void* block )
+{	// Writes 8 bytes to `block`: endpoints a and b (2 bytes each) followed by 16 two-bit indices (4 bytes).
+	// get the block as bytes
+	u8* bytes = ( u8* )block;
+
+	// write the endpoints, low byte first
+	bytes[0] = ( u8 )( a & 0xff );
+	bytes[1] = ( u8 )( a >> 8 );
+	bytes[2] = ( u8 )( b & 0xff );
+	bytes[3] = ( u8 )( b >> 8 );
+	
+	// write the indices: four per byte, the first index of each group in the lowest two bits
+	for( int i = 0; i < 4; ++i )
+	{
+		u8 const* ind = indices + 4*i;
+		bytes[4 + i] = ind[0] | ( ind[1] << 2 ) | ( ind[2] << 4 ) | ( ind[3] << 6 );
+	}
+}
+
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
+{	// Packs both endpoints and writes the block with the smaller packed endpoint first (a <= b); reads 16 indices.
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// remap the indices so they still refer to the same endpoints after a possible swap
+	u8 remapped[16];
+	if( a <= b )
+	{
+		// use the indices directly
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = indices[i];
+	}
+	else
+	{
+		// swap a and b, exchanging indices 0 and 1; every other index value is copied unchanged
+		std::swap( a, b );
+		for( int i = 0; i < 16; ++i )
+		{
+			if( indices[i] == 0 )
+				remapped[i] = 1;
+			else if( indices[i] == 1 )
+				remapped[i] = 0;
+			else
+				remapped[i] = indices[i];
+		}
+	}
+	
+	// write the block
+	WriteColourBlock( a, b, remapped, block );
+}
+
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block )
+{	// Packs both endpoints and writes the block with the larger packed endpoint first (a >= b); reads 16 indices.
+	// get the packed values
+	int a = FloatTo565( start );
+	int b = FloatTo565( end );
+
+	// remap the indices so they still refer to the same endpoints after a possible swap
+	u8 remapped[16];
+	if( a < b )
+	{
+		// swap a and b; flipping the low bit exchanges 0 with 1 and 2 with 3
+		std::swap( a, b );
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = ( indices[i] ^ 0x1 ) & 0x3;
+	}
+	else if( a == b )
+	{
+		// use index 0 for every pixel when both endpoints pack to the same value
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = 0;
+	}
+	else
+	{
+		// use the indices directly
+		for( int i = 0; i < 16; ++i )
+			remapped[i] = indices[i];
+	}
+	
+	// write the block
+	WriteColourBlock( a, b, remapped, block );
+}
+
+static int Unpack565( u8 const* packed, u8* colour )
+{	// Reads a 2-byte packed colour, writes 4 bytes (three expanded channels plus 255) to `colour`, returns the packed value.
+	// build the packed value (packed[0] is the low byte)
+	int value = ( int )packed[0] | ( ( int )packed[1] << 8 );
+	
+	// get the components in the stored range (5, 6 and 5 bits)
+	u8 red = ( u8 )( ( value >> 11 ) & 0x1f );
+	u8 green = ( u8 )( ( value >> 5 ) & 0x3f );
+	u8 blue = ( u8 )( value & 0x1f );
+
+	// scale up to 8 bits by repeating the top bits in the low bits
+	colour[0] = ( red << 3 ) | ( red >> 2 );
+	colour[1] = ( green << 2 ) | ( green >> 4 );
+	colour[2] = ( blue << 3 ) | ( blue >> 2 );
+	colour[3] = 255;
+	
+	// return the value
+	return value;
+}
+
+void DecompressColour( u8* rgba, void const* block, bool isDxt1 )
+{	// Expands the 8-byte colour block at `block` into 16 pixels of 4 bytes each, written to rgba[0..63].
+	// get the block bytes
+	u8 const* bytes = reinterpret_cast< u8 const* >( block );
+	
+	// unpack the endpoints into palette entries 0 and 1 (codes holds four 4-byte entries)
+	u8 codes[16];
+	int a = Unpack565( bytes, codes );
+	int b = Unpack565( bytes + 2, codes + 4 );
+	
+	// generate the midpoints (palette entries 2 and 3), one colour channel at a time
+	for( int i = 0; i < 3; ++i )
+	{
+		int c = codes[i];
+		int d = codes[4 + i];
+
+		if( isDxt1 && a <= b )
+		{
+			codes[8 + i] = ( u8 )( ( c + d )/2 );	// entry 2 is the average of the endpoints
+			codes[12 + i] = 0;	// entry 3 has zero colour
+		}
+		else
+		{
+			codes[8 + i] = ( u8 )( ( 2*c + d )/3 );
+			codes[12 + i] = ( u8 )( ( c + 2*d )/3 );
+		}
+	}
+	
+	// fill in alpha for the intermediate values; entry 3 gets alpha 0 only in the isDxt1 && a <= b case
+	codes[8 + 3] = 255;
+	codes[12 + 3] = ( isDxt1 && a <= b ) ? 0 : 255;
+	
+	// unpack the indices: four 2-bit values per byte, lowest bits first
+	u8 indices[16];
+	for( int i = 0; i < 4; ++i )
+	{
+		u8* ind = indices + 4*i;
+		u8 packed = bytes[4 + i];
+		
+		ind[0] = packed & 0x3;
+		ind[1] = ( packed >> 2 ) & 0x3;
+		ind[2] = ( packed >> 4 ) & 0x3;
+		ind[3] = ( packed >> 6 ) & 0x3;
+	}
+
+	// store out the colours: copy the 4-byte palette entry selected by each pixel's index
+	for( int i = 0; i < 16; ++i )
+	{
+		u8 offset = 4*indices[i];
+		for( int j = 0; j < 4; ++j )
+			rgba[4*i + j] = codes[offset + j];
+	}
+}
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/colourblock.h b/3rdparty/bimg/3rdparty/libsquish/colourblock.h
new file mode 100644
index 0000000..2562561
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/colourblock.h
@@ -0,0 +1,41 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURBLOCK_H
+#define SQUISH_COLOURBLOCK_H
+
+#include "squish.h"
+#include "maths.h"
+
+namespace squish {
+// Block writers: pack `start`/`end`, reorder the 16 `indices` to match, and write 8 bytes to `block`.
+void WriteColourBlock3( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+void WriteColourBlock4( Vec3::Arg start, Vec3::Arg end, u8 const* indices, void* block );
+// Block reader: expands the 8-byte `block` into 16 four-byte pixels in `rgba`.
+void DecompressColour( u8* rgba, void const* block, bool isDxt1 );
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURBLOCK_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/colourfit.cpp b/3rdparty/bimg/3rdparty/libsquish/colourfit.cpp
new file mode 100644
index 0000000..11efa46
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/colourfit.cpp
@@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourfit.h"
+#include "colourset.h"
+
+namespace squish {
+
+ColourFit::ColourFit( ColourSet const* colours, int flags ) 
+  : m_colours( colours ), 
+	m_flags( flags )
+{	// Only stores the colour-set pointer and the flags; no other work is done here.
+}
+
+ColourFit::~ColourFit()
+{	// Intentionally empty: m_colours is not released here.
+}
+
+void ColourFit::Compress( void* block )
+{	// Runs the subclass fitters selected by the flags: Compress3 and/or Compress4, all writing to the same `block`.
+	bool isDxt1 = ( ( m_flags & kDxt1 ) != 0 );
+	if( isDxt1 )
+	{
+		Compress3( block );	// always tried first when kDxt1 is set
+		if( !m_colours->IsTransparent() )
+			Compress4( block );	// also tried when the colour set has no transparent pixels
+	}
+	else
+		Compress4( block );	// without kDxt1 only the four-colour fit is used
+}
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/colourfit.h b/3rdparty/bimg/3rdparty/libsquish/colourfit.h
new file mode 100644
index 0000000..7593223
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/colourfit.h
@@ -0,0 +1,56 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURFIT_H
+#define SQUISH_COLOURFIT_H
+
+#include "squish.h"
+#include "maths.h"
+
+#include <climits>
+
+namespace squish {
+
+class ColourSet;
+
+class ColourFit
+{	// Abstract base for colour fitters: Compress() dispatches to the Compress3/Compress4 hooks that subclasses implement.
+public:
+	ColourFit( ColourSet const* colours, int flags );
+	virtual ~ColourFit();
+
+	void Compress( void* block );	// picks Compress3 and/or Compress4 based on kDxt1 and transparency (see colourfit.cpp)
+
+protected:
+	virtual void Compress3( void* block ) = 0;
+	virtual void Compress4( void* block ) = 0;
+
+	ColourSet const* m_colours;	// stored as passed to the constructor; the destructor does not delete it
+	int m_flags;	// flags as passed to the constructor; Compress() tests kDxt1 in them
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURFIT_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/colourset.cpp b/3rdparty/bimg/3rdparty/libsquish/colourset.cpp
new file mode 100644
index 0000000..97d29d9
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/colourset.cpp
@@ -0,0 +1,121 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "colourset.h"
+
+namespace squish {
+
+ColourSet::ColourSet( u8 const* rgba, int mask, int flags )
+  : m_count( 0 ), 
+	m_transparent( false )
+{	// Collects the distinct RGB values of the enabled pixels (16 pixels, 4 bytes each) with a weight per value.
+	// check the compression mode for dxt1
+	bool isDxt1 = ( ( flags & kDxt1 ) != 0 );
+	bool weightByAlpha = ( ( flags & kWeightColourByAlpha ) != 0 );
+
+	// create the minimal set
+	for( int i = 0; i < 16; ++i )
+	{
+		// check this pixel is enabled (bit i of mask); disabled pixels get remap -1
+		int bit = 1 << i;
+		if( ( mask & bit ) == 0 )
+		{
+			m_remap[i] = -1;
+			continue;
+		}
+	
+		// check for transparent pixels when using dxt1 (alpha below 128); they also get remap -1
+		if( isDxt1 && rgba[4*i + 3] < 128 )
+		{
+			m_remap[i] = -1;
+			m_transparent = true;
+			continue;
+		}
+
+		// loop over previous points for a match; reaching j == i means none of the earlier pixels matched
+		for( int j = 0;; ++j )
+		{
+			// allocate a new point
+			if( j == i )
+			{
+				// normalise coordinates to [0,1]
+				float x = ( float )rgba[4*i] / 255.0f;
+				float y = ( float )rgba[4*i + 1] / 255.0f;
+				float z = ( float )rgba[4*i + 2] / 255.0f;
+				
+				// ensure there is always non-zero weight even for zero alpha
+				float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+				// add the point
+				m_points[m_count] = Vec3( x, y, z );
+				m_weights[m_count] = ( weightByAlpha ? w : 1.0f );
+				m_remap[i] = m_count;
+				
+				// advance
+				++m_count;
+				break;
+			}
+		
+			// check for a match: pixel j enabled, same RGB bytes, and (in dxt1 mode) pixel j not transparent
+			int oldbit = 1 << j;
+			bool match = ( ( mask & oldbit ) != 0 )
+				&& ( rgba[4*i] == rgba[4*j] )
+				&& ( rgba[4*i + 1] == rgba[4*j + 1] )
+				&& ( rgba[4*i + 2] == rgba[4*j + 2] )
+				&& ( rgba[4*j + 3] >= 128 || !isDxt1 );
+			if( match )
+			{
+				// get the index of the match
+				int index = m_remap[j];
+				
+				// ensure there is always non-zero weight even for zero alpha
+				float w = ( float )( rgba[4*i + 3] + 1 ) / 256.0f;
+
+				// map to this point and increase the weight
+				m_weights[index] += ( weightByAlpha ? w : 1.0f );
+				m_remap[i] = index;
+				break;
+			}
+		}
+	}
+
+	// square root the weights (each entry so far is the sum over the pixels sharing that colour)
+	for( int i = 0; i < m_count; ++i )
+		m_weights[i] = std::sqrt( m_weights[i] );
+}
+
+void ColourSet::RemapIndices( u8 const* source, u8* target ) const
+{	// Expands per-point indices in `source` to 16 per-pixel indices in `target` using m_remap.
+	for( int i = 0; i < 16; ++i )
+	{
+		int j = m_remap[i];
+		if( j == -1 )
+			target[i] = 3;	// pixels with no point (masked out or transparent) get index 3
+		else
+			target[i] = source[j];
+	}
+}
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/colourset.h b/3rdparty/bimg/3rdparty/libsquish/colourset.h
new file mode 100644
index 0000000..0c66fe4
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/colourset.h
@@ -0,0 +1,58 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_COLOURSET_H
+#define SQUISH_COLOURSET_H
+
+#include "squish.h"
+#include "maths.h"
+
+namespace squish {
+
+/*! @brief Represents a set of block colours as up to 16 weighted points.
+*/
+class ColourSet
+{
+public:
+	ColourSet( u8 const* rgba, int mask, int flags );
+
+	int GetCount() const { return m_count; }	// number of points stored
+	Vec3 const* GetPoints() const { return m_points; }	// array holding the stored points
+	float const* GetWeights() const { return m_weights; }	// weights, parallel to the points
+	bool IsTransparent() const { return m_transparent; }
+
+	void RemapIndices( u8 const* source, u8* target ) const;	// per-point indices -> 16 per-pixel indices
+
+private:
+	int m_count;	// number of entries used in m_points/m_weights
+	Vec3 m_points[16];	// stored colours, each channel divided by 255
+	float m_weights[16];	// per-point weight (square-rooted by the constructor)
+	int m_remap[16];	// pixel -> point index, or -1 when the pixel has no point
+	bool m_transparent;	// set when a DXT1 pixel with alpha < 128 is seen
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_COLOURSET_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/config.h b/3rdparty/bimg/3rdparty/libsquish/config.h
new file mode 100644
index 0000000..2fad557
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/config.h
@@ -0,0 +1,49 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_CONFIG_H
+#define SQUISH_CONFIG_H
+
+// Set to 1 when building squish to use Altivec instructions (defaults to 0 if not predefined).
+#ifndef SQUISH_USE_ALTIVEC
+#define SQUISH_USE_ALTIVEC 0
+#endif
+
+// Set to 1 or 2 when building squish to use SSE or SSE2 instructions (defaults to 0 if not predefined).
+#ifndef SQUISH_USE_SSE
+#define SQUISH_USE_SSE 0
+#endif
+
+// Altivec and SSE are mutually exclusive; SQUISH_USE_SIMD is derived as 1 when either is enabled, else 0.
+#if SQUISH_USE_ALTIVEC && SQUISH_USE_SSE
+#error "Cannot enable both Altivec and SSE!"
+#endif
+#if SQUISH_USE_ALTIVEC || SQUISH_USE_SSE
+#define SQUISH_USE_SIMD 1
+#else
+#define SQUISH_USE_SIMD 0
+#endif
+
+#endif // ndef SQUISH_CONFIG_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/maths.cpp b/3rdparty/bimg/3rdparty/libsquish/maths.cpp
new file mode 100644
index 0000000..9af4197
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/maths.cpp
@@ -0,0 +1,259 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+/*! @file
+
+	The symmetric eigensystem solver algorithm is from 
+	http://www.geometrictools.com/Documentation/EigenSymmetric3x3.pdf
+*/
+
+#include "maths.h"
+#include "simd.h"
+#include <cfloat>
+
+namespace squish {
+
+Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights )
+{
+	// weighted sum of the points and of the weights themselves
+	float weightSum = 0.0f;
+	Vec3 centroid( 0.0f );
+	for( int k = 0; k < n; ++k )
+	{
+		float const wk = weights[k];
+		weightSum += wk;
+		centroid += wk*points[k];
+	}
+
+	// normalise to the weighted mean unless the total weight is negligible
+	if( weightSum > FLT_EPSILON )
+		centroid /= weightSum;
+
+	// accumulate the six unique entries (xx, xy, xz, yy, yz, zz)
+	Sym3x3 covariance( 0.0f );
+	for( int k = 0; k < n; ++k )
+	{
+		Vec3 const offset = points[k] - centroid;
+		Vec3 const scaled = weights[k]*offset;
+		covariance[0] += offset.X()*scaled.X();
+		covariance[1] += offset.X()*scaled.Y();
+		covariance[2] += offset.X()*scaled.Z();
+		covariance[3] += offset.Y()*scaled.Y();
+		covariance[4] += offset.Y()*scaled.Z();
+		covariance[5] += offset.Z()*scaled.Z();
+	}
+	return covariance;
+}
+
+#if 0
+
+static Vec3 GetMultiplicity1Evector( Sym3x3 const& matrix, float evalue )
+{
+	// compute M = matrix - evalue*I (only the diagonal entries 0, 3 and 5 change)
+	Sym3x3 m;
+	m[0] = matrix[0] - evalue;
+	m[1] = matrix[1];
+	m[2] = matrix[2];
+	m[3] = matrix[3] - evalue;
+	m[4] = matrix[4];
+	m[5] = matrix[5] - evalue;
+
+	// compute U, the cofactor matrix of M (symmetric, so six entries suffice)
+	Sym3x3 u;
+	u[0] = m[3]*m[5] - m[4]*m[4];
+	u[1] = m[2]*m[4] - m[1]*m[5];
+	u[2] = m[1]*m[4] - m[2]*m[3];
+	u[3] = m[0]*m[5] - m[2]*m[2];
+	u[4] = m[1]*m[2] - m[4]*m[0];
+	u[5] = m[0]*m[3] - m[1]*m[1];
+
+	// find the component of U with the largest absolute value
+	float mc = std::fabs( u[0] );
+	int mi = 0;
+	for( int i = 1; i < 6; ++i )
+	{
+		float c = std::fabs( u[i] );
+		if( c > mc )
+		{
+			mc = c;
+			mi = i;
+		}
+	}
+
+	// return a column of U that contains this component (index 1 uses column 1)
+	switch( mi )
+	{
+	case 0:
+		return Vec3( u[0], u[1], u[2] );
+
+	case 1:
+	case 3:
+		return Vec3( u[1], u[3], u[4] );
+
+	default:
+		return Vec3( u[2], u[4], u[5] );
+	}
+}
+
+static Vec3 GetMultiplicity2Evector( Sym3x3 const& matrix, float evalue )
+{
+	// compute M = matrix - evalue*I (only the diagonal entries 0, 3 and 5 change)
+	Sym3x3 m;
+	m[0] = matrix[0] - evalue;
+	m[1] = matrix[1];
+	m[2] = matrix[2];
+	m[3] = matrix[3] - evalue;
+	m[4] = matrix[4];
+	m[5] = matrix[5] - evalue;
+
+	// find the component of M with the largest absolute value
+	float mc = std::fabs( m[0] );
+	int mi = 0;
+	for( int i = 1; i < 6; ++i )
+	{
+		float c = std::fabs( m[i] );
+		if( c > mc )
+		{
+			mc = c;
+			mi = i;
+		}
+	}
+
+	// return a vector perpendicular to a row of M that holds this component
+	switch( mi )
+	{
+	case 0:
+	case 1:
+		return Vec3( -m[1], m[0], 0.0f );
+
+	case 2:
+		return Vec3( m[2], 0.0f, -m[0] );
+
+	case 3:
+	case 4:
+		return Vec3( 0.0f, -m[4], m[3] );
+
+	default:
+		return Vec3( 0.0f, -m[5], m[4] );
+	}
+}
+
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
+{
+	// cubic coefficients: c0 = determinant, c1 = sum of principal 2x2 minors, c2 = trace
+	float c0 = matrix[0]*matrix[3]*matrix[5] 
+		+ 2.0f*matrix[1]*matrix[2]*matrix[4] 
+		- matrix[0]*matrix[4]*matrix[4] 
+		- matrix[3]*matrix[2]*matrix[2] 
+		- matrix[5]*matrix[1]*matrix[1];
+	float c1 = matrix[0]*matrix[3] + matrix[0]*matrix[5] + matrix[3]*matrix[5]
+		- matrix[1]*matrix[1] - matrix[2]*matrix[2] - matrix[4]*matrix[4];
+	float c2 = matrix[0] + matrix[3] + matrix[5];
+
+	// compute the quadratic coefficients
+	float a = c1 - ( 1.0f/3.0f )*c2*c2;
+	float b = ( -2.0f/27.0f )*c2*c2*c2 + ( 1.0f/3.0f )*c1*c2 - c0;
+
+	// compute the root count check
+	float Q = 0.25f*b*b + ( 1.0f/27.0f )*a*a*a;
+
+	// test the multiplicity, using FLT_EPSILON as the tolerance around Q == 0
+	if( FLT_EPSILON < Q )
+	{
+		// only one root, which implies we have a multiple of the identity
+        return Vec3( 1.0f );
+	}
+	else if( Q < -FLT_EPSILON )
+	{
+		// three distinct roots
+		float theta = std::atan2( std::sqrt( -Q ), -0.5f*b );
+		float rho = std::sqrt( 0.25f*b*b - Q );
+
+		float rt = std::pow( rho, 1.0f/3.0f );
+		float ct = std::cos( theta/3.0f );
+		float st = std::sin( theta/3.0f );
+
+		float l1 = ( 1.0f/3.0f )*c2 + 2.0f*rt*ct;
+		float l2 = ( 1.0f/3.0f )*c2 - rt*( ct + ( float )sqrt( 3.0f )*st );
+		float l3 = ( 1.0f/3.0f )*c2 - rt*( ct - ( float )sqrt( 3.0f )*st );
+
+		// pick the root with the largest absolute value
+		if( std::fabs( l2 ) > std::fabs( l1 ) )
+			l1 = l2;
+		if( std::fabs( l3 ) > std::fabs( l1 ) )
+			l1 = l3;
+
+		// get the eigenvector
+		return GetMultiplicity1Evector( matrix, l1 );
+	}
+	else // if( -FLT_EPSILON <= Q && Q <= FLT_EPSILON )
+	{
+		// two roots; rt is the real cube root of b/2 with its sign negated when b < 0
+		float rt;
+		if( b < 0.0f )
+			rt = -std::pow( -0.5f*b, 1.0f/3.0f );
+		else
+			rt = std::pow( 0.5f*b, 1.0f/3.0f );
+		
+		float l1 = ( 1.0f/3.0f )*c2 + rt;		// repeated
+		float l2 = ( 1.0f/3.0f )*c2 - 2.0f*rt;
+		
+		// use the eigenvector of whichever root is larger in absolute value
+		if( std::fabs( l1 ) > std::fabs( l2 ) )
+			return GetMultiplicity2Evector( matrix, l1 );
+		else
+			return GetMultiplicity1Evector( matrix, l2 );
+	}
+}
+
+#else
+
+#define POWER_ITERATION_COUNT 	8
+// Estimates the dominant eigenvector of a symmetric matrix by power iteration from (1,1,1); result is not unit length.
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
+{
+	Vec4 const row0( matrix[0], matrix[1], matrix[2], 0.0f );
+	Vec4 const row1( matrix[1], matrix[3], matrix[4], 0.0f );
+	Vec4 const row2( matrix[2], matrix[4], matrix[5], 0.0f );
+	Vec4 v = VEC4_CONST( 1.0f );
+	for( int i = 0; i < POWER_ITERATION_COUNT; ++i )
+	{
+		// matrix multiply: w = matrix*v
+		Vec4 w = row0*v.SplatX();
+		w = MultiplyAdd(row1, v.SplatY(), w);
+		w = MultiplyAdd(row2, v.SplatZ(), w);
+		// get max component from xyz in all channels
+		Vec4 a = Max(w.SplatX(), Max(w.SplatY(), w.SplatZ()));
+		// a zero (or NaN) scale would turn the divide into NaN, e.g. for an all-zero matrix: keep the last estimate
+		if( !( CompareAnyLessThan( a, VEC4_CONST( 0.0f ) ) || CompareAnyLessThan( VEC4_CONST( 0.0f ), a ) ) )
+			break;
+		v = w*Reciprocal(a);
+	}
+	return v.GetVec3();
+}
+
+#endif
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/maths.h b/3rdparty/bimg/3rdparty/libsquish/maths.h
new file mode 100644
index 0000000..769ae46
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/maths.h
@@ -0,0 +1,233 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_MATHS_H
+#define SQUISH_MATHS_H
+
+#include <cmath>
+#include <algorithm>
+#include "config.h"
+
+namespace squish {
+
+class Vec3
+{
+public:
+	typedef Vec3 const& Arg;
+
+	// the default constructor leaves the components uninitialised
+	Vec3()
+	{}
+
+	explicit Vec3( float s )
+	  : m_x( s ),
+		m_y( s ),
+		m_z( s )
+	{
+	}
+
+	Vec3( float x, float y, float z )
+	  : m_x( x ),
+		m_y( y ),
+		m_z( z )
+	{
+	}
+
+	float X() const { return m_x; }
+	float Y() const { return m_y; }
+	float Z() const { return m_z; }
+
+	Vec3 operator-() const
+	{
+		return Vec3( -m_x, -m_y, -m_z );
+	}
+
+	Vec3& operator+=( Arg v )
+	{
+		m_x = m_x + v.m_x;
+		m_y = m_y + v.m_y;
+		m_z = m_z + v.m_z;
+		return *this;
+	}
+
+	Vec3& operator-=( Arg v )
+	{
+		m_x = m_x - v.m_x;
+		m_y = m_y - v.m_y;
+		m_z = m_z - v.m_z;
+		return *this;
+	}
+
+	Vec3& operator*=( Arg v )
+	{
+		m_x = m_x*v.m_x;
+		m_y = m_y*v.m_y;
+		m_z = m_z*v.m_z;
+		return *this;
+	}
+
+	Vec3& operator*=( float s )
+	{
+		m_x = m_x*s;
+		m_y = m_y*s;
+		m_z = m_z*s;
+		return *this;
+	}
+
+	Vec3& operator/=( Arg v )
+	{
+		m_x = m_x/v.m_x;
+		m_y = m_y/v.m_y;
+		m_z = m_z/v.m_z;
+		return *this;
+	}
+
+	Vec3& operator/=( float s )
+	{
+		// divide once, then scale each component by the reciprocal
+		float const inverse = 1.0f/s;
+		m_x = m_x*inverse;
+		m_y = m_y*inverse;
+		m_z = m_z*inverse;
+		return *this;
+	}
+
+	friend Vec3 operator+( Arg left, Arg right )
+	{
+		// apply the compound form to a copy of the left operand
+		return Vec3( left ) += right;
+	}
+
+	friend Vec3 operator-( Arg left, Arg right )
+	{
+		// apply the compound form to a copy of the left operand
+		return Vec3( left ) -= right;
+	}
+
+	friend Vec3 operator*( Arg left, Arg right )
+	{
+		// component-wise product
+		return Vec3( left ) *= right;
+	}
+
+	friend Vec3 operator*( Arg left, float right )
+	{
+		// scale a copy of the vector
+		return Vec3( left ) *= right;
+	}
+
+	friend Vec3 operator*( float left, Arg right )
+	{
+		// scale a copy of the vector (scalar on the left)
+		return Vec3( right ) *= left;
+	}
+
+	friend Vec3 operator/( Arg left, Arg right )
+	{
+		// component-wise quotient
+		return Vec3( left ) /= right;
+	}
+
+	friend Vec3 operator/( Arg left, float right )
+	{
+		// divide a copy of the vector by the scalar
+		return Vec3( left ) /= right;
+	}
+
+	friend float Dot( Arg left, Arg right )
+	{
+		return left.m_x*right.m_x + left.m_y*right.m_y + left.m_z*right.m_z;
+	}
+
+	friend Vec3 Min( Arg left, Arg right )
+	{
+		// component-wise minimum
+		float const x = std::min( left.m_x, right.m_x );
+		float const y = std::min( left.m_y, right.m_y );
+		float const z = std::min( left.m_z, right.m_z );
+		return Vec3( x, y, z );
+	}
+
+	friend Vec3 Max( Arg left, Arg right )
+	{
+		// component-wise maximum
+		float const x = std::max( left.m_x, right.m_x );
+		float const y = std::max( left.m_y, right.m_y );
+		float const z = std::max( left.m_z, right.m_z );
+		return Vec3( x, y, z );
+	}
+
+	friend Vec3 Truncate( Arg v )
+	{
+		// round each component towards zero
+		float const x = v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x );
+		float const y = v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y );
+		float const z = v.m_z > 0.0f ? std::floor( v.m_z ) : std::ceil( v.m_z );
+		return Vec3( x, y, z );
+	}
+
+private:
+	float m_x;
+	float m_y;
+	float m_z;
+};
+
+inline float LengthSquared( Vec3::Arg v )
+{
+	return Dot( v, v );	// dot product of v with itself, i.e. its squared length
+}
+
+class Sym3x3
+{
+public:
+	Sym3x3()	// leaves all six entries uninitialised
+	{
+	}
+
+	Sym3x3( float s )	// fills all six entries with s
+	{
+		for( int i = 0; i < 6; ++i )
+			m_x[i] = s;
+	}
+
+	float operator[]( int index ) const	// read an entry; index is not range-checked
+	{
+		return m_x[index];
+	}
+
+	float& operator[]( int index )	// writable entry; index is not range-checked
+	{
+		return m_x[index];
+	}
+
+private:
+	float m_x[6];	// packed upper triangle: xx, xy, xz, yy, yz, zz (as filled by ComputeWeightedCovariance)
+};
+
+Sym3x3 ComputeWeightedCovariance( int n, Vec3 const* points, float const* weights );	// weighted covariance of n points about their weighted centroid
+Vec3 ComputePrincipleComponent( Sym3x3 const& matrix );	// principal-axis estimate for a symmetric matrix
+
+} // namespace squish
+
+#endif // ndef SQUISH_MATHS_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/rangefit.cpp b/3rdparty/bimg/3rdparty/libsquish/rangefit.cpp
new file mode 100644
index 0000000..3fca124
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/rangefit.cpp
@@ -0,0 +1,201 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "rangefit.h"
+#include "colourset.h"
+#include "colourblock.h"
+#include <cfloat>
+
+namespace squish {
+
+RangeFit::RangeFit( ColourSet const* colours, int flags, float* metric ) 
+  : ColourFit( colours, flags )
+{
+	// per-channel error scale; uniform unless the caller supplies one
+	// (old perceptual = 0.2126f, 0.7152f, 0.0722f)
+	m_metric = metric ? Vec3( metric[0], metric[1], metric[2] ) : Vec3( 1.0f );
+
+	// nothing has been compressed yet, so any candidate beats this
+	m_besterror = FLT_MAX;
+
+	// pull the point data out of the colour set
+	int const count = m_colours->GetCount();
+	Vec3 const* values = m_colours->GetPoints();
+	float const* weights = m_colours->GetWeights();
+
+	// principal axis of the weighted covariance of the points
+	Sym3x3 const covariance = ComputeWeightedCovariance( count, values, weights );
+	Vec3 const axis = ComputePrincipleComponent( covariance );
+
+	// the endpoints are the points with the smallest and largest
+	// projection onto that axis (both stay zero for an empty set)
+	Vec3 start( 0.0f );
+	Vec3 end( 0.0f );
+	if( count > 0 )
+	{
+		// seed both extremes with the first point
+		start = values[0];
+		end = values[0];
+		float lowest = Dot( values[0], axis );
+		float highest = lowest;
+
+		// scan the remaining points
+		for( int k = 1; k < count; ++k )
+		{
+			float const projection = Dot( values[k], axis );
+			if( projection < lowest )
+			{
+				start = values[k];
+				lowest = projection;
+			}
+			else if( projection > highest )
+			{
+				end = values[k];
+				highest = projection;
+			}
+		}
+	}
+
+	// clamp the output to [0, 1]
+	Vec3 const zero( 0.0f );
+	Vec3 const one( 1.0f );
+	start = Min( one, Max( zero, start ) );
+	end = Min( one, Max( zero, end ) );
+
+	// snap each endpoint to the nearest 31/63/31-step grid value and store it
+	Vec3 const half( 0.5f );
+	Vec3 const grid( 31.0f, 63.0f, 31.0f );
+	Vec3 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f );
+
+	m_start = Truncate( grid*start + half )*gridrcp;
+	m_end = Truncate( grid*end + half )*gridrcp;
+}
+
+void RangeFit::Compress3( void* block )
+{
+	// the points to be matched
+	Vec3 const* values = m_colours->GetPoints();
+	int const count = m_colours->GetCount();
+
+	// three-entry codebook: the two endpoints and their midpoint
+	Vec3 codes[3];
+	codes[0] = m_start;
+	codes[1] = m_end;
+	codes[2] = 0.5f*m_start + 0.5f*m_end;
+
+	// assign every point its nearest code, summing the squared distances
+	u8 closest[16];
+	float error = 0.0f;
+	for( int p = 0; p < count; ++p )
+	{
+		int nearest = 0;
+		float nearestDist = FLT_MAX;
+		for( int c = 0; c < 3; ++c )
+		{
+			// distance is measured after scaling by the metric
+			float const d = LengthSquared( m_metric*( values[p] - codes[c] ) );
+			if( d < nearestDist )
+			{
+				nearest = c;
+				nearestDist = d;
+			}
+		}
+
+		// record the winner for this point
+		closest[p] = ( u8 )nearest;
+
+		// accumulate the error
+		error += nearestDist;
+	}
+
+	// leave the output block untouched unless this fit is strictly better
+	if( !( error < m_besterror ) )
+		return;
+
+	// expand the per-point indices to per-pixel indices
+	u8 indices[16];
+	m_colours->RemapIndices( closest, indices );
+
+	// write out the endpoints and indices
+	WriteColourBlock3( m_start, m_end, indices, block );
+
+	// this fit is the new best
+	m_besterror = error;
+}
+
+void RangeFit::Compress4( void* block )
+{
+	// the points to be matched
+	Vec3 const* values = m_colours->GetPoints();
+	int const count = m_colours->GetCount();
+
+	// four-entry codebook: the two endpoints and the two third-way blends
+	Vec3 codes[4];
+	codes[0] = m_start;
+	codes[1] = m_end;
+	codes[2] = ( 2.0f/3.0f )*m_start + ( 1.0f/3.0f )*m_end;
+	codes[3] = ( 1.0f/3.0f )*m_start + ( 2.0f/3.0f )*m_end;
+
+	// assign every point its nearest code, summing the squared distances
+	u8 closest[16];
+	float error = 0.0f;
+	for( int p = 0; p < count; ++p )
+	{
+		int nearest = 0;
+		float nearestDist = FLT_MAX;
+		for( int c = 0; c < 4; ++c )
+		{
+			// distance is measured after scaling by the metric
+			float const d = LengthSquared( m_metric*( values[p] - codes[c] ) );
+			if( d < nearestDist )
+			{
+				nearest = c;
+				nearestDist = d;
+			}
+		}
+
+		// record the winner for this point
+		closest[p] = ( u8 )nearest;
+
+		// accumulate the error
+		error += nearestDist;
+	}
+
+	// leave the output block untouched unless this fit is strictly better
+	if( !( error < m_besterror ) )
+		return;
+
+	// expand the per-point indices to per-pixel indices
+	u8 indices[16];
+	m_colours->RemapIndices( closest, indices );
+
+	// write out the endpoints and indices
+	WriteColourBlock4( m_start, m_end, indices, block );
+
+	// this fit is the new best
+	m_besterror = error;
+}
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/rangefit.h b/3rdparty/bimg/3rdparty/libsquish/rangefit.h
new file mode 100644
index 0000000..e293bdc
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/rangefit.h
@@ -0,0 +1,54 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_RANGEFIT_H
+#define SQUISH_RANGEFIT_H
+
+#include "squish.h"
+#include "colourfit.h"
+#include "maths.h"
+
+namespace squish {
+
+class ColourSet;
+
+class RangeFit : public ColourFit
+{
+public:
+	RangeFit( ColourSet const* colours, int flags, float* metric );	// metric may be null (uniform weighting)
+	
+private:
+	virtual void Compress3( void* block );	// fit with a 3-entry codebook
+	virtual void Compress4( void* block );	// fit with a 4-entry codebook
+	
+	Vec3 m_metric;	// per-channel scale applied before measuring error
+	Vec3 m_start;	// first endpoint, snapped to the 31/63/31-step grid
+	Vec3 m_end;	// second endpoint, snapped to the 31/63/31-step grid
+	float m_besterror;	// lowest error written so far (FLT_MAX initially)
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_RANGEFIT_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/simd.h b/3rdparty/bimg/3rdparty/libsquish/simd.h
new file mode 100644
index 0000000..92965e0
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/simd.h
@@ -0,0 +1,32 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_H
+#define SQUISH_SIMD_H
+
+#include "maths.h"
+#include "simd_float.h"
+
+#endif // ndef SQUISH_SIMD_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/simd_float.h b/3rdparty/bimg/3rdparty/libsquish/simd_float.h
new file mode 100644
index 0000000..e6351b8
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/simd_float.h
@@ -0,0 +1,183 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SIMD_FLOAT_H
+#define SQUISH_SIMD_FLOAT_H
+
+#include <algorithm>
+
+namespace squish {
+
+#define VEC4_CONST( X ) Vec4( X )
+
+class Vec4
+{
+public:
+	typedef Vec4 const& Arg;
+
+	Vec4() {}	// leaves the components uninitialised
+		
+	explicit Vec4( float s )	// all four components set to s
+	  : m_x( s ),
+		m_y( s ),
+		m_z( s ),
+		m_w( s )
+	{
+	}
+	
+	Vec4( float x, float y, float z, float w )
+	  : m_x( x ),
+		m_y( y ),
+		m_z( z ),
+		m_w( w )
+	{
+	}
+	
+	Vec3 GetVec3() const	// drops the w component
+	{
+		return Vec3( m_x, m_y, m_z );
+	}
+	
+	Vec4 SplatX() const { return Vec4( m_x ); }	// x copied to all four components
+	Vec4 SplatY() const { return Vec4( m_y ); }	// y copied to all four components
+	Vec4 SplatZ() const { return Vec4( m_z ); }	// z copied to all four components
+	Vec4 SplatW() const { return Vec4( m_w ); }	// w copied to all four components
+
+	Vec4& operator+=( Arg v )
+	{
+		m_x += v.m_x;
+		m_y += v.m_y;
+		m_z += v.m_z;
+		m_w += v.m_w;
+		return *this;
+	}
+	
+	Vec4& operator-=( Arg v )
+	{
+		m_x -= v.m_x;
+		m_y -= v.m_y;
+		m_z -= v.m_z;
+		m_w -= v.m_w;
+		return *this;
+	}
+	
+	Vec4& operator*=( Arg v )	// component-wise product
+	{
+		m_x *= v.m_x;
+		m_y *= v.m_y;
+		m_z *= v.m_z;
+		m_w *= v.m_w;
+		return *this;
+	}
+	
+	friend Vec4 operator+( Vec4::Arg left, Vec4::Arg right  )
+	{
+		Vec4 copy( left );
+		return copy += right;
+	}
+	
+	friend Vec4 operator-( Vec4::Arg left, Vec4::Arg right  )
+	{
+		Vec4 copy( left );
+		return copy -= right;
+	}
+	
+	friend Vec4 operator*( Vec4::Arg left, Vec4::Arg right  )
+	{
+		Vec4 copy( left );
+		return copy *= right;
+	}
+	
+	//! Returns a*b + c, computed component-wise
+	friend Vec4 MultiplyAdd( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return a*b + c;
+	}
+	
+	//! Returns -( a*b - c ), computed component-wise as c - a*b
+	friend Vec4 NegativeMultiplySubtract( Vec4::Arg a, Vec4::Arg b, Vec4::Arg c )
+	{
+		return c - a*b;
+	}
+	
+	friend Vec4 Reciprocal( Vec4::Arg v )	// 1/component; no guard against zero components
+	{
+		return Vec4( 
+			1.0f/v.m_x, 
+			1.0f/v.m_y, 
+			1.0f/v.m_z, 
+			1.0f/v.m_w 
+		);
+	}
+	
+	friend Vec4 Min( Vec4::Arg left, Vec4::Arg right )	// component-wise minimum
+	{
+		return Vec4( 
+			std::min( left.m_x, right.m_x ), 
+			std::min( left.m_y, right.m_y ), 
+			std::min( left.m_z, right.m_z ), 
+			std::min( left.m_w, right.m_w ) 
+		);
+	}
+	
+	friend Vec4 Max( Vec4::Arg left, Vec4::Arg right )	// component-wise maximum
+	{
+		return Vec4( 
+			std::max( left.m_x, right.m_x ), 
+			std::max( left.m_y, right.m_y ), 
+			std::max( left.m_z, right.m_z ), 
+			std::max( left.m_w, right.m_w ) 
+		);
+	}
+	
+	friend Vec4 Truncate( Vec4::Arg v )	// rounds each component towards zero
+	{
+		return Vec4(
+			v.m_x > 0.0f ? std::floor( v.m_x ) : std::ceil( v.m_x ), 
+			v.m_y > 0.0f ? std::floor( v.m_y ) : std::ceil( v.m_y ), 
+			v.m_z > 0.0f ? std::floor( v.m_z ) : std::ceil( v.m_z ),
+			v.m_w > 0.0f ? std::floor( v.m_w ) : std::ceil( v.m_w )
+		);
+	}
+	
+	friend bool CompareAnyLessThan( Vec4::Arg left, Vec4::Arg right ) 	// true if any component of left is below right's
+	{
+		return left.m_x < right.m_x
+			|| left.m_y < right.m_y
+			|| left.m_z < right.m_z
+			|| left.m_w < right.m_w;
+	}
+	
+private:
+	float m_x;
+	float m_y;
+	float m_z;
+	float m_w;
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SIMD_FLOAT_H
+
diff --git a/3rdparty/bimg/3rdparty/libsquish/singlecolourfit.cpp b/3rdparty/bimg/3rdparty/libsquish/singlecolourfit.cpp
new file mode 100644
index 0000000..e8a0117
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/singlecolourfit.cpp
@@ -0,0 +1,172 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "singlecolourfit.h"
+#include "colourset.h"
+#include "colourblock.h"
+
+namespace squish {
+
+struct SourceBlock  // one precomputed candidate for a single colour channel
+{
+	u8 start;  // start endpoint; ComputeEndPoints divides it by 31 or 63 to get a float
+	u8 end;  // end endpoint, scaled the same way as start
+	u8 error;  // error value for this candidate; ComputeEndPoints squares and sums it over channels
+};
+
+struct SingleColourLookup  // table entry for one 8-bit channel value
+{
+	SourceBlock sources[2];  // two candidates; ComputeEndPoints tries both and records 2*position as the colour index
+};
+
+#include "singlecolourlookup.inl"
+
+static int FloatToInt( float a, int limit )  // converts a to an int and clamps the result to [0, limit]
+{
+	// adding 0.5 before the truncating cast gives round-to-nearest for non-negative input
+	int i = ( int )( a + 0.5f );
+
+	// clamp to [0, limit]; negative results become 0
+	if( i < 0 )
+		i = 0;
+	else if( i > limit )
+		i = limit; 
+
+	// return the clamped value
+	return i;
+}
+
+SingleColourFit::SingleColourFit( ColourSet const* colours, int flags )  // forwards both arguments to ColourFit, then caches the colour to fit
+  : ColourFit( colours, flags )
+{
+	// read the first point of the set and quantise each component (scaled by 255) to 0..255
+	Vec3 const* values = m_colours->GetPoints();
+	m_colour[0] = ( u8 )FloatToInt( 255.0f*values->X(), 255 );
+	m_colour[1] = ( u8 )FloatToInt( 255.0f*values->Y(), 255 );
+	m_colour[2] = ( u8 )FloatToInt( 255.0f*values->Z(), 255 );
+		
+	// start with the largest possible best error so the first Compress3/Compress4 result is accepted
+	m_besterror = INT_MAX;
+}
+
+void SingleColourFit::Compress3( void* block )  // fits the cached colour with the *_3 tables and writes block if the error improves
+{
+	// one lookup table per channel, in channel order 0, 1, 2 (the 6-entry-range table serves channel 1)
+	SingleColourLookup const* const lookups[] = 
+	{
+		lookup_5_3, 
+		lookup_6_3, 
+		lookup_5_3
+	};
+	
+	// fills m_start, m_end, m_index and m_error
+	ComputeEndPoints( lookups );
+	
+	// only write the block when this result beats the best error recorded so far
+	if( m_error < m_besterror )
+	{
+		// expand the single index into 16 entries via the colour set (presumably one per pixel - confirm in colourset.h)
+		u8 indices[16];
+		m_colours->RemapIndices( &m_index, indices );
+		
+		// hand the endpoints and indices to WriteColourBlock3, which fills block
+		WriteColourBlock3( m_start, m_end, indices, block );
+
+		// remember the error of the block just written
+		m_besterror = m_error;
+	}
+}
+
+void SingleColourFit::Compress4( void* block )  // fits the cached colour with the *_4 tables and writes block if the error improves
+{
+	// one lookup table per channel, in channel order 0, 1, 2 (the 6-entry-range table serves channel 1)
+	SingleColourLookup const* const lookups[] = 
+	{
+		lookup_5_4, 
+		lookup_6_4, 
+		lookup_5_4
+	};
+	
+	// fills m_start, m_end, m_index and m_error
+	ComputeEndPoints( lookups );
+	
+	// only write the block when this result beats the best error recorded so far
+	if( m_error < m_besterror )
+	{
+		// expand the single index into 16 entries via the colour set (presumably one per pixel - confirm in colourset.h)
+		u8 indices[16];
+		m_colours->RemapIndices( &m_index, indices );
+		
+		// hand the endpoints and indices to WriteColourBlock4, which fills block
+		WriteColourBlock4( m_start, m_end, indices, block );
+
+		// remember the error of the block just written
+		m_besterror = m_error;
+	}
+}
+
+void SingleColourFit::ComputeEndPoints( SingleColourLookup const* const* lookups )  // selects m_start, m_end, m_index and m_error from three per-channel tables
+{
+	// try both candidates stored in every table entry (position 0 and position 1)
+	m_error = INT_MAX;  // any candidate whose summed error is below INT_MAX replaces this
+	for( int index = 0; index < 2; ++index )
+	{
+		// gather the candidate at this position for all three channels and total its error
+		SourceBlock const* sources[3];  // chosen table entry per channel
+		int error = 0;
+		for( int channel = 0; channel < 3; ++channel )
+		{
+			// pick this channel's table and use the cached 8-bit channel value as the row
+			SingleColourLookup const* lookup = lookups[channel];
+			int target = m_colour[channel];
+			
+			// keep a pointer to the candidate so its endpoints can be read below
+			sources[channel] = lookup[target].sources + index;
+			
+			// add the squared per-channel error to the running total
+			int diff = sources[channel]->error;
+			error += diff*diff;			
+		}
+		
+		// strict comparison: on a tie the earlier position is kept
+		if( error < m_error )
+		{
+			m_start = Vec3(  // channels 0 and 2 are divided by 31, channel 1 by 63
+				( float )sources[0]->start/31.0f, 
+				( float )sources[1]->start/63.0f, 
+				( float )sources[2]->start/31.0f
+			);
+			m_end = Vec3(  // same per-channel divisors as m_start
+				( float )sources[0]->end/31.0f, 
+				( float )sources[1]->end/63.0f, 
+				( float )sources[2]->end/31.0f
+			);
+			m_index = ( u8 )( 2*index );  // stored index is 0 or 2
+			m_error = error;
+		}
+	}
+}
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/singlecolourfit.h b/3rdparty/bimg/3rdparty/libsquish/singlecolourfit.h
new file mode 100644
index 0000000..54ec17e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/singlecolourfit.h
@@ -0,0 +1,58 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_SINGLECOLOURFIT_H
+#define SQUISH_SINGLECOLOURFIT_H
+
+#include "squish.h"
+#include "colourfit.h"
+
+namespace squish {
+
+class ColourSet;
+struct SingleColourLookup;
+
+class SingleColourFit : public ColourFit  // ColourFit variant that works from per-channel SingleColourLookup tables
+{
+public:
+	SingleColourFit( ColourSet const* colours, int flags );  // colours and flags are passed on to ColourFit
+	
+private:
+	virtual void Compress3( void* block );  // presumably overrides a ColourFit hook - confirm in colourfit.h
+	virtual void Compress4( void* block );  // presumably overrides a ColourFit hook - confirm in colourfit.h
+	
+	void ComputeEndPoints( SingleColourLookup const* const* lookups );  // lookups: array of per-channel table pointers
+	
+	u8 m_colour[3];  // the colour being fitted, one byte per channel
+	Vec3 m_start;  // start endpoint of the current candidate
+	Vec3 m_end;  // end endpoint of the current candidate
+	u8 m_index;  // colour index of the current candidate
+	int m_error;  // error of the current candidate
+	int m_besterror;  // error of the best candidate accepted so far
+};
+
+} // namespace squish
+
+#endif // ndef SQUISH_SINGLECOLOURFIT_H
diff --git a/3rdparty/bimg/3rdparty/libsquish/singlecolourlookup.inl b/3rdparty/bimg/3rdparty/libsquish/singlecolourlookup.inl
new file mode 100644
index 0000000..5e91174
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/singlecolourlookup.inl
@@ -0,0 +1,1064 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+
+static SingleColourLookup const lookup_5_3[] = 
+{
+	{ { { 0, 0, 0 }, { 0, 0, 0 } } },
+	{ { { 0, 0, 1 }, { 0, 0, 1 } } },
+	{ { { 0, 0, 2 }, { 0, 0, 2 } } },
+	{ { { 0, 0, 3 }, { 0, 1, 1 } } },
+	{ { { 0, 0, 4 }, { 0, 1, 0 } } },
+	{ { { 1, 0, 3 }, { 0, 1, 1 } } },
+	{ { { 1, 0, 2 }, { 0, 1, 2 } } },
+	{ { { 1, 0, 1 }, { 0, 2, 1 } } },
+	{ { { 1, 0, 0 }, { 0, 2, 0 } } },
+	{ { { 1, 0, 1 }, { 0, 2, 1 } } },
+	{ { { 1, 0, 2 }, { 0, 2, 2 } } },
+	{ { { 1, 0, 3 }, { 0, 3, 1 } } },
+	{ { { 1, 0, 4 }, { 0, 3, 0 } } },
+	{ { { 2, 0, 3 }, { 0, 3, 1 } } },
+	{ { { 2, 0, 2 }, { 0, 3, 2 } } },
+	{ { { 2, 0, 1 }, { 0, 4, 1 } } },
+	{ { { 2, 0, 0 }, { 0, 4, 0 } } },
+	{ { { 2, 0, 1 }, { 0, 4, 1 } } },
+	{ { { 2, 0, 2 }, { 0, 4, 2 } } },
+	{ { { 2, 0, 3 }, { 0, 5, 1 } } },
+	{ { { 2, 0, 4 }, { 0, 5, 0 } } },
+	{ { { 3, 0, 3 }, { 0, 5, 1 } } },
+	{ { { 3, 0, 2 }, { 0, 5, 2 } } },
+	{ { { 3, 0, 1 }, { 0, 6, 1 } } },
+	{ { { 3, 0, 0 }, { 0, 6, 0 } } },
+	{ { { 3, 0, 1 }, { 0, 6, 1 } } },
+	{ { { 3, 0, 2 }, { 0, 6, 2 } } },
+	{ { { 3, 0, 3 }, { 0, 7, 1 } } },
+	{ { { 3, 0, 4 }, { 0, 7, 0 } } },
+	{ { { 4, 0, 4 }, { 0, 7, 1 } } },
+	{ { { 4, 0, 3 }, { 0, 7, 2 } } },
+	{ { { 4, 0, 2 }, { 1, 7, 1 } } },
+	{ { { 4, 0, 1 }, { 1, 7, 0 } } },
+	{ { { 4, 0, 0 }, { 0, 8, 0 } } },
+	{ { { 4, 0, 1 }, { 0, 8, 1 } } },
+	{ { { 4, 0, 2 }, { 2, 7, 1 } } },
+	{ { { 4, 0, 3 }, { 2, 7, 0 } } },
+	{ { { 4, 0, 4 }, { 0, 9, 0 } } },
+	{ { { 5, 0, 3 }, { 0, 9, 1 } } },
+	{ { { 5, 0, 2 }, { 3, 7, 1 } } },
+	{ { { 5, 0, 1 }, { 3, 7, 0 } } },
+	{ { { 5, 0, 0 }, { 0, 10, 0 } } },
+	{ { { 5, 0, 1 }, { 0, 10, 1 } } },
+	{ { { 5, 0, 2 }, { 0, 10, 2 } } },
+	{ { { 5, 0, 3 }, { 0, 11, 1 } } },
+	{ { { 5, 0, 4 }, { 0, 11, 0 } } },
+	{ { { 6, 0, 3 }, { 0, 11, 1 } } },
+	{ { { 6, 0, 2 }, { 0, 11, 2 } } },
+	{ { { 6, 0, 1 }, { 0, 12, 1 } } },
+	{ { { 6, 0, 0 }, { 0, 12, 0 } } },
+	{ { { 6, 0, 1 }, { 0, 12, 1 } } },
+	{ { { 6, 0, 2 }, { 0, 12, 2 } } },
+	{ { { 6, 0, 3 }, { 0, 13, 1 } } },
+	{ { { 6, 0, 4 }, { 0, 13, 0 } } },
+	{ { { 7, 0, 3 }, { 0, 13, 1 } } },
+	{ { { 7, 0, 2 }, { 0, 13, 2 } } },
+	{ { { 7, 0, 1 }, { 0, 14, 1 } } },
+	{ { { 7, 0, 0 }, { 0, 14, 0 } } },
+	{ { { 7, 0, 1 }, { 0, 14, 1 } } },
+	{ { { 7, 0, 2 }, { 0, 14, 2 } } },
+	{ { { 7, 0, 3 }, { 0, 15, 1 } } },
+	{ { { 7, 0, 4 }, { 0, 15, 0 } } },
+	{ { { 8, 0, 4 }, { 0, 15, 1 } } },
+	{ { { 8, 0, 3 }, { 0, 15, 2 } } },
+	{ { { 8, 0, 2 }, { 1, 15, 1 } } },
+	{ { { 8, 0, 1 }, { 1, 15, 0 } } },
+	{ { { 8, 0, 0 }, { 0, 16, 0 } } },
+	{ { { 8, 0, 1 }, { 0, 16, 1 } } },
+	{ { { 8, 0, 2 }, { 2, 15, 1 } } },
+	{ { { 8, 0, 3 }, { 2, 15, 0 } } },
+	{ { { 8, 0, 4 }, { 0, 17, 0 } } },
+	{ { { 9, 0, 3 }, { 0, 17, 1 } } },
+	{ { { 9, 0, 2 }, { 3, 15, 1 } } },
+	{ { { 9, 0, 1 }, { 3, 15, 0 } } },
+	{ { { 9, 0, 0 }, { 0, 18, 0 } } },
+	{ { { 9, 0, 1 }, { 0, 18, 1 } } },
+	{ { { 9, 0, 2 }, { 0, 18, 2 } } },
+	{ { { 9, 0, 3 }, { 0, 19, 1 } } },
+	{ { { 9, 0, 4 }, { 0, 19, 0 } } },
+	{ { { 10, 0, 3 }, { 0, 19, 1 } } },
+	{ { { 10, 0, 2 }, { 0, 19, 2 } } },
+	{ { { 10, 0, 1 }, { 0, 20, 1 } } },
+	{ { { 10, 0, 0 }, { 0, 20, 0 } } },
+	{ { { 10, 0, 1 }, { 0, 20, 1 } } },
+	{ { { 10, 0, 2 }, { 0, 20, 2 } } },
+	{ { { 10, 0, 3 }, { 0, 21, 1 } } },
+	{ { { 10, 0, 4 }, { 0, 21, 0 } } },
+	{ { { 11, 0, 3 }, { 0, 21, 1 } } },
+	{ { { 11, 0, 2 }, { 0, 21, 2 } } },
+	{ { { 11, 0, 1 }, { 0, 22, 1 } } },
+	{ { { 11, 0, 0 }, { 0, 22, 0 } } },
+	{ { { 11, 0, 1 }, { 0, 22, 1 } } },
+	{ { { 11, 0, 2 }, { 0, 22, 2 } } },
+	{ { { 11, 0, 3 }, { 0, 23, 1 } } },
+	{ { { 11, 0, 4 }, { 0, 23, 0 } } },
+	{ { { 12, 0, 4 }, { 0, 23, 1 } } },
+	{ { { 12, 0, 3 }, { 0, 23, 2 } } },
+	{ { { 12, 0, 2 }, { 1, 23, 1 } } },
+	{ { { 12, 0, 1 }, { 1, 23, 0 } } },
+	{ { { 12, 0, 0 }, { 0, 24, 0 } } },
+	{ { { 12, 0, 1 }, { 0, 24, 1 } } },
+	{ { { 12, 0, 2 }, { 2, 23, 1 } } },
+	{ { { 12, 0, 3 }, { 2, 23, 0 } } },
+	{ { { 12, 0, 4 }, { 0, 25, 0 } } },
+	{ { { 13, 0, 3 }, { 0, 25, 1 } } },
+	{ { { 13, 0, 2 }, { 3, 23, 1 } } },
+	{ { { 13, 0, 1 }, { 3, 23, 0 } } },
+	{ { { 13, 0, 0 }, { 0, 26, 0 } } },
+	{ { { 13, 0, 1 }, { 0, 26, 1 } } },
+	{ { { 13, 0, 2 }, { 0, 26, 2 } } },
+	{ { { 13, 0, 3 }, { 0, 27, 1 } } },
+	{ { { 13, 0, 4 }, { 0, 27, 0 } } },
+	{ { { 14, 0, 3 }, { 0, 27, 1 } } },
+	{ { { 14, 0, 2 }, { 0, 27, 2 } } },
+	{ { { 14, 0, 1 }, { 0, 28, 1 } } },
+	{ { { 14, 0, 0 }, { 0, 28, 0 } } },
+	{ { { 14, 0, 1 }, { 0, 28, 1 } } },
+	{ { { 14, 0, 2 }, { 0, 28, 2 } } },
+	{ { { 14, 0, 3 }, { 0, 29, 1 } } },
+	{ { { 14, 0, 4 }, { 0, 29, 0 } } },
+	{ { { 15, 0, 3 }, { 0, 29, 1 } } },
+	{ { { 15, 0, 2 }, { 0, 29, 2 } } },
+	{ { { 15, 0, 1 }, { 0, 30, 1 } } },
+	{ { { 15, 0, 0 }, { 0, 30, 0 } } },
+	{ { { 15, 0, 1 }, { 0, 30, 1 } } },
+	{ { { 15, 0, 2 }, { 0, 30, 2 } } },
+	{ { { 15, 0, 3 }, { 0, 31, 1 } } },
+	{ { { 15, 0, 4 }, { 0, 31, 0 } } },
+	{ { { 16, 0, 4 }, { 0, 31, 1 } } },
+	{ { { 16, 0, 3 }, { 0, 31, 2 } } },
+	{ { { 16, 0, 2 }, { 1, 31, 1 } } },
+	{ { { 16, 0, 1 }, { 1, 31, 0 } } },
+	{ { { 16, 0, 0 }, { 4, 28, 0 } } },
+	{ { { 16, 0, 1 }, { 4, 28, 1 } } },
+	{ { { 16, 0, 2 }, { 2, 31, 1 } } },
+	{ { { 16, 0, 3 }, { 2, 31, 0 } } },
+	{ { { 16, 0, 4 }, { 4, 29, 0 } } },
+	{ { { 17, 0, 3 }, { 4, 29, 1 } } },
+	{ { { 17, 0, 2 }, { 3, 31, 1 } } },
+	{ { { 17, 0, 1 }, { 3, 31, 0 } } },
+	{ { { 17, 0, 0 }, { 4, 30, 0 } } },
+	{ { { 17, 0, 1 }, { 4, 30, 1 } } },
+	{ { { 17, 0, 2 }, { 4, 30, 2 } } },
+	{ { { 17, 0, 3 }, { 4, 31, 1 } } },
+	{ { { 17, 0, 4 }, { 4, 31, 0 } } },
+	{ { { 18, 0, 3 }, { 4, 31, 1 } } },
+	{ { { 18, 0, 2 }, { 4, 31, 2 } } },
+	{ { { 18, 0, 1 }, { 5, 31, 1 } } },
+	{ { { 18, 0, 0 }, { 5, 31, 0 } } },
+	{ { { 18, 0, 1 }, { 5, 31, 1 } } },
+	{ { { 18, 0, 2 }, { 5, 31, 2 } } },
+	{ { { 18, 0, 3 }, { 6, 31, 1 } } },
+	{ { { 18, 0, 4 }, { 6, 31, 0 } } },
+	{ { { 19, 0, 3 }, { 6, 31, 1 } } },
+	{ { { 19, 0, 2 }, { 6, 31, 2 } } },
+	{ { { 19, 0, 1 }, { 7, 31, 1 } } },
+	{ { { 19, 0, 0 }, { 7, 31, 0 } } },
+	{ { { 19, 0, 1 }, { 7, 31, 1 } } },
+	{ { { 19, 0, 2 }, { 7, 31, 2 } } },
+	{ { { 19, 0, 3 }, { 8, 31, 1 } } },
+	{ { { 19, 0, 4 }, { 8, 31, 0 } } },
+	{ { { 20, 0, 4 }, { 8, 31, 1 } } },
+	{ { { 20, 0, 3 }, { 8, 31, 2 } } },
+	{ { { 20, 0, 2 }, { 9, 31, 1 } } },
+	{ { { 20, 0, 1 }, { 9, 31, 0 } } },
+	{ { { 20, 0, 0 }, { 12, 28, 0 } } },
+	{ { { 20, 0, 1 }, { 12, 28, 1 } } },
+	{ { { 20, 0, 2 }, { 10, 31, 1 } } },
+	{ { { 20, 0, 3 }, { 10, 31, 0 } } },
+	{ { { 20, 0, 4 }, { 12, 29, 0 } } },
+	{ { { 21, 0, 3 }, { 12, 29, 1 } } },
+	{ { { 21, 0, 2 }, { 11, 31, 1 } } },
+	{ { { 21, 0, 1 }, { 11, 31, 0 } } },
+	{ { { 21, 0, 0 }, { 12, 30, 0 } } },
+	{ { { 21, 0, 1 }, { 12, 30, 1 } } },
+	{ { { 21, 0, 2 }, { 12, 30, 2 } } },
+	{ { { 21, 0, 3 }, { 12, 31, 1 } } },
+	{ { { 21, 0, 4 }, { 12, 31, 0 } } },
+	{ { { 22, 0, 3 }, { 12, 31, 1 } } },
+	{ { { 22, 0, 2 }, { 12, 31, 2 } } },
+	{ { { 22, 0, 1 }, { 13, 31, 1 } } },
+	{ { { 22, 0, 0 }, { 13, 31, 0 } } },
+	{ { { 22, 0, 1 }, { 13, 31, 1 } } },
+	{ { { 22, 0, 2 }, { 13, 31, 2 } } },
+	{ { { 22, 0, 3 }, { 14, 31, 1 } } },
+	{ { { 22, 0, 4 }, { 14, 31, 0 } } },
+	{ { { 23, 0, 3 }, { 14, 31, 1 } } },
+	{ { { 23, 0, 2 }, { 14, 31, 2 } } },
+	{ { { 23, 0, 1 }, { 15, 31, 1 } } },
+	{ { { 23, 0, 0 }, { 15, 31, 0 } } },
+	{ { { 23, 0, 1 }, { 15, 31, 1 } } },
+	{ { { 23, 0, 2 }, { 15, 31, 2 } } },
+	{ { { 23, 0, 3 }, { 16, 31, 1 } } },
+	{ { { 23, 0, 4 }, { 16, 31, 0 } } },
+	{ { { 24, 0, 4 }, { 16, 31, 1 } } },
+	{ { { 24, 0, 3 }, { 16, 31, 2 } } },
+	{ { { 24, 0, 2 }, { 17, 31, 1 } } },
+	{ { { 24, 0, 1 }, { 17, 31, 0 } } },
+	{ { { 24, 0, 0 }, { 20, 28, 0 } } },
+	{ { { 24, 0, 1 }, { 20, 28, 1 } } },
+	{ { { 24, 0, 2 }, { 18, 31, 1 } } },
+	{ { { 24, 0, 3 }, { 18, 31, 0 } } },
+	{ { { 24, 0, 4 }, { 20, 29, 0 } } },
+	{ { { 25, 0, 3 }, { 20, 29, 1 } } },
+	{ { { 25, 0, 2 }, { 19, 31, 1 } } },
+	{ { { 25, 0, 1 }, { 19, 31, 0 } } },
+	{ { { 25, 0, 0 }, { 20, 30, 0 } } },
+	{ { { 25, 0, 1 }, { 20, 30, 1 } } },
+	{ { { 25, 0, 2 }, { 20, 30, 2 } } },
+	{ { { 25, 0, 3 }, { 20, 31, 1 } } },
+	{ { { 25, 0, 4 }, { 20, 31, 0 } } },
+	{ { { 26, 0, 3 }, { 20, 31, 1 } } },
+	{ { { 26, 0, 2 }, { 20, 31, 2 } } },
+	{ { { 26, 0, 1 }, { 21, 31, 1 } } },
+	{ { { 26, 0, 0 }, { 21, 31, 0 } } },
+	{ { { 26, 0, 1 }, { 21, 31, 1 } } },
+	{ { { 26, 0, 2 }, { 21, 31, 2 } } },
+	{ { { 26, 0, 3 }, { 22, 31, 1 } } },
+	{ { { 26, 0, 4 }, { 22, 31, 0 } } },
+	{ { { 27, 0, 3 }, { 22, 31, 1 } } },
+	{ { { 27, 0, 2 }, { 22, 31, 2 } } },
+	{ { { 27, 0, 1 }, { 23, 31, 1 } } },
+	{ { { 27, 0, 0 }, { 23, 31, 0 } } },
+	{ { { 27, 0, 1 }, { 23, 31, 1 } } },
+	{ { { 27, 0, 2 }, { 23, 31, 2 } } },
+	{ { { 27, 0, 3 }, { 24, 31, 1 } } },
+	{ { { 27, 0, 4 }, { 24, 31, 0 } } },
+	{ { { 28, 0, 4 }, { 24, 31, 1 } } },
+	{ { { 28, 0, 3 }, { 24, 31, 2 } } },
+	{ { { 28, 0, 2 }, { 25, 31, 1 } } },
+	{ { { 28, 0, 1 }, { 25, 31, 0 } } },
+	{ { { 28, 0, 0 }, { 28, 28, 0 } } },
+	{ { { 28, 0, 1 }, { 28, 28, 1 } } },
+	{ { { 28, 0, 2 }, { 26, 31, 1 } } },
+	{ { { 28, 0, 3 }, { 26, 31, 0 } } },
+	{ { { 28, 0, 4 }, { 28, 29, 0 } } },
+	{ { { 29, 0, 3 }, { 28, 29, 1 } } },
+	{ { { 29, 0, 2 }, { 27, 31, 1 } } },
+	{ { { 29, 0, 1 }, { 27, 31, 0 } } },
+	{ { { 29, 0, 0 }, { 28, 30, 0 } } },
+	{ { { 29, 0, 1 }, { 28, 30, 1 } } },
+	{ { { 29, 0, 2 }, { 28, 30, 2 } } },
+	{ { { 29, 0, 3 }, { 28, 31, 1 } } },
+	{ { { 29, 0, 4 }, { 28, 31, 0 } } },
+	{ { { 30, 0, 3 }, { 28, 31, 1 } } },
+	{ { { 30, 0, 2 }, { 28, 31, 2 } } },
+	{ { { 30, 0, 1 }, { 29, 31, 1 } } },
+	{ { { 30, 0, 0 }, { 29, 31, 0 } } },
+	{ { { 30, 0, 1 }, { 29, 31, 1 } } },
+	{ { { 30, 0, 2 }, { 29, 31, 2 } } },
+	{ { { 30, 0, 3 }, { 30, 31, 1 } } },
+	{ { { 30, 0, 4 }, { 30, 31, 0 } } },
+	{ { { 31, 0, 3 }, { 30, 31, 1 } } },
+	{ { { 31, 0, 2 }, { 30, 31, 2 } } },
+	{ { { 31, 0, 1 }, { 31, 31, 1 } } },
+	{ { { 31, 0, 0 }, { 31, 31, 0 } } }
+};
+
+static SingleColourLookup const lookup_6_3[] = 
+{
+	{ { { 0, 0, 0 }, { 0, 0, 0 } } },
+	{ { { 0, 0, 1 }, { 0, 1, 1 } } },
+	{ { { 0, 0, 2 }, { 0, 1, 0 } } },
+	{ { { 1, 0, 1 }, { 0, 2, 1 } } },
+	{ { { 1, 0, 0 }, { 0, 2, 0 } } },
+	{ { { 1, 0, 1 }, { 0, 3, 1 } } },
+	{ { { 1, 0, 2 }, { 0, 3, 0 } } },
+	{ { { 2, 0, 1 }, { 0, 4, 1 } } },
+	{ { { 2, 0, 0 }, { 0, 4, 0 } } },
+	{ { { 2, 0, 1 }, { 0, 5, 1 } } },
+	{ { { 2, 0, 2 }, { 0, 5, 0 } } },
+	{ { { 3, 0, 1 }, { 0, 6, 1 } } },
+	{ { { 3, 0, 0 }, { 0, 6, 0 } } },
+	{ { { 3, 0, 1 }, { 0, 7, 1 } } },
+	{ { { 3, 0, 2 }, { 0, 7, 0 } } },
+	{ { { 4, 0, 1 }, { 0, 8, 1 } } },
+	{ { { 4, 0, 0 }, { 0, 8, 0 } } },
+	{ { { 4, 0, 1 }, { 0, 9, 1 } } },
+	{ { { 4, 0, 2 }, { 0, 9, 0 } } },
+	{ { { 5, 0, 1 }, { 0, 10, 1 } } },
+	{ { { 5, 0, 0 }, { 0, 10, 0 } } },
+	{ { { 5, 0, 1 }, { 0, 11, 1 } } },
+	{ { { 5, 0, 2 }, { 0, 11, 0 } } },
+	{ { { 6, 0, 1 }, { 0, 12, 1 } } },
+	{ { { 6, 0, 0 }, { 0, 12, 0 } } },
+	{ { { 6, 0, 1 }, { 0, 13, 1 } } },
+	{ { { 6, 0, 2 }, { 0, 13, 0 } } },
+	{ { { 7, 0, 1 }, { 0, 14, 1 } } },
+	{ { { 7, 0, 0 }, { 0, 14, 0 } } },
+	{ { { 7, 0, 1 }, { 0, 15, 1 } } },
+	{ { { 7, 0, 2 }, { 0, 15, 0 } } },
+	{ { { 8, 0, 1 }, { 0, 16, 1 } } },
+	{ { { 8, 0, 0 }, { 0, 16, 0 } } },
+	{ { { 8, 0, 1 }, { 0, 17, 1 } } },
+	{ { { 8, 0, 2 }, { 0, 17, 0 } } },
+	{ { { 9, 0, 1 }, { 0, 18, 1 } } },
+	{ { { 9, 0, 0 }, { 0, 18, 0 } } },
+	{ { { 9, 0, 1 }, { 0, 19, 1 } } },
+	{ { { 9, 0, 2 }, { 0, 19, 0 } } },
+	{ { { 10, 0, 1 }, { 0, 20, 1 } } },
+	{ { { 10, 0, 0 }, { 0, 20, 0 } } },
+	{ { { 10, 0, 1 }, { 0, 21, 1 } } },
+	{ { { 10, 0, 2 }, { 0, 21, 0 } } },
+	{ { { 11, 0, 1 }, { 0, 22, 1 } } },
+	{ { { 11, 0, 0 }, { 0, 22, 0 } } },
+	{ { { 11, 0, 1 }, { 0, 23, 1 } } },
+	{ { { 11, 0, 2 }, { 0, 23, 0 } } },
+	{ { { 12, 0, 1 }, { 0, 24, 1 } } },
+	{ { { 12, 0, 0 }, { 0, 24, 0 } } },
+	{ { { 12, 0, 1 }, { 0, 25, 1 } } },
+	{ { { 12, 0, 2 }, { 0, 25, 0 } } },
+	{ { { 13, 0, 1 }, { 0, 26, 1 } } },
+	{ { { 13, 0, 0 }, { 0, 26, 0 } } },
+	{ { { 13, 0, 1 }, { 0, 27, 1 } } },
+	{ { { 13, 0, 2 }, { 0, 27, 0 } } },
+	{ { { 14, 0, 1 }, { 0, 28, 1 } } },
+	{ { { 14, 0, 0 }, { 0, 28, 0 } } },
+	{ { { 14, 0, 1 }, { 0, 29, 1 } } },
+	{ { { 14, 0, 2 }, { 0, 29, 0 } } },
+	{ { { 15, 0, 1 }, { 0, 30, 1 } } },
+	{ { { 15, 0, 0 }, { 0, 30, 0 } } },
+	{ { { 15, 0, 1 }, { 0, 31, 1 } } },
+	{ { { 15, 0, 2 }, { 0, 31, 0 } } },
+	{ { { 16, 0, 2 }, { 1, 31, 1 } } },
+	{ { { 16, 0, 1 }, { 1, 31, 0 } } },
+	{ { { 16, 0, 0 }, { 0, 32, 0 } } },
+	{ { { 16, 0, 1 }, { 2, 31, 0 } } },
+	{ { { 16, 0, 2 }, { 0, 33, 0 } } },
+	{ { { 17, 0, 1 }, { 3, 31, 0 } } },
+	{ { { 17, 0, 0 }, { 0, 34, 0 } } },
+	{ { { 17, 0, 1 }, { 4, 31, 0 } } },
+	{ { { 17, 0, 2 }, { 0, 35, 0 } } },
+	{ { { 18, 0, 1 }, { 5, 31, 0 } } },
+	{ { { 18, 0, 0 }, { 0, 36, 0 } } },
+	{ { { 18, 0, 1 }, { 6, 31, 0 } } },
+	{ { { 18, 0, 2 }, { 0, 37, 0 } } },
+	{ { { 19, 0, 1 }, { 7, 31, 0 } } },
+	{ { { 19, 0, 0 }, { 0, 38, 0 } } },
+	{ { { 19, 0, 1 }, { 8, 31, 0 } } },
+	{ { { 19, 0, 2 }, { 0, 39, 0 } } },
+	{ { { 20, 0, 1 }, { 9, 31, 0 } } },
+	{ { { 20, 0, 0 }, { 0, 40, 0 } } },
+	{ { { 20, 0, 1 }, { 10, 31, 0 } } },
+	{ { { 20, 0, 2 }, { 0, 41, 0 } } },
+	{ { { 21, 0, 1 }, { 11, 31, 0 } } },
+	{ { { 21, 0, 0 }, { 0, 42, 0 } } },
+	{ { { 21, 0, 1 }, { 12, 31, 0 } } },
+	{ { { 21, 0, 2 }, { 0, 43, 0 } } },
+	{ { { 22, 0, 1 }, { 13, 31, 0 } } },
+	{ { { 22, 0, 0 }, { 0, 44, 0 } } },
+	{ { { 22, 0, 1 }, { 14, 31, 0 } } },
+	{ { { 22, 0, 2 }, { 0, 45, 0 } } },
+	{ { { 23, 0, 1 }, { 15, 31, 0 } } },
+	{ { { 23, 0, 0 }, { 0, 46, 0 } } },
+	{ { { 23, 0, 1 }, { 0, 47, 1 } } },
+	{ { { 23, 0, 2 }, { 0, 47, 0 } } },
+	{ { { 24, 0, 1 }, { 0, 48, 1 } } },
+	{ { { 24, 0, 0 }, { 0, 48, 0 } } },
+	{ { { 24, 0, 1 }, { 0, 49, 1 } } },
+	{ { { 24, 0, 2 }, { 0, 49, 0 } } },
+	{ { { 25, 0, 1 }, { 0, 50, 1 } } },
+	{ { { 25, 0, 0 }, { 0, 50, 0 } } },
+	{ { { 25, 0, 1 }, { 0, 51, 1 } } },
+	{ { { 25, 0, 2 }, { 0, 51, 0 } } },
+	{ { { 26, 0, 1 }, { 0, 52, 1 } } },
+	{ { { 26, 0, 0 }, { 0, 52, 0 } } },
+	{ { { 26, 0, 1 }, { 0, 53, 1 } } },
+	{ { { 26, 0, 2 }, { 0, 53, 0 } } },
+	{ { { 27, 0, 1 }, { 0, 54, 1 } } },
+	{ { { 27, 0, 0 }, { 0, 54, 0 } } },
+	{ { { 27, 0, 1 }, { 0, 55, 1 } } },
+	{ { { 27, 0, 2 }, { 0, 55, 0 } } },
+	{ { { 28, 0, 1 }, { 0, 56, 1 } } },
+	{ { { 28, 0, 0 }, { 0, 56, 0 } } },
+	{ { { 28, 0, 1 }, { 0, 57, 1 } } },
+	{ { { 28, 0, 2 }, { 0, 57, 0 } } },
+	{ { { 29, 0, 1 }, { 0, 58, 1 } } },
+	{ { { 29, 0, 0 }, { 0, 58, 0 } } },
+	{ { { 29, 0, 1 }, { 0, 59, 1 } } },
+	{ { { 29, 0, 2 }, { 0, 59, 0 } } },
+	{ { { 30, 0, 1 }, { 0, 60, 1 } } },
+	{ { { 30, 0, 0 }, { 0, 60, 0 } } },
+	{ { { 30, 0, 1 }, { 0, 61, 1 } } },
+	{ { { 30, 0, 2 }, { 0, 61, 0 } } },
+	{ { { 31, 0, 1 }, { 0, 62, 1 } } },
+	{ { { 31, 0, 0 }, { 0, 62, 0 } } },
+	{ { { 31, 0, 1 }, { 0, 63, 1 } } },
+	{ { { 31, 0, 2 }, { 0, 63, 0 } } },
+	{ { { 32, 0, 2 }, { 1, 63, 1 } } },
+	{ { { 32, 0, 1 }, { 1, 63, 0 } } },
+	{ { { 32, 0, 0 }, { 16, 48, 0 } } },
+	{ { { 32, 0, 1 }, { 2, 63, 0 } } },
+	{ { { 32, 0, 2 }, { 16, 49, 0 } } },
+	{ { { 33, 0, 1 }, { 3, 63, 0 } } },
+	{ { { 33, 0, 0 }, { 16, 50, 0 } } },
+	{ { { 33, 0, 1 }, { 4, 63, 0 } } },
+	{ { { 33, 0, 2 }, { 16, 51, 0 } } },
+	{ { { 34, 0, 1 }, { 5, 63, 0 } } },
+	{ { { 34, 0, 0 }, { 16, 52, 0 } } },
+	{ { { 34, 0, 1 }, { 6, 63, 0 } } },
+	{ { { 34, 0, 2 }, { 16, 53, 0 } } },
+	{ { { 35, 0, 1 }, { 7, 63, 0 } } },
+	{ { { 35, 0, 0 }, { 16, 54, 0 } } },
+	{ { { 35, 0, 1 }, { 8, 63, 0 } } },
+	{ { { 35, 0, 2 }, { 16, 55, 0 } } },
+	{ { { 36, 0, 1 }, { 9, 63, 0 } } },
+	{ { { 36, 0, 0 }, { 16, 56, 0 } } },
+	{ { { 36, 0, 1 }, { 10, 63, 0 } } },
+	{ { { 36, 0, 2 }, { 16, 57, 0 } } },
+	{ { { 37, 0, 1 }, { 11, 63, 0 } } },
+	{ { { 37, 0, 0 }, { 16, 58, 0 } } },
+	{ { { 37, 0, 1 }, { 12, 63, 0 } } },
+	{ { { 37, 0, 2 }, { 16, 59, 0 } } },
+	{ { { 38, 0, 1 }, { 13, 63, 0 } } },
+	{ { { 38, 0, 0 }, { 16, 60, 0 } } },
+	{ { { 38, 0, 1 }, { 14, 63, 0 } } },
+	{ { { 38, 0, 2 }, { 16, 61, 0 } } },
+	{ { { 39, 0, 1 }, { 15, 63, 0 } } },
+	{ { { 39, 0, 0 }, { 16, 62, 0 } } },
+	{ { { 39, 0, 1 }, { 16, 63, 1 } } },
+	{ { { 39, 0, 2 }, { 16, 63, 0 } } },
+	{ { { 40, 0, 1 }, { 17, 63, 1 } } },
+	{ { { 40, 0, 0 }, { 17, 63, 0 } } },
+	{ { { 40, 0, 1 }, { 18, 63, 1 } } },
+	{ { { 40, 0, 2 }, { 18, 63, 0 } } },
+	{ { { 41, 0, 1 }, { 19, 63, 1 } } },
+	{ { { 41, 0, 0 }, { 19, 63, 0 } } },
+	{ { { 41, 0, 1 }, { 20, 63, 1 } } },
+	{ { { 41, 0, 2 }, { 20, 63, 0 } } },
+	{ { { 42, 0, 1 }, { 21, 63, 1 } } },
+	{ { { 42, 0, 0 }, { 21, 63, 0 } } },
+	{ { { 42, 0, 1 }, { 22, 63, 1 } } },
+	{ { { 42, 0, 2 }, { 22, 63, 0 } } },
+	{ { { 43, 0, 1 }, { 23, 63, 1 } } },
+	{ { { 43, 0, 0 }, { 23, 63, 0 } } },
+	{ { { 43, 0, 1 }, { 24, 63, 1 } } },
+	{ { { 43, 0, 2 }, { 24, 63, 0 } } },
+	{ { { 44, 0, 1 }, { 25, 63, 1 } } },
+	{ { { 44, 0, 0 }, { 25, 63, 0 } } },
+	{ { { 44, 0, 1 }, { 26, 63, 1 } } },
+	{ { { 44, 0, 2 }, { 26, 63, 0 } } },
+	{ { { 45, 0, 1 }, { 27, 63, 1 } } },
+	{ { { 45, 0, 0 }, { 27, 63, 0 } } },
+	{ { { 45, 0, 1 }, { 28, 63, 1 } } },
+	{ { { 45, 0, 2 }, { 28, 63, 0 } } },
+	{ { { 46, 0, 1 }, { 29, 63, 1 } } },
+	{ { { 46, 0, 0 }, { 29, 63, 0 } } },
+	{ { { 46, 0, 1 }, { 30, 63, 1 } } },
+	{ { { 46, 0, 2 }, { 30, 63, 0 } } },
+	{ { { 47, 0, 1 }, { 31, 63, 1 } } },
+	{ { { 47, 0, 0 }, { 31, 63, 0 } } },
+	{ { { 47, 0, 1 }, { 32, 63, 1 } } },
+	{ { { 47, 0, 2 }, { 32, 63, 0 } } },
+	{ { { 48, 0, 2 }, { 33, 63, 1 } } },
+	{ { { 48, 0, 1 }, { 33, 63, 0 } } },
+	{ { { 48, 0, 0 }, { 48, 48, 0 } } },
+	{ { { 48, 0, 1 }, { 34, 63, 0 } } },
+	{ { { 48, 0, 2 }, { 48, 49, 0 } } },
+	{ { { 49, 0, 1 }, { 35, 63, 0 } } },
+	{ { { 49, 0, 0 }, { 48, 50, 0 } } },
+	{ { { 49, 0, 1 }, { 36, 63, 0 } } },
+	{ { { 49, 0, 2 }, { 48, 51, 0 } } },
+	{ { { 50, 0, 1 }, { 37, 63, 0 } } },
+	{ { { 50, 0, 0 }, { 48, 52, 0 } } },
+	{ { { 50, 0, 1 }, { 38, 63, 0 } } },
+	{ { { 50, 0, 2 }, { 48, 53, 0 } } },
+	{ { { 51, 0, 1 }, { 39, 63, 0 } } },
+	{ { { 51, 0, 0 }, { 48, 54, 0 } } },
+	{ { { 51, 0, 1 }, { 40, 63, 0 } } },
+	{ { { 51, 0, 2 }, { 48, 55, 0 } } },
+	{ { { 52, 0, 1 }, { 41, 63, 0 } } },
+	{ { { 52, 0, 0 }, { 48, 56, 0 } } },
+	{ { { 52, 0, 1 }, { 42, 63, 0 } } },
+	{ { { 52, 0, 2 }, { 48, 57, 0 } } },
+	{ { { 53, 0, 1 }, { 43, 63, 0 } } },
+	{ { { 53, 0, 0 }, { 48, 58, 0 } } },
+	{ { { 53, 0, 1 }, { 44, 63, 0 } } },
+	{ { { 53, 0, 2 }, { 48, 59, 0 } } },
+	{ { { 54, 0, 1 }, { 45, 63, 0 } } },
+	{ { { 54, 0, 0 }, { 48, 60, 0 } } },
+	{ { { 54, 0, 1 }, { 46, 63, 0 } } },
+	{ { { 54, 0, 2 }, { 48, 61, 0 } } },
+	{ { { 55, 0, 1 }, { 47, 63, 0 } } },
+	{ { { 55, 0, 0 }, { 48, 62, 0 } } },
+	{ { { 55, 0, 1 }, { 48, 63, 1 } } },
+	{ { { 55, 0, 2 }, { 48, 63, 0 } } },
+	{ { { 56, 0, 1 }, { 49, 63, 1 } } },
+	{ { { 56, 0, 0 }, { 49, 63, 0 } } },
+	{ { { 56, 0, 1 }, { 50, 63, 1 } } },
+	{ { { 56, 0, 2 }, { 50, 63, 0 } } },
+	{ { { 57, 0, 1 }, { 51, 63, 1 } } },
+	{ { { 57, 0, 0 }, { 51, 63, 0 } } },
+	{ { { 57, 0, 1 }, { 52, 63, 1 } } },
+	{ { { 57, 0, 2 }, { 52, 63, 0 } } },
+	{ { { 58, 0, 1 }, { 53, 63, 1 } } },
+	{ { { 58, 0, 0 }, { 53, 63, 0 } } },
+	{ { { 58, 0, 1 }, { 54, 63, 1 } } },
+	{ { { 58, 0, 2 }, { 54, 63, 0 } } },
+	{ { { 59, 0, 1 }, { 55, 63, 1 } } },
+	{ { { 59, 0, 0 }, { 55, 63, 0 } } },
+	{ { { 59, 0, 1 }, { 56, 63, 1 } } },
+	{ { { 59, 0, 2 }, { 56, 63, 0 } } },
+	{ { { 60, 0, 1 }, { 57, 63, 1 } } },
+	{ { { 60, 0, 0 }, { 57, 63, 0 } } },
+	{ { { 60, 0, 1 }, { 58, 63, 1 } } },
+	{ { { 60, 0, 2 }, { 58, 63, 0 } } },
+	{ { { 61, 0, 1 }, { 59, 63, 1 } } },
+	{ { { 61, 0, 0 }, { 59, 63, 0 } } },
+	{ { { 61, 0, 1 }, { 60, 63, 1 } } },
+	{ { { 61, 0, 2 }, { 60, 63, 0 } } },
+	{ { { 62, 0, 1 }, { 61, 63, 1 } } },
+	{ { { 62, 0, 0 }, { 61, 63, 0 } } },
+	{ { { 62, 0, 1 }, { 62, 63, 1 } } },
+	{ { { 62, 0, 2 }, { 62, 63, 0 } } },
+	{ { { 63, 0, 1 }, { 63, 63, 1 } } },
+	{ { { 63, 0, 0 }, { 63, 63, 0 } } }
+};
+
+static SingleColourLookup const lookup_5_4[] = 
+{
+	{ { { 0, 0, 0 }, { 0, 0, 0 } } },
+	{ { { 0, 0, 1 }, { 0, 1, 1 } } },
+	{ { { 0, 0, 2 }, { 0, 1, 0 } } },
+	{ { { 0, 0, 3 }, { 0, 1, 1 } } },
+	{ { { 0, 0, 4 }, { 0, 2, 1 } } },
+	{ { { 1, 0, 3 }, { 0, 2, 0 } } },
+	{ { { 1, 0, 2 }, { 0, 2, 1 } } },
+	{ { { 1, 0, 1 }, { 0, 3, 1 } } },
+	{ { { 1, 0, 0 }, { 0, 3, 0 } } },
+	{ { { 1, 0, 1 }, { 1, 2, 1 } } },
+	{ { { 1, 0, 2 }, { 1, 2, 0 } } },
+	{ { { 1, 0, 3 }, { 0, 4, 0 } } },
+	{ { { 1, 0, 4 }, { 0, 5, 1 } } },
+	{ { { 2, 0, 3 }, { 0, 5, 0 } } },
+	{ { { 2, 0, 2 }, { 0, 5, 1 } } },
+	{ { { 2, 0, 1 }, { 0, 6, 1 } } },
+	{ { { 2, 0, 0 }, { 0, 6, 0 } } },
+	{ { { 2, 0, 1 }, { 2, 3, 1 } } },
+	{ { { 2, 0, 2 }, { 2, 3, 0 } } },
+	{ { { 2, 0, 3 }, { 0, 7, 0 } } },
+	{ { { 2, 0, 4 }, { 1, 6, 1 } } },
+	{ { { 3, 0, 3 }, { 1, 6, 0 } } },
+	{ { { 3, 0, 2 }, { 0, 8, 0 } } },
+	{ { { 3, 0, 1 }, { 0, 9, 1 } } },
+	{ { { 3, 0, 0 }, { 0, 9, 0 } } },
+	{ { { 3, 0, 1 }, { 0, 9, 1 } } },
+	{ { { 3, 0, 2 }, { 0, 10, 1 } } },
+	{ { { 3, 0, 3 }, { 0, 10, 0 } } },
+	{ { { 3, 0, 4 }, { 2, 7, 1 } } },
+	{ { { 4, 0, 4 }, { 2, 7, 0 } } },
+	{ { { 4, 0, 3 }, { 0, 11, 0 } } },
+	{ { { 4, 0, 2 }, { 1, 10, 1 } } },
+	{ { { 4, 0, 1 }, { 1, 10, 0 } } },
+	{ { { 4, 0, 0 }, { 0, 12, 0 } } },
+	{ { { 4, 0, 1 }, { 0, 13, 1 } } },
+	{ { { 4, 0, 2 }, { 0, 13, 0 } } },
+	{ { { 4, 0, 3 }, { 0, 13, 1 } } },
+	{ { { 4, 0, 4 }, { 0, 14, 1 } } },
+	{ { { 5, 0, 3 }, { 0, 14, 0 } } },
+	{ { { 5, 0, 2 }, { 2, 11, 1 } } },
+	{ { { 5, 0, 1 }, { 2, 11, 0 } } },
+	{ { { 5, 0, 0 }, { 0, 15, 0 } } },
+	{ { { 5, 0, 1 }, { 1, 14, 1 } } },
+	{ { { 5, 0, 2 }, { 1, 14, 0 } } },
+	{ { { 5, 0, 3 }, { 0, 16, 0 } } },
+	{ { { 5, 0, 4 }, { 0, 17, 1 } } },
+	{ { { 6, 0, 3 }, { 0, 17, 0 } } },
+	{ { { 6, 0, 2 }, { 0, 17, 1 } } },
+	{ { { 6, 0, 1 }, { 0, 18, 1 } } },
+	{ { { 6, 0, 0 }, { 0, 18, 0 } } },
+	{ { { 6, 0, 1 }, { 2, 15, 1 } } },
+	{ { { 6, 0, 2 }, { 2, 15, 0 } } },
+	{ { { 6, 0, 3 }, { 0, 19, 0 } } },
+	{ { { 6, 0, 4 }, { 1, 18, 1 } } },
+	{ { { 7, 0, 3 }, { 1, 18, 0 } } },
+	{ { { 7, 0, 2 }, { 0, 20, 0 } } },
+	{ { { 7, 0, 1 }, { 0, 21, 1 } } },
+	{ { { 7, 0, 0 }, { 0, 21, 0 } } },
+	{ { { 7, 0, 1 }, { 0, 21, 1 } } },
+	{ { { 7, 0, 2 }, { 0, 22, 1 } } },
+	{ { { 7, 0, 3 }, { 0, 22, 0 } } },
+	{ { { 7, 0, 4 }, { 2, 19, 1 } } },
+	{ { { 8, 0, 4 }, { 2, 19, 0 } } },
+	{ { { 8, 0, 3 }, { 0, 23, 0 } } },
+	{ { { 8, 0, 2 }, { 1, 22, 1 } } },
+	{ { { 8, 0, 1 }, { 1, 22, 0 } } },
+	{ { { 8, 0, 0 }, { 0, 24, 0 } } },
+	{ { { 8, 0, 1 }, { 0, 25, 1 } } },
+	{ { { 8, 0, 2 }, { 0, 25, 0 } } },
+	{ { { 8, 0, 3 }, { 0, 25, 1 } } },
+	{ { { 8, 0, 4 }, { 0, 26, 1 } } },
+	{ { { 9, 0, 3 }, { 0, 26, 0 } } },
+	{ { { 9, 0, 2 }, { 2, 23, 1 } } },
+	{ { { 9, 0, 1 }, { 2, 23, 0 } } },
+	{ { { 9, 0, 0 }, { 0, 27, 0 } } },
+	{ { { 9, 0, 1 }, { 1, 26, 1 } } },
+	{ { { 9, 0, 2 }, { 1, 26, 0 } } },
+	{ { { 9, 0, 3 }, { 0, 28, 0 } } },
+	{ { { 9, 0, 4 }, { 0, 29, 1 } } },
+	{ { { 10, 0, 3 }, { 0, 29, 0 } } },
+	{ { { 10, 0, 2 }, { 0, 29, 1 } } },
+	{ { { 10, 0, 1 }, { 0, 30, 1 } } },
+	{ { { 10, 0, 0 }, { 0, 30, 0 } } },
+	{ { { 10, 0, 1 }, { 2, 27, 1 } } },
+	{ { { 10, 0, 2 }, { 2, 27, 0 } } },
+	{ { { 10, 0, 3 }, { 0, 31, 0 } } },
+	{ { { 10, 0, 4 }, { 1, 30, 1 } } },
+	{ { { 11, 0, 3 }, { 1, 30, 0 } } },
+	{ { { 11, 0, 2 }, { 4, 24, 0 } } },
+	{ { { 11, 0, 1 }, { 1, 31, 1 } } },
+	{ { { 11, 0, 0 }, { 1, 31, 0 } } },
+	{ { { 11, 0, 1 }, { 1, 31, 1 } } },
+	{ { { 11, 0, 2 }, { 2, 30, 1 } } },
+	{ { { 11, 0, 3 }, { 2, 30, 0 } } },
+	{ { { 11, 0, 4 }, { 2, 31, 1 } } },
+	{ { { 12, 0, 4 }, { 2, 31, 0 } } },
+	{ { { 12, 0, 3 }, { 4, 27, 0 } } },
+	{ { { 12, 0, 2 }, { 3, 30, 1 } } },
+	{ { { 12, 0, 1 }, { 3, 30, 0 } } },
+	{ { { 12, 0, 0 }, { 4, 28, 0 } } },
+	{ { { 12, 0, 1 }, { 3, 31, 1 } } },
+	{ { { 12, 0, 2 }, { 3, 31, 0 } } },
+	{ { { 12, 0, 3 }, { 3, 31, 1 } } },
+	{ { { 12, 0, 4 }, { 4, 30, 1 } } },
+	{ { { 13, 0, 3 }, { 4, 30, 0 } } },
+	{ { { 13, 0, 2 }, { 6, 27, 1 } } },
+	{ { { 13, 0, 1 }, { 6, 27, 0 } } },
+	{ { { 13, 0, 0 }, { 4, 31, 0 } } },
+	{ { { 13, 0, 1 }, { 5, 30, 1 } } },
+	{ { { 13, 0, 2 }, { 5, 30, 0 } } },
+	{ { { 13, 0, 3 }, { 8, 24, 0 } } },
+	{ { { 13, 0, 4 }, { 5, 31, 1 } } },
+	{ { { 14, 0, 3 }, { 5, 31, 0 } } },
+	{ { { 14, 0, 2 }, { 5, 31, 1 } } },
+	{ { { 14, 0, 1 }, { 6, 30, 1 } } },
+	{ { { 14, 0, 0 }, { 6, 30, 0 } } },
+	{ { { 14, 0, 1 }, { 6, 31, 1 } } },
+	{ { { 14, 0, 2 }, { 6, 31, 0 } } },
+	{ { { 14, 0, 3 }, { 8, 27, 0 } } },
+	{ { { 14, 0, 4 }, { 7, 30, 1 } } },
+	{ { { 15, 0, 3 }, { 7, 30, 0 } } },
+	{ { { 15, 0, 2 }, { 8, 28, 0 } } },
+	{ { { 15, 0, 1 }, { 7, 31, 1 } } },
+	{ { { 15, 0, 0 }, { 7, 31, 0 } } },
+	{ { { 15, 0, 1 }, { 7, 31, 1 } } },
+	{ { { 15, 0, 2 }, { 8, 30, 1 } } },
+	{ { { 15, 0, 3 }, { 8, 30, 0 } } },
+	{ { { 15, 0, 4 }, { 10, 27, 1 } } },
+	{ { { 16, 0, 4 }, { 10, 27, 0 } } },
+	{ { { 16, 0, 3 }, { 8, 31, 0 } } },
+	{ { { 16, 0, 2 }, { 9, 30, 1 } } },
+	{ { { 16, 0, 1 }, { 9, 30, 0 } } },
+	{ { { 16, 0, 0 }, { 12, 24, 0 } } },
+	{ { { 16, 0, 1 }, { 9, 31, 1 } } },
+	{ { { 16, 0, 2 }, { 9, 31, 0 } } },
+	{ { { 16, 0, 3 }, { 9, 31, 1 } } },
+	{ { { 16, 0, 4 }, { 10, 30, 1 } } },
+	{ { { 17, 0, 3 }, { 10, 30, 0 } } },
+	{ { { 17, 0, 2 }, { 10, 31, 1 } } },
+	{ { { 17, 0, 1 }, { 10, 31, 0 } } },
+	{ { { 17, 0, 0 }, { 12, 27, 0 } } },
+	{ { { 17, 0, 1 }, { 11, 30, 1 } } },
+	{ { { 17, 0, 2 }, { 11, 30, 0 } } },
+	{ { { 17, 0, 3 }, { 12, 28, 0 } } },
+	{ { { 17, 0, 4 }, { 11, 31, 1 } } },
+	{ { { 18, 0, 3 }, { 11, 31, 0 } } },
+	{ { { 18, 0, 2 }, { 11, 31, 1 } } },
+	{ { { 18, 0, 1 }, { 12, 30, 1 } } },
+	{ { { 18, 0, 0 }, { 12, 30, 0 } } },
+	{ { { 18, 0, 1 }, { 14, 27, 1 } } },
+	{ { { 18, 0, 2 }, { 14, 27, 0 } } },
+	{ { { 18, 0, 3 }, { 12, 31, 0 } } },
+	{ { { 18, 0, 4 }, { 13, 30, 1 } } },
+	{ { { 19, 0, 3 }, { 13, 30, 0 } } },
+	{ { { 19, 0, 2 }, { 16, 24, 0 } } },
+	{ { { 19, 0, 1 }, { 13, 31, 1 } } },
+	{ { { 19, 0, 0 }, { 13, 31, 0 } } },
+	{ { { 19, 0, 1 }, { 13, 31, 1 } } },
+	{ { { 19, 0, 2 }, { 14, 30, 1 } } },
+	{ { { 19, 0, 3 }, { 14, 30, 0 } } },
+	{ { { 19, 0, 4 }, { 14, 31, 1 } } },
+	{ { { 20, 0, 4 }, { 14, 31, 0 } } },
+	{ { { 20, 0, 3 }, { 16, 27, 0 } } },
+	{ { { 20, 0, 2 }, { 15, 30, 1 } } },
+	{ { { 20, 0, 1 }, { 15, 30, 0 } } },
+	{ { { 20, 0, 0 }, { 16, 28, 0 } } },
+	{ { { 20, 0, 1 }, { 15, 31, 1 } } },
+	{ { { 20, 0, 2 }, { 15, 31, 0 } } },
+	{ { { 20, 0, 3 }, { 15, 31, 1 } } },
+	{ { { 20, 0, 4 }, { 16, 30, 1 } } },
+	{ { { 21, 0, 3 }, { 16, 30, 0 } } },
+	{ { { 21, 0, 2 }, { 18, 27, 1 } } },
+	{ { { 21, 0, 1 }, { 18, 27, 0 } } },
+	{ { { 21, 0, 0 }, { 16, 31, 0 } } },
+	{ { { 21, 0, 1 }, { 17, 30, 1 } } },
+	{ { { 21, 0, 2 }, { 17, 30, 0 } } },
+	{ { { 21, 0, 3 }, { 20, 24, 0 } } },
+	{ { { 21, 0, 4 }, { 17, 31, 1 } } },
+	{ { { 22, 0, 3 }, { 17, 31, 0 } } },
+	{ { { 22, 0, 2 }, { 17, 31, 1 } } },
+	{ { { 22, 0, 1 }, { 18, 30, 1 } } },
+	{ { { 22, 0, 0 }, { 18, 30, 0 } } },
+	{ { { 22, 0, 1 }, { 18, 31, 1 } } },
+	{ { { 22, 0, 2 }, { 18, 31, 0 } } },
+	{ { { 22, 0, 3 }, { 20, 27, 0 } } },
+	{ { { 22, 0, 4 }, { 19, 30, 1 } } },
+	{ { { 23, 0, 3 }, { 19, 30, 0 } } },
+	{ { { 23, 0, 2 }, { 20, 28, 0 } } },
+	{ { { 23, 0, 1 }, { 19, 31, 1 } } },
+	{ { { 23, 0, 0 }, { 19, 31, 0 } } },
+	{ { { 23, 0, 1 }, { 19, 31, 1 } } },
+	{ { { 23, 0, 2 }, { 20, 30, 1 } } },
+	{ { { 23, 0, 3 }, { 20, 30, 0 } } },
+	{ { { 23, 0, 4 }, { 22, 27, 1 } } },
+	{ { { 24, 0, 4 }, { 22, 27, 0 } } },
+	{ { { 24, 0, 3 }, { 20, 31, 0 } } },
+	{ { { 24, 0, 2 }, { 21, 30, 1 } } },
+	{ { { 24, 0, 1 }, { 21, 30, 0 } } },
+	{ { { 24, 0, 0 }, { 24, 24, 0 } } },
+	{ { { 24, 0, 1 }, { 21, 31, 1 } } },
+	{ { { 24, 0, 2 }, { 21, 31, 0 } } },
+	{ { { 24, 0, 3 }, { 21, 31, 1 } } },
+	{ { { 24, 0, 4 }, { 22, 30, 1 } } },
+	{ { { 25, 0, 3 }, { 22, 30, 0 } } },
+	{ { { 25, 0, 2 }, { 22, 31, 1 } } },
+	{ { { 25, 0, 1 }, { 22, 31, 0 } } },
+	{ { { 25, 0, 0 }, { 24, 27, 0 } } },
+	{ { { 25, 0, 1 }, { 23, 30, 1 } } },
+	{ { { 25, 0, 2 }, { 23, 30, 0 } } },
+	{ { { 25, 0, 3 }, { 24, 28, 0 } } },
+	{ { { 25, 0, 4 }, { 23, 31, 1 } } },
+	{ { { 26, 0, 3 }, { 23, 31, 0 } } },
+	{ { { 26, 0, 2 }, { 23, 31, 1 } } },
+	{ { { 26, 0, 1 }, { 24, 30, 1 } } },
+	{ { { 26, 0, 0 }, { 24, 30, 0 } } },
+	{ { { 26, 0, 1 }, { 26, 27, 1 } } },
+	{ { { 26, 0, 2 }, { 26, 27, 0 } } },
+	{ { { 26, 0, 3 }, { 24, 31, 0 } } },
+	{ { { 26, 0, 4 }, { 25, 30, 1 } } },
+	{ { { 27, 0, 3 }, { 25, 30, 0 } } },
+	{ { { 27, 0, 2 }, { 28, 24, 0 } } },
+	{ { { 27, 0, 1 }, { 25, 31, 1 } } },
+	{ { { 27, 0, 0 }, { 25, 31, 0 } } },
+	{ { { 27, 0, 1 }, { 25, 31, 1 } } },
+	{ { { 27, 0, 2 }, { 26, 30, 1 } } },
+	{ { { 27, 0, 3 }, { 26, 30, 0 } } },
+	{ { { 27, 0, 4 }, { 26, 31, 1 } } },
+	{ { { 28, 0, 4 }, { 26, 31, 0 } } },
+	{ { { 28, 0, 3 }, { 28, 27, 0 } } },
+	{ { { 28, 0, 2 }, { 27, 30, 1 } } },
+	{ { { 28, 0, 1 }, { 27, 30, 0 } } },
+	{ { { 28, 0, 0 }, { 28, 28, 0 } } },
+	{ { { 28, 0, 1 }, { 27, 31, 1 } } },
+	{ { { 28, 0, 2 }, { 27, 31, 0 } } },
+	{ { { 28, 0, 3 }, { 27, 31, 1 } } },
+	{ { { 28, 0, 4 }, { 28, 30, 1 } } },
+	{ { { 29, 0, 3 }, { 28, 30, 0 } } },
+	{ { { 29, 0, 2 }, { 30, 27, 1 } } },
+	{ { { 29, 0, 1 }, { 30, 27, 0 } } },
+	{ { { 29, 0, 0 }, { 28, 31, 0 } } },
+	{ { { 29, 0, 1 }, { 29, 30, 1 } } },
+	{ { { 29, 0, 2 }, { 29, 30, 0 } } },
+	{ { { 29, 0, 3 }, { 29, 30, 1 } } },
+	{ { { 29, 0, 4 }, { 29, 31, 1 } } },
+	{ { { 30, 0, 3 }, { 29, 31, 0 } } },
+	{ { { 30, 0, 2 }, { 29, 31, 1 } } },
+	{ { { 30, 0, 1 }, { 30, 30, 1 } } },
+	{ { { 30, 0, 0 }, { 30, 30, 0 } } },
+	{ { { 30, 0, 1 }, { 30, 31, 1 } } },
+	{ { { 30, 0, 2 }, { 30, 31, 0 } } },
+	{ { { 30, 0, 3 }, { 30, 31, 1 } } },
+	{ { { 30, 0, 4 }, { 31, 30, 1 } } },
+	{ { { 31, 0, 3 }, { 31, 30, 0 } } },
+	{ { { 31, 0, 2 }, { 31, 30, 1 } } },
+	{ { { 31, 0, 1 }, { 31, 31, 1 } } },
+	{ { { 31, 0, 0 }, { 31, 31, 0 } } }
+};
+
+static SingleColourLookup const lookup_6_4[] = 
+{
+	{ { { 0, 0, 0 }, { 0, 0, 0 } } },
+	{ { { 0, 0, 1 }, { 0, 1, 0 } } },
+	{ { { 0, 0, 2 }, { 0, 2, 0 } } },
+	{ { { 1, 0, 1 }, { 0, 3, 1 } } },
+	{ { { 1, 0, 0 }, { 0, 3, 0 } } },
+	{ { { 1, 0, 1 }, { 0, 4, 0 } } },
+	{ { { 1, 0, 2 }, { 0, 5, 0 } } },
+	{ { { 2, 0, 1 }, { 0, 6, 1 } } },
+	{ { { 2, 0, 0 }, { 0, 6, 0 } } },
+	{ { { 2, 0, 1 }, { 0, 7, 0 } } },
+	{ { { 2, 0, 2 }, { 0, 8, 0 } } },
+	{ { { 3, 0, 1 }, { 0, 9, 1 } } },
+	{ { { 3, 0, 0 }, { 0, 9, 0 } } },
+	{ { { 3, 0, 1 }, { 0, 10, 0 } } },
+	{ { { 3, 0, 2 }, { 0, 11, 0 } } },
+	{ { { 4, 0, 1 }, { 0, 12, 1 } } },
+	{ { { 4, 0, 0 }, { 0, 12, 0 } } },
+	{ { { 4, 0, 1 }, { 0, 13, 0 } } },
+	{ { { 4, 0, 2 }, { 0, 14, 0 } } },
+	{ { { 5, 0, 1 }, { 0, 15, 1 } } },
+	{ { { 5, 0, 0 }, { 0, 15, 0 } } },
+	{ { { 5, 0, 1 }, { 0, 16, 0 } } },
+	{ { { 5, 0, 2 }, { 1, 15, 0 } } },
+	{ { { 6, 0, 1 }, { 0, 17, 0 } } },
+	{ { { 6, 0, 0 }, { 0, 18, 0 } } },
+	{ { { 6, 0, 1 }, { 0, 19, 0 } } },
+	{ { { 6, 0, 2 }, { 3, 14, 0 } } },
+	{ { { 7, 0, 1 }, { 0, 20, 0 } } },
+	{ { { 7, 0, 0 }, { 0, 21, 0 } } },
+	{ { { 7, 0, 1 }, { 0, 22, 0 } } },
+	{ { { 7, 0, 2 }, { 4, 15, 0 } } },
+	{ { { 8, 0, 1 }, { 0, 23, 0 } } },
+	{ { { 8, 0, 0 }, { 0, 24, 0 } } },
+	{ { { 8, 0, 1 }, { 0, 25, 0 } } },
+	{ { { 8, 0, 2 }, { 6, 14, 0 } } },
+	{ { { 9, 0, 1 }, { 0, 26, 0 } } },
+	{ { { 9, 0, 0 }, { 0, 27, 0 } } },
+	{ { { 9, 0, 1 }, { 0, 28, 0 } } },
+	{ { { 9, 0, 2 }, { 7, 15, 0 } } },
+	{ { { 10, 0, 1 }, { 0, 29, 0 } } },
+	{ { { 10, 0, 0 }, { 0, 30, 0 } } },
+	{ { { 10, 0, 1 }, { 0, 31, 0 } } },
+	{ { { 10, 0, 2 }, { 9, 14, 0 } } },
+	{ { { 11, 0, 1 }, { 0, 32, 0 } } },
+	{ { { 11, 0, 0 }, { 0, 33, 0 } } },
+	{ { { 11, 0, 1 }, { 2, 30, 0 } } },
+	{ { { 11, 0, 2 }, { 0, 34, 0 } } },
+	{ { { 12, 0, 1 }, { 0, 35, 0 } } },
+	{ { { 12, 0, 0 }, { 0, 36, 0 } } },
+	{ { { 12, 0, 1 }, { 3, 31, 0 } } },
+	{ { { 12, 0, 2 }, { 0, 37, 0 } } },
+	{ { { 13, 0, 1 }, { 0, 38, 0 } } },
+	{ { { 13, 0, 0 }, { 0, 39, 0 } } },
+	{ { { 13, 0, 1 }, { 5, 30, 0 } } },
+	{ { { 13, 0, 2 }, { 0, 40, 0 } } },
+	{ { { 14, 0, 1 }, { 0, 41, 0 } } },
+	{ { { 14, 0, 0 }, { 0, 42, 0 } } },
+	{ { { 14, 0, 1 }, { 6, 31, 0 } } },
+	{ { { 14, 0, 2 }, { 0, 43, 0 } } },
+	{ { { 15, 0, 1 }, { 0, 44, 0 } } },
+	{ { { 15, 0, 0 }, { 0, 45, 0 } } },
+	{ { { 15, 0, 1 }, { 8, 30, 0 } } },
+	{ { { 15, 0, 2 }, { 0, 46, 0 } } },
+	{ { { 16, 0, 2 }, { 0, 47, 0 } } },
+	{ { { 16, 0, 1 }, { 1, 46, 0 } } },
+	{ { { 16, 0, 0 }, { 0, 48, 0 } } },
+	{ { { 16, 0, 1 }, { 0, 49, 0 } } },
+	{ { { 16, 0, 2 }, { 0, 50, 0 } } },
+	{ { { 17, 0, 1 }, { 2, 47, 0 } } },
+	{ { { 17, 0, 0 }, { 0, 51, 0 } } },
+	{ { { 17, 0, 1 }, { 0, 52, 0 } } },
+	{ { { 17, 0, 2 }, { 0, 53, 0 } } },
+	{ { { 18, 0, 1 }, { 4, 46, 0 } } },
+	{ { { 18, 0, 0 }, { 0, 54, 0 } } },
+	{ { { 18, 0, 1 }, { 0, 55, 0 } } },
+	{ { { 18, 0, 2 }, { 0, 56, 0 } } },
+	{ { { 19, 0, 1 }, { 5, 47, 0 } } },
+	{ { { 19, 0, 0 }, { 0, 57, 0 } } },
+	{ { { 19, 0, 1 }, { 0, 58, 0 } } },
+	{ { { 19, 0, 2 }, { 0, 59, 0 } } },
+	{ { { 20, 0, 1 }, { 7, 46, 0 } } },
+	{ { { 20, 0, 0 }, { 0, 60, 0 } } },
+	{ { { 20, 0, 1 }, { 0, 61, 0 } } },
+	{ { { 20, 0, 2 }, { 0, 62, 0 } } },
+	{ { { 21, 0, 1 }, { 8, 47, 0 } } },
+	{ { { 21, 0, 0 }, { 0, 63, 0 } } },
+	{ { { 21, 0, 1 }, { 1, 62, 0 } } },
+	{ { { 21, 0, 2 }, { 1, 63, 0 } } },
+	{ { { 22, 0, 1 }, { 10, 46, 0 } } },
+	{ { { 22, 0, 0 }, { 2, 62, 0 } } },
+	{ { { 22, 0, 1 }, { 2, 63, 0 } } },
+	{ { { 22, 0, 2 }, { 3, 62, 0 } } },
+	{ { { 23, 0, 1 }, { 11, 47, 0 } } },
+	{ { { 23, 0, 0 }, { 3, 63, 0 } } },
+	{ { { 23, 0, 1 }, { 4, 62, 0 } } },
+	{ { { 23, 0, 2 }, { 4, 63, 0 } } },
+	{ { { 24, 0, 1 }, { 13, 46, 0 } } },
+	{ { { 24, 0, 0 }, { 5, 62, 0 } } },
+	{ { { 24, 0, 1 }, { 5, 63, 0 } } },
+	{ { { 24, 0, 2 }, { 6, 62, 0 } } },
+	{ { { 25, 0, 1 }, { 14, 47, 0 } } },
+	{ { { 25, 0, 0 }, { 6, 63, 0 } } },
+	{ { { 25, 0, 1 }, { 7, 62, 0 } } },
+	{ { { 25, 0, 2 }, { 7, 63, 0 } } },
+	{ { { 26, 0, 1 }, { 16, 45, 0 } } },
+	{ { { 26, 0, 0 }, { 8, 62, 0 } } },
+	{ { { 26, 0, 1 }, { 8, 63, 0 } } },
+	{ { { 26, 0, 2 }, { 9, 62, 0 } } },
+	{ { { 27, 0, 1 }, { 16, 48, 0 } } },
+	{ { { 27, 0, 0 }, { 9, 63, 0 } } },
+	{ { { 27, 0, 1 }, { 10, 62, 0 } } },
+	{ { { 27, 0, 2 }, { 10, 63, 0 } } },
+	{ { { 28, 0, 1 }, { 16, 51, 0 } } },
+	{ { { 28, 0, 0 }, { 11, 62, 0 } } },
+	{ { { 28, 0, 1 }, { 11, 63, 0 } } },
+	{ { { 28, 0, 2 }, { 12, 62, 0 } } },
+	{ { { 29, 0, 1 }, { 16, 54, 0 } } },
+	{ { { 29, 0, 0 }, { 12, 63, 0 } } },
+	{ { { 29, 0, 1 }, { 13, 62, 0 } } },
+	{ { { 29, 0, 2 }, { 13, 63, 0 } } },
+	{ { { 30, 0, 1 }, { 16, 57, 0 } } },
+	{ { { 30, 0, 0 }, { 14, 62, 0 } } },
+	{ { { 30, 0, 1 }, { 14, 63, 0 } } },
+	{ { { 30, 0, 2 }, { 15, 62, 0 } } },
+	{ { { 31, 0, 1 }, { 16, 60, 0 } } },
+	{ { { 31, 0, 0 }, { 15, 63, 0 } } },
+	{ { { 31, 0, 1 }, { 24, 46, 0 } } },
+	{ { { 31, 0, 2 }, { 16, 62, 0 } } },
+	{ { { 32, 0, 2 }, { 16, 63, 0 } } },
+	{ { { 32, 0, 1 }, { 17, 62, 0 } } },
+	{ { { 32, 0, 0 }, { 25, 47, 0 } } },
+	{ { { 32, 0, 1 }, { 17, 63, 0 } } },
+	{ { { 32, 0, 2 }, { 18, 62, 0 } } },
+	{ { { 33, 0, 1 }, { 18, 63, 0 } } },
+	{ { { 33, 0, 0 }, { 27, 46, 0 } } },
+	{ { { 33, 0, 1 }, { 19, 62, 0 } } },
+	{ { { 33, 0, 2 }, { 19, 63, 0 } } },
+	{ { { 34, 0, 1 }, { 20, 62, 0 } } },
+	{ { { 34, 0, 0 }, { 28, 47, 0 } } },
+	{ { { 34, 0, 1 }, { 20, 63, 0 } } },
+	{ { { 34, 0, 2 }, { 21, 62, 0 } } },
+	{ { { 35, 0, 1 }, { 21, 63, 0 } } },
+	{ { { 35, 0, 0 }, { 30, 46, 0 } } },
+	{ { { 35, 0, 1 }, { 22, 62, 0 } } },
+	{ { { 35, 0, 2 }, { 22, 63, 0 } } },
+	{ { { 36, 0, 1 }, { 23, 62, 0 } } },
+	{ { { 36, 0, 0 }, { 31, 47, 0 } } },
+	{ { { 36, 0, 1 }, { 23, 63, 0 } } },
+	{ { { 36, 0, 2 }, { 24, 62, 0 } } },
+	{ { { 37, 0, 1 }, { 24, 63, 0 } } },
+	{ { { 37, 0, 0 }, { 32, 47, 0 } } },
+	{ { { 37, 0, 1 }, { 25, 62, 0 } } },
+	{ { { 37, 0, 2 }, { 25, 63, 0 } } },
+	{ { { 38, 0, 1 }, { 26, 62, 0 } } },
+	{ { { 38, 0, 0 }, { 32, 50, 0 } } },
+	{ { { 38, 0, 1 }, { 26, 63, 0 } } },
+	{ { { 38, 0, 2 }, { 27, 62, 0 } } },
+	{ { { 39, 0, 1 }, { 27, 63, 0 } } },
+	{ { { 39, 0, 0 }, { 32, 53, 0 } } },
+	{ { { 39, 0, 1 }, { 28, 62, 0 } } },
+	{ { { 39, 0, 2 }, { 28, 63, 0 } } },
+	{ { { 40, 0, 1 }, { 29, 62, 0 } } },
+	{ { { 40, 0, 0 }, { 32, 56, 0 } } },
+	{ { { 40, 0, 1 }, { 29, 63, 0 } } },
+	{ { { 40, 0, 2 }, { 30, 62, 0 } } },
+	{ { { 41, 0, 1 }, { 30, 63, 0 } } },
+	{ { { 41, 0, 0 }, { 32, 59, 0 } } },
+	{ { { 41, 0, 1 }, { 31, 62, 0 } } },
+	{ { { 41, 0, 2 }, { 31, 63, 0 } } },
+	{ { { 42, 0, 1 }, { 32, 61, 0 } } },
+	{ { { 42, 0, 0 }, { 32, 62, 0 } } },
+	{ { { 42, 0, 1 }, { 32, 63, 0 } } },
+	{ { { 42, 0, 2 }, { 41, 46, 0 } } },
+	{ { { 43, 0, 1 }, { 33, 62, 0 } } },
+	{ { { 43, 0, 0 }, { 33, 63, 0 } } },
+	{ { { 43, 0, 1 }, { 34, 62, 0 } } },
+	{ { { 43, 0, 2 }, { 42, 47, 0 } } },
+	{ { { 44, 0, 1 }, { 34, 63, 0 } } },
+	{ { { 44, 0, 0 }, { 35, 62, 0 } } },
+	{ { { 44, 0, 1 }, { 35, 63, 0 } } },
+	{ { { 44, 0, 2 }, { 44, 46, 0 } } },
+	{ { { 45, 0, 1 }, { 36, 62, 0 } } },
+	{ { { 45, 0, 0 }, { 36, 63, 0 } } },
+	{ { { 45, 0, 1 }, { 37, 62, 0 } } },
+	{ { { 45, 0, 2 }, { 45, 47, 0 } } },
+	{ { { 46, 0, 1 }, { 37, 63, 0 } } },
+	{ { { 46, 0, 0 }, { 38, 62, 0 } } },
+	{ { { 46, 0, 1 }, { 38, 63, 0 } } },
+	{ { { 46, 0, 2 }, { 47, 46, 0 } } },
+	{ { { 47, 0, 1 }, { 39, 62, 0 } } },
+	{ { { 47, 0, 0 }, { 39, 63, 0 } } },
+	{ { { 47, 0, 1 }, { 40, 62, 0 } } },
+	{ { { 47, 0, 2 }, { 48, 46, 0 } } },
+	{ { { 48, 0, 2 }, { 40, 63, 0 } } },
+	{ { { 48, 0, 1 }, { 41, 62, 0 } } },
+	{ { { 48, 0, 0 }, { 41, 63, 0 } } },
+	{ { { 48, 0, 1 }, { 48, 49, 0 } } },
+	{ { { 48, 0, 2 }, { 42, 62, 0 } } },
+	{ { { 49, 0, 1 }, { 42, 63, 0 } } },
+	{ { { 49, 0, 0 }, { 43, 62, 0 } } },
+	{ { { 49, 0, 1 }, { 48, 52, 0 } } },
+	{ { { 49, 0, 2 }, { 43, 63, 0 } } },
+	{ { { 50, 0, 1 }, { 44, 62, 0 } } },
+	{ { { 50, 0, 0 }, { 44, 63, 0 } } },
+	{ { { 50, 0, 1 }, { 48, 55, 0 } } },
+	{ { { 50, 0, 2 }, { 45, 62, 0 } } },
+	{ { { 51, 0, 1 }, { 45, 63, 0 } } },
+	{ { { 51, 0, 0 }, { 46, 62, 0 } } },
+	{ { { 51, 0, 1 }, { 48, 58, 0 } } },
+	{ { { 51, 0, 2 }, { 46, 63, 0 } } },
+	{ { { 52, 0, 1 }, { 47, 62, 0 } } },
+	{ { { 52, 0, 0 }, { 47, 63, 0 } } },
+	{ { { 52, 0, 1 }, { 48, 61, 0 } } },
+	{ { { 52, 0, 2 }, { 48, 62, 0 } } },
+	{ { { 53, 0, 1 }, { 56, 47, 0 } } },
+	{ { { 53, 0, 0 }, { 48, 63, 0 } } },
+	{ { { 53, 0, 1 }, { 49, 62, 0 } } },
+	{ { { 53, 0, 2 }, { 49, 63, 0 } } },
+	{ { { 54, 0, 1 }, { 58, 46, 0 } } },
+	{ { { 54, 0, 0 }, { 50, 62, 0 } } },
+	{ { { 54, 0, 1 }, { 50, 63, 0 } } },
+	{ { { 54, 0, 2 }, { 51, 62, 0 } } },
+	{ { { 55, 0, 1 }, { 59, 47, 0 } } },
+	{ { { 55, 0, 0 }, { 51, 63, 0 } } },
+	{ { { 55, 0, 1 }, { 52, 62, 0 } } },
+	{ { { 55, 0, 2 }, { 52, 63, 0 } } },
+	{ { { 56, 0, 1 }, { 61, 46, 0 } } },
+	{ { { 56, 0, 0 }, { 53, 62, 0 } } },
+	{ { { 56, 0, 1 }, { 53, 63, 0 } } },
+	{ { { 56, 0, 2 }, { 54, 62, 0 } } },
+	{ { { 57, 0, 1 }, { 62, 47, 0 } } },
+	{ { { 57, 0, 0 }, { 54, 63, 0 } } },
+	{ { { 57, 0, 1 }, { 55, 62, 0 } } },
+	{ { { 57, 0, 2 }, { 55, 63, 0 } } },
+	{ { { 58, 0, 1 }, { 56, 62, 1 } } },
+	{ { { 58, 0, 0 }, { 56, 62, 0 } } },
+	{ { { 58, 0, 1 }, { 56, 63, 0 } } },
+	{ { { 58, 0, 2 }, { 57, 62, 0 } } },
+	{ { { 59, 0, 1 }, { 57, 63, 1 } } },
+	{ { { 59, 0, 0 }, { 57, 63, 0 } } },
+	{ { { 59, 0, 1 }, { 58, 62, 0 } } },
+	{ { { 59, 0, 2 }, { 58, 63, 0 } } },
+	{ { { 60, 0, 1 }, { 59, 62, 1 } } },
+	{ { { 60, 0, 0 }, { 59, 62, 0 } } },
+	{ { { 60, 0, 1 }, { 59, 63, 0 } } },
+	{ { { 60, 0, 2 }, { 60, 62, 0 } } },
+	{ { { 61, 0, 1 }, { 60, 63, 1 } } },
+	{ { { 61, 0, 0 }, { 60, 63, 0 } } },
+	{ { { 61, 0, 1 }, { 61, 62, 0 } } },
+	{ { { 61, 0, 2 }, { 61, 63, 0 } } },
+	{ { { 62, 0, 1 }, { 62, 62, 1 } } },
+	{ { { 62, 0, 0 }, { 62, 62, 0 } } },
+	{ { { 62, 0, 1 }, { 62, 63, 0 } } },
+	{ { { 62, 0, 2 }, { 63, 62, 0 } } },
+	{ { { 63, 0, 1 }, { 63, 63, 1 } } },
+	{ { { 63, 0, 0 }, { 63, 63, 0 } } }
+};
diff --git a/3rdparty/bimg/3rdparty/libsquish/squish.cpp b/3rdparty/bimg/3rdparty/libsquish/squish.cpp
new file mode 100644
index 0000000..cd91f87
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/squish.cpp
@@ -0,0 +1,260 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#include "squish.h"
+#include "colourset.h"
+#include "maths.h"
+#include "rangefit.h"
+#include "clusterfit.h"
+#include "colourblock.h"
+#include "alpha.h"
+#include "singlecolourfit.h"
+
+namespace squish {
+
+static int FixFlags( int flags ) // returns flags reduced to one method, one colour fit and the alpha weighting bit
+{
+	// grab the flag bits (any bits outside these three groups are dropped)
+	int method = flags & ( kDxt1 | kDxt3 | kDxt5 | kBc4 | kBc5 );
+	int fit = flags & ( kColourIterativeClusterFit | kColourClusterFit | kColourRangeFit );
+	int extra = flags & kWeightColourByAlpha;
+	
+	// set defaults: anything other than exactly one of DXT3, DXT5, BC4 or BC5 becomes DXT1
+	if ( method != kDxt3
+	&&   method != kDxt5
+	&&   method != kBc4
+	&&   method != kBc5 )
+	{
+		method = kDxt1;
+	}
+	if( fit != kColourRangeFit && fit != kColourIterativeClusterFit ) // likewise, cluster fit is the fallback
+		fit = kColourClusterFit;
+		
+	// done
+	return method | fit | extra;
+}
+
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	if ( ( flags & ( kBc4 | kBc5 ) ) != 0 ) // BC4/BC5: red (and green for BC5) go through the DXT5 alpha compressor
+	{
+		u8 alpha[16*4]; // scratch pixels; only the alpha byte of each pixel is filled in below
+		for( int i = 0; i < 16; ++i )
+		{
+			alpha[i*4 + 3] = rgba[i*4 + 0]; // copy R to A
+		}
+
+		u8* rBlock = reinterpret_cast< u8* >( block ); // red data is written at the start of the output
+		CompressAlphaDxt5( alpha, mask, rBlock );
+
+		if ( ( flags & ( kBc5 ) ) != 0 )
+		{
+			for( int i = 0; i < 16; ++i )
+			{
+				alpha[i*4 + 3] = rgba[i*4 + 1]; // copy G to A
+			}
+
+			u8* gBlock = reinterpret_cast< u8* >( block ) + 8; // green data is written 8 bytes into the output
+			CompressAlphaDxt5( alpha, mask, gBlock );
+		}
+
+		return; // no colour fit is performed for BC4/BC5
+	}
+
+	// get the block locations: for DXT3/DXT5 alpha is written at the start and colour 8 bytes in
+	void* colourBlock = block;
+	void* alphaBlock = block;
+	if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
+		colourBlock = reinterpret_cast< u8* >( block ) + 8;
+
+	// create the minimal point set
+	ColourSet colours( rgba, mask, flags );
+	
+	// check the compression type and compress colour
+	if( colours.GetCount() == 1 )
+	{
+		// always do a single colour fit
+		SingleColourFit fit( &colours, flags );
+		fit.Compress( colourBlock );
+	}
+	else if( ( flags & kColourRangeFit ) != 0 || colours.GetCount() == 0 )
+	{
+		// do a range fit (also chosen when the colour set is empty, whatever fit was requested)
+		RangeFit fit( &colours, flags, metric );
+		fit.Compress( colourBlock );
+	}
+	else
+	{
+		// default to a cluster fit (could be iterative or not)
+		ClusterFit fit( &colours, flags, metric );
+		fit.Compress( colourBlock );
+	}
+	
+	// compress alpha separately if necessary (DXT1 writes no separate alpha data)
+	if( ( flags & kDxt3 ) != 0 )
+		CompressAlphaDxt3( rgba, mask, alphaBlock );
+	else if( ( flags & kDxt5 ) != 0 )
+		CompressAlphaDxt5( rgba, mask, alphaBlock );
+}
+
+void Decompress( u8* rgba, void const* block, int flags )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	// get the block locations: for DXT3/DXT5 alpha is read from the start and colour from 8 bytes in
+	void const* colourBlock = block;
+	void const* alphaBock = block;
+	if( ( flags & ( kDxt3 | kDxt5 ) ) != 0 )
+		colourBlock = reinterpret_cast< u8 const* >( block ) + 8;
+
+	// decompress colour; NOTE(review): kBc4/kBc5 are not special-cased here and also take this path - confirm intended
+	DecompressColour( rgba, colourBlock, ( flags & kDxt1 ) != 0 );
+
+	// decompress alpha separately if necessary
+	if( ( flags & kDxt3 ) != 0 )
+		DecompressAlphaDxt3( rgba, alphaBock );
+	else if( ( flags & kDxt5 ) != 0 )
+		DecompressAlphaDxt5( rgba, alphaBock );
+}
+
+int GetStorageRequirements( int width, int height, int flags )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+	
+	// compute the storage requirements: one block per 4x4 tile (dimensions rounded up to a multiple of 4)
+	int blockcount = ( ( width + 3 )/4 ) * ( ( height + 3 )/4 );
+	int blocksize = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16; // DXT1 and BC4 blocks are 8 bytes, all others 16
+	return blockcount*blocksize;
+}
+
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	// initialise the block output
+	u8* targetBlock = reinterpret_cast< u8* >( blocks );
+	int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16; // DXT1 and BC4 blocks are 8 bytes, all others 16
+
+	// loop over blocks, row by row, writing the compressed blocks consecutively
+	for( int y = 0; y < height; y += 4 )
+	{
+		for( int x = 0; x < width; x += 4 )
+		{
+			// build the 4x4 block of pixels
+			u8 sourceRgba[16*4];
+			u8* targetPixel = sourceRgba;
+			int mask = 0; // bit ( 4*py + px ) gets set for every block pixel that lies inside the image
+			for( int py = 0; py < 4; ++py )
+			{
+				for( int px = 0; px < 4; ++px )
+				{
+					// get the source pixel in the image
+					int sx = x + px;
+					int sy = y + py;
+					
+					// enable if we're in the image
+					if( sx < width && sy < height )
+					{
+						// copy the rgba value (rows are width pixels long, 4 bytes per pixel)
+						u8 const* sourcePixel = rgba + 4*( width*sy + sx );
+						for( int i = 0; i < 4; ++i )
+							*targetPixel++ = *sourcePixel++;
+							
+						// enable this pixel
+						mask |= ( 1 << ( 4*py + px ) );
+					}
+					else
+					{
+						// skip this pixel as it's outside the image (its bytes in sourceRgba stay unset)
+						targetPixel += 4;
+					}
+				}
+			}
+			
+			// compress it into the output
+			CompressMasked( sourceRgba, mask, targetBlock, flags, metric );
+			
+			// advance
+			targetBlock += bytesPerBlock;
+		}
+	}
+}
+
+void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags )
+{
+	// fix any bad flags
+	flags = FixFlags( flags );
+
+	// initialise the block input
+	u8 const* sourceBlock = reinterpret_cast< u8 const* >( blocks );
+	int bytesPerBlock = ( ( flags & ( kDxt1 | kBc4 ) ) != 0 ) ? 8 : 16; // DXT1 and BC4 blocks are 8 bytes, all others 16
+
+	// loop over blocks, row by row, reading the compressed blocks consecutively
+	for( int y = 0; y < height; y += 4 )
+	{
+		for( int x = 0; x < width; x += 4 )
+		{
+			// decompress the block into a temporary 4x4 rgba tile
+			u8 targetRgba[4*16];
+			Decompress( targetRgba, sourceBlock, flags );
+			
+			// write the decompressed pixels to the correct image locations
+			u8 const* sourcePixel = targetRgba;
+			for( int py = 0; py < 4; ++py )
+			{
+				for( int px = 0; px < 4; ++px )
+				{
+					// get the target location
+					int sx = x + px;
+					int sy = y + py;
+					if( sx < width && sy < height )
+					{
+						u8* targetPixel = rgba + 4*( width*sy + sx ); // rows are width pixels long, 4 bytes per pixel
+						
+						// copy the rgba value
+						for( int i = 0; i < 4; ++i )
+							*targetPixel++ = *sourcePixel++;
+					}
+					else
+					{
+						// skip this pixel as it's outside the image (the decoded value is discarded)
+						sourcePixel += 4;
+					}
+				}
+			}
+			
+			// advance
+			sourceBlock += bytesPerBlock;
+		}
+	}
+}
+
+} // namespace squish
diff --git a/3rdparty/bimg/3rdparty/libsquish/squish.h b/3rdparty/bimg/3rdparty/libsquish/squish.h
new file mode 100644
index 0000000..175375f
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/libsquish/squish.h
@@ -0,0 +1,269 @@
+/* -----------------------------------------------------------------------------
+
+	Copyright (c) 2006 Simon Brown                          si@sjbrown.co.uk
+
+	Permission is hereby granted, free of charge, to any person obtaining
+	a copy of this software and associated documentation files (the 
+	"Software"), to	deal in the Software without restriction, including
+	without limitation the rights to use, copy, modify, merge, publish,
+	distribute, sublicense, and/or sell copies of the Software, and to 
+	permit persons to whom the Software is furnished to do so, subject to 
+	the following conditions:
+
+	The above copyright notice and this permission notice shall be included
+	in all copies or substantial portions of the Software.
+
+	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 
+	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+	
+   -------------------------------------------------------------------------- */
+   
+#ifndef SQUISH_H
+#define SQUISH_H
+
+//! All squish API functions live in this namespace.
+namespace squish {
+
+// -----------------------------------------------------------------------------
+
+//! Typedef a quantity that is a single unsigned byte.
+typedef unsigned char u8;
+
+// -----------------------------------------------------------------------------
+
+enum // flag bits accepted by the flags parameters of the functions declared below
+{
+	//! Use DXT1 compression (the default method; 8 bytes per block).
+	kDxt1 = ( 1 << 0 ),
+
+	//! Use DXT3 compression (16 bytes per block).
+	kDxt3 = ( 1 << 1 ),
+
+	//! Use DXT5 compression (16 bytes per block).
+	kDxt5 = ( 1 << 2 ),
+
+	//! Use BC4 compression (encodes the red channel only; 8 bytes per block).
+	kBc4 = ( 1 << 3 ),
+
+	//! Use BC5 compression (encodes the red and green channels; 16 bytes per block).
+	kBc5 = ( 1 << 4 ),
+
+	//! Use a slow but high quality colour compressor (the default).
+	kColourClusterFit = ( 1 << 5 ),
+
+	//! Use a fast but low quality colour compressor.
+	kColourRangeFit	= ( 1 << 6 ),
+
+	//! Weight the colour by alpha during cluster fit (disabled by default).
+	kWeightColourByAlpha = ( 1 << 7 ),
+
+	//! Use a very slow but very high quality colour compressor.
+	kColourIterativeClusterFit = ( 1 << 8 ),
+};
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+	@param rgba		The rgba values of the 16 source pixels.
+	@param mask		The valid pixel mask.
+	@param block	Storage for the compressed DXT block.
+	@param flags	Compression flags.
+	@param metric	An optional perceptual metric.
+	
+	The source pixels should be presented as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+		
+	The mask parameter enables only certain pixels within the block. The lowest
+	bit enables the first pixel and so on up to the 16th bit. Bits beyond the
+	16th bit are ignored. Pixels that are not enabled are allowed to take
+	arbitrary colours in the output block. An example of how this can be used
+	is in the CompressImage function to disable pixels outside the bounds of
+	the image when the width or height is not divisible by 4.
+	
+	The flags parameter should specify one of kDxt1, kDxt3, kDxt5, kBc4 or kBc5
+	compression, however, DXT1 will be used by default if none is specified. DXT1
+	and BC4 compression require 8 bytes of storage for the compressed block; DXT3,
+	DXT5 and BC5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor to use 
+	when fitting the RGB components of the data. Possible colour compressors 
+	are: kColourClusterFit (the default), kColourRangeFit (very fast, low 
+	quality) or kColourIterativeClusterFit (slowest, best quality).
+		
+	When using kColourClusterFit or kColourIterativeClusterFit, an additional 
+	flag can be specified to weight the importance of each pixel by its alpha 
+	value. For images that are rendered using alpha blending, this can 
+	significantly increase the perceived quality.
+	
+	The metric parameter can be used to weight the relative importance of each
+	colour channel, or pass NULL to use the default uniform weight of 
+	{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that 
+	allowed either uniform or "perceptual" weights with the fixed values
+	{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a 
+	contiguous array of 3 floats.
+*/
+void CompressMasked( u8 const* rgba, int mask, void* block, int flags, float* metric = 0 );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses a 4x4 block of pixels.
+
+	@param rgba		The rgba values of the 16 source pixels.
+	@param block	Storage for the compressed DXT block.
+	@param flags	Compression flags.
+	@param metric	An optional perceptual metric.
+	
+	The source pixels should be presented as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+	
+	The flags parameter should specify one of kDxt1, kDxt3, kDxt5, kBc4 or kBc5
+	compression, however, DXT1 will be used by default if none is specified. DXT1
+	and BC4 compression require 8 bytes of storage for the compressed block; DXT3,
+	DXT5 and BC5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor to use 
+	when fitting the RGB components of the data. Possible colour compressors 
+	are: kColourClusterFit (the default), kColourRangeFit (very fast, low 
+	quality) or kColourIterativeClusterFit (slowest, best quality).
+		
+	When using kColourClusterFit or kColourIterativeClusterFit, an additional 
+	flag can be specified to weight the importance of each pixel by its alpha 
+	value. For images that are rendered using alpha blending, this can 
+	significantly increase the perceived quality.
+	
+	The metric parameter can be used to weight the relative importance of each
+	colour channel, or pass NULL to use the default uniform weight of 
+	{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that 
+	allowed either uniform or "perceptual" weights with the fixed values
+	{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a 
+	contiguous array of 3 floats.
+	
+	This method is an inline that calls CompressMasked with a mask of 0xffff, 
+	provided for compatibility with older versions of squish.
+*/
+inline void Compress( u8 const* rgba, void* block, int flags, float* metric = 0 )
+{
+	CompressMasked( rgba, 0xffff, block, flags, metric ); // 0xffff sets all 16 mask bits, enabling every pixel
+}
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Decompresses a 4x4 block of pixels.
+
+	@param rgba		Storage for the 16 decompressed pixels.
+	@param block	The compressed DXT block.
+	@param flags	Compression flags.
+
+	The decompressed pixels will be written as a contiguous array of 16 rgba
+	values, with each component as 1 byte each. In memory this is:
+	
+		{ r1, g1, b1, a1, .... , r16, g16, b16, a16 }
+	
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. All other flags 
+	are ignored.
+*/
+void Decompress( u8* rgba, void const* block, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Computes the amount of compressed storage required.
+
+	@param width	The width of the image.
+	@param height	The height of the image.
+	@param flags	Compression flags.
+	
+	The flags parameter should specify one of kDxt1, kDxt3, kDxt5, kBc4 or kBc5
+	compression, however, DXT1 will be used by default if none is specified. All
+	other flags are ignored.
+	
+	Most DXT images will be a multiple of 4 in each dimension, but this 
+	function supports arbitrary size images by allowing the outer blocks to
+	be only partially used.
+*/
+int GetStorageRequirements( int width, int height, int flags );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Compresses an image in memory.
+
+	@param rgba		The pixels of the source.
+	@param width	The width of the source image.
+	@param height	The height of the source image.
+	@param blocks	Storage for the compressed output.
+	@param flags	Compression flags.
+	@param metric	An optional perceptual metric.
+	
+	The source pixels should be presented as a contiguous array of width*height
+	rgba values, with each component as 1 byte each. In memory this should be:
+	
+		{ r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
+		
+	The flags parameter should specify one of kDxt1, kDxt3, kDxt5, kBc4 or kBc5
+	compression, however, DXT1 will be used by default if none is specified. DXT1
+	and BC4 compression require 8 bytes of storage for each compressed block; DXT3,
+	DXT5 and BC5 compression require 16 bytes of storage per block.
+	
+	The flags parameter can also specify a preferred colour compressor to use 
+	when fitting the RGB components of the data. Possible colour compressors 
+	are: kColourClusterFit (the default), kColourRangeFit (very fast, low 
+	quality) or kColourIterativeClusterFit (slowest, best quality).
+		
+	When using kColourClusterFit or kColourIterativeClusterFit, an additional 
+	flag can be specified to weight the importance of each pixel by its alpha 
+	value. For images that are rendered using alpha blending, this can 
+	significantly increase the perceived quality.
+	
+	The metric parameter can be used to weight the relative importance of each
+	colour channel, or pass NULL to use the default uniform weight of 
+	{ 1.0f, 1.0f, 1.0f }. This replaces the previous flag-based control that 
+	allowed either uniform or "perceptual" weights with the fixed values
+	{ 0.2126f, 0.7152f, 0.0722f }. If non-NULL, the metric should point to a 
+	contiguous array of 3 floats.
+	
+	Internally this function calls squish::CompressMasked for each block, which 
+	allows for pixels outside the image to take arbitrary values. The function 
+	squish::GetStorageRequirements can be called to compute the amount of memory
+	to allocate for the compressed output.
+*/
+void CompressImage( u8 const* rgba, int width, int height, void* blocks, int flags, float* metric = 0 );
+
+// -----------------------------------------------------------------------------
+
+/*! @brief Decompresses an image in memory.
+
+	@param rgba		Storage for the decompressed pixels.
+	@param width	The width of the source image.
+	@param height	The height of the source image.
+	@param blocks	The compressed DXT blocks.
+	@param flags	Compression flags.
+	
+	The decompressed pixels will be written as a contiguous array of width*height
+	rgba values, with each component as 1 byte each. In memory this is:
+	
+		{ r1, g1, b1, a1, .... , rn, gn, bn, an } for n = width*height
+		
+	The flags parameter should specify either kDxt1, kDxt3 or kDxt5 compression, 
+	however, DXT1 will be used by default if none is specified. All other flags 
+	are ignored.
+
+	Internally this function calls squish::Decompress for each block.
+*/
+void DecompressImage( u8* rgba, int width, int height, void const* blocks, int flags );
+
+// -----------------------------------------------------------------------------
+
+} // namespace squish
+
+#endif // ndef SQUISH_H
+
diff --git a/3rdparty/bimg/3rdparty/lodepng/README.md b/3rdparty/bimg/3rdparty/lodepng/README.md
new file mode 100644
index 0000000..35f9254
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/lodepng/README.md
@@ -0,0 +1,10 @@
+LodePNG
+-------
+
+PNG encoder and decoder in C and C++.
+
+Home page: http://lodev.org/lodepng/
+
+Only two files are needed to allow your program to read and write PNG files: lodepng.cpp and lodepng.h.
+
+The other files in the project are just examples, unit tests, etc...
diff --git a/3rdparty/bimg/3rdparty/lodepng/lodepng.cpp b/3rdparty/bimg/3rdparty/lodepng/lodepng.cpp
new file mode 100644
index 0000000..7baf7f9
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/lodepng/lodepng.cpp
@@ -0,0 +1,6224 @@
+/*
+LodePNG version 20160501
+
+Copyright (c) 2005-2016 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+/*
+The manual and changelog are in the header file "lodepng.h"
+Rename this file to lodepng.cpp to use it for C++, or to lodepng.c to use it for C.
+*/
+
+#include "lodepng.h"
+
+#include <limits.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#if defined(_MSC_VER) && (_MSC_VER >= 1310) /*Visual Studio: A few warning types are not desired here.*/
+#pragma warning( disable : 4244 ) /*implicit conversions: not warned by gcc -Wall -Wextra and requires too much casts*/
+#pragma warning( disable : 4996 ) /*VS does not like fopen, but fopen_s is not standard C so unusable here*/
+#endif /*_MSC_VER */
+
+const char* LODEPNG_VERSION_STRING = "20160501";
+
+/*
+This source file is built up in the following large parts. The code sections
+with the "LODEPNG_COMPILE_" #defines divide this up further in an intermixed way.
+-Tools for C and common code for PNG and Zlib
+-C Code for Zlib (huffman, deflate, ...)
+-C Code for PNG (file format chunks, adam7, PNG filters, color conversions, ...)
+-The C++ wrapper around all of the above
+*/
+
+/*The malloc, realloc and free functions defined here with "lodepng_" in front
+of the name, so that you can easily change them to others related to your
+platform if needed. Everything else in the code calls these. Pass
+-DLODEPNG_NO_COMPILE_ALLOCATORS to the compiler, or comment out
+#define LODEPNG_COMPILE_ALLOCATORS in the header, to disable the ones here and
+define them in your own project's source files without needing to change
+lodepng source code. Don't forget to remove "static" if you copypaste them
+from here.*/
+
+#ifdef LODEPNG_COMPILE_ALLOCATORS
+static void* lodepng_malloc(size_t size)
+{
+  return malloc(size);
+}
+
+static void* lodepng_realloc(void* ptr, size_t new_size)
+{
+  return realloc(ptr, new_size);
+}
+
+static void lodepng_free(void* ptr)
+{
+  free(ptr);
+}
+#else /*LODEPNG_COMPILE_ALLOCATORS*/
+void* lodepng_malloc(size_t size);
+void* lodepng_realloc(void* ptr, size_t new_size);
+void lodepng_free(void* ptr);
+#endif /*LODEPNG_COMPILE_ALLOCATORS*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // Tools for C, and common code for PNG and Zlib.                       // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Often in case of an error a value is assigned to a variable and then it breaks
+out of a loop (to go to the cleanup phase of a function). This macro does that.
+It makes the error handling code shorter and more readable.
+
+Example: if(!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83);
+*/
+#define CERROR_BREAK(errorvar, code)\
+{\
+  errorvar = code;\
+  break;\
+}
+
+/*version of CERROR_BREAK that assumes the common case where the error variable is named "error"*/
+#define ERROR_BREAK(code) CERROR_BREAK(error, code)
+
+/*Set error var to the error code, and return it.*/
+#define CERROR_RETURN_ERROR(errorvar, code)\
+{\
+  errorvar = code;\
+  return code;\
+}
+
+/*Try the code, if it returns error, also return the error.*/
+#define CERROR_TRY_RETURN(call)\
+{\
+  unsigned error = call;\
+  if(error) return error;\
+}
+
+/*Set error var to the error code, and return from the void function.*/
+#define CERROR_RETURN(errorvar, code)\
+{\
+  errorvar = code;\
+  return;\
+}
+
+/*
+About uivector, ucvector and string:
+-All of them wrap dynamic arrays or text strings in a similar way.
+-LodePNG was originally written in C++. The vectors replace the std::vectors that were used in the C++ version.
+-The string tools are made to avoid problems with compilers that declare things like strncat as deprecated.
+-They're not used in the interface, only internally in this file as static functions.
+-As with many other structs in this file, the init and cleanup functions serve as ctor and dtor.
+*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*dynamic vector of unsigned ints; set up with uivector_init, released with uivector_cleanup*/
+typedef struct uivector
+{
+  unsigned* data; /*element storage, NULL while nothing is allocated*/
+  size_t size; /*number of elements in use (counted in unsigned ints, not bytes)*/
+  size_t allocsize; /*allocated size in bytes*/
+} uivector;
+
+static void uivector_cleanup(void* p)
+{
+  ((uivector*)p)->size = ((uivector*)p)->allocsize = 0;
+  lodepng_free(((uivector*)p)->data);
+  ((uivector*)p)->data = NULL;
+}
+
+/*makes sure at least allocsize bytes are allocated. returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_reserve(uivector* p, size_t allocsize)
+{
+  size_t grown;
+  void* block;
+  if(allocsize <= p->allocsize) return 1; /*already large enough*/
+
+  /*take the request as is when it more than doubles the buffer, otherwise add 50% slack*/
+  grown = (allocsize > p->allocsize * 2) ? allocsize : (allocsize * 3 / 2);
+  block = lodepng_realloc(p->data, grown);
+  if(!block) return 0; /*error: not enough memory*/
+
+  p->allocsize = grown;
+  p->data = (unsigned*)block;
+  return 1;
+}
+
+/*returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_resize(uivector* p, size_t size)
+{
+  if(!uivector_reserve(p, size * sizeof(unsigned))) return 0;
+  p->size = size;
+  return 1; /*success*/
+}
+
+/*resizes the vector to size elements; any newly added elements are set to value. returns 1 if success, 0 if failure*/
+static unsigned uivector_resizev(uivector* p, size_t size, unsigned value)
+{
+  size_t pos = p->size; /*first index that did not exist before the resize*/
+  if(uivector_resize(p, size) == 0) return 0;
+  while(pos < size) p->data[pos++] = value;
+  return 1;
+}
+
+static void uivector_init(uivector* p)
+{
+  p->data = NULL;
+  p->size = p->allocsize = 0;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned uivector_push_back(uivector* p, unsigned c)
+{
+  if(!uivector_resize(p, p->size + 1)) return 0;
+  p->data[p->size - 1] = c;
+  return 1;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+/*dynamic vector of unsigned chars*/
+typedef struct ucvector
+{
+  unsigned char* data;
+  size_t size; /*used size*/
+  size_t allocsize; /*allocated size*/
+} ucvector;
+
+/*returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_reserve(ucvector* p, size_t allocsize)
+{
+  if(allocsize > p->allocsize)
+  {
+    size_t newsize = (allocsize > p->allocsize * 2) ? allocsize : (allocsize * 3 / 2);
+    void* data = lodepng_realloc(p->data, newsize);
+    if(data)
+    {
+      p->allocsize = newsize;
+      p->data = (unsigned char*)data;
+    }
+    else return 0; /*error: not enough memory*/
+  }
+  return 1;
+}
+
+/*returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_resize(ucvector* p, size_t size)
+{
+  if(!ucvector_reserve(p, size * sizeof(unsigned char))) return 0;
+  p->size = size;
+  return 1; /*success*/
+}
+
+#ifdef LODEPNG_COMPILE_PNG
+
+static void ucvector_cleanup(void* p)
+{
+  ((ucvector*)p)->size = ((ucvector*)p)->allocsize = 0;
+  lodepng_free(((ucvector*)p)->data);
+  ((ucvector*)p)->data = NULL;
+}
+
+static void ucvector_init(ucvector* p)
+{
+  p->data = NULL;
+  p->size = p->allocsize = 0;
+}
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*you can both convert from vector to buffer&size and vice versa. If you use
+init_buffer to take over a buffer and size, it is not needed to use cleanup*/
+static void ucvector_init_buffer(ucvector* p, unsigned char* buffer, size_t size)
+{
+  p->data = buffer; /*the vector points at the caller's buffer, nothing is copied*/
+  p->allocsize = p->size = size; /*used size and allocated size both equal the buffer size*/
+}
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#if (defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)
+/*appends c at the end of the vector. returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned ucvector_push_back(ucvector* p, unsigned char c)
+{
+  if(!ucvector_resize(p, p->size + 1)) return 0;
+  p->data[p->size - 1] = c; /*size was already incremented by the resize, so size - 1 is the new last slot*/
+  return 1;
+}
+#endif /*(defined(LODEPNG_COMPILE_PNG) && defined(LODEPNG_COMPILE_ANCILLARY_CHUNKS)) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*returns 1 if success, 0 if failure ==> nothing done*/
+static unsigned string_resize(char** out, size_t size)
+{
+  char* data = (char*)lodepng_realloc(*out, size + 1);
+  if(data)
+  {
+    data[size] = 0; /*null termination char*/
+    *out = data;
+  }
+  return data != 0;
+}
+
+/*init a char* for use as string: it becomes an allocated empty string, or stays NULL if that allocation fails*/
+static void string_init(char** out)
+{
+  *out = NULL;
+  string_resize(out, 0); /*realloc of NULL to 1 byte holding the terminator; the result is not checked*/
+}
+
+/*free the above pair again*/
+static void string_cleanup(char** out)
+{
+  lodepng_free(*out);
+  *out = NULL;
+}
+
+/*replaces the contents of *out with a copy of the null terminated string in.
+If the resize fails, *out keeps its previous contents.*/
+static void string_set(char** out, const char* in)
+{
+  size_t len = strlen(in);
+  size_t pos = 0;
+  char* dst;
+  if(!string_resize(out, len)) return;
+  dst = *out; /*string_resize already wrote the terminator at dst[len]*/
+  while(pos != len) { dst[pos] = in[pos]; ++pos; }
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+unsigned lodepng_read32bitInt(const unsigned char* buffer) /*reads buffer[0..3] as a big endian 32-bit unsigned*/
+{ /*each byte is widened to unsigned before shifting: as a promoted int, buffer[0] << 24 overflows for bytes >= 128*/
+  return ((unsigned)buffer[0] << 24) | ((unsigned)buffer[1] << 16) | ((unsigned)buffer[2] << 8) | (unsigned)buffer[3];
+}
+
+#if defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)
+/*stores value in buffer[0..3], most significant byte first. buffer must have at least 4 allocated bytes available*/
+static void lodepng_set32bitInt(unsigned char* buffer, unsigned value)
+{
+  unsigned shift = 32;
+  unsigned pos = 0;
+  /*walk from the top byte down to the lowest one*/
+  while(shift != 0) { shift -= 8; buffer[pos++] = (unsigned char)((value >> shift) & 0xff); }
+}
+#endif /*defined(LODEPNG_COMPILE_PNG) || defined(LODEPNG_COMPILE_ENCODER)*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+static void lodepng_add32bitInt(ucvector* buffer, unsigned value) /*appends value as 4 big endian bytes*/
+{
+  if(!ucvector_resize(buffer, buffer->size + 4)) return; /*out of memory: leave the vector untouched instead of writing into (or before) the old data. todo: give error*/
+  lodepng_set32bitInt(&buffer->data[buffer->size - 4], value);
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / File IO                                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DISK
+
+/* returns negative value on error. This should be pure C compatible, so no fstat. */
+static long lodepng_filesize(const char* filename)
+{
+  FILE* file;
+  long size;
+  file = fopen(filename, "rb");
+  if(!file) return -1;
+
+  if(fseek(file, 0, SEEK_END) != 0)
+  {
+    fclose(file);
+    return -1;
+  }
+
+  size = ftell(file);
+  /* It may give LONG_MAX as directory size, this is invalid for us. */
+  if(size == LONG_MAX) size = -1;
+
+  fclose(file);
+  return size;
+}
+
+/* load file into buffer that already has the correct allocated size. Returns error code.*/
+static unsigned lodepng_buffer_file(unsigned char* out, size_t size, const char* filename)
+{
+  FILE* file;
+  size_t readsize;
+  file = fopen(filename, "rb");
+  if(!file) return 78;
+
+  readsize = fread(out, 1, size, file);
+  fclose(file);
+
+  if (readsize != size) return 78;
+  return 0;
+}
+
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename)
+{
+  long size = lodepng_filesize(filename);
+  if (size < 0) return 78;
+  *outsize = (size_t)size;
+
+  *out = (unsigned char*)lodepng_malloc((size_t)size);
+  if(!(*out) && size > 0) return 83; /*the above malloc failed*/
+
+  return lodepng_buffer_file(*out, (size_t)size, filename);
+}
+
+/*write given buffer to the file, overwriting the file, it doesn't append to it. Returns 0 on success, 79 if the file can't be opened or fully written.*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename)
+{
+  FILE* file;
+  file = fopen(filename, "wb" );
+  if(!file) return 79;
+  if(fwrite(buffer, 1, buffersize, file) != buffersize) { fclose(file); return 79; } /*short write, e.g. disk full*/
+  if(fclose(file) != 0) return 79; /*buffered data is flushed on close, which can fail as well*/
+  return 0;
+}
+
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of common code and tools. Begin of Zlib related code.            // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_ENCODER
+/*TODO: this ignores potential out of memory errors*/
+#define addBitToStream(/*size_t**/ bitpointer, /*ucvector**/ bitstream, /*unsigned char*/ bit)\
+{\
+  /*add a new byte at the end*/\
+  if(((*bitpointer) & 7) == 0) ucvector_push_back(bitstream, (unsigned char)0);\
+  /*earlier bit of huffman code is in a lesser significant bit of an earlier byte*/\
+  (bitstream->data[bitstream->size - 1]) |= (bit << ((*bitpointer) & 0x7));\
+  ++(*bitpointer);\
+}
+
+static void addBitsToStream(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits)
+{
+  size_t i;
+  for(i = 0; i != nbits; ++i) addBitToStream(bitpointer, bitstream, (unsigned char)((value >> i) & 1));
+}
+
+static void addBitsToStreamReversed(size_t* bitpointer, ucvector* bitstream, unsigned value, size_t nbits)
+{
+  size_t i;
+  for(i = 0; i != nbits; ++i) addBitToStream(bitpointer, bitstream, (unsigned char)((value >> (nbits - 1 - i)) & 1));
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+#define READBIT(bitpointer, bitstream) ((bitstream[bitpointer >> 3] >> (bitpointer & 0x7)) & (unsigned char)1)
+
+static unsigned char readBitFromStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+  unsigned char result = (unsigned char)(READBIT(*bitpointer, bitstream));
+  ++(*bitpointer);
+  return result;
+}
+
+/*reads nbits bits starting at *bitpointer, first bit read ends up least significant, and advances *bitpointer past them*/
+static unsigned readBitsFromStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+  unsigned value = 0;
+  unsigned shift = 0;
+  size_t pos = *bitpointer;
+  for(; shift != nbits; ++shift, ++pos) value += ((unsigned)READBIT(pos, bitstream)) << shift;
+  *bitpointer = pos;
+  return value;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflate - Huffman                                                      / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#define FIRST_LENGTH_CODE_INDEX 257
+#define LAST_LENGTH_CODE_INDEX 285
+/*256 literals, the end code, some length codes, and 2 unused codes*/
+#define NUM_DEFLATE_CODE_SYMBOLS 288
+/*the distance codes have their own symbols, 30 used, 2 unused*/
+#define NUM_DISTANCE_SYMBOLS 32
+/*the code length codes. 0-15: code lengths, 16: copy previous 3-6 times, 17: 3-10 zeros, 18: 11-138 zeros*/
+#define NUM_CODE_LENGTH_CODES 19
+
+/*the base lengths represented by codes 257-285*/
+static const unsigned LENGTHBASE[29]
+  = {3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51, 59,
+     67, 83, 99, 115, 131, 163, 195, 227, 258};
+
+/*the extra bits used by codes 257-285 (added to base length)*/
+static const unsigned LENGTHEXTRA[29]
+  = {0, 0, 0, 0, 0, 0, 0,  0,  1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,
+      4,  4,  4,   4,   5,   5,   5,   5,   0};
+
+/*the base backwards distances (the bits of distance codes appear after length codes and use their own huffman tree)*/
+static const unsigned DISTANCEBASE[30]
+  = {1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385, 513,
+     769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577};
+
+/*the extra bits of backwards distances (added to base)*/
+static const unsigned DISTANCEEXTRA[30]
+  = {0, 0, 0, 0, 1, 1, 2,  2,  3,  3,  4,  4,  5,  5,   6,   6,   7,   7,   8,
+       8,    9,    9,   10,   10,   11,   11,   12,    12,    13,    13};
+
+/*the order in which "code length alphabet code lengths" are stored, out of this
+the huffman tree of the dynamic huffman tree lengths is generated*/
+static const unsigned CLCL_ORDER[NUM_CODE_LENGTH_CODES]
+  = {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Huffman tree struct, containing multiple representations of the tree
+*/
+typedef struct HuffmanTree
+{
+  unsigned* tree2d;
+  unsigned* tree1d;
+  unsigned* lengths; /*the lengths of the codes of the 1d-tree*/
+  unsigned maxbitlen; /*maximum number of bits a single code can get*/
+  unsigned numcodes; /*number of symbols in the alphabet = number of codes*/
+} HuffmanTree;
+
+/*function used for debug purposes to draw the tree in ascii art with C++*/
+/*
+static void HuffmanTree_draw(HuffmanTree* tree)
+{
+  std::cout << "tree. length: " << tree->numcodes << " maxbitlen: " << tree->maxbitlen << std::endl;
+  for(size_t i = 0; i != tree->tree1d.size; ++i)
+  {
+    if(tree->lengths.data[i])
+      std::cout << i << " " << tree->tree1d.data[i] << " " << tree->lengths.data[i] << std::endl;
+  }
+  std::cout << std::endl;
+}*/
+
+static void HuffmanTree_init(HuffmanTree* tree)
+{
+  tree->tree2d = 0;
+  tree->tree1d = 0;
+  tree->lengths = 0;
+}
+
+static void HuffmanTree_cleanup(HuffmanTree* tree)
+{
+  lodepng_free(tree->tree2d);
+  lodepng_free(tree->tree1d);
+  lodepng_free(tree->lengths);
+}
+
+/*the tree representation used by the decoder. return value is error*/
+static unsigned HuffmanTree_make2DTree(HuffmanTree* tree)
+{
+  unsigned nodefilled = 0; /*up to which node it is filled*/
+  unsigned treepos = 0; /*position in the tree (1 of the numcodes columns)*/
+  unsigned n, i;
+
+  tree->tree2d = (unsigned*)lodepng_malloc(tree->numcodes * 2 * sizeof(unsigned));
+  if(!tree->tree2d) return 83; /*alloc fail*/
+
+  /*
+  convert tree1d[] to tree2d[][]. In the 2D array, a value of 32767 means
+  uninited, a value >= numcodes is an address to another bit, a value < numcodes
+  is a code. The 2 rows are the 2 possible bit values (0 or 1), there are as
+  many columns as codes - 1.
+  A good huffman tree has N * 2 - 1 nodes, of which N - 1 are internal nodes.
+  Here, the internal nodes are stored (what their 0 and 1 option point to).
+  There is only memory for such good tree currently, if there are more nodes
+  (due to too long length codes), error 55 will happen
+  */
+  for(n = 0; n < tree->numcodes * 2; ++n)
+  {
+    tree->tree2d[n] = 32767; /*32767 here means the tree2d isn't filled there yet*/
+  }
+
+  for(n = 0; n < tree->numcodes; ++n) /*the codes*/
+  {
+    for(i = 0; i != tree->lengths[n]; ++i) /*the bits for this code*/
+    {
+      unsigned char bit = (unsigned char)((tree->tree1d[n] >> (tree->lengths[n] - i - 1)) & 1);
+      /*oversubscribed, see comment in lodepng_error_text*/
+      if(treepos > 2147483647 || treepos + 2 > tree->numcodes) return 55;
+      if(tree->tree2d[2 * treepos + bit] == 32767) /*not yet filled in*/
+      {
+        if(i + 1 == tree->lengths[n]) /*last bit*/
+        {
+          tree->tree2d[2 * treepos + bit] = n; /*put the current code in it*/
+          treepos = 0;
+        }
+        else
+        {
+          /*put address of the next step in here, first that address has to be found of course
+          (it's just nodefilled + 1)...*/
+          ++nodefilled;
+          /*addresses encoded with numcodes added to it*/
+          tree->tree2d[2 * treepos + bit] = nodefilled + tree->numcodes;
+          treepos = nodefilled;
+        }
+      }
+      else treepos = tree->tree2d[2 * treepos + bit] - tree->numcodes;
+    }
+  }
+
+  for(n = 0; n < tree->numcodes * 2; ++n)
+  {
+    if(tree->tree2d[n] == 32767) tree->tree2d[n] = 0; /*remove possible remaining 32767's*/
+  }
+
+  return 0;
+}
+
+/*
+Second step for the ...makeFromLengths and ...makeFromFrequencies functions.
+numcodes, lengths and maxbitlen must already be filled in correctly. return
+value is error.
+*/
+static unsigned HuffmanTree_makeFromLengths2(HuffmanTree* tree)
+{
+  uivector blcount;
+  uivector nextcode;
+  unsigned error = 0;
+  unsigned bits, n;
+
+  uivector_init(&blcount);
+  uivector_init(&nextcode);
+
+  tree->tree1d = (unsigned*)lodepng_malloc(tree->numcodes * sizeof(unsigned));
+  if(!tree->tree1d) error = 83; /*alloc fail*/
+
+  if(!uivector_resizev(&blcount, tree->maxbitlen + 1, 0)
+  || !uivector_resizev(&nextcode, tree->maxbitlen + 1, 0))
+    error = 83; /*alloc fail*/
+
+  if(!error)
+  {
+    /*step 1: count number of instances of each code length*/
+    for(bits = 0; bits != tree->numcodes; ++bits) ++blcount.data[tree->lengths[bits]];
+    /*step 2: generate the nextcode values*/
+    for(bits = 1; bits <= tree->maxbitlen; ++bits)
+    {
+      nextcode.data[bits] = (nextcode.data[bits - 1] + blcount.data[bits - 1]) << 1;
+    }
+    /*step 3: generate all the codes*/
+    for(n = 0; n != tree->numcodes; ++n)
+    {
+      if(tree->lengths[n] != 0) tree->tree1d[n] = nextcode.data[tree->lengths[n]]++;
+    }
+  }
+
+  uivector_cleanup(&blcount);
+  uivector_cleanup(&nextcode);
+
+  if(!error) return HuffmanTree_make2DTree(tree);
+  else return error;
+}
+
+/*
+given the code lengths (as stored in the PNG file), generate the tree as defined
+by Deflate. maxbitlen is the maximum bits that a code in the tree can have.
+return value is error.
+*/
+static unsigned HuffmanTree_makeFromLengths(HuffmanTree* tree, const unsigned* bitlen,
+                                            size_t numcodes, unsigned maxbitlen)
+{
+  unsigned i;
+  tree->lengths = (unsigned*)lodepng_malloc(numcodes * sizeof(unsigned));
+  if(!tree->lengths) return 83; /*alloc fail*/
+  for(i = 0; i != numcodes; ++i) tree->lengths[i] = bitlen[i];
+  tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+  tree->maxbitlen = maxbitlen;
+  return HuffmanTree_makeFromLengths2(tree);
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*BPM: Boundary Package Merge, see "A Fast and Space-Economical Algorithm for Length-Limited Coding",
+Jyrki Katajainen, Alistair Moffat, Andrew Turpin, 1995.*/
+
+/*chain node for boundary package merge*/
+typedef struct BPMNode
+{
+  int weight; /*the sum of all weights in this chain*/
+  unsigned index; /*index of this leaf node (called "count" in the paper)*/
+  struct BPMNode* tail; /*the next nodes in this chain (null if last)*/
+  int in_use;
+} BPMNode;
+
+/*lists of chains*/
+typedef struct BPMLists
+{
+  /*memory pool*/
+  unsigned memsize;
+  BPMNode* memory;
+  unsigned numfree;
+  unsigned nextfree;
+  BPMNode** freelist;
+  /*two heads of lookahead chains per list*/
+  unsigned listsize;
+  BPMNode** chains0;
+  BPMNode** chains1;
+} BPMLists;
+
+/*creates a new chain node with the given parameters, from the memory in the lists */
+static BPMNode* bpmnode_create(BPMLists* lists, int weight, unsigned index, BPMNode* tail)
+{
+  unsigned i;
+  BPMNode* result;
+
+  /*memory full, so garbage collect*/
+  if(lists->nextfree >= lists->numfree)
+  {
+    /*mark only those that are in use*/
+    for(i = 0; i != lists->memsize; ++i) lists->memory[i].in_use = 0;
+    for(i = 0; i != lists->listsize; ++i)
+    {
+      BPMNode* node;
+      for(node = lists->chains0[i]; node != 0; node = node->tail) node->in_use = 1;
+      for(node = lists->chains1[i]; node != 0; node = node->tail) node->in_use = 1;
+    }
+    /*collect those that are free*/
+    lists->numfree = 0;
+    for(i = 0; i != lists->memsize; ++i)
+    {
+      if(!lists->memory[i].in_use) lists->freelist[lists->numfree++] = &lists->memory[i];
+    }
+    lists->nextfree = 0;
+  }
+
+  result = lists->freelist[lists->nextfree++];
+  result->weight = weight;
+  result->index = index;
+  result->tail = tail;
+  return result;
+}
+
+/*sort the leaves by ascending weight with a stable bottom-up mergesort*/
+static void bpmnode_sort(BPMNode* leaves, size_t num)
+{
+  BPMNode* mem = (BPMNode*)lodepng_malloc(sizeof(*leaves) * num); /*scratch buffer. NOTE(review): the result is not checked for NULL before use*/
+  size_t width, counter = 0;
+  for(width = 1; width < num; width *= 2) /*width = length of the already sorted runs*/
+  {
+    BPMNode* a = (counter & 1) ? mem : leaves; /*source of this pass*/
+    BPMNode* b = (counter & 1) ? leaves : mem; /*destination of this pass, the two buffers swap roles each pass*/
+    size_t p;
+    for(p = 0; p < num; p += 2 * width)
+    {
+      size_t q = (p + width > num) ? num : (p + width); /*end of left run, start of right run*/
+      size_t r = (p + 2 * width > num) ? num : (p + 2 * width); /*end of right run*/
+      size_t i = p, j = q, k;
+      for(k = p; k < r; k++)
+      {
+        if(i < q && (j >= r || a[i].weight <= a[j].weight)) b[k] = a[i++]; /*<= takes the left element on ties, keeping the sort stable*/
+        else b[k] = a[j++];
+      }
+    }
+    counter++;
+  }
+  if(counter & 1) memcpy(leaves, mem, sizeof(*leaves) * num); /*after an odd number of passes the sorted data sits in mem*/
+  lodepng_free(mem);
+}
+
+/*Boundary Package Merge step, numpresent is the amount of leaves, and c is the current chain.*/
+static void boundaryPM(BPMLists* lists, BPMNode* leaves, size_t numpresent, int c, int num)
+{
+  unsigned lastindex = lists->chains1[c]->index;
+
+  if(c == 0)
+  {
+    if(lastindex >= numpresent) return;
+    lists->chains0[c] = lists->chains1[c];
+    lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, 0);
+  }
+  else
+  {
+    /*sum of the weights of the head nodes of the previous lookahead chains.*/
+    int sum = lists->chains0[c - 1]->weight + lists->chains1[c - 1]->weight;
+    lists->chains0[c] = lists->chains1[c];
+    if(lastindex < numpresent && sum > leaves[lastindex].weight)
+    {
+      lists->chains1[c] = bpmnode_create(lists, leaves[lastindex].weight, lastindex + 1, lists->chains1[c]->tail);
+      return;
+    }
+    lists->chains1[c] = bpmnode_create(lists, sum, lastindex, lists->chains1[c - 1]);
+    /*in the end we are only interested in the chain of the last list, so no
+    need to recurse if we're at the last one (this gives measurable speedup)*/
+    if(num + 1 < (int)(2 * numpresent - 2))
+    {
+      boundaryPM(lists, leaves, numpresent, c - 1, num);
+      boundaryPM(lists, leaves, numpresent, c - 1, num);
+    }
+  }
+}
+
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+                                      size_t numcodes, unsigned maxbitlen)
+{
+  unsigned error = 0;
+  unsigned i;
+  size_t numpresent = 0; /*number of symbols with non-zero frequency*/
+  BPMNode* leaves; /*the symbols, only those with > 0 frequency*/
+
+  if(numcodes == 0) return 80; /*error: a tree of 0 symbols is not supposed to be made*/
+  if((1u << maxbitlen) < numcodes) return 80; /*error: represent all symbols*/
+
+  leaves = (BPMNode*)lodepng_malloc(numcodes * sizeof(*leaves));
+  if(!leaves) return 83; /*alloc fail*/
+
+  for(i = 0; i != numcodes; ++i)
+  {
+    if(frequencies[i] > 0)
+    {
+      leaves[numpresent].weight = (int)frequencies[i];
+      leaves[numpresent].index = i;
+      ++numpresent;
+    }
+  }
+
+  for(i = 0; i != numcodes; ++i) lengths[i] = 0;
+
+  /*ensure at least two present symbols. There should be at least one symbol
+  according to RFC 1951 section 3.2.7. Some decoders incorrectly require two. To
+  make these work as well ensure there are at least two symbols. The
+  Package-Merge code below also doesn't work correctly if there's only one
+  symbol, it'd give it the theoretical 0 bits but in practice zlib wants 1 bit*/
+  if(numpresent == 0)
+  {
+    lengths[0] = lengths[1] = 1; /*note that for RFC 1951 section 3.2.7, only lengths[0] = 1 is needed*/
+  }
+  else if(numpresent == 1)
+  {
+    lengths[leaves[0].index] = 1;
+    lengths[leaves[0].index == 0 ? 1 : 0] = 1;
+  }
+  else
+  {
+    BPMLists lists;
+    BPMNode* node;
+
+    bpmnode_sort(leaves, numpresent);
+
+    lists.listsize = maxbitlen;
+    lists.memsize = 2 * maxbitlen * (maxbitlen + 1);
+    lists.nextfree = 0;
+    lists.numfree = lists.memsize;
+    lists.memory = (BPMNode*)lodepng_malloc(lists.memsize * sizeof(*lists.memory));
+    lists.freelist = (BPMNode**)lodepng_malloc(lists.memsize * sizeof(BPMNode*));
+    lists.chains0 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+    lists.chains1 = (BPMNode**)lodepng_malloc(lists.listsize * sizeof(BPMNode*));
+    if(!lists.memory || !lists.freelist || !lists.chains0 || !lists.chains1) error = 83; /*alloc fail*/
+
+    if(!error)
+    {
+      for(i = 0; i != lists.memsize; ++i) lists.freelist[i] = &lists.memory[i];
+
+      bpmnode_create(&lists, leaves[0].weight, 1, 0);
+      bpmnode_create(&lists, leaves[1].weight, 2, 0);
+
+      for(i = 0; i != lists.listsize; ++i)
+      {
+        lists.chains0[i] = &lists.memory[0];
+        lists.chains1[i] = &lists.memory[1];
+      }
+
+      /*each boundaryPM call adds one chain to the last list, and we need 2 * numpresent - 2 chains.*/
+      for(i = 2; i != 2 * numpresent - 2; ++i) boundaryPM(&lists, leaves, numpresent, (int)maxbitlen - 1, (int)i);
+
+      for(node = lists.chains1[maxbitlen - 1]; node; node = node->tail)
+      {
+        for(i = 0; i != node->index; ++i) ++lengths[leaves[i].index];
+      }
+    }
+
+    lodepng_free(lists.memory);
+    lodepng_free(lists.freelist);
+    lodepng_free(lists.chains0);
+    lodepng_free(lists.chains1);
+  }
+
+  lodepng_free(leaves);
+  return error;
+}
+
+/*Create the Huffman tree given the symbol frequencies. Trailing zero frequencies are trimmed down to mincodes symbols. Return value is error.*/
+static unsigned HuffmanTree_makeFromFrequencies(HuffmanTree* tree, const unsigned* frequencies,
+                                                size_t mincodes, size_t numcodes, unsigned maxbitlen)
+{
+  unsigned error = 0;
+  unsigned* lengths;
+  while(numcodes > mincodes && !frequencies[numcodes - 1]) --numcodes; /*trim zeroes; bound tested first so index -1 is never read*/
+  tree->maxbitlen = maxbitlen;
+  tree->numcodes = (unsigned)numcodes; /*number of symbols*/
+  lengths = (unsigned*)lodepng_realloc(tree->lengths, numcodes * sizeof(unsigned));
+  if(!lengths) return 83; /*alloc fail; the old array stays owned by the tree rather than leaking*/
+  tree->lengths = lengths;
+  memset(tree->lengths, 0, numcodes * sizeof(unsigned)); /*initialize all lengths to 0*/
+  error = lodepng_huffman_code_lengths(tree->lengths, frequencies, numcodes, maxbitlen);
+  if(!error) error = HuffmanTree_makeFromLengths2(tree);
+  return error;
+}
+
+static unsigned HuffmanTree_getCode(const HuffmanTree* tree, unsigned index)
+{
+  return tree->tree1d[index];
+}
+
+static unsigned HuffmanTree_getLength(const HuffmanTree* tree, unsigned index)
+{
+  return tree->lengths[index];
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*get the literal and length code tree of a deflated block with fixed tree, as per the deflate specification. Returns 83 if the temporary length table cannot be allocated, else the result of HuffmanTree_makeFromLengths*/
+static unsigned generateFixedLitLenTree(HuffmanTree* tree)
+{
+  unsigned i, error = 0;
+  unsigned* bitlen = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+  if(!bitlen) return 83; /*alloc fail*/
+
+  /*288 possible codes: 0-255=literals, 256=endcode, 257-285=lengthcodes, 286-287=unused*/
+  for(i =   0; i <= 143; ++i) bitlen[i] = 8;
+  for(i = 144; i <= 255; ++i) bitlen[i] = 9;
+  for(i = 256; i <= 279; ++i) bitlen[i] = 7;
+  for(i = 280; i <= 287; ++i) bitlen[i] = 8;
+
+  error = HuffmanTree_makeFromLengths(tree, bitlen, NUM_DEFLATE_CODE_SYMBOLS, 15);
+
+  lodepng_free(bitlen); /*the temporary table is released whether or not the tree could be built*/
+  return error;
+}
+
+/*get the distance code tree of a deflated block with fixed tree, as specified in the deflate specification. Returns 83 if the temporary length table cannot be allocated, else the result of HuffmanTree_makeFromLengths*/
+static unsigned generateFixedDistanceTree(HuffmanTree* tree)
+{
+  unsigned i, error = 0;
+  unsigned* bitlen = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+  if(!bitlen) return 83; /*alloc fail*/
+
+  /*there are 32 distance codes, but 30-31 are unused; every one of them gets a 5 bit code*/
+  for(i = 0; i != NUM_DISTANCE_SYMBOLS; ++i) bitlen[i] = 5;
+  error = HuffmanTree_makeFromLengths(tree, bitlen, NUM_DISTANCE_SYMBOLS, 15);
+
+  lodepng_free(bitlen); /*the temporary table is released whether or not the tree could be built*/
+  return error;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/*
+Decodes one symbol by walking codetree->tree2d bit by bit; *bp is advanced by one for every bit consumed.
+returns the code, or (unsigned)(-1) if error happened (input exhausted, or a jump outside of the tree)
+inbitlength is the length of the complete buffer, in bits (so its byte length times 8)
+*/
+static unsigned huffmanDecodeSymbol(const unsigned char* in, size_t* bp,
+                                    const HuffmanTree* codetree, size_t inbitlength)
+{
+  unsigned treepos = 0, ct;
+  for(;;)
+  {
+    if(*bp >= inbitlength) return (unsigned)(-1); /*error: end of input memory reached without endcode*/
+    /*
+    decode the symbol from the tree. The "readBitFromStream" code is inlined in
+    the expression below because this is the biggest bottleneck while decoding
+    */
+    ct = codetree->tree2d[(treepos << 1) + READBIT(*bp, in)];
+    ++(*bp);
+    if(ct < codetree->numcodes) return ct; /*the symbol is decoded, return it*/
+    else treepos = ct - codetree->numcodes; /*symbol not yet decoded, instead move tree position*/
+
+    if(treepos >= codetree->numcodes) return (unsigned)(-1); /*error: it appeared outside the codetree*/
+  }
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Inflator (Decompressor)                                                / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*get the tree of a deflated block with fixed tree, as specified in the deflate specification; fills tree_ll (lit/len) and tree_d (distance)*/
+static void getTreeInflateFixed(HuffmanTree* tree_ll, HuffmanTree* tree_d)
+{
+  /*TODO: check for out of memory errors: both generators return an error code (83 on alloc fail) that is dropped here*/
+  generateFixedLitLenTree(tree_ll);
+  generateFixedDistanceTree(tree_d);
+}
+
+/*get the tree of a deflated block with dynamic tree, the tree itself is also Huffman compressed with a known tree. Reads from in at bit position *bp (advancing it), fills tree_ll and tree_d, returns 0 or an error code*/
+static unsigned getTreeInflateDynamic(HuffmanTree* tree_ll, HuffmanTree* tree_d,
+                                      const unsigned char* in, size_t* bp, size_t inlength)
+{
+  /*make sure that length values that aren't filled in will be 0, or a wrong tree will be generated*/
+  unsigned error = 0;
+  unsigned n, HLIT, HDIST, HCLEN, i;
+  size_t inbitlength = inlength * 8;
+
+  /*see comments in deflateDynamic for explanation of the context and these variables, it is analogous*/
+  unsigned* bitlen_ll = 0; /*lit,len code lengths*/
+  unsigned* bitlen_d = 0; /*dist code lengths*/
+  /*code length code lengths ("clcl"), the bit lengths of the huffman tree used to compress bitlen_ll and bitlen_d*/
+  unsigned* bitlen_cl = 0;
+  HuffmanTree tree_cl; /*the code tree for code length codes (the huffman tree for compressed huffman trees)*/
+
+  if((*bp) + 14 > (inlength << 3)) return 49; /*error: the bit pointer is or will go past the memory (5 + 5 + 4 header bits are read below)*/
+
+  /*number of literal/length codes + 257. Unlike the spec, the value 257 is added to it here already*/
+  HLIT =  readBitsFromStream(bp, in, 5) + 257;
+  /*number of distance codes. Unlike the spec, the value 1 is added to it here already*/
+  HDIST = readBitsFromStream(bp, in, 5) + 1;
+  /*number of code length codes. Unlike the spec, the value 4 is added to it here already*/
+  HCLEN = readBitsFromStream(bp, in, 4) + 4;
+
+  if((*bp) + HCLEN * 3 > (inlength << 3)) return 50; /*error: the bit pointer is or will go past the memory*/
+
+  HuffmanTree_init(&tree_cl);
+
+  while(!error) /*runs at most once: the break at its end makes this an error-exit block that falls through to the cleanup*/
+  {
+    /*read the code length codes out of 3 * (amount of code length codes) bits*/
+
+    bitlen_cl = (unsigned*)lodepng_malloc(NUM_CODE_LENGTH_CODES * sizeof(unsigned));
+    if(!bitlen_cl) ERROR_BREAK(83 /*alloc fail*/);
+
+    for(i = 0; i != NUM_CODE_LENGTH_CODES; ++i)
+    {
+      if(i < HCLEN) bitlen_cl[CLCL_ORDER[i]] = readBitsFromStream(bp, in, 3);
+      else bitlen_cl[CLCL_ORDER[i]] = 0; /*if not, it must stay 0*/
+    }
+
+    error = HuffmanTree_makeFromLengths(&tree_cl, bitlen_cl, NUM_CODE_LENGTH_CODES, 7);
+    if(error) break;
+
+    /*now we can use this tree to read the lengths for the tree that this function will return*/
+    bitlen_ll = (unsigned*)lodepng_malloc(NUM_DEFLATE_CODE_SYMBOLS * sizeof(unsigned));
+    bitlen_d = (unsigned*)lodepng_malloc(NUM_DISTANCE_SYMBOLS * sizeof(unsigned));
+    if(!bitlen_ll || !bitlen_d) ERROR_BREAK(83 /*alloc fail*/);
+    for(i = 0; i != NUM_DEFLATE_CODE_SYMBOLS; ++i) bitlen_ll[i] = 0;
+    for(i = 0; i != NUM_DISTANCE_SYMBOLS; ++i) bitlen_d[i] = 0;
+
+    /*i is the current symbol we're reading in the part that contains the code lengths of lit/len and dist codes*/
+    i = 0;
+    while(i < HLIT + HDIST)
+    {
+      unsigned code = huffmanDecodeSymbol(in, bp, &tree_cl, inbitlength);
+      if(code <= 15) /*a length code*/
+      {
+        if(i < HLIT) bitlen_ll[i] = code;
+        else bitlen_d[i - HLIT] = code;
+        ++i;
+      }
+      else if(code == 16) /*repeat previous*/
+      {
+        unsigned replength = 3; /*read in the 2 bits that indicate repeat length (3-6)*/
+        unsigned value; /*set value to the previous code*/
+
+        if(i == 0) ERROR_BREAK(54); /*can't repeat previous if i is 0*/
+
+        if((*bp + 2) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+        replength += readBitsFromStream(bp, in, 2);
+
+        if(i < HLIT + 1) value = bitlen_ll[i - 1]; /*the previous code may be the last lit/len length even when i itself is the first distance code*/
+        else value = bitlen_d[i - HLIT - 1];
+        /*repeat this value in the next lengths*/
+        for(n = 0; n < replength; ++n)
+        {
+          if(i >= HLIT + HDIST) ERROR_BREAK(13); /*error: i is larger than the amount of codes*/
+          if(i < HLIT) bitlen_ll[i] = value;
+          else bitlen_d[i - HLIT] = value;
+          ++i;
+        }
+      }
+      else if(code == 17) /*repeat "0" 3-10 times*/
+      {
+        unsigned replength = 3; /*read in the bits that indicate repeat length*/
+        if((*bp + 3) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+        replength += readBitsFromStream(bp, in, 3);
+
+        /*repeat this value in the next lengths*/
+        for(n = 0; n < replength; ++n)
+        {
+          if(i >= HLIT + HDIST) ERROR_BREAK(14); /*error: i is larger than the amount of codes*/
+
+          if(i < HLIT) bitlen_ll[i] = 0;
+          else bitlen_d[i - HLIT] = 0;
+          ++i;
+        }
+      }
+      else if(code == 18) /*repeat "0" 11-138 times*/
+      {
+        unsigned replength = 11; /*read in the bits that indicate repeat length*/
+        if((*bp + 7) > inbitlength) ERROR_BREAK(50); /*error, bit pointer jumps past memory*/
+        replength += readBitsFromStream(bp, in, 7);
+
+        /*repeat this value in the next lengths*/
+        for(n = 0; n < replength; ++n)
+        {
+          if(i >= HLIT + HDIST) ERROR_BREAK(15); /*error: i is larger than the amount of codes*/
+
+          if(i < HLIT) bitlen_ll[i] = 0;
+          else bitlen_d[i - HLIT] = 0;
+          ++i;
+        }
+      }
+      else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+      {
+        if(code == (unsigned)(-1))
+        {
+          /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+          (10=no endcode, 11=wrong jump outside of tree). NOTE(review): huffmanDecodeSymbol stops at *bp >= inbitlength, so '>' looks like it always selects 11 - confirm*/
+          error = (*bp) > inbitlength ? 10 : 11;
+        }
+        else error = 16; /*unexisting code, this can never happen*/
+        break;
+      }
+    }
+    if(error) break;
+
+    if(bitlen_ll[256] == 0) ERROR_BREAK(64); /*the length of the end code 256 must be larger than 0*/
+
+    /*now we've finally got HLIT and HDIST, so generate the code trees, and the function is done*/
+    error = HuffmanTree_makeFromLengths(tree_ll, bitlen_ll, NUM_DEFLATE_CODE_SYMBOLS, 15);
+    if(error) break;
+    error = HuffmanTree_makeFromLengths(tree_d, bitlen_d, NUM_DISTANCE_SYMBOLS, 15);
+
+    break; /*end of error-while*/
+  }
+
+  lodepng_free(bitlen_cl); /*cleanup is reached on success and on every break above; the pointers start out as 0*/
+  lodepng_free(bitlen_ll);
+  lodepng_free(bitlen_d);
+  HuffmanTree_cleanup(&tree_cl);
+
+  return error;
+}
+
+/*inflate a block with dynamic of fixed Huffman tree*/
+static unsigned inflateHuffmanBlock(ucvector* out, const unsigned char* in, size_t* bp,
+                                    size_t* pos, size_t inlength, unsigned btype)
+{
+  unsigned error = 0;
+  HuffmanTree tree_ll; /*the huffman tree for literal and length codes*/
+  HuffmanTree tree_d; /*the huffman tree for distance codes*/
+  size_t inbitlength = inlength * 8;
+
+  HuffmanTree_init(&tree_ll);
+  HuffmanTree_init(&tree_d);
+
+  if(btype == 1) getTreeInflateFixed(&tree_ll, &tree_d);
+  else if(btype == 2) error = getTreeInflateDynamic(&tree_ll, &tree_d, in, bp, inlength);
+
+  while(!error) /*decode all symbols until end reached, breaks at end code*/
+  {
+    /*code_ll is literal, length or end code*/
+    unsigned code_ll = huffmanDecodeSymbol(in, bp, &tree_ll, inbitlength);
+    if(code_ll <= 255) /*literal symbol*/
+    {
+      /*ucvector_push_back would do the same, but for some reason the two lines below run 10% faster*/
+      if(!ucvector_resize(out, (*pos) + 1)) ERROR_BREAK(83 /*alloc fail*/);
+      out->data[*pos] = (unsigned char)code_ll;
+      ++(*pos);
+    }
+    else if(code_ll >= FIRST_LENGTH_CODE_INDEX && code_ll <= LAST_LENGTH_CODE_INDEX) /*length code*/
+    {
+      unsigned code_d, distance;
+      unsigned numextrabits_l, numextrabits_d; /*extra bits for length and distance*/
+      size_t start, forward, backward, length;
+
+      /*part 1: get length base*/
+      length = LENGTHBASE[code_ll - FIRST_LENGTH_CODE_INDEX];
+
+      /*part 2: get extra bits and add the value of that to length*/
+      numextrabits_l = LENGTHEXTRA[code_ll - FIRST_LENGTH_CODE_INDEX];
+      if((*bp + numextrabits_l) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+      length += readBitsFromStream(bp, in, numextrabits_l);
+
+      /*part 3: get distance code*/
+      code_d = huffmanDecodeSymbol(in, bp, &tree_d, inbitlength);
+      if(code_d > 29)
+      {
+        if(code_ll == (unsigned)(-1)) /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+        {
+          /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+          (10=no endcode, 11=wrong jump outside of tree)*/
+          error = (*bp) > inlength * 8 ? 10 : 11;
+        }
+        else error = 18; /*error: invalid distance code (30-31 are never used)*/
+        break;
+      }
+      distance = DISTANCEBASE[code_d];
+
+      /*part 4: get extra bits from distance*/
+      numextrabits_d = DISTANCEEXTRA[code_d];
+      if((*bp + numextrabits_d) > inbitlength) ERROR_BREAK(51); /*error, bit pointer will jump past memory*/
+      distance += readBitsFromStream(bp, in, numextrabits_d);
+
+      /*part 5: fill in all the out[n] values based on the length and dist*/
+      start = (*pos);
+      if(distance > start) ERROR_BREAK(52); /*too long backward distance*/
+      backward = start - distance;
+
+      if(!ucvector_resize(out, (*pos) + length)) ERROR_BREAK(83 /*alloc fail*/);
+      if (distance < length) {
+        for(forward = 0; forward < length; ++forward)
+        {
+          out->data[(*pos)++] = out->data[backward++];
+        }
+      } else {
+        memcpy(out->data + *pos, out->data + backward, length);
+        *pos += length;
+      }
+    }
+    else if(code_ll == 256)
+    {
+      break; /*end code, break the loop*/
+    }
+    else /*if(code == (unsigned)(-1))*/ /*huffmanDecodeSymbol returns (unsigned)(-1) in case of error*/
+    {
+      /*return error code 10 or 11 depending on the situation that happened in huffmanDecodeSymbol
+      (10=no endcode, 11=wrong jump outside of tree)*/
+      error = ((*bp) > inlength * 8) ? 10 : 11;
+      break;
+    }
+  }
+
+  HuffmanTree_cleanup(&tree_ll);
+  HuffmanTree_cleanup(&tree_d);
+
+  return error;
+}
+
+static unsigned inflateNoCompression(ucvector* out, const unsigned char* in, size_t* bp, size_t* pos, size_t inlength)
+{ /*copies one stored (BTYPE 0) block from in to out, advancing *bp and *pos; returns 0 or an error code (21, 23, 52 or 83)*/
+  size_t p;
+  unsigned LEN, NLEN, n, error = 0;
+
+  /*go to first boundary of byte*/
+  while(((*bp) & 0x7) != 0) ++(*bp);
+  p = (*bp) / 8; /*byte position*/
+
+  /*read LEN (2 bytes) and NLEN (2 bytes)*/
+  if(p + 4 > inlength) return 52; /*error, bit pointer will jump past memory. Only in[p]..in[p + 3] are read, so p + 4 == inlength (an empty stored block ending the input) is valid*/
+  LEN = in[p] + 256u * in[p + 1]; p += 2;
+  NLEN = in[p] + 256u * in[p + 1]; p += 2;
+
+  /*check if 16-bit NLEN is really the one's complement of LEN*/
+  if(LEN + NLEN != 65535) return 21; /*error: NLEN is not one's complement of LEN*/
+
+  if(p + LEN > inlength) return 23; /*error: reading outside of in buffer. Checked before growing out, so a bad LEN leaves out untouched*/
+
+  /*read the literal data: LEN bytes are now stored in the out buffer*/
+  if(!ucvector_resize(out, (*pos) + LEN)) return 83; /*alloc fail*/
+  for(n = 0; n < LEN; ++n) out->data[(*pos)++] = in[p++];
+
+  (*bp) = p * 8;
+
+  return error;
+}
+
+static unsigned lodepng_inflatev(ucvector* out,
+                                 const unsigned char* in, size_t insize,
+                                 const LodePNGDecompressSettings* settings)
+{ /*inflates all blocks of in into out until a block with BFINAL set has been processed; returns 0 or the first error code*/
+  /*bit pointer in the "in" data, current byte is bp >> 3, current bit is bp & 0x7 (from lsb to msb of the byte)*/
+  size_t bp = 0;
+  unsigned BFINAL = 0;
+  size_t pos = 0; /*byte position in the out buffer*/
+  unsigned error = 0;
+
+  (void)settings; /*not used by this function*/
+
+  while(!BFINAL)
+  {
+    unsigned BTYPE;
+    if(bp + 2 >= insize * 8) return 52; /*error, bit pointer will jump past memory (the 3 header bits bp..bp + 2 are read below)*/
+    BFINAL = readBitFromStream(&bp, in);
+    BTYPE = 1u * readBitFromStream(&bp, in);
+    BTYPE += 2u * readBitFromStream(&bp, in);
+
+    if(BTYPE == 3) return 20; /*error: invalid BTYPE*/
+    else if(BTYPE == 0) error = inflateNoCompression(out, in, &bp, &pos, insize); /*no compression*/
+    else error = inflateHuffmanBlock(out, in, &bp, &pos, insize, BTYPE); /*compression, BTYPE 01 or 10*/
+
+    if(error) return error;
+  }
+
+  return error;
+}
+
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+                         const unsigned char* in, size_t insize,
+                         const LodePNGDecompressSettings* settings)
+{ /*wraps *out / *outsize in a ucvector, inflates into it and writes the vector's data and size back, also when an error is returned*/
+  unsigned error;
+  ucvector v;
+  ucvector_init_buffer(&v, *out, *outsize);
+  error = lodepng_inflatev(&v, in, insize, settings);
+  *out = v.data;
+  *outsize = v.size;
+  return error;
+}
+
+static unsigned inflate(unsigned char** out, size_t* outsize,
+                        const unsigned char* in, size_t insize,
+                        const LodePNGDecompressSettings* settings)
+{ /*dispatches to settings->custom_inflate when that callback is set, else to lodepng_inflate; the arguments are forwarded unchanged*/
+  if(settings->custom_inflate)
+  {
+    return settings->custom_inflate(out, outsize, in, insize, settings);
+  }
+  else
+  {
+    return lodepng_inflate(out, outsize, in, insize, settings);
+  }
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Deflator (Compressor)                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static const size_t MAX_SUPPORTED_DEFLATE_LENGTH = 258; /*upper bound on the match length: encodeLZ77 and countZeros never look further ahead than this*/
+
+/*appends a Huffman code to the stream via addBitsToStreamReversed. bitlen is the size in bits of the code*/
+static void addHuffmanSymbol(size_t* bp, ucvector* compressed, unsigned code, unsigned bitlen)
+{
+  addBitsToStreamReversed(bp, compressed, code, bitlen);
+}
+
+/*search the index in the array, that has the largest value smaller than or equal to the given value,
+given array must be sorted (index 0 is never compared: it is the result when no later value is smaller or equal)*/
+static size_t searchCodeIndex(const unsigned* array, size_t array_size, size_t value)
+{
+  /*binary search (only small gain over linear). TODO: use CPU log2 instruction for getting symbols instead*/
+  size_t left = 1;
+  size_t right = array_size - 1; /*assumes array_size >= 1, otherwise this wraps around*/
+
+  while(left <= right) {
+    size_t mid = (left + right) >> 1;
+    if (array[mid] >= value) right = mid - 1;
+    else left = mid + 1;
+  }
+  if(left >= array_size || array[left] > value) left--; /*left is the first index >= 1 holding a value >= the given one; step back unless it is an exact hit*/
+  return left;
+}
+
+static void addLengthDistance(uivector* values, size_t length, size_t distance)
+{
+  /*appends one length/distance pair to values as 4 entries. values in encoded vector are those used by deflate:
+  0-255: literal bytes
+  256: end
+  257-285: length/distance pair (length code, followed by extra length bits, distance code, extra distance bits)
+  286-287: invalid*/
+
+  unsigned length_code = (unsigned)searchCodeIndex(LENGTHBASE, 29, length);
+  unsigned extra_length = (unsigned)(length - LENGTHBASE[length_code]);
+  unsigned dist_code = (unsigned)searchCodeIndex(DISTANCEBASE, 30, distance);
+  unsigned extra_distance = (unsigned)(distance - DISTANCEBASE[dist_code]);
+
+  uivector_push_back(values, length_code + FIRST_LENGTH_CODE_INDEX); /*NOTE(review): the results of these four push_backs are ignored, unlike the checked calls in encodeLZ77*/
+  uivector_push_back(values, extra_length);
+  uivector_push_back(values, dist_code);
+  uivector_push_back(values, extra_distance);
+}
+
+/*3 bytes of data get encoded into two bytes. The hash cannot use more than 3
+bytes as input because 3 is the minimum match length for deflate*/
+static const unsigned HASH_NUM_VALUES = 65536; /*number of entries hash_init allocates for Hash.head*/
+static const unsigned HASH_BIT_MASK = 65535; /*HASH_NUM_VALUES - 1, but C90 does not like that as initializer. getHash masks its result with it*/
+
+typedef struct Hash
+{ /*lookup tables of encodeLZ77; allocated by hash_init and released by hash_cleanup*/
+  int* head; /*hash value to head circular pos - can be outdated if went around window*/
+  /*circular pos to prev circular pos*/
+  unsigned short* chain;
+  int* val; /*circular pos to hash value*/
+
+  /*TODO: do this not only for zeros but for any repeated byte. However for PNG
+  it's always going to be the zeros that dominate, so not important for PNG*/
+  int* headz; /*similar to head, but for chainz*/
+  unsigned short* chainz; /*those with same amount of zeros*/
+  unsigned short* zeros; /*length of zeros streak, used as a second hash chain*/
+} Hash;
+
+static unsigned hash_init(Hash* hash, unsigned windowsize)
+{ /*allocates and initializes the six tables of hash for a window of windowsize positions; returns 0, or 83 if any allocation failed*/
+  unsigned i;
+  hash->head = (int*)lodepng_malloc(sizeof(int) * HASH_NUM_VALUES);
+  hash->val = (int*)lodepng_malloc(sizeof(int) * windowsize);
+  hash->chain = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+  hash->zeros = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+  hash->headz = (int*)lodepng_malloc(sizeof(int) * (MAX_SUPPORTED_DEFLATE_LENGTH + 1));
+  hash->chainz = (unsigned short*)lodepng_malloc(sizeof(unsigned short) * windowsize);
+
+  if(!hash->head || !hash->chain || !hash->val  || !hash->headz|| !hash->chainz || !hash->zeros)
+  {
+    return 83; /*alloc fail. The tables that did get allocated are not freed here; NOTE(review): presumably the caller runs hash_cleanup - confirm*/
+  }
+
+  /*initialize hash table (the zeros table is not initialized here)*/
+  for(i = 0; i != HASH_NUM_VALUES; ++i) hash->head[i] = -1;
+  for(i = 0; i != windowsize; ++i) hash->val[i] = -1;
+  for(i = 0; i != windowsize; ++i) hash->chain[i] = i; /*same value as index indicates uninitialized*/
+
+  for(i = 0; i <= MAX_SUPPORTED_DEFLATE_LENGTH; ++i) hash->headz[i] = -1;
+  for(i = 0; i != windowsize; ++i) hash->chainz[i] = i; /*same value as index indicates uninitialized*/
+
+  return 0;
+}
+
+static void hash_cleanup(Hash* hash)
+{ /*releases the six tables allocated by hash_init; the pointers themselves are not reset*/
+  lodepng_free(hash->head);
+  lodepng_free(hash->val);
+  lodepng_free(hash->chain);
+
+  lodepng_free(hash->zeros);
+  lodepng_free(hash->headz);
+  lodepng_free(hash->chainz);
+}
+
+
+
+static unsigned getHash(const unsigned char* data, size_t size, size_t pos)
+{ /*hash of the (up to) 3 bytes starting at data[pos], masked with HASH_BIT_MASK; returns 0 when pos >= size*/
+  unsigned result = 0;
+  if(pos + 2 < size)
+  {
+    /*A simple shift and xor hash is used. Since the data of PNGs is dominated
+    by zeroes due to the filters, a better hash does not have a significant
+    effect on speed in traversing the chain, and causes more time spend on
+    calculating the hash.*/
+    result ^= (unsigned)(data[pos + 0] << 0u);
+    result ^= (unsigned)(data[pos + 1] << 4u);
+    result ^= (unsigned)(data[pos + 2] << 8u);
+  } else { /*fewer than 3 bytes left: note that these are shifted by 8 bits per byte rather than by 4*/
+    size_t amount, i;
+    if(pos >= size) return 0;
+    amount = size - pos;
+    for(i = 0; i != amount; ++i) result ^= (unsigned)(data[pos + i] << (i * 8u));
+  }
+  return result & HASH_BIT_MASK;
+}
+
+static unsigned countZeros(const unsigned char* data, size_t size, size_t pos)
+{ /*counts the consecutive zero bytes starting at data[pos], stopping at MAX_SUPPORTED_DEFLATE_LENGTH or at the end of data*/
+  const unsigned char* start = data + pos;
+  const unsigned char* end = start + MAX_SUPPORTED_DEFLATE_LENGTH;
+  if(end > data + size) end = data + size;
+  data = start; /*from here on data is the scan cursor*/
+  while(data != end && *data == 0) ++data;
+  /*subtracting two addresses returned as 32-bit number (max value is MAX_SUPPORTED_DEFLATE_LENGTH)*/
+  return (unsigned)(data - start);
+}
+
+/*records position wpos under hashval and under its zero streak length numzeros, linking it to the previous head of each chain. wpos = pos & (windowsize - 1)*/
+static void updateHashChain(Hash* hash, size_t wpos, unsigned hashval, unsigned short numzeros)
+{
+  hash->val[wpos] = (int)hashval;
+  if(hash->head[hashval] != -1) hash->chain[wpos] = hash->head[hashval]; /*-1 means no earlier position has this hash; chain[wpos] then keeps its old value*/
+  hash->head[hashval] = wpos;
+
+  hash->zeros[wpos] = numzeros;
+  if(hash->headz[numzeros] != -1) hash->chainz[wpos] = hash->headz[numzeros];
+  hash->headz[numzeros] = wpos;
+}
+
+/*
+LZ77-encode in[inpos..insize) and append the result to out. Return value is error code (0 = ok,
+60/90 = unsupported windowsize, 81/83/86 = see the checks below). The input are raw bytes, the
+output is in the form of unsigned integers with codes representing literal bytes, or
+length/distance pairs (see addLengthDistance). It uses a hash table technique to let it encode
+faster. When doing LZ77 encoding, a sliding window (of windowsize) is used, and all past bytes in
+that window can be used as the "dictionary". A brute force search through all possible distances
+would be slow, and this hash technique is one out of several ways to speed this up.
+*/
+static unsigned encodeLZ77(uivector* out, Hash* hash,
+                           const unsigned char* in, size_t inpos, size_t insize, unsigned windowsize,
+                           unsigned minmatch, unsigned nicematch, unsigned lazymatching)
+{
+  size_t pos;
+  unsigned i, error = 0;
+  /*for large window lengths, assume the user wants no compression loss. Otherwise, max hash chain length speedup.*/
+  unsigned maxchainlength = windowsize >= 8192 ? windowsize : windowsize / 8;
+  unsigned maxlazymatch = windowsize >= 8192 ? MAX_SUPPORTED_DEFLATE_LENGTH : 64;
+
+  unsigned usezeros = 1; /*not sure if setting it to false for windowsize < 8192 is better or worse*/
+  unsigned numzeros = 0;
+
+  unsigned offset; /*the offset represents the distance in LZ77 terminology*/
+  unsigned length;
+  unsigned lazy = 0;
+  unsigned lazylength = 0, lazyoffset = 0;
+  unsigned hashval;
+  unsigned current_offset, current_length;
+  unsigned prev_offset;
+  const unsigned char *lastptr, *foreptr, *backptr;
+  unsigned hashpos;
+
+  if(windowsize == 0 || windowsize > 32768) return 60; /*error: windowsize smaller/larger than allowed*/
+  if((windowsize & (windowsize - 1)) != 0) return 90; /*error: must be power of two*/
+
+  if(nicematch > MAX_SUPPORTED_DEFLATE_LENGTH) nicematch = MAX_SUPPORTED_DEFLATE_LENGTH;
+
+  for(pos = inpos; pos < insize; ++pos)
+  {
+    size_t wpos = pos & (windowsize - 1); /*position for in 'circular' hash buffers*/
+    unsigned chainlength = 0;
+
+    hashval = getHash(in, insize, pos);
+
+    if(usezeros && hashval == 0)
+    {
+      if(numzeros == 0) numzeros = countZeros(in, insize, pos);
+      else if(pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros; /*reuse the count of the previous position: it only shrinks by one when the streak does not reach one byte further*/
+    }
+    else
+    {
+      numzeros = 0;
+    }
+
+    updateHashChain(hash, wpos, hashval, numzeros);
+
+    /*the length and offset found for the current position*/
+    length = 0;
+    offset = 0;
+
+    hashpos = hash->chain[wpos];
+
+    lastptr = &in[insize < pos + MAX_SUPPORTED_DEFLATE_LENGTH ? insize : pos + MAX_SUPPORTED_DEFLATE_LENGTH]; /*one past the last byte a match starting at pos may cover*/
+
+    /*search for the longest string*/
+    prev_offset = 0;
+    for(;;)
+    {
+      if(chainlength++ >= maxchainlength) break;
+      current_offset = hashpos <= wpos ? wpos - hashpos : wpos - hashpos + windowsize;
+
+      if(current_offset < prev_offset) break; /*stop when went completely around the circular buffer*/
+      prev_offset = current_offset;
+      if(current_offset > 0)
+      {
+        /*test the next characters*/
+        foreptr = &in[pos];
+        backptr = &in[pos - current_offset];
+
+        /*common case in PNGs is lots of zeros. Quickly skip over them as a speedup*/
+        if(numzeros >= 3)
+        {
+          unsigned skip = hash->zeros[hashpos];
+          if(skip > numzeros) skip = numzeros;
+          backptr += skip;
+          foreptr += skip;
+        }
+
+        while(foreptr != lastptr && *backptr == *foreptr) /*maximum supported length by deflate is max length*/
+        {
+          ++backptr;
+          ++foreptr;
+        }
+        current_length = (unsigned)(foreptr - &in[pos]);
+
+        if(current_length > length)
+        {
+          length = current_length; /*the longest length*/
+          offset = current_offset; /*the offset that is related to this longest length*/
+          /*jump out once a length of max length is found (speed gain). This also jumps
+          out if length is MAX_SUPPORTED_DEFLATE_LENGTH*/
+          if(current_length >= nicematch) break;
+        }
+      }
+
+      if(hashpos == hash->chain[hashpos]) break; /*an entry pointing at itself marks an uninitialized chain slot (see hash_init)*/
+
+      if(numzeros >= 3 && length > numzeros)
+      {
+        hashpos = hash->chainz[hashpos];
+        if(hash->zeros[hashpos] != numzeros) break;
+      }
+      else
+      {
+        hashpos = hash->chain[hashpos];
+        /*outdated hash value, happens if particular value was not encountered in whole last window*/
+        if(hash->val[hashpos] != (int)hashval) break;
+      }
+    }
+
+    if(lazymatching)
+    {
+      if(!lazy && length >= 3 && length <= maxlazymatch && length < MAX_SUPPORTED_DEFLATE_LENGTH)
+      {
+        lazy = 1;
+        lazylength = length;
+        lazyoffset = offset;
+        continue; /*try the next byte*/
+      }
+      if(lazy)
+      {
+        lazy = 0;
+        if(pos == 0) ERROR_BREAK(81);
+        if(length > lazylength + 1)
+        {
+          /*push the previous character as literal*/
+          if(!uivector_push_back(out, in[pos - 1])) ERROR_BREAK(83 /*alloc fail*/);
+        }
+        else
+        {
+          length = lazylength;
+          offset = lazyoffset;
+          hash->head[hashval] = -1; /*the same hashchain update will be done, this ensures no wrong alteration*/
+          hash->headz[numzeros] = -1; /*idem*/
+          --pos; /*go back to the position the remembered match belongs to*/
+        }
+      }
+    }
+    if(length >= 3 && offset > windowsize) ERROR_BREAK(86 /*too big (or overflown negative) offset*/);
+
+    /*encode it as length/distance pair or literal value*/
+    if(length < 3) /*only lengths of 3 or higher are supported as length/distance pair*/
+    {
+      if(!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+    }
+    else if(length < minmatch || (length == 3 && offset > 4096))
+    {
+      /*compensate for the fact that longer offsets have more extra bits, a
+      length of only 3 may be not worth it then*/
+      if(!uivector_push_back(out, in[pos])) ERROR_BREAK(83 /*alloc fail*/);
+    }
+    else
+    {
+      addLengthDistance(out, length, offset);
+      for(i = 1; i < length; ++i) /*register the positions covered by the match in the hash chains as well*/
+      {
+        ++pos;
+        wpos = pos & (windowsize - 1);
+        hashval = getHash(in, insize, pos);
+        if(usezeros && hashval == 0)
+        {
+          if(numzeros == 0) numzeros = countZeros(in, insize, pos);
+          else if(pos + numzeros > insize || in[pos + numzeros - 1] != 0) --numzeros;
+        }
+        else
+        {
+          numzeros = 0;
+        }
+        updateHashChain(hash, wpos, hashval, numzeros);
+      }
+    }
+  } /*end of the loop through each character of input*/
+
+  return error;
+}
+
+/* /////////////////////////////////////////////////////////////////////////// */
+
+static unsigned deflateNoCompression(ucvector* out, const unsigned char* data, size_t datasize)
+{
+  /*Appends data to out as stored blocks; always returns 0. Non compressed deflate block data: 1 bit BFINAL,2 bits BTYPE,(5 bits): it jumps to start of next byte,
+  2 bytes LEN, 2 bytes NLEN, LEN bytes literal DATA*/
+
+  size_t i, j, numdeflateblocks = datasize ? (datasize + 65534) / 65535 : 1; /*empty input still needs one (empty) final block to be a valid stream*/
+  size_t datapos = 0; /*size_t rather than unsigned, so it cannot wrap around when datasize exceeds UINT_MAX*/
+  for(i = 0; i != numdeflateblocks; ++i)
+  {
+    unsigned BFINAL, BTYPE, LEN, NLEN;
+    unsigned char firstbyte;
+
+    BFINAL = (i == numdeflateblocks - 1);
+    BTYPE = 0;
+
+    firstbyte = (unsigned char)(BFINAL + ((BTYPE & 1) << 1) + ((BTYPE & 2) << 1));
+    ucvector_push_back(out, firstbyte);
+
+    LEN = 65535;
+    if(datasize - datapos < 65535) LEN = (unsigned)(datasize - datapos); /*subtract in size_t first, the remainder always fits in 16 bits here*/
+    NLEN = 65535 - LEN;
+
+    ucvector_push_back(out, (unsigned char)(LEN & 255));
+    ucvector_push_back(out, (unsigned char)(LEN >> 8));
+    ucvector_push_back(out, (unsigned char)(NLEN & 255));
+    ucvector_push_back(out, (unsigned char)(NLEN >> 8));
+
+    /*Decompressed data*/
+    for(j = 0; j < 65535 && datapos < datasize; ++j)
+    {
+      ucvector_push_back(out, data[datapos++]);
+    }
+  }
+
+  return 0;
+}
+
+/*
+write the lz77-encoded data, which has lit, len and dist codes, to compressed stream using huffman trees.
+tree_ll: the tree for lit and len codes.
+tree_d: the tree for distance codes. bp is the bit position in out that the add*ToStream helpers write at.
+*/
+static void writeLZ77data(size_t* bp, ucvector* out, const uivector* lz77_encoded,
+                          const HuffmanTree* tree_ll, const HuffmanTree* tree_d)
+{
+  size_t i = 0;
+  for(i = 0; i != lz77_encoded->size; ++i)
+  {
+    unsigned val = lz77_encoded->data[i];
+    addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_ll, val), HuffmanTree_getLength(tree_ll, val));
+    if(val > 256) /*for a length code, 3 more things have to be added*/
+    {
+      unsigned length_index = val - FIRST_LENGTH_CODE_INDEX;
+      unsigned n_length_extra_bits = LENGTHEXTRA[length_index];
+      unsigned length_extra_bits = lz77_encoded->data[++i]; /*the three ++i consume the entries that follow the length code in the order addLengthDistance pushed them*/
+
+      unsigned distance_code = lz77_encoded->data[++i];
+
+      unsigned distance_index = distance_code;
+      unsigned n_distance_extra_bits = DISTANCEEXTRA[distance_index];
+      unsigned distance_extra_bits = lz77_encoded->data[++i];
+
+      addBitsToStream(bp, out, length_extra_bits, n_length_extra_bits);
+      addHuffmanSymbol(bp, out, HuffmanTree_getCode(tree_d, distance_code),
+                       HuffmanTree_getLength(tree_d, distance_code));
+      addBitsToStream(bp, out, distance_extra_bits, n_distance_extra_bits);
+    }
+  }
+}
+
+/*Writes data[datapos..dataend) to out as one deflate block of type "dynamic" (BTYPE bits 0,1), with huffman trees built from this block's own symbol frequencies. final is written as BFINAL. Returns 0 on success, else an error code.*/
+static unsigned deflateDynamic(ucvector* out, size_t* bp, Hash* hash,
+                               const unsigned char* data, size_t datapos, size_t dataend,
+                               const LodePNGCompressSettings* settings, unsigned final)
+{
+  unsigned error = 0;
+
+  /*
+  A block is compressed as follows: The PNG data is lz77 encoded, resulting in
+  literal bytes and length/distance pairs. This is then huffman compressed with
+  two huffman trees. One huffman tree is used for the lit and len values ("ll"),
+  another huffman tree is used for the dist values ("d"). These two trees are
+  stored using their code lengths, and to compress even more these code lengths
+  are also run-length encoded and huffman compressed. This gives a huffman tree
+  of code lengths "cl". The code lengths used to describe this third tree are
+  the code length code lengths ("clcl").
+  */
+
+  /*The lz77 encoded data, represented with integers since there will also be length and distance codes in it*/
+  uivector lz77_encoded;
+  HuffmanTree tree_ll; /*tree for lit,len values*/
+  HuffmanTree tree_d; /*tree for distance codes*/
+  HuffmanTree tree_cl; /*tree for encoding the code lengths representing tree_ll and tree_d*/
+  uivector frequencies_ll; /*frequency of lit,len codes*/
+  uivector frequencies_d; /*frequency of dist codes*/
+  uivector frequencies_cl; /*frequency of code length codes*/
+  uivector bitlen_lld; /*lit,len,dist code lengths (in bits), literally (without repeat codes).*/
+  uivector bitlen_lld_e; /*bitlen_lld encoded with repeat codes (this is a rudimentary run length compression)*/
+  /*bitlen_cl is the code length code lengths ("clcl"). The bit lengths of codes to represent tree_cl
+  (these are written as is in the file, it would be crazy to compress these using yet another huffman
+  tree that needs to be represented by yet another set of code lengths)*/
+  uivector bitlen_cl;
+  size_t datasize = dataend - datapos;
+
+  /*
+  Due to the huffman compression of huffman tree representations ("two levels"), there are some analogies:
+  bitlen_lld is to tree_cl what data is to tree_ll and tree_d.
+  bitlen_lld_e is to bitlen_lld what lz77_encoded is to data.
+  bitlen_cl is to bitlen_lld_e what bitlen_lld is to lz77_encoded.
+  */
+
+  unsigned BFINAL = final;
+  size_t numcodes_ll, numcodes_d, i;
+  unsigned HLIT, HDIST, HCLEN;
+
+  uivector_init(&lz77_encoded);
+  HuffmanTree_init(&tree_ll);
+  HuffmanTree_init(&tree_d);
+  HuffmanTree_init(&tree_cl);
+  uivector_init(&frequencies_ll);
+  uivector_init(&frequencies_d);
+  uivector_init(&frequencies_cl);
+  uivector_init(&bitlen_lld);
+  uivector_init(&bitlen_lld_e);
+  uivector_init(&bitlen_cl);
+
+  /*This while loop never loops due to a break at the end, it is here to
+  allow breaking out of it to the cleanup phase on error conditions.*/
+  while(!error)
+  {
+    if(settings->use_lz77)
+    {
+      error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+                         settings->minmatch, settings->nicematch, settings->lazymatching);
+      if(error) break;
+    }
+    else
+    {
+      if(!uivector_resize(&lz77_encoded, datasize)) ERROR_BREAK(83 /*alloc fail*/);
+      for(i = datapos; i < dataend; ++i) lz77_encoded.data[i - datapos] = data[i]; /*no LZ77, but still will be Huffman compressed*/
+    }
+
+    if(!uivector_resizev(&frequencies_ll, 286, 0)) ERROR_BREAK(83 /*alloc fail*/);
+    if(!uivector_resizev(&frequencies_d, 30, 0)) ERROR_BREAK(83 /*alloc fail*/);
+
+    /*Count the frequencies of lit, len and dist codes*/
+    for(i = 0; i != lz77_encoded.size; ++i)
+    {
+      unsigned symbol = lz77_encoded.data[i];
+      ++frequencies_ll.data[symbol];
+      if(symbol > 256) /*a length code: three more values belong to it*/
+      {
+        unsigned dist = lz77_encoded.data[i + 2]; /*the distance code is the second value after the length code*/
+        ++frequencies_d.data[dist];
+        i += 3; /*skip those three values; the loop's ++i then moves on to the next symbol*/
+      }
+    }
+    frequencies_ll.data[256] = 1; /*there will be exactly 1 end code, at the end of the block*/
+
+    /*Make both huffman trees, one for the lit and len codes, one for the dist codes*/
+    error = HuffmanTree_makeFromFrequencies(&tree_ll, frequencies_ll.data, 257, frequencies_ll.size, 15);
+    if(error) break;
+    /*2, not 1, is chosen for mincodes: some buggy PNG decoders require at least 2 symbols in the dist tree*/
+    error = HuffmanTree_makeFromFrequencies(&tree_d, frequencies_d.data, 2, frequencies_d.size, 15);
+    if(error) break;
+
+    numcodes_ll = tree_ll.numcodes; if(numcodes_ll > 286) numcodes_ll = 286;
+    numcodes_d = tree_d.numcodes; if(numcodes_d > 30) numcodes_d = 30;
+    /*store the code lengths of both generated trees in bitlen_lld
+      NOTE(review): the result of uivector_push_back is ignored here and in the run-length loop below; presumably it can fail on allocation like uivector_resize - confirm*/
+    for(i = 0; i != numcodes_ll; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_ll, (unsigned)i));
+    for(i = 0; i != numcodes_d; ++i) uivector_push_back(&bitlen_lld, HuffmanTree_getLength(&tree_d, (unsigned)i));
+
+    /*run-length compress bitlen_lld into bitlen_lld_e by using repeat codes 16 (copy length 3-6 times),
+    17 (3-10 zeroes), 18 (11-138 zeroes)*/
+    for(i = 0; i != (unsigned)bitlen_lld.size; ++i)
+    {
+      unsigned j = 0; /*amount of repetitions*/
+      while(i + j + 1 < (unsigned)bitlen_lld.size && bitlen_lld.data[i + j + 1] == bitlen_lld.data[i]) ++j;
+
+      if(bitlen_lld.data[i] == 0 && j >= 2) /*repeat code for zeroes*/
+      {
+        ++j; /*include the first zero*/
+        if(j <= 10) /*repeat code 17 supports max 10 zeroes*/
+        {
+          uivector_push_back(&bitlen_lld_e, 17);
+          uivector_push_back(&bitlen_lld_e, j - 3);
+        }
+        else /*repeat code 18 supports max 138 zeroes*/
+        {
+          if(j > 138) j = 138;
+          uivector_push_back(&bitlen_lld_e, 18);
+          uivector_push_back(&bitlen_lld_e, j - 11);
+        }
+        i += (j - 1);
+      }
+      else if(j >= 3) /*repeat code for value other than zero*/
+      {
+        size_t k;
+        unsigned num = j / 6, rest = j % 6;
+        uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+        for(k = 0; k < num; ++k)
+        {
+          uivector_push_back(&bitlen_lld_e, 16);
+          uivector_push_back(&bitlen_lld_e, 6 - 3);
+        }
+        if(rest >= 3)
+        {
+          uivector_push_back(&bitlen_lld_e, 16);
+          uivector_push_back(&bitlen_lld_e, rest - 3);
+        }
+        else j -= rest; /*fewer than 3 left over: leave them to the next iterations*/
+        i += j;
+      }
+      else /*too short to benefit from repeat code*/
+      {
+        uivector_push_back(&bitlen_lld_e, bitlen_lld.data[i]);
+      }
+    }
+
+    /*generate tree_cl, the huffmantree of huffmantrees*/
+
+    if(!uivector_resizev(&frequencies_cl, NUM_CODE_LENGTH_CODES, 0)) ERROR_BREAK(83 /*alloc fail*/);
+    for(i = 0; i != bitlen_lld_e.size; ++i)
+    {
+      ++frequencies_cl.data[bitlen_lld_e.data[i]];
+      /*after a repeat code come the bits that specify the number of repetitions,
+      those don't need to be in the frequencies_cl calculation*/
+      if(bitlen_lld_e.data[i] >= 16) ++i;
+    }
+
+    error = HuffmanTree_makeFromFrequencies(&tree_cl, frequencies_cl.data,
+                                            frequencies_cl.size, frequencies_cl.size, 7);
+    if(error) break;
+
+    if(!uivector_resize(&bitlen_cl, tree_cl.numcodes)) ERROR_BREAK(83 /*alloc fail*/);
+    for(i = 0; i != tree_cl.numcodes; ++i)
+    {
+      /*lengths of code length tree is in the order as specified by deflate*/
+      bitlen_cl.data[i] = HuffmanTree_getLength(&tree_cl, CLCL_ORDER[i]);
+    }
+    while(bitlen_cl.data[bitlen_cl.size - 1] == 0 && bitlen_cl.size > 4)
+    {
+      /*remove zeros at the end, but minimum size must be 4*/
+      if(!uivector_resize(&bitlen_cl, bitlen_cl.size - 1)) ERROR_BREAK(83 /*alloc fail*/);
+    }
+    if(error) break; /*ERROR_BREAK above only leaves the inner while loop*/
+
+    /*
+    Write everything into the output
+
+    After the BFINAL and BTYPE, the dynamic block consists out of the following:
+    - 5 bits HLIT, 5 bits HDIST, 4 bits HCLEN
+    - (HCLEN+4)*3 bits code lengths of code length alphabet
+    - HLIT + 257 code lengths of lit/length alphabet (encoded using the code length
+      alphabet, + possible repetition codes 16, 17, 18)
+    - HDIST + 1 code lengths of distance alphabet (encoded using the code length
+      alphabet, + possible repetition codes 16, 17, 18)
+    - compressed data
+    - 256 (end code)
+    */
+
+    /*Write block type*/
+    addBitToStream(bp, out, BFINAL);
+    addBitToStream(bp, out, 0); /*first bit of BTYPE "dynamic"*/
+    addBitToStream(bp, out, 1); /*second bit of BTYPE "dynamic"*/
+
+    /*write the HLIT, HDIST and HCLEN values*/
+    HLIT = (unsigned)(numcodes_ll - 257);
+    HDIST = (unsigned)(numcodes_d - 1);
+    HCLEN = (unsigned)bitlen_cl.size - 4;
+    /*trim zeroes for HCLEN. HLIT and HDIST were already trimmed at tree creation*/
+    while(!bitlen_cl.data[HCLEN + 4 - 1] && HCLEN > 0) --HCLEN;
+    addBitsToStream(bp, out, HLIT, 5);
+    addBitsToStream(bp, out, HDIST, 5);
+    addBitsToStream(bp, out, HCLEN, 4);
+
+    /*write the code lengths of the code length alphabet*/
+    for(i = 0; i != HCLEN + 4; ++i) addBitsToStream(bp, out, bitlen_cl.data[i], 3);
+
+    /*write the lengths of the lit/len AND the dist alphabet*/
+    for(i = 0; i != bitlen_lld_e.size; ++i)
+    {
+      addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_cl, bitlen_lld_e.data[i]),
+                       HuffmanTree_getLength(&tree_cl, bitlen_lld_e.data[i]));
+      /*extra bits of repeat codes*/
+      if(bitlen_lld_e.data[i] == 16) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 2);
+      else if(bitlen_lld_e.data[i] == 17) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 3);
+      else if(bitlen_lld_e.data[i] == 18) addBitsToStream(bp, out, bitlen_lld_e.data[++i], 7);
+    }
+
+    /*write the compressed data symbols*/
+    writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+    /*error: the length of the end code 256 must be larger than 0 (this is only checked after the data symbols were written)*/
+    if(HuffmanTree_getLength(&tree_ll, 256) == 0) ERROR_BREAK(64);
+
+    /*write the end code*/
+    addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+
+    break; /*end of error-while*/
+  }
+
+  /*cleanup*/
+  uivector_cleanup(&lz77_encoded);
+  HuffmanTree_cleanup(&tree_ll);
+  HuffmanTree_cleanup(&tree_d);
+  HuffmanTree_cleanup(&tree_cl);
+  uivector_cleanup(&frequencies_ll);
+  uivector_cleanup(&frequencies_d);
+  uivector_cleanup(&frequencies_cl);
+  uivector_cleanup(&bitlen_lld_e);
+  uivector_cleanup(&bitlen_lld);
+  uivector_cleanup(&bitlen_cl);
+
+  return error;
+}
+
+static unsigned deflateFixed(ucvector* out, size_t* bp, Hash* hash,
+                             const unsigned char* data,
+                             size_t datapos, size_t dataend,
+                             const LodePNGCompressSettings* settings, unsigned final) /*writes data[datapos..dataend) to out as one deflate block coded with the trees from generateFixedLitLenTree/generateFixedDistanceTree; returns 0 or the error of encodeLZ77*/
+{
+  HuffmanTree tree_ll; /*tree for literal values and length codes*/
+  HuffmanTree tree_d; /*tree for distance codes*/
+
+  unsigned BFINAL = final;
+  unsigned error = 0;
+  size_t i;
+
+  HuffmanTree_init(&tree_ll);
+  HuffmanTree_init(&tree_d);
+
+  generateFixedLitLenTree(&tree_ll);
+  generateFixedDistanceTree(&tree_d);
+
+  addBitToStream(bp, out, BFINAL); /*the block header goes out first, so it is already in the stream if encodeLZ77 fails below*/
+  addBitToStream(bp, out, 1); /*first bit of BTYPE*/
+  addBitToStream(bp, out, 0); /*second bit of BTYPE*/
+
+  if(settings->use_lz77) /*LZ77 encoded*/
+  {
+    uivector lz77_encoded;
+    uivector_init(&lz77_encoded);
+    error = encodeLZ77(&lz77_encoded, hash, data, datapos, dataend, settings->windowsize,
+                       settings->minmatch, settings->nicematch, settings->lazymatching);
+    if(!error) writeLZ77data(bp, out, &lz77_encoded, &tree_ll, &tree_d);
+    uivector_cleanup(&lz77_encoded);
+  }
+  else /*no LZ77, but still will be Huffman compressed*/
+  {
+    for(i = datapos; i < dataend; ++i) /*every input byte is written as a literal symbol*/
+    {
+      addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, data[i]), HuffmanTree_getLength(&tree_ll, data[i]));
+    }
+  }
+  /*add END code*/
+  if(!error) addHuffmanSymbol(bp, out, HuffmanTree_getCode(&tree_ll, 256), HuffmanTree_getLength(&tree_ll, 256));
+
+  /*cleanup*/
+  HuffmanTree_cleanup(&tree_ll);
+  HuffmanTree_cleanup(&tree_d);
+
+  return error;
+}
+
+static unsigned lodepng_deflatev(ucvector* out, const unsigned char* in, size_t insize,
+                                 const LodePNGCompressSettings* settings) /*deflates in[0..insize) into out as one or more blocks of type settings->btype; returns 0 on success, else an error code*/
+{
+  unsigned error = 0;
+  size_t i, blocksize, numdeflateblocks;
+  size_t bp = 0; /*the bit pointer*/
+  Hash hash;
+
+  if(settings->btype > 2) return 61; /*btype must be 0, 1 or 2*/
+  else if(settings->btype == 0) return deflateNoCompression(out, in, insize);
+  else if(settings->btype == 1) blocksize = insize ? insize : 1; /*one block for the whole input; never 0, it is used as divisor below*/
+  else /*if(settings->btype == 2)*/
+  {
+    /*on PNGs, deflate blocks of 65-262k seem to give most dense encoding*/
+    blocksize = insize / 8 + 8;
+    if(blocksize < 65536) blocksize = 65536;
+    if(blocksize > 262144) blocksize = 262144;
+  }
+
+  numdeflateblocks = (insize + blocksize - 1) / blocksize;
+  if(numdeflateblocks == 0) numdeflateblocks = 1; /*empty input still gets one (empty) block*/
+
+  error = hash_init(&hash, settings->windowsize);
+  if(error) return error;
+
+  for(i = 0; i != numdeflateblocks && !error; ++i)
+  {
+    unsigned final = (i == numdeflateblocks - 1); /*only the last block is marked as final*/
+    size_t start = i * blocksize;
+    size_t end = start + blocksize;
+    if(end > insize) end = insize;
+
+    if(settings->btype == 1) error = deflateFixed(out, &bp, &hash, in, start, end, settings, final);
+    else if(settings->btype == 2) error = deflateDynamic(out, &bp, &hash, in, start, end, settings, final);
+  }
+
+  hash_cleanup(&hash);
+
+  return error;
+}
+
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+                         const unsigned char* in, size_t insize,
+                         const LodePNGCompressSettings* settings)
+{
+  /*the caller's buffer is handed to lodepng_deflatev through a ucvector*/
+  ucvector result;
+  unsigned status;
+  ucvector_init_buffer(&result, *out, *outsize);
+  status = lodepng_deflatev(&result, in, insize, settings);
+  *outsize = result.size; *out = result.data; /*written back whether or not deflating succeeded*/
+  return status;
+}
+
+static unsigned deflate(unsigned char** out, size_t* outsize,
+                        const unsigned char* in, size_t insize,
+                        const LodePNGCompressSettings* settings)
+{
+  /*
+  Deflate through the user-supplied settings->custom_deflate when it is set,
+  through the built-in lodepng_deflate otherwise. The result of whichever
+  function ran is returned unchanged.
+  */
+  if(!settings->custom_deflate) return lodepng_deflate(out, outsize, in, insize, settings);
+
+  return settings->custom_deflate(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Adler32                                                                  */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static unsigned update_adler32(unsigned adler, const unsigned char* data, unsigned len)
+{
+  /*adler packs two running sums: one in its low 16 bits, one in its high 16 bits*/
+  unsigned low = adler & 0xffff, high = (adler >> 16) & 0xffff;
+  unsigned pos = 0;
+
+  while(pos != len)
+  {
+    /*the modulo is taken once per run of at most 5550 bytes instead of once per byte*/
+    unsigned stop = (len - pos > 5550) ? pos + 5550 : len;
+    for(; pos != stop; ++pos)
+    {
+      low += data[pos];
+      high += low;
+    }
+    low %= 65521;
+    high %= 65521;
+  }
+
+  /*pack both sums back into one value*/
+  return (high << 16) | low;
+}
+
+/*Adler32 of data[0..len-1]: the running checksum, started from the initial value 1*/
+static unsigned adler32(const unsigned char* data, unsigned len)
+{
+  return update_adler32(1u, data, len);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Zlib                                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                                 size_t insize, const LodePNGDecompressSettings* settings) /*checks the 2 zlib header bytes of in, inflates the rest into *out / *outsize and, unless settings->ignore_adler32 is set, verifies the adler32 checksum; returns 0 or an error code*/
+{
+  unsigned error = 0;
+  unsigned CM, CINFO, FDICT;
+
+  if(insize < 2) return 53; /*error, size of zlib data too small*/
+  /*read information from zlib header*/
+  if((in[0] * 256 + in[1]) % 31 != 0)
+  {
+    /*error: 256 * in[0] + in[1] must be a multiple of 31, the FCHECK value is supposed to be made that way*/
+    return 24;
+  }
+
+  CM = in[0] & 15;
+  CINFO = (in[0] >> 4) & 15;
+  /*FCHECK = in[1] & 31;*/ /*FCHECK is already tested above*/
+  FDICT = (in[1] >> 5) & 1;
+  /*FLEVEL = (in[1] >> 6) & 3;*/ /*FLEVEL is not used here*/
+
+  if(CM != 8 || CINFO > 7)
+  {
+    /*error: only compression method 8: inflate with sliding window of 32k is supported by the PNG spec*/
+    return 25;
+  }
+  if(FDICT != 0)
+  {
+    /*error: the specification of PNG says about the zlib stream:
+      "The additional flags shall not specify a preset dictionary."*/
+    return 26;
+  }
+
+  error = inflate(out, outsize, in + 2, insize - 2, settings); /*the deflate data starts right after the 2 header bytes*/
+  if(error) return error;
+
+  if(!settings->ignore_adler32)
+  {
+    unsigned ADLER32 = lodepng_read32bitInt(&in[insize - 4]); /*the checksum is read from the last 4 bytes of in. NOTE(review): only insize >= 2 is checked in this function; this presumably relies on inflate rejecting inputs too short to also hold a checksum - confirm, also for a custom inflate*/
+    unsigned checksum = adler32(*out, (unsigned)(*outsize));
+    if(checksum != ADLER32) return 58; /*error, adler checksum not correct, data must be corrupted*/
+  }
+
+  return 0; /*no error*/
+}
+
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                                size_t insize, const LodePNGDecompressSettings* settings)
+{
+  /*
+  Decompress with the user-supplied zlib function when one is set in the
+  settings, with lodepng_zlib_decompress otherwise; the callee's result is
+  returned as is.
+  */
+  if(!settings->custom_zlib) return lodepng_zlib_decompress(out, outsize, in, insize, settings);
+
+  return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                               size_t insize, const LodePNGCompressSettings* settings)
+{
+  /*Appends a zlib stream for in[0..insize) to *out: 2 header bytes, the deflate data, the adler32 of the input.
+  Initially, *out must be NULL and outsize 0 (or a buffer from the lodepng allocator). Returns 0 or the error of deflate.*/
+  ucvector outv;
+  size_t i;
+  unsigned error;
+  unsigned char* deflatedata = 0;
+  size_t deflatesize = 0;
+
+  /*zlib data: 1 byte CMF (CM+CINFO), 1 byte FLG, deflate data, 4 byte ADLER32 checksum of the Decompressed data*/
+  unsigned CMF = 120; /*0b01111000: CM 8, CINFO 7. With CINFO 7, any window size up to 32768 can be used.*/
+  unsigned FLEVEL = 0;
+  unsigned FDICT = 0;
+  unsigned CMFFLG = 256 * CMF + FDICT * 32 + FLEVEL * 64;
+  unsigned FCHECK = 31 - CMFFLG % 31; /*makes the 16-bit header value a multiple of 31*/
+  CMFFLG += FCHECK;
+
+  /*ucvector-controlled version of the output buffer, for dynamic array*/
+  ucvector_init_buffer(&outv, *out, *outsize);
+
+  ucvector_push_back(&outv, (unsigned char)(CMFFLG >> 8));
+  ucvector_push_back(&outv, (unsigned char)(CMFFLG & 255));
+
+  error = deflate(&deflatedata, &deflatesize, in, insize, settings);
+
+  if(!error)
+  {
+    unsigned ADLER32 = adler32(in, (unsigned)insize);
+    for(i = 0; i != deflatesize; ++i) ucvector_push_back(&outv, deflatedata[i]);
+    lodepng_add32bitInt(&outv, ADLER32);
+  }
+  lodepng_free(deflatedata); /*also on error: deflate may have allocated partial output before failing*/
+
+  *out = outv.data;
+  *outsize = outv.size;
+
+  return error;
+}
+
+/* compress using the default or custom zlib function */
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                              size_t insize, const LodePNGCompressSettings* settings)
+{
+  /*
+  The user-supplied settings->custom_zlib takes precedence when it is set;
+  lodepng_zlib_compress is the fallback. The callee's result is returned
+  as is.
+  */
+  if(!settings->custom_zlib) return lodepng_zlib_compress(out, outsize, in, insize, settings);
+
+  return settings->custom_zlib(out, outsize, in, insize, settings);
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#else /*no LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+static unsigned zlib_decompress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                                size_t insize, const LodePNGDecompressSettings* settings)
+{
+  /*built without the zlib code: only a custom function can decompress, error 87 when none is provided*/
+  return settings->custom_zlib ? settings->custom_zlib(out, outsize, in, insize, settings) : 87;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+static unsigned zlib_compress(unsigned char** out, size_t* outsize, const unsigned char* in,
+                              size_t insize, const LodePNGCompressSettings* settings)
+{
+  /*built without the zlib code: only a custom function can compress, error 87 when none is provided*/
+  return settings->custom_zlib ? settings->custom_zlib(out, outsize, in, insize, settings) : 87;
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/*default value for the windowsize compress setting; this is a good tradeoff between speed and compression ratio*/
+#define DEFAULT_WINDOWSIZE 2048
+
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings)
+{
+  /*no user hooks: the built-in zlib and deflate code is used*/
+  settings->custom_context = 0;
+  settings->custom_deflate = 0;
+  settings->custom_zlib = 0;
+
+  /*block type 2: huffman trees generated per block instead of the predefined ones; LZ77 enabled*/
+  settings->btype = 2;
+  settings->lazymatching = 1;
+  settings->nicematch = 128;
+  settings->minmatch = 3;
+  settings->windowsize = DEFAULT_WINDOWSIZE; settings->use_lz77 = 1;
+}
+
+const LodePNGCompressSettings lodepng_default_compress_settings = {2, 1, DEFAULT_WINDOWSIZE, 3, 128, 1, 0, 0, 0}; /*positional initializer; presumably the values lodepng_compress_settings_init assigns, in struct field order - confirm against the struct declaration*/
+
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings)
+{
+  /*no user hooks installed*/
+  settings->custom_context = 0;
+  settings->custom_inflate = 0;
+  settings->custom_zlib = 0;
+  settings->ignore_adler32 = 0; /*0: lodepng_zlib_decompress verifies the adler32 checksum*/
+}
+
+const LodePNGDecompressSettings lodepng_default_decompress_settings = {0, 0, 0, 0}; /*all fields zero, like lodepng_decompress_settings_init leaves them*/
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // End of Zlib related code. Begin of PNG related code.                 // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_PNG
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / CRC32                                                                  / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+
+#ifndef LODEPNG_NO_COMPILE_CRC
+/* Lookup table used by lodepng_crc32, indexed by a byte value (256 entries). CRC polynomial: 0xedb88320 */
+static unsigned lodepng_crc32_table[256] = {
+           0u, 1996959894u, 3993919788u, 2567524794u,  124634137u, 1886057615u, 3915621685u, 2657392035u,
+   249268274u, 2044508324u, 3772115230u, 2547177864u,  162941995u, 2125561021u, 3887607047u, 2428444049u,
+   498536548u, 1789927666u, 4089016648u, 2227061214u,  450548861u, 1843258603u, 4107580753u, 2211677639u,
+   325883990u, 1684777152u, 4251122042u, 2321926636u,  335633487u, 1661365465u, 4195302755u, 2366115317u,
+   997073096u, 1281953886u, 3579855332u, 2724688242u, 1006888145u, 1258607687u, 3524101629u, 2768942443u,
+   901097722u, 1119000684u, 3686517206u, 2898065728u,  853044451u, 1172266101u, 3705015759u, 2882616665u,
+   651767980u, 1373503546u, 3369554304u, 3218104598u,  565507253u, 1454621731u, 3485111705u, 3099436303u,
+   671266974u, 1594198024u, 3322730930u, 2970347812u,  795835527u, 1483230225u, 3244367275u, 3060149565u,
+  1994146192u,   31158534u, 2563907772u, 4023717930u, 1907459465u,  112637215u, 2680153253u, 3904427059u,
+  2013776290u,  251722036u, 2517215374u, 3775830040u, 2137656763u,  141376813u, 2439277719u, 3865271297u,
+  1802195444u,  476864866u, 2238001368u, 4066508878u, 1812370925u,  453092731u, 2181625025u, 4111451223u,
+  1706088902u,  314042704u, 2344532202u, 4240017532u, 1658658271u,  366619977u, 2362670323u, 4224994405u,
+  1303535960u,  984961486u, 2747007092u, 3569037538u, 1256170817u, 1037604311u, 2765210733u, 3554079995u,
+  1131014506u,  879679996u, 2909243462u, 3663771856u, 1141124467u,  855842277u, 2852801631u, 3708648649u,
+  1342533948u,  654459306u, 3188396048u, 3373015174u, 1466479909u,  544179635u, 3110523913u, 3462522015u,
+  1591671054u,  702138776u, 2966460450u, 3352799412u, 1504918807u,  783551873u, 3082640443u, 3233442989u,
+  3988292384u, 2596254646u,   62317068u, 1957810842u, 3939845945u, 2647816111u,   81470997u, 1943803523u,
+  3814918930u, 2489596804u,  225274430u, 2053790376u, 3826175755u, 2466906013u,  167816743u, 2097651377u,
+  4027552580u, 2265490386u,  503444072u, 1762050814u, 4150417245u, 2154129355u,  426522225u, 1852507879u,
+  4275313526u, 2312317920u,  282753626u, 1742555852u, 4189708143u, 2394877945u,  397917763u, 1622183637u,
+  3604390888u, 2714866558u,  953729732u, 1340076626u, 3518719985u, 2797360999u, 1068828381u, 1219638859u,
+  3624741850u, 2936675148u,  906185462u, 1090812512u, 3747672003u, 2825379669u,  829329135u, 1181335161u,
+  3412177804u, 3160834842u,  628085408u, 1382605366u, 3423369109u, 3138078467u,  570562233u, 1426400815u,
+  3317316542u, 2998733608u,  733239954u, 1555261956u, 3268935591u, 3050360625u,  752459403u, 1541320221u,
+  2607071920u, 3965973030u, 1969922972u,   40735498u, 2617837225u, 3943577151u, 1913087877u,   83908371u,
+  2512341634u, 3803740692u, 2075208622u,  213261112u, 2463272603u, 3855990285u, 2094854071u,  198958881u,
+  2262029012u, 4057260610u, 1759359992u,  534414190u, 2176718541u, 4139329115u, 1873836001u,  414664567u,
+  2282248934u, 4279200368u, 1711684554u,  285281116u, 2405801727u, 4167216745u, 1634467795u,  376229701u,
+  2685067896u, 3608007406u, 1308918612u,  956543938u, 2808555105u, 3495958263u, 1231636301u, 1047427035u,
+  2932959818u, 3654703836u, 1088359270u,  936918000u, 2847714899u, 3736837829u, 1202900863u,  817233897u,
+  3183342108u, 3401237130u, 1404277552u,  615818150u, 3134207493u, 3453421203u, 1423857449u,  601450431u,
+  3009837614u, 3294710456u, 1567103746u,  711928724u, 3020668471u, 3272380065u, 1510334235u,  755167117u
+};
+
+/*CRC of data[0..length-1]: table driven, one lookup per byte, all bits flipped before and after*/
+unsigned lodepng_crc32(const unsigned char* data, size_t length)
+{
+  unsigned crc = 0xffffffffu;
+  size_t pos = 0;
+  while(pos != length)
+  {
+    crc = (crc >> 8) ^ lodepng_crc32_table[(crc ^ data[pos++]) & 0xff];
+  }
+  return crc ^ 0xffffffffu;
+}
+#else /* !LODEPNG_NO_COMPILE_CRC */
+unsigned lodepng_crc32(const unsigned char* data, size_t length);
+#endif /* !LODEPNG_NO_COMPILE_CRC */
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Reading and writing single bits and bytes from/to stream for LodePNG   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+static unsigned char readBitFromReversedStream(size_t* bitpointer, const unsigned char* bitstream)
+{
+  size_t pos = (*bitpointer)++; /*read at the current bit position, then advance it by one*/
+  unsigned shift = 7 - (unsigned)(pos & 0x7); /*within a byte, the first bit is the most significant one*/
+  return (unsigned char)((bitstream[pos >> 3] >> shift) & 1);
+}
+
+static unsigned readBitsFromReversedStream(size_t* bitpointer, const unsigned char* bitstream, size_t nbits)
+{
+  /*the first bit read ends up most significant: each further bit shifts the value left by one*/
+  unsigned value = 0;
+  size_t remaining = nbits;
+  while(remaining-- > 0)
+  {
+    value = (value << 1) | (unsigned)readBitFromReversedStream(bitpointer, bitstream);
+  }
+  return value;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+static void setBitOfReversedStream0(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+  /*
+  Only ever ORs into the stream, so the target bit has to be 0 beforehand;
+  a zero bit leaves the byte untouched. The bit pointer advances either way.
+  */
+  size_t pos = (*bitpointer)++;
+
+  if(bit) bitstream[pos >> 3] |= (bit << (7 - (pos & 0x7)));
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+static void setBitOfReversedStream(size_t* bitpointer, unsigned char* bitstream, unsigned char bit)
+{
+  size_t pos = (*bitpointer)++; /*write at the current bit position, then advance it by one*/
+  unsigned char mask = (unsigned char)(1 << (7 - (pos & 0x7)));
+  if(bit) bitstream[pos >> 3] |= mask; /*works whether the old bit was 0 or 1*/
+  else bitstream[pos >> 3] &= (unsigned char)(~mask);
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG chunks                                                             / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+unsigned lodepng_chunk_length(const unsigned char* chunk)
+{
+  return lodepng_read32bitInt(chunk); /*the length is the 32-bit integer at the very start of the chunk*/
+}
+
+void lodepng_chunk_type(char type[5], const unsigned char* chunk)
+{
+  const unsigned char* name = chunk + 4; /*the 4 type letters come after the first 4 bytes of the chunk*/
+  type[0] = (char)name[0]; type[1] = (char)name[1]; type[2] = (char)name[2]; type[3] = (char)name[3];
+  type[4] = 0; /*terminator, so type can be used as a C string*/
+}
+
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type)
+{
+  unsigned k = 0; /*number of leading type letters that match; stays 0 when type is not exactly 4 characters long*/
+  if(strlen(type) == 4) while(k != 4 && chunk[4 + k] == type[k]) ++k;
+  return (unsigned char)(k == 4);
+}
+
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk)
+{
+  return (unsigned char)((chunk[4] >> 5) & 1); /*bit 5 (value 32) of the first type letter*/
+}
+
+unsigned char lodepng_chunk_private(const unsigned char* chunk)
+{
+  return((chunk[5] & 32) != 0); /*1 when bit 5 of the SECOND type letter is set: that is the private bit in PNG; chunk[6] holds the reserved bit*/
+}
+
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk)
+{
+  return (unsigned char)((chunk[7] >> 5) & 1); /*bit 5 (value 32) of the fourth type letter*/
+}
+
+unsigned char* lodepng_chunk_data(unsigned char* chunk)
+{
+  return chunk + 8; /*the data starts behind the 4 length bytes and the 4 type letters*/
+}
+
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk)
+{
+  return chunk + 8; /*the data starts behind the 4 length bytes and the 4 type letters*/
+}
+
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk)
+{
+  /*
+  Returns 0 when the CRC stored behind the chunk data equals the CRC computed over
+  the 4 type letters plus the data (the length field is not covered), 1 otherwise.
+  */
+  unsigned datalength = lodepng_chunk_length(chunk);
+  return lodepng_read32bitInt(&chunk[datalength + 8]) != lodepng_crc32(&chunk[4], datalength + 4);
+}
+
+void lodepng_chunk_generate_crc(unsigned char* chunk)
+{
+  /*the CRC covers the type letters and the data, and is stored right behind the data*/
+  unsigned datalength = lodepng_chunk_length(chunk);
+  lodepng_set32bitInt(chunk + 8 + datalength, lodepng_crc32(chunk + 4, datalength + 4));
+}
+
+unsigned char* lodepng_chunk_next(unsigned char* chunk)
+{
+  unsigned step = 12 + lodepng_chunk_length(chunk); /*12 bytes of the chunk besides its data: length, type, CRC*/
+  return chunk + step;
+}
+
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk)
+{
+  unsigned step = 12 + lodepng_chunk_length(chunk); /*12 bytes of the chunk besides its data: length, type, CRC*/
+  return chunk + step;
+}
+
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk) /*grows *out by reallocation and copies the whole chunk (its data length + 12 bytes) to the end; returns 0, 77 or 83*/
+{
+  unsigned i;
+  unsigned total_chunk_length = lodepng_chunk_length(chunk) + 12;
+  unsigned char *chunk_start, *new_buffer;
+  size_t new_length = (*outlength) + total_chunk_length;
+  if(new_length < total_chunk_length || new_length < (*outlength)) return 77; /*integer overflow happened*/
+
+  new_buffer = (unsigned char*)lodepng_realloc(*out, new_length); /*NOTE(review): chunk is read after this call, so it presumably must not point into the *out buffer - confirm with callers*/
+  if(!new_buffer) return 83; /*alloc fail*/
+  (*out) = new_buffer; /*on failure above, *out and *outlength are left as they were*/
+  (*outlength) = new_length;
+  chunk_start = &(*out)[new_length - total_chunk_length];
+
+  for(i = 0; i != total_chunk_length; ++i) chunk_start[i] = chunk[i];
+
+  return 0;
+}
+
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+                              const char* type, const unsigned char* data) /*grows *out by reallocation and writes a new chunk at the end: length, the 4 letters of type, length bytes of data, CRC; returns 0, 77 or 83*/
+{
+  unsigned i;
+  unsigned char *chunk, *new_buffer;
+  size_t new_length = (*outlength) + length + 12;
+  if(new_length < length + 12 || new_length < (*outlength)) return 77; /*integer overflow happened*/
+  new_buffer = (unsigned char*)lodepng_realloc(*out, new_length);
+  if(!new_buffer) return 83; /*alloc fail*/
+  (*out) = new_buffer; /*on failure above, *out and *outlength are left as they were*/
+  (*outlength) = new_length;
+  chunk = &(*out)[(*outlength) - length - 12]; /*start of the newly added space*/
+
+  /*1: length*/
+  lodepng_set32bitInt(chunk, (unsigned)length);
+
+  /*2: chunk name (4 letters)*/
+  chunk[4] = (unsigned char)type[0];
+  chunk[5] = (unsigned char)type[1];
+  chunk[6] = (unsigned char)type[2];
+  chunk[7] = (unsigned char)type[3];
+
+  /*3: the data*/
+  for(i = 0; i != length; ++i) chunk[8 + i] = data[i];
+
+  /*4: CRC (of the chunkname characters and the data)*/
+  lodepng_chunk_generate_crc(chunk);
+
+  return 0;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / Color types and such                                                   / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*Returns 0 when bit depth bd is allowed for colortype, 37 when it is not, 31 for an unknown colortype*/
+static unsigned checkColorValidity(LodePNGColorType colortype, unsigned bd) /*bd = bitdepth*/
+{
+  unsigned sub8 = (bd == 1 || bd == 2 || bd == 4); /*the bit depths below 8*/
+  unsigned allowed;
+  switch(colortype)
+  {
+    case 0: allowed = (sub8 || bd == 8 || bd == 16); break; /*grey*/
+    case 3: allowed = (sub8 || bd == 8); break; /*palette*/
+    case 2: case 4: case 6: allowed = (bd == 8 || bd == 16); break; /*RGB, grey + alpha, RGBA*/
+    default: return 31;
+  }
+  return allowed ? 0 : 37; /*0: allowed color type / bits combination*/
+}
+
+static unsigned getNumColorChannels(LodePNGColorType colortype)
+{
+  /*
+  number of channels per pixel; 0 stands for a color type that does not exist
+  */
+  unsigned channels = 0;
+  if(colortype == 0 || colortype == 3) channels = 1; /*grey, palette*/
+  else if(colortype == 4) channels = 2; /*grey + alpha*/
+  else if(colortype == 2) channels = 3; /*RGB*/
+  else if(colortype == 6) channels = 4; /*RGBA*/
+  return channels;
+}
+
+static unsigned lodepng_get_bpp_lct(LodePNGColorType colortype, unsigned bitdepth)
+{
+  unsigned channels = getNumColorChannels(colortype);
+  return channels * bitdepth; /*bits per pixel: every channel takes bitdepth bits*/
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+void lodepng_color_mode_init(LodePNGColorMode* info) /*defaults: 8-bit RGBA, no color key, no palette*/
+{
+  info->key_defined = 0;
+  info->key_r = info->key_g = info->key_b = 0;
+  info->colortype = LCT_RGBA;
+  info->bitdepth = 8;
+  info->palette = 0; /*nothing allocated yet*/
+  info->palettesize = 0;
+}
+
+void lodepng_color_mode_cleanup(LodePNGColorMode* info)
+{
+  lodepng_palette_clear(info); /*the palette is the only field that is freed and reset here*/
+}
+
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source)
+{
+  size_t i; /*this function deep-copies source into dest; it returns 0 on success or 83 on alloc fail*/
+  lodepng_color_mode_cleanup(dest);
+  *dest = *source; /*copies all plain fields; the palette pointer is replaced by an own buffer below*/
+  if(source->palette)
+  {
+    dest->palette = (unsigned char*)lodepng_malloc(1024);
+    if(!dest->palette && source->palettesize) { dest->palettesize = 0; return 83; } /*alloc fail: keep size and buffer consistent*/
+    for(i = 0; i != source->palettesize * 4; ++i) dest->palette[i] = source->palette[i];
+  }
+  return 0;
+}
+
+static int lodepng_color_mode_equal(const LodePNGColorMode* a, const LodePNGColorMode* b) /*1 if equal, 0 if not*/
+{
+  size_t i;
+  if(a->colortype != b->colortype) return 0;
+  if(a->bitdepth != b->bitdepth) return 0;
+  if(a->key_defined != b->key_defined) return 0;
+  if(a->key_defined) /*the key values only matter when a key is defined*/
+  {
+    if(a->key_r != b->key_r) return 0;
+    if(a->key_g != b->key_g) return 0;
+    if(a->key_b != b->key_b) return 0;
+  }
+  /*The palettes are always compared: the sizes must match and every byte of the
+  entries must be equal. The rule that treated an empty palette as equal to any
+  other palette is disabled, see the commented-out condition below.*/
+  if(1/*a->palettesize != 0 && b->palettesize != 0*/) {
+    if(a->palettesize != b->palettesize) return 0;
+    for(i = 0; i != a->palettesize * 4; ++i)
+    {
+      if(a->palette[i] != b->palette[i]) return 0;
+    }
+  }
+  return 1;
+}
+
+void lodepng_palette_clear(LodePNGColorMode* info) /*frees the palette buffer and marks the palette as empty*/
+{
+  if(info->palette) lodepng_free(info->palette);
+  info->palette = 0; /*reset so that a repeated clear does not free twice*/
+  info->palettesize = 0;
+}
+
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+                             unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+  unsigned char* data;
+  /*appends one RGBA entry; returns 0 on success, 38 (palette too big) if it already has 256 entries, 83 on alloc fail*/
+  if(info->palettesize >= 256) return 38; /*the fixed 1024-byte buffer below has no room for a 257th entry*/
+  if(!info->palette) /*allocate palette if empty*/
+  {
+    /*room for 256 colors with 4 bytes each*/
+    data = (unsigned char*)lodepng_realloc(info->palette, 1024);
+    if(!data) return 83; /*alloc fail*/
+    else info->palette = data;
+  }
+  info->palette[4 * info->palettesize + 0] = r;
+  info->palette[4 * info->palettesize + 1] = g;
+  info->palette[4 * info->palettesize + 2] = b;
+  info->palette[4 * info->palettesize + 3] = a;
+  ++info->palettesize;
+  return 0;
+}
+
+unsigned lodepng_get_bpp(const LodePNGColorMode* info)
+{
+  /*calculate bits per pixel out of the colortype and bitdepth stored in info*/
+  return lodepng_get_bpp_lct(info->colortype, info->bitdepth);
+}
+
+unsigned lodepng_get_channels(const LodePNGColorMode* info)
+{
+  return getNumColorChannels(info->colortype); /*0 if the color type does not exist*/
+}
+
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info)
+{
+  return info->colortype == LCT_GREY || info->colortype == LCT_GREY_ALPHA; /*1 for grey with or without alpha, else 0*/
+}
+
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info)
+{
+  return (info->colortype & 4) != 0; /*4 or 6: these are the color type values that have bit 4 set*/
+}
+
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info)
+{
+  return info->colortype == LCT_PALETTE; /*1 if palette color type, else 0*/
+}
+
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info) /*1 if any palette entry is not fully opaque*/
+{
+  size_t i;
+  for(i = 0; i != info->palettesize; ++i)
+  {
+    if(info->palette[i * 4 + 3] < 255) return 1; /*byte 3 of each 4-byte entry is the alpha value*/
+  }
+  return 0; /*also the result for an empty palette*/
+}
+
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info) /*nonzero if a color key, alpha channel or palette alpha is present*/
+{
+  return info->key_defined
+      || lodepng_is_alpha_type(info)
+      || lodepng_has_palette_alpha(info);
+}
+
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+  /*bytes needed for w * h pixels, rounded up to whole bytes; a 32-bit size_t will not overflow if roughly w * h < 268435455*/
+  size_t bpp = lodepng_get_bpp(color);
+  size_t n = (size_t)w * (size_t)h; /*widen before multiplying: w * h in unsigned arithmetic can wrap around*/
+  return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+size_t lodepng_get_raw_size_lct(unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+  /*bytes needed for w * h pixels, rounded up to whole bytes; a 32-bit size_t will not overflow if roughly w * h < 268435455*/
+  size_t bpp = lodepng_get_bpp_lct(colortype, bitdepth);
+  size_t n = (size_t)w * (size_t)h; /*widen before multiplying: w * h in unsigned arithmetic can wrap around*/
+  return ((n / 8) * bpp) + ((n & 7) * bpp + 7) / 8;
+}
+
+
+#ifdef LODEPNG_COMPILE_PNG
+#ifdef LODEPNG_COMPILE_DECODER
+/*in an idat chunk, each scanline is a multiple of 8 bits, unlike the lodepng output buffer; returns h times the padded line size*/
+static size_t lodepng_get_raw_size_idat(unsigned w, unsigned h, const LodePNGColorMode* color)
+{
+  /*will not overflow for any color type if roughly w * h < 268435455*/
+  size_t bpp = lodepng_get_bpp(color);
+  size_t line = ((w / 8) * bpp) + ((w & 7) * bpp + 7) / 8; /*bytes of one scanline, rounded up*/
+  return h * line;
+}
+#endif /*LODEPNG_COMPILE_DECODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+static void LodePNGUnknownChunks_init(LodePNGInfo* info) /*marks all 3 unknown-chunk buffers as empty*/
+{
+  unsigned i;
+  for(i = 0; i != 3; ++i) info->unknown_chunks_data[i] = 0;
+  for(i = 0; i != 3; ++i) info->unknown_chunks_size[i] = 0;
+}
+
+static void LodePNGUnknownChunks_cleanup(LodePNGInfo* info) /*frees the 3 buffers; pointers and sizes are not reset here*/
+{
+  unsigned i;
+  for(i = 0; i != 3; ++i) lodepng_free(info->unknown_chunks_data[i]);
+}
+
+static unsigned LodePNGUnknownChunks_copy(LodePNGInfo* dest, const LodePNGInfo* src) /*returns 0, or 83 on alloc fail*/
+{
+  unsigned i;
+
+  LodePNGUnknownChunks_cleanup(dest); /*dest's old buffers are freed first; each pointer is overwritten in the loop*/
+
+  for(i = 0; i != 3; ++i)
+  {
+    size_t j;
+    dest->unknown_chunks_size[i] = src->unknown_chunks_size[i];
+    dest->unknown_chunks_data[i] = (unsigned char*)lodepng_malloc(src->unknown_chunks_size[i]);
+    if(!dest->unknown_chunks_data[i] && dest->unknown_chunks_size[i]) return 83; /*alloc fail*/
+    for(j = 0; j < src->unknown_chunks_size[i]; ++j)
+    {
+      dest->unknown_chunks_data[i][j] = src->unknown_chunks_data[i][j];
+    }
+  }
+
+  return 0;
+}
+
+/******************************************************************************/
+
+static void LodePNGText_init(LodePNGInfo* info) /*starts with zero text entries and no arrays allocated*/
+{
+  info->text_num = 0;
+  info->text_keys = NULL;
+  info->text_strings = NULL;
+}
+
+static void LodePNGText_cleanup(LodePNGInfo* info) /*frees every key and string, then both arrays*/
+{
+  size_t i;
+  for(i = 0; i != info->text_num; ++i)
+  {
+    string_cleanup(&info->text_keys[i]);
+    string_cleanup(&info->text_strings[i]);
+  }
+  lodepng_free(info->text_keys); /*note: text_num and the two array pointers are not reset here*/
+  lodepng_free(info->text_strings);
+}
+
+static unsigned LodePNGText_copy(LodePNGInfo* dest, const LodePNGInfo* source) /*returns 0 or the error of lodepng_add_text*/
+{
+  size_t i = 0;
+  dest->text_keys = 0; /*dest's previous arrays are overwritten here, not freed*/
+  dest->text_strings = 0;
+  dest->text_num = 0;
+  for(i = 0; i != source->text_num; ++i)
+  {
+    CERROR_TRY_RETURN(lodepng_add_text(dest, source->text_keys[i], source->text_strings[i]));
+  }
+  return 0;
+}
+
+void lodepng_clear_text(LodePNGInfo* info)
+{
+  LodePNGText_cleanup(info); /*frees all keys, strings and both arrays*/
+  LodePNGText_init(info); /*reset count and pointers so info can be added to, or cleaned up, again*/
+}
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str)
+{
+  /*appends one key/string pair to the text arrays of info; returns 0 on success or 83 on alloc fail*/
+  char** new_keys = (char**)(lodepng_realloc(info->text_keys, sizeof(char*) * (info->text_num + 1)));
+  char** new_strings = (char**)(lodepng_realloc(info->text_strings, sizeof(char*) * (info->text_num + 1)));
+
+  /*A successful realloc invalidates the old pointer, so each result is stored
+  right away. Freeing it on failure instead would leave info pointing at freed
+  memory; text_num is unchanged on failure, so info stays consistent.*/
+  if(new_keys) info->text_keys = new_keys;
+  if(new_strings) info->text_strings = new_strings;
+  if(!new_keys || !new_strings) return 83; /*alloc fail*/
+
+  ++info->text_num;
+
+  string_init(&info->text_keys[info->text_num - 1]);
+  string_set(&info->text_keys[info->text_num - 1], key);
+
+  string_init(&info->text_strings[info->text_num - 1]);
+  string_set(&info->text_strings[info->text_num - 1], str);
+
+  return 0;
+}
+
+/******************************************************************************/
+
+static void LodePNGIText_init(LodePNGInfo* info) /*starts with zero itext entries and no arrays allocated*/
+{
+  info->itext_num = 0;
+  info->itext_keys = NULL;
+  info->itext_langtags = NULL;
+  info->itext_transkeys = NULL;
+  info->itext_strings = NULL;
+}
+
+static void LodePNGIText_cleanup(LodePNGInfo* info) /*frees all four strings of every entry, then the four arrays*/
+{
+  size_t i;
+  for(i = 0; i != info->itext_num; ++i)
+  {
+    string_cleanup(&info->itext_keys[i]);
+    string_cleanup(&info->itext_langtags[i]);
+    string_cleanup(&info->itext_transkeys[i]);
+    string_cleanup(&info->itext_strings[i]);
+  }
+  lodepng_free(info->itext_keys); /*note: itext_num and the four array pointers are not reset here*/
+  lodepng_free(info->itext_langtags);
+  lodepng_free(info->itext_transkeys);
+  lodepng_free(info->itext_strings);
+}
+
+static unsigned LodePNGIText_copy(LodePNGInfo* dest, const LodePNGInfo* source) /*returns 0 or the error of lodepng_add_itext*/
+{
+  size_t i = 0;
+  dest->itext_keys = 0; /*dest's previous arrays are overwritten here, not freed*/
+  dest->itext_langtags = 0;
+  dest->itext_transkeys = 0;
+  dest->itext_strings = 0;
+  dest->itext_num = 0;
+  for(i = 0; i != source->itext_num; ++i)
+  {
+    CERROR_TRY_RETURN(lodepng_add_itext(dest, source->itext_keys[i], source->itext_langtags[i],
+                                        source->itext_transkeys[i], source->itext_strings[i]));
+  }
+  return 0;
+}
+
+void lodepng_clear_itext(LodePNGInfo* info)
+{
+  LodePNGIText_cleanup(info); /*frees all itext strings and the four arrays*/
+  LodePNGIText_init(info); /*reset count and pointers so info can be added to, or cleaned up, again*/
+}
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+                           const char* transkey, const char* str)
+{
+  /*appends one entry (key, language tag, translated key, string) to the
+  itext arrays of info; returns 0 on success or 83 on alloc fail*/
+  char** new_keys = (char**)(lodepng_realloc(info->itext_keys, sizeof(char*) * (info->itext_num + 1)));
+  char** new_langtags = (char**)(lodepng_realloc(info->itext_langtags, sizeof(char*) * (info->itext_num + 1)));
+  char** new_transkeys = (char**)(lodepng_realloc(info->itext_transkeys, sizeof(char*) * (info->itext_num + 1)));
+  char** new_strings = (char**)(lodepng_realloc(info->itext_strings, sizeof(char*) * (info->itext_num + 1)));
+
+  /*A successful realloc invalidates the old pointer, so each result is stored
+  right away. Freeing it on failure instead would leave info pointing at freed
+  memory; itext_num is unchanged on failure, so info stays consistent.*/
+  if(new_keys) info->itext_keys = new_keys;
+  if(new_langtags) info->itext_langtags = new_langtags;
+  if(new_transkeys) info->itext_transkeys = new_transkeys;
+  if(new_strings) info->itext_strings = new_strings;
+  if(!new_keys || !new_langtags || !new_transkeys || !new_strings) return 83; /*alloc fail*/
+
+  ++info->itext_num;
+
+  string_init(&info->itext_keys[info->itext_num - 1]);
+  string_set(&info->itext_keys[info->itext_num - 1], key);
+
+  string_init(&info->itext_langtags[info->itext_num - 1]);
+  string_set(&info->itext_langtags[info->itext_num - 1], langtag);
+
+  string_init(&info->itext_transkeys[info->itext_num - 1]);
+  string_set(&info->itext_transkeys[info->itext_num - 1], transkey);
+
+  string_init(&info->itext_strings[info->itext_num - 1]);
+  string_set(&info->itext_strings[info->itext_num - 1], str);
+
+  return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+void lodepng_info_init(LodePNGInfo* info) /*sets every field to its default; nothing is allocated*/
+{
+  lodepng_color_mode_init(&info->color);
+  info->interlace_method = 0;
+  info->compression_method = 0;
+  info->filter_method = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  info->background_defined = 0;
+  info->background_r = info->background_g = info->background_b = 0;
+
+  LodePNGText_init(info);
+  LodePNGIText_init(info);
+
+  info->time_defined = 0; /*the time and phys values themselves are left untouched*/
+  info->phys_defined = 0;
+
+  LodePNGUnknownChunks_init(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+void lodepng_info_cleanup(LodePNGInfo* info) /*frees the palette, the texts and the unknown chunk buffers*/
+{
+  lodepng_color_mode_cleanup(&info->color);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  LodePNGText_cleanup(info);
+  LodePNGIText_cleanup(info);
+
+  LodePNGUnknownChunks_cleanup(info);
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source) /*deep copy; returns 0 or the first error code*/
+{
+  lodepng_info_cleanup(dest);
+  *dest = *source; /*shallow copy first; every owned pointer in dest is replaced by an own copy below*/
+  lodepng_color_mode_init(&dest->color);
+  CERROR_TRY_RETURN(lodepng_color_mode_copy(&dest->color, &source->color));
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  CERROR_TRY_RETURN(LodePNGText_copy(dest, source));
+  CERROR_TRY_RETURN(LodePNGIText_copy(dest, source));
+
+  LodePNGUnknownChunks_init(dest); /*drop the pointers shared with source before the copy frees dest's buffers*/
+  CERROR_TRY_RETURN(LodePNGUnknownChunks_copy(dest, source));
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+  return 0;
+}
+
+void lodepng_info_swap(LodePNGInfo* a, LodePNGInfo* b) /*exchanges the two structs by value; nothing is allocated or freed*/
+{
+  LodePNGInfo temp = *a;
+  *a = *b;
+  *b = temp;
+}
+
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*index: bitgroup index, bits: bitgroup size(1, 2 or 4), in: bitgroup value, out: octet array to add bits to*/
+static void addColorBits(unsigned char* out, size_t index, unsigned bits, unsigned in)
+{
+  unsigned m = bits == 1 ? 7 : bits == 2 ? 3 : 1; /*8 / bits - 1*/
+  /*p = the partial index in the byte, e.g. with 4 palettebits it is 0 for first half or 1 for second half*/
+  unsigned p = index & m;
+  in &= (1u << bits) - 1u; /*filter out any other bits of the input value*/
+  in = in << (bits * (m - p)); /*the first group of a byte lands in its most significant bits*/
+  if(p == 0) out[index * bits / 8] = in; /*first group overwrites the byte, so out needs no clearing beforehand*/
+  else out[index * bits / 8] |= in;
+}
+
+typedef struct ColorTree ColorTree;
+
+/*
+One node of a color tree
+This is the data structure used to count the number of unique colors and to get a palette
+index for a color. It's like an octree, but because the alpha channel is used too, each
+node has 16 instead of 8 children.
+*/
+struct ColorTree
+{
+  ColorTree* children[16]; /*up to 16 pointers to ColorTree of next level*/
+  int index; /*the payload. Only has a meaningful value if this is in the last level; it is -1 until a color is stored*/
+};
+
+static void color_tree_init(ColorTree* tree) /*makes tree an empty node: no children and no index*/
+{
+  int i;
+  for(i = 0; i != 16; ++i) tree->children[i] = 0;
+  tree->index = -1;
+}
+
+static void color_tree_cleanup(ColorTree* tree) /*frees all descendants recursively, but not the node tree itself*/
+{
+  int i;
+  for(i = 0; i != 16; ++i)
+  {
+    if(tree->children[i])
+    {
+      color_tree_cleanup(tree->children[i]);
+      lodepng_free(tree->children[i]); /*the child pointer is not reset afterwards*/
+    }
+  }
+}
+
+/*returns -1 if color not present, its index otherwise*/
+static int color_tree_get(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+  int bit = 0;
+  for(bit = 0; bit < 8; ++bit) /*bit 0 first: one bit of each channel selects 1 of 16 children per level*/
+  {
+    int i = 8 * ((r >> bit) & 1) + 4 * ((g >> bit) & 1) + 2 * ((b >> bit) & 1) + 1 * ((a >> bit) & 1);
+    if(!tree->children[i]) return -1;
+    else tree = tree->children[i];
+  }
+  return tree ? tree->index : -1;
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+static int color_tree_has(ColorTree* tree, unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+  return color_tree_get(tree, r, g, b, a) >= 0; /*1 if the color has an index in the tree, else 0*/
+}
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*color is not allowed to already exist. Index should be >= 0 (signed so that -1 can mean "doesn't exist")*/
+static void color_tree_add(ColorTree* tree,
+                           unsigned char r, unsigned char g, unsigned char b, unsigned char a, unsigned index)
+{
+  int bit;
+  for(bit = 0; bit < 8; ++bit)
+  {
+    int i = 8 * ((r >> bit) & 1) + 4 * ((g >> bit) & 1) + 2 * ((b >> bit) & 1) + 1 * ((a >> bit) & 1);
+    if(!tree->children[i])
+    {
+      tree->children[i] = (ColorTree*)lodepng_malloc(sizeof(ColorTree));
+      if(!tree->children[i]) return; /*alloc fail: give up; the color stays absent, so color_tree_get returns -1*/
+      color_tree_init(tree->children[i]);
+    }
+    tree = tree->children[i];
+  }
+  tree->index = (int)index;
+}
+
+/*put a pixel, given its RGBA color, into image of any color type; returns 0, or 82 if mode is palette and the color is not in tree*/
+static unsigned rgba8ToPixel(unsigned char* out, size_t i,
+                             const LodePNGColorMode* mode, ColorTree* tree /*for palette*/,
+                             unsigned char r, unsigned char g, unsigned char b, unsigned char a)
+{
+  if(mode->colortype == LCT_GREY)
+  {
+    unsigned char grey = r; /*((unsigned short)r + g + b) / 3*/;
+    if(mode->bitdepth == 8) out[i] = grey;
+    else if(mode->bitdepth == 16) out[i * 2 + 0] = out[i * 2 + 1] = grey; /*same byte in both halves*/
+    else
+    {
+      /*take the most significant bits of grey*/
+      grey = (grey >> (8 - mode->bitdepth)) & ((1 << mode->bitdepth) - 1);
+      addColorBits(out, i, mode->bitdepth, grey);
+    }
+  }
+  else if(mode->colortype == LCT_RGB)
+  {
+    if(mode->bitdepth == 8)
+    {
+      out[i * 3 + 0] = r;
+      out[i * 3 + 1] = g;
+      out[i * 3 + 2] = b;
+    }
+    else
+    {
+      out[i * 6 + 0] = out[i * 6 + 1] = r;
+      out[i * 6 + 2] = out[i * 6 + 3] = g;
+      out[i * 6 + 4] = out[i * 6 + 5] = b;
+    }
+  }
+  else if(mode->colortype == LCT_PALETTE)
+  {
+    int index = color_tree_get(tree, r, g, b, a);
+    if(index < 0) return 82; /*color not in palette*/
+    if(mode->bitdepth == 8) out[i] = index;
+    else addColorBits(out, i, mode->bitdepth, (unsigned)index);
+  }
+  else if(mode->colortype == LCT_GREY_ALPHA)
+  {
+    unsigned char grey = r; /*((unsigned short)r + g + b) / 3*/;
+    if(mode->bitdepth == 8)
+    {
+      out[i * 2 + 0] = grey;
+      out[i * 2 + 1] = a;
+    }
+    else if(mode->bitdepth == 16)
+    {
+      out[i * 4 + 0] = out[i * 4 + 1] = grey;
+      out[i * 4 + 2] = out[i * 4 + 3] = a;
+    }
+  }
+  else if(mode->colortype == LCT_RGBA)
+  {
+    if(mode->bitdepth == 8)
+    {
+      out[i * 4 + 0] = r;
+      out[i * 4 + 1] = g;
+      out[i * 4 + 2] = b;
+      out[i * 4 + 3] = a;
+    }
+    else
+    {
+      out[i * 8 + 0] = out[i * 8 + 1] = r;
+      out[i * 8 + 2] = out[i * 8 + 3] = g;
+      out[i * 8 + 4] = out[i * 8 + 5] = b;
+      out[i * 8 + 6] = out[i * 8 + 7] = a;
+    }
+  }
+
+  return 0; /*no error*/
+}
+
+/*put a pixel, given its RGBA16 color, into image of any color 16-bitdepth type (high byte first; there is no palette branch)*/
+static void rgba16ToPixel(unsigned char* out, size_t i,
+                         const LodePNGColorMode* mode,
+                         unsigned short r, unsigned short g, unsigned short b, unsigned short a)
+{
+  if(mode->colortype == LCT_GREY)
+  {
+    unsigned short grey = r; /*((unsigned)r + g + b) / 3*/;
+    out[i * 2 + 0] = (grey >> 8) & 255;
+    out[i * 2 + 1] = grey & 255;
+  }
+  else if(mode->colortype == LCT_RGB)
+  {
+    out[i * 6 + 0] = (r >> 8) & 255;
+    out[i * 6 + 1] = r & 255;
+    out[i * 6 + 2] = (g >> 8) & 255;
+    out[i * 6 + 3] = g & 255;
+    out[i * 6 + 4] = (b >> 8) & 255;
+    out[i * 6 + 5] = b & 255;
+  }
+  else if(mode->colortype == LCT_GREY_ALPHA)
+  {
+    unsigned short grey = r; /*((unsigned)r + g + b) / 3*/;
+    out[i * 4 + 0] = (grey >> 8) & 255;
+    out[i * 4 + 1] = grey & 255;
+    out[i * 4 + 2] = (a >> 8) & 255;
+    out[i * 4 + 3] = a & 255;
+  }
+  else if(mode->colortype == LCT_RGBA)
+  {
+    out[i * 8 + 0] = (r >> 8) & 255;
+    out[i * 8 + 1] = r & 255;
+    out[i * 8 + 2] = (g >> 8) & 255;
+    out[i * 8 + 3] = g & 255;
+    out[i * 8 + 4] = (b >> 8) & 255;
+    out[i * 8 + 5] = b & 255;
+    out[i * 8 + 6] = (a >> 8) & 255;
+    out[i * 8 + 7] = a & 255;
+  }
+}
+
+/*Get RGBA8 color of pixel with index i (y * width + x) from the raw image with given color type. Of a 16-bit sample only the first byte is returned.*/
+static void getPixelColorRGBA8(unsigned char* r, unsigned char* g,
+                               unsigned char* b, unsigned char* a,
+                               const unsigned char* in, size_t i,
+                               const LodePNGColorMode* mode)
+{
+  if(mode->colortype == LCT_GREY)
+  {
+    if(mode->bitdepth == 8)
+    {
+      *r = *g = *b = in[i];
+      if(mode->key_defined && *r == mode->key_r) *a = 0;
+      else *a = 255;
+    }
+    else if(mode->bitdepth == 16)
+    {
+      *r = *g = *b = in[i * 2 + 0];
+      if(mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r) *a = 0;
+      else *a = 255;
+    }
+    else
+    {
+      unsigned highest = ((1U << mode->bitdepth) - 1U); /*highest possible value for this bit depth*/
+      size_t j = i * mode->bitdepth;
+      unsigned value = readBitsFromReversedStream(&j, in, mode->bitdepth);
+      *r = *g = *b = (value * 255) / highest; /*scale to the 0-255 range*/
+      if(mode->key_defined && value == mode->key_r) *a = 0;
+      else *a = 255;
+    }
+  }
+  else if(mode->colortype == LCT_RGB)
+  {
+    if(mode->bitdepth == 8)
+    {
+      *r = in[i * 3 + 0]; *g = in[i * 3 + 1]; *b = in[i * 3 + 2];
+      if(mode->key_defined && *r == mode->key_r && *g == mode->key_g && *b == mode->key_b) *a = 0;
+      else *a = 255;
+    }
+    else
+    {
+      *r = in[i * 6 + 0];
+      *g = in[i * 6 + 2];
+      *b = in[i * 6 + 4];
+      if(mode->key_defined && 256U * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+         && 256U * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+         && 256U * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b) *a = 0;
+      else *a = 255;
+    }
+  }
+  else if(mode->colortype == LCT_PALETTE)
+  {
+    unsigned index;
+    if(mode->bitdepth == 8) index = in[i];
+    else
+    {
+      size_t j = i * mode->bitdepth;
+      index = readBitsFromReversedStream(&j, in, mode->bitdepth);
+    }
+
+    if(index >= mode->palettesize)
+    {
+      /*This is an error according to the PNG spec, but common PNG decoders make it black instead.
+      Done here too, slightly faster due to no error handling needed.*/
+      *r = *g = *b = 0;
+      *a = 255;
+    }
+    else
+    {
+      *r = mode->palette[index * 4 + 0];
+      *g = mode->palette[index * 4 + 1];
+      *b = mode->palette[index * 4 + 2];
+      *a = mode->palette[index * 4 + 3];
+    }
+  }
+  else if(mode->colortype == LCT_GREY_ALPHA)
+  {
+    if(mode->bitdepth == 8)
+    {
+      *r = *g = *b = in[i * 2 + 0];
+      *a = in[i * 2 + 1];
+    }
+    else
+    {
+      *r = *g = *b = in[i * 4 + 0];
+      *a = in[i * 4 + 2];
+    }
+  }
+  else if(mode->colortype == LCT_RGBA)
+  {
+    if(mode->bitdepth == 8)
+    {
+      *r = in[i * 4 + 0];
+      *g = in[i * 4 + 1];
+      *b = in[i * 4 + 2];
+      *a = in[i * 4 + 3];
+    }
+    else
+    {
+      *r = in[i * 8 + 0];
+      *g = in[i * 8 + 2];
+      *b = in[i * 8 + 4];
+      *a = in[i * 8 + 6];
+    }
+  }
+}
+
+/*Similar to getPixelColorRGBA8, but with all the for loops inside of the color
+mode test cases, optimized to convert the colors much faster, when converting
+to RGBA or RGB with 8 bit per channel. buffer must be RGBA or RGB output with
+enough memory, if has_alpha is true the output is RGBA. mode has the color mode
+of the input buffer.*/
+static void getPixelColorsRGBA8(unsigned char* buffer, size_t numpixels,
+                                unsigned has_alpha, const unsigned char* in,
+                                const LodePNGColorMode* mode)
+{
+  unsigned num_channels = has_alpha ? 4 : 3; /*bytes written per output pixel*/
+  size_t i;
+  if(mode->colortype == LCT_GREY)
+  {
+    if(mode->bitdepth == 8)
+    {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        buffer[0] = buffer[1] = buffer[2] = in[i];
+        if(has_alpha) buffer[3] = mode->key_defined && in[i] == mode->key_r ? 0 : 255;
+      }
+    }
+    else if(mode->bitdepth == 16)
+    {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        buffer[0] = buffer[1] = buffer[2] = in[i * 2];
+        if(has_alpha) buffer[3] = mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r ? 0 : 255;
+      }
+    }
+    else
+    {
+      unsigned highest = ((1U << mode->bitdepth) - 1U); /*highest possible value for this bit depth*/
+      size_t j = 0; /*bit position in the input, advanced by each read below*/
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        unsigned value = readBitsFromReversedStream(&j, in, mode->bitdepth);
+        buffer[0] = buffer[1] = buffer[2] = (value * 255) / highest;
+        if(has_alpha) buffer[3] = mode->key_defined && value == mode->key_r ? 0 : 255;
+      }
+    }
+  }
+  else if(mode->colortype == LCT_RGB)
+  {
+    if(mode->bitdepth == 8)
+    {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        buffer[0] = in[i * 3 + 0];
+        buffer[1] = in[i * 3 + 1];
+        buffer[2] = in[i * 3 + 2];
+        if(has_alpha) buffer[3] = mode->key_defined && buffer[0] == mode->key_r
+           && buffer[1]== mode->key_g && buffer[2] == mode->key_b ? 0 : 255;
+      }
+    }
+    else
+    {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        buffer[0] = in[i * 6 + 0];
+        buffer[1] = in[i * 6 + 2];
+        buffer[2] = in[i * 6 + 4];
+        if(has_alpha) buffer[3] = mode->key_defined
+           && 256U * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+           && 256U * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+           && 256U * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b ? 0 : 255;
+      }
+    }
+  }
+  else if(mode->colortype == LCT_PALETTE)
+  {
+    unsigned index;
+    size_t j = 0;
+    for(i = 0; i != numpixels; ++i, buffer += num_channels)
+    {
+      if(mode->bitdepth == 8) index = in[i];
+      else index = readBitsFromReversedStream(&j, in, mode->bitdepth);
+
+      if(index >= mode->palettesize)
+      {
+        /*This is an error according to the PNG spec, but most PNG decoders make it black instead.
+        Done here too, slightly faster due to no error handling needed.*/
+        buffer[0] = buffer[1] = buffer[2] = 0;
+        if(has_alpha) buffer[3] = 255;
+      }
+      else
+      {
+        buffer[0] = mode->palette[index * 4 + 0];
+        buffer[1] = mode->palette[index * 4 + 1];
+        buffer[2] = mode->palette[index * 4 + 2];
+        if(has_alpha) buffer[3] = mode->palette[index * 4 + 3];
+      }
+    }
+  }
+  else if(mode->colortype == LCT_GREY_ALPHA)
+  {
+    if(mode->bitdepth == 8)
+    {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        buffer[0] = buffer[1] = buffer[2] = in[i * 2 + 0];
+        if(has_alpha) buffer[3] = in[i * 2 + 1];
+      }
+    }
+    else
+    {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        buffer[0] = buffer[1] = buffer[2] = in[i * 4 + 0];
+        if(has_alpha) buffer[3] = in[i * 4 + 2];
+      }
+    }
+  }
+  else if(mode->colortype == LCT_RGBA)
+  {
+    if(mode->bitdepth == 8)
+    {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        buffer[0] = in[i * 4 + 0];
+        buffer[1] = in[i * 4 + 1];
+        buffer[2] = in[i * 4 + 2];
+        if(has_alpha) buffer[3] = in[i * 4 + 3];
+      }
+    }
+    else
+    {
+      for(i = 0; i != numpixels; ++i, buffer += num_channels)
+      {
+        buffer[0] = in[i * 8 + 0];
+        buffer[1] = in[i * 8 + 2];
+        buffer[2] = in[i * 8 + 4];
+        if(has_alpha) buffer[3] = in[i * 8 + 6];
+      }
+    }
+  }
+}
+
+/*Get RGBA16 color of pixel with index i (y * width + x) from the raw image with given color type, but the
+given color type must be 16-bit itself. There is no palette branch: for LCT_PALETTE nothing is written.*/
+static void getPixelColorRGBA16(unsigned short* r, unsigned short* g, unsigned short* b, unsigned short* a,
+                                const unsigned char* in, size_t i, const LodePNGColorMode* mode)
+{
+  if(mode->colortype == LCT_GREY)
+  {
+    *r = *g = *b = 256 * in[i * 2 + 0] + in[i * 2 + 1]; /*the first byte is the high byte*/
+    if(mode->key_defined && 256U * in[i * 2 + 0] + in[i * 2 + 1] == mode->key_r) *a = 0;
+    else *a = 65535;
+  }
+  else if(mode->colortype == LCT_RGB)
+  {
+    *r = 256u * in[i * 6 + 0] + in[i * 6 + 1];
+    *g = 256u * in[i * 6 + 2] + in[i * 6 + 3];
+    *b = 256u * in[i * 6 + 4] + in[i * 6 + 5];
+    if(mode->key_defined
+       && 256u * in[i * 6 + 0] + in[i * 6 + 1] == mode->key_r
+       && 256u * in[i * 6 + 2] + in[i * 6 + 3] == mode->key_g
+       && 256u * in[i * 6 + 4] + in[i * 6 + 5] == mode->key_b) *a = 0;
+    else *a = 65535;
+  }
+  else if(mode->colortype == LCT_GREY_ALPHA)
+  {
+    *r = *g = *b = 256u * in[i * 4 + 0] + in[i * 4 + 1];
+    *a = 256u * in[i * 4 + 2] + in[i * 4 + 3];
+  }
+  else if(mode->colortype == LCT_RGBA)
+  {
+    *r = 256u * in[i * 8 + 0] + in[i * 8 + 1];
+    *g = 256u * in[i * 8 + 2] + in[i * 8 + 3];
+    *b = 256u * in[i * 8 + 4] + in[i * 8 + 5];
+    *a = 256u * in[i * 8 + 6] + in[i * 8 + 7];
+  }
+}
+
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+                         const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+                         unsigned w, unsigned h)
+{
+  size_t i;
+  ColorTree tree; /*only initialized and used when mode_out is a palette type*/
+  size_t numpixels = (size_t)w * (size_t)h; /*widen before multiplying: w * h in unsigned arithmetic can wrap*/
+  unsigned error = 0; /*returned at the end: 0, or 82 if a color is not in the output palette*/
+
+  if(lodepng_color_mode_equal(mode_out, mode_in))
+  {
+    size_t numbytes = lodepng_get_raw_size(w, h, mode_in);
+    for(i = 0; i != numbytes; ++i) out[i] = in[i];
+    return 0;
+  }
+
+  if(mode_out->colortype == LCT_PALETTE)
+  {
+    size_t palettesize = mode_out->palettesize;
+    const unsigned char* palette = mode_out->palette;
+    size_t palsize = (size_t)1 << mode_out->bitdepth; /*most colors this bit depth can index*/
+    /*if the user specified output palette but did not give the values, assume
+    they want the values of the input color type (assuming that one is palette).
+    Note that we never create a new palette ourselves.*/
+    if(palettesize == 0)
+    {
+      palettesize = mode_in->palettesize;
+      palette = mode_in->palette;
+    }
+    if(palettesize < palsize) palsize = palettesize;
+    color_tree_init(&tree);
+    for(i = 0; i != palsize; ++i)
+    {
+      const unsigned char* p = &palette[i * 4];
+      color_tree_add(&tree, p[0], p[1], p[2], p[3], (unsigned int)(i));
+    }
+  }
+
+  if(mode_in->bitdepth == 16 && mode_out->bitdepth == 16)
+  {
+    for(i = 0; i != numpixels; ++i)
+    {
+      unsigned short r = 0, g = 0, b = 0, a = 0;
+      getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode_in);
+      rgba16ToPixel(out, i, mode_out, r, g, b, a);
+    }
+  }
+  else if(mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGBA)
+  {
+    getPixelColorsRGBA8(out, numpixels, 1, in, mode_in);
+  }
+  else if(mode_out->bitdepth == 8 && mode_out->colortype == LCT_RGB)
+  {
+    getPixelColorsRGBA8(out, numpixels, 0, in, mode_in);
+  }
+  else
+  {
+    unsigned char r = 0, g = 0, b = 0, a = 0;
+    for(i = 0; i != numpixels; ++i)
+    {
+      getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode_in);
+      error = rgba8ToPixel(out, i, mode_out, &tree, r, g, b, a);
+      if(error) break; /*do not return here: the tree built above still has to be released*/
+    }
+  }
+
+  /*reached on success and on error alike, so the tree is always released*/
+  if(mode_out->colortype == LCT_PALETTE) color_tree_cleanup(&tree);
+
+  return error; /*0 = no error*/
+}
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+void lodepng_color_profile_init(LodePNGColorProfile* profile) /*resets every field read or written in this file's profiling code*/
+{
+  profile->colored = 0;
+  profile->key = 0;
+  profile->alpha = 0;
+  profile->key_r = profile->key_g = profile->key_b = 0;
+  profile->numcolors = 0;
+  profile->bits = 1; /*starts at 1 rather than 0, unlike the other fields*/
+}
+
+/*function used for debug purposes with C++*/
+/*void printColorProfile(LodePNGColorProfile* p)
+{
+  std::cout << "colored: " << (int)p->colored << ", ";
+  std::cout << "key: " << (int)p->key << ", ";
+  std::cout << "key_r: " << (int)p->key_r << ", ";
+  std::cout << "key_g: " << (int)p->key_g << ", ";
+  std::cout << "key_b: " << (int)p->key_b << ", ";
+  std::cout << "alpha: " << (int)p->alpha << ", ";
+  std::cout << "numcolors: " << (int)p->numcolors << ", ";
+  std::cout << "bits: " << (int)p->bits << std::endl;
+}*/
+
+/*Returns how many bits needed to represent given value (max 8 bit); the result is 1, 2, 4 or 8*/
+static unsigned getValueRequiredBits(unsigned char value)
+{
+  if(value == 0 || value == 255) return 1; /*the only two values a single bit scales to*/
+  /*The scaling of 2-bit and 4-bit values uses multiples of 85 and 17*/
+  if(value % 17 == 0) return value % 85 == 0 ? 2 : 4;
+  return 8;
+}
+
+/*Scans the w x h pixels of in (stored as described by mode) and records in profile whether color, alpha or a color key
+is needed, the bit depth and up to 256 palette colors. profile must already be inited; it may be pre-filled. Returns 0.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+                                   const unsigned char* in, unsigned w, unsigned h,
+                                   const LodePNGColorMode* mode)
+{
+  unsigned error = 0;
+  size_t i;
+  ColorTree tree;
+  size_t numpixels = (size_t)w * (size_t)h; /*widen before multiplying so that w * h cannot wrap in unsigned arithmetic*/
+
+  unsigned colored_done = lodepng_is_greyscale_type(mode) ? 1 : 0;
+  unsigned alpha_done = lodepng_can_have_alpha(mode) ? 0 : 1;
+  unsigned numcolors_done = 0;
+  unsigned bpp = lodepng_get_bpp(mode);
+  unsigned bits_done = bpp == 1 ? 1 : 0;
+  unsigned maxnumcolors = 257;
+  unsigned sixteen = 0;
+  if(bpp <= 8) maxnumcolors = bpp == 1 ? 2 : (bpp == 2 ? 4 : (bpp == 4 ? 16 : 256));
+
+  color_tree_init(&tree);
+
+  /*Check if the 16-bit input is truly 16-bit*/
+  if(mode->bitdepth == 16)
+  {
+    unsigned short r, g, b, a;
+    for(i = 0; i != numpixels; ++i)
+    {
+      getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+      if((r & 255) != ((r >> 8) & 255) || (g & 255) != ((g >> 8) & 255) ||
+         (b & 255) != ((b >> 8) & 255) || (a & 255) != ((a >> 8) & 255)) /*first and second byte differ*/
+      {
+        sixteen = 1;
+        break;
+      }
+    }
+  }
+
+  if(sixteen)
+  {
+    unsigned short r = 0, g = 0, b = 0, a = 0;
+    profile->bits = 16;
+    bits_done = numcolors_done = 1; /*counting colors no longer useful, palette doesn't support 16-bit*/
+
+    for(i = 0; i != numpixels; ++i)
+    {
+      getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+
+      if(!colored_done && (r != g || r != b))
+      {
+        profile->colored = 1;
+        colored_done = 1;
+      }
+
+      if(!alpha_done)
+      {
+        unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+        if(a != 65535 && (a != 0 || (profile->key && !matchkey)))
+        {
+          profile->alpha = 1;
+          alpha_done = 1;
+          if(profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+        }
+        else if(a == 0 && !profile->alpha && !profile->key)
+        {
+          profile->key = 1;
+          profile->key_r = r;
+          profile->key_g = g;
+          profile->key_b = b;
+        }
+        else if(a == 65535 && profile->key && matchkey)
+        {
+          /* Color key cannot be used if an opaque pixel also has that RGB color. */
+          profile->alpha = 1;
+          alpha_done = 1;
+        }
+      }
+      if(alpha_done && numcolors_done && colored_done && bits_done) break;
+    }
+
+    /*second pass: pixels seen before the key was chosen have not been compared against it yet*/
+    if(profile->key && !profile->alpha)
+    {
+      for(i = 0; i != numpixels; ++i)
+      {
+        getPixelColorRGBA16(&r, &g, &b, &a, in, i, mode);
+        if(a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+        {
+          /* Color key cannot be used if an opaque pixel also has that RGB color. */
+          profile->alpha = 1;
+          alpha_done = 1;
+        }
+      }
+    }
+  }
+  else /* < 16-bit */
+  {
+    unsigned char r = 0, g = 0, b = 0, a = 0;
+    for(i = 0; i != numpixels; ++i)
+    {
+      getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+
+      if(!bits_done && profile->bits < 8)
+      {
+        /*only r is checked, < 8 bits is only relevant for greyscale*/
+        unsigned bits = getValueRequiredBits(r);
+        if(bits > profile->bits) profile->bits = bits;
+      }
+      bits_done = (profile->bits >= bpp);
+
+      if(!colored_done && (r != g || r != b))
+      {
+        profile->colored = 1;
+        colored_done = 1;
+        if(profile->bits < 8) profile->bits = 8; /*PNG has no colored modes with less than 8-bit per channel*/
+      }
+
+      if(!alpha_done)
+      {
+        unsigned matchkey = (r == profile->key_r && g == profile->key_g && b == profile->key_b);
+        if(a != 255 && (a != 0 || (profile->key && !matchkey)))
+        {
+          profile->alpha = 1;
+          alpha_done = 1;
+          if(profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+        }
+        else if(a == 0 && !profile->alpha && !profile->key)
+        {
+          profile->key = 1;
+          profile->key_r = r;
+          profile->key_g = g;
+          profile->key_b = b;
+        }
+        else if(a == 255 && profile->key && matchkey)
+        {
+          /* Color key cannot be used if an opaque pixel also has that RGB color. */
+          profile->alpha = 1;
+          alpha_done = 1;
+          if(profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+        }
+      }
+
+      if(!numcolors_done)
+      {
+        if(!color_tree_has(&tree, r, g, b, a))
+        {
+          color_tree_add(&tree, r, g, b, a, profile->numcolors);
+          if(profile->numcolors < 256)
+          {
+            unsigned char* p = profile->palette;
+            unsigned n = profile->numcolors;
+            p[n * 4 + 0] = r;
+            p[n * 4 + 1] = g;
+            p[n * 4 + 2] = b;
+            p[n * 4 + 3] = a;
+          }
+          ++profile->numcolors;
+          numcolors_done = profile->numcolors >= maxnumcolors;
+        }
+      }
+
+      if(alpha_done && numcolors_done && colored_done && bits_done) break;
+    }
+
+    /*second pass: pixels seen before the key was chosen have not been compared against it yet*/
+    if(profile->key && !profile->alpha)
+    {
+      for(i = 0; i != numpixels; ++i)
+      {
+        getPixelColorRGBA8(&r, &g, &b, &a, in, i, mode);
+        if(a != 0 && r == profile->key_r && g == profile->key_g && b == profile->key_b)
+        {
+          /* Color key cannot be used if an opaque pixel also has that RGB color. */
+          profile->alpha = 1;
+          if(profile->bits < 8) profile->bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+        }
+      }
+    }
+
+    /*make the profile's key always 16-bit for consistency - repeat each byte twice*/
+    profile->key_r += (profile->key_r << 8);
+    profile->key_g += (profile->key_g << 8);
+    profile->key_b += (profile->key_b << 8);
+  }
+
+  color_tree_cleanup(&tree);
+  return error;
+}
+
+/*Automatically chooses color type that gives smallest amount of bits in the
+output image, e.g. grey if there are only greyscale pixels, palette if there
+are at most 256 colors, ...
+Updates values of mode with a potentially smaller color model. mode_out should
+contain the user chosen color model, but will be overwritten with the new chosen one. Return value is the error.*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+                                   const unsigned char* image, unsigned w, unsigned h,
+                                   const LodePNGColorMode* mode_in)
+{
+  LodePNGColorProfile prof;
+  unsigned error = 0;
+  unsigned i, n, palettebits, grey_ok, palette_ok;
+  size_t numpixels = (size_t)w * (size_t)h; /*widen before multiplying so that w * h cannot wrap in unsigned arithmetic*/
+
+  lodepng_color_profile_init(&prof);
+  error = lodepng_get_color_profile(&prof, image, w, h, mode_in);
+  if(error) return error;
+  mode_out->key_defined = 0;
+
+  if(prof.key && numpixels <= 16)
+  {
+    prof.alpha = 1; /*too few pixels to justify tRNS chunk overhead*/
+    if(prof.bits < 8) prof.bits = 8; /*PNG has no alphachannel modes with less than 8-bit per channel*/
+  }
+  grey_ok = !prof.colored && !prof.alpha; /*grey without alpha, with potentially low bits*/
+  n = prof.numcolors;
+  palettebits = n <= 2 ? 1 : (n <= 4 ? 2 : (n <= 16 ? 4 : 8));
+  /*don't add palette overhead if image has only a few pixels: it needs more than 2 pixels per palette color*/
+  palette_ok = n <= 256 && prof.bits <= 8 && (size_t)n * 2 < numpixels;
+  if(grey_ok && prof.bits <= palettebits) palette_ok = 0; /*grey is less overhead*/
+
+  if(palette_ok)
+  {
+    unsigned char* p = prof.palette;
+    lodepng_palette_clear(mode_out); /*remove potential earlier palette*/
+    for(i = 0; i != prof.numcolors; ++i)
+    {
+      error = lodepng_palette_add(mode_out, p[i * 4 + 0], p[i * 4 + 1], p[i * 4 + 2], p[i * 4 + 3]);
+      if(error) break;
+    }
+
+    mode_out->colortype = LCT_PALETTE;
+    mode_out->bitdepth = palettebits;
+
+    if(mode_in->colortype == LCT_PALETTE && mode_in->palettesize >= mode_out->palettesize
+        && mode_in->bitdepth == mode_out->bitdepth)
+    {
+      /*If input should have same palette colors, keep original to preserve its order and prevent conversion*/
+      lodepng_color_mode_cleanup(mode_out);
+      lodepng_color_mode_copy(mode_out, mode_in);
+    }
+  }
+  else /*8-bit or 16-bit per channel*/
+  {
+    mode_out->bitdepth = prof.bits;
+    mode_out->colortype = prof.alpha ? (prof.colored ? LCT_RGBA : LCT_GREY_ALPHA)
+                                     : (prof.colored ? LCT_RGB : LCT_GREY);
+
+    if(prof.key && !prof.alpha)
+    {
+      unsigned mask = (1u << mode_out->bitdepth) - 1u; /*profile always uses 16-bit, mask converts it*/
+      mode_out->key_r = prof.key_r & mask;
+      mode_out->key_g = prof.key_g & mask;
+      mode_out->key_b = prof.key_b & mask;
+      mode_out->key_defined = 1;
+    }
+  }
+
+  return error;
+}
+
+#endif /* #ifdef LODEPNG_COMPILE_ENCODER */
+
+/*
+Paeth predictor, used by PNG filter type 4: returns whichever of a, b and c lies closest to a + b - c.
+On a tie a is preferred over b, and b over c.
+The parameters are of type short, but should come from unsigned chars; the shorts keep the differences correct.
+*/
+static unsigned char paethPredictor(short a, short b, short c)
+{
+  short dist_a = abs(b - c); /*distance from a to a + b - c*/
+  short dist_b = abs(a - c); /*distance from b to a + b - c*/
+  short dist_c = abs(a + b - c - c); /*distance from c to a + b - c*/
+  short nearest = a;
+  if(dist_c < dist_a && dist_c < dist_b) nearest = c;
+  else if(dist_b < dist_a) nearest = b;
+  return (unsigned char)nearest;
+}
+
+/*shared values used by multiple Adam7 related functions*/
+
+static const unsigned ADAM7_IX[7] = { 0, 4, 0, 2, 0, 1, 0 }; /*x start values*/
+static const unsigned ADAM7_IY[7] = { 0, 0, 4, 0, 2, 0, 1 }; /*y start values*/
+static const unsigned ADAM7_DX[7] = { 8, 8, 4, 4, 2, 2, 1 }; /*x delta values*/
+static const unsigned ADAM7_DY[7] = { 8, 8, 8, 4, 4, 2, 2 }; /*y delta values*/
+
+/*
+Outputs various dimensions and positions in the image related to the Adam7 reduced images.
+passw: output containing the width of the 7 passes
+passh: output containing the height of the 7 passes
+filter_passstart: output containing the index of the start and end of each
+ reduced image with filter bytes
+padded_passstart output containing the index of the start and end of each
+ reduced image when without filter bytes but with padded scanlines
+passstart: output containing the index of the start and end of each reduced
+ image without padding between scanlines, but still padding between the images
+w, h: width and height of non-interlaced image
+bpp: bits per pixel
+"padded" is only relevant if bpp is less than 8 and a scanline or image does not
+ end at a full byte
+*/
+static void Adam7_getpassvalues(unsigned passw[7], unsigned passh[7], size_t filter_passstart[8],
+                                size_t padded_passstart[8], size_t passstart[8], unsigned w, unsigned h, unsigned bpp)
+{
+  /*the passstart values have 8 values: the 8th one indicates the byte after the end of the 7th (= last) pass*/
+  unsigned i;
+
+  /*calculate width and height in pixels of each pass; a pass without pixels gets both set to 0*/
+  for(i = 0; i != 7; ++i)
+  {
+    passw[i] = (w + ADAM7_DX[i] - ADAM7_IX[i] - 1) / ADAM7_DX[i];
+    passh[i] = (h + ADAM7_DY[i] - ADAM7_IY[i] - 1) / ADAM7_DY[i];
+    if(passw[i] == 0) passh[i] = 0;
+    if(passh[i] == 0) passw[i] = 0;
+  }
+
+  filter_passstart[0] = padded_passstart[0] = passstart[0] = 0;
+  for(i = 0; i != 7; ++i)
+  {
+    /*if passw[i] is 0, it's 0 bytes, not 1 (no filtertype-byte). The sizes are calculated as size_t: in unsigned they could wrap*/
+    filter_passstart[i + 1] = filter_passstart[i]
+                            + ((passw[i] && passh[i]) ? (size_t)passh[i] * (1u + ((size_t)passw[i] * bpp + 7u) / 8u) : 0);
+    /*bits padded if needed to fill full byte at end of each scanline*/
+    padded_passstart[i + 1] = padded_passstart[i] + (size_t)passh[i] * (((size_t)passw[i] * bpp + 7u) / 8u);
+    /*only padded at end of reduced image*/
+    passstart[i + 1] = passstart[i] + ((size_t)passh[i] * passw[i] * bpp + 7u) / 8u;
+  }
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Decoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Reads the information from the PNG header (the 8 byte signature followed by the IHDR chunk) and stores
+it in the LodePNGInfo of state.
+w, h: receive the image width and height in pixels
+state: state->info_png is cleaned up, inited again and then gets the bit depth, color type and the
+ compression, filter and interlace methods; if state->decoder.ignore_crc is set, the CRC is not checked
+in, insize: the start of the PNG data in memory; at least 33 bytes are required
+Return value is the error.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h, LodePNGState* state,
+                         const unsigned char* in, size_t insize)
+{
+  static const unsigned char signature[8] = { 137, 80, 78, 71, 13, 10, 26, 10 };
+  LodePNGInfo* info = &state->info_png;
+  unsigned k, mismatch = 0;
+
+  /*error: the given data is empty*/
+  if(insize == 0 || in == 0) CERROR_RETURN_ERROR(state->error, 48);
+  /*error: the data length is smaller than the length of a PNG header*/
+  if(insize < 33) CERROR_RETURN_ERROR(state->error, 27);
+
+  /*when decoding a new PNG image, make sure all parameters created after previous decoding are reset*/
+  lodepng_info_cleanup(info);
+  lodepng_info_init(info);
+
+  for(k = 0; k != 8; ++k)
+  {
+    if(in[k] != signature[k]) mismatch = 1;
+  }
+  /*error: the first 8 bytes are not the correct PNG signature*/
+  if(mismatch) CERROR_RETURN_ERROR(state->error, 28);
+  /*error: header size must be 13 bytes*/
+  if(lodepng_chunk_length(in + 8) != 13) CERROR_RETURN_ERROR(state->error, 94);
+  /*error: it doesn't start with a IHDR chunk!*/
+  if(!lodepng_chunk_type_equals(in + 8, "IHDR")) CERROR_RETURN_ERROR(state->error, 29);
+
+  /*read the values given in the header*/
+  *w = lodepng_read32bitInt(in + 16);
+  *h = lodepng_read32bitInt(in + 20);
+  info->color.bitdepth = in[24];
+  info->color.colortype = (LodePNGColorType)in[25];
+  info->compression_method = in[26];
+  info->filter_method = in[27];
+  info->interlace_method = in[28];
+
+  /*error: width or height is 0*/
+  if(*w == 0 || *h == 0) CERROR_RETURN_ERROR(state->error, 93);
+
+  if(!state->decoder.ignore_crc)
+  {
+    /*the checksum is calculated over the 17 bytes that start at the chunk type*/
+    unsigned stored = lodepng_read32bitInt(in + 29);
+    unsigned computed = lodepng_crc32(in + 12, 17);
+    if(stored != computed) CERROR_RETURN_ERROR(state->error, 57); /*invalid CRC*/
+  }
+
+  /*error: only compression method 0 is allowed in the specification*/
+  if(info->compression_method != 0) CERROR_RETURN_ERROR(state->error, 32);
+  /*error: only filter method 0 is allowed in the specification*/
+  if(info->filter_method != 0) CERROR_RETURN_ERROR(state->error, 33);
+  /*error: only interlace methods 0 and 1 exist in the specification*/
+  if(info->interlace_method > 1) CERROR_RETURN_ERROR(state->error, 34);
+
+  state->error = checkColorValidity(info->color.colortype, info->color.bitdepth);
+  return state->error;
+}
+
+/*
+For PNG filter method 0: unfilters one scanline of a PNG image.
+recon: the result, the unfiltered scanline
+scanline: the filtered scanline; it does NOT include the filtertype byte, that one is given in filterType
+precon: the previous unfiltered scanline, or 0 if there is none; the filters then behave as if
+ every byte of it were 0
+bytewidth: distance in bytes to the byte that the filters use as "left" neighbor; when the pixels are
+ smaller than 1 byte, the filter works byte per byte (bytewidth = 1)
+filterType: the filter type of this scanline, 0-4
+length: amount of bytes in the scanline
+recon and scanline MAY be the same memory address! precon must be disjoint.
+Returns 0 if ok, 36 if filterType is not an existing filter type.
+*/
+static unsigned unfilterScanline(unsigned char* recon, const unsigned char* scanline, const unsigned char* precon,
+                                 size_t bytewidth, unsigned char filterType, size_t length)
+{
+  size_t pos;
+  unsigned char type = filterType;
+
+  /*Without previous scanline every byte "above" counts as 0. Type 2 then adds nothing, which is what
+  type 0 does, and type 4 only adds the left byte (paethPredictor(left, 0, 0) is always left), which
+  is what type 1 does. Type 3 keeps its own branch below.*/
+  if(!precon)
+  {
+    if(type == 2) type = 0;
+    else if(type == 4) type = 1;
+  }
+
+  switch(type)
+  {
+    case 0: /*plain copy*/
+      for(pos = 0; pos != length; ++pos) recon[pos] = scanline[pos];
+      break;
+    case 1: /*add the byte to the left; the first bytewidth bytes have no left neighbor*/
+      for(pos = 0; pos != bytewidth; ++pos) recon[pos] = scanline[pos];
+      for(pos = bytewidth; pos < length; ++pos) recon[pos] = scanline[pos] + recon[pos - bytewidth];
+      break;
+    case 2: /*add the byte above (precon is never 0 here)*/
+      for(pos = 0; pos != length; ++pos) recon[pos] = scanline[pos] + precon[pos];
+      break;
+    case 3: /*add the average of the bytes to the left and above, rounded down*/
+      if(precon)
+      {
+        for(pos = 0; pos != bytewidth; ++pos) recon[pos] = scanline[pos] + (precon[pos] >> 1);
+        for(pos = bytewidth; pos < length; ++pos)
+        {
+          recon[pos] = scanline[pos] + ((recon[pos - bytewidth] + precon[pos]) >> 1);
+        }
+      }
+      else
+      {
+        /*no scanline above: only half of the left byte is added*/
+        for(pos = 0; pos != bytewidth; ++pos) recon[pos] = scanline[pos];
+        for(pos = bytewidth; pos < length; ++pos) recon[pos] = scanline[pos] + (recon[pos - bytewidth] >> 1);
+      }
+      break;
+    case 4: /*add the Paeth prediction (precon is never 0 here)*/
+      for(pos = 0; pos != bytewidth; ++pos)
+      {
+        recon[pos] = (scanline[pos] + precon[pos]); /*paethPredictor(0, precon[pos], 0) is always precon[pos]*/
+      }
+      for(pos = bytewidth; pos < length; ++pos)
+      {
+        unsigned char left = recon[pos - bytewidth];
+        unsigned char above = precon[pos], aboveleft = precon[pos - bytewidth];
+        recon[pos] = (scanline[pos] + paethPredictor(left, above, aboveleft));
+      }
+      break;
+    default: return 36; /*error: unexisting filter type given*/
+  }
+
+  return 0;
+}
+
+static unsigned unfilter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+  /*
+  For PNG filter method 0
+  this function unfilters a single image (e.g. without interlacing this is called once, with Adam7 seven times)
+  out must have enough bytes allocated already, in must have the scanlines + 1 filtertype byte per scanline
+  w and h are image dimensions or dimensions of reduced image, bpp is bits per pixel
+  in and out are allowed to be the same memory address (but aren't the same size since in has the extra filter bytes)
+  */
+
+  unsigned y;
+  unsigned char* prevline = 0; /*stays 0 for the first scanline, which has no scanline above it*/
+
+  /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+  size_t bytewidth = (bpp + 7) / 8;
+  size_t linebytes = ((size_t)w * bpp + 7) / 8; /*calculated as size_t: w * bpp could wrap in unsigned arithmetic*/
+
+  for(y = 0; y < h; ++y)
+  {
+    size_t outindex = linebytes * y;
+    size_t inindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+    unsigned char filterType = in[inindex];
+
+    CERROR_TRY_RETURN(unfilterScanline(&out[outindex], &in[inindex + 1], prevline, bytewidth, filterType, linebytes));
+
+    prevline = &out[outindex];
+  }
+
+  return 0;
+}
+
+/*
+in: Adam7 interlaced image, with no padding bits between scanlines, but between
+ reduced images so that each reduced image starts at a byte.
+out: the same pixels, but re-ordered so that they're now a non-interlaced image with size w*h
+bpp: bits per pixel
+out has the following size in bits: w * h * bpp.
+in is possibly bigger due to padding bits between reduced images.
+out must be big enough AND must be 0 everywhere if bpp < 8 in the current implementation
+(because that's likely a little bit faster)
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_deinterlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+  unsigned passw[7], passh[7];
+  size_t filter_passstart[8], padded_passstart[8], passstart[8];
+  unsigned i;
+
+  Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+  if(bpp >= 8)
+  {
+    for(i = 0; i != 7; ++i)
+    {
+      unsigned x, y, b;
+      size_t bytewidth = bpp / 8;
+      for(y = 0; y < passh[i]; ++y)
+      for(x = 0; x < passw[i]; ++x)
+      {
+        /*the offsets are calculated as size_t: in unsigned arithmetic they could wrap for large images*/
+        size_t pixelinstart = passstart[i] + ((size_t)y * passw[i] + x) * bytewidth;
+        size_t pixeloutstart = ((ADAM7_IY[i] + (size_t)y * ADAM7_DY[i]) * w + ADAM7_IX[i] + (size_t)x * ADAM7_DX[i]) * bytewidth;
+        for(b = 0; b < bytewidth; ++b) out[pixeloutstart + b] = in[pixelinstart + b];
+      }
+    }
+  }
+  else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/
+  {
+    for(i = 0; i != 7; ++i)
+    {
+      unsigned x, y, b;
+      size_t ilinebits = (size_t)bpp * passw[i]; /*size_t: the bit counts could wrap in unsigned arithmetic*/
+      size_t olinebits = (size_t)bpp * w;
+      size_t obp, ibp; /*bit pointers (for out and in buffer)*/
+      for(y = 0; y < passh[i]; ++y)
+      for(x = 0; x < passw[i]; ++x)
+      {
+        ibp = (8 * passstart[i]) + ((size_t)y * ilinebits + (size_t)x * bpp);
+        obp = (ADAM7_IY[i] + (size_t)y * ADAM7_DY[i]) * olinebits + (ADAM7_IX[i] + (size_t)x * ADAM7_DX[i]) * bpp;
+        for(b = 0; b < bpp; ++b)
+        {
+          unsigned char bit = readBitFromReversedStream(&ibp, in);
+          /*note that this function assumes the out buffer is completely 0, use setBitOfReversedStream otherwise*/
+          setBitOfReversedStream0(&obp, out, bit);
+        }
+      }
+    }
+  }
+}
+
+static void removePaddingBits(unsigned char* out, const unsigned char* in,
+                              size_t olinebits, size_t ilinebits, unsigned h)
+{
+  /*
+  Copies h scanlines of olinebits bits each from in to out, bit by bit, and skips the (ilinebits - olinebits)
+  bits that follow each scanline in the input. Needed because after filtering there are still padding bits
+  if scanlines have non multiple of 8 bit amounts; they must be gone (except at the last scanline of an
+  (Adam7-reduced) image) before the Adam7 code, the color convert code and the user see the buffer.
+  in and out are allowed to be the same buffer, in may also be higher but still overlapping; in must
+  have >= ilinebits*h bits, out must have >= olinebits*h bits, olinebits must be <= ilinebits.
+  Only useful if (ilinebits - olinebits) is a value in the range 1..7
+  */
+  size_t padding = ilinebits - olinebits; /*amount of bits to skip after each input scanline*/
+  size_t inbit = 0, outbit = 0; /*input and output bit pointers*/
+  size_t todo;
+  unsigned row;
+
+  for(row = 0; row != h; ++row)
+  {
+    for(todo = olinebits; todo != 0; --todo)
+    {
+      setBitOfReversedStream(&outbit, out, readBitFromReversedStream(&inbit, in));
+    }
+    inbit += padding;
+  }
+}
+
+/*out must be buffer big enough to contain full image, and in must contain the full decompressed data from
+the IDAT chunks (with filter index bytes and possible padding bits)
+return value is error*/
+static unsigned postProcessScanlines(unsigned char* out, unsigned char* in,
+                                     unsigned w, unsigned h, const LodePNGInfo* info_png)
+{
+  /*
+  This function converts the filtered-padded-interlaced data into pure 2D image buffer with the PNG's colortype.
+  Steps:
+  *) if no Adam7: 1) unfilter 2) remove padding bits (= possible extra bits per scanline if bpp < 8)
+  *) if adam7: 1) 7x unfilter 2) 7x remove padding bits 3) Adam7_deinterlace
+  NOTE: the in buffer will be overwritten with intermediate data!
+  */
+  unsigned bpp = lodepng_get_bpp(&info_png->color);
+  if(bpp == 0) return 31; /*error: invalid colortype*/
+
+  if(info_png->interlace_method == 0)
+  {
+    size_t linebits = (size_t)w * bpp; /*calculated as size_t: w * bpp could wrap in unsigned arithmetic*/
+    size_t paddedbits = ((linebits + 7) / 8) * 8; /*bits per scanline when it ends at a full byte*/
+    if(bpp < 8 && linebits != paddedbits)
+    {
+      CERROR_TRY_RETURN(unfilter(in, in, w, h, bpp));
+      removePaddingBits(out, in, linebits, paddedbits, h);
+    }
+    /*we can immediately filter into the out buffer, no other steps needed*/
+    else CERROR_TRY_RETURN(unfilter(out, in, w, h, bpp));
+  }
+  else /*interlace_method is 1 (Adam7)*/
+  {
+    unsigned passw[7], passh[7]; size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned i;
+
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    for(i = 0; i != 7; ++i)
+    {
+      CERROR_TRY_RETURN(unfilter(&in[padded_passstart[i]], &in[filter_passstart[i]], passw[i], passh[i], bpp));
+      /*TODO: possible efficiency improvement: if the bits fit nicely in 1 scanline, move bytes instead of bits or not at all*/
+      if(bpp < 8)
+      {
+        /*remove padding bits in scanlines; padding may remain between reduced images: each still starts at a byte*/
+        size_t linebits = (size_t)passw[i] * bpp;
+        removePaddingBits(&in[passstart[i]], &in[padded_passstart[i]], linebits, ((linebits + 7) / 8) * 8, passh[i]);
+      }
+    }
+
+    Adam7_deinterlace(out, in, w, h, bpp);
+  }
+
+  return 0;
+}
+
+/*Reads the PLTE chunk data: replaces the palette of color by chunkLength / 3 RGB entries that all get alpha 255.
+Returns 0 if ok, 38 if there are more than 256 entries (the palette is then left empty), 83 if allocation fails.*/
+static unsigned readChunk_PLTE(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+  unsigned pos = 0, i;
+  size_t numentries = chunkLength / 3;
+  if(color->palette) lodepng_free(color->palette);
+  color->palette = 0; /*start from an empty palette so that every error return leaves a consistent state*/
+  color->palettesize = 0;
+  if(numentries > 256) return 38; /*error: palette too big. Checked before allocating: chunkLength is untrusted*/
+  color->palette = (unsigned char*)lodepng_malloc(4 * numentries);
+  if(!color->palette && numentries) return 83; /*alloc fail*/
+  color->palettesize = numentries;
+
+  for(i = 0; i != numentries; ++i)
+  {
+    color->palette[4 * i + 0] = data[pos++]; /*R*/
+    color->palette[4 * i + 1] = data[pos++]; /*G*/
+    color->palette[4 * i + 2] = data[pos++]; /*B*/
+    color->palette[4 * i + 3] = 255; /*alpha*/
+  }
+  return 0; /* OK */
+}
+
+/*Reads the tRNS chunk data: alpha values for the first palette entries, or the color key for grey and RGB images.
+Returns 0 if ok, else the error code for a wrong chunk length or a color type that cannot have this chunk.*/
+static unsigned readChunk_tRNS(LodePNGColorMode* color, const unsigned char* data, size_t chunkLength)
+{
+  unsigned n;
+  switch(color->colortype)
+  {
+    case LCT_PALETTE:
+      /*error: more alpha values given than there are palette entries*/
+      if(chunkLength > color->palettesize) return 38;
+      /*palette entries beyond chunkLength keep the alpha value they already have*/
+      for(n = 0; n != chunkLength; ++n) color->palette[4 * n + 3] = data[n];
+      return 0; /* OK */
+    case LCT_GREY:
+      /*error: this chunk must be 2 bytes for greyscale image*/
+      if(chunkLength != 2) return 30;
+      color->key_defined = 1;
+      color->key_r = color->key_g = color->key_b = 256u * data[0] + data[1];
+      return 0; /* OK */
+    case LCT_RGB:
+      /*error: this chunk must be 6 bytes for RGB image*/
+      if(chunkLength != 6) return 41;
+      color->key_defined = 1;
+      color->key_r = 256u * data[0] + data[1];
+      color->key_g = 256u * data[2] + data[3];
+      color->key_b = 256u * data[4] + data[5];
+      return 0; /* OK */
+    default:
+      /*error: tRNS chunk not allowed for other color models*/
+      return 42;
+  }
+}
+
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*background color chunk (bKGD): stores the background in info as 1 byte for a palette image, as a 2-byte
+value for greyscale or as three 2-byte values for RGB. Returns 0 if ok, else the error code for a wrong chunk length.*/
+static unsigned readChunk_bKGD(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+  switch(info->color.colortype)
+  {
+    case LCT_PALETTE:
+      /*error: this chunk must be 1 byte for indexed color image*/
+      if(chunkLength != 1) return 43;
+      info->background_r = info->background_g = info->background_b = data[0];
+      break;
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+      /*error: this chunk must be 2 bytes for greyscale image*/
+      if(chunkLength != 2) return 44;
+      info->background_r = info->background_g = info->background_b = 256u * data[0] + data[1];
+      break;
+    case LCT_RGB:
+    case LCT_RGBA:
+      /*error: this chunk must be 6 bytes for RGB image*/
+      if(chunkLength != 6) return 45;
+      info->background_r = 256u * data[0] + data[1];
+      info->background_g = 256u * data[2] + data[3];
+      info->background_b = 256u * data[4] + data[5];
+      break;
+    default:
+      return 0; /*any other color type value: nothing is stored*/
+  }
+
+  info->background_defined = 1;
+  return 0; /* OK */
+}
+
+/*text chunk (tEXt): a keyword of 1-79 bytes, a null separator and then the text up to the end of the chunk.
+Both are put in null terminated strings of their own and given to lodepng_add_text.
+Returns 0 if ok, 89 for a keyword of wrong length, 83 if allocation fails, else the result of lodepng_add_text.*/
+static unsigned readChunk_tEXt(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+  unsigned error = 0;
+  char *keyword = 0, *text = 0;
+  unsigned keylen = 0, textstart, textlen, n;
+
+  do /*not really a loop, only used to break on error*/
+  {
+    /*the keyword ends at the first null byte, or at the end of the chunk*/
+    while(keylen < chunkLength && data[keylen] != 0) ++keylen;
+    /*even though it's not allowed by the standard, no error is thrown if
+    there's no null termination char, if the text is empty*/
+    if(keylen < 1 || keylen > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+    keyword = (char*)lodepng_malloc(keylen + 1);
+    if(!keyword) CERROR_BREAK(error, 83); /*alloc fail*/
+    for(n = 0; n != keylen; ++n) keyword[n] = (char)data[n];
+    keyword[keylen] = 0;
+
+    textstart = keylen + 1; /*skip keyword null terminator*/
+    textlen = chunkLength < textstart ? 0 : chunkLength - textstart;
+
+    /*the text has no terminator of its own in the chunk, so one is added to the copy*/
+    text = (char*)lodepng_malloc(textlen + 1);
+    if(!text) CERROR_BREAK(error, 83); /*alloc fail*/
+    for(n = 0; n != textlen; ++n) text[n] = (char)data[textstart + n];
+    text[textlen] = 0;
+
+    error = lodepng_add_text(info, keyword, text);
+  }
+  while(0);
+
+  /*both buffers are always freed here, also after a successful lodepng_add_text*/
+  lodepng_free(keyword);
+  lodepng_free(text);
+
+  return error;
+}
+
+
+/*compressed text chunk (zTXt): keyword, null separator, a compression method byte that must be 0 and then the
+zlib compressed text, which is decompressed with zlibsettings. Returns 0 if ok, else the error code.*/
+static unsigned readChunk_zTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+                               const unsigned char* data, size_t chunkLength)
+{
+  unsigned error = 0;
+  unsigned i;
+  unsigned length, string2_begin;
+  size_t textsize; /*size of the decompressed text, without the null terminator*/
+  char *key = 0;
+  ucvector decoded;
+
+  ucvector_init(&decoded);
+
+  while(!error) /*not really a while loop, only used to break on error*/
+  {
+    for(length = 0; length < chunkLength && data[length] != 0; ++length) ;
+    if(length + 2 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+    if(length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+    key = (char*)lodepng_malloc(length + 1);
+    if(!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    key[length] = 0;
+    for(i = 0; i != length; ++i) key[i] = (char)data[i];
+
+    if(data[length + 1] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+    string2_begin = length + 2;
+    if(string2_begin > chunkLength) CERROR_BREAK(error, 75); /*no null termination, corrupt?*/
+
+    length = chunkLength - string2_begin;
+    /*will fail if zlib error, e.g. if length is too small*/
+    error = zlib_decompress(&decoded.data, &decoded.size,
+                            (unsigned char*)(&data[string2_begin]),
+                            length, zlibsettings);
+    if(error) break;
+    textsize = decoded.size; /*the text must be null terminated before it is used as a string below*/
+    if(!ucvector_resize(&decoded, textsize + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+    decoded.data[textsize] = 0;
+    error = lodepng_add_text(info, key, (char*)decoded.data);
+    break;
+  }
+
+  lodepng_free(key);
+  ucvector_cleanup(&decoded);
+  return error;
+}
+
+/*international text chunk (iTXt): keyword, compression flag and method, language tag, translated keyword and text*/
+static unsigned readChunk_iTXt(LodePNGInfo* info, const LodePNGDecompressSettings* zlibsettings,
+                               const unsigned char* data, size_t chunkLength)
+{
+  unsigned error = 0;
+  unsigned i;
+
+  unsigned length, begin, compressed;
+  size_t textsize; /*size of the decompressed text, without the null terminator*/
+  char *key = 0, *langtag = 0, *transkey = 0;
+  ucvector decoded;
+  ucvector_init(&decoded);
+
+  while(!error) /*not really a while loop, only used to break on error*/
+  {
+    /*Quick check of the chunk length: the checks below would also fail if it's too short, with another error code*/
+    if(chunkLength < 5) CERROR_BREAK(error, 30); /*iTXt chunk too short*/
+
+    /*read the key*/
+    for(length = 0; length < chunkLength && data[length] != 0; ++length) ;
+    if(length + 3 >= chunkLength) CERROR_BREAK(error, 75); /*no null termination char, corrupt?*/
+    if(length < 1 || length > 79) CERROR_BREAK(error, 89); /*keyword too short or long*/
+
+    key = (char*)lodepng_malloc(length + 1);
+    if(!key) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    key[length] = 0;
+    for(i = 0; i != length; ++i) key[i] = (char)data[i];
+
+    /*read the compression method*/
+    compressed = data[length + 1];
+    if(data[length + 2] != 0) CERROR_BREAK(error, 72); /*the 0 byte indicating compression must be 0*/
+
+    /*not allowed by the standard, but no error is thrown if the null termination char of the next 3 texts is missing*/
+
+    /*read the langtag*/
+    begin = length + 3;
+    length = 0;
+    for(i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+    langtag = (char*)lodepng_malloc(length + 1);
+    if(!langtag) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    langtag[length] = 0;
+    for(i = 0; i != length; ++i) langtag[i] = (char)data[begin + i];
+
+    /*read the transkey*/
+    begin += length + 1;
+    length = 0;
+    for(i = begin; i < chunkLength && data[i] != 0; ++i) ++length;
+
+    transkey = (char*)lodepng_malloc(length + 1);
+    if(!transkey) CERROR_BREAK(error, 83); /*alloc fail*/
+
+    transkey[length] = 0;
+    for(i = 0; i != length; ++i) transkey[i] = (char)data[begin + i];
+
+    /*read the actual text*/
+    begin += length + 1;
+    length = chunkLength < begin ? 0 : chunkLength - begin;
+
+    if(compressed)
+    {
+      /*will fail if zlib error, e.g. if length is too small*/
+      error = zlib_decompress(&decoded.data, &decoded.size,
+                              (unsigned char*)(&data[begin]),
+                              length, zlibsettings);
+      if(error) break;
+      if(decoded.allocsize < decoded.size) decoded.allocsize = decoded.size;
+      /*the text must be null terminated before it is used as a string below*/
+      textsize = decoded.size;
+      if(!ucvector_resize(&decoded, textsize + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+      decoded.data[textsize] = 0;
+    }
+    else
+    {
+      if(!ucvector_resize(&decoded, length + 1)) CERROR_BREAK(error, 83 /*alloc fail*/);
+
+      decoded.data[length] = 0;
+      for(i = 0; i != length; ++i) decoded.data[i] = data[begin + i];
+    }
+
+    error = lodepng_add_itext(info, key, langtag, transkey, (char*)decoded.data);
+
+    break;
+  }
+
+  lodepng_free(key);
+  lodepng_free(langtag);
+  lodepng_free(transkey);
+  ucvector_cleanup(&decoded);
+
+  return error;
+}
+
+/*
+Parses the payload of a tIME chunk into info->time and marks it as defined.
+data: chunk payload, chunkLength: payload size in bytes (must be exactly 7).
+Layout read here: 2 bytes year (most significant byte first), then one byte each
+for month, day, hour, minute and second.
+Returns 0 on success, 73 if the chunk does not have the expected size.
+*/
+static unsigned readChunk_tIME(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+  unsigned year_hi, year_lo;
+
+  if(chunkLength != 7)
+  {
+    return 73; /*invalid tIME chunk size*/
+  }
+
+  year_hi = data[0];
+  year_lo = data[1];
+
+  info->time.year = 256u * year_hi + year_lo;
+  info->time.month = data[2];
+  info->time.day = data[3];
+  info->time.hour = data[4];
+  info->time.minute = data[5];
+  info->time.second = data[6];
+  info->time_defined = 1;
+
+  return 0; /* OK */
+}
+
+/*
+Parses the payload of a pHYs chunk into info->phys_x, info->phys_y and info->phys_unit.
+data: chunk payload, chunkLength: payload size in bytes (must be exactly 9).
+Both phys values are stored as 4 bytes, most significant byte first; the unit is the last byte.
+Returns 0 on success, 74 if the chunk does not have the expected size.
+*/
+static unsigned readChunk_pHYs(LodePNGInfo* info, const unsigned char* data, size_t chunkLength)
+{
+  const unsigned char* xbytes;
+  const unsigned char* ybytes;
+
+  if(chunkLength != 9)
+  {
+    return 74; /*invalid pHYs chunk size*/
+  }
+
+  xbytes = &data[0];
+  ybytes = &data[4];
+
+  info->phys_x = 16777216u * xbytes[0] + 65536u * xbytes[1] + 256u * xbytes[2] + xbytes[3];
+  info->phys_y = 16777216u * ybytes[0] + 65536u * ybytes[1] + 256u * ybytes[2] + ybytes[3];
+  info->phys_unit = data[8];
+  info->phys_defined = 1;
+
+  return 0; /* OK */
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Reads a PNG; the result will be in the same color type as the PNG (hence "generic").
+out: set to 0 first; on success receives a buffer allocated with lodepng_malloc holding the pixels.
+w, h: receive the image dimensions filled in by lodepng_inspect.
+state: decoder settings are read from state->decoder, chunk contents are stored in
+       state->info_png, and the resulting error code (0 = ok) is left in state->error.
+in, insize: the PNG file in memory and its size in bytes.
+*/
+static void decodeGeneric(unsigned char** out, unsigned* w, unsigned* h,
+                          LodePNGState* state,
+                          const unsigned char* in, size_t insize)
+{
+  unsigned char IEND = 0;
+  const unsigned char* chunk;
+  size_t i;
+  ucvector idat; /*the data from idat chunks*/
+  ucvector scanlines;
+  size_t predict;
+  size_t numpixels;
+  size_t outsize = 0;
+
+  /*for unknown chunk order*/
+  unsigned unknown = 0;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  unsigned critical_pos = 1; /*1 = after IHDR, 2 = after PLTE, 3 = after IDAT*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+  /*provide some proper output values if error will happen*/
+  *out = 0;
+
+  state->error = lodepng_inspect(w, h, state, in, insize); /*reads header and resets other parameters in state->info_png*/
+  if(state->error) return;
+
+  numpixels = *w * *h;
+
+  /*multiplication overflow*/
+  if(*h != 0 && numpixels / *h != *w) CERROR_RETURN(state->error, 92);
+  /*multiplication overflow possible further below. Allows up to 2^31-1 pixel
+  bytes with 16-bit RGBA, the rest is room for filter bytes.*/
+  if(numpixels > 268435455) CERROR_RETURN(state->error, 92);
+
+  ucvector_init(&idat);
+  chunk = &in[33]; /*first byte of the first chunk after the header*/
+
+  /*loop through the chunks, ignoring unknown chunks and stopping at IEND chunk.
+  The data of all IDAT chunks is appended to the idat vector*/
+  while(!IEND && !state->error)
+  {
+    unsigned chunkLength;
+    const unsigned char* data; /*the data in the chunk*/
+
+    /*error: size of the in buffer too small to contain next chunk*/
+    if((size_t)((chunk - in) + 12) > insize || chunk < in) CERROR_BREAK(state->error, 30);
+
+    /*length of the data of the chunk, excluding the length bytes, chunk type and CRC bytes*/
+    chunkLength = lodepng_chunk_length(chunk);
+    /*error: chunk length larger than the max PNG chunk size*/
+    if(chunkLength > 2147483647) CERROR_BREAK(state->error, 63);
+
+    if((size_t)((chunk - in) + chunkLength + 12) > insize || (chunk + chunkLength + 12) < in)
+    {
+      CERROR_BREAK(state->error, 64); /*error: size of the in buffer too small to contain next chunk*/
+    }
+
+    data = lodepng_chunk_data_const(chunk);
+
+    /*the flag describes the current chunk only; without this reset a single unknown chunk
+    would disable the CRC check below for every known chunk that follows it*/
+    unknown = 0;
+
+    /*IDAT chunk, containing compressed image data*/
+    if(lodepng_chunk_type_equals(chunk, "IDAT"))
+    {
+      size_t oldsize = idat.size;
+      if(!ucvector_resize(&idat, oldsize + chunkLength)) CERROR_BREAK(state->error, 83 /*alloc fail*/);
+      for(i = 0; i != chunkLength; ++i) idat.data[oldsize + i] = data[i];
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+      critical_pos = 3;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    }
+    /*IEND chunk*/
+    else if(lodepng_chunk_type_equals(chunk, "IEND"))
+    {
+      IEND = 1;
+    }
+    /*palette chunk (PLTE)*/
+    else if(lodepng_chunk_type_equals(chunk, "PLTE"))
+    {
+      state->error = readChunk_PLTE(&state->info_png.color, data, chunkLength);
+      if(state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+      critical_pos = 2;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    }
+    /*palette transparency chunk (tRNS)*/
+    else if(lodepng_chunk_type_equals(chunk, "tRNS"))
+    {
+      state->error = readChunk_tRNS(&state->info_png.color, data, chunkLength);
+      if(state->error) break;
+    }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*background color chunk (bKGD)*/
+    else if(lodepng_chunk_type_equals(chunk, "bKGD"))
+    {
+      state->error = readChunk_bKGD(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    }
+    /*text chunk (tEXt)*/
+    else if(lodepng_chunk_type_equals(chunk, "tEXt"))
+    {
+      if(state->decoder.read_text_chunks)
+      {
+        state->error = readChunk_tEXt(&state->info_png, data, chunkLength);
+        if(state->error) break;
+      }
+    }
+    /*compressed text chunk (zTXt)*/
+    else if(lodepng_chunk_type_equals(chunk, "zTXt"))
+    {
+      if(state->decoder.read_text_chunks)
+      {
+        state->error = readChunk_zTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+        if(state->error) break;
+      }
+    }
+    /*international text chunk (iTXt)*/
+    else if(lodepng_chunk_type_equals(chunk, "iTXt"))
+    {
+      if(state->decoder.read_text_chunks)
+      {
+        state->error = readChunk_iTXt(&state->info_png, &state->decoder.zlibsettings, data, chunkLength);
+        if(state->error) break;
+      }
+    }
+    /*time chunk (tIME)*/
+    else if(lodepng_chunk_type_equals(chunk, "tIME"))
+    {
+      state->error = readChunk_tIME(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    }
+    /*physical dimensions chunk (pHYs)*/
+    else if(lodepng_chunk_type_equals(chunk, "pHYs"))
+    {
+      state->error = readChunk_pHYs(&state->info_png, data, chunkLength);
+      if(state->error) break;
+    }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    else /*it's not an implemented chunk type, so ignore it: skip over the data*/
+    {
+      /*error: unknown critical chunk (5th bit of first byte of chunk type is 0)*/
+      if(!lodepng_chunk_ancillary(chunk)) CERROR_BREAK(state->error, 69);
+
+      unknown = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+      if(state->decoder.remember_unknown_chunks)
+      {
+        state->error = lodepng_chunk_append(&state->info_png.unknown_chunks_data[critical_pos - 1],
+                                            &state->info_png.unknown_chunks_size[critical_pos - 1], chunk);
+        if(state->error) break;
+      }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    }
+
+    if(!state->decoder.ignore_crc && !unknown) /*check CRC if wanted, only on known chunk types*/
+    {
+      if(lodepng_chunk_check_crc(chunk)) CERROR_BREAK(state->error, 57); /*invalid CRC*/
+    }
+
+    if(!IEND) chunk = lodepng_chunk_next_const(chunk);
+  }
+
+  ucvector_init(&scanlines);
+  /*predict output size, to allocate exact size for output buffer to avoid more dynamic allocation.
+  If the decompressed size does not match the prediction, the image must be corrupt.*/
+  if(state->info_png.interlace_method == 0)
+  {
+    /*The extra *h is added because these are the filter bytes every scanline starts with*/
+    predict = lodepng_get_raw_size_idat(*w, *h, &state->info_png.color) + *h;
+  }
+  else
+  {
+    /*Adam-7 interlaced: predicted size is the sum of the 7 sub-images sizes*/
+    const LodePNGColorMode* color = &state->info_png.color;
+    predict = 0;
+    predict += lodepng_get_raw_size_idat((*w + 7) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+    if(*w > 4) predict += lodepng_get_raw_size_idat((*w + 3) >> 3, (*h + 7) >> 3, color) + ((*h + 7) >> 3);
+    predict += lodepng_get_raw_size_idat((*w + 3) >> 2, (*h + 3) >> 3, color) + ((*h + 3) >> 3);
+    if(*w > 2) predict += lodepng_get_raw_size_idat((*w + 1) >> 2, (*h + 3) >> 2, color) + ((*h + 3) >> 2);
+    predict += lodepng_get_raw_size_idat((*w + 1) >> 1, (*h + 1) >> 2, color) + ((*h + 1) >> 2);
+    if(*w > 1) predict += lodepng_get_raw_size_idat((*w + 0) >> 1, (*h + 1) >> 1, color) + ((*h + 1) >> 1);
+    predict += lodepng_get_raw_size_idat((*w + 0), (*h + 0) >> 1, color) + ((*h + 0) >> 1);
+  }
+  if(!state->error && !ucvector_reserve(&scanlines, predict)) state->error = 83; /*alloc fail*/
+  if(!state->error)
+  {
+    state->error = zlib_decompress(&scanlines.data, &scanlines.size, idat.data,
+                                   idat.size, &state->decoder.zlibsettings);
+    if(!state->error && scanlines.size != predict) state->error = 91; /*decompressed size doesn't match prediction*/
+  }
+  ucvector_cleanup(&idat);
+
+  if(!state->error)
+  {
+    outsize = lodepng_get_raw_size(*w, *h, &state->info_png.color);
+    *out = (unsigned char*)lodepng_malloc(outsize);
+    if(!*out) state->error = 83; /*alloc fail*/
+  }
+  if(!state->error)
+  {
+    /*start from an all-zero image before the scanlines are post-processed into it*/
+    for(i = 0; i < outsize; i++) (*out)[i] = 0;
+    state->error = postProcessScanlines(*out, scanlines.data, *w, *h, &state->info_png);
+  }
+  ucvector_cleanup(&scanlines);
+}
+
+/*
+Decodes the PNG in the buffer "in" and, when state->decoder.color_convert is set, converts the
+pixels to the color mode requested in state->info_raw.
+out: receives the pixel buffer (set to 0 first); w, h: receive the image dimensions.
+Returns the error code, 0 meaning success. Apart from the early "return 56" below, the
+returned value is also stored in state->error.
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+                        LodePNGState* state,
+                        const unsigned char* in, size_t insize)
+{
+  unsigned char* undecoded;
+  size_t convertedsize;
+
+  *out = 0;
+  decodeGeneric(out, w, h, state, in, insize);
+  if(state->error) return state->error;
+
+  if(!state->decoder.color_convert)
+  {
+    /*no conversion wanted: store the info_png color settings on the info_raw so that the
+    info_raw still reflects what colortype the raw image has to the end user*/
+    state->error = lodepng_color_mode_copy(&state->info_raw, &state->info_png.color);
+    return state->error;
+  }
+
+  if(lodepng_color_mode_equal(&state->info_raw, &state->info_png.color))
+  {
+    /*same color type, no copying or converting of data needed*/
+    return state->error;
+  }
+
+  /*color conversion needed; sort of copy of the data*/
+
+  /*TODO: check if this works according to the statement in the documentation: "The converter can convert
+  from greyscale input color type, to 8-bit greyscale or greyscale with alpha"*/
+  if(state->info_raw.colortype != LCT_RGB && state->info_raw.colortype != LCT_RGBA
+     && state->info_raw.bitdepth != 8)
+  {
+    return 56; /*unsupported color mode conversion*/
+  }
+
+  undecoded = *out;
+  convertedsize = lodepng_get_raw_size(*w, *h, &state->info_raw);
+  *out = (unsigned char*)lodepng_malloc(convertedsize);
+  if(*out)
+  {
+    state->error = lodepng_convert(*out, undecoded, &state->info_raw,
+                                   &state->info_png.color, *w, *h);
+  }
+  else
+  {
+    state->error = 83; /*alloc fail*/
+  }
+  /*the buffer in the PNG's own color mode is no longer needed, whatever the outcome*/
+  lodepng_free(undecoded);
+
+  return state->error;
+}
+
+/*
+Decodes a PNG from memory with a temporary, default-initialized LodePNGState in which only the
+wanted raw color type and bit depth are set.
+Returns the error code of lodepng_decode.
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in,
+                               size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+{
+  LodePNGState state;
+  unsigned result;
+
+  lodepng_state_init(&state);
+  state.info_raw.bitdepth = bitdepth;
+  state.info_raw.colortype = colortype;
+
+  result = lodepng_decode(out, w, h, &state, in, insize);
+
+  lodepng_state_cleanup(&state);
+  return result;
+}
+
+/*Same as lodepng_decode_memory, with the output fixed to color type LCT_RGBA and bit depth 8.*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+  const LodePNGColorType colortype = LCT_RGBA;
+  const unsigned bitdepth = 8;
+
+  return lodepng_decode_memory(out, w, h, in, insize, colortype, bitdepth);
+}
+
+/*Same as lodepng_decode_memory, with the output fixed to color type LCT_RGB and bit depth 8.*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h, const unsigned char* in, size_t insize)
+{
+  const LodePNGColorType colortype = LCT_RGB;
+  const unsigned bitdepth = 8;
+
+  return lodepng_decode_memory(out, w, h, in, insize, colortype, bitdepth);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Loads the file "filename" with lodepng_load_file and decodes it with lodepng_decode_memory.
+The temporary file buffer is always released before returning.
+Returns the error code of the first step that failed, or 0.
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename,
+                             LodePNGColorType colortype, unsigned bitdepth)
+{
+  unsigned char* filedata = 0;
+  size_t filesize;
+  unsigned error = lodepng_load_file(&filedata, &filesize, filename);
+
+  if(error == 0)
+  {
+    error = lodepng_decode_memory(out, w, h, filedata, filesize, colortype, bitdepth);
+  }
+
+  lodepng_free(filedata);
+  return error;
+}
+
+/*Same as lodepng_decode_file, with the output fixed to color type LCT_RGBA and bit depth 8.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+  const LodePNGColorType colortype = LCT_RGBA;
+  const unsigned bitdepth = 8;
+
+  return lodepng_decode_file(out, w, h, filename, colortype, bitdepth);
+}
+
+/*Same as lodepng_decode_file, with the output fixed to color type LCT_RGB and bit depth 8.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h, const char* filename)
+{
+  const LodePNGColorType colortype = LCT_RGB;
+  const unsigned bitdepth = 8;
+
+  return lodepng_decode_file(out, w, h, filename, colortype, bitdepth);
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+/*
+Fills in the default decoder settings: color conversion on, CRC checking on and, when
+ancillary chunk support is compiled in, text chunks read and unknown chunks not remembered.
+The nested zlib settings are initialized by lodepng_decompress_settings_init.
+*/
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings)
+{
+  lodepng_decompress_settings_init(&settings->zlibsettings);
+
+  settings->ignore_crc = 0;
+  settings->color_convert = 1;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  settings->remember_unknown_chunks = 0;
+  settings->read_text_chunks = 1;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+
+/*
+Puts a LodePNGState into its initial condition: default decoder and/or encoder settings
+(whichever are compiled in), freshly initialized info_raw and info_png, and error set to 1.
+*/
+void lodepng_state_init(LodePNGState* state)
+{
+  state->error = 1;
+
+  lodepng_info_init(&state->info_png);
+  lodepng_color_mode_init(&state->info_raw);
+
+#ifdef LODEPNG_COMPILE_ENCODER
+  lodepng_encoder_settings_init(&state->encoder);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#ifdef LODEPNG_COMPILE_DECODER
+  lodepng_decoder_settings_init(&state->decoder);
+#endif /*LODEPNG_COMPILE_DECODER*/
+}
+
+/*Cleans up the two members of the state that have their own cleanup function: info_raw and info_png.*/
+void lodepng_state_cleanup(LodePNGState* state)
+{
+  LodePNGColorMode* raw = &state->info_raw;
+  LodePNGInfo* png = &state->info_png;
+
+  lodepng_color_mode_cleanup(raw);
+  lodepng_info_cleanup(png);
+}
+
+/*
+Makes dest a copy of source. dest is cleaned up first, so it must already be an initialized state.
+The plain struct assignment copies all fields; info_raw and info_png are then re-initialized
+and copied with their own copy functions instead of keeping the assigned values.
+The outcome (0 = ok) is left in dest->error; copying stops at the first failure.
+*/
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source)
+{
+  unsigned error;
+
+  lodepng_state_cleanup(dest);
+  *dest = *source;
+
+  lodepng_color_mode_init(&dest->info_raw);
+  lodepng_info_init(&dest->info_png);
+
+  error = lodepng_color_mode_copy(&dest->info_raw, &source->info_raw);
+  if(!error)
+  {
+    error = lodepng_info_copy(&dest->info_png, &source->info_png);
+  }
+  dest->error = error;
+}
+
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* / PNG Encoder                                                            / */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+/*
+Appends a new chunk to the out vector by calling lodepng_chunk_create on its data and size.
+chunkName must be a string of 4 characters; data and length describe the chunk payload
+(length is narrowed to unsigned for lodepng_chunk_create).
+Returns 0 when this function runs to its end. CERROR_TRY_RETURN is a macro defined elsewhere;
+presumably it returns early with the error code of lodepng_chunk_create - confirm against its definition.
+*/
+static unsigned addChunk(ucvector* out, const char* chunkName, const unsigned char* data, size_t length)
+{
+  CERROR_TRY_RETURN(lodepng_chunk_create(&out->data, &out->size, (unsigned)length, chunkName, data));
+  /*lodepng_chunk_create updated data and size directly, so bring the bookkeeping back in sync*/
+  out->allocsize = out->size; /*fix the allocsize again*/
+  return 0;
+}
+
+/*Appends the 8 bytes PNG signature, aka the magic bytes, to out.*/
+static void writeSignature(ucvector* out)
+{
+  static const unsigned char signature[8] = {137, 80, 78, 71, 13, 10, 26, 10};
+  size_t pos;
+
+  for(pos = 0; pos != 8; ++pos)
+  {
+    ucvector_push_back(out, signature[pos]);
+  }
+}
+
+/*
+Builds the IHDR payload (width, height, bit depth, color type, compression method 0,
+filter method 0, interlace method) in a temporary vector and appends it to out as an IHDR chunk.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_IHDR(ucvector* out, unsigned w, unsigned h,
+                              LodePNGColorType colortype, unsigned bitdepth, unsigned interlace_method)
+{
+  unsigned result;
+  ucvector ihdr;
+
+  ucvector_init(&ihdr);
+
+  /*the two dimensions, each as a 32-bit integer*/
+  lodepng_add32bitInt(&ihdr, w);
+  lodepng_add32bitInt(&ihdr, h);
+
+  /*five single-byte fields*/
+  ucvector_push_back(&ihdr, (unsigned char)bitdepth);
+  ucvector_push_back(&ihdr, (unsigned char)colortype);
+  ucvector_push_back(&ihdr, 0); /*compression method*/
+  ucvector_push_back(&ihdr, 0); /*filter method*/
+  ucvector_push_back(&ihdr, interlace_method);
+
+  result = addChunk(out, "IHDR", ihdr.data, ihdr.size);
+  ucvector_cleanup(&ihdr);
+
+  return result;
+}
+
+/*
+Appends a PLTE chunk to out. info->palette holds 4 bytes per entry; only the first three bytes
+of every entry are written, the fourth (alpha) is left out.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_PLTE(ucvector* out, const LodePNGColorMode* info)
+{
+  unsigned result;
+  size_t entry, channel;
+  ucvector rgb;
+
+  ucvector_init(&rgb);
+  for(entry = 0; entry != info->palettesize; ++entry)
+  {
+    /*channels 0..2 of this entry; channel 3 is the alpha channel and is skipped*/
+    for(channel = 0; channel != 3; ++channel)
+    {
+      ucvector_push_back(&rgb, info->palette[4 * entry + channel]);
+    }
+  }
+
+  result = addChunk(out, "PLTE", rgb.data, rgb.size);
+  ucvector_cleanup(&rgb);
+
+  return result;
+}
+
+/*
+Appends a tRNS chunk to out. The payload depends on info->colortype:
+- LCT_PALETTE: the alpha byte of every palette entry, without the trailing run of entries
+  whose alpha is 255;
+- LCT_GREY: key_r as 2 bytes, only if key_defined is set;
+- LCT_RGB: key_r, key_g and key_b as 2 bytes each, only if key_defined is set;
+- any other color type: empty payload.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_tRNS(ucvector* out, const LodePNGColorMode* info)
+{
+  unsigned result;
+  size_t pos;
+  ucvector alpha;
+
+  ucvector_init(&alpha);
+
+  switch(info->colortype)
+  {
+    case LCT_PALETTE:
+    {
+      /*the tail of palette values that all have 255 as alpha, does not have to be encoded*/
+      size_t count = info->palettesize;
+      while(count != 0 && info->palette[4 * (count - 1) + 3] == 255) --count;
+
+      /*add only alpha channel*/
+      for(pos = 0; pos != count; ++pos) ucvector_push_back(&alpha, info->palette[4 * pos + 3]);
+      break;
+    }
+    case LCT_GREY:
+      if(info->key_defined)
+      {
+        ucvector_push_back(&alpha, (unsigned char)(info->key_r >> 8));
+        ucvector_push_back(&alpha, (unsigned char)(info->key_r & 255));
+      }
+      break;
+    case LCT_RGB:
+      if(info->key_defined)
+      {
+        ucvector_push_back(&alpha, (unsigned char)(info->key_r >> 8));
+        ucvector_push_back(&alpha, (unsigned char)(info->key_r & 255));
+        ucvector_push_back(&alpha, (unsigned char)(info->key_g >> 8));
+        ucvector_push_back(&alpha, (unsigned char)(info->key_g & 255));
+        ucvector_push_back(&alpha, (unsigned char)(info->key_b >> 8));
+        ucvector_push_back(&alpha, (unsigned char)(info->key_b & 255));
+      }
+      break;
+    default:
+      break;
+  }
+
+  result = addChunk(out, "tRNS", alpha.data, alpha.size);
+  ucvector_cleanup(&alpha);
+
+  return result;
+}
+
+/*
+Compresses data (datasize bytes) with zlib_compress using the given settings and appends the
+result to out as one IDAT chunk.
+Returns the error code of zlib_compress or, when that succeeded, of addChunk.
+*/
+static unsigned addChunk_IDAT(ucvector* out, const unsigned char* data, size_t datasize,
+                              LodePNGCompressSettings* zlibsettings)
+{
+  unsigned result;
+  ucvector deflated;
+
+  ucvector_init(&deflated);
+
+  result = zlib_compress(&deflated.data, &deflated.size, data, datasize, zlibsettings);
+  if(result == 0)
+  {
+    result = addChunk(out, "IDAT", deflated.data, deflated.size);
+  }
+
+  ucvector_cleanup(&deflated);
+  return result;
+}
+
+static unsigned addChunk_IEND(ucvector* out)
+{
+  unsigned error = 0;
+  error = addChunk(out, "IEND", 0, 0);
+  return error;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+
+/*
+Appends a tEXt chunk to out: the keyword, a 0 byte, then the text without its terminator.
+keyword and textstring must be 0-terminated strings; the keyword must be 1 to 79 characters long.
+Returns 0 on success, 89 for an invalid keyword size, otherwise the error code of addChunk.
+*/
+static unsigned addChunk_tEXt(ucvector* out, const char* keyword, const char* textstring)
+{
+  unsigned error = 0;
+  size_t i;
+  ucvector text;
+  ucvector_init(&text);
+  for(i = 0; keyword[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)keyword[i]);
+  if(i < 1 || i > 79)
+  {
+    /*the keyword bytes were already pushed, so release them before bailing out*/
+    ucvector_cleanup(&text);
+    return 89; /*error: invalid keyword size*/
+  }
+  ucvector_push_back(&text, 0); /*0 termination char*/
+  for(i = 0; textstring[i] != 0; ++i) ucvector_push_back(&text, (unsigned char)textstring[i]);
+  error = addChunk(out, "tEXt", text.data, text.size);
+  ucvector_cleanup(&text);
+
+  return error;
+}
+
+/*
+Appends a zTXt chunk to out: the keyword, a 0 byte, the compression method byte (0) and the
+text compressed with zlib_compress.
+keyword and textstring must be 0-terminated strings; the keyword must be 1 to 79 characters long.
+Returns 0 on success, 89 for an invalid keyword size, otherwise the error code of
+zlib_compress or addChunk.
+*/
+static unsigned addChunk_zTXt(ucvector* out, const char* keyword, const char* textstring,
+                              LodePNGCompressSettings* zlibsettings)
+{
+  unsigned error = 0;
+  ucvector data, compressed;
+  size_t i, textsize = strlen(textstring);
+
+  ucvector_init(&data);
+  ucvector_init(&compressed);
+  for(i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+  if(i < 1 || i > 79)
+  {
+    /*the keyword bytes were already pushed, so release them before bailing out*/
+    ucvector_cleanup(&compressed);
+    ucvector_cleanup(&data);
+    return 89; /*error: invalid keyword size*/
+  }
+  ucvector_push_back(&data, 0); /*0 termination char*/
+  ucvector_push_back(&data, 0); /*compression method: 0*/
+
+  error = zlib_compress(&compressed.data, &compressed.size,
+                        (unsigned char*)textstring, textsize, zlibsettings);
+  if(!error)
+  {
+    for(i = 0; i != compressed.size; ++i) ucvector_push_back(&data, compressed.data[i]);
+    error = addChunk(out, "zTXt", data.data, data.size);
+  }
+
+  ucvector_cleanup(&compressed);
+  ucvector_cleanup(&data);
+  return error;
+}
+
+/*
+Appends an iTXt chunk to out: keyword, 0 byte, compression flag, compression method (0),
+language tag, 0 byte, translated keyword, 0 byte, and then the text, which is compressed with
+zlib_compress when "compressed" is nonzero and copied as is otherwise.
+All strings must be 0-terminated; the keyword must be 1 to 79 characters long.
+Returns 0 on success, 89 for an invalid keyword size, otherwise the error code of
+zlib_compress or addChunk.
+*/
+static unsigned addChunk_iTXt(ucvector* out, unsigned compressed, const char* keyword, const char* langtag,
+                              const char* transkey, const char* textstring, LodePNGCompressSettings* zlibsettings)
+{
+  unsigned error = 0;
+  ucvector data;
+  size_t i, textsize = strlen(textstring);
+
+  ucvector_init(&data);
+
+  for(i = 0; keyword[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)keyword[i]);
+  if(i < 1 || i > 79)
+  {
+    /*the keyword bytes were already pushed, so release them before bailing out*/
+    ucvector_cleanup(&data);
+    return 89; /*error: invalid keyword size*/
+  }
+  ucvector_push_back(&data, 0); /*null termination char*/
+  ucvector_push_back(&data, compressed ? 1 : 0); /*compression flag*/
+  ucvector_push_back(&data, 0); /*compression method*/
+  for(i = 0; langtag[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)langtag[i]);
+  ucvector_push_back(&data, 0); /*null termination char*/
+  for(i = 0; transkey[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)transkey[i]);
+  ucvector_push_back(&data, 0); /*null termination char*/
+
+  if(compressed)
+  {
+    ucvector compressed_data;
+    ucvector_init(&compressed_data);
+    error = zlib_compress(&compressed_data.data, &compressed_data.size,
+                          (unsigned char*)textstring, textsize, zlibsettings);
+    if(!error)
+    {
+      for(i = 0; i != compressed_data.size; ++i) ucvector_push_back(&data, compressed_data.data[i]);
+    }
+    ucvector_cleanup(&compressed_data);
+  }
+  else /*not compressed*/
+  {
+    for(i = 0; textstring[i] != 0; ++i) ucvector_push_back(&data, (unsigned char)textstring[i]);
+  }
+
+  if(!error) error = addChunk(out, "iTXt", data.data, data.size);
+  ucvector_cleanup(&data);
+  return error;
+}
+
+/*
+Appends a bKGD chunk to out. The payload depends on info->color.colortype:
+- LCT_GREY, LCT_GREY_ALPHA: background_r as 2 bytes;
+- LCT_RGB, LCT_RGBA: background_r, background_g and background_b as 2 bytes each;
+- LCT_PALETTE: the low byte of background_r, used as palette index;
+- any other value: empty payload.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_bKGD(ucvector* out, const LodePNGInfo* info)
+{
+  unsigned result;
+  ucvector payload;
+
+  ucvector_init(&payload);
+
+  switch(info->color.colortype)
+  {
+    case LCT_GREY:
+    case LCT_GREY_ALPHA:
+      ucvector_push_back(&payload, (unsigned char)(info->background_r >> 8));
+      ucvector_push_back(&payload, (unsigned char)(info->background_r & 255));
+      break;
+    case LCT_RGB:
+    case LCT_RGBA:
+      ucvector_push_back(&payload, (unsigned char)(info->background_r >> 8));
+      ucvector_push_back(&payload, (unsigned char)(info->background_r & 255));
+      ucvector_push_back(&payload, (unsigned char)(info->background_g >> 8));
+      ucvector_push_back(&payload, (unsigned char)(info->background_g & 255));
+      ucvector_push_back(&payload, (unsigned char)(info->background_b >> 8));
+      ucvector_push_back(&payload, (unsigned char)(info->background_b & 255));
+      break;
+    case LCT_PALETTE:
+      ucvector_push_back(&payload, (unsigned char)(info->background_r & 255)); /*palette index*/
+      break;
+    default:
+      break;
+  }
+
+  result = addChunk(out, "bKGD", payload.data, payload.size);
+  ucvector_cleanup(&payload);
+
+  return result;
+}
+
+/*
+Appends a tIME chunk to out: the year as 2 bytes (most significant byte first) followed by one
+byte each for month, day, hour, minute and second.
+The 7-byte payload is built in a local array, so no allocation can fail here.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_tIME(ucvector* out, const LodePNGTime* time)
+{
+  unsigned char data[7];
+  data[0] = (unsigned char)(time->year >> 8);
+  data[1] = (unsigned char)(time->year & 255);
+  data[2] = (unsigned char)time->month;
+  data[3] = (unsigned char)time->day;
+  data[4] = (unsigned char)time->hour;
+  data[5] = (unsigned char)time->minute;
+  data[6] = (unsigned char)time->second;
+  return addChunk(out, "tIME", data, 7);
+}
+
+/*
+Appends a pHYs chunk to out: phys_x and phys_y as 32-bit integers followed by the phys_unit byte.
+Returns the error code of addChunk.
+*/
+static unsigned addChunk_pHYs(ucvector* out, const LodePNGInfo* info)
+{
+  unsigned result;
+  ucvector payload;
+
+  ucvector_init(&payload);
+  lodepng_add32bitInt(&payload, info->phys_x);
+  lodepng_add32bitInt(&payload, info->phys_y);
+  ucvector_push_back(&payload, info->phys_unit);
+
+  result = addChunk(out, "pHYs", payload.data, payload.size);
+  ucvector_cleanup(&payload);
+
+  return result;
+}
+
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Applies one PNG filter type to a single scanline.
+out: receives "length" filtered bytes; scanline: the unfiltered bytes of this row;
+prevline: the unfiltered previous row, or 0 if there is none (its bytes then count as 0);
+bytewidth: distance in bytes to the "left" neighbour used by the Sub, Average and Paeth filters;
+filterType: 0 = None, 1 = Sub, 2 = Up, 3 = Average, 4 = Paeth. For any other value nothing is written.
+All differences are stored in an unsigned char, so they wrap modulo 256.
+*/
+static void filterScanline(unsigned char* out, const unsigned char* scanline, const unsigned char* prevline,
+                           size_t length, size_t bytewidth, unsigned char filterType)
+{
+  size_t pos;
+
+  if(filterType == 0) /*None*/
+  {
+    for(pos = 0; pos != length; ++pos) out[pos] = scanline[pos];
+  }
+  else if(filterType == 1) /*Sub*/
+  {
+    /*the first bytewidth bytes have no left neighbour*/
+    for(pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+    for(pos = bytewidth; pos < length; ++pos) out[pos] = scanline[pos] - scanline[pos - bytewidth];
+  }
+  else if(filterType == 2) /*Up*/
+  {
+    if(!prevline)
+    {
+      for(pos = 0; pos != length; ++pos) out[pos] = scanline[pos];
+    }
+    else
+    {
+      for(pos = 0; pos != length; ++pos) out[pos] = scanline[pos] - prevline[pos];
+    }
+  }
+  else if(filterType == 3) /*Average*/
+  {
+    if(!prevline)
+    {
+      for(pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+      for(pos = bytewidth; pos < length; ++pos) out[pos] = scanline[pos] - (scanline[pos - bytewidth] >> 1);
+    }
+    else
+    {
+      for(pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos] - (prevline[pos] >> 1);
+      for(pos = bytewidth; pos < length; ++pos)
+      {
+        out[pos] = scanline[pos] - ((scanline[pos - bytewidth] + prevline[pos]) >> 1);
+      }
+    }
+  }
+  else if(filterType == 4) /*Paeth*/
+  {
+    if(!prevline)
+    {
+      for(pos = 0; pos != bytewidth; ++pos) out[pos] = scanline[pos];
+      /*paethPredictor(scanline[pos - bytewidth], 0, 0) is always scanline[pos - bytewidth]*/
+      for(pos = bytewidth; pos < length; ++pos) out[pos] = (scanline[pos] - scanline[pos - bytewidth]);
+    }
+    else
+    {
+      /*paethPredictor(0, prevline[pos], 0) is always prevline[pos]*/
+      for(pos = 0; pos != bytewidth; ++pos) out[pos] = (scanline[pos] - prevline[pos]);
+      for(pos = bytewidth; pos < length; ++pos)
+      {
+        out[pos] = (scanline[pos] - paethPredictor(scanline[pos - bytewidth], prevline[pos], prevline[pos - bytewidth]));
+      }
+    }
+  }
+  /*else: unexisting filter type given, out is left untouched*/
+}
+
+/*
+log2 approximation. A slight bit faster than std::log.
+The two loops scale f down until it is at most 2, adding 4 to the result for every division
+by 16 and 1 for every division by 2; the remainder is estimated with a cubic polynomial in f.
+*/
+static float flog2(float f)
+{
+  float exponent = 0;
+
+  for(; f > 32; f /= 16) exponent += 4;
+  for(; f > 2; f /= 2) ++exponent;
+
+  return exponent + 1.442695f * (f * f * f / 3 - 3 * f * f / 2 + 3 * f - 1.83333f);
+}
+
+static unsigned filter(unsigned char* out, const unsigned char* in, unsigned w, unsigned h,
+                       const LodePNGColorMode* info, const LodePNGEncoderSettings* settings)
+{
+  /*
+  For PNG filter method 0
+  out must be a buffer with as size: h + (w * h * bpp + 7) / 8, because there are
+  the scanlines with 1 extra byte per scanline
+  */
+
+  unsigned bpp = lodepng_get_bpp(info);
+  /*the width of a scanline in bytes, not including the filter type*/
+  size_t linebytes = (w * bpp + 7) / 8;
+  /*bytewidth is used for filtering, is 1 when bpp < 8, number of bytes per pixel otherwise*/
+  size_t bytewidth = (bpp + 7) / 8;
+  const unsigned char* prevline = 0;
+  unsigned x, y;
+  unsigned error = 0;
+  LodePNGFilterStrategy strategy = settings->filter_strategy;
+
+  /*
+  There is a heuristic called the minimum sum of absolute differences heuristic, suggested by the PNG standard:
+   *  If the image type is Palette, or the bit depth is smaller than 8, then do not filter the image (i.e.
+      use fixed filtering, with the filter None).
+   * (The other case) If the image type is Grayscale or RGB (with or without Alpha), and the bit depth is
+     not smaller than 8, then use adaptive filtering heuristic as follows: independently for each row, apply
+     all five filters and select the filter that produces the smallest sum of absolute values per row.
+  This heuristic is used if filter strategy is LFS_MINSUM and filter_palette_zero is true.
+
+  If filter_palette_zero is true and filter_strategy is not LFS_MINSUM, the above heuristic is followed,
+  but for "the other case", whatever strategy filter_strategy is set to instead of the minimum sum
+  heuristic is used.
+  */
+  if(settings->filter_palette_zero &&
+     (info->colortype == LCT_PALETTE || info->bitdepth < 8)) strategy = LFS_ZERO;
+
+  if(bpp == 0) return 31; /*error: invalid color type*/
+
+  if(strategy == LFS_ZERO)
+  {
+    for(y = 0; y != h; ++y)
+    {
+      size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+      size_t inindex = linebytes * y;
+      out[outindex] = 0; /*filter type byte*/
+      filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, 0);
+      prevline = &in[inindex];
+    }
+  }
+  else if(strategy == LFS_MINSUM)
+  {
+    /*adaptive filtering*/
+    size_t sum[5];
+    unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+    size_t smallest = 0;
+    unsigned char type, bestType = 0;
+
+    for(type = 0; type != 5; ++type)
+    {
+      attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+      if(!attempt[type]) return 83; /*alloc fail*/
+    }
+
+    if(!error)
+    {
+      for(y = 0; y != h; ++y)
+      {
+        /*try the 5 filter types*/
+        for(type = 0; type != 5; ++type)
+        {
+          filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+
+          /*calculate the sum of the result*/
+          sum[type] = 0;
+          if(type == 0)
+          {
+            for(x = 0; x != linebytes; ++x) sum[type] += (unsigned char)(attempt[type][x]);
+          }
+          else
+          {
+            for(x = 0; x != linebytes; ++x)
+            {
+              /*For differences, each byte should be treated as signed, values above 127 are negative
+              (converted to signed char). Filtertype 0 isn't a difference though, so use unsigned there.
+              This means filtertype 0 is almost never chosen, but that is justified.*/
+              unsigned char s = attempt[type][x];
+              sum[type] += s < 128 ? s : (255U - s);
+            }
+          }
+
+          /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+          if(type == 0 || sum[type] < smallest)
+          {
+            bestType = type;
+            smallest = sum[type];
+          }
+        }
+
+        prevline = &in[y * linebytes];
+
+        /*now fill the out values*/
+        out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+        for(x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+      }
+    }
+
+    for(type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+  }
+  else if(strategy == LFS_ENTROPY)
+  {
+    float sum[5];
+    unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+    float smallest = 0;
+    unsigned type, bestType = 0;
+    unsigned count[256];
+
+    for(type = 0; type != 5; ++type)
+    {
+      attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+      if(!attempt[type]) return 83; /*alloc fail*/
+    }
+
+    for(y = 0; y != h; ++y)
+    {
+      /*try the 5 filter types*/
+      for(type = 0; type != 5; ++type)
+      {
+        filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+        for(x = 0; x != 256; ++x) count[x] = 0;
+        for(x = 0; x != linebytes; ++x) ++count[attempt[type][x]];
+        ++count[type]; /*the filter type itself is part of the scanline*/
+        sum[type] = 0;
+        for(x = 0; x != 256; ++x)
+        {
+          float p = count[x] / (float)(linebytes + 1);
+          sum[type] += count[x] == 0 ? 0 : flog2(1 / p) * p;
+        }
+        /*check if this is smallest sum (or if type == 0 it's the first case so always store the values)*/
+        if(type == 0 || sum[type] < smallest)
+        {
+          bestType = type;
+          smallest = sum[type];
+        }
+      }
+
+      prevline = &in[y * linebytes];
+
+      /*now fill the out values*/
+      out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+      for(x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+    }
+
+    for(type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+  }
+  else if(strategy == LFS_PREDEFINED)
+  {
+    for(y = 0; y != h; ++y)
+    {
+      size_t outindex = (1 + linebytes) * y; /*the extra filterbyte added to each row*/
+      size_t inindex = linebytes * y;
+      unsigned char type = settings->predefined_filters[y];
+      out[outindex] = type; /*filter type byte*/
+      filterScanline(&out[outindex + 1], &in[inindex], prevline, linebytes, bytewidth, type);
+      prevline = &in[inindex];
+    }
+  }
+  else if(strategy == LFS_BRUTE_FORCE)
+  {
+    /*brute force filter chooser.
+    deflate the scanline after every filter attempt to see which one deflates best.
+    This is very slow and gives only slightly smaller, sometimes even larger, result*/
+    size_t size[5];
+    unsigned char* attempt[5]; /*five filtering attempts, one for each filter type*/
+    size_t smallest = 0;
+    unsigned type = 0, bestType = 0;
+    unsigned char* dummy;
+    LodePNGCompressSettings zlibsettings = settings->zlibsettings;
+    /*use fixed tree on the attempts so that the tree is not adapted to the filtertype on purpose,
+    to simulate the true case where the tree is the same for the whole image. Sometimes it gives
+    better result with dynamic tree anyway. Using the fixed tree sometimes gives worse, but in rare
+    cases better compression. It does make this a bit less slow, so it's worth doing this.*/
+    zlibsettings.btype = 1;
+    /*a custom encoder likely doesn't read the btype setting and is optimized for complete PNG
+    images only, so disable it*/
+    zlibsettings.custom_zlib = 0;
+    zlibsettings.custom_deflate = 0;
+    for(type = 0; type != 5; ++type)
+    {
+      attempt[type] = (unsigned char*)lodepng_malloc(linebytes);
+      if(!attempt[type]) return 83; /*alloc fail*/
+    }
+    for(y = 0; y != h; ++y) /*try the 5 filter types*/
+    {
+      for(type = 0; type != 5; ++type)
+      {
+        unsigned testsize = linebytes;
+        /*if(testsize > 8) testsize /= 8;*/ /*it already works good enough by testing a part of the row*/
+
+        filterScanline(attempt[type], &in[y * linebytes], prevline, linebytes, bytewidth, type);
+        size[type] = 0;
+        dummy = 0;
+        zlib_compress(&dummy, &size[type], attempt[type], testsize, &zlibsettings);
+        lodepng_free(dummy);
+        /*check if this is smallest size (or if type == 0 it's the first case so always store the values)*/
+        if(type == 0 || size[type] < smallest)
+        {
+          bestType = type;
+          smallest = size[type];
+        }
+      }
+      prevline = &in[y * linebytes];
+      out[y * (linebytes + 1)] = bestType; /*the first byte of a scanline will be the filter type*/
+      for(x = 0; x != linebytes; ++x) out[y * (linebytes + 1) + 1 + x] = attempt[bestType][x];
+    }
+    for(type = 0; type != 5; ++type) lodepng_free(attempt[type]);
+  }
+  else return 88; /* unknown filter strategy */
+
+  return error;
+}
+
+static void addPaddingBits(unsigned char* out, const unsigned char* in,
+                           size_t olinebits, size_t ilinebits, unsigned h)
+{
+  /*The opposite of the removePaddingBits function: copies h rows of ilinebits bits each from in to out
+  and appends (olinebits - ilinebits) zero bits after every row. olinebits must be >= ilinebits*/
+  unsigned y;
+  size_t diff = olinebits - ilinebits; /*padding bits per row*/
+  size_t obp = 0, ibp = 0; /*bit pointers (positions in out and in); handed by address to the bit helpers*/
+  for(y = 0; y != h; ++y)
+  {
+    size_t x;
+    for(x = 0; x < ilinebits; ++x)
+    {
+      unsigned char bit = readBitFromReversedStream(&ibp, in);
+      setBitOfReversedStream(&obp, out, bit);
+    }
+    /*obp += diff; --> no, fill in some value in the padding bits too, to avoid
+    "Use of uninitialised value of size ###" warning from valgrind*/
+    for(x = 0; x != diff; ++x) setBitOfReversedStream(&obp, out, 0);
+  }
+}
+
+/*
+in: non-interlaced image with size w*h
+out: the same pixels, but re-ordered according to PNG's Adam7 interlacing, with
+ no padding bits between scanlines, but between reduced images so that each
+ reduced image starts at a byte.
+bpp: bits per pixel
+in itself has no padding bits at all: its rows are addressed back to back, w * bpp bits each
+in has the following size in bits: w * h * bpp.
+out is possibly bigger due to padding bits between reduced images
+NOTE: comments about padding bits are only relevant if bpp < 8
+*/
+static void Adam7_interlace(unsigned char* out, const unsigned char* in, unsigned w, unsigned h, unsigned bpp)
+{
+  unsigned passw[7], passh[7];
+  size_t filter_passstart[8], padded_passstart[8], passstart[8];
+  unsigned i;
+
+  Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+  if(bpp >= 8)
+  {
+    for(i = 0; i != 7; ++i)
+    {
+      unsigned x, y, b;
+      size_t bytewidth = bpp / 8;
+      for(y = 0; y < passh[i]; ++y)
+      for(x = 0; x < passw[i]; ++x)
+      {
+        size_t pixelinstart = ((ADAM7_IY[i] + y * ADAM7_DY[i]) * w + ADAM7_IX[i] + x * ADAM7_DX[i]) * bytewidth;
+        size_t pixeloutstart = passstart[i] + (y * passw[i] + x) * bytewidth;
+        for(b = 0; b < bytewidth; ++b)
+        {
+          out[pixeloutstart + b] = in[pixelinstart + b];
+        }
+      }
+    }
+  }
+  else /*bpp < 8: Adam7 with pixels < 8 bit is a bit trickier: with bit pointers*/
+  {
+    for(i = 0; i != 7; ++i)
+    {
+      unsigned x, y, b;
+      unsigned ilinebits = bpp * passw[i]; /*bits per row of pass i; despite the name it is used for the out position*/
+      unsigned olinebits = bpp * w; /*bits per row of the full image; despite the name it is used for the in position*/
+      size_t obp, ibp; /*bit pointers (for out and in buffer)*/
+      for(y = 0; y < passh[i]; ++y)
+      for(x = 0; x < passw[i]; ++x)
+      {
+        ibp = (ADAM7_IY[i] + y * ADAM7_DY[i]) * olinebits + (ADAM7_IX[i] + x * ADAM7_DX[i]) * bpp;
+        obp = (8 * passstart[i]) + (y * ilinebits + x * bpp);
+        for(b = 0; b < bpp; ++b)
+        {
+          unsigned char bit = readBitFromReversedStream(&ibp, in);
+          setBitOfReversedStream(&obp, out, bit);
+        }
+      }
+    }
+  }
+}
+
+/*in must contain the full image. *out receives a buffer allocated here with lodepng_malloc that holds the
+uncompressed IDAT chunk data, *outsize its size in bytes. This function does not free *out when it fails.
+return value is error**/
+static unsigned preProcessScanlines(unsigned char** out, size_t* outsize, const unsigned char* in,
+                                    unsigned w, unsigned h,
+                                    const LodePNGInfo* info_png, const LodePNGEncoderSettings* settings)
+{
+  /*This function converts the pure 2D image with the PNG's colortype, into filtered-padded-interlaced data. Steps:
+  *) if no Adam7: 1) add padding bits (= possible extra bits per scanline if bpp < 8) 2) filter
+  *) if adam7: 1) Adam7_interlace 2) 7x add padding bits 3) 7x filter*/
+  unsigned bpp = lodepng_get_bpp(&info_png->color);
+  size_t linebytes = ((size_t)w * bpp + 7) / 8; /*bytes per padded scanline; size_t so products with h do not wrap at 32 bits*/
+  unsigned error = 0;
+
+  if(info_png->interlace_method == 0)
+  {
+    *outsize = h + h * linebytes; /*image size plus an extra byte per scanline + possible padding bits*/
+    *out = (unsigned char*)lodepng_malloc(*outsize);
+    if(!(*out) && (*outsize)) error = 83; /*alloc fail*/
+
+    if(!error)
+    {
+      /*non multiple of 8 bits per scanline, padding bits needed per scanline*/
+      if(bpp < 8 && (size_t)w * bpp != linebytes * 8)
+      {
+        unsigned char* padded = (unsigned char*)lodepng_malloc(h * linebytes);
+        if(!padded) error = 83; /*alloc fail*/
+        if(!error)
+        {
+          addPaddingBits(padded, in, linebytes * 8, (size_t)w * bpp, h);
+          error = filter(*out, padded, w, h, &info_png->color, settings);
+        }
+        lodepng_free(padded);
+      }
+      else
+      {
+        /*we can immediately filter into the out buffer, no other steps needed*/
+        error = filter(*out, in, w, h, &info_png->color, settings);
+      }
+    }
+  }
+  else /*interlace_method is 1 (Adam7)*/
+  {
+    unsigned passw[7], passh[7];
+    size_t filter_passstart[8], padded_passstart[8], passstart[8];
+    unsigned char* adam7;
+
+    Adam7_getpassvalues(passw, passh, filter_passstart, padded_passstart, passstart, w, h, bpp);
+
+    *outsize = filter_passstart[7]; /*image size plus an extra byte per scanline + possible padding bits*/
+    *out = (unsigned char*)lodepng_malloc(*outsize);
+    if(!(*out)) error = 83; /*alloc fail*/
+
+    adam7 = (unsigned char*)lodepng_malloc(passstart[7]);
+    if(!adam7 && passstart[7]) error = 83; /*alloc fail*/
+
+    if(!error)
+    {
+      unsigned i;
+
+      Adam7_interlace(adam7, in, w, h, bpp);
+      for(i = 0; i != 7; ++i)
+      {
+        if(bpp < 8)
+        {
+          unsigned char* padded = (unsigned char*)lodepng_malloc(padded_passstart[i + 1] - padded_passstart[i]);
+          if(!padded) ERROR_BREAK(83); /*alloc fail*/
+          addPaddingBits(padded, &adam7[passstart[i]],
+                         (((size_t)passw[i] * bpp + 7) / 8) * 8, (size_t)passw[i] * bpp, passh[i]);
+          error = filter(&(*out)[filter_passstart[i]], padded,
+                         passw[i], passh[i], &info_png->color, settings);
+          lodepng_free(padded);
+        }
+        else
+        {
+          error = filter(&(*out)[filter_passstart[i]], &adam7[padded_passstart[i]],
+                         passw[i], passh[i], &info_png->color, settings);
+        }
+
+        if(error) break;
+      }
+    }
+
+    lodepng_free(adam7);
+  }
+
+  return error;
+}
+
+/*palette must have 4 * palettesize bytes allocated, and given in format RGBARGBARGBARGBA...
+returns 0 if the palette is opaque (every alpha is 255, or palettesize is 0),
+returns 1 if exactly one entry has alpha 0, all others have alpha 255 and none of those shares its RGB
+ ==> color key,
+returns 2 in every other case (partial alpha, several alpha 0 entries, or an opaque entry with the key's RGB).*/
+static unsigned getPaletteTranslucency(const unsigned char* palette, size_t palettesize)
+{
+  size_t i;
+  size_t keyindex = palettesize; /*index of the entry with alpha 0; palettesize means none found yet*/
+  for(i = 0; i != palettesize; ++i)
+  {
+    unsigned char alpha = palette[4 * i + 3];
+    if(alpha == 255) continue;
+    if(alpha != 0 || keyindex != palettesize) return 2; /*partial alpha, or a second fully transparent entry*/
+    keyindex = i; /*remember it instead of rescanning: the rescan used to reject the key entry itself*/
+  }
+  if(keyindex == palettesize) return 0; /*everything is opaque*/
+  for(i = 0; i != palettesize; ++i)
+  {
+    const unsigned char* c = &palette[4 * i];
+    const unsigned char* k = &palette[4 * keyindex];
+    if(i != keyindex && c[0] == k[0] && c[1] == k[1] && c[2] == k[2]) return 2; /*no opaque RGB may have key's RGB*/
+  }
+  return 1;
+}
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+static unsigned addUnknownChunks(ucvector* out, unsigned char* data, size_t datasize)
+{ /*walks the chunks stored back to back in data[0..datasize) and appends each one to out; returns 0 at the end*/
+  unsigned char* inchunk = data;
+  while((size_t)(inchunk - data) < datasize)
+  {
+    CERROR_TRY_RETURN(lodepng_chunk_append(&out->data, &out->size, inchunk)); /*presumably returns early on error*/
+    out->allocsize = out->size; /*fix the allocsize again*/
+    inchunk = lodepng_chunk_next(inchunk);
+  }
+  return 0;
+}
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Encodes the raw pixels of image (w x h, stored as described by state->info_raw) into a complete PNG.
+The color mode and ancillary data of the result come from a private copy of state->info_png, which
+lodepng_auto_choose_color may adjust when state->encoder.auto_convert is set.
+On return *out / *outsize hold whatever was written to the output vector, to be released by the caller
+with lodepng_free; both are 0 when the settings are rejected before any output is produced.
+Note that a partially written output can be handed back together with a non-zero error code.
+The return value is the error code (0 = ok), which is also stored in state->error.*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+                        const unsigned char* image, unsigned w, unsigned h,
+                        LodePNGState* state)
+{
+  LodePNGInfo info;
+  ucvector outv;
+  unsigned char* data = 0; /*uncompressed version of the IDAT chunk data*/
+  size_t datasize = 0;
+
+  /*provide some proper output values if error will happen*/
+  *out = 0;
+  *outsize = 0;
+  state->error = 0;
+
+  lodepng_info_init(&info);
+  lodepng_info_copy(&info, &state->info_png);
+
+  /*validate the settings; every failure ends up in the single early exit below, which releases info*/
+  if((info.color.colortype == LCT_PALETTE || state->encoder.force_palette)
+      && (info.color.palettesize == 0 || info.color.palettesize > 256))
+  {
+    state->error = 68; /*invalid palette size, it is only allowed to be 1-256*/
+  }
+  else if(state->encoder.auto_convert)
+  {
+    state->error = lodepng_auto_choose_color(&info.color, image, w, h, &state->info_raw);
+  }
+
+  /*the remaining checks keep their original order and only the first failure is reported*/
+  if(!state->error && state->encoder.zlibsettings.btype > 2) state->error = 61; /*error: unexisting btype*/
+  if(!state->error && state->info_png.interlace_method > 1) state->error = 71; /*error: unexisting interlace mode*/
+  /*a checkColorValidity error means: unexisting color type given*/
+  if(!state->error) state->error = checkColorValidity(info.color.colortype, info.color.bitdepth);
+  if(!state->error) state->error = checkColorValidity(state->info_raw.colortype, state->info_raw.bitdepth);
+  if(state->error)
+  {
+    /*info is normally released at the end of this function; an early exit has to release it as well,
+    otherwise whatever lodepng_info_copy stored in it would never be freed*/
+    lodepng_info_cleanup(&info);
+    return state->error;
+  }
+
+  if(!lodepng_color_mode_equal(&state->info_raw, &info.color))
+  {
+    unsigned char* converted;
+    /*multiply in size_t: w * h in unsigned arithmetic wraps around for large images*/
+    size_t size = ((size_t)w * h * lodepng_get_bpp(&info.color) + 7) / 8;
+
+    converted = (unsigned char*)lodepng_malloc(size);
+    if(!converted && size) state->error = 83; /*alloc fail*/
+    if(!state->error)
+    {
+      state->error = lodepng_convert(converted, image, &info.color, &state->info_raw, w, h);
+    }
+    /*keep the result: after a failed preProcessScanlines no IDAT chunk may be written from data*/
+    if(!state->error) state->error = preProcessScanlines(&data, &datasize, converted, w, h, &info, &state->encoder);
+    lodepng_free(converted);
+  }
+  else state->error = preProcessScanlines(&data, &datasize, image, w, h, &info, &state->encoder);
+
+  ucvector_init(&outv);
+  while(!state->error) /*while only executed once, to break on error*/
+  {
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    size_t i;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    /*write signature and chunks*/
+    writeSignature(&outv);
+    /*IHDR*/
+    addChunk_IHDR(&outv, w, h, info.color.colortype, info.color.bitdepth, info.interlace_method);
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*unknown chunks between IHDR and PLTE*/
+    if(info.unknown_chunks_data[0])
+    {
+      state->error = addUnknownChunks(&outv, info.unknown_chunks_data[0], info.unknown_chunks_size[0]);
+      if(state->error) break;
+    }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    /*PLTE*/
+    if(info.color.colortype == LCT_PALETTE)
+    {
+      addChunk_PLTE(&outv, &info.color);
+    }
+    if(state->encoder.force_palette && (info.color.colortype == LCT_RGB || info.color.colortype == LCT_RGBA))
+    {
+      addChunk_PLTE(&outv, &info.color);
+    }
+    /*tRNS*/
+    if(info.color.colortype == LCT_PALETTE && getPaletteTranslucency(info.color.palette, info.color.palettesize) != 0)
+    {
+      addChunk_tRNS(&outv, &info.color);
+    }
+    if((info.color.colortype == LCT_GREY || info.color.colortype == LCT_RGB) && info.color.key_defined)
+    {
+      addChunk_tRNS(&outv, &info.color);
+    }
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*bKGD (must come between PLTE and the IDAt chunks*/
+    if(info.background_defined) addChunk_bKGD(&outv, &info);
+    /*pHYs (must come before the IDAT chunks)*/
+    if(info.phys_defined) addChunk_pHYs(&outv, &info);
+
+    /*unknown chunks between PLTE and IDAT*/
+    if(info.unknown_chunks_data[1])
+    {
+      state->error = addUnknownChunks(&outv, info.unknown_chunks_data[1], info.unknown_chunks_size[1]);
+      if(state->error) break;
+    }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    /*IDAT (multiple IDAT chunks must be consecutive)*/
+    state->error = addChunk_IDAT(&outv, data, datasize, &state->encoder.zlibsettings);
+    if(state->error) break;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+    /*tIME*/
+    if(info.time_defined) addChunk_tIME(&outv, &info.time);
+    /*tEXt and/or zTXt*/
+    for(i = 0; i != info.text_num; ++i)
+    {
+      size_t keylen = strlen(info.text_keys[i]);
+      if(keylen > 79) state->error = 66; /*text chunk too large*/
+      else if(keylen < 1) state->error = 67; /*text chunk too small*/
+      if(state->error) break;
+      if(state->encoder.text_compression)
+      {
+        addChunk_zTXt(&outv, info.text_keys[i], info.text_strings[i], &state->encoder.zlibsettings);
+      }
+      else
+      {
+        addChunk_tEXt(&outv, info.text_keys[i], info.text_strings[i]);
+      }
+    }
+    /*the break above only leaves the for loop; stop here, otherwise more chunks get written and a later
+    addUnknownChunks result can overwrite the error*/
+    if(state->error) break;
+    /*LodePNG version id in text chunk*/
+    if(state->encoder.add_id)
+    {
+      unsigned alread_added_id_text = 0;
+      for(i = 0; i != info.text_num; ++i)
+      {
+        if(!strcmp(info.text_keys[i], "LodePNG"))
+        {
+          alread_added_id_text = 1;
+          break;
+        }
+      }
+      if(alread_added_id_text == 0)
+      {
+        addChunk_tEXt(&outv, "LodePNG", LODEPNG_VERSION_STRING); /*it's shorter as tEXt than as zTXt chunk*/
+      }
+    }
+    /*iTXt*/
+    for(i = 0; i != info.itext_num; ++i)
+    {
+      size_t keylen = strlen(info.itext_keys[i]);
+      if(keylen > 79) state->error = 66; /*text chunk too large*/
+      else if(keylen < 1) state->error = 67; /*text chunk too small*/
+      if(state->error) break;
+      addChunk_iTXt(&outv, state->encoder.text_compression,
+                    info.itext_keys[i], info.itext_langtags[i], info.itext_transkeys[i], info.itext_strings[i],
+                    &state->encoder.zlibsettings);
+    }
+    if(state->error) break; /*leave the while too, for the same reason as after the tEXt/zTXt loop*/
+
+    /*unknown chunks between IDAT and IEND*/
+    if(info.unknown_chunks_data[2])
+    {
+      state->error = addUnknownChunks(&outv, info.unknown_chunks_data[2], info.unknown_chunks_size[2]);
+      if(state->error) break;
+    }
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+    addChunk_IEND(&outv);
+
+    break; /*this isn't really a while loop; no error happened so break out now!*/
+  }
+
+  /*info and data only live for the duration of this call*/
+  lodepng_info_cleanup(&info);
+  lodepng_free(data);
+  /*instead of cleaning the vector up, give it to the output*/
+  *out = outv.data;
+  *outsize = outv.size;
+
+  return state->error;
+}
+
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize, const unsigned char* image,
+                               unsigned w, unsigned h, LodePNGColorType colortype, unsigned bitdepth)
+{
+  LodePNGState st; /*throwaway state that only lives for the duration of this call*/
+  unsigned result;
+  lodepng_state_init(&st);
+  st.info_png.color.colortype = colortype; /*the PNG color mode is set to the same values as the raw one*/
+  st.info_png.color.bitdepth = bitdepth;
+  st.info_raw.colortype = colortype;
+  st.info_raw.bitdepth = bitdepth;
+  lodepng_encode(out, outsize, image, w, h, &st);
+  result = st.error; /*read the code before the state is cleaned up*/
+  lodepng_state_cleanup(&st);
+  return result;
+}
+
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{ /*convenience wrapper: forwards to lodepng_encode_memory with color type LCT_RGBA and bit depth 8*/
+  return lodepng_encode_memory(out, outsize, image, w, h, LCT_RGBA, 8);
+}
+
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize, const unsigned char* image, unsigned w, unsigned h)
+{ /*convenience wrapper: forwards to lodepng_encode_memory with color type LCT_RGB and bit depth 8*/
+  return lodepng_encode_memory(out, outsize, image, w, h, LCT_RGB, 8);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned lodepng_encode_file(const char* filename, const unsigned char* image, unsigned w, unsigned h,
+                             LodePNGColorType colortype, unsigned bitdepth)
+{
+  unsigned char* png; /*encoded file contents, assigned by lodepng_encode_memory*/
+  size_t pngsize;
+  unsigned result = lodepng_encode_memory(&png, &pngsize, image, w, h, colortype, bitdepth);
+  if(result == 0) result = lodepng_save_file(png, pngsize, filename); /*only a successful encoding is saved*/
+  lodepng_free(png);
+  return result;
+}
+
+unsigned lodepng_encode32_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{ /*convenience wrapper: forwards to lodepng_encode_file with color type LCT_RGBA and bit depth 8*/
+  return lodepng_encode_file(filename, image, w, h, LCT_RGBA, 8);
+}
+
+unsigned lodepng_encode24_file(const char* filename, const unsigned char* image, unsigned w, unsigned h)
+{ /*convenience wrapper: forwards to lodepng_encode_file with color type LCT_RGB and bit depth 8*/
+  return lodepng_encode_file(filename, image, w, h, LCT_RGB, 8);
+}
+#endif /*LODEPNG_COMPILE_DISK*/
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings)
+{
+  settings->predefined_filters = 0; /*no predefined filter list*/
+  settings->force_palette = 0;
+  settings->auto_convert = 1;
+  settings->filter_strategy = LFS_MINSUM;
+  settings->filter_palette_zero = 1;
+  lodepng_compress_settings_init(&settings->zlibsettings); /*the nested zlib settings get their own defaults*/
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  settings->text_compression = 1;
+  settings->add_id = 0;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+}
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*
+This returns the description of a numerical error code in English (a string literal, or
+"unknown error code" for codes without a case). This is also the documentation of all the error codes.
+*/
+const char* lodepng_error_text(unsigned code)
+{
+  switch(code)
+  {
+    case 0: return "no error, everything went ok";
+    case 1: return "nothing done yet"; /*the Encoder/Decoder has done nothing yet, error checking makes no sense yet*/
+    case 10: return "end of input memory reached without huffman end code"; /*while huffman decoding*/
+    case 11: return "error in code tree made it jump outside of huffman tree"; /*while huffman decoding*/
+    case 13: return "problem while processing dynamic deflate block";
+    case 14: return "problem while processing dynamic deflate block";
+    case 15: return "problem while processing dynamic deflate block";
+    case 16: return "unexisting code while processing dynamic deflate block";
+    case 17: return "end of out buffer memory reached while inflating";
+    case 18: return "invalid distance code while inflating";
+    case 19: return "end of out buffer memory reached while inflating";
+    case 20: return "invalid deflate block BTYPE encountered while decoding";
+    case 21: return "NLEN is not ones complement of LEN in a deflate block";
+     /*about error 22, end of out buffer memory reached while inflating:
+     This can happen if the inflated deflate data is longer than the amount of bytes required to fill up
+     all the pixels of the image, given the color depth and image dimensions. Something that doesn't
+     happen in a normal, well encoded, PNG image.*/
+    case 22: return "end of out buffer memory reached while inflating";
+    case 23: return "end of in buffer memory reached while inflating";
+    case 24: return "invalid FCHECK in zlib header";
+    case 25: return "invalid compression method in zlib header";
+    case 26: return "FDICT encountered in zlib header while it's not used for PNG";
+    case 27: return "PNG file is smaller than a PNG header";
+    /*Checks the magic file header, the first 8 bytes of the PNG file*/
+    case 28: return "incorrect PNG signature, it's no PNG or corrupted";
+    case 29: return "first chunk is not the header chunk";
+    case 30: return "chunk length too large, chunk broken off at end of file";
+    case 31: return "illegal PNG color type or bpp";
+    case 32: return "illegal PNG compression method";
+    case 33: return "illegal PNG filter method";
+    case 34: return "illegal PNG interlace method";
+    case 35: return "chunk length of a chunk is too large or the chunk too small";
+    case 36: return "illegal PNG filter type encountered";
+    case 37: return "illegal bit depth for this color type given";
+    case 38: return "the palette is too big"; /*more than 256 colors*/
+    case 39: return "more palette alpha values given in tRNS chunk than there are colors in the palette";
+    case 40: return "tRNS chunk has wrong size for greyscale image";
+    case 41: return "tRNS chunk has wrong size for RGB image";
+    case 42: return "tRNS chunk appeared while it was not allowed for this color type";
+    case 43: return "bKGD chunk has wrong size for palette image";
+    case 44: return "bKGD chunk has wrong size for greyscale image";
+    case 45: return "bKGD chunk has wrong size for RGB image";
+    case 48: return "empty input buffer given to decoder. Maybe caused by non-existing file?";
+    case 49: return "jumped past memory while generating dynamic huffman tree";
+    case 50: return "jumped past memory while generating dynamic huffman tree";
+    case 51: return "jumped past memory while inflating huffman block";
+    case 52: return "jumped past memory while inflating";
+    case 53: return "size of zlib data too small";
+    case 54: return "repeat symbol in tree while there was no value symbol yet";
+    /*about error 55, jumped past tree while generating huffman tree: this could be when the
+    tree will have more leaves than symbols after generating it out of the
+    given lengths. They call this an oversubscribed dynamic bit lengths tree in zlib.*/
+    case 55: return "jumped past tree while generating huffman tree";
+    case 56: return "given output image colortype or bitdepth not supported for color conversion";
+    case 57: return "invalid CRC encountered (checking CRC can be disabled)";
+    case 58: return "invalid ADLER32 encountered (checking ADLER32 can be disabled)";
+    case 59: return "requested color conversion not supported";
+    case 60: return "invalid window size given in the settings of the encoder (must be 0-32768)";
+    case 61: return "invalid BTYPE given in the settings of the encoder (only 0, 1 and 2 are allowed)";
+    /*LodePNG leaves the choice of RGB to greyscale conversion formula to the user.*/
+    case 62: return "conversion from color to greyscale not supported";
+    case 63: return "length of a chunk too long, max allowed for PNG is 2147483647 bytes per chunk"; /*(2^31-1)*/
+    /*this would result in the inability of a deflated block to ever contain an end code. It must be at least 1.*/
+    case 64: return "the length of the END symbol 256 in the Huffman tree is 0";
+    case 66: return "the length of a text chunk keyword given to the encoder is longer than the maximum of 79 bytes";
+    case 67: return "the length of a text chunk keyword given to the encoder is smaller than the minimum of 1 byte";
+    case 68: return "tried to encode a PLTE chunk with a palette that has less than 1 or more than 256 colors";
+    case 69: return "unknown chunk type with 'critical' flag encountered by the decoder";
+    case 71: return "unexisting interlace mode given to encoder (must be 0 or 1)";
+    case 72: return "while decoding, unexisting compression method encountering in zTXt or iTXt chunk (it must be 0)";
+    case 73: return "invalid tIME chunk size";
+    case 74: return "invalid pHYs chunk size";
+    /*length could be wrong, or data chopped off*/
+    case 75: return "no null termination char found while decoding text chunk";
+    case 76: return "iTXt chunk too short to contain required bytes";
+    case 77: return "integer overflow in buffer size";
+    case 78: return "failed to open file for reading"; /*file doesn't exist or couldn't be opened for reading*/
+    case 79: return "failed to open file for writing";
+    case 80: return "tried creating a tree of 0 symbols";
+    case 81: return "lazy matching at pos 0 is impossible";
+    case 82: return "color conversion to palette requested while a color isn't in palette";
+    case 83: return "memory allocation failed";
+    case 84: return "given image too small to contain all pixels to be encoded";
+    case 86: return "impossible offset in lz77 encoding (internal bug)";
+    case 87: return "must provide custom zlib function pointer if LODEPNG_COMPILE_ZLIB is not defined";
+    case 88: return "invalid filter strategy given for LodePNGEncoderSettings.filter_strategy";
+    case 89: return "text chunk keyword too short or long: must have size 1-79";
+    /*the windowsize in the LodePNGCompressSettings. Requiring POT(==> & instead of %) makes encoding 12% faster.*/
+    case 90: return "windowsize must be a power of two";
+    case 91: return "invalid decompressed idat size";
+    case 92: return "too many pixels, not supported";
+    case 93: return "zero width or height is invalid";
+    case 94: return "header chunk must have a size of 13 bytes";
+  }
+  return "unknown error code";
+}
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* // C++ Wrapper                                                          // */
+/* ////////////////////////////////////////////////////////////////////////// */
+/* ////////////////////////////////////////////////////////////////////////// */
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename)
+{
+  const long filesize = lodepng_filesize(filename.c_str());
+  if(filesize < 0) return 78; /*78: failed to open file for reading*/
+  buffer.resize((size_t)filesize);
+  return filesize != 0 ? lodepng_buffer_file(&buffer[0], (size_t)filesize, filename.c_str()) : 0;
+}
+
+/*write given buffer to the file, overwriting the file, it doesn't append to it.*/
+unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename)
+{ /*an empty vector is passed on as a null pointer with size 0, so &buffer[0] is never taken on it*/
+  return lodepng_save_file(buffer.empty() ? 0 : &buffer[0], buffer.size(), filename.c_str());
+}
+#endif /* LODEPNG_COMPILE_DISK */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+                    const LodePNGDecompressSettings& settings)
+{
+  unsigned char* inflated = 0; /*C buffer filled by zlib_decompress*/
+  size_t inflatedsize = 0;
+  const unsigned error = zlib_decompress(&inflated, &inflatedsize, in, insize, &settings);
+  if(inflated != 0)
+  {
+    out.insert(out.end(), inflated, inflated + inflatedsize); /*appended to out, also when error is set*/
+    lodepng_free(inflated);
+  }
+  return error;
+}
+
+unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+                    const LodePNGDecompressSettings& settings)
+{ /*vector overload: forwards to the pointer overload; an empty in is passed as a null pointer with size 0*/
+  return decompress(out, in.empty() ? 0 : &in[0], in.size(), settings);
+}
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+                  const LodePNGCompressSettings& settings)
+{
+  unsigned char* deflated = 0; /*C buffer filled by zlib_compress*/
+  size_t deflatedsize = 0;
+  const unsigned error = zlib_compress(&deflated, &deflatedsize, in, insize, &settings);
+  if(deflated != 0)
+  {
+    out.insert(out.end(), deflated, deflated + deflatedsize); /*appended to out, also when error is set*/
+    lodepng_free(deflated);
+  }
+  return error;
+}
+
+unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+                  const LodePNGCompressSettings& settings)
+{ /*vector overload: forwards to the pointer overload; an empty in is passed as a null pointer with size 0*/
+  return compress(out, in.empty() ? 0 : &in[0], in.size(), settings);
+}
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+
+
+#ifdef LODEPNG_COMPILE_PNG
+
+State::State()
+{ /*default constructor: hands this object to lodepng_state_init*/
+  lodepng_state_init(this);
+}
+
+State::State(const State& other)
+{ /*copy constructor: initialise first, then let lodepng_state_copy copy other into this object*/
+  lodepng_state_init(this);
+  lodepng_state_copy(this, &other);
+}
+
+State::~State()
+{ /*destructor: hands this object to lodepng_state_cleanup*/
+  lodepng_state_cleanup(this);
+}
+
+State& State::operator=(const State& other)
+{ /*copy assignment via lodepng_state_copy; assigning a state to itself is a no-op*/
+  if(this != &other) lodepng_state_copy(this, &other); /*the copy routine is not assumed to cope with dest == source*/
+  return *this;
+}
+
+#ifdef LODEPNG_COMPILE_DECODER
+
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const unsigned char* in,
+                size_t insize, LodePNGColorType colortype, unsigned bitdepth)
+{
+  unsigned char* buffer = 0; /*decoded pixels; freed below on every path*/
+  unsigned error = lodepng_decode_memory(&buffer, &w, &h, in, insize, colortype, bitdepth);
+  if(buffer && !error)
+  {
+    State state; /*only used to let lodepng_get_raw_size compute the byte count for colortype/bitdepth*/
+    state.info_raw.colortype = colortype;
+    state.info_raw.bitdepth = bitdepth;
+    size_t buffersize = lodepng_get_raw_size(w, h, &state.info_raw);
+    out.insert(out.end(), &buffer[0], &buffer[buffersize]);
+  }
+  lodepng_free(buffer); /*outside the if, like the State overload: a buffer returned with an error must not leak*/
+  return error;
+}
+
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                const std::vector<unsigned char>& in, LodePNGColorType colortype, unsigned bitdepth)
+{ /*vector overload; the size is passed as size_t, without the former cast to unsigned that could truncate it*/
+  return decode(out, w, h, in.empty() ? 0 : &in[0], in.size(), colortype, bitdepth);
+}
+
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                State& state,
+                const unsigned char* in, size_t insize)
+{
+  unsigned char* pixels = NULL; /*decoded image, owned by this function*/
+  const unsigned error = lodepng_decode(&pixels, &w, &h, &state, in, insize);
+  if(!error && pixels)
+  {
+    const size_t pixelbytes = lodepng_get_raw_size(w, h, &state.info_raw);
+    out.insert(out.end(), pixels, pixels + pixelbytes); /*only a successful decode is appended to out*/
+  }
+  lodepng_free(pixels);
+  return error;
+}
+
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                State& state,
+                const std::vector<unsigned char>& in)
+{ /*vector overload: forwards to the pointer overload; an empty in is passed as a null pointer with size 0*/
+  return decode(out, w, h, state, in.empty() ? 0 : &in[0], in.size());
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h, const std::string& filename,
+                LodePNGColorType colortype, unsigned bitdepth)
+{
+  std::vector<unsigned char> png; /*raw contents of the file*/
+  const unsigned loaderror = load_file(png, filename);
+  if(loaderror != 0) return loaderror; /*nothing is decoded when the file could not be loaded*/
+  return decode(out, w, h, png, colortype, bitdepth);
+}
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+unsigned encode(std::vector<unsigned char>& out, const unsigned char* in, unsigned w, unsigned h,
+                LodePNGColorType colortype, unsigned bitdepth)
+{
+  unsigned char* png; /*assigned by lodepng_encode_memory*/
+  size_t pngsize;
+  const unsigned error = lodepng_encode_memory(&png, &pngsize, in, w, h, colortype, bitdepth);
+  if(png != 0)
+  {
+    out.insert(out.end(), png, png + pngsize); /*appended to out, also when error is set*/
+    lodepng_free(png);
+  }
+  return error;
+}
+
+unsigned encode(std::vector<unsigned char>& out,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                LodePNGColorType colortype, unsigned bitdepth)
+{ /*vector overload: returns 84 when in holds fewer bytes than lodepng_get_raw_size_lct reports, else forwards*/
+  if(lodepng_get_raw_size_lct(w, h, colortype, bitdepth) > in.size()) return 84;
+  return encode(out, in.empty() ? 0 : &in[0], w, h, colortype, bitdepth);
+}
+
+unsigned encode(std::vector<unsigned char>& out,
+                const unsigned char* in, unsigned w, unsigned h,
+                State& state)
+{
+  unsigned char* png; /*assigned by lodepng_encode*/
+  size_t pngsize;
+  const unsigned error = lodepng_encode(&png, &pngsize, in, w, h, &state);
+  if(png != 0)
+  {
+    out.insert(out.end(), png, png + pngsize); /*appended to out, also when error is set*/
+    lodepng_free(png);
+  }
+  return error;
+}
+
+unsigned encode(std::vector<unsigned char>& out,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                State& state)
+{ /*vector overload: returns 84 when in holds fewer bytes than lodepng_get_raw_size reports, else forwards*/
+  if(lodepng_get_raw_size(w, h, &state.info_raw) > in.size()) return 84;
+  return encode(out, in.empty() ? 0 : &in[0], w, h, state);
+}
+
+#ifdef LODEPNG_COMPILE_DISK
+unsigned encode(const std::string& filename,
+                const unsigned char* in, unsigned w, unsigned h,
+                LodePNGColorType colortype, unsigned bitdepth)
+{
+  std::vector<unsigned char> png; /*encoded image, kept in memory until it is saved*/
+  const unsigned encodeerror = encode(png, in, w, h, colortype, bitdepth);
+  if(encodeerror != 0) return encodeerror; /*nothing is saved when encoding failed*/
+  return save_file(png, filename);
+}
+
+unsigned encode(const std::string& filename,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                LodePNGColorType colortype, unsigned bitdepth)
+{ /*vector overload: returns 84 when in holds fewer bytes than lodepng_get_raw_size_lct reports, else forwards*/
+  if(lodepng_get_raw_size_lct(w, h, colortype, bitdepth) > in.size()) return 84;
+  return encode(filename, in.empty() ? 0 : &in[0], w, h, colortype, bitdepth);
+}
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_PNG */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
diff --git a/3rdparty/bimg/3rdparty/lodepng/lodepng.h b/3rdparty/bimg/3rdparty/lodepng/lodepng.h
new file mode 100644
index 0000000..94e8195
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/lodepng/lodepng.h
@@ -0,0 +1,1759 @@
+/*
+LodePNG version 20160501
+
+Copyright (c) 2005-2016 Lode Vandevenne
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+    1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would be
+    appreciated but is not required.
+
+    2. Altered source versions must be plainly marked as such, and must not be
+    misrepresented as being the original software.
+
+    3. This notice may not be removed or altered from any source
+    distribution.
+*/
+
+#ifndef LODEPNG_H
+#define LODEPNG_H
+
+#include <string.h> /*for size_t*/
+
+extern const char* LODEPNG_VERSION_STRING;
+
+/*
+The following #defines are used to create code sections. They can be disabled
+to disable code sections, which can give faster compile time and smaller binary.
+The "NO_COMPILE" defines are designed to be used to pass as defines to the
+compiler command to disable them without modifying this header, e.g.
+-DLODEPNG_NO_COMPILE_ZLIB for gcc.
+In addition to those below, you can also define LODEPNG_NO_COMPILE_CRC to
+allow implementing a custom lodepng_crc32.
+*/
+/*deflate & zlib. If disabled, you must specify alternative zlib functions in
+the custom_zlib field of the compress and decompress settings*/
+#ifndef LODEPNG_NO_COMPILE_ZLIB
+#define LODEPNG_COMPILE_ZLIB
+#endif
+/*png encoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_PNG
+#define LODEPNG_COMPILE_PNG
+#endif
+/*deflate&zlib decoder and png decoder*/
+#ifndef LODEPNG_NO_COMPILE_DECODER
+#define LODEPNG_COMPILE_DECODER
+#endif
+/*deflate&zlib encoder and png encoder*/
+#ifndef LODEPNG_NO_COMPILE_ENCODER
+#define LODEPNG_COMPILE_ENCODER
+#endif
+/*the optional built in harddisk file loading and saving functions*/
+#ifndef LODEPNG_NO_COMPILE_DISK
+#define LODEPNG_COMPILE_DISK
+#endif
+/*support for chunks other than IHDR, IDAT, PLTE, tRNS, IEND: ancillary and unknown chunks*/
+#ifndef LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
+#define LODEPNG_COMPILE_ANCILLARY_CHUNKS
+#endif
+/*ability to convert error numerical codes to English text string*/
+#ifndef LODEPNG_NO_COMPILE_ERROR_TEXT
+#define LODEPNG_COMPILE_ERROR_TEXT
+#endif
+/*Compile the default allocators (C's free, malloc and realloc). If you disable this,
+you can define the functions lodepng_free, lodepng_malloc and lodepng_realloc in your
+source files with custom allocators.*/
+#ifndef LODEPNG_NO_COMPILE_ALLOCATORS
+#define LODEPNG_COMPILE_ALLOCATORS
+#endif
+/*compile the C++ version (you can disable the C++ wrapper here even when compiling for C++)*/
+#ifdef __cplusplus
+#ifndef LODEPNG_NO_COMPILE_CPP
+#define LODEPNG_COMPILE_CPP
+#endif
+#endif
+
+#ifdef LODEPNG_COMPILE_CPP
+#include <vector>
+#include <string>
+#endif /*LODEPNG_COMPILE_CPP*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*The PNG color types (also used for raw).*/
+typedef enum LodePNGColorType
+{
+  LCT_GREY = 0, /*greyscale: 1,2,4,8,16 bit*/
+  LCT_RGB = 2, /*RGB: 8,16 bit*/
+  LCT_PALETTE = 3, /*palette: 1,2,4,8 bit*/
+  LCT_GREY_ALPHA = 4, /*greyscale with alpha: 8,16 bit*/
+  LCT_RGBA = 6 /*RGB with alpha: 8,16 bit*/
+} LodePNGColorType;
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Converts PNG data in memory to raw pixel data.
+out: Output parameter. Pointer to buffer that will contain the raw pixel data.
+     After decoding, its size is w * h * (bytes per pixel) bytes larger than
+     initially. Bytes per pixel depends on colortype and bitdepth.
+     Must be freed after usage with free(*out).
+     Note: for 16-bit per channel colors, uses big endian format like PNG does.
+w: Output parameter. Pointer to width of pixel data.
+h: Output parameter. Pointer to height of pixel data.
+in: Memory buffer with the PNG file.
+insize: size of the in buffer.
+colortype: the desired color type for the raw output image. See explanation on PNG color types.
+bitdepth: the desired bit depth for the raw output image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_decode_memory(unsigned char** out, unsigned* w, unsigned* h,
+                               const unsigned char* in, size_t insize,
+                               LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_memory, but always decodes to 32-bit RGBA raw image*/
+unsigned lodepng_decode32(unsigned char** out, unsigned* w, unsigned* h,
+                          const unsigned char* in, size_t insize);
+
+/*Same as lodepng_decode_memory, but always decodes to 24-bit RGB raw image*/
+unsigned lodepng_decode24(unsigned char** out, unsigned* w, unsigned* h,
+                          const unsigned char* in, size_t insize);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load PNG from disk, from file with given name.
+Same as the other decode functions, but instead takes a filename as input.
+*/
+unsigned lodepng_decode_file(unsigned char** out, unsigned* w, unsigned* h,
+                             const char* filename,
+                             LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_decode_file, but always decodes to 32-bit RGBA raw image.*/
+unsigned lodepng_decode32_file(unsigned char** out, unsigned* w, unsigned* h,
+                               const char* filename);
+
+/*Same as lodepng_decode_file, but always decodes to 24-bit RGB raw image.*/
+unsigned lodepng_decode24_file(unsigned char** out, unsigned* w, unsigned* h,
+                               const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Converts raw pixel data into a PNG image in memory. The colortype and bitdepth
+  of the output PNG image cannot be chosen, they are automatically determined
+  by the colortype, bitdepth and content of the input pixel data.
+  Note: for 16-bit per channel colors, needs big endian format like PNG does.
+out: Output parameter. Pointer to buffer that will contain the PNG image data.
+     Must be freed after usage with free(*out).
+outsize: Output parameter. Pointer to the size in bytes of the out buffer.
+image: The raw pixel data to encode. The size of this buffer should be
+       w * h * (bytes per pixel), bytes per pixel depends on colortype and bitdepth.
+w: width of the raw pixel data in pixels.
+h: height of the raw pixel data in pixels.
+colortype: the color type of the raw input image. See explanation on PNG color types.
+bitdepth: the bit depth of the raw input image. See explanation on PNG color types.
+Return value: LodePNG error code (0 means no error).
+*/
+unsigned lodepng_encode_memory(unsigned char** out, size_t* outsize,
+                               const unsigned char* image, unsigned w, unsigned h,
+                               LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_memory, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32(unsigned char** out, size_t* outsize,
+                          const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_memory, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24(unsigned char** out, size_t* outsize,
+                          const unsigned char* image, unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts raw pixel data into a PNG file on disk.
+Same as the other encode functions, but instead takes a filename as output.
+NOTE: This overwrites existing files without warning!
+*/
+unsigned lodepng_encode_file(const char* filename,
+                             const unsigned char* image, unsigned w, unsigned h,
+                             LodePNGColorType colortype, unsigned bitdepth);
+
+/*Same as lodepng_encode_file, but always encodes from 32-bit RGBA raw image.*/
+unsigned lodepng_encode32_file(const char* filename,
+                               const unsigned char* image, unsigned w, unsigned h);
+
+/*Same as lodepng_encode_file, but always encodes from 24-bit RGB raw image.*/
+unsigned lodepng_encode24_file(const char* filename,
+                               const unsigned char* image, unsigned w, unsigned h);
+#endif /*LODEPNG_COMPILE_DISK*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#ifdef LODEPNG_COMPILE_CPP
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_DECODER
+/*Same as lodepng_decode_memory, but decodes to an std::vector. The colortype
+is the format to output the pixels to. Default is RGBA 8-bit per channel.*/
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                const unsigned char* in, size_t insize,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                const std::vector<unsigned char>& in,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts PNG file from disk to raw pixel data in memory.
+Same as the other decode functions, but instead takes a filename as input.
+*/
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                const std::string& filename,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Same as lodepng_encode_memory, but encodes to an std::vector. colortype
+is that of the raw input data. The output PNG color type will be auto chosen.*/
+unsigned encode(std::vector<unsigned char>& out,
+                const unsigned char* in, unsigned w, unsigned h,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+unsigned encode(std::vector<unsigned char>& out,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Converts 32-bit RGBA raw pixel data into a PNG file on disk.
+Same as the other encode functions, but instead takes a filename as output.
+NOTE: This overwrites existing files without warning!
+*/
+unsigned encode(const std::string& filename,
+                const unsigned char* in, unsigned w, unsigned h,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+unsigned encode(const std::string& filename,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                LodePNGColorType colortype = LCT_RGBA, unsigned bitdepth = 8);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_ENCODER */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+#endif /*LODEPNG_COMPILE_PNG*/
+
+#ifdef LODEPNG_COMPILE_ERROR_TEXT
+/*Returns an English description of the numerical error code.*/
+const char* lodepng_error_text(unsigned code);
+#endif /*LODEPNG_COMPILE_ERROR_TEXT*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Settings for zlib decompression*/
+typedef struct LodePNGDecompressSettings LodePNGDecompressSettings;
+struct LodePNGDecompressSettings
+{
+  unsigned ignore_adler32; /*if 1, continue and don't give an error message if the Adler32 checksum is corrupted*/
+
+  /*use custom zlib decoder instead of built in one (default: null)*/
+  unsigned (*custom_zlib)(unsigned char**, size_t*,
+                          const unsigned char*, size_t,
+                          const LodePNGDecompressSettings*);
+  /*use custom inflate decoder instead of built in one (default: null)
+  if custom_zlib is used, custom_inflate is ignored since only the built in
+  zlib function will call custom_inflate*/
+  unsigned (*custom_inflate)(unsigned char**, size_t*,
+                             const unsigned char*, size_t,
+                             const LodePNGDecompressSettings*);
+
+  const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGDecompressSettings lodepng_default_decompress_settings;
+void lodepng_decompress_settings_init(LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Settings for zlib compression. Tweaking these settings tweaks the balance
+between speed and compression ratio.
+*/
+typedef struct LodePNGCompressSettings LodePNGCompressSettings;
+struct LodePNGCompressSettings /*deflate = compress*/
+{
+  /*LZ77 related settings*/
+  unsigned btype; /*the block type for LZ (0, 1, 2 or 3, see zlib standard). Should be 2 for proper compression.*/
+  unsigned use_lz77; /*whether or not to use LZ77. Should be 1 for proper compression.*/
+  unsigned windowsize; /*must be a power of two <= 32768. higher compresses more but is slower. Default value: 2048.*/
+  unsigned minmatch; /*minimum lz77 length. 3 is normally best, 6 can be better for some PNGs. Default: 0*/
+  unsigned nicematch; /*stop searching if >= this length found. Set to 258 for best compression. Default: 128*/
+  unsigned lazymatching; /*use lazy matching: better compression but a bit slower. Default: true*/
+
+  /*use custom zlib encoder instead of built in one (default: null)*/
+  unsigned (*custom_zlib)(unsigned char**, size_t*,
+                          const unsigned char*, size_t,
+                          const LodePNGCompressSettings*);
+  /*use custom deflate encoder instead of built in one (default: null)
+  if custom_zlib is used, custom_deflate is ignored since only the built in
+  zlib function will call custom_deflate*/
+  unsigned (*custom_deflate)(unsigned char**, size_t*,
+                             const unsigned char*, size_t,
+                             const LodePNGCompressSettings*);
+
+  const void* custom_context; /*optional custom settings for custom functions*/
+};
+
+extern const LodePNGCompressSettings lodepng_default_compress_settings;
+void lodepng_compress_settings_init(LodePNGCompressSettings* settings);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_PNG
+/*
+Color mode of an image. Contains all information required to decode the pixel
+bits to RGBA colors. This information is the same as used in the PNG file
+format, and is used both for PNG and raw image data in LodePNG.
+*/
+typedef struct LodePNGColorMode
+{
+  /*header (IHDR)*/
+  LodePNGColorType colortype; /*color type, see PNG standard or documentation further in this header file*/
+  unsigned bitdepth;  /*bits per sample, see PNG standard or documentation further in this header file*/
+
+  /*
+  palette (PLTE and tRNS)
+
+  Dynamically allocated with the colors of the palette, including alpha.
+  When encoding a PNG, to store your colors in the palette of the LodePNGColorMode, first use
+  lodepng_palette_clear, then for each color use lodepng_palette_add.
+  If you encode an image without alpha with palette, don't forget to put value 255 in each A byte of the palette.
+
+  When decoding, by default you can ignore this palette, since LodePNG already
+  fills the palette colors in the pixels of the raw RGBA output.
+
+  The palette is only supported for color type 3.
+  */
+  unsigned char* palette; /*palette in RGBARGBA... order. When allocated, must be either 0, or have size 1024*/
+  size_t palettesize; /*palette size in number of colors (amount of bytes is 4 * palettesize)*/
+
+  /*
+  transparent color key (tRNS)
+
+  This color uses the same bit depth as the bitdepth value in this struct, which can be 1-bit to 16-bit.
+  For greyscale PNGs, r, g and b will all 3 be set to the same.
+
+  When decoding, by default you can ignore this information, since LodePNG sets
+  pixels with this key to transparent already in the raw RGBA output.
+
+  The color key is only supported for color types 0 and 2.
+  */
+  unsigned key_defined; /*is a transparent color key given? 0 = false, 1 = true*/
+  unsigned key_r;       /*red/greyscale component of color key*/
+  unsigned key_g;       /*green component of color key*/
+  unsigned key_b;       /*blue component of color key*/
+} LodePNGColorMode;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_color_mode_init(LodePNGColorMode* info);
+void lodepng_color_mode_cleanup(LodePNGColorMode* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_color_mode_copy(LodePNGColorMode* dest, const LodePNGColorMode* source);
+
+void lodepng_palette_clear(LodePNGColorMode* info);
+/*add 1 color to the palette*/
+unsigned lodepng_palette_add(LodePNGColorMode* info,
+                             unsigned char r, unsigned char g, unsigned char b, unsigned char a);
+
+/*get the total amount of bits per pixel, based on colortype and bitdepth in the struct*/
+unsigned lodepng_get_bpp(const LodePNGColorMode* info);
+/*get the amount of color channels used, based on colortype in the struct.
+If a palette is used, it counts as 1 channel.*/
+unsigned lodepng_get_channels(const LodePNGColorMode* info);
+/*is it a greyscale type? (only colortype 0 or 4)*/
+unsigned lodepng_is_greyscale_type(const LodePNGColorMode* info);
+/*has it got an alpha channel? (only colortype 4 or 6)*/
+unsigned lodepng_is_alpha_type(const LodePNGColorMode* info);
+/*has it got a palette? (only colortype 3)*/
+unsigned lodepng_is_palette_type(const LodePNGColorMode* info);
+/*only returns true if there is a palette and there is a value in the palette with alpha < 255.
+Loops through the palette to check this.*/
+unsigned lodepng_has_palette_alpha(const LodePNGColorMode* info);
+/*
+Check if the given color info indicates the possibility of having non-opaque pixels in the PNG image.
+Returns true if the image can have translucent or invisible pixels (it may still be opaque if it doesn't use such pixels).
+Returns false if the image can only have opaque pixels.
+In detail, it returns true only if it's a color type with alpha, or has a palette with non-opaque values,
+or if "key_defined" is true.
+*/
+unsigned lodepng_can_have_alpha(const LodePNGColorMode* info);
+/*Returns the byte size of a raw image buffer with given width, height and color mode*/
+size_t lodepng_get_raw_size(unsigned w, unsigned h, const LodePNGColorMode* color);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+/*The information of a Time chunk in PNG.*/
+typedef struct LodePNGTime
+{
+  unsigned year;    /*2 bytes used (0-65535)*/
+  unsigned month;   /*1-12*/
+  unsigned day;     /*1-31*/
+  unsigned hour;    /*0-23*/
+  unsigned minute;  /*0-59*/
+  unsigned second;  /*0-60 (to allow for leap seconds)*/
+} LodePNGTime;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*Information about the PNG image, except pixels, width and height.*/
+typedef struct LodePNGInfo
+{
+  /*header (IHDR), palette (PLTE) and transparency (tRNS) chunks*/
+  unsigned compression_method;/*compression method of the original file. Always 0.*/
+  unsigned filter_method;     /*filter method of the original file*/
+  unsigned interlace_method;  /*interlace method of the original file*/
+  LodePNGColorMode color;     /*color type and bits, palette and transparency of the PNG file*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  /*
+  suggested background color chunk (bKGD)
+  This color uses the same color mode as the PNG (except alpha channel), which can be 1-bit to 16-bit.
+
+  For greyscale PNGs, r, g and b will all 3 be set to the same. When encoding
+  the encoder writes the red one. For palette PNGs: When decoding, the RGB value
+  will be stored, not a palette index. But when encoding, specify the index of
+  the palette in background_r, the other two are then ignored.
+
+  The decoder does not use this background color to edit the color of pixels.
+  */
+  unsigned background_defined; /*is a suggested background color given?*/
+  unsigned background_r;       /*red component of suggested background color*/
+  unsigned background_g;       /*green component of suggested background color*/
+  unsigned background_b;       /*blue component of suggested background color*/
+
+  /*
+  non-international text chunks (tEXt and zTXt)
+
+  The char** arrays each contain num strings. The actual messages are in
+  text_strings, while text_keys are keywords that give a short description what
+  the actual text represents, e.g. Title, Author, Description, or anything else.
+
+  A keyword is minimum 1 character and maximum 79 characters long. It's
+  discouraged to use a single line length longer than 79 characters for texts.
+
+  Don't allocate these text buffers yourself. Use the init/cleanup functions
+  correctly and use lodepng_add_text and lodepng_clear_text.
+  */
+  size_t text_num; /*the amount of texts in these char** buffers (there may be more texts in itext)*/
+  char** text_keys; /*the keyword of a text chunk (e.g. "Comment")*/
+  char** text_strings; /*the actual text*/
+
+  /*
+  international text chunks (iTXt)
+  Similar to the non-international text chunks, but with additional strings
+  "langtags" and "transkeys".
+  */
+  size_t itext_num; /*the amount of international texts in this PNG*/
+  char** itext_keys; /*the English keyword of the text chunk (e.g. "Comment")*/
+  char** itext_langtags; /*language tag for this text's language, ISO/IEC 646 string, e.g. ISO 639 language tag*/
+  char** itext_transkeys; /*keyword translated to the international language - UTF-8 string*/
+  char** itext_strings; /*the actual international text - UTF-8 string*/
+
+  /*time chunk (tIME)*/
+  unsigned time_defined; /*set to 1 to make the encoder generate a tIME chunk*/
+  LodePNGTime time;
+
+  /*phys chunk (pHYs)*/
+  unsigned phys_defined; /*if 0, there is no pHYs chunk and the values below are undefined; if 1, there is one*/
+  unsigned phys_x; /*pixels per unit in x direction*/
+  unsigned phys_y; /*pixels per unit in y direction*/
+  unsigned phys_unit; /*may be 0 (unknown unit) or 1 (metre)*/
+
+  /*
+  unknown chunks
+  There are 3 buffers, one for each position in the PNG where unknown chunks can appear
+  each buffer contains all unknown chunks for that position consecutively
+  The 3 buffers are the unknown chunks between certain critical chunks:
+  0: IHDR-PLTE, 1: PLTE-IDAT, 2: IDAT-IEND
+  Do not allocate or traverse this data yourself. Use the chunk traversing functions declared
+  later, such as lodepng_chunk_next and lodepng_chunk_append, to read/write this struct.
+  */
+  unsigned char* unknown_chunks_data[3];
+  size_t unknown_chunks_size[3]; /*size in bytes of the unknown chunks, given for protection*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGInfo;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_info_init(LodePNGInfo* info);
+void lodepng_info_cleanup(LodePNGInfo* info);
+/*return value is error code (0 means no error)*/
+unsigned lodepng_info_copy(LodePNGInfo* dest, const LodePNGInfo* source);
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+void lodepng_clear_text(LodePNGInfo* info); /*use this to clear the texts again after you filled them in*/
+unsigned lodepng_add_text(LodePNGInfo* info, const char* key, const char* str); /*push back both texts at once*/
+
+void lodepng_clear_itext(LodePNGInfo* info); /*use this to clear the itexts again after you filled them in*/
+unsigned lodepng_add_itext(LodePNGInfo* info, const char* key, const char* langtag,
+                           const char* transkey, const char* str); /*push back the 4 texts of 1 chunk at once*/
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+
+/*
+Converts raw buffer from one color type to another color type, based on
+LodePNGColorMode structs to describe the input and output color type.
+See the reference manual at the end of this header file to see which color conversions are supported.
+return value = LodePNG error code (0 if all went ok, an error if the conversion isn't supported)
+The out buffer must have size (w * h * bpp + 7) / 8, where bpp is the bits per pixel
+of the output color type (lodepng_get_bpp).
+For < 8 bpp images, there should not be padding bits at the end of scanlines.
+For 16-bit per channel colors, uses big endian format like PNG does.
+Return value is LodePNG error code
+*/
+unsigned lodepng_convert(unsigned char* out, const unsigned char* in,
+                         const LodePNGColorMode* mode_out, const LodePNGColorMode* mode_in,
+                         unsigned w, unsigned h);
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Settings for the decoder. This contains settings for the PNG and the Zlib
+decoder, but not the Info settings from the Info structs.
+*/
+typedef struct LodePNGDecoderSettings
+{
+  LodePNGDecompressSettings zlibsettings; /*in here is the setting to ignore Adler32 checksums*/
+
+  unsigned ignore_crc; /*ignore CRC checksums*/
+
+  unsigned color_convert; /*whether to convert the PNG to the color type you want. Default: yes*/
+
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  unsigned read_text_chunks; /*if false but remember_unknown_chunks is true, they're stored in the unknown chunks*/
+  /*store all bytes from unknown chunks in the LodePNGInfo (off by default, useful for a png editor)*/
+  unsigned remember_unknown_chunks;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGDecoderSettings;
+
+void lodepng_decoder_settings_init(LodePNGDecoderSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*Strategy the encoder uses to choose the PNG filter type of each scanline (see filter_strategy in LodePNGEncoderSettings)*/
+typedef enum LodePNGFilterStrategy
+{
+  /*every filter at zero*/
+  LFS_ZERO,
+  /*Use filter that gives minimum sum, as described in the official PNG filter heuristic.*/
+  LFS_MINSUM,
+  /*Use the filter type that gives smallest Shannon entropy for this scanline. Depending
+  on the image, this is better or worse than minsum.*/
+  LFS_ENTROPY,
+  /*
+  Brute-force-search PNG filters by compressing each filter for each scanline.
+  Experimental, very slow, and only rarely gives better compression than MINSUM.
+  */
+  LFS_BRUTE_FORCE,
+  /*use predefined_filters buffer: you specify the filter type for each scanline*/
+  LFS_PREDEFINED
+} LodePNGFilterStrategy;
+
+/*Gives characteristics about the colors of the image, which helps decide which color model to use for encoding.
+Used internally by default if "auto_convert" is enabled. Public because it's useful for custom algorithms.*/
+typedef struct LodePNGColorProfile
+{
+  unsigned colored; /*not greyscale*/
+  unsigned key; /*if true, image is not opaque. Only if true and alpha is false, color key is possible.*/
+  unsigned short key_r; /*these values are always in 16-bit bitdepth in the profile*/
+  unsigned short key_g;
+  unsigned short key_b;
+  unsigned alpha; /*alpha channel or alpha palette required*/
+  unsigned numcolors; /*amount of colors, up to 257. Not valid if bits == 16.*/
+  unsigned char palette[1024]; /*Remembers up to the first 256 RGBA colors, in no particular order*/
+  unsigned bits; /*bits per channel (not for palette). 1,2 or 4 for greyscale only. 16 if 16-bit per channel required.*/
+} LodePNGColorProfile;
+
+void lodepng_color_profile_init(LodePNGColorProfile* profile);
+
+/*Get a LodePNGColorProfile of the image.*/
+unsigned lodepng_get_color_profile(LodePNGColorProfile* profile,
+                                   const unsigned char* image, unsigned w, unsigned h,
+                                   const LodePNGColorMode* mode_in);
+/*The function LodePNG uses internally to decide the PNG color with auto_convert.
+Chooses an optimal color model, e.g. grey if only grey pixels, palette if < 256 colors, ...*/
+unsigned lodepng_auto_choose_color(LodePNGColorMode* mode_out,
+                                   const unsigned char* image, unsigned w, unsigned h,
+                                   const LodePNGColorMode* mode_in);
+
+/*Settings for the encoder.*/
+typedef struct LodePNGEncoderSettings
+{
+  LodePNGCompressSettings zlibsettings; /*settings for the zlib encoder, such as window size, ...*/
+
+  unsigned auto_convert; /*automatically choose output PNG color type. Default: true*/
+
+  /*If true, follows the official PNG heuristic: if the PNG uses a palette or lower than
+  8 bit depth, set all filters to zero. Otherwise use the filter_strategy. Note that to
+  completely follow the official PNG heuristic, filter_palette_zero must be true and
+  filter_strategy must be LFS_MINSUM*/
+  unsigned filter_palette_zero;
+  /*Which filter strategy to use when not using zeroes due to filter_palette_zero.
+  Set filter_palette_zero to 0 to ensure always using your chosen strategy. Default: LFS_MINSUM*/
+  LodePNGFilterStrategy filter_strategy;
+  /*used if filter_strategy is LFS_PREDEFINED. In that case, this must point to a buffer with
+  the same length as the amount of scanlines in the image, and each value must <= 5. You
+  have to cleanup this buffer, LodePNG will never free it. Don't forget that filter_palette_zero
+  must be set to 0 to ensure this is also used on palette or low bitdepth images.*/
+  const unsigned char* predefined_filters;
+
+  /*force creating a PLTE chunk if colortype is 2 or 6 (= a suggested palette).
+  If colortype is 3, PLTE is _always_ created.*/
+  unsigned force_palette;
+#ifdef LODEPNG_COMPILE_ANCILLARY_CHUNKS
+  /*add LodePNG identifier and version as a text chunk, for debugging*/
+  unsigned add_id;
+  /*encode text chunks as zTXt chunks instead of tEXt chunks, and use compression in iTXt chunks*/
+  unsigned text_compression;
+#endif /*LODEPNG_COMPILE_ANCILLARY_CHUNKS*/
+} LodePNGEncoderSettings;
+
+void lodepng_encoder_settings_init(LodePNGEncoderSettings* settings);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+
+#if defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER)
+/*The settings, state and information for extended encoding and decoding.*/
+typedef struct LodePNGState
+{
+#ifdef LODEPNG_COMPILE_DECODER
+  LodePNGDecoderSettings decoder; /*the decoding settings*/
+#endif /*LODEPNG_COMPILE_DECODER*/
+#ifdef LODEPNG_COMPILE_ENCODER
+  LodePNGEncoderSettings encoder; /*the encoding settings*/
+#endif /*LODEPNG_COMPILE_ENCODER*/
+  LodePNGColorMode info_raw; /*specifies the format in which you would like to get the raw pixel buffer*/
+  LodePNGInfo info_png; /*info of the PNG image obtained after decoding*/
+  unsigned error;
+#ifdef LODEPNG_COMPILE_CPP
+  /* For the lodepng::State subclass. */
+  virtual ~LodePNGState(){}
+#endif
+} LodePNGState;
+
+/*init, cleanup and copy functions to use with this struct*/
+void lodepng_state_init(LodePNGState* state);
+void lodepng_state_cleanup(LodePNGState* state);
+void lodepng_state_copy(LodePNGState* dest, const LodePNGState* source);
+#endif /* defined(LODEPNG_COMPILE_DECODER) || defined(LODEPNG_COMPILE_ENCODER) */
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*
+Same as lodepng_decode_memory, but uses a LodePNGState to allow custom settings and
+getting much more information about the PNG image and color mode.
+*/
+unsigned lodepng_decode(unsigned char** out, unsigned* w, unsigned* h,
+                        LodePNGState* state,
+                        const unsigned char* in, size_t insize);
+
+/*
+Read the PNG header, but not the actual data. This returns only the information
+that is in the header chunk of the PNG, such as width, height and color type. The
+information is placed in the info_png field of the LodePNGState.
+*/
+unsigned lodepng_inspect(unsigned* w, unsigned* h,
+                         LodePNGState* state,
+                         const unsigned char* in, size_t insize);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*This function allocates the out buffer with standard malloc and stores the size in *outsize.*/
+unsigned lodepng_encode(unsigned char** out, size_t* outsize,
+                        const unsigned char* image, unsigned w, unsigned h,
+                        LodePNGState* state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+/*
+The lodepng_chunk functions are normally not needed, except to traverse the
+unknown chunks stored in the LodePNGInfo struct, or add new ones to it.
+It also allows traversing the chunks of an encoded PNG file yourself.
+
+PNG standard chunk naming conventions:
+First byte: uppercase = critical, lowercase = ancillary
+Second byte: uppercase = public, lowercase = private
+Third byte: must be uppercase
+Fourth byte: uppercase = unsafe to copy, lowercase = safe to copy
+*/
+
+/*
+Gets the length of the data of the chunk. Total chunk length has 12 bytes more.
+There must be at least 4 bytes to read from. If the result value is too large,
+it may be corrupt data.
+*/
+unsigned lodepng_chunk_length(const unsigned char* chunk);
+
+/*puts the 4-byte type in null terminated string*/
+void lodepng_chunk_type(char type[5], const unsigned char* chunk);
+
+/*check if the type is the given type*/
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type);
+
+/*0: it's one of the critical chunk types, 1: it's an ancillary chunk (see PNG standard)*/
+unsigned char lodepng_chunk_ancillary(const unsigned char* chunk);
+
+/*0: public, 1: private (see PNG standard)*/
+unsigned char lodepng_chunk_private(const unsigned char* chunk);
+
+/*0: the chunk is unsafe to copy, 1: the chunk is safe to copy (see PNG standard)*/
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk);
+
+/*get pointer to the data of the chunk, where the input points to the header of the chunk*/
+unsigned char* lodepng_chunk_data(unsigned char* chunk);
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk);
+
+/*returns 0 if the crc is correct, 1 if it's incorrect (0 for OK as usual!)*/
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk);
+
+/*generates the correct CRC from the data and puts it in the last 4 bytes of the chunk*/
+void lodepng_chunk_generate_crc(unsigned char* chunk);
+
+/*iterate to next chunks. don't use on IEND chunk, as there is no next chunk then*/
+unsigned char* lodepng_chunk_next(unsigned char* chunk);
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk);
+
+/*
+Appends chunk to the data in out. The given chunk should already have its chunk header.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk);
+
+/*
+Appends new chunk to out. The chunk to append is given by giving its length, type
+and data separately. The type is a 4-letter string.
+The out variable and outlength are updated to reflect the new reallocated buffer.
+Returns error code (0 if it went ok)
+*/
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+                              const char* type, const unsigned char* data);
+
+
+/*Calculate CRC32 of buffer*/
+unsigned lodepng_crc32(const unsigned char* buf, size_t len);
+#endif /*LODEPNG_COMPILE_PNG*/
+
+
+#ifdef LODEPNG_COMPILE_ZLIB
+/*
+This zlib part can be used independently to zlib compress and decompress a
+buffer. It cannot be used to create gzip files however, and it only supports the
+part of zlib that is required for PNG, it does not support dictionaries.
+*/
+
+#ifdef LODEPNG_COMPILE_DECODER
+/*Inflate a buffer. Inflate is the decompression step of deflate. Out buffer must be freed after use.*/
+unsigned lodepng_inflate(unsigned char** out, size_t* outsize,
+                         const unsigned char* in, size_t insize,
+                         const LodePNGDecompressSettings* settings);
+
+/*
+Decompresses Zlib data. Reallocates the out buffer and appends the data. The
+data must be according to the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_decompress(unsigned char** out, size_t* outsize,
+                                 const unsigned char* in, size_t insize,
+                                 const LodePNGDecompressSettings* settings);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/*
+Compresses data with Zlib. Reallocates the out buffer and appends the data.
+Zlib adds a small header and trailer around the deflate data.
+The data is output in the format of the zlib specification.
+Either, *out must be NULL and *outsize must be 0, or, *out must be a valid
+buffer and *outsize its size in bytes. out must be freed by user after usage.
+*/
+unsigned lodepng_zlib_compress(unsigned char** out, size_t* outsize,
+                               const unsigned char* in, size_t insize,
+                               const LodePNGCompressSettings* settings);
+
+/*
+Find length-limited Huffman code for given frequencies. This function is in the
+public interface only for tests, it's used internally by lodepng_deflate.
+*/
+unsigned lodepng_huffman_code_lengths(unsigned* lengths, const unsigned* frequencies,
+                                      size_t numcodes, unsigned maxbitlen);
+
+/*Compress a buffer with deflate. See RFC 1951. Out buffer must be freed after use.*/
+unsigned lodepng_deflate(unsigned char** out, size_t* outsize,
+                         const unsigned char* in, size_t insize,
+                         const LodePNGCompressSettings* settings);
+
+#endif /*LODEPNG_COMPILE_ENCODER*/
+#endif /*LODEPNG_COMPILE_ZLIB*/
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load a file from disk into buffer. The function allocates the out buffer, and
+after usage you should free it.
+out: output parameter, contains pointer to loaded buffer.
+outsize: output parameter, size of the allocated out buffer
+filename: the path to the file to load
+return value: error code (0 means ok)
+*/
+unsigned lodepng_load_file(unsigned char** out, size_t* outsize, const char* filename);
+
+/*
+Save a file from buffer to disk. Warning, if it exists, this function overwrites
+the file without warning!
+buffer: the buffer to write
+buffersize: size of the buffer to write
+filename: the path to the file to save to
+return value: error code (0 means ok)
+*/
+unsigned lodepng_save_file(const unsigned char* buffer, size_t buffersize, const char* filename);
+#endif /*LODEPNG_COMPILE_DISK*/
+
+#ifdef LODEPNG_COMPILE_CPP
+/* The LodePNG C++ wrapper uses std::vectors instead of manually allocated memory buffers. */
+namespace lodepng
+{
+#ifdef LODEPNG_COMPILE_PNG
+class State : public LodePNGState /* a LodePNGState with constructor, copy operations and destructor */
+{
+  public:
+    State();
+    State(const State& other);
+    virtual ~State();
+    State& operator=(const State& other);
+};
+
+#ifdef LODEPNG_COMPILE_DECODER
+/* Same as other lodepng::decode, but using a State for more settings and information. */
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                State& state,
+                const unsigned char* in, size_t insize);
+unsigned decode(std::vector<unsigned char>& out, unsigned& w, unsigned& h,
+                State& state,
+                const std::vector<unsigned char>& in);
+#endif /*LODEPNG_COMPILE_DECODER*/
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/* Same as other lodepng::encode, but using a State for more settings and information. */
+unsigned encode(std::vector<unsigned char>& out,
+                const unsigned char* in, unsigned w, unsigned h,
+                State& state);
+unsigned encode(std::vector<unsigned char>& out,
+                const std::vector<unsigned char>& in, unsigned w, unsigned h,
+                State& state);
+#endif /*LODEPNG_COMPILE_ENCODER*/
+
+#ifdef LODEPNG_COMPILE_DISK
+/*
+Load a file from disk into an std::vector.
+return value: error code (0 means ok)
+*/
+unsigned load_file(std::vector<unsigned char>& buffer, const std::string& filename);
+
+/*
+Save the binary data in an std::vector to a file on disk. The file is overwritten
+without warning.
+*/
+unsigned save_file(const std::vector<unsigned char>& buffer, const std::string& filename);
+#endif /* LODEPNG_COMPILE_DISK */
+#endif /* LODEPNG_COMPILE_PNG */
+
+#ifdef LODEPNG_COMPILE_ZLIB
+#ifdef LODEPNG_COMPILE_DECODER
+/* Zlib-decompress an unsigned char buffer */
+unsigned decompress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+                    const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+
+/* Zlib-decompress an std::vector */
+unsigned decompress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+                    const LodePNGDecompressSettings& settings = lodepng_default_decompress_settings);
+#endif /* LODEPNG_COMPILE_DECODER */
+
+#ifdef LODEPNG_COMPILE_ENCODER
+/* Zlib-compress an unsigned char buffer */
+unsigned compress(std::vector<unsigned char>& out, const unsigned char* in, size_t insize,
+                  const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+
+/* Zlib-compress an std::vector */
+unsigned compress(std::vector<unsigned char>& out, const std::vector<unsigned char>& in,
+                  const LodePNGCompressSettings& settings = lodepng_default_compress_settings);
+#endif /* LODEPNG_COMPILE_ENCODER */
+#endif /* LODEPNG_COMPILE_ZLIB */
+} /* namespace lodepng */
+#endif /*LODEPNG_COMPILE_CPP*/
+
+/*
+TODO:
+[.] test if there are no memory leaks or security exploits - done a lot but needs to be checked often
+[.] check compatibility with various compilers  - done but needs to be redone for every newer version
+[X] converting color to 16-bit per channel types
+[ ] read all public PNG chunk types (but never let the color profile and gamma ones touch RGB values)
+[ ] make sure encoder generates no chunks with size > (2^31)-1
+[ ] partial decoding (stream processing)
+[X] let the "isFullyOpaque" function check color keys and transparent palettes too
+[X] better name for the variables "codes", "codesD", "codelengthcodes", "clcl" and "lldl"
+[ ] don't stop decoding on errors like 69, 57, 58 (make warnings)
+[ ] let the C++ wrapper catch exceptions coming from the standard library and return LodePNG error codes
+[ ] allow user to provide custom color conversion functions, e.g. for premultiplied alpha, padding bits or not, ...
+[ ] allow user to give data (void*) to custom allocator
+*/
+
+#endif /*LODEPNG_H inclusion guard*/
+
+/*
+LodePNG Documentation
+---------------------
+
+0. table of contents
+--------------------
+
+  1. about
+   1.1. supported features
+   1.2. features not supported
+  2. C and C++ version
+  3. security
+  4. decoding
+  5. encoding
+  6. color conversions
+    6.1. PNG color types
+    6.2. color conversions
+    6.3. padding bits
+    6.4. A note about 16-bits per channel and endianness
+  7. error values
+  8. chunks and PNG editing
+  9. compiler support
+  10. examples
+   10.1. decoder C++ example
+   10.2. decoder C example
+  11. state settings reference
+  12. changes
+  13. contact information
+
+
+1. about
+--------
+
+PNG is a file format to store raster images losslessly with good compression,
+supporting different color types and alpha channel.
+
+LodePNG is a PNG codec according to the Portable Network Graphics (PNG)
+Specification (Second Edition) - W3C Recommendation 10 November 2003.
+
+The specifications used are:
+
+*) Portable Network Graphics (PNG) Specification (Second Edition):
+     http://www.w3.org/TR/2003/REC-PNG-20031110
+*) RFC 1950 ZLIB Compressed Data Format version 3.3:
+     http://www.gzip.org/zlib/rfc-zlib.html
+*) RFC 1951 DEFLATE Compressed Data Format Specification ver 1.3:
+     http://www.gzip.org/zlib/rfc-deflate.html
+
+The most recent version of LodePNG can currently be found at
+http://lodev.org/lodepng/
+
+LodePNG works both in C (ISO C90) and C++, with a C++ wrapper that adds
+extra functionality.
+
+LodePNG consists of two files:
+-lodepng.h: the header file for both C and C++
+-lodepng.c(pp): give it the name lodepng.c or lodepng.cpp (or .cc) depending on your usage
+
+If you want to start using LodePNG right away without reading this doc, get the
+examples from the LodePNG website to see how to use it in code, or check the
+smaller examples in chapter 10 here.
+
+LodePNG is simple but only supports the basic requirements. To achieve
+simplicity, the following design choices were made: There are no dependencies
+on any external library. There are functions to decode and encode a PNG with
+a single function call, and extended versions of these functions taking a
+LodePNGState struct allowing to specify or get more information. By default
+the colors of the raw image are always RGB or RGBA, no matter what color type
+the PNG file uses. To read and write files, there are simple functions to
+convert the files to/from buffers in memory.
+
+This all makes LodePNG suitable for loading textures in games, demos and small
+programs, ... It's less suitable for full fledged image editors, loading PNGs
+over network (it requires all the image data to be available before decoding can
+begin), life-critical systems, ...
+
+1.1. supported features
+-----------------------
+
+The following features are supported:
+
+*) decoding of PNGs with any color type, bit depth and interlace mode, to a 24- or 32-bit color raw image,
+   or the same color type as the PNG
+*) encoding of PNGs, from any raw image to 24- or 32-bit color, or the same color type as the raw image
+*) Adam7 interlace and deinterlace for any color type
+*) loading the image from harddisk or decoding it from a buffer from other sources than harddisk
+*) support for alpha channels, including RGBA color model, translucent palettes and color keying
+*) zlib decompression (inflate)
+*) zlib compression (deflate)
+*) CRC32 and ADLER32 checksums
+*) handling of unknown chunks, allowing making a PNG editor that stores custom and unknown chunks.
+*) the following chunks are supported (generated/interpreted) by both encoder and decoder:
+    IHDR: header information
+    PLTE: color palette
+    IDAT: pixel data
+    IEND: the final chunk
+    tRNS: transparency for palettized images
+    tEXt: textual information
+    zTXt: compressed textual information
+    iTXt: international textual information
+    bKGD: suggested background color
+    pHYs: physical dimensions
+    tIME: modification time
+
+1.2. features not supported
+---------------------------
+
+The following features are _not_ supported:
+
+*) some features needed to make a conformant PNG-Editor might be still missing.
+*) partial loading/stream processing. All data must be available and is processed in one call.
+*) The following public chunks are not supported but treated as unknown chunks by LodePNG
+    cHRM, gAMA, iCCP, sRGB, sBIT, hIST, sPLT
+   Some of these are not supported on purpose: LodePNG wants to provide the RGB values
+   stored in the pixels, not values modified by system dependent gamma or color models.
+
+
+2. C and C++ version
+--------------------
+
+The C version uses buffers allocated with malloc that you need to free()
+yourself. You need to use init and cleanup functions for each struct whenever
+using a struct from the C version to avoid exploits and memory leaks.
+
+The C++ version has extra functions with std::vectors in the interface and the
+lodepng::State class which is a LodePNGState with constructor and destructor.
+
+These files work without modification for both C and C++ compilers because all
+the additional C++ code is in "#ifdef __cplusplus" blocks that make C-compilers
+ignore it, and the C code is made to compile both with strict ISO C90 and C++.
+
+To use the C++ version, you need to rename the source file to lodepng.cpp
+(instead of lodepng.c), and compile it with a C++ compiler.
+
+To use the C version, you need to rename the source file to lodepng.c (instead
+of lodepng.cpp), and compile it with a C compiler.
+
+
+3. Security
+-----------
+
+Even if carefully designed, it's always possible that LodePNG contains possible
+exploits. If you discover one, please let me know, and it will be fixed.
+
+When using LodePNG, care has to be taken with the C version of LodePNG, as well
+as the C-style structs when working with C++. The following conventions are used
+for all C-style structs:
+
+-if a struct has a corresponding init function, always call the init function when making a new one
+-if a struct has a corresponding cleanup function, call it before the struct disappears to avoid memory leaks
+-if a struct has a corresponding copy function, use the copy function instead of "=".
+ The destination must also be inited already.
+
+
+4. Decoding
+-----------
+
+Decoding converts a PNG compressed image to a raw pixel buffer.
+
+Most documentation on using the decoder is at its declarations in the header
+above. For C, simple decoding can be done with functions such as
+lodepng_decode32, and more advanced decoding can be done with the struct
+LodePNGState and lodepng_decode. For C++, all decoding can be done with the
+various lodepng::decode functions, and lodepng::State can be used for advanced
+features.
+
+When using the LodePNGState, it uses the following fields for decoding:
+*) LodePNGInfo info_png: it stores extra information about the PNG (the input) in here
+*) LodePNGColorMode info_raw: here you can say what color mode of the raw image (the output) you want to get
+*) LodePNGDecoderSettings decoder: you can specify a few extra settings for the decoder to use
+
+LodePNGInfo info_png
+--------------------
+
+After decoding, this contains extra information of the PNG image, except the actual
+pixels, width and height because these are already gotten directly from the decoder
+functions.
+
+It contains for example the original color type of the PNG image, text comments,
+suggested background color, etc... More details about the LodePNGInfo struct are
+at its declaration documentation.
+
+LodePNGColorMode info_raw
+-------------------------
+
+When decoding, here you can specify which color type you want
+the resulting raw image to be. If this is different from the colortype of the
+PNG, then the decoder will automatically convert the result. This conversion
+always works, except if you want it to convert a color PNG to greyscale or to
+a palette with missing colors.
+
+By default, 32-bit color is used for the result.
+
+LodePNGDecoderSettings decoder
+------------------------------
+
+The settings can be used to ignore the errors created by invalid CRC and Adler32
+chunks, and to disable the decoding of tEXt chunks.
+
+There's also a setting color_convert, true by default. If false, no conversion
+is done, the resulting data will be as it was in the PNG (after decompression)
+and you'll have to puzzle the colors of the pixels together yourself using the
+color type information in the LodePNGInfo.
+
+
+5. Encoding
+-----------
+
+Encoding converts a raw pixel buffer to a PNG compressed image.
+
+Most documentation on using the encoder is at its declarations in the header
+above. For C, simple encoding can be done with functions such as
+lodepng_encode32, and more advanced encoding can be done with the struct
+LodePNGState and lodepng_encode. For C++, all encoding can be done with the
+various lodepng::encode functions, and lodepng::State can be used for advanced
+features.
+
+Like the decoder, the encoder can also give errors. However it gives fewer errors
+since the encoder input is trusted, the decoder input (a PNG image that could
+be forged by anyone) is not trusted.
+
+When using the LodePNGState, it uses the following fields for encoding:
+*) LodePNGInfo info_png: here you specify how you want the PNG (the output) to be.
+*) LodePNGColorMode info_raw: here you say what color type the raw image (the input) has
+*) LodePNGEncoderSettings encoder: you can specify a few settings for the encoder to use
+
+LodePNGInfo info_png
+--------------------
+
+When encoding, you use this the opposite way as when decoding: for encoding,
+you fill in the values you want the PNG to have before encoding. By default it's
+not needed to specify a color type for the PNG since it's automatically chosen,
+but it's possible to choose it yourself given the right settings.
+
+The encoder will not always exactly match the LodePNGInfo struct you give,
+it tries as close as possible. Some things are ignored by the encoder. The
+encoder uses, for example, the following settings from it when applicable:
+colortype and bitdepth, text chunks, time chunk, the color key, the palette, the
+background color, the interlace method, unknown chunks, ...
+
+When encoding to a PNG with colortype 3, the encoder will generate a PLTE chunk.
+If the palette contains any colors for which the alpha channel is not 255 (so
+there are translucent colors in the palette), it'll add a tRNS chunk.
+
+LodePNGColorMode info_raw
+-------------------------
+
+You specify the color type of the raw image that you give to the input here,
+including a possible transparent color key and palette you happen to be using in
+your raw image data.
+
+By default, 32-bit color is assumed, meaning your input has to be in RGBA
+format with 4 bytes (unsigned chars) per pixel.
+
+LodePNGEncoderSettings encoder
+------------------------------
+
+The following settings are supported (some are in sub-structs):
+*) auto_convert: when this option is enabled, the encoder will
+automatically choose the smallest possible color mode (including color key) that
+can encode the colors of all pixels without information loss.
+*) btype: the block type for LZ77. 0 = uncompressed, 1 = fixed huffman tree,
+   2 = dynamic huffman tree (best compression). Should be 2 for proper
+   compression.
+*) use_lz77: whether or not to use LZ77 for compressed block types. Should be
+   true for proper compression.
+*) windowsize: the window size used by the LZ77 encoder (1 - 32768). Has value
+   2048 by default, but can be set to 32768 for better, but slow, compression.
+*) force_palette: if colortype is 2 or 6, you can make the encoder write a PLTE
+   chunk if force_palette is true. This can be used as a suggested palette to convert
+   to by viewers that don't support more than 256 colors (if those still exist)
+*) add_id: add text chunk "Encoder: LodePNG <version>" to the image.
+*) text_compression: default 1. If 1, it'll store texts as zTXt instead of tEXt chunks.
+  zTXt chunks use zlib compression on the text. This gives a smaller result on
+  large texts but a larger result on small texts (such as a single program name).
+  It's all tEXt or all zTXt though, there's no separate setting per text yet.
+
+
+6. color conversions
+--------------------
+
+An important thing to note about LodePNG, is that the color type of the PNG, and
+the color type of the raw image, are completely independent. By default, when
+you decode a PNG, you get the result as a raw image in the color type you want,
+no matter whether the PNG was encoded with a palette, greyscale or RGBA color.
+And if you encode an image, by default LodePNG will automatically choose the PNG
+color type that gives good compression based on the values of colors and amount
+of colors in the image. It can be configured to let you control it instead as
+well, though.
+
+To be able to do this, LodePNG does conversions from one color mode to another.
+It can convert from almost any color type to any other color type, except the
+following conversions: RGB to greyscale is not supported, and converting to a
+palette when the palette doesn't have a required color is not supported. This is
+not supported on purpose: this is information loss which requires a color
+reduction algorithm that is beyond the scope of a PNG encoder (yes, RGB to grey
+is easy, but there are multiple ways if you want to give some channels more
+weight).
+
+By default, when decoding, you get the raw image in 32-bit RGBA or 24-bit RGB
+color, no matter what color type the PNG has. And by default when encoding,
+LodePNG automatically picks the best color model for the output PNG, and expects
+the input image to be 32-bit RGBA or 24-bit RGB. So, unless you want to control
+the color format of the images yourself, you can skip this chapter.
+
+6.1. PNG color types
+--------------------
+
+A PNG image can have many color types, ranging from 1-bit color to 64-bit color,
+as well as palettized color modes. After the zlib decompression and unfiltering
+in the PNG image is done, the raw pixel data will have that color type and thus
+a certain amount of bits per pixel. If you want the output raw image after
+decoding to have another color type, a conversion is done by LodePNG.
+
+The PNG specification gives the following color types:
+
+0: greyscale, bit depths 1, 2, 4, 8, 16
+2: RGB, bit depths 8 and 16
+3: palette, bit depths 1, 2, 4 and 8
+4: greyscale with alpha, bit depths 8 and 16
+6: RGBA, bit depths 8 and 16
+
+Bit depth is the amount of bits per pixel per color channel. So the total amount
+of bits per pixel is: amount of channels * bitdepth.
+
+6.2. color conversions
+----------------------
+
+As explained in the sections about the encoder and decoder, you can specify
+color types and bit depths in info_png and info_raw to change the default
+behaviour.
+
+If, when decoding, you want the raw image to be something else than the default,
+you need to set the color type and bit depth you want in the LodePNGColorMode,
+or the parameters colortype and bitdepth of the simple decoding function.
+
+If, when encoding, you use another color type than the default in the raw input
+image, you need to specify its color type and bit depth in the LodePNGColorMode
+of the raw image, or use the parameters colortype and bitdepth of the simple
+encoding function.
+
+If, when encoding, you don't want LodePNG to choose the output PNG color type
+but control it yourself, you need to set auto_convert in the encoder settings
+to false, and specify the color type you want in the LodePNGInfo of the
+encoder (including palette: it can generate a palette if auto_convert is true,
+otherwise not).
+
+If the input and output color type differ (whether user chosen or auto chosen),
+LodePNG will do a color conversion, which follows the rules below, and may
+sometimes result in an error.
+
+To avoid some confusion:
+-the decoder converts from PNG to raw image
+-the encoder converts from raw image to PNG
+-the colortype and bitdepth in LodePNGColorMode info_raw, are those of the raw image
+-the colortype and bitdepth in the color field of LodePNGInfo info_png, are those of the PNG
+-when encoding, the color type in LodePNGInfo is ignored if auto_convert
+ is enabled, it is automatically generated instead
+-when decoding, the color type in LodePNGInfo is set by the decoder to that of the original
+ PNG image, but it can be ignored since the raw image has the color type you requested instead
+-if the color type of the LodePNGColorMode and PNG image aren't the same, a conversion
+ between the color types is done if the color types are supported. If it is not
+ supported, an error is returned. If the types are the same, no conversion is done.
+-even though some conversions aren't supported, LodePNG supports loading PNGs from any
+ colortype and saving PNGs to any colortype, sometimes it just requires preparing
+ the raw image correctly before encoding.
+-both encoder and decoder use the same color converter.
+
+Non supported color conversions:
+-color to greyscale: no error is thrown, but the result will look ugly because
+only the red channel is taken
+-anything to palette when that palette does not have that color in it: in this
+case an error is thrown
+
+Supported color conversions:
+-anything to 8-bit RGB, 8-bit RGBA, 16-bit RGB, 16-bit RGBA
+-any grey or grey+alpha, to grey or grey+alpha
+-anything to a palette, as long as the palette has the requested colors in it
+-removing alpha channel
+-higher to smaller bitdepth, and vice versa
+
+If you want no color conversion to be done (e.g. for speed or control):
+-In the encoder, you can make it save a PNG with any color type by giving the
+raw color mode and LodePNGInfo the same color mode, and setting auto_convert to
+false.
+-In the decoder, you can make it store the pixel data in the same color type
+as the PNG has, by setting the color_convert setting to false. Settings in
+info_raw are then ignored.
+
+The function lodepng_convert does the color conversion. It is available in the
+interface but normally isn't needed since the encoder and decoder already call
+it.
+
+6.3. padding bits
+-----------------
+
+In the PNG file format, if a less than 8-bit per pixel color type is used and the scanlines
+have a bit amount that isn't a multiple of 8, then padding bits are used so that each
+scanline starts at a fresh byte. But that is NOT true for the LodePNG raw input and output.
+The raw input image you give to the encoder, and the raw output image you get from the decoder
+will NOT have these padding bits, e.g. in the case of a 1-bit image with a width
+of 7 pixels, the first pixel of the second scanline will be the 8th bit of the first byte,
+not the first bit of a new byte.
+
+6.4. A note about 16-bits per channel and endianness
+----------------------------------------------------
+
+LodePNG uses unsigned char arrays for 16-bit per channel colors too, just like
+for any other color format. The 16-bit values are stored in big endian (most
+significant byte first) in these arrays. This is the opposite order of the
+little endian used by x86 CPUs.
+
+LodePNG always uses big endian because the PNG file format does so internally.
+Conversions to formats other than those PNG uses internally are not supported by
+LodePNG on purpose, there are myriads of formats, including endianness of 16-bit
+colors, the order in which you store R, G, B and A, and so on. Supporting and
+converting to/from all that is outside the scope of LodePNG.
+
+This may mean that, depending on your use case, you may want to convert the big
+endian output of LodePNG to little endian with a for loop. This is certainly not
+always needed, many applications and libraries support big endian 16-bit colors
+anyway, but it means you cannot simply cast the unsigned char* buffer to an
+unsigned short* buffer on x86 CPUs.
+
+
+7. error values
+---------------
+
+All functions in LodePNG that return an error code, return 0 if everything went
+OK, or a non-zero code if there was an error.
+
+The meaning of the LodePNG error values can be retrieved with the function
+lodepng_error_text: given the numerical error code, it returns a description
+of the error in English as a string.
+
+Check the implementation of lodepng_error_text to see the meaning of each code.
+
+
+8. chunks and PNG editing
+-------------------------
+
+If you want to add extra chunks to a PNG you encode, or use LodePNG for a PNG
+editor that should follow the rules about handling of unknown chunks, or if your
+program is able to read other types of chunks than the ones handled by LodePNG,
+then that's possible with the chunk functions of LodePNG.
+
+A PNG chunk has the following layout:
+
+4 bytes length
+4 bytes type name
+length bytes data
+4 bytes CRC
+
+8.1. iterating through chunks
+-----------------------------
+
+If you have a buffer containing the PNG image data, then the first chunk (the
+IHDR chunk) starts at byte number 8 of that buffer. The first 8 bytes are the
+signature of the PNG and are not part of a chunk. But if you start at byte 8
+then you have a chunk, and can check the following things of it.
+
+NOTE: none of these functions check for memory buffer boundaries. To avoid
+exploits, always make sure the buffer contains all the data of the chunks.
+When using lodepng_chunk_next, make sure the returned value is within the
+allocated memory.
+
+unsigned lodepng_chunk_length(const unsigned char* chunk):
+
+Get the length of the chunk's data. The total chunk length is this length + 12.
+
+void lodepng_chunk_type(char type[5], const unsigned char* chunk):
+unsigned char lodepng_chunk_type_equals(const unsigned char* chunk, const char* type):
+
+Get the type of the chunk or compare if it's a certain type
+
+unsigned char lodepng_chunk_critical(const unsigned char* chunk):
+unsigned char lodepng_chunk_private(const unsigned char* chunk):
+unsigned char lodepng_chunk_safetocopy(const unsigned char* chunk):
+
+Check if the chunk is critical in the PNG standard (only IHDR, PLTE, IDAT and IEND are).
+Check if the chunk is private (public chunks are part of the standard, private ones not).
+Check if the chunk is safe to copy. If it's not, then, when modifying data in a critical
+chunk, unsafe to copy chunks of the old image may NOT be saved in the new one if your
+program doesn't handle that type of unknown chunk.
+
+unsigned char* lodepng_chunk_data(unsigned char* chunk):
+const unsigned char* lodepng_chunk_data_const(const unsigned char* chunk):
+
+Get a pointer to the start of the data of the chunk.
+
+unsigned lodepng_chunk_check_crc(const unsigned char* chunk):
+void lodepng_chunk_generate_crc(unsigned char* chunk):
+
+Check if the crc is correct or generate a correct one.
+
+unsigned char* lodepng_chunk_next(unsigned char* chunk):
+const unsigned char* lodepng_chunk_next_const(const unsigned char* chunk):
+
+Iterate to the next chunk. This works if you have a buffer with consecutive chunks. Note that these
+functions do no boundary checking of the allocated data whatsoever, so make sure there is enough
+data available in the buffer to be able to go to the next chunk.
+
+unsigned lodepng_chunk_append(unsigned char** out, size_t* outlength, const unsigned char* chunk):
+unsigned lodepng_chunk_create(unsigned char** out, size_t* outlength, unsigned length,
+                              const char* type, const unsigned char* data):
+
+These functions are used to create new chunks that are appended to the data in *out that has
+length *outlength. The append function appends an existing chunk to the new data. The create
+function creates a new chunk with the given parameters and appends it. Type is the 4-letter
+name of the chunk.
+
+8.2. chunks in info_png
+-----------------------
+
+The LodePNGInfo struct contains fields that hold unknown chunks. It has 3
+buffers (each with size) to contain 3 types of unknown chunks:
+the ones that come before the PLTE chunk, the ones that come between the PLTE
+and the IDAT chunks, and the ones that come after the IDAT chunks.
+It's necessary to make the distinction between these 3 cases because the PNG
+standard forces to keep the ordering of unknown chunks compared to the critical
+chunks, but does not force any other ordering rules.
+
+info_png.unknown_chunks_data[0] is the chunks before PLTE
+info_png.unknown_chunks_data[1] is the chunks after PLTE, before IDAT
+info_png.unknown_chunks_data[2] is the chunks after IDAT
+
+The chunks in these 3 buffers can be iterated through and read by using the same
+way described in the previous subchapter.
+
+When using the decoder to decode a PNG, you can make it store all unknown chunks
+if you set the option settings.remember_unknown_chunks to 1. By default, this
+option is off (0).
+
+The encoder will always encode unknown chunks that are stored in the info_png.
+If you need it to add a particular chunk that isn't known by LodePNG, you can
+use lodepng_chunk_append or lodepng_chunk_create to append it to the chunk data in
+info_png.unknown_chunks_data[x].
+
+Chunks that are known by LodePNG should not be added in that way. E.g. to make
+LodePNG add a bKGD chunk, set background_defined to true and add the correct
+parameters there instead.
+
+
+9. compiler support
+-------------------
+
+No libraries other than the current standard C library are needed to compile
+LodePNG. For the C++ version, only the standard C++ library is needed on top.
+Add the files lodepng.c(pp) and lodepng.h to your project, include
+lodepng.h where needed, and your program can read/write PNG files.
+
+It is compatible with C90 and up, and C++03 and up.
+
+If performance is important, use optimization when compiling! For both the
+encoder and decoder, this makes a large difference.
+
+Make sure that LodePNG is compiled with the same compiler of the same version
+and with the same settings as the rest of the program, or the interfaces with
+std::vectors and std::strings in C++ can be incompatible.
+
+CHAR_BIT must be 8 or higher, because LodePNG uses unsigned chars for octets.
+
+*) gcc and g++
+
+LodePNG is developed in gcc so this compiler is natively supported. It gives no
+warnings with compiler options "-Wall -Wextra -pedantic -ansi", with gcc and g++
+version 4.7.1 on Linux, 32-bit and 64-bit.
+
+*) Clang
+
+Fully supported and warning-free.
+
+*) Mingw
+
+The Mingw compiler (a port of gcc for Windows) should be fully supported by
+LodePNG.
+
+*) Visual Studio and Visual C++ Express Edition
+
+LodePNG should be warning-free with warning level W4. Two warnings were disabled
+with pragmas though: warning 4244 about implicit conversions, and warning 4996
+where it wants to use a non-standard function fopen_s instead of the standard C
+fopen.
+
+Visual Studio may want "stdafx.h" files to be included in each source file and
+give an error "unexpected end of file while looking for precompiled header".
+This is not standard C++ and will not be added to the stock LodePNG. You can
+disable it for lodepng.cpp only by right clicking it, Properties, C/C++,
+Precompiled Headers, and set it to Not Using Precompiled Headers there.
+
+NOTE: Modern versions of VS should be fully supported, but old versions, e.g.
+VS6, are not guaranteed to work.
+
+*) Compilers on Macintosh
+
+LodePNG has been reported to work both with gcc and LLVM for Macintosh, both for
+C and C++.
+
+*) Other Compilers
+
+If you encounter problems on any compilers, feel free to let me know and I may
+try to fix it if the compiler is modern and standards compliant.
+
+
+10. examples
+------------
+
+This decoder example shows the most basic usage of LodePNG. More complex
+examples can be found on the LodePNG website.
+
+10.1. decoder C++ example
+-------------------------
+
+#include "lodepng.h"
+#include <iostream>
+
+int main(int argc, char *argv[])
+{
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  //load and decode
+  std::vector<unsigned char> image;
+  unsigned width, height;
+  unsigned error = lodepng::decode(image, width, height, filename);
+
+  //if there's an error, display it
+  if(error) std::cout << "decoder error " << error << ": " << lodepng_error_text(error) << std::endl;
+
+  //the pixels are now in the vector "image", 4 bytes per pixel, ordered RGBARGBA..., use it as texture, draw it, ...
+}
+
+10.2. decoder C example
+-----------------------
+
+#include "lodepng.h"
+
+int main(int argc, char *argv[])
+{
+  unsigned error;
+  unsigned char* image;
+  unsigned width, height;
+  const char* filename = argc > 1 ? argv[1] : "test.png";
+
+  error = lodepng_decode32_file(&image, &width, &height, filename);
+
+  if(error) printf("decoder error %u: %s\n", error, lodepng_error_text(error));
+
+  / * use image here * /
+
+  free(image);
+  return 0;
+}
+
+11. state settings reference
+----------------------------
+
+A quick reference of some settings to set on the LodePNGState
+
+For decoding:
+
+state.decoder.zlibsettings.ignore_adler32: ignore ADLER32 checksums
+state.decoder.zlibsettings.custom_...: use custom inflate function
+state.decoder.ignore_crc: ignore CRC checksums
+state.decoder.color_convert: convert internal PNG color to chosen one
+state.decoder.read_text_chunks: whether to read in text metadata chunks
+state.decoder.remember_unknown_chunks: whether to read in unknown chunks
+state.info_raw.colortype: desired color type for decoded image
+state.info_raw.bitdepth: desired bit depth for decoded image
+state.info_raw....: more color settings, see struct LodePNGColorMode
+state.info_png....: no settings for decoder but output, see struct LodePNGInfo
+
+For encoding:
+
+state.encoder.zlibsettings.btype: disable compression by setting it to 0
+state.encoder.zlibsettings.use_lz77: use LZ77 in compression
+state.encoder.zlibsettings.windowsize: tweak LZ77 windowsize
+state.encoder.zlibsettings.minmatch: tweak min LZ77 length to match
+state.encoder.zlibsettings.nicematch: tweak LZ77 match where to stop searching
+state.encoder.zlibsettings.lazymatching: try one more LZ77 matching
+state.encoder.zlibsettings.custom_...: use custom deflate function
+state.encoder.auto_convert: choose optimal PNG color type, if 0 uses info_png
+state.encoder.filter_palette_zero: PNG filter strategy for palette
+state.encoder.filter_strategy: PNG filter strategy to encode with
+state.encoder.force_palette: add palette even if not encoding to one
+state.encoder.add_id: add LodePNG identifier and version as a text chunk
+state.encoder.text_compression: use compressed text chunks for metadata
+state.info_raw.colortype: color type of raw input image you provide
+state.info_raw.bitdepth: bit depth of raw input image you provide
+state.info_raw: more color settings, see struct LodePNGColorMode
+state.info_png.color.colortype: desired color type if auto_convert is false
+state.info_png.color.bitdepth: desired bit depth if auto_convert is false
+state.info_png.color....: more color settings, see struct LodePNGColorMode
+state.info_png....: more PNG related settings, see struct LodePNGInfo
+
+
+12. changes
+-----------
+
+The version number of LodePNG is the date of the change given in the format
+yyyymmdd.
+
+Some changes aren't backwards compatible. Those are indicated with a (!)
+symbol.
+
+*) 18 apr 2016: Changed qsort to custom stable sort (for platforms w/o qsort).
+*) 09 apr 2016: Fixed colorkey usage detection, and better file loading (within
+   the limits of pure C90).
+*) 08 dec 2015: Made load_file function return error if file can't be opened.
+*) 24 okt 2015: Bugfix with decoding to palette output.
+*) 18 apr 2015: Boundary PM instead of just package-merge for faster encoding.
+*) 23 aug 2014: Reduced needless memory usage of decoder.
+*) 28 jun 2014: Removed fix_png setting, always support palette OOB for
+    simplicity. Made ColorProfile public.
+*) 09 jun 2014: Faster encoder by fixing hash bug and more zeros optimization.
+*) 22 dec 2013: Power of two windowsize required for optimization.
+*) 15 apr 2013: Fixed bug with LAC_ALPHA and color key.
+*) 25 mar 2013: Added an optional feature to ignore some PNG errors (fix_png).
+*) 11 mar 2013 (!): Bugfix with custom free. Changed from "my" to "lodepng_"
+    prefix for the custom allocators and made it possible with a new #define to
+    use custom ones in your project without needing to change lodepng's code.
+*) 28 jan 2013: Bugfix with color key.
+*) 27 okt 2012: Tweaks in text chunk keyword length error handling.
+*) 8 okt 2012 (!): Added new filter strategy (entropy) and new auto color mode.
+    (no palette). Better deflate tree encoding. New compression tweak settings.
+    Faster color conversions while decoding. Some internal cleanups.
+*) 23 sep 2012: Reduced warnings in Visual Studio a little bit.
+*) 1 sep 2012 (!): Removed #define's for giving custom (de)compression functions
+    and made it work with function pointers instead.
+*) 23 jun 2012: Added more filter strategies. Made it easier to use custom alloc
+    and free functions and toggle #defines from compiler flags. Small fixes.
+*) 6 may 2012 (!): Made plugging in custom zlib/deflate functions more flexible.
+*) 22 apr 2012 (!): Made interface more consistent, renaming a lot. Removed
+    redundant C++ codec classes. Reduced amount of structs. Everything changed,
+    but it is cleaner now imho and functionality remains the same. Also fixed
+    several bugs and shrunk the implementation code. Made new samples.
+*) 6 nov 2011 (!): By default, the encoder now automatically chooses the best
+    PNG color model and bit depth, based on the amount and type of colors of the
+    raw image. For this, autoLeaveOutAlphaChannel replaced by auto_choose_color.
+*) 9 okt 2011: simpler hash chain implementation for the encoder.
+*) 8 sep 2011: lz77 encoder lazy matching instead of greedy matching.
+*) 23 aug 2011: tweaked the zlib compression parameters after benchmarking.
+    A bug with the PNG filtertype heuristic was fixed, so that it chooses much
+    better ones (it's quite significant). A setting to do an experimental, slow,
+    brute force search for PNG filter types is added.
+*) 17 aug 2011 (!): changed some C zlib related function names.
+*) 16 aug 2011: made the code less wide (max 120 characters per line).
+*) 17 apr 2011: code cleanup. Bugfixes. Convert low to 16-bit per sample colors.
+*) 21 feb 2011: fixed compiling for C90. Fixed compiling with sections disabled.
+*) 11 dec 2010: encoding is made faster, based on suggestion by Peter Eastman
+    to optimize long sequences of zeros.
+*) 13 nov 2010: added LodePNG_InfoColor_hasPaletteAlpha and
+    LodePNG_InfoColor_canHaveAlpha functions for convenience.
+*) 7 nov 2010: added LodePNG_error_text function to get error code description.
+*) 30 okt 2010: made decoding slightly faster
+*) 26 okt 2010: (!) changed some C function and struct names (more consistent).
+     Reorganized the documentation and the declaration order in the header.
+*) 08 aug 2010: only changed some comments and external samples.
+*) 05 jul 2010: fixed bug thanks to warnings in the new gcc version.
+*) 14 mar 2010: fixed bug where too much memory was allocated for char buffers.
+*) 02 sep 2008: fixed bug where it could create empty tree that linux apps could
+    read by ignoring the problem but windows apps couldn't.
+*) 06 jun 2008: added more error checks for out of memory cases.
+*) 26 apr 2008: added a few more checks here and there to ensure more safety.
+*) 06 mar 2008: crash with encoding of strings fixed
+*) 02 feb 2008: support for international text chunks added (iTXt)
+*) 23 jan 2008: small cleanups, and #defines to divide code in sections
+*) 20 jan 2008: support for unknown chunks allowing using LodePNG for an editor.
+*) 18 jan 2008: support for tIME and pHYs chunks added to encoder and decoder.
+*) 17 jan 2008: ability to encode and decode compressed zTXt chunks added
+    Also various fixes, such as in the deflate and the padding bits code.
+*) 13 jan 2008: Added ability to encode Adam7-interlaced images. Improved
+    filtering code of encoder.
+*) 07 jan 2008: (!) changed LodePNG to use ISO C90 instead of C++. A
+    C++ wrapper around this provides an interface almost identical to before.
+    Having LodePNG be pure ISO C90 makes it more portable. The C and C++ code
+    are together in these files but it works both for C and C++ compilers.
+*) 29 dec 2007: (!) changed most integer types to unsigned int + other tweaks
+*) 30 aug 2007: bug fixed which makes this Borland C++ compatible
+*) 09 aug 2007: some VS2005 warnings removed again
+*) 21 jul 2007: deflate code placed in new namespace separate from zlib code
+*) 08 jun 2007: fixed bug with 2- and 4-bit color, and small interlaced images
+*) 04 jun 2007: improved support for Visual Studio 2005: crash with accessing
+    invalid std::vector element [0] fixed, and level 3 and 4 warnings removed
+*) 02 jun 2007: made the encoder add a tag with version by default
+*) 27 may 2007: zlib and png code separated (but still in the same file),
+    simple encoder/decoder functions added for more simple usage cases
+*) 19 may 2007: minor fixes, some code cleaning, new error added (error 69),
+    moved some examples from here to lodepng_examples.cpp
+*) 12 may 2007: palette decoding bug fixed
+*) 24 apr 2007: changed the license from BSD to the zlib license
+*) 11 mar 2007: very simple addition: ability to encode bKGD chunks.
+*) 04 mar 2007: (!) tEXt chunk related fixes, and support for encoding
+    palettized PNG images. Plus little interface change with palette and texts.
+*) 03 mar 2007: Made it encode dynamic Huffman shorter with repeat codes.
+    Fixed a bug where the end code of a block had length 0 in the Huffman tree.
+*) 26 feb 2007: Huffman compression with dynamic trees (BTYPE 2) now implemented
+    and supported by the encoder, resulting in smaller PNGs at the output.
+*) 27 jan 2007: Made the Adler-32 test faster so that a timewaste is gone.
+*) 24 jan 2007: gave encoder an error interface. Added color conversion from any
+    greyscale type to 8-bit greyscale with or without alpha.
+*) 21 jan 2007: (!) Totally changed the interface. It allows more color types
+    to convert to and is more uniform. See the manual for how it works now.
+*) 07 jan 2007: Some cleanup & fixes, and a few changes over the last days:
+    encode/decode custom tEXt chunks, separate classes for zlib & deflate, and
+    at last made the decoder give errors for incorrect Adler32 or Crc.
+*) 01 jan 2007: Fixed bug with encoding PNGs with less than 8 bits per channel.
+*) 29 dec 2006: Added support for encoding images without alpha channel, and
+    cleaned out code as well as making certain parts faster.
+*) 28 dec 2006: Added "Settings" to the encoder.
+*) 26 dec 2006: The encoder now does LZ77 encoding and produces much smaller files now.
+    Removed some code duplication in the decoder. Fixed little bug in an example.
+*) 09 dec 2006: (!) Placed output parameters of public functions as first parameter.
+    Fixed a bug of the decoder with 16-bit per color.
+*) 15 okt 2006: Changed documentation structure
+*) 09 okt 2006: Encoder class added. It encodes a valid PNG image from the
+    given image buffer, however for now it's not compressed.
+*) 08 sep 2006: (!) Changed to interface with a Decoder class
+*) 30 jul 2006: (!) LodePNG_InfoPng , width and height are now retrieved in different
+    way. Renamed decodePNG to decodePNGGeneric.
+*) 29 jul 2006: (!) Changed the interface: image info is now returned as a
+    struct of type LodePNG::LodePNG_Info, instead of a vector, which was a bit clumsy.
+*) 28 jul 2006: Cleaned the code and added new error checks.
+    Corrected terminology "deflate" into "inflate".
+*) 23 jun 2006: Added SDL example in the documentation in the header, this
+    example allows easy debugging by displaying the PNG and its transparency.
+*) 22 jun 2006: (!) Changed way to obtain error value. Added
+    loadFile function for convenience. Made decodePNG32 faster.
+*) 21 jun 2006: (!) Changed type of info vector to unsigned.
+    Changed position of palette in info vector. Fixed an important bug that
+    happened on PNGs with an uncompressed block.
+*) 16 jun 2006: Internally changed unsigned into unsigned where
+    needed, and performed some optimizations.
+*) 07 jun 2006: (!) Renamed functions to decodePNG and placed them
+    in LodePNG namespace. Changed the order of the parameters. Rewrote the
+    documentation in the header. Renamed files to lodepng.cpp and lodepng.h
+*) 22 apr 2006: Optimized and improved some code
+*) 07 sep 2005: (!) Changed to std::vector interface
+*) 12 aug 2005: Initial release (C++, decoder only)
+
+
+13. contact information
+-----------------------
+
+Feel free to contact me with suggestions, problems, comments, ... concerning
+LodePNG. If you encounter a PNG image that doesn't work properly with this
+decoder, feel free to send it and I'll use it to find and fix the problem.
+
+My email address is (puzzle the account and domain together with an @ symbol):
+Domain: gmail dot com.
+Account: lode dot vandevenne.
+
+
+Copyright (c) 2005-2016 Lode Vandevenne
+*/
diff --git a/3rdparty/bimg/3rdparty/maratis-tcl/LICENSE b/3rdparty/bimg/3rdparty/maratis-tcl/LICENSE
new file mode 100644
index 0000000..3126b7e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/maratis-tcl/LICENSE
@@ -0,0 +1,22 @@
+Maratis Tiny C library
+
+Copyright (c) 2015 Anael Seghezzi <www.maratis3d.com>
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would
+    be appreciated but is not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not
+    be misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source
+    distribution.
diff --git a/3rdparty/bimg/3rdparty/maratis-tcl/m_image.h b/3rdparty/bimg/3rdparty/maratis-tcl/m_image.h
new file mode 100644
index 0000000..9923482
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/maratis-tcl/m_image.h
@@ -0,0 +1,2340 @@
+/*======================================================================
+ Maratis Tiny C Library
+ version 1.0
+------------------------------------------------------------------------
+ Copyright (c) 2015 Anael Seghezzi <www.maratis3d.org>
+ Copyright (c) 2015 Marti Maria Saguer
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the authors be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+    claim that you wrote the original software. If you use this software
+    in a product, an acknowledgment in the product documentation would
+    be appreciated but is not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not
+    be misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source
+    distribution.
+
+========================================================================*/
+/*
+   Image manipulation :
+   - transformation (re-frame, mirror, rotation)
+   - conversions (float, half, ubyte, linear, greyscale...)
+   - filtering (convolution, Gaussian blur, Harris)
+   - scaling (pyramid, generic, bilinear)
+   - morphology (flood-fill, dilate, erode, thinning)
+   - edge and corner detection (Sobel, Harris)
+
+   to create the implementation,
+   #define M_IMAGE_IMPLEMENTATION
+   in *one* C/CPP file that includes this file.
+
+   optional:
+   include after *m_math.h*
+   
+   //////////////////////////////////////////////////////
+   Example: create a 256x256 float image with 1 component:
+ 
+   struct m_image foo1 = M_IMAGE_IDENTITY();
+   struct m_image foo2 = M_IMAGE_IDENTITY();   
+   int x, y;
+   
+   m_image_create(&foo1, M_FLOAT, 256, 256, 1);
+   memset(foo1.data, 0, foo1.size * sizeof(float)); // clear to zero
+   
+   y = 128; x = 128;
+   ((float *)foo1.data)[y * foo1.width + x] = 1.0f; // set (x, y) pixel to one
+   
+   m_image_gaussian_blur(&foo2, &foo1, 3, 3); // apply Gaussian blur
+   
+   m_image_destroy(&foo2);
+   m_image_destroy(&foo1);
+*/
+
+#ifndef M_IMAGE_H
+#define M_IMAGE_H
+
+#include <stdint.h>
+
+#define M_IMAGE_VERSION 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef MIAPI
+#define MIAPI extern
+#endif
+
+#define M_VOID   0
+#define M_BOOL   1
+#define M_BYTE   2
+#define M_UBYTE  3
+#define M_SHORT  4
+#define M_USHORT 5
+#define M_INT    6
+#define M_UINT   7
+#define M_HALF   8
+#define M_FLOAT  9
+#define M_DOUBLE 10
+
+struct m_image /* image descriptor filled by m_image_create and released by m_image_destroy */
+{
+   void *data;  /* pixel buffer; the header example casts it to float * for an M_FLOAT image */
+   int size;    /* element count, not bytes: the header example clears size * sizeof(float) bytes */
+   int width;
+   int height;
+   int comp;    /* presumably the component count passed to m_image_create - confirm */
+   char type;   /* presumably one of the M_* type constants passed to m_image_create - confirm */
+};
+
+/* identity, must be used before calling m_image_create */
+#define M_IMAGE_IDENTITY() {0, 0, 0, 0, 0, 0}
+
+/* fully supported types are: M_UBYTE, M_USHORT, M_HALF, M_FLOAT
+   partially supported types: M_BYTE, M_SHORT, M_INT, M_UINT (no support for conversion) */
+MIAPI void m_image_create(struct m_image *image, char type, int width, int height, int comp);
+MIAPI void m_image_destroy(struct m_image *image);
+
+MIAPI void m_image_ubyte_to_float(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_ushort_to_float(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_half_to_float(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_float_to_ubyte(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_float_to_ushort(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_float_to_half(struct m_image *dest, const struct m_image *src);
+
+MIAPI void m_image_copy(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_copy_sub_image(struct m_image *dest, const struct m_image *src, int x, int y, int w, int h);
+MIAPI void m_image_reframe(struct m_image *dest, const struct m_image *src, int left, int top, int right, int bottom);
+MIAPI void m_image_extract_component(struct m_image *dest, const struct m_image *src, int c);
+MIAPI void m_image_rotate_left(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_rotate_right(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_rotate_180(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_mirror_x(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_mirror_y(struct m_image *dest, const struct m_image *src);
+
+MIAPI void m_image_premultiply(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_unpremultiply(struct m_image *dest, const struct m_image *src);
+
+/* float/half conversion */
+MIAPI float    m_half2float(uint16_t h);
+MIAPI uint16_t m_float2half(float flt);
+
+/* raw processing */
+MIAPI void  m_gaussian_kernel(float *dest, int size); /* size == 3 yields the taps 0.25, 0.5, 0.25 */
+MIAPI void  m_sst(float *dest, const float *src, int count); /* count (dx, dy) pairs -> (dx*dx, dy*dy, dx*dy) triples */
+MIAPI void  m_harris_response(float *dest, const float *src, int count); /* count triples in, count floats out */
+MIAPI void  m_tfm(float *dest, const float *src, int count); /* count triples in, count pairs out */
+MIAPI void  m_normalize(float *dest, const float *src, int size); /* dest = src / norm(src) */
+MIAPI void  m_normalize_sum(float *dest, const float *src, int size); /* dest = src / sum(src) */
+MIAPI float m_mean(const float *src, int size); /* 0 when size <= 0 */
+MIAPI float m_squared_distance(const float *src1, const float *src2, int size); /* sum of (src2[i] - src1[i])^2 */
+MIAPI float m_convolution(const float *src1, const float *src2, int size); /* a dot product really */
+MIAPI float m_chi_squared_distance(const float *src1, const float *src2, int size); /* good at estimating signed histogram differences */
+
+/* conversion to 1 component (float image only) */
+MIAPI void m_image_grey(struct m_image *dest, const struct m_image *src); /* from RGB src */
+MIAPI void m_image_max(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_max_abs(struct m_image *dest, const struct m_image *src);
+
+/* summed area table (also called "integral image") */
+MIAPI void m_image_summed_area(struct m_image *dest, const struct m_image *src);
+
+/* convolutions (float image only) */
+/* if alpha channel, src image must be pre-multiplied */
+MIAPI void m_image_convolution_h(struct m_image *dest, const struct m_image *src, float *kernel, int size); /* horizontal */
+MIAPI void m_image_convolution_v(struct m_image *dest, const struct m_image *src, float *kernel, int size); /* vertical */
+MIAPI void m_image_gaussian_blur(struct m_image *dest, const struct m_image *src, int dx, int dy);
+
+/* edge and corner (float 1 component image only) */
+MIAPI void m_image_sobel(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_harris(struct m_image *dest, const struct m_image *src, int radius);
+
+/* morphology (ubyte 1 component image only) */
+MIAPI int  m_image_floodfill_4x(struct m_image *dest, int x, int y, unsigned char ref, unsigned char value, unsigned short *stack, int stack_size);
+MIAPI int  m_image_floodfill_8x(struct m_image *dest, int x, int y, unsigned char ref, unsigned char value, unsigned short *stack, int stack_size);
+MIAPI void m_image_dilate(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_erode(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_edge_4x(struct m_image *dest, const struct m_image *src, unsigned char ref);
+MIAPI void m_image_thin(struct m_image *dest);
+
+/* non maxima suppression (float image only) */
+MIAPI void m_image_non_max_supp(struct m_image *dest, const struct m_image *src, int radius, float threshold);
+
+/* detect Harris corners
+   margin: margin around the image to exclude corners
+   radius: maxima radius
+   threshold: Harris response threshold
+   corners: corners coordinates of size max_count * 2
+   max_count: maximum number of corners
+   return corner count */
+MIAPI int m_image_corner_harris(const struct m_image *src, int margin, int radius, float threshold, int *corners, int max_count);
+
+/* resizing (float image only) */
+MIAPI void m_image_sub_pixel(const struct m_image *src, float x, float y, float *result);
+MIAPI void m_image_pyrdown(struct m_image *dest, const struct m_image *src);
+MIAPI void m_image_resize(struct m_image *dest, const struct m_image *src, int new_width, int new_height);
+
+#ifdef __cplusplus
+}
+#endif
+/*
+----------------------------------------------------------------------*/
+#endif /* M_IMAGE_H */
+
+#ifdef M_IMAGE_IMPLEMENTATION
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#ifndef M_SAFE_FREE /* frees p when it is non-NULL and resets it to NULL; p must be an lvalue */
+#define M_SAFE_FREE(p) {if (p) {free(p); (p) = NULL;}}
+#endif
+/* the helpers below are plain macros: an argument may be evaluated more than once */
+#ifndef M_MIN
+#define M_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+#ifndef M_MAX
+#define M_MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+#ifndef M_ABS
+#define M_ABS(a) (((a) < 0) ? -(a) : (a))
+#endif
+#ifndef M_CLAMP /* returns high when x > high, low when x < low, otherwise x */
+#define M_CLAMP(x, low, high) (((x) > (high)) ? (high) : (((x) < (low)) ? (low) : (x)))
+#endif
+
+/* Fill dest[0..size-1] with a 1D Gaussian kernel normalized so its taps sum to 1.
+   size == 3 uses the fixed taps 1/4, 1/2, 1/4. Nothing is written when size <= 0.
+   size is expected to be odd: for an even size only the first size - 1 taps are
+   computed and the last one is set to zero. */
+MIAPI void m_gaussian_kernel(float *dest, int size)
+{
+   const float sigma = 1.6f;
+   float rs, s2, isum, sum = 0.0f;
+   int radius, taps, r;
+   if (size <= 0)
+      return;
+   if (size == 3) {
+      dest[0] = 0.25f; dest[1] = 0.50f; dest[2] = 0.25f;
+      return;
+   }
+
+   radius = (size - 1) / 2;
+   taps = 2 * radius + 1;
+   for (r = taps; r < size; r++)
+      dest[r] = 0.0f; /* even size: clear the tap the loop below never writes */
+   if (radius == 0) { /* size 1 or 2: sigma / radius would be inf and every tap NaN */
+      dest[0] = 1.0f;
+      return;
+   }
+
+   s2 = 1.0f / expf(sigma * sigma * 2.25f); /* offset subtracted from every tap */
+   rs = sigma / (float)radius;
+   for (r = -radius; r <= radius; r++) {
+      float x = fabsf(r * rs);
+      dest[r + radius] = (1.0f / expf(x * x)) - s2;
+      sum += dest[r + radius];
+   }
+   isum = (sum > 0.0f) ? 1.0f / sum : 1.0f; /* leave the taps untouched if sum is not > 0 */
+   for (r = 0; r < taps; r++)
+      dest[r] *= isum;
+}
+
+/* Expand count pairs (dx, dy) read from src into triples (dx*dx, dy*dy, dx*dy)
+   written to dest; src holds 2 * count floats and dest 3 * count floats. */
+MIAPI void m_sst(float *dest, const float *src, int count)
+{
+   int n;
+   for (n = 0; n < count; n++) {
+      const float gx = src[2 * n];
+      const float gy = src[2 * n + 1];
+      dest[3 * n] = gx * gx;
+      dest[3 * n + 1] = gy * gy;
+      dest[3 * n + 2] = gx * gy;
+   }
+}
+
+/* For each of count triples (dx2, dy2, dxy) in src, store the scalar
+   (dx2 * dy2 - dxy * dxy) / (dx2 + dy2 + 1e-8f) in dest (one float per triple). */
+MIAPI void m_harris_response(float *dest, const float *src, int count)
+{
+   int n;
+   for (n = 0; n < count; n++) {
+      const float *t = src + 3 * n;
+      const float numer = t[0] * t[1] - t[2] * t[2];
+      const float denom = t[0] + t[1] + 1e-8f; /* the epsilon keeps the divisor non-zero for all-zero input */
+      dest[n] = numer / denom;
+   }
+}
+
+/* For each of the count triples (src[0], src[1], src[2]) read from src, write two
+   floats to dest built from the off-diagonal term src[2] and the larger root
+   lambda of the symmetric 2x2 matrix [[src[0], src[2]], [src[2], src[1]]].
+   The radicand is evaluated as (hi - lo)^2 + 4 * dxy^2: the expanded form
+   hi^2 - 2 * lo * hi + lo^2 can round below zero for nearly equal inputs and
+   make sqrtf return NaN. */
+MIAPI void m_tfm(float *dest, const float *src, int count)
+{
+   int i;
+   for (i = 0; i < count; i++, src += 3, dest += 2) {
+      int first_smaller = src[0] < src[1];
+      float lo = first_smaller ? src[0] : src[1]; /* smaller (or equal) diagonal term */
+      float hi = first_smaller ? src[1] : src[0]; /* larger (or equal) diagonal term */
+      float dxy = src[2];
+      float diff = hi - lo;
+      float lambda = 0.5f * (hi + lo + sqrtf((diff * diff) + (4.0f * dxy * dxy)));
+      if (first_smaller) {
+         dest[0] = lo - lambda;
+         dest[1] = dxy;
+      }
+      else {
+         dest[0] = dxy;
+         dest[1] = lo - lambda;
+      }
+   }
+}
+
+/* Returns 0.5 * sum over i in [0, size) of (src2[i] - src1[i])^2 / (src1[i] + src2[i]), */
+/* skipping every index whose pair does not have a positive sum. */
+MIAPI float m_chi_squared_distance(const float *src1, const float *src2, int size)
+{
+   float acc = 0;
+   int k;
+
+   for (k = 0; k < size; k++) {
+      const float a = src1[k], b = src2[k];
+      const float total = a + b;
+      if (total > 0) {
+         const float d = b - a;
+         acc += (d * d) / total;
+      }
+   }
+
+   return acc * 0.5f;
+}
+
+/* Dot product: sum of src1[i] * src2[i] for i = 0 .. size-1, accumulated in index order. */
+MIAPI float m_convolution(const float *src1, const float *src2, int size)
+{
+   float dot = 0; int left;
+   for (left = size; left > 0; left--) dot += (*src1++) * (*src2++);
+   return dot;
+}
+
+/* Writes src scaled to unit Euclidean length into dest; zero-fills dest when the sum of squares is not positive. */
+MIAPI void m_normalize(float *dest, const float *src, int size)
+{
+   float sum = 0.0f; int i;
+   if (size <= 0) return; /* a negative size would wrap to a huge memset length below */
+   for (i = 0; i < size; i++)
+      sum += src[i] * src[i];
+   if (sum > 0.0f) {
+      const float inv = 1.0f / sqrtf(sum);
+      for (i = 0; i < size; i++)
+         dest[i] = src[i] * inv;
+   } else {
+      memset(dest, 0, (size_t)size * sizeof(float));
+   }
+}
+
+/* Writes src divided by the sum of its elements into dest; zero-fills dest when that sum is not positive. */
+MIAPI void m_normalize_sum(float *dest, const float *src, int size)
+{
+   float sum = 0.0f; int i;
+   if (size <= 0) return; /* a negative size would wrap to a huge memset length below */
+   for (i = 0; i < size; i++)
+      sum += src[i];
+   if (sum > 0.0f) {
+      const float inv = 1.0f / sum;
+      for (i = 0; i < size; i++)
+         dest[i] = src[i] * inv;
+   } else {
+      memset(dest, 0, (size_t)size * sizeof(float));
+   }
+}
+
+/* Arithmetic mean of src[0..size-1]; returns 0 when size <= 0. */
+MIAPI float m_mean(const float *src, int size)
+{
+   float total = 0; int k;
+   for (k = 0; k < size; k++) total += src[k];
+   return (size <= 0) ? 0 : total / (float)size;
+}
+
+/* Squared Euclidean distance: sum of (src2[i] - src1[i])^2 for i = 0 .. size-1. */
+MIAPI float m_squared_distance(const float *src1, const float *src2, int size)
+{
+   float acc = 0; int k;
+   for (k = 0; k < size; k++) {
+      const float d = src2[k] - src1[k]; acc += d * d;
+   }
+   return acc;
+}
+
+/* m_half2float / m_float2half :
+   a big thanks to Marti Maria Saguer for allowing the use of this code
+   under the zlib license from "Little Color Management System" (cmshalf.c) */
+
+/* This code is inspired by the paper "Fast Half Float Conversions"
+   by Jeroen van der Zijp */
+
+static uint32_t m__mantissa[2048] = { /* entries 0..1023 run from 0x00000000 up to 0x387fc000; entries 1024..2047 run from 0x38000000 in steps of 0x2000. NOTE(review): presumably the m_half2float mantissa lookup - the code using it is not visible here */
+0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34a00000,
+0x34c00000, 0x34e00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000,
+0x35400000, 0x35500000, 0x35600000, 0x35700000, 0x35800000, 0x35880000,
+0x35900000, 0x35980000, 0x35a00000, 0x35a80000, 0x35b00000, 0x35b80000,
+0x35c00000, 0x35c80000, 0x35d00000, 0x35d80000, 0x35e00000, 0x35e80000,
+0x35f00000, 0x35f80000, 0x36000000, 0x36040000, 0x36080000, 0x360c0000,
+0x36100000, 0x36140000, 0x36180000, 0x361c0000, 0x36200000, 0x36240000,
+0x36280000, 0x362c0000, 0x36300000, 0x36340000, 0x36380000, 0x363c0000,
+0x36400000, 0x36440000, 0x36480000, 0x364c0000, 0x36500000, 0x36540000,
+0x36580000, 0x365c0000, 0x36600000, 0x36640000, 0x36680000, 0x366c0000,
+0x36700000, 0x36740000, 0x36780000, 0x367c0000, 0x36800000, 0x36820000,
+0x36840000, 0x36860000, 0x36880000, 0x368a0000, 0x368c0000, 0x368e0000,
+0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369a0000,
+0x369c0000, 0x369e0000, 0x36a00000, 0x36a20000, 0x36a40000, 0x36a60000,
+0x36a80000, 0x36aa0000, 0x36ac0000, 0x36ae0000, 0x36b00000, 0x36b20000,
+0x36b40000, 0x36b60000, 0x36b80000, 0x36ba0000, 0x36bc0000, 0x36be0000,
+0x36c00000, 0x36c20000, 0x36c40000, 0x36c60000, 0x36c80000, 0x36ca0000,
+0x36cc0000, 0x36ce0000, 0x36d00000, 0x36d20000, 0x36d40000, 0x36d60000,
+0x36d80000, 0x36da0000, 0x36dc0000, 0x36de0000, 0x36e00000, 0x36e20000,
+0x36e40000, 0x36e60000, 0x36e80000, 0x36ea0000, 0x36ec0000, 0x36ee0000,
+0x36f00000, 0x36f20000, 0x36f40000, 0x36f60000, 0x36f80000, 0x36fa0000,
+0x36fc0000, 0x36fe0000, 0x37000000, 0x37010000, 0x37020000, 0x37030000,
+0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000,
+0x370a0000, 0x370b0000, 0x370c0000, 0x370d0000, 0x370e0000, 0x370f0000,
+0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000,
+0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371a0000, 0x371b0000,
+0x371c0000, 0x371d0000, 0x371e0000, 0x371f0000, 0x37200000, 0x37210000,
+0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000,
+0x37280000, 0x37290000, 0x372a0000, 0x372b0000, 0x372c0000, 0x372d0000,
+0x372e0000, 0x372f0000, 0x37300000, 0x37310000, 0x37320000, 0x37330000,
+0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000,
+0x373a0000, 0x373b0000, 0x373c0000, 0x373d0000, 0x373e0000, 0x373f0000,
+0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000,
+0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374a0000, 0x374b0000,
+0x374c0000, 0x374d0000, 0x374e0000, 0x374f0000, 0x37500000, 0x37510000,
+0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000,
+0x37580000, 0x37590000, 0x375a0000, 0x375b0000, 0x375c0000, 0x375d0000,
+0x375e0000, 0x375f0000, 0x37600000, 0x37610000, 0x37620000, 0x37630000,
+0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000,
+0x376a0000, 0x376b0000, 0x376c0000, 0x376d0000, 0x376e0000, 0x376f0000,
+0x37700000, 0x37710000, 0x37720000, 0x37730000, 0x37740000, 0x37750000,
+0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377a0000, 0x377b0000,
+0x377c0000, 0x377d0000, 0x377e0000, 0x377f0000, 0x37800000, 0x37808000,
+0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000,
+0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000,
+0x37870000, 0x37878000, 0x37880000, 0x37888000, 0x37890000, 0x37898000,
+0x378a0000, 0x378a8000, 0x378b0000, 0x378b8000, 0x378c0000, 0x378c8000,
+0x378d0000, 0x378d8000, 0x378e0000, 0x378e8000, 0x378f0000, 0x378f8000,
+0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000,
+0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000,
+0x37960000, 0x37968000, 0x37970000, 0x37978000, 0x37980000, 0x37988000,
+0x37990000, 0x37998000, 0x379a0000, 0x379a8000, 0x379b0000, 0x379b8000,
+0x379c0000, 0x379c8000, 0x379d0000, 0x379d8000, 0x379e0000, 0x379e8000,
+0x379f0000, 0x379f8000, 0x37a00000, 0x37a08000, 0x37a10000, 0x37a18000,
+0x37a20000, 0x37a28000, 0x37a30000, 0x37a38000, 0x37a40000, 0x37a48000,
+0x37a50000, 0x37a58000, 0x37a60000, 0x37a68000, 0x37a70000, 0x37a78000,
+0x37a80000, 0x37a88000, 0x37a90000, 0x37a98000, 0x37aa0000, 0x37aa8000,
+0x37ab0000, 0x37ab8000, 0x37ac0000, 0x37ac8000, 0x37ad0000, 0x37ad8000,
+0x37ae0000, 0x37ae8000, 0x37af0000, 0x37af8000, 0x37b00000, 0x37b08000,
+0x37b10000, 0x37b18000, 0x37b20000, 0x37b28000, 0x37b30000, 0x37b38000,
+0x37b40000, 0x37b48000, 0x37b50000, 0x37b58000, 0x37b60000, 0x37b68000,
+0x37b70000, 0x37b78000, 0x37b80000, 0x37b88000, 0x37b90000, 0x37b98000,
+0x37ba0000, 0x37ba8000, 0x37bb0000, 0x37bb8000, 0x37bc0000, 0x37bc8000,
+0x37bd0000, 0x37bd8000, 0x37be0000, 0x37be8000, 0x37bf0000, 0x37bf8000,
+0x37c00000, 0x37c08000, 0x37c10000, 0x37c18000, 0x37c20000, 0x37c28000,
+0x37c30000, 0x37c38000, 0x37c40000, 0x37c48000, 0x37c50000, 0x37c58000,
+0x37c60000, 0x37c68000, 0x37c70000, 0x37c78000, 0x37c80000, 0x37c88000,
+0x37c90000, 0x37c98000, 0x37ca0000, 0x37ca8000, 0x37cb0000, 0x37cb8000,
+0x37cc0000, 0x37cc8000, 0x37cd0000, 0x37cd8000, 0x37ce0000, 0x37ce8000,
+0x37cf0000, 0x37cf8000, 0x37d00000, 0x37d08000, 0x37d10000, 0x37d18000,
+0x37d20000, 0x37d28000, 0x37d30000, 0x37d38000, 0x37d40000, 0x37d48000,
+0x37d50000, 0x37d58000, 0x37d60000, 0x37d68000, 0x37d70000, 0x37d78000,
+0x37d80000, 0x37d88000, 0x37d90000, 0x37d98000, 0x37da0000, 0x37da8000,
+0x37db0000, 0x37db8000, 0x37dc0000, 0x37dc8000, 0x37dd0000, 0x37dd8000,
+0x37de0000, 0x37de8000, 0x37df0000, 0x37df8000, 0x37e00000, 0x37e08000,
+0x37e10000, 0x37e18000, 0x37e20000, 0x37e28000, 0x37e30000, 0x37e38000,
+0x37e40000, 0x37e48000, 0x37e50000, 0x37e58000, 0x37e60000, 0x37e68000,
+0x37e70000, 0x37e78000, 0x37e80000, 0x37e88000, 0x37e90000, 0x37e98000,
+0x37ea0000, 0x37ea8000, 0x37eb0000, 0x37eb8000, 0x37ec0000, 0x37ec8000,
+0x37ed0000, 0x37ed8000, 0x37ee0000, 0x37ee8000, 0x37ef0000, 0x37ef8000,
+0x37f00000, 0x37f08000, 0x37f10000, 0x37f18000, 0x37f20000, 0x37f28000,
+0x37f30000, 0x37f38000, 0x37f40000, 0x37f48000, 0x37f50000, 0x37f58000,
+0x37f60000, 0x37f68000, 0x37f70000, 0x37f78000, 0x37f80000, 0x37f88000,
+0x37f90000, 0x37f98000, 0x37fa0000, 0x37fa8000, 0x37fb0000, 0x37fb8000,
+0x37fc0000, 0x37fc8000, 0x37fd0000, 0x37fd8000, 0x37fe0000, 0x37fe8000,
+0x37ff0000, 0x37ff8000, 0x38000000, 0x38004000, 0x38008000, 0x3800c000,
+0x38010000, 0x38014000, 0x38018000, 0x3801c000, 0x38020000, 0x38024000,
+0x38028000, 0x3802c000, 0x38030000, 0x38034000, 0x38038000, 0x3803c000,
+0x38040000, 0x38044000, 0x38048000, 0x3804c000, 0x38050000, 0x38054000,
+0x38058000, 0x3805c000, 0x38060000, 0x38064000, 0x38068000, 0x3806c000,
+0x38070000, 0x38074000, 0x38078000, 0x3807c000, 0x38080000, 0x38084000,
+0x38088000, 0x3808c000, 0x38090000, 0x38094000, 0x38098000, 0x3809c000,
+0x380a0000, 0x380a4000, 0x380a8000, 0x380ac000, 0x380b0000, 0x380b4000,
+0x380b8000, 0x380bc000, 0x380c0000, 0x380c4000, 0x380c8000, 0x380cc000,
+0x380d0000, 0x380d4000, 0x380d8000, 0x380dc000, 0x380e0000, 0x380e4000,
+0x380e8000, 0x380ec000, 0x380f0000, 0x380f4000, 0x380f8000, 0x380fc000,
+0x38100000, 0x38104000, 0x38108000, 0x3810c000, 0x38110000, 0x38114000,
+0x38118000, 0x3811c000, 0x38120000, 0x38124000, 0x38128000, 0x3812c000,
+0x38130000, 0x38134000, 0x38138000, 0x3813c000, 0x38140000, 0x38144000,
+0x38148000, 0x3814c000, 0x38150000, 0x38154000, 0x38158000, 0x3815c000,
+0x38160000, 0x38164000, 0x38168000, 0x3816c000, 0x38170000, 0x38174000,
+0x38178000, 0x3817c000, 0x38180000, 0x38184000, 0x38188000, 0x3818c000,
+0x38190000, 0x38194000, 0x38198000, 0x3819c000, 0x381a0000, 0x381a4000,
+0x381a8000, 0x381ac000, 0x381b0000, 0x381b4000, 0x381b8000, 0x381bc000,
+0x381c0000, 0x381c4000, 0x381c8000, 0x381cc000, 0x381d0000, 0x381d4000,
+0x381d8000, 0x381dc000, 0x381e0000, 0x381e4000, 0x381e8000, 0x381ec000,
+0x381f0000, 0x381f4000, 0x381f8000, 0x381fc000, 0x38200000, 0x38204000,
+0x38208000, 0x3820c000, 0x38210000, 0x38214000, 0x38218000, 0x3821c000,
+0x38220000, 0x38224000, 0x38228000, 0x3822c000, 0x38230000, 0x38234000,
+0x38238000, 0x3823c000, 0x38240000, 0x38244000, 0x38248000, 0x3824c000,
+0x38250000, 0x38254000, 0x38258000, 0x3825c000, 0x38260000, 0x38264000,
+0x38268000, 0x3826c000, 0x38270000, 0x38274000, 0x38278000, 0x3827c000,
+0x38280000, 0x38284000, 0x38288000, 0x3828c000, 0x38290000, 0x38294000,
+0x38298000, 0x3829c000, 0x382a0000, 0x382a4000, 0x382a8000, 0x382ac000,
+0x382b0000, 0x382b4000, 0x382b8000, 0x382bc000, 0x382c0000, 0x382c4000,
+0x382c8000, 0x382cc000, 0x382d0000, 0x382d4000, 0x382d8000, 0x382dc000,
+0x382e0000, 0x382e4000, 0x382e8000, 0x382ec000, 0x382f0000, 0x382f4000,
+0x382f8000, 0x382fc000, 0x38300000, 0x38304000, 0x38308000, 0x3830c000,
+0x38310000, 0x38314000, 0x38318000, 0x3831c000, 0x38320000, 0x38324000,
+0x38328000, 0x3832c000, 0x38330000, 0x38334000, 0x38338000, 0x3833c000,
+0x38340000, 0x38344000, 0x38348000, 0x3834c000, 0x38350000, 0x38354000,
+0x38358000, 0x3835c000, 0x38360000, 0x38364000, 0x38368000, 0x3836c000,
+0x38370000, 0x38374000, 0x38378000, 0x3837c000, 0x38380000, 0x38384000,
+0x38388000, 0x3838c000, 0x38390000, 0x38394000, 0x38398000, 0x3839c000,
+0x383a0000, 0x383a4000, 0x383a8000, 0x383ac000, 0x383b0000, 0x383b4000,
+0x383b8000, 0x383bc000, 0x383c0000, 0x383c4000, 0x383c8000, 0x383cc000,
+0x383d0000, 0x383d4000, 0x383d8000, 0x383dc000, 0x383e0000, 0x383e4000,
+0x383e8000, 0x383ec000, 0x383f0000, 0x383f4000, 0x383f8000, 0x383fc000,
+0x38400000, 0x38404000, 0x38408000, 0x3840c000, 0x38410000, 0x38414000,
+0x38418000, 0x3841c000, 0x38420000, 0x38424000, 0x38428000, 0x3842c000,
+0x38430000, 0x38434000, 0x38438000, 0x3843c000, 0x38440000, 0x38444000,
+0x38448000, 0x3844c000, 0x38450000, 0x38454000, 0x38458000, 0x3845c000,
+0x38460000, 0x38464000, 0x38468000, 0x3846c000, 0x38470000, 0x38474000,
+0x38478000, 0x3847c000, 0x38480000, 0x38484000, 0x38488000, 0x3848c000,
+0x38490000, 0x38494000, 0x38498000, 0x3849c000, 0x384a0000, 0x384a4000,
+0x384a8000, 0x384ac000, 0x384b0000, 0x384b4000, 0x384b8000, 0x384bc000,
+0x384c0000, 0x384c4000, 0x384c8000, 0x384cc000, 0x384d0000, 0x384d4000,
+0x384d8000, 0x384dc000, 0x384e0000, 0x384e4000, 0x384e8000, 0x384ec000,
+0x384f0000, 0x384f4000, 0x384f8000, 0x384fc000, 0x38500000, 0x38504000,
+0x38508000, 0x3850c000, 0x38510000, 0x38514000, 0x38518000, 0x3851c000,
+0x38520000, 0x38524000, 0x38528000, 0x3852c000, 0x38530000, 0x38534000,
+0x38538000, 0x3853c000, 0x38540000, 0x38544000, 0x38548000, 0x3854c000,
+0x38550000, 0x38554000, 0x38558000, 0x3855c000, 0x38560000, 0x38564000,
+0x38568000, 0x3856c000, 0x38570000, 0x38574000, 0x38578000, 0x3857c000,
+0x38580000, 0x38584000, 0x38588000, 0x3858c000, 0x38590000, 0x38594000,
+0x38598000, 0x3859c000, 0x385a0000, 0x385a4000, 0x385a8000, 0x385ac000,
+0x385b0000, 0x385b4000, 0x385b8000, 0x385bc000, 0x385c0000, 0x385c4000,
+0x385c8000, 0x385cc000, 0x385d0000, 0x385d4000, 0x385d8000, 0x385dc000,
+0x385e0000, 0x385e4000, 0x385e8000, 0x385ec000, 0x385f0000, 0x385f4000,
+0x385f8000, 0x385fc000, 0x38600000, 0x38604000, 0x38608000, 0x3860c000,
+0x38610000, 0x38614000, 0x38618000, 0x3861c000, 0x38620000, 0x38624000,
+0x38628000, 0x3862c000, 0x38630000, 0x38634000, 0x38638000, 0x3863c000,
+0x38640000, 0x38644000, 0x38648000, 0x3864c000, 0x38650000, 0x38654000,
+0x38658000, 0x3865c000, 0x38660000, 0x38664000, 0x38668000, 0x3866c000,
+0x38670000, 0x38674000, 0x38678000, 0x3867c000, 0x38680000, 0x38684000,
+0x38688000, 0x3868c000, 0x38690000, 0x38694000, 0x38698000, 0x3869c000,
+0x386a0000, 0x386a4000, 0x386a8000, 0x386ac000, 0x386b0000, 0x386b4000,
+0x386b8000, 0x386bc000, 0x386c0000, 0x386c4000, 0x386c8000, 0x386cc000,
+0x386d0000, 0x386d4000, 0x386d8000, 0x386dc000, 0x386e0000, 0x386e4000,
+0x386e8000, 0x386ec000, 0x386f0000, 0x386f4000, 0x386f8000, 0x386fc000,
+0x38700000, 0x38704000, 0x38708000, 0x3870c000, 0x38710000, 0x38714000,
+0x38718000, 0x3871c000, 0x38720000, 0x38724000, 0x38728000, 0x3872c000,
+0x38730000, 0x38734000, 0x38738000, 0x3873c000, 0x38740000, 0x38744000,
+0x38748000, 0x3874c000, 0x38750000, 0x38754000, 0x38758000, 0x3875c000,
+0x38760000, 0x38764000, 0x38768000, 0x3876c000, 0x38770000, 0x38774000,
+0x38778000, 0x3877c000, 0x38780000, 0x38784000, 0x38788000, 0x3878c000,
+0x38790000, 0x38794000, 0x38798000, 0x3879c000, 0x387a0000, 0x387a4000,
+0x387a8000, 0x387ac000, 0x387b0000, 0x387b4000, 0x387b8000, 0x387bc000,
+0x387c0000, 0x387c4000, 0x387c8000, 0x387cc000, 0x387d0000, 0x387d4000,
+0x387d8000, 0x387dc000, 0x387e0000, 0x387e4000, 0x387e8000, 0x387ec000,
+0x387f0000, 0x387f4000, 0x387f8000, 0x387fc000, 0x38000000, 0x38002000,
+0x38004000, 0x38006000, 0x38008000, 0x3800a000, 0x3800c000, 0x3800e000,
+0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801a000,
+0x3801c000, 0x3801e000, 0x38020000, 0x38022000, 0x38024000, 0x38026000,
+0x38028000, 0x3802a000, 0x3802c000, 0x3802e000, 0x38030000, 0x38032000,
+0x38034000, 0x38036000, 0x38038000, 0x3803a000, 0x3803c000, 0x3803e000,
+0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804a000,
+0x3804c000, 0x3804e000, 0x38050000, 0x38052000, 0x38054000, 0x38056000,
+0x38058000, 0x3805a000, 0x3805c000, 0x3805e000, 0x38060000, 0x38062000,
+0x38064000, 0x38066000, 0x38068000, 0x3806a000, 0x3806c000, 0x3806e000,
+0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807a000,
+0x3807c000, 0x3807e000, 0x38080000, 0x38082000, 0x38084000, 0x38086000,
+0x38088000, 0x3808a000, 0x3808c000, 0x3808e000, 0x38090000, 0x38092000,
+0x38094000, 0x38096000, 0x38098000, 0x3809a000, 0x3809c000, 0x3809e000,
+0x380a0000, 0x380a2000, 0x380a4000, 0x380a6000, 0x380a8000, 0x380aa000,
+0x380ac000, 0x380ae000, 0x380b0000, 0x380b2000, 0x380b4000, 0x380b6000,
+0x380b8000, 0x380ba000, 0x380bc000, 0x380be000, 0x380c0000, 0x380c2000,
+0x380c4000, 0x380c6000, 0x380c8000, 0x380ca000, 0x380cc000, 0x380ce000,
+0x380d0000, 0x380d2000, 0x380d4000, 0x380d6000, 0x380d8000, 0x380da000,
+0x380dc000, 0x380de000, 0x380e0000, 0x380e2000, 0x380e4000, 0x380e6000,
+0x380e8000, 0x380ea000, 0x380ec000, 0x380ee000, 0x380f0000, 0x380f2000,
+0x380f4000, 0x380f6000, 0x380f8000, 0x380fa000, 0x380fc000, 0x380fe000,
+0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810a000,
+0x3810c000, 0x3810e000, 0x38110000, 0x38112000, 0x38114000, 0x38116000,
+0x38118000, 0x3811a000, 0x3811c000, 0x3811e000, 0x38120000, 0x38122000,
+0x38124000, 0x38126000, 0x38128000, 0x3812a000, 0x3812c000, 0x3812e000,
+0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813a000,
+0x3813c000, 0x3813e000, 0x38140000, 0x38142000, 0x38144000, 0x38146000,
+0x38148000, 0x3814a000, 0x3814c000, 0x3814e000, 0x38150000, 0x38152000,
+0x38154000, 0x38156000, 0x38158000, 0x3815a000, 0x3815c000, 0x3815e000,
+0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816a000,
+0x3816c000, 0x3816e000, 0x38170000, 0x38172000, 0x38174000, 0x38176000,
+0x38178000, 0x3817a000, 0x3817c000, 0x3817e000, 0x38180000, 0x38182000,
+0x38184000, 0x38186000, 0x38188000, 0x3818a000, 0x3818c000, 0x3818e000,
+0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819a000,
+0x3819c000, 0x3819e000, 0x381a0000, 0x381a2000, 0x381a4000, 0x381a6000,
+0x381a8000, 0x381aa000, 0x381ac000, 0x381ae000, 0x381b0000, 0x381b2000,
+0x381b4000, 0x381b6000, 0x381b8000, 0x381ba000, 0x381bc000, 0x381be000,
+0x381c0000, 0x381c2000, 0x381c4000, 0x381c6000, 0x381c8000, 0x381ca000,
+0x381cc000, 0x381ce000, 0x381d0000, 0x381d2000, 0x381d4000, 0x381d6000,
+0x381d8000, 0x381da000, 0x381dc000, 0x381de000, 0x381e0000, 0x381e2000,
+0x381e4000, 0x381e6000, 0x381e8000, 0x381ea000, 0x381ec000, 0x381ee000,
+0x381f0000, 0x381f2000, 0x381f4000, 0x381f6000, 0x381f8000, 0x381fa000,
+0x381fc000, 0x381fe000, 0x38200000, 0x38202000, 0x38204000, 0x38206000,
+0x38208000, 0x3820a000, 0x3820c000, 0x3820e000, 0x38210000, 0x38212000,
+0x38214000, 0x38216000, 0x38218000, 0x3821a000, 0x3821c000, 0x3821e000,
+0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822a000,
+0x3822c000, 0x3822e000, 0x38230000, 0x38232000, 0x38234000, 0x38236000,
+0x38238000, 0x3823a000, 0x3823c000, 0x3823e000, 0x38240000, 0x38242000,
+0x38244000, 0x38246000, 0x38248000, 0x3824a000, 0x3824c000, 0x3824e000,
+0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825a000,
+0x3825c000, 0x3825e000, 0x38260000, 0x38262000, 0x38264000, 0x38266000,
+0x38268000, 0x3826a000, 0x3826c000, 0x3826e000, 0x38270000, 0x38272000,
+0x38274000, 0x38276000, 0x38278000, 0x3827a000, 0x3827c000, 0x3827e000,
+0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828a000,
+0x3828c000, 0x3828e000, 0x38290000, 0x38292000, 0x38294000, 0x38296000,
+0x38298000, 0x3829a000, 0x3829c000, 0x3829e000, 0x382a0000, 0x382a2000,
+0x382a4000, 0x382a6000, 0x382a8000, 0x382aa000, 0x382ac000, 0x382ae000,
+0x382b0000, 0x382b2000, 0x382b4000, 0x382b6000, 0x382b8000, 0x382ba000,
+0x382bc000, 0x382be000, 0x382c0000, 0x382c2000, 0x382c4000, 0x382c6000,
+0x382c8000, 0x382ca000, 0x382cc000, 0x382ce000, 0x382d0000, 0x382d2000,
+0x382d4000, 0x382d6000, 0x382d8000, 0x382da000, 0x382dc000, 0x382de000,
+0x382e0000, 0x382e2000, 0x382e4000, 0x382e6000, 0x382e8000, 0x382ea000,
+0x382ec000, 0x382ee000, 0x382f0000, 0x382f2000, 0x382f4000, 0x382f6000,
+0x382f8000, 0x382fa000, 0x382fc000, 0x382fe000, 0x38300000, 0x38302000,
+0x38304000, 0x38306000, 0x38308000, 0x3830a000, 0x3830c000, 0x3830e000,
+0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831a000,
+0x3831c000, 0x3831e000, 0x38320000, 0x38322000, 0x38324000, 0x38326000,
+0x38328000, 0x3832a000, 0x3832c000, 0x3832e000, 0x38330000, 0x38332000,
+0x38334000, 0x38336000, 0x38338000, 0x3833a000, 0x3833c000, 0x3833e000,
+0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834a000,
+0x3834c000, 0x3834e000, 0x38350000, 0x38352000, 0x38354000, 0x38356000,
+0x38358000, 0x3835a000, 0x3835c000, 0x3835e000, 0x38360000, 0x38362000,
+0x38364000, 0x38366000, 0x38368000, 0x3836a000, 0x3836c000, 0x3836e000,
+0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837a000,
+0x3837c000, 0x3837e000, 0x38380000, 0x38382000, 0x38384000, 0x38386000,
+0x38388000, 0x3838a000, 0x3838c000, 0x3838e000, 0x38390000, 0x38392000,
+0x38394000, 0x38396000, 0x38398000, 0x3839a000, 0x3839c000, 0x3839e000,
+0x383a0000, 0x383a2000, 0x383a4000, 0x383a6000, 0x383a8000, 0x383aa000,
+0x383ac000, 0x383ae000, 0x383b0000, 0x383b2000, 0x383b4000, 0x383b6000,
+0x383b8000, 0x383ba000, 0x383bc000, 0x383be000, 0x383c0000, 0x383c2000,
+0x383c4000, 0x383c6000, 0x383c8000, 0x383ca000, 0x383cc000, 0x383ce000,
+0x383d0000, 0x383d2000, 0x383d4000, 0x383d6000, 0x383d8000, 0x383da000,
+0x383dc000, 0x383de000, 0x383e0000, 0x383e2000, 0x383e4000, 0x383e6000,
+0x383e8000, 0x383ea000, 0x383ec000, 0x383ee000, 0x383f0000, 0x383f2000,
+0x383f4000, 0x383f6000, 0x383f8000, 0x383fa000, 0x383fc000, 0x383fe000,
+0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840a000,
+0x3840c000, 0x3840e000, 0x38410000, 0x38412000, 0x38414000, 0x38416000,
+0x38418000, 0x3841a000, 0x3841c000, 0x3841e000, 0x38420000, 0x38422000,
+0x38424000, 0x38426000, 0x38428000, 0x3842a000, 0x3842c000, 0x3842e000,
+0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843a000,
+0x3843c000, 0x3843e000, 0x38440000, 0x38442000, 0x38444000, 0x38446000,
+0x38448000, 0x3844a000, 0x3844c000, 0x3844e000, 0x38450000, 0x38452000,
+0x38454000, 0x38456000, 0x38458000, 0x3845a000, 0x3845c000, 0x3845e000,
+0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846a000,
+0x3846c000, 0x3846e000, 0x38470000, 0x38472000, 0x38474000, 0x38476000,
+0x38478000, 0x3847a000, 0x3847c000, 0x3847e000, 0x38480000, 0x38482000,
+0x38484000, 0x38486000, 0x38488000, 0x3848a000, 0x3848c000, 0x3848e000,
+0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849a000,
+0x3849c000, 0x3849e000, 0x384a0000, 0x384a2000, 0x384a4000, 0x384a6000,
+0x384a8000, 0x384aa000, 0x384ac000, 0x384ae000, 0x384b0000, 0x384b2000,
+0x384b4000, 0x384b6000, 0x384b8000, 0x384ba000, 0x384bc000, 0x384be000,
+0x384c0000, 0x384c2000, 0x384c4000, 0x384c6000, 0x384c8000, 0x384ca000,
+0x384cc000, 0x384ce000, 0x384d0000, 0x384d2000, 0x384d4000, 0x384d6000,
+0x384d8000, 0x384da000, 0x384dc000, 0x384de000, 0x384e0000, 0x384e2000,
+0x384e4000, 0x384e6000, 0x384e8000, 0x384ea000, 0x384ec000, 0x384ee000,
+0x384f0000, 0x384f2000, 0x384f4000, 0x384f6000, 0x384f8000, 0x384fa000,
+0x384fc000, 0x384fe000, 0x38500000, 0x38502000, 0x38504000, 0x38506000,
+0x38508000, 0x3850a000, 0x3850c000, 0x3850e000, 0x38510000, 0x38512000,
+0x38514000, 0x38516000, 0x38518000, 0x3851a000, 0x3851c000, 0x3851e000,
+0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852a000,
+0x3852c000, 0x3852e000, 0x38530000, 0x38532000, 0x38534000, 0x38536000,
+0x38538000, 0x3853a000, 0x3853c000, 0x3853e000, 0x38540000, 0x38542000,
+0x38544000, 0x38546000, 0x38548000, 0x3854a000, 0x3854c000, 0x3854e000,
+0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 0x3855a000,
+0x3855c000, 0x3855e000, 0x38560000, 0x38562000, 0x38564000, 0x38566000,
+0x38568000, 0x3856a000, 0x3856c000, 0x3856e000, 0x38570000, 0x38572000,
+0x38574000, 0x38576000, 0x38578000, 0x3857a000, 0x3857c000, 0x3857e000,
+0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858a000,
+0x3858c000, 0x3858e000, 0x38590000, 0x38592000, 0x38594000, 0x38596000,
+0x38598000, 0x3859a000, 0x3859c000, 0x3859e000, 0x385a0000, 0x385a2000,
+0x385a4000, 0x385a6000, 0x385a8000, 0x385aa000, 0x385ac000, 0x385ae000,
+0x385b0000, 0x385b2000, 0x385b4000, 0x385b6000, 0x385b8000, 0x385ba000,
+0x385bc000, 0x385be000, 0x385c0000, 0x385c2000, 0x385c4000, 0x385c6000,
+0x385c8000, 0x385ca000, 0x385cc000, 0x385ce000, 0x385d0000, 0x385d2000,
+0x385d4000, 0x385d6000, 0x385d8000, 0x385da000, 0x385dc000, 0x385de000,
+0x385e0000, 0x385e2000, 0x385e4000, 0x385e6000, 0x385e8000, 0x385ea000,
+0x385ec000, 0x385ee000, 0x385f0000, 0x385f2000, 0x385f4000, 0x385f6000,
+0x385f8000, 0x385fa000, 0x385fc000, 0x385fe000, 0x38600000, 0x38602000,
+0x38604000, 0x38606000, 0x38608000, 0x3860a000, 0x3860c000, 0x3860e000,
+0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861a000,
+0x3861c000, 0x3861e000, 0x38620000, 0x38622000, 0x38624000, 0x38626000,
+0x38628000, 0x3862a000, 0x3862c000, 0x3862e000, 0x38630000, 0x38632000,
+0x38634000, 0x38636000, 0x38638000, 0x3863a000, 0x3863c000, 0x3863e000,
+0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864a000,
+0x3864c000, 0x3864e000, 0x38650000, 0x38652000, 0x38654000, 0x38656000,
+0x38658000, 0x3865a000, 0x3865c000, 0x3865e000, 0x38660000, 0x38662000,
+0x38664000, 0x38666000, 0x38668000, 0x3866a000, 0x3866c000, 0x3866e000,
+0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867a000,
+0x3867c000, 0x3867e000, 0x38680000, 0x38682000, 0x38684000, 0x38686000,
+0x38688000, 0x3868a000, 0x3868c000, 0x3868e000, 0x38690000, 0x38692000,
+0x38694000, 0x38696000, 0x38698000, 0x3869a000, 0x3869c000, 0x3869e000,
+0x386a0000, 0x386a2000, 0x386a4000, 0x386a6000, 0x386a8000, 0x386aa000,
+0x386ac000, 0x386ae000, 0x386b0000, 0x386b2000, 0x386b4000, 0x386b6000,
+0x386b8000, 0x386ba000, 0x386bc000, 0x386be000, 0x386c0000, 0x386c2000,
+0x386c4000, 0x386c6000, 0x386c8000, 0x386ca000, 0x386cc000, 0x386ce000,
+0x386d0000, 0x386d2000, 0x386d4000, 0x386d6000, 0x386d8000, 0x386da000,
+0x386dc000, 0x386de000, 0x386e0000, 0x386e2000, 0x386e4000, 0x386e6000,
+0x386e8000, 0x386ea000, 0x386ec000, 0x386ee000, 0x386f0000, 0x386f2000,
+0x386f4000, 0x386f6000, 0x386f8000, 0x386fa000, 0x386fc000, 0x386fe000,
+0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870a000,
+0x3870c000, 0x3870e000, 0x38710000, 0x38712000, 0x38714000, 0x38716000,
+0x38718000, 0x3871a000, 0x3871c000, 0x3871e000, 0x38720000, 0x38722000,
+0x38724000, 0x38726000, 0x38728000, 0x3872a000, 0x3872c000, 0x3872e000,
+0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873a000,
+0x3873c000, 0x3873e000, 0x38740000, 0x38742000, 0x38744000, 0x38746000,
+0x38748000, 0x3874a000, 0x3874c000, 0x3874e000, 0x38750000, 0x38752000,
+0x38754000, 0x38756000, 0x38758000, 0x3875a000, 0x3875c000, 0x3875e000,
+0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876a000,
+0x3876c000, 0x3876e000, 0x38770000, 0x38772000, 0x38774000, 0x38776000,
+0x38778000, 0x3877a000, 0x3877c000, 0x3877e000, 0x38780000, 0x38782000,
+0x38784000, 0x38786000, 0x38788000, 0x3878a000, 0x3878c000, 0x3878e000,
+0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879a000,
+0x3879c000, 0x3879e000, 0x387a0000, 0x387a2000, 0x387a4000, 0x387a6000,
+0x387a8000, 0x387aa000, 0x387ac000, 0x387ae000, 0x387b0000, 0x387b2000,
+0x387b4000, 0x387b6000, 0x387b8000, 0x387ba000, 0x387bc000, 0x387be000,
+0x387c0000, 0x387c2000, 0x387c4000, 0x387c6000, 0x387c8000, 0x387ca000,
+0x387cc000, 0x387ce000, 0x387d0000, 0x387d2000, 0x387d4000, 0x387d6000,
+0x387d8000, 0x387da000, 0x387dc000, 0x387de000, 0x387e0000, 0x387e2000,
+0x387e4000, 0x387e6000, 0x387e8000, 0x387ea000, 0x387ec000, 0x387ee000,
+0x387f0000, 0x387f2000, 0x387f4000, 0x387f6000, 0x387f8000, 0x387fa000,
+0x387fc000, 0x387fe000
+};
+
+static uint16_t m__offset[64] = { /* 0x0400 everywhere except entries 0 and 32, which are 0. NOTE(review): presumably added to the m__mantissa index by m_half2float - confirm against the lookup code */
+0x0000, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0000, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400, 0x0400, 0x0400,
+0x0400, 0x0400, 0x0400, 0x0400
+};
+
+static uint32_t m__exponent[64] = { /* entries 0..30 step by 0x00800000; entries 32..62 repeat them with bit 31 set; entries 31 and 63 are 0x47800000 and 0xc7800000 */
+0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000,
+0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000,
+0x06000000, 0x06800000, 0x07000000, 0x07800000, 0x08000000, 0x08800000,
+0x09000000, 0x09800000, 0x0a000000, 0x0a800000, 0x0b000000, 0x0b800000,
+0x0c000000, 0x0c800000, 0x0d000000, 0x0d800000, 0x0e000000, 0x0e800000,
+0x0f000000, 0x47800000, 0x80000000, 0x80800000, 0x81000000, 0x81800000,
+0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000,
+0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000,
+0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8a000000, 0x8a800000,
+0x8b000000, 0x8b800000, 0x8c000000, 0x8c800000, 0x8d000000, 0x8d800000,
+0x8e000000, 0x8e800000, 0x8f000000, 0xc7800000
+};
+
+static uint16_t m__base[512] = { /* entries 256..511 mirror entries 0..255 with bit 15 (0x8000) set. NOTE(review): presumably the m_float2half base lookup - the code using it is not visible here */
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040,
+0x0080, 0x0100, 0x0200, 0x0400, 0x0800, 0x0c00, 0x1000, 0x1400, 0x1800, 0x1c00,
+0x2000, 0x2400, 0x2800, 0x2c00, 0x3000, 0x3400, 0x3800, 0x3c00, 0x4000, 0x4400,
+0x4800, 0x4c00, 0x5000, 0x5400, 0x5800, 0x5c00, 0x6000, 0x6400, 0x6800, 0x6c00,
+0x7000, 0x7400, 0x7800, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00,
+0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x7c00, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000,
+0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001,
+0x8002, 0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, 0x8200, 0x8400,
+0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00, 0xa000, 0xa400, 0xa800, 0xac00,
+0xb000, 0xb400, 0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00, 0xd000, 0xd400,
+0xd800, 0xdc00, 0xe000, 0xe400, 0xe800, 0xec00, 0xf000, 0xf400, 0xf800, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00, 0xfc00,
+0xfc00, 0xfc00
+};
+
+/* Right-shift amounts used by m_float2half(): the 23-bit float mantissa is
+   shifted right by m__shift[j], where j is the 9-bit sign+exponent field
+   of the float ((bits >> 23) & 0x1ff). */
+static uint8_t  m__shift[512] = {
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17,
+0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d,
+0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
+0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x0d, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13,
+0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
+0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d,
+0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x0d, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18, 0x18,
+0x18, 0x18, 0x18, 0x18, 0x0d
+};
+
+/* Convert a 16-bit half-float bit pattern to a 32-bit float by table lookup.
+   The upper 6 bits of `h` select an entry of m__offset / m__exponent, the
+   lower 10 bits (plus that offset) select an entry of m__mantissa; the two
+   table values are added to form the float's bit pattern. */
+MIAPI float m_half2float(uint16_t h)
+{
+   union {
+      uint32_t bits;
+      float    value;
+   } conv;
+
+   const int row  = h >> 10;
+   const int frac = h & 0x3ff;
+
+   conv.bits = m__mantissa[frac + m__offset[row]] + m__exponent[row];
+   return conv.value;
+}
+
+/* Convert a 32-bit float to a 16-bit half-float bit pattern by table lookup.
+   The 9-bit sign+exponent field of the float indexes m__base (starting
+   bits of the result) and m__shift (how far the 23-bit mantissa is shifted
+   right before being added). The shift discards low mantissa bits, so the
+   conversion truncates rather than rounds. */
+MIAPI uint16_t m_float2half(float flt)
+{
+   union {
+      float    value;
+      uint32_t bits;
+   } conv;
+   uint32_t idx, mantissa;
+
+   conv.value = flt;
+   idx = (conv.bits >> 23) & 0x1ff;
+   mantissa = conv.bits & 0x007fffff;
+
+   return (uint16_t) ((uint32_t) m__base[idx] + (mantissa >> m__shift[idx]));
+}
+
+/* (Re)allocate the pixel buffer of `image` for width * height * comp
+   elements of `type` and record the new layout in the struct.
+   - If the image already has a buffer with exactly this type and these
+     dimensions, nothing happens.
+   - The new buffer is not initialised and the malloc() result is not
+     checked.
+   - An unknown `type` asserts and returns after M_SAFE_FREE has been
+     applied to the old buffer; the other fields are left as they were. */
+MIAPI void m_image_create(struct m_image *image, char type, int width, int height, int comp)
+{
+   size_t elem_size;
+   int size = width * height * comp;
+   assert(size > 0);
+
+   /* existing buffer already matches the requested layout */
+   if (image->data != 0
+       && type == image->type
+       && width == image->width
+       && height == image->height
+       && comp == image->comp)
+      return;
+
+   M_SAFE_FREE(image->data);
+
+   /* bytes per element for the requested type */
+   switch (type) {
+   case M_BYTE:
+   case M_UBYTE:
+      elem_size = sizeof(uint8_t);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      elem_size = sizeof(uint16_t);
+      break;
+   case M_INT:
+   case M_UINT:
+      elem_size = sizeof(uint32_t);
+      break;
+   case M_FLOAT:
+      elem_size = sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   image->data = malloc(size * elem_size);
+   image->type = type;
+   image->width = width;
+   image->height = height;
+   image->comp = comp;
+   image->size = size;
+}
+
+/* Apply M_SAFE_FREE to the pixel buffer, then zero every field of `image`. */
+MIAPI void m_image_destroy(struct m_image *image)
+{
+   M_SAFE_FREE(image->data);
+   memset(image, 0, sizeof *image);
+}
+
+/* Make `dest` a copy of `src`: same type, width, height, component count
+   and pixel data. `dest` is (re)allocated through m_image_create().
+   Copying an image onto itself is a no-op. An unknown type asserts and
+   copies nothing. */
+MIAPI void m_image_copy(struct m_image *dest, const struct m_image *src)
+{
+   size_t elem_size;
+
+   /* memcpy() with identical source and destination is undefined, and
+      there is nothing to copy anyway */
+   if (dest == src)
+      return;
+
+   m_image_create(dest, src->type, src->width, src->height, src->comp);
+
+   /* bytes per element, matching the sizes the original switch used */
+   switch (dest->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      elem_size = sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      elem_size = sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      elem_size = sizeof(int);
+      break;
+   case M_FLOAT:
+      elem_size = sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   memcpy(dest->data, src->data, dest->size * elem_size);
+}
+
+/* Copy the rectangle starting at (x, y) with size w x h out of `src` into
+   `dest`. The rectangle's far edges are clamped to the source bounds and
+   its origin is clamped to be non-negative; `dest` is (re)created with the
+   resulting size and the type / component count of `src`.
+   Rows are copied one at a time with memcpy(). */
+MIAPI void m_image_copy_sub_image(struct m_image *dest, const struct m_image *src, int x, int y, int w, int h)
+{
+   const int comp = src->comp;
+   const int minx = M_MAX(0, x);
+   const int miny = M_MAX(0, y);
+   const int maxx = M_CLAMP(x + w - 1, 0, src->width - 1);
+   const int maxy = M_CLAMP(y + h - 1, 0, src->height - 1);
+   const int dwidth = 1 + maxx - minx;
+   const int dheight = 1 + maxy - miny;
+   const int src_stride = src->width * comp;   /* elements per source row */
+   const int dst_stride = dwidth * comp;       /* elements per copied row */
+   const unsigned char *from;
+   unsigned char *to;
+   size_t elem_size;
+   int row;
+
+   m_image_create(dest, src->type, dwidth, dheight, src->comp);
+
+   /* bytes per element */
+   switch (src->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      elem_size = sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      elem_size = sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      elem_size = sizeof(int);
+      break;
+   case M_FLOAT:
+      elem_size = sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   /* first element of the rectangle inside the source buffer */
+   from = (const unsigned char *)src->data
+        + (size_t)((miny * src->width + minx) * comp) * elem_size;
+   to = (unsigned char *)dest->data;
+
+   for (row = miny; row <= maxy; row++) {
+      memcpy(to, from, dst_stride * elem_size);
+      to += dst_stride * elem_size;
+      from += src_stride * elem_size;
+   }
+}
+
+/* Convert 8-bit unsigned data to float, multiplying each value by 1/255.
+   `dest` is (re)created as M_FLOAT with the dimensions of `src`.
+   The type of `src` is not checked; its data is read as uint8_t. */
+MIAPI void m_image_ubyte_to_float(struct m_image *dest, const struct m_image *src)
+{
+   const float scale = 1.0f / 255.0f;
+   const uint8_t *in;
+   float *out;
+   int k;
+
+   m_image_create(dest, M_FLOAT, src->width, src->height, src->comp);
+
+   in = (const uint8_t *)src->data;
+   out = (float *)dest->data;
+   for (k = 0; k < src->size; k++)
+      out[k] = (float)in[k] * scale;
+}
+
+/* Convert 16-bit unsigned data to float, multiplying each value by 1/65535.
+   `dest` is (re)created as M_FLOAT with the dimensions of `src`.
+   The type of `src` is not checked; its data is read as uint16_t. */
+MIAPI void m_image_ushort_to_float(struct m_image *dest, const struct m_image *src)
+{
+   const float scale = 1.0f / (float)65535;
+   const uint16_t *in;
+   float *out;
+   int k;
+
+   m_image_create(dest, M_FLOAT, src->width, src->height, src->comp);
+
+   in = (const uint16_t *)src->data;
+   out = (float *)dest->data;
+   for (k = 0; k < src->size; k++)
+      out[k] = (float)in[k] * scale;
+}
+
+/* Convert half-float data to float, element by element, through
+   m_half2float(). `dest` is (re)created as M_FLOAT with the dimensions of
+   `src`. The type of `src` is not checked; its data is read as uint16_t. */
+MIAPI void m_image_half_to_float(struct m_image *dest, const struct m_image *src)
+{
+   const uint16_t *in;
+   float *out;
+   int k;
+
+   m_image_create(dest, M_FLOAT, src->width, src->height, src->comp);
+
+   in = (const uint16_t *)src->data;
+   out = (float *)dest->data;
+   for (k = 0; k < src->size; k++)
+      out[k] = m_half2float(in[k]);
+}
+
+/* Convert float data to 8-bit unsigned: each value is multiplied by 255,
+   truncated to int and clamped to [0, 255]. `dest` is (re)created as
+   M_UBYTE with the dimensions of `src`. The type of `src` is not checked. */
+MIAPI void m_image_float_to_ubyte(struct m_image *dest, const struct m_image *src)
+{
+   const float *in;
+   uint8_t *out;
+   int k;
+
+   m_image_create(dest, M_UBYTE, src->width, src->height, src->comp);
+
+   in = (const float *)src->data;
+   out = (uint8_t *)dest->data;
+   for (k = 0; k < src->size; k++) {
+      int scaled = (int)(in[k] * 255);
+      out[k] = (uint8_t)M_CLAMP(scaled, 0, 255);
+   }
+}
+
+/* Convert float data to 16-bit unsigned: each value is multiplied by 65535,
+   truncated to int and clamped to [0, 65535]. `dest` is (re)created as
+   M_USHORT with the dimensions of `src`. The type of `src` is not checked. */
+MIAPI void m_image_float_to_ushort(struct m_image *dest, const struct m_image *src)
+{
+   const float *in;
+   uint16_t *out;
+   int k;
+
+   m_image_create(dest, M_USHORT, src->width, src->height, src->comp);
+
+   in = (const float *)src->data;
+   out = (uint16_t *)dest->data;
+   for (k = 0; k < src->size; k++) {
+      int scaled = (int)(in[k] * 65535);
+      out[k] = (uint16_t)M_CLAMP(scaled, 0, 65535);
+   }
+}
+
+/* Convert float data to half-float, element by element, through
+   m_float2half(). `dest` is (re)created with the dimensions of `src` and
+   tagged M_HALF, since it holds half-float bit patterns rather than
+   unsigned integers (the element size is the same as M_USHORT).
+   The type of `src` is not checked; its data is read as float. */
+MIAPI void m_image_float_to_half(struct m_image *dest, const struct m_image *src)
+{
+   const float *in;
+   uint16_t *out;
+   int k;
+
+   m_image_create(dest, M_HALF, src->width, src->height, src->comp);
+
+   in = (const float *)src->data;
+   out = (uint16_t *)dest->data;
+   for (k = 0; k < src->size; k++)
+      out[k] = m_float2half(in[k]);
+}
+
+/* Extract component `c` of every pixel of `src` into the single-component
+   image `dest` (same type, width and height as `src`).
+   `c` must satisfy 0 <= c < src->comp; any other value asserts and leaves
+   `dest` untouched. An unknown source type asserts after `dest` has been
+   passed to m_image_create(). */
+MIAPI void m_image_extract_component(struct m_image *dest, const struct m_image *src, int c)
+{
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   const int size = src->size;
+   const unsigned char *in;
+   unsigned char *out;
+   size_t elem_size;
+   int i;
+
+   /* a negative index would read before the start of each pixel */
+   if (c < 0 || c >= comp) {
+      assert(0);
+      return;
+   }
+
+   m_image_create(dest, src->type, width, height, 1);
+
+   /* bytes per element */
+   switch (src->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      elem_size = sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      elem_size = sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      elem_size = sizeof(int);
+      break;
+   case M_FLOAT:
+      elem_size = sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   in = (const unsigned char *)src->data;
+   out = (unsigned char *)dest->data;
+   for (i = 0; i < size; i += comp) {
+      memcpy(out, in + (size_t)c * elem_size, elem_size);
+      out += elem_size;
+      in += (size_t)comp * elem_size;
+   }
+}
+
+/* Build `dest` from `src` with `left`, `top`, `right` and `bottom` columns /
+   rows added around it (negative values remove columns / rows). Every
+   destination pixel samples the source at coordinates clamped to the source
+   bounds, so added margins repeat the nearest edge pixel.
+   With all four margins zero this is a plain m_image_copy(). A resulting
+   width or height <= 0, or an unknown source type, asserts and leaves
+   `dest` untouched. */
+MIAPI void m_image_reframe(struct m_image *dest, const struct m_image *src, int left, int top, int right, int bottom)
+{
+   int comp, width, height, width2, height2, wm1, hm1;
+   int x, y;
+   size_t pixel_bytes;
+   const unsigned char *in;
+   unsigned char *out;
+
+   /* no margin change: plain copy */
+   if (left == 0 && top == 0 && right == 0 && bottom == 0) {
+      m_image_copy(dest, src);
+      return;
+   }
+
+   comp = src->comp;
+   width = src->width;
+   height = src->height;
+   width2 = width + left + right;
+   height2 = height + top + bottom;
+   wm1 = width - 1;
+   hm1 = height - 1;
+
+   if (width2 <= 0 || height2 <= 0) {
+      assert(0);
+      return;
+   }
+
+   /* bytes per pixel (all components) */
+   switch (src->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      pixel_bytes = comp * sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      pixel_bytes = comp * sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      pixel_bytes = comp * sizeof(int);
+      break;
+   case M_FLOAT:
+      pixel_bytes = comp * sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   m_image_create(dest, src->type, width2, height2, comp);
+   in = (const unsigned char *)src->data;
+   out = (unsigned char *)dest->data;
+
+   for (y = 0; y < height2; y++) {
+      int ys = y - top;
+      int sy = M_CLAMP(ys, 0, hm1);
+      const unsigned char *in_row = in + (size_t)(sy * width) * pixel_bytes;
+
+      for (x = 0; x < width2; x++) {
+         int xs = x - left;
+         int sx = M_CLAMP(xs, 0, wm1);
+         memcpy(out, in_row + (size_t)sx * pixel_bytes, pixel_bytes);
+         out += pixel_bytes;
+      }
+   }
+}
+
+/* Rotate `src` by 90 degrees counter-clockwise into `dest`: the right-most
+   source column, read top to bottom, becomes the first destination row.
+   `dest` is (re)created with width and height swapped. An unknown source
+   type asserts after `dest` has been created. */
+MIAPI void m_image_rotate_left(struct m_image *dest, const struct m_image *src)
+{
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   size_t pixel_bytes;
+   const unsigned char *in;
+   unsigned char *out;
+   int row, col;
+
+   m_image_create(dest, src->type, height, width, comp);
+
+   /* bytes per pixel (all components) */
+   switch (src->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      pixel_bytes = comp * sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      pixel_bytes = comp * sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      pixel_bytes = comp * sizeof(int);
+      break;
+   case M_FLOAT:
+      pixel_bytes = comp * sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   in = (const unsigned char *)src->data;
+   out = (unsigned char *)dest->data;
+
+   /* destination is written sequentially; `row`/`col` are destination coordinates */
+   for (row = 0; row < width; row++) {
+      for (col = 0; col < height; col++) {
+         size_t from = (size_t)(col * width + (width - 1 - row));
+         memcpy(out, in + from * pixel_bytes, pixel_bytes);
+         out += pixel_bytes;
+      }
+   }
+}
+
+/* Rotate `src` by 90 degrees clockwise into `dest`: the left-most source
+   column, read bottom to top, becomes the first destination row.
+   `dest` is (re)created with width and height swapped. An unknown source
+   type asserts after `dest` has been created. */
+MIAPI void m_image_rotate_right(struct m_image *dest, const struct m_image *src)
+{
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   size_t pixel_bytes;
+   const unsigned char *in;
+   unsigned char *out;
+   int row, col;
+
+   m_image_create(dest, src->type, height, width, comp);
+
+   /* bytes per pixel (all components) */
+   switch (src->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      pixel_bytes = comp * sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      pixel_bytes = comp * sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      pixel_bytes = comp * sizeof(int);
+      break;
+   case M_FLOAT:
+      pixel_bytes = comp * sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   in = (const unsigned char *)src->data;
+   out = (unsigned char *)dest->data;
+
+   /* destination is written sequentially; `row`/`col` are destination coordinates */
+   for (row = 0; row < width; row++) {
+      for (col = 0; col < height; col++) {
+         size_t from = (size_t)((height - 1 - col) * width + row);
+         memcpy(out, in + from * pixel_bytes, pixel_bytes);
+         out += pixel_bytes;
+      }
+   }
+}
+
+/* Rotate `src` by 180 degrees into `dest`: destination pixel (x, y) is
+   source pixel (width - 1 - x, height - 1 - y). `dest` is (re)created with
+   the layout of `src`. An unknown source type asserts after `dest` has been
+   created. */
+MIAPI void m_image_rotate_180(struct m_image *dest, const struct m_image *src)
+{
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   size_t pixel_bytes;
+   const unsigned char *in;
+   unsigned char *out;
+   int row, col;
+
+   m_image_create(dest, src->type, width, height, comp);
+
+   /* bytes per pixel (all components) */
+   switch (src->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      pixel_bytes = comp * sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      pixel_bytes = comp * sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      pixel_bytes = comp * sizeof(int);
+      break;
+   case M_FLOAT:
+      pixel_bytes = comp * sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   in = (const unsigned char *)src->data;
+   out = (unsigned char *)dest->data;
+
+   for (row = 0; row < height; row++) {
+      for (col = 0; col < width; col++) {
+         size_t from = (size_t)((height - 1 - row) * width + (width - 1 - col));
+         memcpy(out, in + from * pixel_bytes, pixel_bytes);
+         out += pixel_bytes;
+      }
+   }
+}
+
+/* Flip `src` horizontally into `dest`: the pixel order inside every row is
+   reversed, the row order is kept. `dest` is (re)created with the layout of
+   `src`. An unknown source type asserts after `dest` has been created. */
+MIAPI void m_image_mirror_x(struct m_image *dest, const struct m_image *src)
+{
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   size_t pixel_bytes;
+   const unsigned char *in;
+   unsigned char *out;
+   int row, col;
+
+   m_image_create(dest, src->type, width, height, comp);
+
+   /* bytes per pixel (all components) */
+   switch (src->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      pixel_bytes = comp * sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      pixel_bytes = comp * sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      pixel_bytes = comp * sizeof(int);
+      break;
+   case M_FLOAT:
+      pixel_bytes = comp * sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   in = (const unsigned char *)src->data;
+   out = (unsigned char *)dest->data;
+
+   for (row = 0; row < height; row++) {
+      for (col = 0; col < width; col++) {
+         size_t from = (size_t)(row * width + (width - 1 - col));
+         memcpy(out, in + from * pixel_bytes, pixel_bytes);
+         out += pixel_bytes;
+      }
+   }
+}
+
+/* Flip `src` vertically into `dest`: the row order is reversed, the pixel
+   order inside each row is kept. `dest` is (re)created with the layout of
+   `src`. An unknown source type asserts after `dest` has been created. */
+MIAPI void m_image_mirror_y(struct m_image *dest, const struct m_image *src)
+{
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   size_t row_bytes;
+   const unsigned char *in;
+   unsigned char *out;
+   int row;
+
+   m_image_create(dest, src->type, width, height, comp);
+
+   /* bytes per row (width pixels of comp elements) */
+   switch (src->type) {
+   case M_BYTE:
+   case M_UBYTE:
+      row_bytes = (size_t)(width * comp) * sizeof(char);
+      break;
+   case M_SHORT:
+   case M_USHORT:
+   case M_HALF:
+      row_bytes = (size_t)(width * comp) * sizeof(short);
+      break;
+   case M_INT:
+   case M_UINT:
+      row_bytes = (size_t)(width * comp) * sizeof(int);
+      break;
+   case M_FLOAT:
+      row_bytes = (size_t)(width * comp) * sizeof(float);
+      break;
+   default:
+      assert(0);
+      return;
+   }
+
+   in = (const unsigned char *)src->data;
+   out = (unsigned char *)dest->data;
+
+   /* destination row `row` is source row `height - 1 - row`, copied whole */
+   for (row = 0; row < height; row++) {
+      memcpy(out, in + (size_t)(height - 1 - row) * row_bytes, row_bytes);
+      out += row_bytes;
+   }
+}
+
+/* Multiply the first three components of every pixel by the fourth
+   (alpha); alpha itself is copied unchanged.
+   `src` must be a non-empty M_FLOAT image with 4 components (asserted);
+   `dest` is (re)created with the same layout. */
+MIAPI void m_image_premultiply(struct m_image *dest, const struct m_image *src)
+{
+   const float *in;
+   float *out;
+   int k;
+
+   assert(src->size > 0 && src->type == M_FLOAT && src->comp == 4);
+
+   m_image_create(dest, M_FLOAT, src->width, src->height, 4);
+   out = (float *)dest->data;
+   in = (const float *)src->data;
+
+   for (k = 0; k < src->size; k += 4) {
+      const float alpha = in[k + 3];
+      out[k]     = in[k]     * alpha;
+      out[k + 1] = in[k + 1] * alpha;
+      out[k + 2] = in[k + 2] * alpha;
+      out[k + 3] = alpha;
+   }
+}
+
+/* Divide the first three components of every pixel by the fourth (alpha);
+   alpha itself is copied unchanged.
+   Pixels whose alpha is exactly 0 are copied through unchanged instead of
+   being divided, which would produce inf / NaN.
+   `src` must be a non-empty M_FLOAT image with 4 components (asserted);
+   `dest` is (re)created with the same layout. */
+MIAPI void m_image_unpremultiply(struct m_image *dest, const struct m_image *src)
+{
+   const float *in;
+   float *out;
+   int k;
+
+   assert(src->size > 0 && src->type == M_FLOAT && src->comp == 4);
+
+   m_image_create(dest, M_FLOAT, src->width, src->height, 4);
+   out = (float *)dest->data;
+   in = (const float *)src->data;
+
+   for (k = 0; k < src->size; k += 4) {
+      const float alpha = in[k + 3];
+      /* keep the original double-precision reciprocal for non-zero alpha */
+      const float inv = (alpha != 0.0f) ? (float)(1.0 / alpha) : 1.0f;
+      out[k]     = in[k]     * inv;
+      out[k + 1] = in[k + 1] * inv;
+      out[k + 2] = in[k + 2] * inv;
+      out[k + 3] = alpha;
+   }
+}
+
+/* Compute a summed-area table of `src` into `dest`, per component: first a
+   running sum along every row, then each row gets the row above added.
+   `src` must be a non-empty M_FLOAT image (asserted). When `dest` and `src`
+   are the same image the table is computed in place, otherwise `src` is
+   copied into `dest` first. */
+MIAPI void m_image_summed_area(struct m_image *dest, const struct m_image *src)
+{
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   const int stride = width * comp;   /* floats per row */
+   float *sum;
+   int x, y, c, k;
+
+   assert(src->size > 0 && src->type == M_FLOAT);
+   if (dest != src)
+      m_image_copy(dest, src);
+
+   sum = (float *)dest->data;
+
+   /* horizontal pass: every pixel accumulates its left neighbour */
+   for (y = 0; y < height; y++) {
+      float *row = sum + y * stride;
+      for (x = 1; x < width; x++) {
+         for (c = 0; c < comp; c++)
+            row[x * comp + c] += row[(x - 1) * comp + c];
+      }
+   }
+
+   /* vertical pass: every element accumulates the one a row above */
+   for (k = stride; k < height * stride; k++)
+      sum[k] += sum[k - stride];
+}
+
+/* Convolve every row of `src` with the 1-D `kernel` of `size` taps and
+   store the result in `dest` (M_FLOAT, same layout as `src`).
+   The source is first reframed with (size - 1) / 2 clamped columns on each
+   side, so tap k of output pixel x reads padded pixel x + k.
+   `src` must be a non-empty M_FLOAT image (asserted). Rows are processed
+   under an OpenMP parallel-for when OpenMP is enabled. */
+MIAPI void m_image_convolution_h(struct m_image *dest, const struct m_image *src, float *kernel, int size)
+{
+   struct m_image padded = M_IMAGE_IDENTITY();
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   const int radius = (size - 1) / 2;
+   const float *padded_data;
+   float *out_data;
+   int out_stride, padded_stride;
+   int y;
+
+   assert(src->size > 0 && src->type == M_FLOAT);
+
+   /* padded source (clamped margin) and zeroed destination */
+   m_image_reframe(&padded, src, radius, 0, radius, 0);
+   m_image_create(dest, M_FLOAT, width, height, comp);
+   memset(dest->data, 0, dest->size * sizeof(float));
+
+   padded_data = (const float *)padded.data;
+   out_data = (float *)dest->data;
+   out_stride = width * comp;
+   padded_stride = padded.width * comp;
+
+   #pragma omp parallel for schedule(dynamic, 8)
+   for (y = 0; y < height; y++) {
+
+      float *out_row = out_data + y * out_stride;
+      const float *in_row = padded_data + y * padded_stride;
+      int x;
+
+      for (x = 0; x < width; x++) {
+
+         float *acc = out_row + x * comp;
+         const float *tap = in_row + x * comp;
+         int k, c;
+
+         /* accumulate the taps in kernel order, component by component */
+         for (k = 0; k < size; k++) {
+            const float weight = kernel[k];
+            for (c = 0; c < comp; c++)
+               acc[c] += tap[k * comp + c] * weight;
+         }
+      }
+   }
+
+   m_image_destroy(&padded);
+}
+
+/* Convolve every column of `src` with the 1-D `kernel` of `size` taps and
+   store the result in `dest` (M_FLOAT, same layout as `src`).
+   The source is first reframed with (size - 1) / 2 clamped rows above and
+   below, so tap k of output row y reads padded row y + k.
+   `src` must be a non-empty M_FLOAT image (asserted). Rows are processed
+   under an OpenMP parallel-for when OpenMP is enabled. */
+MIAPI void m_image_convolution_v(struct m_image *dest, const struct m_image *src, float *kernel, int size)
+{
+   struct m_image padded = M_IMAGE_IDENTITY();
+   const int width = src->width;
+   const int height = src->height;
+   const int comp = src->comp;
+   const int radius = (size - 1) / 2;
+   const float *padded_data;
+   float *out_data;
+   int stride;
+   int y;
+
+   assert(src->size > 0 && src->type == M_FLOAT);
+
+   /* padded source (clamped margin) and zeroed destination */
+   m_image_reframe(&padded, src, 0, radius, 0, radius);
+   m_image_create(dest, M_FLOAT, width, height, comp);
+   memset(dest->data, 0, dest->size * sizeof(float));
+
+   padded_data = (const float *)padded.data;
+   out_data = (float *)dest->data;
+   stride = width * comp;   /* floats per row; the padded image has the same width */
+
+   #pragma omp parallel for schedule(dynamic, 8)
+   for (y = 0; y < height; y++) {
+
+      float *out_row = out_data + y * stride;
+      int x;
+
+      for (x = 0; x < width; x++) {
+
+         float *acc = out_row + x * comp;
+         const float *tap = padded_data + (y * width + x) * comp;
+         int k, c;
+
+         /* accumulate the taps in kernel order, one padded row per tap */
+         for (k = 0; k < size; k++) {
+            const float weight = kernel[k];
+            for (c = 0; c < comp; c++)
+               acc[c] += tap[k * stride + c] * weight;
+         }
+      }
+   }
+
+   m_image_destroy(&padded);
+}
+
+/* Blur `src` into `dest` with separable kernels of 2 * dx + 1 horizontal
+   and 2 * dy + 1 vertical taps, filled in by m_gaussian_kernel().
+   - dx == 0 and dy == 0: plain copy.
+   - only one of dx / dy positive: a single pass straight into `dest`.
+   - both positive: horizontal pass into a temporary, vertical pass into
+     `dest`.
+   `src` must be a non-empty M_FLOAT image (asserted). The kernel
+   allocations are not checked. */
+MIAPI void m_image_gaussian_blur(struct m_image *dest, const struct m_image *src, int dx, int dy)
+{
+   struct m_image tmp = M_IMAGE_IDENTITY();
+   float *kernelx = NULL, *kernely = NULL;
+   const int kx_size = dx * 2 + 1;
+   const int ky_size = dy * 2 + 1;
+   const int blur_x = dx > 0;
+   const int blur_y = dy > 0;
+
+   assert(src->size > 0 && src->type == M_FLOAT);
+
+   /* nothing to blur */
+   if (dx == 0 && dy == 0) {
+      m_image_copy(dest, src);
+      return;
+   }
+
+   if (blur_x) {
+      /* when a vertical pass follows, the horizontal result goes to tmp */
+      struct m_image *h_out = blur_y ? &tmp : dest;
+
+      kernelx = (float *)malloc(kx_size * sizeof(float));
+      m_gaussian_kernel(kernelx, kx_size);
+      m_image_convolution_h(h_out, src, kernelx, kx_size);
+   }
+
+   if (blur_y) {
+      /* read the horizontal result if there was one, else the source */
+      const struct m_image *v_in = blur_x ? &tmp : src;
+
+      kernely = (float *)malloc(ky_size * sizeof(float));
+      m_gaussian_kernel(kernely, ky_size);
+      m_image_convolution_v(dest, v_in, kernely, ky_size);
+   }
+
+   m_image_destroy(&tmp);
+   free(kernely);
+   free(kernelx);
+}
+
+/* Reduce `src` to a single-component image: every output value is
+   0.3 * c0 + 0.5 * c1 + 0.2 * c2 of the corresponding source pixel; any
+   further components are ignored.
+   `src` must be a non-empty M_FLOAT image with more than 2 components
+   (asserted). `dest` is (re)created as M_FLOAT with one component. */
+MIAPI void m_image_grey(struct m_image *dest, const struct m_image *src)
+{
+   const float *in;
+   float *out;
+   const int total = src->size;
+   const int comp = src->comp;
+   int k, n;
+
+   assert(src->size > 0 && src->type == M_FLOAT && src->comp > 2);
+
+   m_image_create(dest, M_FLOAT, src->width, src->height, 1);
+
+   in = (const float *)src->data;
+   out = (float *)dest->data;
+
+   /* k walks the source elements pixel by pixel, n the output values */
+   for (k = 0, n = 0; k < total; k += comp, n++) {
+      float v = in[k] * 0.3f + in[k + 1] * 0.5f + in[k + 2] * 0.2f;
+      out[n] = v;
+   }
+}
+
+/* Reduce `src` to a single-component image holding, for every pixel, the
+   largest of its component values (via M_MAX).
+   `src` must be a non-empty M_FLOAT image (asserted). `dest` is
+   (re)created as M_FLOAT with one component. */
+MIAPI void m_image_max(struct m_image *dest, const struct m_image *src)
+{
+   const float *in;
+   float *out;
+   const int total = src->size;
+   const int comp = src->comp;
+   int k, n, j;
+
+   assert(src->size > 0 && src->type == M_FLOAT);
+
+   m_image_create(dest, M_FLOAT, src->width, src->height, 1);
+
+   in = (const float *)src->data;
+   out = (float *)dest->data;
+
+   /* k walks the source elements pixel by pixel, n the output values */
+   for (k = 0, n = 0; k < total; k += comp, n++) {
+      float v = in[k];
+      for (j = 1; j < comp; j++)
+         v = M_MAX(v, in[k + j]);
+      out[n] = v;
+   }
+}
+
+/* Reduce `src` to a single-component image holding, for every pixel, the
+   largest absolute value among its components (fabsf + M_MAX).
+   `src` must be a non-empty M_FLOAT image (asserted). `dest` is
+   (re)created as M_FLOAT with one component. */
+MIAPI void m_image_max_abs(struct m_image *dest, const struct m_image *src)
+{
+   const float *in;
+   float *out;
+   const int total = src->size;
+   const int comp = src->comp;
+   int k, n, j;
+
+   assert(src->size > 0 && src->type == M_FLOAT);
+
+   m_image_create(dest, M_FLOAT, src->width, src->height, 1);
+
+   in = (const float *)src->data;
+   out = (float *)dest->data;
+
+   /* k walks the source elements pixel by pixel, n the output values */
+   for (k = 0, n = 0; k < total; k += comp, n++) {
+      float v = fabsf(in[k]);
+      for (j = 1; j < comp; j++) {
+         const float magnitude = fabsf(in[k + j]);
+         v = M_MAX(v, magnitude);
+      }
+      out[n] = v;
+   }
+}
+
+/* Return the sum of products of a 3x3 window and a 3x3 kernel.
+   `data` points at the window's top-left value, `width` is the distance
+   (in floats) between consecutive rows of the window, and `kernel` holds
+   9 weights in row-major order. */
+static float m__convolve_pixel(float *data, int width, float *kernel)
+{
+   float sum = 0;
+   int row, col;
+
+   for (row = 0; row < 3; row++) {
+      const float *line = data + width * row;
+      const float *weights = kernel + 3 * row;
+      for (col = 0; col < 3; col++)
+         sum += line[col] * weights[col];
+   }
+   return sum;
+}
+
+/* Apply the 3x3 Sobel kernels to the single-component float image `src`.
+   `dest` is (re)created as M_FLOAT with 2 components per pixel: component 0
+   is the response to the x kernel, component 1 the response to the y
+   kernel. The source is reframed with a one-pixel clamped margin so every
+   pixel has a full 3x3 window.
+   `src` must be a non-empty M_FLOAT image with 1 component (asserted). */
+MIAPI void m_image_sobel(struct m_image *dest, const struct m_image *src)
+{
+   struct m_image padded = M_IMAGE_IDENTITY();
+   float ky[9] = {-1, -2, -1, 0, 0, 0, 1, 2, 1};
+   float kx[9] = {-1, 0, 1, -2, 0, 2, -1, 0, 1};
+   const int width = src->width;
+   const int height = src->height;
+   const int padded_width = width + 2;
+   float *padded_data;
+   float *out_data;
+   int x, y;
+
+   assert(src->size > 0 && src->type == M_FLOAT && src->comp == 1);
+
+   /* padded source (clamped margin) and 2-component destination */
+   m_image_reframe(&padded, src, 1, 1, 1, 1);
+   m_image_create(dest, M_FLOAT, width, height, 2);
+
+   padded_data = (float *)padded.data;
+   out_data = (float *)dest->data;
+
+   for (y = 0; y < height; y++) {
+      /* top-left corner of the window for pixel (0, y) */
+      float *window_row = padded_data + y * padded_width;
+      float *out_row = out_data + y * width * 2;
+
+      for (x = 0; x < width; x++) {
+         out_row[2 * x]     = m__convolve_pixel(window_row + x, padded_width, kx);
+         out_row[2 * x + 1] = m__convolve_pixel(window_row + x, padded_width, ky);
+      }
+   }
+
+   m_image_destroy(&padded);
+}
+
+/* Compute a per-pixel Harris response of `src` into `dest` (M_FLOAT, one
+   component) by chaining: m_image_sobel() -> m_sst() into a 3-component
+   image -> m_image_gaussian_blur() with `radius` in both directions ->
+   m_harris_response(). The requirements on `src` are those asserted by
+   m_image_sobel(). */
+MIAPI void m_image_harris(struct m_image *dest, const struct m_image *src, int radius)
+{
+   struct m_image gradient = M_IMAGE_IDENTITY(); /* sobel output, later the unblurred sst copy */
+   struct m_image tensor = M_IMAGE_IDENTITY();   /* sst output, later its blurred version */
+   const int width = src->width;
+   const int height = src->height;
+   const int pixel_count = width * height;
+
+   /* sobel */
+   m_image_sobel(&gradient, src);
+
+   /* sst */
+   m_image_create(&tensor, M_FLOAT, width, height, 3);
+   m_sst((float *)tensor.data, (float *)gradient.data, pixel_count);
+
+   /* blur: copy the sst aside, then blur it back into `tensor` */
+   m_image_copy(&gradient, &tensor);
+   m_image_gaussian_blur(&tensor, &gradient, radius, radius);
+
+   /* harris response */
+   m_image_create(dest, M_FLOAT, width, height, 1);
+   m_harris_response((float *)dest->data, (float *)tensor.data, pixel_count);
+
+   m_image_destroy(&gradient);
+   m_image_destroy(&tensor);
+}
+
+/* Helpers for the flood-fill functions below; both expand in the caller's
+   scope and rely on its locals.
+   M_WRITE_PIXEL: store `v` at (x0, y0) of the single-channel buffer `dest`,
+   using the caller's `w` as row width.
+   M_PUSH_PIXEL: if the stack has room for one more (x, y) pair and pixel
+   (x2, y2) is inside the image and equals `ref`, push the pair and
+   overwrite the pixel with `value`. Uses the caller's stack, stack_i,
+   stack_size, data, w, h, ref and value. Coordinates are stored as
+   unsigned short. */
+#define M_WRITE_PIXEL(dest, x0, y0, v) {*(dest + w * (y0) + (x0)) = v;}
+#define M_PUSH_PIXEL(x2, y2) if((stack_i+3) < stack_size && m__test_pixel(data, w, h, x2, y2, ref)) {\
+   stack_i+=2;\
+   stack[stack_i] = (unsigned short)(x2);\
+   stack[stack_i+1] = (unsigned short)(y2);\
+   M_WRITE_PIXEL(data, x2, y2, value);\
+}
+
+/* Return 1 when (x, y) lies inside the w x h single-channel buffer `src`
+   and the pixel there equals `ref`; return 0 otherwise (including for
+   out-of-bounds coordinates). */
+static int m__test_pixel(unsigned char *src, int w, int h, int x, int y, unsigned char ref)
+{
+   if (x < 0 || x >= w || y < 0 || y >= h)
+      return 0;
+   return (src[w * y + x] == ref);
+}
+
+/* Flood-fill the 4-connected region of pixels equal to `ref` that contains
+   (x, y), overwriting them with `value`.
+   `dest` must be a non-empty M_UBYTE image (asserted); it is addressed as
+   one byte per pixel. `stack` is caller-provided scratch space of
+   `stack_size` unsigned shorts holding pending (x, y) pairs; when it is
+   full, further neighbours are silently not visited, so the fill may be
+   incomplete. Coordinates are stored as unsigned short.
+   Returns 1 when the seed pixel matched `ref` and filling ran, 0 when the
+   seed is outside the image or does not equal `ref`, or when `stack`
+   cannot hold even the seed (NULL or fewer than 2 entries). */
+MIAPI int m_image_floodfill_4x(struct m_image *dest, int x, int y, unsigned char ref, unsigned char value, unsigned short *stack, int stack_size)
+{
+   /* neighbour offsets, in the order they are tested: E, W, S, N */
+   static const int step_x[4] = {1, -1, 0, 0};
+   static const int step_y[4] = {0, 0, 1, -1};
+   unsigned char *data = (unsigned char *)dest->data;
+   int w = dest->width;
+   int h = dest->height;
+   int stack_i = 0;
+   int k;
+
+   assert(dest->size > 0 && dest->type == M_UBYTE);
+
+   /* the seed is written to stack[0] and stack[1] unconditionally below */
+   if (stack == NULL || stack_size < 2)
+      return 0;
+
+   if(! m__test_pixel(data, w, h, x, y, ref))
+      return 0;
+
+   stack[0] = (unsigned short)x;
+   stack[1] = (unsigned short)y;
+   M_WRITE_PIXEL(data, x, y, value);
+
+   while (stack_i >= 0) {
+
+      /* pop one pixel, then push its matching neighbours */
+      x = stack[stack_i];
+      y = stack[stack_i+1];
+      stack_i-=2;
+
+      for (k = 0; k < 4; k++) {
+         M_PUSH_PIXEL(x + step_x[k], y + step_y[k])
+      }
+   }
+
+   return 1;
+}
+
+/* Flood-fill the 8-connected region of pixels equal to `ref` that contains
+   (x, y), overwriting them with `value`.
+   `dest` must be a non-empty M_UBYTE image (asserted); it is addressed as
+   one byte per pixel. `stack` is caller-provided scratch space of
+   `stack_size` unsigned shorts holding pending (x, y) pairs; when it is
+   full, further neighbours are silently not visited, so the fill may be
+   incomplete. Coordinates are stored as unsigned short.
+   Returns 1 when the seed pixel matched `ref` and filling ran, 0 when the
+   seed is outside the image or does not equal `ref`, or when `stack`
+   cannot hold even the seed (NULL or fewer than 2 entries). */
+MIAPI int m_image_floodfill_8x(struct m_image *dest, int x, int y, unsigned char ref, unsigned char value, unsigned short *stack, int stack_size)
+{
+   /* neighbour offsets, in the order they are tested:
+      E, W, S, N, then the four diagonals */
+   static const int step_x[8] = {1, -1, 0, 0, 1, 1, -1, -1};
+   static const int step_y[8] = {0, 0, 1, -1, 1, -1, 1, -1};
+   unsigned char *data = (unsigned char *)dest->data;
+   int w = dest->width;
+   int h = dest->height;
+   int stack_i = 0;
+   int k;
+
+   assert(dest->size > 0 && dest->type == M_UBYTE);
+
+   /* the seed is written to stack[0] and stack[1] unconditionally below */
+   if (stack == NULL || stack_size < 2)
+      return 0;
+
+   if(! m__test_pixel(data, w, h, x, y, ref))
+      return 0;
+
+   stack[0] = (unsigned short)x;
+   stack[1] = (unsigned short)y;
+   M_WRITE_PIXEL(data, x, y, value);
+
+   while (stack_i >= 0) {
+
+      /* pop one pixel, then push its matching neighbours */
+      x = stack[stack_i];
+      y = stack[stack_i+1];
+      stack_i-=2;
+
+      for (k = 0; k < 8; k++) {
+         M_PUSH_PIXEL(x + step_x[k], y + step_y[k])
+      }
+   }
+
+   return 1;
+}
+
+#undef M_WRITE_PIXEL
+#undef M_PUSH_PIXEL
+
+/* Shared worker for dilate / erode / edge detection on one-byte-per-pixel
+   images. `dest` is (re)created as a single-component M_UBYTE image of the
+   same size and initialised either with a copy of the source bytes
+   (`copy` != 0) or with zeros. Then every source pixel equal to `ref` that
+   has at least one in-bounds 4-neighbour different from `ref` is written
+   as `value` in `dest`. Neighbours outside the image are treated as equal
+   to the pixel itself.
+   `src` must be a non-empty M_UBYTE image (asserted). */
+static void m__dilate_erode(struct m_image *dest, const struct m_image *src, unsigned char ref, unsigned char value, int copy)
+{
+   const unsigned char *in = (const unsigned char *)src->data;
+   unsigned char *out;
+   const int w = src->width;
+   const int h = src->height;
+   int x, y;
+
+   assert(src->size > 0 && src->type == M_UBYTE);
+
+   m_image_create(dest, M_UBYTE, w, h, 1);
+   out = (unsigned char *)dest->data;
+   if (copy)
+      memcpy(out, in, dest->size * sizeof(char));
+   else
+      memset(out, 0, dest->size * sizeof(char));
+
+   for (y = 0; y < h; y++) {
+      for (x = 0; x < w; x++) {
+
+         const int at = y * w + x;
+
+         if (in[at] != ref)
+            continue;
+
+         /* left, up, right, down - only neighbours inside the image count */
+         if ((x > 0 && in[at - 1] != ref) ||
+             (y > 0 && in[at - w] != ref) ||
+             ((x + 1) < w && in[at + 1] != ref) ||
+             ((y + 1) < h && in[at + w] != ref))
+            out[at] = value;
+      }
+   }
+}
+
+/* Copy `src` into `dest`, then set to 255 every pixel that is 0 in `src`
+   and has at least one non-zero 4-neighbour (see m__dilate_erode). */
+MIAPI void m_image_dilate(struct m_image *dest, const struct m_image *src)
+{
+   m__dilate_erode(dest, src, 0, 255, 1);
+}
+
+/* Copy `src` into `dest`, then set to 0 every pixel that is 255 in `src`
+   and has at least one 4-neighbour different from 255 (see m__dilate_erode). */
+MIAPI void m_image_erode(struct m_image *dest, const struct m_image *src)
+{
+   m__dilate_erode(dest, src, 255, 0, 1);
+}
+
+/* Zero `dest`, then set to 255 every pixel that equals `ref` in `src` and
+   has at least one 4-neighbour different from `ref` (see m__dilate_erode). */
+MIAPI void m_image_edge_4x(struct m_image *dest, const struct m_image *src, unsigned char ref)
+{
+   m__dilate_erode(dest, src, ref, 255, 0);
+}
+
+/* Following C code from the article
+   "Efficient Binary Image Thinning using Neighborhood Maps"
+   by Joseph M. Cychosz, in "Graphics Gems IV", Academic Press, 1994
+   Thins the image using Rosenfeld's parallel thinning algorithm.
+*/
+
+/* Direction masks (octal bits of the neighborhood map), in the order:
+   N    S    W    E
+*/
+static int m__masks[] = {0200, 0002, 0040, 0010}; /* octal; with the 9-bit map read as abcdefghi (a = 0400 ... i = 0001) these select b, h, d, f */
+
+/* True if pixel neighbor map indicates the pixel is 8-simple and
+   not an end point and thus can be deleted.  The neighborhood
+   map is defined as an integer of bits abcdefghi with a non-zero
+   bit representing a non-zero pixel.  The bit assignment for the
+   neighborhood is:
+
+            a b c
+            d e f
+            g h i
+*/
+static unsigned char m__delete_map[512] = { /* indexed by the 9-bit neighborhood map; non-zero = pixel may be deleted */
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+
+MIAPI void m_image_thin(struct m_image *dest) /* thins the non-zero pixels of dest in place, repeating passes until a pass deletes nothing */
+{
+   unsigned char *data; /* image data */
+   unsigned char ** ip; /* scanline pointers, ip[y][x] */
+   unsigned char * qb;  /* Neighborhood maps of previous scanline */
+   int xsize, ysize;    /* Image resolution */
+   int x, y;            /* Pixel location */
+   int i;               /* Pass index */
+   int pc = 0;          /* Pass count (incremented only, never read) */
+   int count = 1;       /* Deleted pixel count */
+   int p, q;            /* Neighborhood maps of adjacent cells */
+   int m;               /* Deletion direction mask */
+
+   assert(dest->size > 0 && dest->type == M_UBYTE);
+   /* NOTE(review): rows are addressed as data + y*width, i.e. one byte per pixel is assumed; comp is not asserted */
+   data = (unsigned char *)dest->data;
+   xsize = dest->width;
+   ysize = dest->height;
+
+   qb = (unsigned char *)malloc(xsize * sizeof(char));
+   qb[xsize-1] = 0; /* Used for lower-right pixel */
+   /* NOTE(review): neither malloc result in this function is checked before use */
+   /* alloc scanline pointers */
+   ip = (unsigned char **)malloc(sizeof(void *) * ysize);
+   
+   /* set scanline pointers */
+   for (y=0; y<ysize; y++) {
+      ip[y] = data + y*xsize;
+   }
+
+   while (count) { /* Scan image while deletions */
+   
+      pc++;
+      count = 0;
+
+      for (i=0; i<4; i++) {
+         /* one sub-pass per direction in m__masks (N, S, W, E) */
+         m = m__masks[i];
+         
+         /* Build initial previous scan buffer */
+         p = ip[0][0] != 0;
+         for (x=0; x<xsize-1; x++) {
+            p = ((p<<1)&0006) | (ip[0][x+1] != 0);
+            qb[x] = (unsigned char)p;
+         }
+         
+         /* Scan image for pixel deletion candidates */
+         for (y=0; y<ysize-1; y++) {
+            q = qb[0];
+            p = ((q<<3)&0110) | (ip[y+1][0] != 0);
+            
+            for (x=0; x<xsize-1; x++) {
+               q = qb[x];
+               p = ((p<<1)&0666) | ((q<<3)&0110) | (ip[y+1][x+1] != 0);
+               qb[x] = (unsigned char)p;
+               /* delete when the neighbor selected by m is empty and the map marks the pixel deletable */
+               if (((p&m) == 0) && m__delete_map[p]) {
+                  if (ip[y][x] != 0) {
+                     count++;
+                     ip[y][x] = 0;
+                  }
+               }
+            }
+            
+            /* Process right edge pixel */
+            p = (p<<1)&0666;
+            if ((p&m) == 0 && m__delete_map[p]) {
+               if (ip[y][xsize-1] != 0) {
+                  count++;
+                  ip[y][xsize-1] = 0;
+               }
+            }
+         }
+         
+         /* Process bottom scan line */
+         for (x=0; x<xsize; x++) {
+            q = qb[x];
+            p = ((p<<1)&0666) | ((q<<3)&0110);
+            if ((p&m) == 0 && m__delete_map[p]) {
+               if (ip[ysize-1][x] != 0) {
+                  count++;
+                  ip[ysize-1][x] = 0;
+               }
+            }
+         }
+      }
+   }
+
+   free(qb);
+   free(ip);
+}
+
+/* Non-maximum suppression on a 1-component float image.
+   dest is first filled via m_image_copy(dest, src); then a pixel is set to
+   0 in dest when its src value is below threshold, or when any src pixel in
+   the window [x-radius, x+radius] x [y-radius, y+radius] (clipped to the
+   image) is strictly greater than it.
+   radius:    half-size of the comparison window, in pixels
+   threshold: minimum src value a pixel needs in order to be kept */
+MIAPI void m_image_non_max_supp(struct m_image *dest, const struct m_image *src, int radius, float threshold)
+{
+   float *in, *out;
+   int w = src->width;
+   int h = src->height;
+   int x, y, i = 0; /* i is the linear index of (x, y) */
+
+   assert(src->size > 0 && src->type == M_FLOAT && src->comp == 1);
+
+   m_image_copy(dest, src);
+
+   in = (float *)src->data;
+   out = (float *)dest->data;
+
+   for (y = 0; y < h; y++) {
+
+      /* vertical extent of the window is the same for the whole row */
+      int y0 = M_MAX(0, y - radius);
+      int y1 = M_MIN(h - 1, y + radius);
+
+      for (x = 0; x < w; x++, i++) {
+
+         int suppress = in[i] < threshold;
+
+         if (!suppress) {
+            int x0 = M_MAX(0, x - radius);
+            int x1 = M_MIN(w - 1, x + radius);
+            int wx, wy;
+
+            /* stop scanning at the first window pixel that beats the center */
+            for (wy = y0; wy <= y1 && !suppress; wy++)
+               for (wx = x0; wx <= x1 && !suppress; wx++)
+                  suppress = in[wy * w + wx] > in[i];
+         }
+
+         if (suppress)
+            out[i] = 0;
+      }
+   }
+}
+
+/* Harris corner detection: runs m_image_harris() followed by non-maximum
+   suppression on src and collects, in row-major order, the positions whose
+   suppressed response is > 0 and that lie at least margin pixels away from
+   every image border. corners receives (x, y) int pairs; scanning stops once
+   max_count pairs have been written. Returns the number of pairs written
+   (0 when the image is not larger than 2 * margin in both dimensions). */
+MIAPI int m_image_corner_harris(const struct m_image *src, int margin, int radius, float threshold, int *corners, int max_count)
+{
+   struct m_image harris = M_IMAGE_IDENTITY();
+   struct m_image nms = M_IMAGE_IDENTITY();
+   float *response;
+   int w = src->width;
+   int h = src->height;
+   int x, y;
+   int found = 0;
+
+   if (w <= (margin * 2) || h <= (margin * 2))
+      return 0;
+
+   m_image_harris(&harris, src, radius);
+   m_image_non_max_supp(&nms, &harris, radius, threshold);
+   response = (float *)nms.data;
+
+   for (y = 0; y < h && found != max_count; y++) {
+      int y_ok = (y >= margin) && (y < h - margin);
+
+      for (x = 0; x < w && found != max_count; x++, response++) {
+         if (*response > 0 && y_ok && x >= margin && x < w - margin) {
+            corners[found * 2]     = x;
+            corners[found * 2 + 1] = y;
+            found++;
+         }
+      }
+   }
+
+   m_image_destroy(&nms);
+   m_image_destroy(&harris);
+   return found;
+}
+
+/* Bilinear sample of a float image at the real-valued position (x, y).
+   Writes src->comp floats to result. Fractions are measured against the
+   unclamped truncation of x and y, and negative fractions become 0; the
+   integer cell is then clamped to the image and its +1 neighbors are clamped
+   to the last column / row. */
+MIAPI void m_image_sub_pixel(const struct m_image *src, float x, float y, float *result)
+{
+   float *base = (float *)src->data;
+   float *top_l, *top_r, *bot_l, *bot_r;
+   float tx, ty;
+   int w = src->width;
+   int comp = src->comp;
+   int xmax = w - 1;
+   int ymax = src->height - 1;
+   int x0, y0, x1, y1, c;
+
+   x0 = (int)x;
+   y0 = (int)y;
+   tx = x - (float)x0;
+   ty = y - (float)y0;
+   tx = M_MAX(tx, 0);
+   ty = M_MAX(ty, 0);
+
+   x0 = M_CLAMP(x0, 0, xmax);
+   y0 = M_CLAMP(y0, 0, ymax);
+   x1 = M_MIN(x0 + 1, xmax);
+   y1 = M_MIN(y0 + 1, ymax);
+
+   top_l = base + (w * y0 + x0) * comp;
+   top_r = base + (w * y0 + x1) * comp;
+   bot_l = base + (w * y1 + x0) * comp;
+   bot_r = base + (w * y1 + x1) * comp;
+
+   for (c = 0; c < comp; c++) {
+      float left = top_l[c] + (bot_l[c] - top_l[c]) * ty;
+      float right = top_r[c] + (bot_r[c] - top_r[c]) * ty;
+      result[c] = left + (right - left) * tx;
+   }
+}
+
+/* slow TODO better -- fills dest by sampling src with m_image_sub_pixel() at ((i + 0.5) * d + offset) on each axis */
+static void m__bilinear(struct m_image *dest, const struct m_image *src, float dx, float dy, float offset)
+{
+   float *out = (float *)dest->data;
+   int w = dest->width, h = dest->height;
+   int comp = src->comp;
+   int row;
+
+   #pragma omp parallel for schedule(dynamic, 8)
+   for (row = 0; row < h; row++) {
+      float sy = ((float)row + 0.5f) * dy + offset;
+      int col;
+      for (col = 0; col < w; col++) {
+         float sx = ((float)col + 0.5f) * dx + offset;
+         m_image_sub_pixel(src, sx, sy, out + (row * w + col) * comp);
+      }
+   }
+}
+
+/* Halves an image: src is passed through m_image_gaussian_blur(..., 1, 1)
+   into a temporary, then every second pixel of every second row is copied
+   into dest, which is created as (width / 2) x (height / 2) M_FLOAT. */
+MIAPI void m_image_pyrdown(struct m_image *dest, const struct m_image *src)
+{
+   struct m_image blurred = M_IMAGE_IDENTITY();
+   float *in, *out;
+   int w = src->width;
+   int comp = src->comp;
+   int half_w = w / 2;
+   int half_h = src->height / 2;
+   int x, y, c;
+
+   m_image_gaussian_blur(&blurred, src, 1, 1);
+   m_image_create(dest, M_FLOAT, half_w, half_h, comp);
+
+   in = (float *)blurred.data;
+   out = (float *)dest->data;
+
+   for (y = 0; y < half_h; y++) {
+
+      for (x = 0; x < half_w; x++) {
+
+         /* source pixel (2x, 2y) of the blurred image */
+         float *from = in + ((2 * y) * w + 2 * x) * comp;
+         for (c = 0; c < comp; c++)
+            *out++ = from[c];
+      }
+   }
+
+   m_image_destroy(&blurred);
+}
+
+/* Resizes a float image to new_width x new_height by bilinear sampling.
+   When at least one dimension shrinks and the larger of the two axis ratios
+   truncates to 2 or more, src is first blurred with a gaussian of size
+   (int)ratio - 1 and the blurred copy is sampled instead of src. */
+MIAPI void m_image_resize(struct m_image *dest, const struct m_image *src, int new_width, int new_height)
+{
+   struct m_image blurred = M_IMAGE_IDENTITY();
+   const struct m_image *from = src;
+   int w = src->width;
+   int h = src->height;
+   int comp = src->comp;
+   float rx = (float)w / (float)new_width;
+   float ry = (float)h / (float)new_height;
+
+   assert(src->size > 0 && src->type == M_FLOAT);
+   m_image_create(dest, M_FLOAT, new_width, new_height, comp);
+
+   if (new_width < w || new_height < h) {
+      float ratio = M_MAX(rx, ry);
+      int blur = (int)ratio - 1;
+      if (blur > 0) {
+         m_image_gaussian_blur(&blurred, src, blur, blur);
+         from = &blurred;
+      }
+   }
+
+   m__bilinear(dest, from, rx, ry, -0.5f);
+   m_image_destroy(&blurred);
+}
+
+#endif /* M_IMAGE_IMPLEMENTATION */
diff --git a/3rdparty/bimg/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt b/3rdparty/bimg/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt
new file mode 100644
index 0000000..c422f71
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt
@@ -0,0 +1,24 @@
+NVIDIA Texture Tools 2.0 is licensed under the MIT license.
+
+Copyright (c) 2007 NVIDIA Corporation
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/bits.h b/3rdparty/bimg/3rdparty/nvtt/bc6h/bits.h
new file mode 100644
index 0000000..c47a7c6
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/bits.h
@@ -0,0 +1,75 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#ifndef _ZOH_BITS_H
+#define _ZOH_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/debug.h"
+
+namespace ZOH {
+
+class Bits	// bit-granular cursor over the buffer passed to the constructor; bit n lives in byte n>>3 at position n&7
+{
+public:
+	// first ctor: empty writable stream with room for maxdatabits; second ctor: read-only stream holding availdatabits bits
+	Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;}
+	Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;}
+
+	void write(int value, int nbits) {	// stores the low nbits of value at the cursor, least-significant bit first
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	int read(int nbits) { 	// reads nbits at the cursor; the first bit read becomes bit 0 of the result
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	int getptr() { return bptr; }
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	int getsize() { return bend; }
+
+private:
+	int	bptr;		// next bit to read or write
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream
+	const char *cbits;	// ptr to const user bit stream
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	int readone() {	// returns 0 without advancing once bptr has reached bend
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	void writeone(int bit) {	// only bit 0 of the argument is stored; the write is dropped once bptr has reached maxbits
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		if (bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/shapes_two.h b/3rdparty/bimg/3rdparty/nvtt/bc6h/shapes_two.h
new file mode 100644
index 0000000..2fc5559
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/shapes_two.h
@@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_SHAPES_TWO_H
+#define _ZOH_SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static const int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]	// entry for pixel (x,y) of shape si: 16 ints per table row, 4 shapes side by side, 64 ints per group of 4 shapes
+
+static const int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]	// the table stores two consecutive entries per shape, one per region
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/tile.h b/3rdparty/bimg/3rdparty/nvtt/bc6h/tile.h
new file mode 100644
index 0000000..6e642a2
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/tile.h
@@ -0,0 +1,82 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#ifndef _ZOH_TILE_H
+#define _ZOH_TILE_H
+
+#include "zoh_utils.h"
+#include "nvmath/vector.h"
+#include <math.h>
+
+namespace ZOH {
+
+//#define	USE_IMPORTANCE_MAP	1		// define this if you want to increase importance of some pixels in tile
+class Tile	// a tile of up to TILE_W x TILE_H pixels (size_x x size_y in use) with a per-pixel importance weight
+{
+public:
+	// NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value
+	static float half2float(uint16 h)
+	{
+		return (float) Utils::ushort_to_format(h);
+	}
+	// NOTE: this is the inverse of the above operation
+	static uint16 float2half(float f)
+	{
+		return Utils::format_to_ushort((int)f);
+	}
+
+	// look for adjacent pixels that are identical. if there are enough of them, increase their importance
+	void generate_importance_map()
+	{
+		// initialize
+		for (int y=0; y<size_y; ++y)
+		for (int x=0; x<size_x; ++x)
+		{
+			// my importance is increased if I am identical to any of my 4-neighbors
+			importance_map[y][x] = match_4_neighbor(x,y) ? 5.0f : 1.0f;
+		}
+	}
+	bool is_equal(int x, int y, int xn, int yn)	// exact x/y/z match between (x,y) and (xn,yn); a neighbor outside the tile never matches
+	{
+		if (xn < 0 || xn >= size_x || yn < 0 || yn >= size_y)
+			return false;
+		return( (data[y][x].x == data[yn][xn].x) &&
+				(data[y][x].y == data[yn][xn].y) &&
+				(data[y][x].z == data[yn][xn].z) );
+	}
+
+#ifdef USE_IMPORTANCE_MAP
+	bool match_4_neighbor(int x, int y)
+	{
+		return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1);
+	}
+#else
+	bool match_4_neighbor(int, int)	// importance map disabled: every weight written by generate_importance_map() is 1.0f
+	{
+		return false;
+	}
+#endif
+	// A default-constructed tile is 0 x 0, so generate_importance_map() touches nothing until a size is set.
+	Tile() : size_x(0), size_y(0) {}
+	~Tile(){};
+	Tile(int xs, int ys) {size_x = xs; size_y = ys;}	// NOTE(review): xs/ys are not checked against TILE_W/TILE_H
+
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+    nv::Vector3 data[TILE_H][TILE_W];
+	float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+};
+
+}
+
+#endif // _ZOH_TILE_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.cpp b/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.cpp
new file mode 100644
index 0000000..3053ea1
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.cpp
@@ -0,0 +1,197 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the zoh compressor and decompressor
+
+#include "tile.h"
+#include "zoh.h"
+
+#include <string.h> // memcpy
+
+using namespace ZOH;
+
+
+// True when the low five bits of block[0] hold one of the codes 0x03, 0x07, 0x0b, 0x0f.
+bool ZOH::isone(const char *block)
+{
+	// those four codes are exactly the 5-bit values with bits 0-1 set and bit 4 clear
+	return (block[0] & 0x13) == 0x03;
+}
+
+// Encodes the tile with both compressone() and compresstwo() and copies to block
+// the candidate whose returned value is lower (ties go to the one-region block).
+void ZOH::compress(const Tile &t, char *block)
+{
+	char candOne[ZOH::BLOCKSIZE];
+	char candTwo[ZOH::BLOCKSIZE];
+
+	const float errOne = ZOH::compressone(t, candOne);
+	const float errTwo = ZOH::compresstwo(t, candTwo);
+
+	memcpy(block, (errOne <= errTwo) ? candOne : candTwo, ZOH::BLOCKSIZE);
+}
+
+// Picks the decoder from the block's mode bits (see isone) and runs it on the tile.
+void ZOH::decompress(const char *block, Tile &t)
+{
+	void (*decode)(const char *, Tile &) = ZOH::isone(block) ? &ZOH::decompressone : &ZOH::decompresstwo;
+
+	decode(block, t);
+}
+
+/*
+void ZOH::compress(string inf, string zohf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	Exr::readRgba(inf, pixels, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "wb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for write";
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	int ndots = 25;
+	int dotcnt = 0;
+	printf("Progress [");
+	for (int i=0; i<ndots;++i) printf(" ");
+	printf("]\rProgress ["); fflush(stdout);
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			ZOH::compress(t, block);
+			if (fwrite(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+			if (tilecnt > (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; }
+		}
+	}
+
+	printf("]\n");		// advance to next line finally
+
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+}
+
+static int str2int(std::string s)
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// zoh file name is ...-w-h.zoh, extract width and height
+static void extract(string zohf, int &w, int &h)
+{
+	size_t n = zohf.rfind('.', zohf.length()-1);
+	size_t n1 = zohf.rfind('-', n-1);
+	size_t n2 = zohf.rfind('-', n1-1);
+	string width = zohf.substr(n2+1, n1-n2-1);
+	w = str2int(width);
+	string height = zohf.substr(n1+1, n-n1-1);
+	h = str2int(height);
+}
+
+static int mode_to_prec[] = {
+	10,7,11,10,
+	10,7,11,11,
+	10,7,11,12,
+	10,7,9,16,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,6,-1,
+};
+
+static int shapeindexhist[32], modehist[32], prechistone[16], prechisttwo[16], oneregion, tworegions;
+
+static void stats(char block[ZOH::BLOCKSIZE])
+{
+	char mode = block[0] & 0x1F; if ((mode & 0x3) == 0) mode = 0; if ((mode & 0x3) == 1) mode = 1; modehist[mode]++;
+	int prec = mode_to_prec[mode];
+	nvAssert (prec != -1);
+	if (!ZOH::isone(block))
+	{
+		tworegions++;
+		prechisttwo[prec]++;
+		int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3);
+		shapeindexhist[shapeindex]++;
+	}
+	else
+	{
+		oneregion++;
+		prechistone[prec]++;
+	}
+}
+
+static void printstats()
+{
+	printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]);
+	printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]);
+	printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]);
+	printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]);
+	printf("\nOne region %5.2f%%  Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions));
+	printf("\n");
+}
+
+void ZOH::decompress(string zohf, string outf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	extract(zohf, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "rb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+
+			ZOH::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+	Exr::writeRgba(outf, pixels, w, h);
+
+#ifndef EXTERNAL_RELEASE
+	printstats();	// print statistics
+#endif
+}
+*/
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.h b/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.h
new file mode 100644
index 0000000..d3003cb
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh.h
@@ -0,0 +1,65 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_H
+#define _ZOH_H
+
+#include "tile.h"
+
+namespace ZOH {
+
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f
+
+static const int NREGIONS_TWO	= 2;
+static const int NREGIONS_ONE	= 1;
+static const int NCHANNELS		= 3;
+
+struct FltEndpts	// endpoint pair A/B held as float vectors
+{
+    nv::Vector3 A;
+    nv::Vector3 B;
+};
+
+struct IntEndpts	// endpoint pair A/B held as one int per channel
+{
+	int A[NCHANNELS];
+	int B[NCHANNELS];
+};
+
+struct ComprEndpts	// endpoint pair A/B held as one uint per channel
+{
+	uint A[NCHANNELS];
+	uint B[NCHANNELS];
+};
+
+static const int BLOCKSIZE=16;	// chars per compressed block; compress() writes this many to its output
+static const int BITSIZE=128;	// BLOCKSIZE * 8
+
+void compress(const Tile &t, char *block);	// block must have room for BLOCKSIZE chars
+void decompress(const char *block, Tile &t);	// dispatches to decompressone/decompresstwo according to isone(block)
+
+float compressone(const Tile &t, char *block);	// the returned float is what compress() compares to choose a candidate
+float compresstwo(const Tile &t, char *block);
+void decompressone(const char *block, Tile &t);
+void decompresstwo(const char *block, Tile &t);
+
+float refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block);
+float roughtwo(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_TWO]);
+
+float refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block);
+float roughone(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_ONE]);
+
+bool isone(const char *block);	// true when (block[0] & 0x1F) is 0x03, 0x07, 0x0b or 0x0f
+
+} // namespace ZOH
+
+#endif // _ZOH_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.cpp b/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.cpp
new file mode 100644
index 0000000..fde3200
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.cpp
@@ -0,0 +1,324 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "zoh_utils.h"
+#include "nvmath/vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace ZOH;
+
+static const int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64};										// entry i is round(64*i/7); results get divided by 64
+static const int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// entry i is round(64*i/15); results get divided by 64
+
+/*static*/ Format Utils::FORMAT;	// definition of the static member; zero-initialized, i.e. UNSIGNED_F16, until assigned
+
+int Utils::lerp(int a, int b, int i, int denom)
+{
+	// integer blend of a and b at step i of denom, using the 6-bit fixed-point weight tables
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	const int *table;
+	if (denom == 7)
+		table = denom7_weights_64;
+	else if (denom == 3 || denom == 15)
+	{
+		if (denom == 3) { denom = 15; i *= 5; }	// thirds are every fifth entry of the fifteenths table
+		table = denom15_weights_64;
+	}
+	else
+		nvDebugCheck(0);
+	return (a*table[denom-i] + b*table[i] + 32) >> 6;	// +32 rounds before the 6 fraction bits are dropped
+}
+
+Vector3 Utils::lerp(const Vector3& a, const Vector3 &b, int i, int denom)
+{
+	// float-vector counterpart of the integer lerp: same weight tables, no rounding term
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	const int *table;
+	if (denom == 7)
+		table = denom7_weights_64;
+	else if (denom == 3 || denom == 15)
+	{
+		if (denom == 3) { denom = 15; i *= 5; }	// thirds are every fifth entry of the fifteenths table
+		table = denom15_weights_64;
+	}
+	else
+		nvUnreachable();
+	// no need to round these as this is an exact division
+	return (a*float(table[denom-i]) + b*float(table[i])) / float(1 << 6);
+}
+
+
+/*
+	For unsigned f16, clamp the input to [0,F16MAX]. Thus u15.
+	For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16.
+
+	The conversions proceed as follows:
+
+	unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX.
+	signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value
+	unsigned int: get bits. return as a positive value.
+	signed int. get bits. return as a value in -32768..32767.
+
+	The inverse conversions are just the inverse of the above.
+*/
+
+// clamp the 3 channels of the input vector to the allowable range based on FORMAT
+// note that each channel is a float storing the allowable range as a bit pattern converted to float
+// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX]
+
+void Utils::clamp(Vector3 &v)
+{
+	// both formats share the upper bound F16MAX; only the lower bound depends on FORMAT
+	for (int ch = 0; ch < 3; ++ch)
+	{
+		if (Utils::FORMAT == UNSIGNED_F16)
+		{
+			if (v.component[ch] < 0.0) v.component[ch] = 0;
+			else if (v.component[ch] > F16MAX) v.component[ch] = F16MAX;
+		}
+		else if (Utils::FORMAT == SIGNED_F16)
+		{
+			if (v.component[ch] < -F16MAX) v.component[ch] = -F16MAX;
+			else if (v.component[ch] > F16MAX) v.component[ch] = F16MAX;
+		}
+		else
+		{
+			nvUnreachable();
+		}
+	}
+}
+
+// convert a u16 value to s17 (represented as an int) based on the format expected
+int Utils::ushort_to_format(unsigned short input)
+{
+	int out;
+
+	if (Utils::FORMAT == UNSIGNED_F16)
+	{
+		// a set sign bit clamps to 0, everything above F16MAX clamps to F16MAX
+		if (input & F16S_MASK)
+			out = 0;
+		else
+			out = (input > F16MAX) ? F16MAX : input;
+	}
+	else if (Utils::FORMAT == SIGNED_F16)
+	{
+		// split into sign and magnitude, clamp the magnitude, then reapply the sign
+		const bool negative = (input & F16S_MASK) != 0;
+		const int magnitude = input & F16EM_MASK;
+		out = (magnitude > F16MAX) ? F16MAX : magnitude;
+		if (negative)
+			out = -out;
+	}
+	return out;
+}
+
+// convert a s17 value to u16 based on the format expected
+unsigned short Utils::format_to_ushort(int input)
+{
+	unsigned short out;
+
+	// the input is expected to lie inside the range of the active format already
+	if (Utils::FORMAT == UNSIGNED_F16)
+	{
+		nvDebugCheck (input >= 0 && input <= F16MAX);
+		out = input;
+	}
+	else if (Utils::FORMAT == SIGNED_F16)
+	{
+		nvDebugCheck (input >= -F16MAX && input <= F16MAX);
+
+		// convert to sign-magnitude: sign bit on top of the absolute value
+		const int signbit = (input < 0) ? F16S_MASK : 0;
+		const int magnitude = (input < 0) ? -input : input;
+		out = signbit | magnitude;
+	}
+	// any other FORMAT value leaves out unset (there is no fallback case)
+
+	return out;
+}
+
+// quantize the input range into equal-sized bins
+int Utils::quantize(float value, int prec)
+{
+	nvDebugCheck (prec > 1);	// didn't bother to make it work for 1
+
+	// round to the nearest integer before binning
+	const float rounded = (float)floor(value + 0.5);
+
+	// bias precisions 11..16 to get a more accurate quantization
+	const int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0;
+
+	int q;
+	if (Utils::FORMAT == UNSIGNED_F16)
+	{
+		nvDebugCheck (rounded >= 0 && rounded <= F16MAX);
+
+		// spread [0, F16MAX] over the 2^prec bins
+		q = ((int(rounded) << prec) + bias) / (F16MAX+1);
+		nvDebugCheck (q >= 0 && q < (1 << prec));
+	}
+	else if (Utils::FORMAT == SIGNED_F16)
+	{
+		nvDebugCheck (rounded >= -F16MAX && rounded <= F16MAX);
+
+		// quantize the magnitude with one bit less, then put the sign back
+		const int ivalue = (int)rounded;
+		const int magnitude = (ivalue < 0) ? -ivalue : ivalue;
+		q = ((magnitude << (prec-1)) + bias) / (F16MAX+1);
+		if (ivalue < 0)
+			q = -q;
+		nvDebugCheck (q > -(1 << (prec-1)) && q < (1 << (prec-1)));
+	}
+
+	return q;
+}
+
+int Utils::finish_unquantize(int q, int prec)
+{
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:	return (q * 31) >> 6;	// scale the magnitude by 31/64
+	case SIGNED_F16:	return (q < 0) ? -(((-q) * 31) >> 5) : ((q * 31) >> 5);	// scale the magnitude by 31/32, sign kept
+	default:			return q;	// (prec is accepted but not used)
+	}
+}
+
+// unquantize each bin to midpoint of original bin range, except
+// for the end bins which we push to an endpoint of the bin range.
+// we do this to ensure we can represent all possible original values.
+// the asymmetric end bins do not affect PSNR for the test images.
+//
+// code this function assuming an arbitrary bit pattern as the encoded block
+int Utils::unquantize(int q, int prec)
+{
+	nvDebugCheck (prec > 1);	// not implemented for prec 1
+
+	int unq;
+	if (Utils::FORMAT == UNSIGNED_F16)
+	{
+		// modify this case to move the multiplication by 31 after interpolation.
+		// Need to use finish_unquantize.
+
+		// since we have 16 bits available, let's unquantize this to 16 bits unsigned
+		// thus the scale factor is [0-7c00)/[0-10000) = 31/64
+		if (prec >= 15)
+			unq = q;	// passed through unchanged
+		else if (q == 0)
+			unq = 0;	// lowest bin goes to the bottom of the range
+		else if (q == ((1<<prec)-1))
+			unq = U16MAX;	// highest bin goes to the top of the range
+		else
+			unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec;	// every other bin goes to its midpoint
+	}
+	else if (Utils::FORMAT == SIGNED_F16)
+	{
+		// here, let's stick with S16 (no apparent quality benefit from going to S17)
+		// range is (-7c00..7c00)/(-8000..8000) = 31/32
+		if (prec >= 16)
+		{
+			// don't remove this test even though it appears equivalent to the code below
+			// as it isn't -- the code below can overflow for prec = 16
+			unq = q;
+		}
+		else
+		{
+			// work on the magnitude and restore the sign at the end
+			const bool negative = (q < 0);
+			const int magnitude = negative ? -q : q;
+
+			if (magnitude == 0)
+				unq = 0;
+			else if (magnitude >= ((1<<(prec-1))-1))
+				unq = negative ? -S16MAX : S16MAX;
+			else
+			{
+				const int mid = (magnitude * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1);
+				unq = negative ? -mid : mid;
+			}
+		}
+	}
+	return unq;
+}
+
+
+
+// pick a norm! define NORM_EUCLIDEAN or NORM_ABS; with neither defined norm() has no return statement
+#define	NORM_EUCLIDEAN 1
+// error metric between two colors, selected at compile time by the macro above
+float Utils::norm(const Vector3 &a, const Vector3 &b)
+{
+#ifdef	NORM_EUCLIDEAN
+	return lengthSquared(a - b);	// nvmath lengthSquared of the difference vector
+#endif
+#ifdef	NORM_ABS
+	Vector3 err = a - b;
+	return fabs(err.x) + fabs(err.y) + fabs(err.z);	// sum of the absolute per-channel differences
+#endif
+}
+
+// parse <name>[<start>{:<end>}]{,}	
+// the pointer starts here         ^
+// name is 1 or 2 chars and matches field names. start and end are decimal numbers
+void Utils::parse(const char *encoding, int &ptr, Field &field, int &endbit, int &len)
+{
+	if (ptr <= 0) return;	// nothing left to parse; the outputs are left untouched
+
+	--ptr;	// step back over an optional ',' onto the closing ']'
+	if (encoding[ptr] == ',') --ptr;
+	nvDebugCheck (encoding[ptr] == ']');
+	--ptr;
+	// digits are met least-significant first, so keep track of the decimal place value
+	endbit = 0;
+	for (int place = 1; encoding[ptr] != ':' && encoding[ptr] != '['; place *= 10, --ptr)
+	{
+		nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+		endbit += (encoding[ptr] - '0') * place;
+	}
+	// "[n]" names a single bit; in "[start:end]" the start bit stands in front of the ':'
+	int startbit = 0;
+	if (encoding[ptr] == '[')
+		startbit = endbit;
+	else
+	{
+		--ptr;
+		for (int place = 1; encoding[ptr] != '['; place *= 10, --ptr)
+		{
+			nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+			startbit += (encoding[ptr] - '0') * place;
+		}
+	}
+	len = startbit - endbit + 1;	// startbit>=endbit note
+	// the name ends right before the '[': "m", "d", or a channel letter followed by w/x/y/z
+	--ptr;
+	if (encoding[ptr] == 'm') { field = FIELD_M; return; }
+	if (encoding[ptr] == 'd') { field = FIELD_D; return; }
+	nvDebugCheck (encoding[ptr] >= 'w' && encoding[ptr] <= 'z');
+	int code = encoding[ptr--] - 'w';	// 0..3 for w..z
+	switch (encoding[ptr])	// the channel letter selects the tens digit of the Field value
+	{
+	case 'r':	code += 10; break;
+	case 'g':	code += 20; break;
+	case 'b':	code += 30; break;
+	default:	nvDebugCheck(0);
+	}
+	field = (Field) code;
+}
+
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.h b/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.h
new file mode 100644
index 0000000..3ce33ce
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/zoh_utils.h
@@ -0,0 +1,72 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#ifndef _ZOH_UTILS_H
+#define _ZOH_UTILS_H
+
+#include "nvmath/vector.h"
+
+namespace ZOH {
+
+inline int SIGN_EXTEND(int x, int nb) { return ((((signed(x))&(1<<((nb)-1)))?((~0)<<(nb)):0)|(signed(x))); }
+
+enum Field {	// field names of the encoding strings; Utils::parse builds the channel values as r/g/b = 10/20/30 plus w/x/y/z = 0..3
+    FIELD_M = 1,	// mode
+    FIELD_D = 2,	// distribution/shape
+    FIELD_RW = 10+0, FIELD_RX = 10+1, FIELD_RY = 10+2, FIELD_RZ = 10+3,	// red channel endpoints or deltas
+    FIELD_GW = 20+0, FIELD_GX = 20+1, FIELD_GY = 20+2, FIELD_GZ = 20+3,	// green channel endpoints or deltas
+    FIELD_BW = 30+0, FIELD_BX = 30+1, FIELD_BY = 30+2, FIELD_BZ = 30+3,	// blue channel endpoints or deltas
+};
+
+// some constants (bit patterns and integer range limits used by the Utils conversions)
+static const int F16S_MASK	=  0x8000;		// f16 sign mask
+static const int F16EM_MASK	=  0x7fff;		// f16 exp & mantissa mask
+static const int U16MAX		=  0xffff;		// largest unsigned 16-bit value
+static const int S16MIN		= -0x8000;		// smallest signed 16-bit value
+static const int S16MAX		=  0x7fff;		// largest signed 16-bit value
+static const int INT16_MASK	=  0xffff;		// all 16 low bits set
+static const int F16MAX		=  0x7bff;		// MAXFLT bit pattern for halfs
+
+enum Format { UNSIGNED_F16, SIGNED_F16 };	// value held by Utils::FORMAT
+
+class Utils	// holder for the shared helper routines; every member is static
+{
+public:
+    static Format FORMAT;     // this is a global -- we're either handling unsigned or signed half values
+
+    // error metrics
+    static float norm(const nv::Vector3 &a, const nv::Vector3 &b);
+    static float mpsnr_norm(const nv::Vector3 &a, int exposure, const nv::Vector3 &b);	// NOTE(review): zoh_utils.cpp has no definition for this one -- confirm where it lives
+
+    // conversion & clamp
+    static int ushort_to_format(unsigned short input);
+    static unsigned short format_to_ushort(int input);
+
+    // clamp to format
+    static void clamp(nv::Vector3 &v);
+
+    // quantization and unquantization
+    static int finish_unquantize(int q, int prec);
+    static int unquantize(int q, int prec);
+    static int quantize(float value, int prec);
+
+    static void parse(const char *encoding, int &ptr, Field & field, int &endbit, int &len);	// reads one field from the end of an encoding string, moving ptr backwards
+
+    // lerping
+    static int lerp(int a, int b, int i, int denom);
+    static nv::Vector3 lerp(const nv::Vector3 & a, const nv::Vector3 & b, int i, int denom);
+};
+
+}
+
+#endif // _ZOH_UTILS_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/zohone.cpp b/3rdparty/bimg/3rdparty/nvtt/bc6h/zohone.cpp
new file mode 100644
index 0000000..3652676
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/zohone.cpp
@@ -0,0 +1,799 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// one region zoh compress/decompress code
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/vector.inl"
+#include "nvmath/fitting.h"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+#define NINDICES	16	// palette entries per region
+#define	INDEXBITS	4	// bits per stored index (position 0 is stored with one bit less)
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// top bit of an index
+#define	DENOM		(NINDICES-1)	// interpolation denominator handed to Utils::lerp
+
+#define	NSHAPES	1
+
+static const int shapes[NSHAPES] =
+{
+    0x0000
+};	// only 1 shape
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)	// region of pixel (x,y): bit 15-x-4*y of the shape mask, so always 0 for the single all-zero shape
+
+#define	POS_TO_X(pos)	((pos)&3)	// column of linear position pos
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// row of linear position pos
+
+#define	NDELTA	2	// number of entries in Chanpat::prec
+
+struct Chanpat
+{
+    int prec[NDELTA];		// precision pattern for one channel: prec[0] bits for endpoint A, prec[1] bits for endpoint B or its delta
+};
+
+struct Pattern	// one row of the patterns table: how a mode lays out its header bits
+{
+    Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel
+    int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int mode;				// associated mode value
+    int modebits;			// number of mode bits
+    const char *encoding;	// verilog description of encoding for this mode; parsed from its end by Utils::parse
+};
+
+#define MAXMODEBITS	5
+#define	MAXMODES (1<<MAXMODEBITS)
+
+#define	NPATTERNS 4
+// columns: {prec[0],prec[1]} for r, g and b, then transformed, mode, modebits, encoding
+static const Pattern patterns[NPATTERNS] =
+{
+    16,4,  16,4,  16,4,   1, 0x0f, 5, "bw[10],bw[11],bw[12],bw[13],bw[14],bw[15],bx[3:0],gw[10],gw[11],gw[12],gw[13],gw[14],gw[15],gx[3:0],rw[10],rw[11],rw[12],rw[13],rw[14],rw[15],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    12,8,  12,8,  12,8,   1, 0x0b, 5, "bw[10],bw[11],bx[7:0],gw[10],gw[11],gx[7:0],rw[10],rw[11],rx[7:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,9,  11,9,  11,9,   1, 0x07, 5, "bw[10],bx[8:0],gw[10],gx[8:0],rw[10],rx[8:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,10, 10,10, 10,10,  0, 0x03, 5, "bx[9:0],gx[9:0],rx[9:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+};
+
+// mapping of mode to the corresponding index in patterns; -1 marks a mode value with no entry there
+static const int mode_to_pat[MAXMODES] = {
+    -1,-1,-1,
+    3,	// 0x03
+    -1,-1,-1,
+    2,	// 0x07
+    -1,-1,-1,
+    1,	// 0x0b
+    -1,-1,-1,
+    0,	// 0x0f
+    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+};
+
+#define	R_0(ep)	(ep)[0].A[i]	// endpoint A of region 0, channel i -- relies on a variable named i at the point of use
+#define	R_1(ep)	(ep)[0].B[i]	// endpoint B of region 0, channel i -- relies on a variable named i at the point of use
+#define	MASK(n)	((1<<(n))-1)	// value with the low n bits set
+
+// compress endpoints
+//   A is masked to prec[0] bits of its channel
+//   B is masked to prec[1] bits; transformed patterns store the delta B - A in its place
+//   (only region 0 is touched -- the one-region modes have no other)
+//   the delta wraps to prec[1] bits; endpts_fit checks afterwards that nothing was lost
+static void compress_endpts(const IntEndpts in[NREGIONS_ONE], ComprEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        const int bits_a = p.chan[ch].prec[0];
+        const int bits_b = p.chan[ch].prec[1];
+
+        // choose what goes into the B slot before masking
+        const int b_payload = p.transformed ? (in[0].B[ch] - in[0].A[ch]) : in[0].B[ch];
+
+        // the masks drop every bit above the pattern's precision
+        out[0].A[ch] = in[0].A[ch] & MASK(bits_a);
+        out[0].B[ch] = b_payload & MASK(bits_b);
+    }
+}
+
+// decompress endpoints: undo compress_endpts, sign-extending where the format or the delta calls for it
+static void decompress_endpts(const ComprEndpts in[NREGIONS_ONE], IntEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    const bool issigned = Utils::FORMAT == SIGNED_F16;
+
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        const int bits_a = p.chan[ch].prec[0];
+        const int bits_b = p.chan[ch].prec[1];
+
+        // endpoint A is stored directly; it only needs sign extension for the signed format
+        out[0].A[ch] = issigned ? SIGN_EXTEND(in[0].A[ch], bits_a) : in[0].A[ch];
+        if (p.transformed)
+        {
+            // B was stored as a signed delta from A: rebuild it and wrap to A's precision
+            int t = SIGN_EXTEND(in[0].B[ch], bits_b);
+            t = (t + in[0].A[ch]) & MASK(bits_a);
+            out[0].B[ch] = issigned ? SIGN_EXTEND(t, bits_a) : t;
+        }
+        else
+        {
+            out[0].B[ch] = issigned ? SIGN_EXTEND(in[0].B[ch], bits_b) : in[0].B[ch];
+        }
+    }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_ONE], int prec, IntEndpts q_endpts[NREGIONS_ONE])
+{
+    // quantize all three channels of both endpoints of each region to the same precision
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+    {
+        const float a[NCHANNELS] = { endpts[r].A.x, endpts[r].A.y, endpts[r].A.z };
+        const float b[NCHANNELS] = { endpts[r].B.x, endpts[r].B.y, endpts[r].B.z };
+
+        for (int ch = 0; ch < NCHANNELS; ++ch) q_endpts[r].A[ch] = Utils::quantize(a[ch], prec);
+        for (int ch = 0; ch < NCHANNELS; ++ch) q_endpts[r].B[ch] = Utils::quantize(b[ch], prec);
+    }
+}
+
+// swap endpoints as needed to ensure that the index at the region's anchor position has a 0 high-order bit
+// positions run from 0 at x=0 y=0 to 15 at x=3 y=3, so y = (pos >> 2) & 3 and x = pos & 3
+static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    const int anchor_pos[NREGIONS_ONE] = { 0 };	// since WLOG we have the high bit of the shapes at 0
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        const int ax = POS_TO_X(anchor_pos[region]);
+        const int ay = POS_TO_Y(anchor_pos[region]);
+        nvDebugCheck(REGION(ax,ay,shapeindex) == region);		// double check the table
+        if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+            continue;
+
+        // high bit is set, swap the endpts and mirror the indices for this region
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            const int held = endpts[region].A[ch];
+            endpts[region].A[ch] = endpts[region].B[ch];
+            endpts[region].B[ch] = held;
+        }
+        for (int y = 0; y < Tile::TILE_H; y++)
+            for (int x = 0; x < Tile::TILE_W; x++)
+                if (REGION(x,y,shapeindex) == region)
+                    indices[y][x] = NINDICES - 1 - indices[y][x];
+    }
+}
+
+// endpoints fit only if the compression was lossless, i.e. they survive a compress/decompress round trip
+static bool endpts_fit(const IntEndpts orig[NREGIONS_ONE], const ComprEndpts compressed[NREGIONS_ONE], const Pattern &p)
+{
+    // expand the compressed form again and compare it value by value
+    IntEndpts restored[NREGIONS_ONE];
+    decompress_endpts(compressed, restored, p);
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+            if (orig[region].A[ch] != restored[region].A[ch] || orig[region].B[ch] != restored[region].B[ch])
+                return false;
+    }
+    return true;
+}
+
+static void write_header(const ComprEndpts endpts[NREGIONS_ONE], const Pattern &p, Bits &out)
+{
+    // interpret the verilog backwards and emit each bit field in that order
+    const ComprEndpts &ep = endpts[0];
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        // pick the value this field slices its bits from
+        int source;
+        switch(field)
+        {
+        case FIELD_M:	source = p.mode;  break;
+        case FIELD_RW:	source = ep.A[0]; break;
+        case FIELD_RX:	source = ep.B[0]; break;
+        case FIELD_GW:	source = ep.A[1]; break;
+        case FIELD_GX:	source = ep.B[1]; break;
+        case FIELD_BW:	source = ep.A[2]; break;
+        case FIELD_BX:	source = ep.B[2]; break;
+
+        // FIELD_D and the y/z endpoint fields do not occur in the one-region encodings
+        default:
+            nvUnreachable();
+            continue;
+        }
+
+        // hand over the value shifted down to the field's end bit, together with the field length
+        out.write(source >> endbit, len);
+    }
+}
+
+// Read the header of a one-region block from `in`.
+//   in     - bit reader positioned at the start of the block
+//   endpts - receives the endpoint fields exactly as stored (no sign extension, no delta undo)
+//   p      - receives a copy of the pattern selected by the mode bits
+// NOTE(review): a mode without a one-region pattern maps to -1 in mode_to_pat and is only caught by
+// nvDebugCheck; presumably callers screen blocks first (see ZOH::isone) -- confirm.
+static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;	// the remaining three mode bits sit above the first two
+
+    const int pat_index = mode_to_pat[mode];
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    // accumulators for the w (endpoint A) and x (endpoint B) fields, indexed r, g, b
+    int w[NCHANNELS] = { 0, 0, 0 };
+    int x[NCHANNELS] = { 0, 0, 0 };
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+        // !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        int *dest;
+        switch(field)
+        {
+        case FIELD_M:	continue;	// already processed so ignore
+        case FIELD_RW:	dest = &w[0]; break;
+        case FIELD_RX:	dest = &x[0]; break;
+        case FIELD_GW:	dest = &w[1]; break;
+        case FIELD_GX:	dest = &x[1]; break;
+        case FIELD_BW:	dest = &w[2]; break;
+        case FIELD_BX:	dest = &x[2]; break;
+
+        // FIELD_D and the y/z endpoint fields do not occur in the one-region encodings
+        default:
+            nvUnreachable();
+            continue;
+        }
+
+        // a field may arrive in several slices; OR each slice in at its bit offset
+        *dest |= in.read(len) << endbit;
+    }
+    nvDebugCheck (in.getptr() == 128 - 63);	// what remains are the 63 index bits
+
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        endpts[0].A[ch] = w[ch];
+        endpts[0].B[ch] = x[ch];
+    }
+}
+
+// write the indices of all positions in order; index 0 is compressed, i.e. stored with one bit less
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    // shapeindex is accepted but not used here
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        // the first position drops its top bit, every other one uses the full INDEXBITS
+        const int nbits = (pos == 0) ? INDEXBITS - 1 : INDEXBITS;
+        out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], nbits);
+    }
+}
+
+static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    // header first, then the indices; together they must fill the block exactly
+    Bits stream(block, ZOH::BITSIZE);
+
+    write_header(endpts, p, stream);
+    write_indices(indices, shapeindex, stream);
+
+    nvDebugCheck(stream.getptr() == ZOH::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // scale endpoints: bring both ends of every channel back up from their quantized form
+    int lo[NCHANNELS], hi[NCHANNELS];
+
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        lo[ch] = Utils::unquantize(endpts.A[ch], prec);
+        hi[ch] = Utils::unquantize(endpts.B[ch], prec);
+    }
+
+    // interpolate: entry i sits at step i of DENOM between the two ends of each channel;
+    // finish_unquantize then applies the final format-dependent scale to each result
+    for (int i = 0; i < NINDICES; ++i)
+    {
+        // channels 0, 1 and 2 land in x, y and z of the palette entry
+        const int c0 = Utils::lerp(lo[0], hi[0], i, DENOM);
+        const int c1 = Utils::lerp(lo[1], hi[1], i, DENOM);
+        const int c2 = Utils::lerp(lo[2], hi[2], i, DENOM);
+
+        palette[i].x = float(Utils::finish_unquantize(c0, prec));
+        palette[i].y = float(Utils::finish_unquantize(c1, prec));
+        palette[i].z = float(Utils::finish_unquantize(c2, prec));
+    }
+    // (palette must have room for NINDICES entries)
+}
+
+// read the indices back in position order; position 0 was compressed, i.e. stored with one bit less
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    // shapeindex is accepted but not used here
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        // INDEXBITS-1 bits for position 0, the full INDEXBITS for every other position
+        const int nbits = (pos == 0) ? INDEXBITS - 1 : INDEXBITS;
+        indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(nbits);
+    }
+}
+
+void ZOH::decompressone(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+    // header: the pattern plus the endpoints in their stored (compressed) form
+    Pattern pat;
+    ComprEndpts stored[NREGIONS_ONE];
+    read_header(in, stored, pat);
+
+    const int shapeindex = 0;		// only one shape
+    const int prec = pat.chan[0].prec[0];
+
+    // expand the endpoints and build one palette per region
+    IntEndpts endpts[NREGIONS_ONE];
+    decompress_endpts(stored, endpts, pat);
+
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+        generate_palette_quantized(endpts[r], prec, &palette[r][0]);
+
+    // read indices; they follow the header and must end exactly at the block boundary
+    int indices[Tile::TILE_H][Tile::TILE_W];
+    read_indices(in, shapeindex, indices);
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup: every pixel takes the palette entry its index selects
+    for (int y = 0; y < Tile::TILE_H; y++)
+        for (int x = 0; x < Tile::TILE_W; x++)
+            t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+//   colors, importance - np colors and the weight applied to each color's error
+//   endpts, prec       - quantized endpoints and the precision they were quantized to
+// returns the sum over all colors of the smallest importance-weighted Utils::norm found for it
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    generate_palette_quantized(endpts, prec, palette);
+
+    float toterr = 0;
+    for (int i = 0; i < np; ++i)
+    {
+        float besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        // walk the palette in order and stop once the error is zero or starts to grow
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            const float err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+//   indices - receives, per pixel, the palette index with the smallest Utils::norm error
+//   toterr  - receives, per region, the summed error of the chosen indices
+// Every position of the TILE_H x TILE_W grid is first set to index 0, so positions outside
+// tile.size_x / tile.size_y are never left unassigned for the stages that walk the full grid.
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec,
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    for (int y = 0; y < Tile::TILE_H; y++)
+        for (int x = 0; x < Tile::TILE_W; x++)
+            indices[y][x] = 0;
+
+    for (int y = 0; y < tile.size_y; y++)
+    for (int x = 0; x < tile.size_x; x++)
+    {
+        const int region = REGION(x,y,shapeindex);
+        float besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+
+        // index 0 is the starting candidate; stop once the error is zero or starts to grow
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            const float err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr) { besterr = err; indices[y][x] = i; }
+        }
+        toterr[region] += besterr;
+    }
+}
+
+// Log-search the best value of one endpoint channel (A if do_b == 0, else B) around old_endpts.
+//   new_endpts - receives a copy of old_endpts with that one channel moved to the best value found
+//   old_err    - error of old_endpts; a candidate is only accepted if it beats the running minimum
+// returns the smallest error found (old_err if nothing improved)
+// NOTE(review): candidates outside [0, 1 << prec) are skipped, so negative endpoint values are never
+// tried -- confirm this is intended when Utils::FORMAT is SIGNED_F16.
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    IntEndpts temp_endpts;			// candidate under test
+    float min_err = old_err;		// start with the best current error
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for this endpoint (which)
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        int beststep = 0;		// stays 0 unless one of the two directions improves on min_err
+
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;
+            }
+
+            const float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+
+        // if this was an improvement, move the endpoint and continue search from there
+        // (beststep is 0 otherwise, which leaves the endpoint where it is)
+        if (do_b == 0)
+            new_endpts.A[ch] += beststep;
+        else
+            new_endpts.B[ch] += beststep;
+    }
+    return min_err;
+}
+
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)	// per-channel endpoint refinement; the result is written to opt_endpts
+{
+    float opt_err = orig_err;	// best error so far; starts from the caller-supplied error
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        opt_endpts.A[ch] = orig_endpts.A[ch];
+        opt_endpts.B[ch] = orig_endpts.B[ch];
+    }
+    /* pseudocode of the per-channel search below:
+        err0 = perturb(rgb0, delta0)
+        err1 = perturb(rgb1, delta1)
+        if (err0 < err1)
+            if (err0 >= initial_error) break
+            rgb0 += delta0
+            next = 1
+        else
+            if (err1 >= initial_error) break
+            rgb1 += delta1
+            next = 0
+        initial_err = map()
+        for (;;)
+            err = perturb(next ? rgb1:rgb0, delta)
+            if (err >= initial_err) break
+            next? rgb1 : rgb0 += delta
+            initial_err = err
+	*/
+    IntEndpts new_a, new_b;	// results of perturbing endpoint A / endpoint B from the current best
+    IntEndpts new_endpt;	// result of each perturbation in the alternating loop
+    int do_b;	// which endpoint the alternating loop perturbs next (0 = A, otherwise B)
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
+
+        if (err0 < err1)
+        {
+            if (err0 >= opt_err)
+                continue;	// no improvement for this channel, move on to the next one
+
+            opt_endpts.A[ch] = new_a.A[ch];
+            opt_err = err0;
+            do_b = 1;		// do B next
+        }
+        else
+        {
+            if (err1 >= opt_err)
+                continue;	// no improvement for this channel, move on to the next one
+            opt_endpts.B[ch] = new_b.B[ch];
+            opt_err = err1;
+            do_b = 0;		// do A next
+        }
+
+        // now alternate endpoints and keep trying until there is no improvement
+        for (;;)
+        {
+            float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
+            if (err >= opt_err)
+                break;
+            if (do_b == 0)
+                opt_endpts.A[ch] = new_endpt.A[ch];
+            else
+                opt_endpts.B[ch] = new_endpt.B[ch];
+            opt_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+// Optimize the endpoints of every region against the pixels that belong to it.
+//   orig_err / orig_endpts - per-region starting error and endpoints
+//   opt_endpts             - receives the optimized endpoints per region
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE],
+                            const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+
+    for (int region=0; region<NREGIONS_ONE; ++region)
+    {
+        // collect the pixels in the region together with their importance weights
+        int np = 0;
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) != region)
+                    continue;
+                pixels[np] = tile.data[y][x];
+                importance[np] = tile.importance_map[y][x];
+                ++np;
+            }
+        }
+        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+float ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block)
+{
+    float orig_err[NREGIONS_ONE], opt_err[NREGIONS_ONE], orig_toterr, opt_toterr;
+    IntEndpts orig_endpts[NREGIONS_ONE], opt_endpts[NREGIONS_ONE];
+    ComprEndpts compr_orig[NREGIONS_ONE], compr_opt[NREGIONS_ONE];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+    // walk the pattern table in order; the first pattern whose quantized endpoints fit emits the block and returns its error
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        // precisions for all channels need to be the same
+        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
+        // baseline candidate: quantize, assign indices, normalize the index order, then compress with this pattern
+        quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, patterns[sp]);
+        if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
+        {
+            optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);	// optimized candidate, same pipeline
+            assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
+            swap_indices(opt_endpts, opt_indices, shapeindex_best);
+            compress_endpts(opt_endpts, compr_opt, patterns[sp]);
+            orig_toterr = opt_toterr = 0;	// total error = sum of the per-region errors
+            for (int i=0; i < NREGIONS_ONE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+
+            if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
+            {
+                emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
+                return opt_toterr;
+            }
+            else
+            {
+                // either it stopped fitting when we optimized it, or there was no improvement
+                // so go back to the unoptimized endpoints which we know will fit
+                emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
+                return orig_toterr;
+            }
+        }
+    }
+
+	nvAssert (false); // "No candidate found, should never happen (refineone.)";
+	return FLT_MAX;	// reached only when no pattern fit; nothing has been written to block on this path
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vector3 palette[NREGIONS_ONE][NINDICES])
+{
+    for (int r = 0; r < NREGIONS_ONE; ++r)          // every region gets its own palette
+        for (int k = 0; k < NINDICES; ++k)          // slot k = Utils::lerp between A and B with weight k over DENOM
+            palette[r][k] = Utils::lerp(endpts[r].A, endpts[r].B, k, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+
+    // per pixel: scan the region's palette for the smallest importance-weighted error
+    for (int y = 0; y < tile.size_y; y++)
+    for (int x = 0; x < tile.size_x; x++)
+    {
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE])
+{
+    for (int region=0; region<NREGIONS_ONE; ++region)	// estimate unquantized endpoints for each region independently
+    {
+        int np = 0;
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+        // gather the region's pixels and accumulate their sum (turned into the mean further down)
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    colors[np] = tile.data[y][x];
+                    mean += tile.data[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        // handle simple cases: with 0, 1 or 2 pixels the endpoints are chosen directly and no fit is needed
+        if (np == 0)
+        {
+            Vector3 zero(0,0,0);
+            endpts[region].A = zero;
+            endpts[region].B = zero;
+            continue;
+        }
+        else if (np == 1)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[0];
+            continue;
+        }
+        else if (np == 2)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[1];
+            continue;
+        }
+
+        mean /= float(np);	// np >= 3 here
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction
+        float minp = FLT_MAX, maxp = -FLT_MAX;
+        for (int i = 0; i < np; i++)
+        {
+            float dp = dot(colors[i]-mean, direction);
+            if (dp < minp) minp = dp;
+            if (dp > maxp) maxp = dp;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + minp*direction;
+        endpts[region].B = mean + maxp*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+    // score the shape: total error of mapping the tile onto palettes built from these unquantized endpoints
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compressone(const Tile &t, char *block)
+{
+    FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE];
+    float msebest = FLT_MAX;
+    int shapeindex_best = 0;
+
+    /*
+        intended approach: keep every shape whose rough error is within 5% of the best one,
+        refine each of them and emit the winner
+    */
+    // hack for now -- just use the best value WORK
+    for (int shape = 0; shape < NSHAPES && msebest > 0.0; ++shape)
+    {
+        const float shape_err = roughone(t, shape, tempendpts);
+        if (!(shape_err < msebest))
+            continue;		// not an improvement over the best shape seen so far
+
+        msebest = shape_err;
+        shapeindex_best = shape;
+        memcpy(endptsbest, tempendpts, sizeof(endptsbest));
+    }
+
+    return refineone(t, shapeindex_best, endptsbest, block);
+}
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc6h/zohtwo.cpp b/3rdparty/bimg/3rdparty/nvtt/bc6h/zohtwo.cpp
new file mode 100644
index 0000000..5a14294
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc6h/zohtwo.cpp
@@ -0,0 +1,883 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// two regions zoh compress/decompress code
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+/* optimization algorithm
+
+	get initial float endpoints
+	convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
+		note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
+	for each EC candidate in order from max precision to smaller precision
+		convert endpoints using the appropriate precision.
+		optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
+			(thus the endpoints and indices are in final form.)
+		transform and get bit delta.
+		if the bit delta fits, exit
+	if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should happen hardly ever.
+		add a state variable so that nvDebugCheck can verify we only do this once.
+	convert to bit stream.
+	return the error.
+
+	Global optimization
+		order all tiles based on their errors
+		do something special for high-error tiles
+			the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...
+
+	display an image that shows partitioning and precision selected for each tile
+*/
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/fitting.h"
+#include "nvmath/vector.inl"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+#define NINDICES	8	// palette entries per region (1 << INDEXBITS)
+#define	INDEXBITS	3	// bits stored per pixel index
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// most significant bit of an index
+#define	DENOM		(NINDICES-1)	// denominator passed to Utils::lerp when building palettes
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minimum easily and
+// stop without having to touch all shapes?
+
+#include "shapes_two.h"
+// use only the first 32 available shapes (overrides the NSHAPES/SHAPEBITS values from shapes_two.h)
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 32
+#define SHAPEBITS 5
+
+#define	POS_TO_X(pos)	((pos)&3)	// column: low two bits of a linear tile position
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// row: next two bits of a linear tile position
+
+#define	NDELTA	4	// number of precision entries per channel in Chanpat
+
+struct Chanpat
+{
+    int prec[NDELTA];		// precision pattern for one channel: bit widths used as masks for [0].A, [0].B, [1].A, [1].B (see compress_endpts)
+};
+
+struct Pattern
+{
+    Chanpat chan[NCHANNELS];    // allow different bit patterns per channel -- but we still want constant precision per channel
+    int transformed;            // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int mode;                   // associated mode value (written as field m by write_header)
+    int modebits;               // number of mode bits (2 or 5 in the table below)
+    const char *encoding;       // verilog description of encoding for this mode; parsed with Utils::parse by write_header/read_header
+};
+
+#define MAXMODEBITS	5	// widest mode field
+#define	MAXMODES (1<<MAXMODEBITS)	// size of the mode_to_pat lookup table
+
+#define	NPATTERNS 10	// number of rows in patterns[]
+
+static const Pattern patterns[NPATTERNS] =
+{	// columns: prec[4] for channel 0, prec[4] for channel 1, prec[4] for channel 2, transformed, mode, modebits, encoding
+    11,5,5,5,	11,4,4,4,	11,4,4,4,	1,	0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,5,5,5,	11,4,4,4,	1,	0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,4,4,4,	11,5,5,5,	1,	0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,5,5,5,	10,5,5,5,	10,5,5,5,	1,	0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
+    9,5,5,5,	9,5,5,5,	9,5,5,5,	1,	0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
+    8,6,6,6,	8,5,5,5,	8,5,5,5,	1,	0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
+    8,5,5,5,	8,6,6,6,	8,5,5,5,	1,	0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
+    8,5,5,5,	8,5,5,5,	8,6,6,6,	1,	0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
+    7,6,6,6,	7,6,6,6,	7,6,6,6,	1,	0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
+    6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
+};
+
+// mapping of mode to the corresponding index in patterns[]; -1 marks a value with no entry in this table (NOTE(review): presumably modes of the one-region codec or unreachable encodings -- confirm)
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- the table holds -2 for these and read_header reports failure
+static const int mode_to_pat[MAXMODES] = {	
+    3,	// 0x00
+    8,	// 0x01
+    0,	// 0x02
+    -1,-1,-1,
+    1,	// 0x06
+    -1,-1,-1,
+    2,	// 0x0a
+    -1,-1,-1,
+    4,	// 0x0e
+    -1,-1,-1,
+    5,	// 0x12
+    -2,-1,-1,
+    6,	// 0x16
+    -2,-1,-1,
+    7,	// 0x1a
+    -2,-1,-1,
+    9,	// 0x1e
+    -2
+};
+
+#define	R_0(ep)	(ep)[0].A[i]	// region 0, endpoint A; these accessors rely on a channel index named i being in scope
+#define	R_1(ep)	(ep)[0].B[i]	// region 0, endpoint B
+#define	R_2(ep)	(ep)[1].A[i]	// region 1, endpoint A
+#define	R_3(ep)	(ep)[1].B[i]	// region 1, endpoint B
+#define	MASK(n)	((1<<(n))-1)	// value with the low n bits set
+
+// compress endpoints: mask every value to the pattern's per-channel bit widths; transformed patterns store the last three values as differences from region 0's A
+static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);	// base value, kept as is
+            R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);	// deltas against the base, truncated to their field widths
+            R_2(out) = (R_2(in) - R_0(in)) & MASK(p.chan[i].prec[2]);
+            R_3(out) = (R_3(in) - R_0(in)) & MASK(p.chan[i].prec[3]);
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);	// untransformed: every value is stored directly
+            R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
+            R_2(out) = R_2(in) & MASK(p.chan[i].prec[2]);
+            R_3(out) = R_3(in) & MASK(p.chan[i].prec[3]);
+        }
+    }
+}
+
+// decompress endpoints: inverse of compress_endpts; values are sign-extended only when Utils::FORMAT is SIGNED_F16
+static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    bool issigned = Utils::FORMAT == SIGNED_F16;
+
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            int t;	// each delta is always sign-extended, added to the base and wrapped to the base precision
+            t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+            t = SIGN_EXTEND(R_2(in), p.chan[i].prec[2]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_2(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+            t = SIGN_EXTEND(R_3(in), p.chan[i].prec[3]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_3(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);	// untransformed: values were stored directly
+            R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
+            R_2(out) = issigned ? SIGN_EXTEND(R_2(in),p.chan[i].prec[2]) : R_2(in);
+            R_3(out) = issigned ? SIGN_EXTEND(R_3(in),p.chan[i].prec[3]) : R_3(in);
+        }
+    }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
+{
+    for (int r = 0; r < NREGIONS_TWO; ++r)	// x/y/z of each float endpoint become integer channels 0/1/2
+    {
+        q_endpts[r].A[0] = Utils::quantize(endpts[r].A.x, prec);
+        q_endpts[r].A[1] = Utils::quantize(endpts[r].A.y, prec);
+        q_endpts[r].A[2] = Utils::quantize(endpts[r].A.z, prec);
+        q_endpts[r].B[0] = Utils::quantize(endpts[r].B.x, prec);
+        q_endpts[r].B[1] = Utils::quantize(endpts[r].B.y, prec);
+        q_endpts[r].B[2] = Utils::quantize(endpts[r].B.z, prec);
+    }
+}
+
+// swap A/B and mirror the indices of every region whose index at the compressed-index position has its high-order bit set
+static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+        const int ax = POS_TO_X(anchor);
+        const int ay = POS_TO_Y(anchor);
+        nvDebugCheck(REGION(ax,ay,shapeindex) == region);		// double check the table
+        if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+            continue;		// high bit already clear, nothing to do for this region
+        // exchange the two endpoints channel by channel
+        for (int ch=0; ch<NCHANNELS; ++ch)
+        {
+            const int tmp = endpts[region].A[ch];
+            endpts[region].A[ch] = endpts[region].B[ch];
+            endpts[region].B[ch] = tmp;
+        }
+
+        // mirror every index of this region (k becomes NINDICES-1-k) to match the exchanged endpoints
+        for (int y = 0; y < Tile::TILE_H; y++)
+            for (int x = 0; x < Tile::TILE_W; x++)
+                if (REGION(x,y,shapeindex) == region)
+                    indices[y][x] = NINDICES - 1 - indices[y][x];
+    }
+}
+
+// the endpoints "fit" a pattern only when compressing and then decompressing them is lossless
+static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
+{
+    IntEndpts roundtrip[NREGIONS_TWO];
+    // expand the compressed form again and compare it with the input, value by value
+    decompress_endpts(compressed, roundtrip, p);
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        for (int ch = 0; ch < NCHANNELS; ++ch)
+        {
+            const bool same_a = orig[region].A[ch] == roundtrip[region].A[ch];
+            const bool same_b = orig[region].B[ch] == roundtrip[region].B[ch];
+            if (!(same_a && same_b)) return false;
+        }
+    }
+    return true;
+}
+
+static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)
+{
+    // interpret the verilog backwards and process it: each parsed field selects which value's bits are written next
+    int m = p.mode;
+    int d = shapeindex;
+    int rw = endpts[0].A[0], rx = endpts[0].B[0], ry = endpts[1].A[0], rz = endpts[1].B[0];	// channel 0 (r): w/x = region 0 A/B, y/z = region 1 A/B
+    int gw = endpts[0].A[1], gx = endpts[0].B[1], gy = endpts[1].A[1], gz = endpts[1].B[1];	// channel 1 (g)
+    int bw = endpts[0].A[2], bx = endpts[0].B[2], by = endpts[1].A[2], bz = endpts[1].B[2];	// channel 2 (b)
+    int ptr = int(strlen(p.encoding));	// start at the end of the encoding string
+    while (ptr)	// NOTE(review): Utils::parse presumably moves ptr towards 0 as it consumes fields -- confirm
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+        switch(field)
+        {
+        case FIELD_M:	out.write( m >> endbit, len); break;
+        case FIELD_D:	out.write( d >> endbit, len); break;
+        case FIELD_RW:	out.write(rw >> endbit, len); break;
+        case FIELD_RX:	out.write(rx >> endbit, len); break;
+        case FIELD_RY:	out.write(ry >> endbit, len); break;
+        case FIELD_RZ:	out.write(rz >> endbit, len); break;
+        case FIELD_GW:	out.write(gw >> endbit, len); break;
+        case FIELD_GX:	out.write(gx >> endbit, len); break;
+        case FIELD_GY:	out.write(gy >> endbit, len); break;
+        case FIELD_GZ:	out.write(gz >> endbit, len); break;
+        case FIELD_BW:	out.write(bw >> endbit, len); break;
+        case FIELD_BX:	out.write(bx >> endbit, len); break;
+        case FIELD_BY:	out.write(by >> endbit, len); break;
+        case FIELD_BZ:	out.write(bz >> endbit, len); break;
+        default: nvUnreachable();
+        }
+    }
+}
+
+static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;		// 5-bit mode: three more bits go above the first two
+
+    int pat_index = mode_to_pat[mode];
+
+    nvDebugCheck (pat_index != -1);		// -1 means the block's mode has no pattern in this table
+    if (pat_index < 0)
+        return false;		// reserved (-2) or unmapped (-1) mode: never index patterns[] with a negative value
+    nvDebugCheck (pat_index < NPATTERNS);
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    int d;
+    int rw, rx, ry, rz;
+    int gw, gx, gy, gz;
+    int bw, bx, by, bz;
+
+    d = 0;
+    rw = rx = ry = rz = 0;
+    gw = gx = gy = gz = 0;
+    bw = bx = by = bz = 0;
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        switch(field)
+        {
+        case FIELD_M:	break;	// already processed so ignore
+        case FIELD_D:	 d |= in.read(len) << endbit; break;
+        case FIELD_RW:	rw |= in.read(len) << endbit; break;
+        case FIELD_RX:	rx |= in.read(len) << endbit; break;
+        case FIELD_RY:	ry |= in.read(len) << endbit; break;
+        case FIELD_RZ:	rz |= in.read(len) << endbit; break;
+        case FIELD_GW:	gw |= in.read(len) << endbit; break;
+        case FIELD_GX:	gx |= in.read(len) << endbit; break;
+        case FIELD_GY:	gy |= in.read(len) << endbit; break;
+        case FIELD_GZ:	gz |= in.read(len) << endbit; break;
+        case FIELD_BW:	bw |= in.read(len) << endbit; break;
+        case FIELD_BX:	bx |= in.read(len) << endbit; break;
+        case FIELD_BY:	by |= in.read(len) << endbit; break;
+        case FIELD_BZ:	bz |= in.read(len) << endbit; break;
+        default: nvUnreachable();
+        }
+    }
+
+    nvDebugCheck (in.getptr() == 128 - 46);		// everything except the index bits has been consumed
+
+    shapeindex = d;
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;
+
+    return true;
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    // emit the per-pixel indices in linear position order (x = pos & 3, y = (pos >> 2) & 3)
+    // the per-region compressed-index positions are written with one bit less than the others
+    int anchors[NREGIONS_TWO];
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+        anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        // full width unless this position is one of the compressed-index positions
+        int nbits = INDEXBITS;
+        for (int region = 0; region < NREGIONS_TWO; ++region)
+            if (anchors[region] == pos) { nbits = INDEXBITS - 1; break; }
+
+        const int col = POS_TO_X(pos);
+        const int row = POS_TO_Y(pos);
+        out.write(indices[row][col], nbits);
+    }
+}
+
+static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits out(block, ZOH::BITSIZE);
+    // header first (mode, shape index and endpoint bits as laid out by p.encoding) ...
+    write_header(compr_endpts, shapeindex, p, out);
+    // ... then the per-pixel indices
+    write_indices(indices, shapeindex, out);
+
+    nvDebugCheck(out.getptr() == ZOH::BITSIZE);		// header plus indices must fill the block exactly
+}
+
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // scale endpoints: unquantize A and B of one channel at a time, then fill that component of every palette entry
+    int a, b;			// really need a IntVector3...
+
+    a = Utils::unquantize(endpts.A[0], prec);
+    b = Utils::unquantize(endpts.B[0], prec);
+
+    // interpolate channel 0 into palette[].x
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[1], prec);
+    b = Utils::unquantize(endpts.B[1], prec);
+
+    // interpolate channel 1 into palette[].y
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[2], prec);
+    b = Utils::unquantize(endpts.B[2], prec);
+
+    // interpolate channel 2 into palette[].z
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    // read the per-pixel indices in linear position order (x = pos & 3, y = (pos >> 2) & 3)
+    // the per-region compressed-index positions were stored with one bit less than the others
+    int anchors[NREGIONS_TWO];
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+        anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        // full width unless this position is one of the compressed-index positions
+        int nbits = INDEXBITS;
+        for (int region = 0; region < NREGIONS_TWO; ++region)
+            if (anchors[region] == pos) { nbits = INDEXBITS - 1; break; }
+
+        const int col = POS_TO_X(pos);
+        const int row = POS_TO_Y(pos);
+        indices[row][col] = in.read(nbits);
+    }
+}
+
+void ZOH::decompresstwo(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+    // decode order: header (mode, shape, compressed endpoints), endpoint expansion, palettes, indices, lookup
+    Pattern p;
+    IntEndpts endpts[NREGIONS_TWO];
+    ComprEndpts compr_endpts[NREGIONS_TWO];
+    int shapeindex;
+
+    if (!read_header(in, compr_endpts, shapeindex, p))
+    {
+        // reserved mode, return all zeroes
+        for (int y = 0; y < Tile::TILE_H; y++)
+            for (int x = 0; x < Tile::TILE_W; x++)
+                t.data[y][x] = Vector3(0.0f);
+
+        return;
+    }
+
+    decompress_endpts(compr_endpts, endpts, p);
+    // one palette per region, built at the pattern's base precision
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
+
+    int indices[Tile::TILE_H][Tile::TILE_W];
+
+    read_indices(in, shapeindex, indices);
+
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);	// the whole block must have been consumed
+
+    // lookup: each pixel takes the palette entry of its region selected by its index
+    for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+        t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    float toterr = 0;
+
+    generate_palette_quantized(endpts, prec, palette);
+
+    // per color: accumulate the smallest importance-weighted error against any palette entry
+    for (int i = 0; i < np; ++i)
+    {
+        float err, besterr;
+
+        besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    // per pixel: keep the palette entry with the smallest error and add that error to the pixel's
+    // region total; note the error here is NOT importance-weighted, unlike in map_colors
+    for (int y = 0; y < tile.size_y; y++)
+    for (int x = 0; x < tile.size_x; x++)
+    {
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+        indices[y][x] = 0;
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+            {
+                besterr = err;
+                indices[y][x] = i;
+            }
+        }
+        toterr[region] += besterr;
+    }
+}
+
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // we have the old endpoints: old_endpts
+    // we have the perturbed endpoints: new_endpts (output: old_endpts with channel ch of A or B moved)
+    // we have the temporary endpoints: temp_endpts (the trial evaluated by map_colors)
+
+    IntEndpts temp_endpts;
+    float min_err = old_err;		// start with the best current error
+    int beststep = 0;			// signed step of the best trial; only applied after an improving trial set it
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for this endpoint (selected by do_b: 0 = A, otherwise B)
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        bool improved = false;
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;		// trial value does not fit in prec bits
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;		// trial value does not fit in prec bits
+            }
+
+            float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                improved = true;
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+        // if this was an improvement, move the endpoint and continue search from there
+        if (improved)
+        {
+            if (do_b == 0)
+                new_endpts.A[ch] += beststep;
+            else
+                new_endpts.B[ch] += beststep;
+        }
+    }
+    return min_err;
+}
+
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    float opt_err = orig_err;	// best error so far; starts at the caller-supplied error
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        opt_endpts.A[ch] = orig_endpts.A[ch];
+        opt_endpts.B[ch] = orig_endpts.B[ch];
+    }
+    /* pseudo-code of the per-channel search implemented below:
+        err0 = perturb(rgb0, delta0)
+        err1 = perturb(rgb1, delta1)
+        if (err0 < err1)
+            if (err0 >= initial_error) break
+            rgb0 += delta0
+            next = 1
+        else
+            if (err1 >= initial_error) break
+            rgb1 += delta1
+            next = 0
+        initial_err = map()
+        for (;;)
+            err = perturb(next ? rgb1:rgb0, delta)
+            if (err >= initial_err) break
+            next? rgb1 : rgb0 += delta
+            initial_err = err
+    */
+    IntEndpts new_a, new_b;
+    IntEndpts new_endpt;
+    int do_b;	// which endpoint to perturb next: 0 = A, 1 = B
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minimum
+        float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
+
+        if (err0 < err1)
+        {
+            if (err0 >= opt_err)
+                continue;	// no improvement possible on this channel
+
+            opt_endpts.A[ch] = new_a.A[ch];
+            opt_err = err0;
+            do_b = 1;		// do B next
+        }
+        else
+        {
+            if (err1 >= opt_err)
+                continue;	// no improvement possible on this channel
+            opt_endpts.B[ch] = new_b.B[ch];
+            opt_err = err1;
+            do_b = 0;		// do A next
+        }
+
+        // now alternate endpoints and keep trying until there is no improvement
+        for (;;)
+        {
+            float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
+            if (err >= opt_err)
+                break;
+            if (do_b == 0)
+                opt_endpts.A[ch] = new_endpt.A[ch];
+            else
+                opt_endpts.B[ch] = new_endpt.B[ch];
+            opt_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO], 
+                            const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
+{
+    // Runs optimize_one on each of the two regions of the shape, handing it only the pixels
+    // (and the matching importance values) that REGION() assigns to that region.
+    Vector3 region_pixels[Tile::TILE_TOTAL];
+    float region_importance[Tile::TILE_TOTAL];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+    {
+        int count = 0;     // number of tile pixels gathered for region r
+        for (int row = 0; row < tile.size_y; ++row)
+        {
+            for (int col = 0; col < tile.size_x; ++col)
+            {
+                if (REGION(col,row,shapeindex) != r)
+                    continue;
+                region_pixels[count] = tile.data[row][col];
+                region_importance[count] = tile.importance_map[row][col];
+                ++count;
+            }
+        }
+        optimize_one(region_pixels, region_importance, count, orig_err[r], orig_endpts[r], prec, opt_endpts[r]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)
+{
+    // Quantizes endpts for each pattern in turn and emits a block for the first pattern the
+    // endpoints fit, using the optimized endpoints only when they still fit and lower the total
+    // error. Returns the total error of whatever was written to block.
+    float err_before[NREGIONS_TWO], err_after[NREGIONS_TWO];
+    IntEndpts ep_before[NREGIONS_TWO], ep_after[NREGIONS_TWO];
+    ComprEndpts compr_before[NREGIONS_TWO], compr_after[NREGIONS_TWO];
+    int idx_before[Tile::TILE_H][Tile::TILE_W], idx_after[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        // one precision is used for the whole pattern, so every channel has to agree with channel 0
+        const int prec = patterns[sp].chan[0].prec[0];
+        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
+        quantize_endpts(endpts, prec, ep_before);
+        assign_indices(tile, shapeindex_best, ep_before, prec, idx_before, err_before);
+        swap_indices(ep_before, idx_before, shapeindex_best);
+        compress_endpts(ep_before, compr_before, patterns[sp]);
+        if (!endpts_fit(ep_before, compr_before, patterns[sp]))
+            continue;   // try the next pattern
+
+        optimize_endpts(tile, shapeindex_best, err_before, ep_before, prec, ep_after);
+        assign_indices(tile, shapeindex_best, ep_after, prec, idx_after, err_after);
+        swap_indices(ep_after, idx_after, shapeindex_best);
+        compress_endpts(ep_after, compr_after, patterns[sp]);
+        float total_before = 0, total_after = 0;
+        for (int r = 0; r < NREGIONS_TWO; ++r) { total_before += err_before[r]; total_after += err_after[r]; }
+        if (endpts_fit(ep_after, compr_after, patterns[sp]) && total_after < total_before)
+        {
+            emit_block(compr_after, shapeindex_best, patterns[sp], idx_after, block);
+            return total_after;
+        }
+        // optimization either broke the fit or did not help: fall back to the unoptimized
+        // endpoints, which are known to fit
+        emit_block(compr_before, shapeindex_best, patterns[sp], idx_before, block);
+        return total_before;
+    }
+    nvAssert(false); //throw "No candidate found, should never happen (refinetwo.)";
+    return FLT_MAX;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES])
+{   // palette[r][k] = Utils::lerp(A, B, k, DENOM) for every region r and every index k
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        for (int k = 0; k < NINDICES; ++k)
+            palette[r][k] = Utils::lerp(endpts[r].A, endpts[r].B, k, DENOM);
+}
+
+// build a palette from the unquantized endpoints, pick the best palette entry for every pixel of the tile,
+// and return the importance-scaled error summed over both regions
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
+{
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+
+    for (int y = 0; y < tile.size_y; y++)
+    {
+        for (int x = 0; x < tile.size_x; x++)
+        {
+            const int region = REGION(x,y,shapeindex);
+
+            // The palette is walked in index order, starting from entry 0. The walk stops at the
+            // first entry whose error is larger than the best so far, or as soon as the best
+            // error is no longer positive.
+            float besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+            for (int i = 1; i < NINDICES && besterr > 0; ++i)
+            {
+                const float err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+                if (err > besterr)	// error increased, so we're done searching
+                    break;
+                if (err < besterr)
+                    besterr = err;
+            }
+            toterr += besterr;
+        }
+    }
+    return toterr;
+}
+
+float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])
+{
+    // Picks rough floating-point endpoints for both regions of the given shape and returns the
+    // error map_colors reports for them. Regions with three or more pixels get endpoints along
+    // the principal component of their colors; smaller regions are handled directly.
+    for (int region=0; region<NREGIONS_TWO; ++region)
+    {
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+        int np = 0;
+
+        // gather the pixels that belong to this region and accumulate their sum in mean
+        for (int y = 0; y < tile.size_y; y++)
+        {
+            for (int x = 0; x < tile.size_x; x++)
+            {
+                if (REGION(x,y,shapeindex) != region)
+                    continue;
+                colors[np] = tile.data[y][x];
+                mean += tile.data[y][x];
+                ++np;
+            }
+        }
+
+        // handle simple cases: regions with fewer than three pixels need no fitting
+        if (np == 0)
+        {
+            const Vector3 zero(0,0,0);
+            endpts[region].A = zero;
+            endpts[region].B = zero;
+            continue;
+        }
+        if (np <= 2)
+        {
+            // one pixel: both endpoints are that pixel; two pixels: one endpoint each
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[np-1];
+            continue;
+        }
+
+        mean /= float(np);
+        const Vector3 axis = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // extent of the pixels along the principal axis, measured relative to the mean
+        float lo = FLT_MAX, hi = -FLT_MAX;
+        for (int i = 0; i < np; i++)
+        {
+            const float t = dot(colors[i]-mean, axis);
+            if (t < lo) lo = t;
+            if (t > hi) hi = t;
+        }
+
+        // the endpoints are the two points on the axis that bracket all of the projections
+        endpts[region].A = mean + lo*axis;
+        endpts[region].B = mean + hi*axis;
+
+        // clamp now: the actual endpoints have to be clamped anyway, so the best shape should
+        // be chosen with clamped endpoints
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compresstwo(const Tile &t, char *block)
+{
+    // Tries every shape with roughtwo, keeps the shape (and its rough endpoints) with the lowest
+    // error, then hands that shape to refinetwo to emit the block; returns refinetwo's error.
+    int shapeindex_best = 0;
+    FltEndpts endptsbest[NREGIONS_TWO], tempendpts[NREGIONS_TWO];
+    float msebest = FLT_MAX;
+    /*
+    collect the mse values that are within 5% of the best values
+    optimize each one and choose the best
+    */
+    // hack for now -- just use the best value WORK
+    for (int i=0; i<NSHAPES && msebest>0.0; ++i)
+    {
+        const float mse = roughtwo(t, i, tempendpts);
+        const bool better = mse < msebest;
+        // Shape 0 always seeds endptsbest: if no error ever compares below FLT_MAX (NaN or
+        // infinite error), refinetwo would otherwise be handed uninitialized endpoints.
+        if (better || i == 0)
+            memcpy(endptsbest, tempendpts, sizeof(endptsbest));
+        if (better) { msebest = mse; shapeindex_best = i; }
+    }
+    return refinetwo(t, shapeindex_best, endptsbest, block);
+}
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.cpp
new file mode 100644
index 0000000..8e0b169
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.cpp
@@ -0,0 +1,264 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the avpcl compressor and decompressor
+
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include <string.h>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// global flags -- all of them start out false
+bool AVPCL::flag_premult = false;	// NOTE(review): presumably selects a premultiplied-alpha error metric -- confirm where it is read
+bool AVPCL::flag_nonuniform = false;	// NOTE(review): meaning not visible in this file -- confirm where it is read
+bool AVPCL::flag_nonuniform_ati = false;	// NOTE(review): meaning not visible in this file -- confirm where it is read
+
+// global mode
+bool AVPCL::mode_rgb = false;		// true if image had constant alpha = 255
+
+void AVPCL::compress(const Tile &t, char *block)
+{
+	// Runs all eight mode compressors on the tile, in mode order, and leaves in block the output of
+	// the mode that reports the lowest error. A later mode replaces an earlier one only when its
+	// error is strictly lower; block is written only if some mode reports an error below FLT_MAX.
+	typedef float (*ModeCompressor)(const Tile &, char *);
+	static const ModeCompressor compressors[] =
+	{
+		AVPCL::compress_mode0,
+		AVPCL::compress_mode1,
+		AVPCL::compress_mode2,
+		AVPCL::compress_mode3,
+		AVPCL::compress_mode4,
+		AVPCL::compress_mode5,
+		AVPCL::compress_mode6,
+		AVPCL::compress_mode7,
+	};
+	const int nmodes = int(sizeof(compressors) / sizeof(compressors[0]));
+	char candidate[AVPCL::BLOCKSIZE];	// output of the mode currently being tried
+	float lowest = FLT_MAX;				// lowest error reported so far
+	for (int m = 0; m < nmodes; ++m)
+	{
+		const float mse = compressors[m](t, candidate);
+		if (mse < lowest)
+		{
+			lowest = mse;
+			memcpy(block, candidate, AVPCL::BLOCKSIZE);
+		}
+	}
+}
+
+/*
+static int getbit(char *b, int start)
+{
+	if (start < 0 || start >= 128) return 0; // out of range
+
+	int ix = start >> 3;
+	return (b[ix] & (1 << (start & 7))) != 0;
+}
+
+static int getbits(char *b, int start, int len)
+{
+	int out = 0;
+	for (int i=0; i<len; ++i)
+		out |= getbit(b, start+i) << i;
+	return out;
+}
+
+static void setbit(char *b, int start, int bit)
+{
+	if (start < 0 || start >= 128) return; // out of range
+
+	int ix = start >> 3;
+
+	if (bit & 1)
+		b[ix] |= (1 << (start & 7));
+	else
+		b[ix] &= ~(1 << (start & 7));
+}
+
+static void setbits(char *b, int start, int len, int bits)
+{
+	for (int i=0; i<len; ++i)
+		setbit(b, start+i, bits >> i);
+}
+*/
+
+void AVPCL::decompress(const char *cblock, Tile &t)
+{
+	// Decodes one compressed block into t by dispatching on the mode that getmode reads from the
+	// block; the mode decoders are handed a private copy of the block.
+	typedef void (*ModeDecompressor)(const char *, Tile &);
+	static const ModeDecompressor decompressors[] =
+	{
+		AVPCL::decompress_mode0, AVPCL::decompress_mode1, AVPCL::decompress_mode2, AVPCL::decompress_mode3,
+		AVPCL::decompress_mode4, AVPCL::decompress_mode5, AVPCL::decompress_mode6, AVPCL::decompress_mode7,
+	};
+	char block[AVPCL::BLOCKSIZE];
+	memcpy(block, cblock, AVPCL::BLOCKSIZE);
+	const int mode = getmode(block);
+	if (mode >= 0 && mode <= 7)
+		decompressors[mode](block, t);
+	else if (mode == 8)	// return a black tile if you get a reserved mode
+	{
+		for (int row = 0; row < Tile::TILE_H; ++row)
+			for (int col = 0; col < Tile::TILE_W; ++col)
+				t.data[row][col].set(0, 0, 0, 0);
+	}
+	else { nvUnreachable(); }
+}
+
+/*
+void AVPCL::compress(string inf, string avpclf, string errf)
+{
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	Targa::read(inf, pixels, w, h);
+	FILE *avpclfile = fopen(avpclf.c_str(), "wb");
+	if (avpclfile == NULL) throw "Unable to open .avpcl file for write";
+	FILE *errfile = NULL;
+	if (errf != "")
+	{
+		errfile = fopen(errf.c_str(), "wb");
+		if (errfile == NULL) throw "Unable to open error file for write";
+	}
+
+	// Look at alpha channel and override the premult flag if alpha is constant (but only if premult is set)
+	if (AVPCL::flag_premult)
+	{
+		if (AVPCL::mode_rgb)
+		{
+			AVPCL::flag_premult = false;
+			cout << endl << "NOTE: Source image alpha is constant 255, turning off premultiplied-alpha error metric." << endl << endl;
+		}
+	}
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	clock_t start, prev, cur;
+
+	start = prev = clock();
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			if ((tilecnt%100) == 0) { cur = clock(); printf("Progress %d of %d, %5.2f seconds per 100 tiles\r", tilecnt, ntiles, float(cur-prev)/CLOCKS_PER_SEC); fflush(stdout); prev = cur; }
+
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			AVPCL::compress(t, block, errfile);
+			if (fwrite(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+		}
+	}
+
+	cur = clock();
+	printf("\nTotal time to compress: %.2f seconds\n\n", float(cur-start)/CLOCKS_PER_SEC);		// advance to next line finally
+
+	if (fclose(avpclfile)) throw "Close failed on .avpcl file";
+	if (errfile && fclose(errfile)) throw "Close failed on error file";
+}
+
+static int str2int(std::string s) 
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// avpcl file name is ...-w-h-RGB[A].avpcl, extract width and height
+static void extract(string avpclf, int &w, int &h, bool &mode_rgb)
+{
+	size_t n = avpclf.rfind('.', avpclf.length()-1);
+	size_t n1 = avpclf.rfind('-', n-1);
+	size_t n2 = avpclf.rfind('-', n1-1);
+	size_t n3 = avpclf.rfind('-', n2-1);
+	//	...-wwww-hhhh-RGB[A].avpcl
+	//     ^    ^    ^      ^
+	//     n3   n2   n1     n n3<n2<n1<n
+	string width = avpclf.substr(n3+1, n2-n3-1);
+	w = str2int(width);
+	string height = avpclf.substr(n2+1, n1-n2-1);
+	h = str2int(height);
+	string mode = avpclf.substr(n1+1, n-n1-1);
+	mode_rgb = mode == "RGB";
+}
+
+static int modehist[8];
+
+static void stats(char block[AVPCL::BLOCKSIZE])
+{
+	int m = AVPCL::getmode(block);
+	modehist[m]++;
+}
+
+static void printstats()
+{
+	printf("\nMode histogram: "); for (int i=0; i<8; ++i) { printf("%d,", modehist[i]); }
+	printf("\n");
+}
+
+void AVPCL::decompress(string avpclf, string outf)
+{
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	extract(avpclf, w, h, AVPCL::mode_rgb);
+	FILE *avpclfile = fopen(avpclf.c_str(), "rb");
+	if (avpclfile == NULL) throw "Unable to open .avpcl file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+		
+			AVPCL::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(avpclfile)) throw "Close failed on .avpcl file";
+
+	Targa::write(outf, pixels, w, h);
+
+	printstats();	// print statistics
+}
+*/
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.h b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.h
new file mode 100644
index 0000000..44ea504
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl.h
@@ -0,0 +1,99 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_H
+#define _AVPCL_H
+
+#include "tile.h"
+#include "bits.h"
+
+#define	DISABLE_EXHAUSTIVE	1	// define this if you don't want to spend a lot of time on exhaustive compression
+#define	USE_ZOH_INTERP		1	// use zoh interpolator, otherwise use exact avpcl interpolators
+#define	USE_ZOH_INTERP_ROUNDED 1	// use the rounded versions!
+
+namespace AVPCL {
+
+static const int NREGIONS_TWO	= 2;	// region count used by the two-region code
+static const int NREGIONS_THREE	= 3;	// region count used by the three-region code
+
+static const int BLOCKSIZE=16;	// size of one compressed block in bytes (blocks are char[BLOCKSIZE])
+static const int BITSIZE=128;	// size of one compressed block in bits
+
+// global flags
+extern bool flag_premult;
+extern bool flag_nonuniform;
+extern bool flag_nonuniform_ati;
+
+// global mode
+extern bool mode_rgb;		// true if image had constant alpha = 255
+
+void compress(const Tile &t, char *block);	// tries every mode and keeps the block with the lowest reported error
+void decompress(const char *block, Tile &t);	// decodes a block of any mode into t
+// per-mode entry points: compress_modeN fills block and returns the error it achieved (lower is better)
+float compress_mode0(const Tile &t, char *block);
+void decompress_mode0(const char *block, Tile &t);
+
+float compress_mode1(const Tile &t, char *block);
+void decompress_mode1(const char *block, Tile &t);
+
+float compress_mode2(const Tile &t, char *block);
+void decompress_mode2(const char *block, Tile &t);
+
+float compress_mode3(const Tile &t, char *block);
+void decompress_mode3(const char *block, Tile &t);
+
+float compress_mode4(const Tile &t, char *block);
+void decompress_mode4(const char *block, Tile &t);
+
+float compress_mode5(const Tile &t, char *block);
+void decompress_mode5(const char *block, Tile &t);
+
+float compress_mode6(const Tile &t, char *block);
+void decompress_mode6(const char *block, Tile &t);
+
+float compress_mode7(const Tile &t, char *block);
+void decompress_mode7(const char *block, Tile &t);
+
+inline int getmode(Bits &in)
+{
+	// Reads the mode prefix from the bit stream one bit at a time: the mode number is the count
+	// of 0 bits that precede the first 1 bit, and reading stops right after that 1 bit.
+	// Eight 0 bits in a row (all of them consumed) mean the reserved mode, reported as 8.
+	const int nmodes = 8;
+	const int reserved = nmodes;
+
+	for (int mode = 0; mode < nmodes; ++mode)
+	{
+		if (in.read(1))
+			return mode;
+	}
+	return reserved;
+}
+inline int getmode(const char *block)
+{
+	// Determines the mode from the first byte of the block: the mode number is the position of
+	// the lowest set bit among the byte's 8 bits (bit 0 -> mode 0, ..., bit 7 -> mode 7).
+	// A first byte with none of those bits set is the reserved mode, reported as 8.
+	const int first = block[0];
+
+	for (int mode = 0; mode < 8; ++mode)
+	{
+		const int probe = 1 << mode;
+		if (first & probe)
+			return mode;
+	}
+	return 8;	// reserved
+}
+
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode0.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode0.cpp
new file mode 100644
index 0000000..82dd607
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode0.cpp
@@ -0,0 +1,1066 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+//  x1		444.1x6 16p 45b (3bi)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include "nvmath/matrix.inl"
+#include "nvmath/fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <string.h>
+#include <float.h>
+
+#include "shapes_three.h"
+
+// use only the first 16 available shapes
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 16
+#define SHAPEBITS 4
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	8
+#define	INDEXBITS	3
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits	// field widths, in bits, of the endpoint values of one color channel
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel: endpoint A and B of each region, indexed with ABITINDEX(region) / BBITINDEX(region)
+};
+
+struct Pattern	// describes one way of laying out a block header; see the patterns table
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value; write_header emits it first, using modebits bits
+	int modebits;			// number of mode bits
+    const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red			green			blue			xfm	mode  mb  encoding   (six bit sizes per channel: A and B of each region)
+	4,4,4,4,4,4,	4,4,4,4,4,4,	4,4,4,4,4,4,	0,	0x1, 1, "",	// really 444.1 x 6
+};
+
+struct RegionPrec	// per-channel endpoint precisions of one region; users in this file add 1 when working on uncompressed endpoints
+{
+	int	endpt_a_prec[NCHANNELS_RGB];	// precision of endpoint A, one entry per channel
+	int endpt_b_prec[NCHANNELS_RGB];	// precision of endpoint B, one entry per channel
+};
+
+struct PatternPrec	// the endpoint precisions of every region for one pattern
+{
+	RegionPrec region_precs[NREGIONS];	// indexed by region
+};
+
+// this is the precision for each channel and region, one PatternPrec per entry of "patterns"
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 	// per region: endpoint A channels 0..2, then endpoint B channels 0..2
+};
+
+// number of bits needed to store n: the magnitude bits plus, when issigned, one sign bit (0 needs no bits at all)
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;
+
+	int count = 0;
+	if (n < 0)
+	{
+		// a negative value can only be stored in the signed case
+		nvAssert (issigned);
+		while (n < -1) { ++count; n >>= 1; }
+		return count + 1;
+	}
+
+	while (n != 0) { ++count; n >>= 1; }
+	return issigned ? count + 1 : count;
+}
+
+static void transform_forward(IntEndptsRGB_2 ep[NREGIONS])
+{	// stub: not expected to be called, since the only pattern in this file has transformed == 0
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS])
+{	// stub: decompress_mode0 only calls this when p.transformed is nonzero, which the only pattern here never sets
+	nvUnreachable();
+}
+
+// drop the low bit of every channel (e.g. 555 -> 444); a_lsb / b_lsb record whether at least 2 of an endpoint's dropped bits were 1
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts)
+{
+	int a_ones = 0;		// how many dropped low bits of endpoint A were 1
+	int b_ones = 0;		// how many dropped low bits of endpoint B were 1
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		const int full = endpts.A[j];
+		a_ones += full & 1;
+		compr_endpts.A[j] = full >> 1;
+		nvAssert (compr_endpts.A[j] < 16);
+	}
+	// same again for endpoint B
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		const int full = endpts.B[j];
+		b_ones += full & 1;
+		compr_endpts.B[j] = full >> 1;
+		nvAssert (compr_endpts.B[j] < 16);
+	}
+	compr_endpts.a_lsb = a_ones >= 2;
+	compr_endpts.b_lsb = b_ones >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts)
+{
+	// shift every channel up by one bit and put the endpoint's shared lsb into bit 0
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+		endpts.A[ch] = (compr_endpts.A[ch] << 1) | compr_endpts.a_lsb;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+		endpts.B[ch] = (compr_endpts.B[ch] << 1) | compr_endpts.b_lsb;
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_2 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{	// applies uncompress_one to the endpoint pair of every region
+	for (int region = 0; region < NREGIONS; ++region)
+		uncompress_one(compr_endpts[region], endpts[region]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_2 compr_endpts[NREGIONS])
+{	// applies compress_one to the endpoint pair of every region
+	for (int region = 0; region < NREGIONS; ++region)
+		compress_one(endpts[region], compr_endpts[region]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_2 q_endpts[NREGIONS])
+{	// quantize each region at one bit more than the pattern precision, then let compress_one fold that extra bit into the shared lsb
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		IntEndptsRGB full;	// endpoints of this region in uncompressed space, hence the +1 below
+		full.A[0] = Utils::quantize(endpts[region].A.x, prec.endpt_a_prec[0]+1);
+		full.A[1] = Utils::quantize(endpts[region].A.y, prec.endpt_a_prec[1]+1);
+		full.A[2] = Utils::quantize(endpts[region].A.z, prec.endpt_a_prec[2]+1);
+		full.B[0] = Utils::quantize(endpts[region].B.x, prec.endpt_b_prec[0]+1);
+		full.B[1] = Utils::quantize(endpts[region].B.y, prec.endpt_b_prec[1]+1);
+		full.B[2] = Utils::quantize(endpts[region].B.z, prec.endpt_b_prec[2]+1);
+		compress_one(full, q_endpts[region]);
+	}
+}
+
+// for each region: if the index at the position given by SHAPEINDEX_TO_COMPRESSED_INDICES has its high-order bit set, swap the region's endpoints and mirror its indices
+static void swap_indices(IntEndptsRGB_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int x = POS_TO_X(position);
+		const int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (!(indices[y][x] & HIGH_INDEXBIT))
+			continue;	// high bit already 0, nothing to do for this region
+		for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+		{
+			const int held = endpts[region].A[ch];
+			endpts[region].A[ch] = endpts[region].B[ch];
+			endpts[region].B[ch] = held;
+		}
+		const int held_lsb = endpts[region].a_lsb;
+		endpts[region].a_lsb = endpts[region].b_lsb;
+		endpts[region].b_lsb = held_lsb;
+		// with A and B exchanged, every index k of this region becomes NINDICES-1-k
+		for (int row = 0; row < Tile::TILE_H; row++)
+			for (int col = 0; col < Tile::TILE_W; col++)
+				if (REGION(col,row,shapeindex) == region)
+					indices[row][col] = NINDICES - 1 - indices[row][col];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)
+{	// always reports a fit; neither argument is inspected
+	return true;
+}
+
+static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	// header layout: mode, shape index, endpoint values (channel by channel; within a channel
+	// region by region, A before B), then one lsb per endpoint (region by region, A before B)
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			out.write(endpts[region].A[ch], p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			out.write(endpts[region].B[ch], p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		out.write(endpts[region].a_lsb, 1);
+		out.write(endpts[region].b_lsb, 1);
+	}
+	nvAssert (out.getptr() == 83);
+}
+
+static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	// Reads the header in the order write_header emits it. getmode is called only to consume the
+	// mode bits: this file has a single pattern, so pat_index is always 0 and the value is unused.
+	AVPCL::getmode(in);
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			endpts[region].A[ch] = in.read(p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			endpts[region].B[ch] = in.read(p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		endpts[region].a_lsb  = in.read(1);
+		endpts[region].b_lsb  = in.read(1);
+	}
+	nvAssert (in.getptr() == 83);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// Writes the indices in position order. The positions that SHAPEINDEX_TO_COMPRESSED_INDICES
+	// returns for the regions are written with one bit fewer than the others.
+	int short_pos[NREGIONS];
+	for (int r = 0; r < NREGIONS; ++r)
+		short_pos[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		// INDEXBITS bits, or INDEXBITS-1 when pos is one of the shortened positions
+		int width = INDEXBITS;
+		for (int r = 0; r < NREGIONS; ++r)
+			if (short_pos[r] == pos) { width = INDEXBITS - 1; break; }
+
+		const int x = POS_TO_X(pos);
+		const int y = POS_TO_Y(pos);
+		out.write(indices[y][x], width);
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// Reads the indices in position order. The positions that SHAPEINDEX_TO_COMPRESSED_INDICES
+	// returns for the regions are read with one bit fewer than the others.
+	int short_pos[NREGIONS];
+	for (int r = 0; r < NREGIONS; ++r)
+		short_pos[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		// INDEXBITS bits, or INDEXBITS-1 when pos is one of the shortened positions
+		int width = INDEXBITS;
+		for (int r = 0; r < NREGIONS; ++r)
+			if (short_pos[r] == pos) { width = INDEXBITS - 1; break; }
+
+		const int x = POS_TO_X(pos);
+		const int y = POS_TO_Y(pos);
+		indices[y][x]= in.read(width);
+	}
+}
+
+static void emit_block(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{	// serializes one block into block: the header first, then the indices
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);	// header and indices together must fill the block exactly
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	// Builds the NINDICES-entry palette of one region: the endpoints are expanded with
+	// uncompress_one, unquantized channel by channel and interpolated with Utils::lerp.
+	// The w component of every entry is fixed at 255.
+	IntEndptsRGB endpts;
+	uncompress_one(endpts_2, endpts);
+
+	const int nchan = 3;	// x, y and z of the palette entries
+
+	for (int ch = 0; ch < nchan; ++ch)
+	{
+		// scale endpoints
+		// +1 since we are in uncompressed space
+		const int a = Utils::unquantize(endpts.A[ch], region_prec.endpt_a_prec[ch]+1);
+		const int b = Utils::unquantize(endpts.B[ch], region_prec.endpt_b_prec[ch]+1);
+
+		// interpolate
+		for (int i = 0; i < NINDICES; ++i)
+		{
+			const float value = float(Utils::lerp(a, b, i, BIAS, DENOM));
+			switch (ch)
+			{
+			case 0:		palette[i].x = value; break;
+			case 1:		palette[i].y = value; break;
+			default:	palette[i].z = value; break;
+			}
+		}
+	}
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGB_2 endpts[NREGIONS])
+{	// stub: decompress_mode0 only calls this when p.transformed is nonzero, which the only pattern here never sets
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode0(const char *block, Tile &t)
+{
+	// Decodes one block: read the header, build a palette per region, read the indices, then
+	// look every pixel up in the palette of its region.
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern pattern;
+	IntEndptsRGB_2 endpts[NREGIONS];
+	int shape, pattern_index;
+	read_header(in, endpts, shape, pattern, pattern_index);
+
+	// only taken for transformed patterns
+	if (pattern.transformed)
+	{
+		sign_extend(pattern, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized(endpts[region], pattern_precs[pattern_index].region_precs[region], &palette[region][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+	read_indices(in, shape, indices);
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int row = 0; row < Tile::TILE_H; row++)
+		for (int col = 0; col < Tile::TILE_W; col++)
+			t.data[row][col] = palette[REGION(col,row,shape)][indices[row][col]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	// current_err is a cutoff: once toterr exceeds it the search is abandoned and FLT_MAX is returned
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];	// error weighted by the pixel's importance
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;	// indices[0..np) hold the chosen palette entries
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// only the tile.size_x by tile.size_y pixels are visited; the error here is not weighted by importance
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;	// accumulate per region, not per tile
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+	// only component ch of endpoint A (do_b == 0) or endpoint B (do_b != 0) is ever changed
+	IntEndptsRGB_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;	// only read when 'improved' is set, which implies it was assigned
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)	// step halves every round until it reaches 0
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)	// try -step, then +step
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))	// outside the representable range
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))	// outside the representable range
+					continue;
+			}
+			// min_err doubles as the cutoff: map_colors returns FLT_MAX as soon as it is exceeded
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints of channel ch at least -3 to 3; if err > 5000 perturb them by 50% of precision,
+// if err > 1000 by 25%, if err > 200 by 12.5%, if err > 40 by 6.25%.
+// these thresholds are for np = 16 -- they are scaled by np / Tile::TILE_TOTAL.
+// endpoint ordering is always preserved (no need to overlap the scan).
+// both scan ranges are inclusive: [alow, ahigh] for A and [blow, bhigh] for B.
+// orig_err is in/out: on improvement it is lowered to the new error, opt_endpts receives the best A/B
+// for channel ch and indices[] receives the matching indices; otherwise indices[] holds -1's.
+// returns the best error found, which always equals orig_err on return.
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;	// nothing left to improve
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B (all four bounds are valid endpoint values)
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// overwritten whenever best_err improves
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b <= bhigh; ++b)	// bhigh is inclusive, exactly like ahigh
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)	// bhigh is inclusive, exactly like ahigh
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_2 &opt_endpts)
+{
+	float opt_err = orig_err;	// best error so far; this is also the return value
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_2 new_a, new_b;
+	IntEndptsRGB_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)	// no improvement on this channel, move on to the next one
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)	// no improvement on this channel, move on to the next one
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over (the loop increment brings ch back to 0)
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+		// NOTE(review): exhaustive() takes opt_err by reference and lowers it itself, so new_err == opt_err here and the branch below looks unreachable -- confirm
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// this will return a valid set of endpoints in opt_endpts regardless of whether it improves orig_endpts or not
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+		// start from the unoptimized endpoints and error so the outputs are valid even without any improvement
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+		// try every combination of the two lsb flags; bit 0 of lsbmode drives a_lsb, bit 1 drives b_lsb
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+			float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+			float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+// writes the encoded block for shape shapeindex_best into 'block' and returns its total error over all regions
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)	// the first pattern whose endpoints fit decides the result
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+    nvAssert(false); // throw "No candidate found, should never happen (mode avpcl 0).";
+	return FLT_MAX;	// reached only if no pattern fit; 'block' has not been written in that case
+}
+
+static void clamp(Vector4 &v)	// limit RGB to [0, 255] and force alpha to 255
+{
+	// every colour channel is limited on its own;
+	// a value already inside the range is written back unchanged
+	v.x = (v.x < 0.0f) ? 0.0f : ((v.x > 255.0f) ? 255.0f : v.x);
+	v.y = (v.y < 0.0f) ? 0.0f : ((v.y > 255.0f) ? 255.0f : v.y);
+	v.z = (v.z < 0.0f) ? 0.0f : ((v.z > 255.0f) ? 255.0f : v.z);
+	// alpha is not clamped but overwritten with the constant 255
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	// walk all NREGIONS*NINDICES palette slots in row order; each slot is Utils::lerp of its region's A and B with bias 0 and denominator DENOM
+	for (int slot = 0; slot < NREGIONS * NINDICES; ++slot)
+		palette[slot / NINDICES][slot % NINDICES] = Utils::lerp(endpts[slot / NINDICES].A, endpts[slot / NINDICES].B, slot % NINDICES, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	// no indices are recorded here: only the summed best-match error matters
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+// for this mode, we assume alpha = 255 constant and compress only the RGB portion.
+// however, we do the error check against the actual alpha values supplied for the tile.
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];	// alpha of the first two pixels only; used by the np == 1 and np == 2 cases
+		Vector4 mean(0,0,0,0);
+		// gather the RGB of every pixel of this region and accumulate the RGBA sum
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+	// the returned value is the total error of the tile against the unquantized palettes of these endpoints
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	const float fi = list1[i], fj = list1[j]; list1[i] = fj; list1[j] = fi;	// exchange the two float entries
+	const int ki = list2[i], kj = list2[j]; list2[i] = kj; list2[j] = ki;	// and the int entries at the same positions
+}
+
+float AVPCL::compress_mode0(const Tile &t, char *block)
+{
+	// how many of the roughly fitted shapes get the expensive refinement. reasonable values are 1, NSHAPES/4, and NSHAPES;
+	// NSHAPES/4 gets nearly all the cases (increase it by 3 or 4 if you really want to squeeze the last bit out)
+	const int NITEMS=NSHAPES/4;
+
+	FltEndpts candidates[NSHAPES][NREGIONS];	// rough endpoints of every shape
+	float rough_err[NSHAPES];	// rough error of every shape...
+	int order[NSHAPES];	// ...and the shape each error entry belongs to
+	char scratch[AVPCL::BLOCKSIZE];	// block written by the latest refine() call
+	float best = FLT_MAX;
+
+	// rough fit of every shape
+	for (int s=0; s<NSHAPES; ++s)
+	{
+		rough_err[s] = rough(t, s, candidates[s]);
+		order[s] = s;
+	}
+
+	// partial sort -- only the first NITEMS slots need to end up holding the smallest errors
+	for (int slot=0; slot<NITEMS; ++slot)
+	{
+		for (int other=slot+1; other<NSHAPES; ++other)
+			if (rough_err[other] < rough_err[slot]) swap(rough_err, order, slot, other);
+	}
+
+	for (int rank=0; rank<NITEMS && best>0; ++rank)	// refine the most promising shapes; stop once the error reaches 0
+	{
+		const int shape = order[rank];
+		const float err = refine(t, shape, candidates[shape], scratch);
+		if (err < best)
+		{
+			// keep the block of the best candidate seen so far
+			memcpy(block, scratch, sizeof(scratch));
+			best = err;
+		}
+	}
+	return best;
+}
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode1.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode1.cpp
new file mode 100644
index 0000000..fb1bfea
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode1.cpp
@@ -0,0 +1,1047 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10	(666x2).1 (666x2).1 64p 3bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include "nvmath/matrix.inl"
+#include "nvmath/fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <string.h>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	2		// number of different lsb modes per region. since we have one .1 per region, that can have 2 values
+
+#define NINDICES	8	// palette entries per region
+#define	INDEXBITS	3	// bits read/written per index (one less at the positions returned by SHAPEINDEX_TO_COMPRESSED_INDICES)
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// mask of the top bit of an index
+#define	DENOM		(NINDICES-1)	// denominator handed to Utils::lerp when building palettes
+#define	BIAS		(DENOM/2)	// bias handed to Utils::lerp for quantized palettes
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)	// low two bits of a tile position: the x coordinate
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)	// next two bits of a tile position: the y coordinate
+
+#define	NBITSIZES	(NREGIONS*2)	// one bit size per endpoint (A and B) of every region
+#define	ABITINDEX(region)	(2*(region)+0)	// slot of endpoint A of 'region' in ChanBits::nbitsizes
+#define	BBITINDEX(region)	(2*(region)+1)	// slot of endpoint B of 'region' in ChanBits::nbitsizes
+
+struct ChanBits	// bit widths of all endpoints for a single colour channel
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel, indexed with ABITINDEX(region) / BBITINDEX(region)
+};
+
+struct Pattern	// describes how the header of a block is laid out for one encoding pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value (written first by write_header)
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1	// this mode has a single pattern
+
+static Pattern patterns[NPATTERNS] =	// initializers fill Pattern's members in declaration order
+{
+	// red		green		blue		xfm	mode  mb
+	6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x2, 2, "",	// 6 bits for every endpoint, not transformed, mode 0x2 in 2 bits, empty encoding string
+};
+
+struct RegionPrec	// per-channel endpoint precisions for one region
+{
+	int	endpt_a_prec[NCHANNELS_RGB];	// precision of endpoint A, one entry per channel
+	int endpt_b_prec[NCHANNELS_RGB];	// precision of endpoint B, one entry per channel
+};
+
+struct PatternPrec	// precisions of every region for one pattern
+{
+	RegionPrec region_precs[NREGIONS];	// indexed by region
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =	// indexed by pattern, in step with patterns[]
+{
+	6,6,6, 6,6,6, 6,6,6, 6,6,6,	
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;	// no bits needed for 0, signed or not
+
+	int count = 0;
+	if (n < 0)
+	{
+		nvAssert (issigned);	// a negative value is only expected for signed quantities
+		while (n < -1) { ++count; n >>= 1; }	// shift until only -1 is left
+		return count + 1;	// one more bit on top of the shifts counted above
+	}
+
+	// positive: count the shifts needed to reach 0, plus one extra bit when signed
+	while (n != 0) { ++count; n >>= 1; }
+	return issigned ? count + 1 : count;
+}
+
+
+static void transform_forward(IntEndptsRGB_1 ep[NREGIONS])	// stub: ep is not touched
+{
+	nvUnreachable();	// the only entry of patterns[] has transformed == 0
+}
+
+static void transform_inverse(IntEndptsRGB_1 ep[NREGIONS])	// stub: ep is not touched
+{
+	nvUnreachable();	// the only entry of patterns[] has transformed == 0
+}
+
+// endpoints are 777,777; reduce to 666,666 and put the lsb bit majority in compr_endpts.lsb
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_1& compr_endpts)
+{
+	int ones = 0;	// how many of the dropped low bits are set
+
+	for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+	{
+		// tally the low bit of both endpoints before it is shifted out
+		ones += (endpts.A[ch] & 1) + (endpts.B[ch] & 1);
+
+		compr_endpts.A[ch] = endpts.A[ch] >> 1;
+		nvAssert (compr_endpts.A[ch] < 64);
+		compr_endpts.B[ch] = endpts.B[ch] >> 1;
+		nvAssert (compr_endpts.B[ch] < 64);
+	}
+	compr_endpts.lsb = (ones >= 3);	// shared lsb is 1 when at least three of the dropped bits were set
+}
+
+static void uncompress_one(const IntEndptsRGB_1& compr_endpts, IntEndptsRGB& endpts)
+{
+	for (int ch = 0; ch != NCHANNELS_RGB; ++ch)	// every component gets the shared lsb appended below its stored bits
+	{
+		endpts.A[ch] = compr_endpts.lsb | (compr_endpts.A[ch] << 1);
+		endpts.B[ch] = compr_endpts.lsb | (compr_endpts.B[ch] << 1);
+	}
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_1 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int region = 0; region != NREGIONS; ++region)	// expand the endpoints of each region in turn
+	{ uncompress_one(compr_endpts[region], endpts[region]); }
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_1 compr_endpts[NREGIONS])
+{
+	for (int region = 0; region != NREGIONS; ++region)	// compress the endpoints of each region in turn
+	{ compress_one(endpts[region], compr_endpts[region]); }
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_1 q_endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		IntEndptsRGB wide;	// quantized with one extra bit, +1 since we are in uncompressed space
+
+		wide.A[0] = Utils::quantize(endpts[region].A.x, prec.endpt_a_prec[0]+1);
+		wide.A[1] = Utils::quantize(endpts[region].A.y, prec.endpt_a_prec[1]+1);
+		wide.A[2] = Utils::quantize(endpts[region].A.z, prec.endpt_a_prec[2]+1);
+		wide.B[0] = Utils::quantize(endpts[region].B.x, prec.endpt_b_prec[0]+1);
+		wide.B[1] = Utils::quantize(endpts[region].B.y, prec.endpt_b_prec[1]+1);
+		wide.B[2] = Utils::quantize(endpts[region].B.z, prec.endpt_b_prec[2]+1);
+		compress_one(wide, q_endpts[region]);	// drop the extra bit again and derive the shared lsb
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB_1 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);	// tile position whose index is stored with one bit less
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+			// with A and B exchanged, index k becomes NINDICES-1-k; these x,y shadow the ones declared above
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_1 endpts[NREGIONS], const Pattern &p)	// neither argument is inspected
+{
+	return true;	// endpoints are always reported as fitting in this mode
+}
+
+
+static void write_header(const IntEndptsRGB_1 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);	// mode value first...
+	out.write(shapeindex, SHAPEBITS);	// ...then the shape index
+	// endpoints are stored channel by channel; within a channel, A then B of each region
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	// one shared lsb per region closes the header
+	for (int i=0; i<NREGIONS; ++i)
+		out.write(endpts[i].lsb, 1);
+
+	nvAssert (out.getptr() == 82);	// the header is expected to be exactly 82 bits long
+}
+
+static void read_header(Bits &in, IntEndptsRGB_1 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);	// the returned value is not used afterwards; the assert below expects 'in' to sit right after the mode bits
+
+	pat_index = 0;	// always the first pattern
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+	// same field order as write_header: per channel, A then B of each region
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	// one shared lsb per region closes the header
+	for (int i=0; i<NREGIONS; ++i)
+		endpts[i].lsb  = in.read(1);
+	
+	nvAssert (in.getptr() == 82);	// the header is expected to be exactly 82 bits long
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];	// per region: the tile position whose index is stored with one bit less
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+	// indices are written in position order 0 .. Tile::TILE_TOTAL-1
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;	// true when pos is one of the shortened positions
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];	// per region: the tile position whose index is stored with one bit less
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+	// indices are read in position order 0 .. Tile::TILE_TOTAL-1, mirroring write_indices
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;	// true when pos is one of the shortened positions
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB_1 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);	// bit writer over the output block
+	// header first...
+	write_header(endpts, shapeindex, p, out);
+	// ...then the per-pixel indices
+	write_indices(indices, shapeindex, out);
+	// together they must fill the block exactly
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+// Build the NINDICES-entry palette for one region from its stored endpoints.
+//   endpts_1    - endpoints of the region in their compressed form (shared lsb)
+//   region_prec - per-channel precision of each endpoint
+//   palette     - receives NINDICES colours; w is always written as 255
+static void generate_palette_quantized(const IntEndptsRGB_1 &endpts_1, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	// expand the stored endpoints to their uncompressed form before scaling them
+	IntEndptsRGB full;
+	uncompress_one(endpts_1, full);
+
+	// scale every channel of both endpoints up front
+	// (precision is +1 since we are in uncompressed space)
+	const int r0 = Utils::unquantize(full.A[0], region_prec.endpt_a_prec[0]+1);
+	const int r1 = Utils::unquantize(full.B[0], region_prec.endpt_b_prec[0]+1);
+	const int g0 = Utils::unquantize(full.A[1], region_prec.endpt_a_prec[1]+1);
+	const int g1 = Utils::unquantize(full.B[1], region_prec.endpt_b_prec[1]+1);
+	const int b0 = Utils::unquantize(full.A[2], region_prec.endpt_a_prec[2]+1);
+	const int b1 = Utils::unquantize(full.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// note: don't simplify to a + ((b-a)*i + BIAS)/DENOM as that doesn't work due to the way C handles integer division of negatives
+
+	// fill the palette one entry at a time; the three colour channels are
+	// independent of each other, so each entry can be completed in one pass
+	for (int idx = 0; idx < NINDICES; ++idx)
+	{
+		Vector4 &entry = palette[idx];
+
+		// interpolate between the two endpoints of each channel
+		entry.x = float(Utils::lerp(r0, r1, idx, BIAS, DENOM));
+		entry.y = float(Utils::lerp(g0, g1, idx, BIAS, DENOM));
+		entry.z = float(Utils::lerp(b0, b1, idx, BIAS, DENOM));
+
+		// constant alpha
+		entry.w = 255.0f;
+	}
+}
+
+// sign extend but only if it was transformed (stub: the only pattern of this mode has transformed == 0)
+static void sign_extend(Pattern &p, IntEndptsRGB_1 endpts[NREGIONS])
+{
+	nvUnreachable();	// called from the p.transformed branch of decompress_mode1
+}
+
+void AVPCL::decompress_mode1(const char *block, Tile &t)	// decode one compressed block into t.data
+{
+	Bits in(block, AVPCL::BITSIZE);	// bit reader over the block
+
+	Pattern p;
+	IntEndptsRGB_1 endpts[NREGIONS];
+	int shapeindex, pat_index;
+	// the header carries the endpoints, the shape index and selects the pattern
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)	// read_header copies patterns[0], whose transformed field is 0
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+	// one palette per region, built from that region's endpoints and precisions
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+	// the per-pixel palette indices follow the header in the bit stream
+	read_indices(in, shapeindex, indices);
+	// header plus indices must consume the block exactly
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup: REGION() picks the palette of the pixel, the index picks the entry
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr. Each color's error is metric4 * importance[i]; indices[i] receives the chosen entry. If the running total exceeds current_err, returns FLT_MAX with indices[i..np-1] set to -1.
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_1 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;	// never read: the float err declared in the inner loop shadows it
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)	// an exact match (besterr == 0) also ends the search
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)	// starts at i, so the index just chosen is overwritten as well
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region. The error here is plain metric4: unlike the quantized map_colors it is not multiplied by the importance map.
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_1 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;	// never read: the float err declared in the pixel loop shadows it
+
+	for (int y = 0; y < tile.size_y; y++)	// only the tile.size_y x tile.size_x pixels are visited
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;	// accumulated per region, not per tile
+	}
+}
+
+// Searches one endpoint component (A[ch] when do_b == 0, otherwise B[ch]) for a lower map_colors error, halving the step every round; the winning endpoints are left in new_endpts.
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's. this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_1 &old_endpts, IntEndptsRGB_1 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_1 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;					// only read after an improvement has assigned it
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)	// sign takes the values -1 and +1 only
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))	// candidate outside [0, 2^prec): skip it
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))	// candidate outside [0, 2^prec): skip it
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);	// FLT_MAX once min_err is exceeded
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// Small exhaustive search over both endpoint values of channel ch around opt_endpts, keeping their relative order.
+// the larger the error the more time it is worth spending: each endpoint is perturbed by at least -3 to 3 (both ends of every range are scanned), and
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if the value returned is less than orig_err, opt_endpts has been updated and indices[] contains valid indices; otherwise opt_endpts is untouched and indices[] is all -1
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGB_1 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_1 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B; all four bounds are inclusive and clamped to the representable range
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B: scan only pairs that keep the current ordering
+	const bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// best pair so far; replaced whenever a candidate improves the error
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b <= bhigh; ++b)	// bhigh is inclusive, like ahigh
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err)
+			{
+				amin = a;
+				bmin = b;
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)	// bhigh is inclusive, like ahigh
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err)
+			{
+				amin = a;
+				bmin = b;
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+// Optimizes the endpoints of one region channel by channel (perturb_one passes, then exhaustive passes), restarting at channel 0 whenever the chosen indices change. Returns the final error; the endpoints are returned in opt_endpts.
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_1 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_1 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_1 new_a, new_b;
+	IntEndptsRGB_1 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)	// no improvement on this channel: go on to the next one
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)	// no improvement on this channel: go on to the next one
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over (the loop's ++ch brings it back to channel 0)
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);	// updates opt_endpts itself on improvement
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+// For every region: gathers its pixels, then runs optimize_one once per lsb mode and keeps the best result. opt_endpts/opt_err start as the originals and are replaced only by a candidate whose error is below the best so far.
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							IntEndptsRGB_1 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_1 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_1 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];	// scratch output of map_colors; not read afterwards
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.lsb = lsbmode;	// same starting endpoints, different lsb value on each pass
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+            float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+			float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+// Runs the optimization algorithm described above for one shape and emits the result into block. Returns the total error of the emitted block (optimized or original endpoints), or FLT_MAX if no pattern fits.
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_1 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)	// the first pattern whose endpoints fit is emitted and ends the search
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why. NOTE(review): possibly because expected_opt_err comes from map_colors, which weights by importance, while assign_indices does not -- confirm
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			//nvAssert(opt_toterr <= orig_toterr);
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 1).";
+	return FLT_MAX;
+}
+
+// Clamps the x, y and z components of v to [0, 255] in place and forces w to 255.
+static void clamp(Vector4 &v)
+{
+	// a component cannot be below 0 and above 255 at once, so each one needs at most one correction
+	if (v.x < 0.0f) v.x = 0.0f; else if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f; else if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f; else if (v.z > 255.0f) v.z = 255.0f;
+
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	// palette[r][k] = Utils::lerp between the float endpoints A and B of region r at step k of DENOM, with a bias of 0
+	for (int r = 0; r < NREGIONS; ++r)
+		for (int k = 0; k < NINDICES; ++k) palette[r][k] = Utils::lerp(endpts[r].A, endpts[r].B, k, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined. Each pixel's error is metric4 weighted by tile.importance_map; no indices are stored.
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;	// never read: the float err declared in the inner loop shadows it
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)	// an exact match (besterr == 0) also ends the search
+		{
+			float err = Utils::metric4(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+// Picks rough float endpoints for every region of the given shape (written to endpts) and returns the error map_colors reports for them.
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];	// only the first two pixels' alpha is kept; it is used by the np == 1 and np == 2 cases
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;	// like the other simple cases, this skips the clamp() at the bottom of the loop
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	const float held_value = list1[j]; list1[j] = list1[i]; list1[i] = held_value;	// exchange entries i and j in both parallel arrays
+	const int held_tag = list2[j]; list2[j] = list2[i]; list2[i] = held_tag;
+}
+// Compresses tile t: gets a rough error for every shape, refines the NITEMS best ones, copies the best refined block into block and returns its error.
+float AVPCL::compress_mode1(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];	// index[k] is the shape whose error is stored in roughmse[k]; the two are swapped together
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)	// stops early once a block with zero error has been found
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode2.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode2.cpp
new file mode 100644
index 0000000..380ffce
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode2.cpp
@@ -0,0 +1,1004 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x100 555x6 64p 2bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include "nvmath/matrix.inl"
+#include "nvmath/fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <string.h>
+#include <float.h>
+
+#include "shapes_three.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	6
+// Field widths for one color channel; write_header/read_header index it as [region*2+0] for endpoint A and [region*2+1] for endpoint B.
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+// One block encoding: per-channel endpoint field widths plus the mode value and mode width that write_header emits first.
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+// The only pattern of this mode: 5 bits for each of the 6 endpoint fields of every channel, not transformed, mode value 0x4 written in 3 bits, empty encoding string.
+static Pattern patterns[NPATTERNS] =
+{
+	// red			green			blue			xfm	mode  mb
+	5,5,5,5,5,5,	5,5,5,5,5,5,	5,5,5,5,5,5,	0,	0x4, 3, "",
+};
+
+// Per-channel precisions of the two endpoints of one region, as passed to Utils::quantize / Utils::unquantize.
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+// Endpoint precisions of all NREGIONS_THREE regions for one pattern.
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS_THREE];
+};
+
+
+// this is the precision for each channel and region; the values fill, region by region, endpt_a_prec[0..2] and then endpt_b_prec[0..2]
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 
+};
+
+// Number of bits needed to store n: the magnitude bits plus one sign bit when issigned; 0 needs no bits at all.
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;	// no bits needed for 0, signed or not
+
+	int count = 0;
+	if (n < 0)
+	{
+		// negative values are only legal in the signed case
+		nvAssert (issigned);
+		while (n < -1) { n >>= 1; ++count; }	// shift until only -1 is left
+		return count + 1;
+	}
+
+	while (n != 0) { n >>= 1; ++count; }
+	return issigned ? count + 1 : count;
+}
+// Shorthand for the four endpoint values of regions 0 and 1 in channel i; they expand in place, so the using function must have `ep` and `i` in scope.
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+#define	R_2 ep[1].A[i]
+#define	R_3	ep[1].B[i]
+
+static void transform_forward(IntEndptsRGB ep[NREGIONS])
+{
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)	// ep[1].B is the base and stays as it is; the other three become offsets from it
+	{
+		ep[0].B[ch] -= ep[1].B[ch]; ep[1].A[ch] -= ep[1].B[ch]; ep[0].A[ch] -= ep[1].B[ch];
+	}
+}
+
+static void transform_inverse(IntEndptsRGB ep[NREGIONS])
+{
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)	// add the base ep[1].B back onto the three values stored as offsets
+	{
+		ep[0].A[ch] += ep[1].B[ch]; ep[1].A[ch] += ep[1].B[ch]; ep[0].B[ch] += ep[1].B[ch];
+	}
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, IntEndptsRGB q_endpts[NREGIONS_THREE])
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];	// precisions of this region's endpoints
+		const FltEndpts &src = endpts[region];
+		IntEndptsRGB &dst = q_endpts[region];
+		dst.A[0] = Utils::quantize(src.A.x, prec.endpt_a_prec[0]);	dst.B[0] = Utils::quantize(src.B.x, prec.endpt_b_prec[0]);
+		dst.A[1] = Utils::quantize(src.A.y, prec.endpt_a_prec[1]);	dst.B[1] = Utils::quantize(src.B.y, prec.endpt_b_prec[1]);
+		dst.A[2] = Utils::quantize(src.A.z, prec.endpt_a_prec[2]);	dst.B[2] = Utils::quantize(src.B.z, prec.endpt_b_prec[2]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit; both endpts and indices are modified in place
+static void swap_indices(IntEndptsRGB endpts[NREGIONS_THREE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)	// these x and y shadow the ones declared above
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];	// mirror the index to go with the swapped endpoints
+		}
+	}
+}
+// Always reports a fit here: neither the endpoints nor the pattern is inspected.
+static bool endpts_fit(IntEndptsRGB endpts[NREGIONS_THREE], const Pattern &p)
+{
+	return true;
+}
+
+// Writes the block header: mode, shape index, then for each channel the A and B endpoint values of every region, each with its width from p.
+static void write_header(const IntEndptsRGB endpts[NREGIONS_THREE], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)	// channel-major order: all regions of one channel before the next channel
+		for (int i=0; i<NREGIONS_THREE; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[i*2+0]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[i*2+1]);
+		}
+	nvAssert (out.getptr() == 99);	// with the single 3-bit-mode, 5-bit-field pattern this implies SHAPEBITS is 6 -- TODO confirm in shapes_three.h
+}
+// Reads the block header in the order write_header uses: mode, shape index, then the endpoint fields. pat_index is always 0 and p is a copy of patterns[0].
+static void read_header(Bits &in, IntEndptsRGB endpts[NREGIONS_THREE], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);	// value unused; presumably the call consumes the mode bits (the getptr assert below expects that) -- confirm
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)	// channel-major order: all regions of one channel before the next channel
+		for (int i=0; i<NREGIONS_THREE; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[i*2+0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[i*2+1]);
+		}
+	nvAssert (in.getptr() == 99);
+}
+
+
+// WORK PLACEHOLDER -- keep it simple for now
+// Writes all TILE_TOTAL indices in position order; the index at each region's compressed-index position is written with one bit fewer.
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int anchors[NREGIONS_THREE];
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+		for (int region = 0; region < NREGIONS_THREE; ++region)
+			if (anchors[region] == pos)
+			{
+				width = INDEXBITS - 1;
+				break;
+			}
+
+		out.write(indices[POS_TO_Y(pos)][POS_TO_X(pos)], width);
+	}
+}
+
+// Reads all TILE_TOTAL indices in position order; the index at each region's compressed-index position is read with one bit fewer.
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int anchors[NREGIONS_THREE];
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+		for (int region = 0; region < NREGIONS_THREE; ++region)
+			if (anchors[region] == pos)
+			{
+				width = INDEXBITS - 1;
+				break;
+			}
+
+		indices[POS_TO_Y(pos)][POS_TO_X(pos)] = in.read(width);
+	}
+}
+
+static void emit_block(const IntEndptsRGB endpts[NREGIONS_THREE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	// Serialize one block into `block`: the header goes first, the per-pixel indices follow.
+	Bits bitstream(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, bitstream);
+	write_indices(indices, shapeindex, bitstream);
+
+	nvAssert(bitstream.getptr() == AVPCL::BITSIZE);	// the write position must end exactly at BITSIZE
+}
+
+// Builds the NINDICES-entry palette of one region from its quantized endpoints.
+//   endpts      - quantized endpoints A and B of the region
+//   region_prec - per-channel precisions handed to Utils::unquantize for A and B
+//   palette     - output; x/y/z are interpolated per channel, w is always 255
+// Every entry is produced by Utils::lerp(lo, hi, i, BIAS, DENOM) on the unquantized channel endpoints.
+static void generate_palette_quantized(const IntEndptsRGB &endpts, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	// scale endpoints
+	int lo[NCHANNELS_RGB], hi[NCHANNELS_RGB];			// really need a IntVec4...
+
+	// unquantize both endpoints of every channel up front
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		lo[ch] = Utils::unquantize(endpts.A[ch], region_prec.endpt_a_prec[ch]);
+		hi[ch] = Utils::unquantize(endpts.B[ch], region_prec.endpt_b_prec[ch]);
+	}
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+	{
+		Vector4 &entry = palette[i];
+
+		entry.x = float(Utils::lerp(lo[0], hi[0], i, BIAS, DENOM));
+		entry.y = float(Utils::lerp(lo[1], hi[1], i, BIAS, DENOM));
+		entry.z = float(Utils::lerp(lo[2], hi[2], i, BIAS, DENOM));
+
+		// constant alpha
+		entry.w = 255.0f;
+	}
+}
+
+// sign extend but only if it was transformed: each endpoint field except endpts[0].A is sign extended in place from the width p gives for it
+static void sign_extend(Pattern &p, IntEndptsRGB endpts[NREGIONS_THREE])
+{
+	nvAssert (p.transformed != 0);
+
+	for (int i=0; i<NCHANNELS_RGB; ++i)
+	{
+		// endpts[0].A[i] (width nbitsizes[0]) is deliberately left alone -- always positive here
+		endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[1]);
+		endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[2]);
+		endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[3]);
+		endpts[2].A[i] = SIGN_EXTEND(endpts[2].A[i], p.chan[i].nbitsizes[4]);
+		endpts[2].B[i] = SIGN_EXTEND(endpts[2].B[i], p.chan[i].nbitsizes[5]);
+	}
+}
+// Decodes one block into t.data: header, optional inverse transform, one palette per region, then a per-pixel index lookup.
+void AVPCL::decompress_mode2(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB endpts[NREGIONS_THREE];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)	// the only entry of patterns[] has transformed == 0
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);	// header + indices must have consumed the read position up to BITSIZE
+
+	// lookup: every pixel takes the palette entry of its region selected by its index
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr; gives up and returns FLT_MAX once toterr exceeds current_err
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	// the per-candidate error is the float declared inside the search loop below
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)	// a zero error cannot be beaten, so stop there
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];	// distance to palette entry j, weighted by the pixel's importance
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_THREE])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// for every pixel pick the entry of its region's palette with the smallest metric4 error
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)	// a zero error cannot be beaten, so stop there
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;	// accumulate the error separately for each region
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error); only endpoint A (do_b == 0) or B (do_b != 0) of channel ch is moved
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB &old_endpts, IntEndptsRGB &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;	// only read after an improvement in the same step has assigned it
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which): the step starts at half the range and is halved each pass
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)	// try -step, then +step
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))	// candidate falls outside the representable range
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))	// candidate falls outside the representable range
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGB &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/* pseudo-code of the per-channel search performed below:
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB new_a, new_b;
+	IntEndptsRGB new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)	// no improvement on this channel, go to the next one
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)	// no improvement on this channel, go to the next one
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over (the loop increment brings ch back to 0)
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;	// never larger than orig_err; opt_endpts holds the endpoints that achieve it
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_THREE], 
+							const IntEndptsRGB orig_endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, float opt_err[NREGIONS_THREE], IntEndptsRGB opt_endpts[NREGIONS_THREE])
+{	// optimize each region's endpoints independently; opt_err / opt_endpts receive one entry per region (NREGIONS_THREE of them)
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB temp_in, temp_out;
+
+	for (int region=0; region<NREGIONS_THREE; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];	// default result: the unoptimized endpoints and error
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// make sure we have a valid error for temp_in
+		// we didn't change temp_in, so orig_err[region] is still valid
+		float temp_in_err = orig_err[region];
+
+		// now try to optimize these endpoints
+		float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+		// if we find an improvement, update the best so far and correct the output endpoints and errors
+		if (temp_out_err < best_err)
+		{
+			best_err = temp_out_err;
+			opt_err[region] = temp_out_err;
+			opt_endpts[region] = temp_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_THREE], char *block)
+{
+	float orig_err[NREGIONS_THREE], opt_err[NREGIONS_THREE], orig_toterr, opt_toterr, expected_opt_err[NREGIONS_THREE];	// optimize_endpts writes one expected_opt_err entry per region
+	IntEndptsRGB orig_endpts[NREGIONS_THREE], opt_endpts[NREGIONS_THREE];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)	// the first pattern whose endpoints fit is emitted and ends the search
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS_THREE; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS_THREE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 2).";
+	return FLT_MAX;
+
+}
+
+static void clamp(Vector4 &v)
+{
+	// confine each color channel to the range [0, 255];
+	// values already inside the range are left untouched
+	v.x = (v.x < 0.0f) ? 0.0f : ((v.x > 255.0f) ? 255.0f : v.x);
+	v.y = (v.y < 0.0f) ? 0.0f : ((v.y > 255.0f) ? 255.0f : v.y);
+	v.z = (v.z < 0.0f) ? 0.0f : ((v.z > 255.0f) ? 255.0f : v.z);
+	// alpha is not clamped but forced to 255
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_THREE], Vector4 palette[NREGIONS_THREE][NINDICES])
+{	// entry k of region r's palette is Utils::lerp between that region's A and B endpoints at step k of DENOM, with a bias of 0
+	for (int r = 0; r != NREGIONS_THREE; r++)
+		for (int k = 0; k != NINDICES; k++)
+			palette[r][k] = Utils::lerp(endpts[r].A, endpts[r].B, k, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_THREE])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	// the per-pixel error is the float declared inside the loop below
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)	// a zero error cannot be beaten, so stop there
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_THREE])	// estimate unquantized endpoints per region for this shape and return the resulting total error
+{
+	for (int region=0; region<NREGIONS_THREE; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];	// only the first two alphas are kept; they are used by the np == 1 and np == 2 cases
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases: zero, one or two pixels need no fitting
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);	// mean was accumulated as a sum above
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{	// exchange entries i and j in both arrays so the two lists stay paired
+	const float saved_value = list1[i]; const int saved_index = list2[i];
+	list1[i] = list1[j]; list2[i] = list2[j]; list1[j] = saved_value; list2[j] = saved_index;
+}
+
+float AVPCL::compress_mode2(const Tile &t, char *block)	// writes the best block found into block and returns its error
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS_THREE];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];	// index[i] is the shape that roughmse[i] belongs to; kept paired by swap()
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// partial sort -- only need to bring the NITEMS smallest errors to the front
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)	// stop early once an exact encoding has been found
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));	// keep the best encoding so far in the caller's buffer
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode3.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode3.cpp
new file mode 100644
index 0000000..0020d8a
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode3.cpp
@@ -0,0 +1,1059 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x1000 777.1x4 64p 2bi (30b)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include "nvmath/matrix.inl"
+#include "nvmath/fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <string.h>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel, indexed with ABITINDEX(region) / BBITINDEX(region)
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel (one ChanBits each for red, green, blue)
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value, written first in the block header
+	int modebits;			// number of mode bits used to write mode
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+#define	NREGIONS  2
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		xfm	mode  mb   (each color lists the bit sizes of region 0 A, region 0 B, region 1 A, region 1 B)
+	7,7,7,7,	7,7,7,7,	7,7,7,7,	0,	0x8, 4, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];	// precision in bits of endpoint A, per channel
+	int endpt_b_prec[NCHANNELS_RGB];	// precision in bits of endpoint B, per channel
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];	// one set of endpoint precisions per region
+};
+
+
+// this is the precision for each channel and region (laid out as region 0 A rgb, region 0 B rgb, region 1 A rgb, region 1 B rgb)
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7, 7,7,7, 7,7,7, 7,7,7,
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly (a negative n requires issigned)
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;	// count the magnitude bits
+		return nb + (issigned?1:0);	// a signed value needs one more bit for the sign
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;	// shift until only -1 is left
+		return nb + 1;	// plus the sign bit
+	}
+}
+
+static void transform_forward(IntEndptsRGB_2 ep[NREGIONS])	// must never be called: the only pattern in this file has transformed == 0
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS])	// must never be called: the only pattern in this file has transformed == 0
+{
+	nvUnreachable();
+}
+
+// endpoints are 888,888; reduce to 777,777 and put the lsb bit majority of each endpoint in a_lsb / b_lsb
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts)
+{
+	int onescnt;	// number of channels of the current endpoint whose low bit is set
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.A[j] & 1;
+		compr_endpts.A[j] = endpts.A[j] >> 1;	// drop the low bit
+		nvAssert (compr_endpts.A[j] < 128);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;	// shared lsb is 1 when at least two of the three channels had it set
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.B[j] & 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;	// drop the low bit
+		nvAssert (compr_endpts.B[j] < 128);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;	// shared lsb is 1 when at least two of the three channels had it set
+}
+
+static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts)	// rebuild full endpoints from the shortened channels and the shared lsb of each endpoint
+{
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;	// every channel of A gets the same low bit
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;	// every channel of B gets the same low bit
+	}
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_2 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{	// apply uncompress_one to the endpoints of every region
+	for (int region = 0; region != NREGIONS; region++)
+		uncompress_one(compr_endpts[region], endpts[region]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_2 compr_endpts[NREGIONS])
+{	// apply compress_one to the endpoints of every region
+	for (int region = 0; region != NREGIONS; region++)
+		compress_one(endpts[region], compr_endpts[region]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_2 q_endpts[NREGIONS])	// quantize float endpoints at precision+1 bits, then compress them into q_endpts
+{
+	IntEndptsRGB full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		compress_one(full_endpts[region], q_endpts[region]);	// split off the shared lsb of each endpoint
+	}
+}
+
+// for each region, make sure the index stored at the region's compressed-index position has a 0 high-order bit, swapping the endpoints when it does not
+static void swap_indices(IntEndptsRGB_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int ax = POS_TO_X(anchor);
+		const int ay = POS_TO_Y(anchor);
+
+		nvAssert(REGION(ax,ay,shapeindex) == region);		// double check the table
+		if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+			continue;	// high bit already clear, nothing to do for this region
+		// high bit is set: exchange endpoint A with endpoint B (channels and lsb bits) ...
+		IntEndptsRGB_2 &e = endpts[region];
+		for (int ch=0; ch<NCHANNELS_RGB; ++ch)
+		{
+			const int held = e.A[ch]; e.A[ch] = e.B[ch]; e.B[ch] = held;
+		}
+		const int held_lsb = e.a_lsb; e.a_lsb = e.b_lsb; e.b_lsb = held_lsb;
+
+		// ... and mirror every index of this region to match the exchanged endpoints
+		for (int py = 0; py < Tile::TILE_H; py++)
+		for (int px = 0; px < Tile::TILE_W; px++)
+			if (REGION(px,py,shapeindex) == region)
+				indices[py][px] = NINDICES - 1 - indices[py][px];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)	// both arguments are ignored here
+{
+	return true;	// always reports a fit in this mode
+}
+
+static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)	// write mode, shape index, endpoints and lsb bits, in that order
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)	// channel-major: all regions' A and B for red, then green, then blue
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)	// the shared lsb bits follow the endpoints, region by region
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
+	nvAssert (out.getptr() == 98);	// the header is expected to occupy exactly 98 bits
+}
+
+static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)	// read the fields in the same order write_header emits them
+{
+	int mode = AVPCL::getmode(in);	// value is not used below; NOTE(review): presumably this consumes the mode bits, as the assert on getptr() below suggests -- confirm
+
+	pat_index = 0;	// there is only one pattern in this file
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)	// channel-major, matching write_header
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 98);	// the header is expected to occupy exactly 98 bits
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)	// write all pixel indices; one position per region is written with one bit less
+{
+	int positions[NREGIONS];	// the compressed-index position of each region for this shape
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;	// true when pos is one of the compressed-index positions
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));	// swap_indices guarantees the dropped high bit is 0
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])	// read all pixel indices; one position per region is stored with one bit less
+{
+	int positions[NREGIONS];	// the compressed-index position of each region for this shape
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;	// true when pos is one of the compressed-index positions
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));	// mirrors the bit widths used by write_indices
+	}
+}
+
+static void emit_block(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)	// serialize header and indices into block
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);	// header plus indices must fill the block exactly
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])	// build the NINDICES palette colors for one region from its compressed endpoints
+{
+	IntEndptsRGB endpts;
+
+	uncompress_one(endpts_2, endpts);	// put the shared lsb back so the endpoints have precision+1 bits
+
+	// scale endpoints
+	int a, b;			// really need a IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate red
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate green
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate blue
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGB_2 endpts[NREGIONS])	// must never be called: the only pattern in this file has transformed == 0
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode3(const char *block, Tile &t)	// decode one compressed block into the pixels of t.data
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)	// not taken with the pattern table in this file, whose only entry has transformed == 0
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];	// one palette of NINDICES colors per region
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);	// header plus indices must account for every bit of the block
+
+	// lookup: each pixel takes the palette entry of its region that its index selects
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+            float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles: one quantized palette per region; the per-region error totals start at zero
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// for every pixel inside the tile's size_x by size_y extent: pick the lowest-error palette
+	// entry of the pixel's region, record it in indices[y][x], and add its error to toterr[region]
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+	// only channel ch of one endpoint is moved: endpoint A when do_b == 0, endpoint B otherwise
+	IntEndptsRGB_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;					// only read after an improvement has assigned it
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which): the step starts at 1 << (prec-1) and halves each round
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;	// candidate falls outside [0, 1<<prec), skip it
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;	// candidate falls outside [0, 1<<prec), skip it
+			}
+			// min_err is passed as the cutoff, so map_colors returns FLT_MAX for any candidate whose error exceeds it
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+	// opt_endpts starts as a copy of orig_endpts and is refined in place; the final error is returned
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_2 new_a, new_b;
+	IntEndptsRGB_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;	// neither endpoint improved this channel, go to the next one
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;	// neither endpoint improved this channel, go to the next one
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over (the loop's ++ch brings it back to channel 0)
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+		// NOTE(review): exhaustive() takes opt_err by reference and lowers it on success, so new_err < opt_err looks unsatisfiable afterwards -- confirm
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// this will return a valid set of endpoints in opt_endpts regardless of whether it improves orig_endpts or not
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+	// each region is optimized independently of the others
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region, together with their importance weights
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+		// default result: the unoptimized endpoints and their error
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+		// try every combination of the two lsb flags: bit 0 of lsbmode goes to a_lsb, bit 1 to b_lsb
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+            float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+            float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{	// Quantizes and optimizes `endpts` for the given shape, emits the block for the first pattern whose endpoints fit, and returns its total error.
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+	// patterns are tried in table order; the first one that fits produces the output and returns
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);	// undo the transform so optimization works on the real endpoints
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);	// re-apply the transform that was undone before optimizing
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 3).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	// Pin each color channel into [0, 255] with one expression per channel.
+	// A value that fails both comparisons (it is already inside the range)
+	// is written back unchanged.
+	v.x = (v.x < 0.0f) ? 0.0f : ((v.x > 255.0f) ? 255.0f : v.x);
+	v.y = (v.y < 0.0f) ? 0.0f : ((v.y > 255.0f) ? 255.0f : v.y);
+	v.z = (v.z < 0.0f) ? 0.0f : ((v.z > 255.0f) ? 255.0f : v.z);
+	v.w = 255.0f;	// alpha is overwritten unconditionally
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	// walk all (region, index) slots in region-major order with one flat counter
+	for (int n = 0; n < NREGIONS * NINDICES; ++n)
+		palette[n / NINDICES][n % NINDICES] = Utils::lerp(endpts[n / NINDICES].A, endpts[n / NINDICES].B, n % NINDICES, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	// only the error is accumulated here; no index is recorded for any pixel
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{	// Picks unquantized endpoints for every region of the given shape and returns the resulting total mapping error.
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];		// alpha of the first two pixels only; read just by the np == 1 and np == 2 cases
+		Vector4 mean(0,0,0,0);
+		// gather this region's pixels and accumulate their sum
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases: with 0, 1 or 2 pixels the endpoints are set directly (and are not clamped)
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+		// turn the accumulated sum into the mean
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+	// score the chosen endpoints against the whole tile
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{	// exchange entries i and j in both parallel arrays so each key keeps its payload
+	const float held_key = list1[j]; list1[j] = list1[i]; list1[i] = held_key;
+	const int held_val = list2[j]; list2[j] = list2[i]; list2[i] = held_val;
+}
+
+float AVPCL::compress_mode3(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+	// rough pass: estimate endpoints and error for every shape
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+	// refine the NITEMS best candidates (stopping early on a zero error) and keep the best encoded block
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode4.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode4.cpp
new file mode 100644
index 0000000..5115d7c
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode4.cpp
@@ -0,0 +1,1214 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10000 2r 1i 555x2 6x2 2bi 3bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include "nvmath/matrix.inl"
+#include "nvmath/fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <string.h>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// there are 2 index arrays. INDEXMODE selects between the arrays being 2 & 3 bits or 3 & 2 bits
+// array 0 is always the RGB array and array 1 is always the A array
+#define	NINDEXARRAYS	2
+#define	INDEXARRAY_RGB	0
+#define INDEXARRAY_A	1
+#define INDEXARRAY_2BITS(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+#define INDEXARRAY_3BITS(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+// constants for the 3-bit index array (8 palette entries)
+#define NINDICES3	8
+#define	INDEXBITS3	3
+#define	HIGH_INDEXBIT3	(1<<(INDEXBITS3-1))
+#define	DENOM3		(NINDICES3-1)
+#define	BIAS3		(DENOM3/2)
+// constants for the 2-bit index array (4 palette entries)
+#define NINDICES2	4
+#define	INDEXBITS2	2
+#define	HIGH_INDEXBIT2	(1<<(INDEXBITS2-1))
+#define	DENOM2		(NINDICES2-1)
+#define	BIAS2		(DENOM2/2)
+// per-indexmode selectors for RGB: when alpha takes the 2-bit array, RGB gets the 3-bit constants, and vice versa
+#define	NINDICES_RGB(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2)
+#define	INDEXBITS_RGB(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2)
+#define	HIGH_INDEXBIT_RGB(indexmode)(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2)
+#define	DENOM_RGB(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2)
+#define	BIAS_RGB(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2)
+// per-indexmode selectors for alpha (the mirror image of the RGB ones); the argument is parenthesized so any expression can be passed
+#define	NINDICES_A(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3)
+#define	INDEXBITS_A(indexmode)		(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3)
+#define	HIGH_INDEXBIT_A(indexmode)	(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3)
+#define	DENOM_A(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3)
+#define	BIAS_A(indexmode)			(((indexmode) == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3)
+
+#define	NSHAPES	1
+// one mask per shape; the only mask is all zeros, so REGION() evaluates to 0 for every (x,y)
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+// bit (15 - x - 4*y) of the shape mask gives the region of pixel (x,y)
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define NREGIONS	1			// keep the region stuff in just in case...
+
+// encoded index compression location: region 0 is always at 0,0.
+
+#define	NBITSIZES	2			// one endpoint pair
+
+struct ChanBits
+{	// bit widths for one channel; write_header/read_header use [0] for endpoint A and [1] for endpoint B
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{	// one header layout: per-channel endpoint bit widths plus the mode tag that write_header emits ahead of them
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transform_mode;		// x0 means alpha channel not transformed, x1 otherwise. 0x rgb not transformed, 1x otherwise.
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	TRANSFORM_MODE_ALPHA	1
+#define	TRANSFORM_MODE_RGB	2
+// the two flags above are the bits of Pattern::transform_mode tested by transform_forward/transform_inverse
+#define	NPATTERNS 1
+// the single pattern: xfm is 0x0, i.e. neither transform flag is set
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha	xfm	mode  mb encoding
+	5,5,		5,5,		5,5,		6,6,	0x0, 0x10, 5, "",
+};
+
+struct RegionPrec
+{	// per-channel precision of endpoints A and B for one region; these values are handed to Utils::quantize / Utils::unquantize
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{	// the endpoint precisions of every region for one pattern
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5,6,	5,5,5,6,	// endpt_a_prec[] values first, then endpt_b_prec[] (member order; assumes NCHANNELS_RGBA is 4 -- TODO confirm)
+};
+
+
+// number of bits needed to hold n, for either a signed or an unsigned field
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;	// zero needs no bits at all, signed or not
+	int count = 0;
+	if (n < 0)
+	{
+		// negative values are only accepted for signed fields
+		nvAssert (issigned);
+		while (n < -1) { n >>= 1; ++count; }
+		return count + 1;	// counted bits plus the sign bit
+	}
+
+	// positive: count the significant bits, then add a sign bit if one was requested
+	while (n != 0) { n >>= 1; ++count; }
+	return issigned ? count + 1 : count;
+}
+
+#define	R_0	ep[0].A[i]	// channel i of endpoint A, region 0; expands against the locals `ep` and `i` at the point of use
+#define	R_1 ep[0].B[i]	// channel i of endpoint B, region 0; same reliance on `ep` and `i`
+
+static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	// Rewrites endpoint B of region 0 as its difference from endpoint A,
+	// gated separately for the color channels and for alpha.
+	const bool rgb_on = (transform_mode & TRANSFORM_MODE_RGB) != 0;
+	const bool alpha_on = (transform_mode & TRANSFORM_MODE_ALPHA) != 0;
+
+	for (int ch = CHANNEL_R; rgb_on && ch < CHANNEL_A; ++ch)
+		ep[0].B[ch] -= ep[0].A[ch];
+
+	if (alpha_on)
+		ep[0].B[CHANNEL_A] -= ep[0].A[CHANNEL_A];
+}
+
+static void transform_inverse(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	// Adds endpoint A of region 0 back onto endpoint B,
+	// gated separately for the color channels and for alpha.
+	const bool rgb_on = (transform_mode & TRANSFORM_MODE_RGB) != 0;
+	const bool alpha_on = (transform_mode & TRANSFORM_MODE_ALPHA) != 0;
+
+	for (int ch = CHANNEL_R; rgb_on && ch < CHANNEL_A; ++ch)
+		ep[0].B[ch] += ep[0].A[ch];
+
+	if (alpha_on)
+		ep[0].B[CHANNEL_A] += ep[0].A[CHANNEL_A];
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA q_endpts[NREGIONS])
+{
+	// Quantizes the float endpoints of every region, channel by channel, with the
+	// per-channel precision listed for that region in pattern_prec.
+	for (int r = 0; r < NREGIONS; ++r)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[r];
+		// stage the x/y/z/w components so they can be walked by channel number
+		const float a[4] = { endpts[r].A.x, endpts[r].A.y, endpts[r].A.z, endpts[r].A.w };
+		const float b[4] = { endpts[r].B.x, endpts[r].B.y, endpts[r].B.z, endpts[r].B.w };
+		// endpoint A first, then endpoint B
+		for (int ch = 0; ch < 4; ++ch) q_endpts[r].A[ch] = Utils::quantize(a[ch], prec.endpt_a_prec[ch]);
+		for (int ch = 0; ch < 4; ++ch) q_endpts[r].B[ch] = Utils::quantize(b[ch], prec.endpt_b_prec[ch]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_one and index_two have a 0 high-order bit
+// index_two is 0 at x=0 y=0 and 15 at x=3 y=3 so y = (index >> 2) & 3 and x = index & 3
+static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+	// the RGB and the A index arrays are checked and fixed up independently of each other
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int x = index_positions[region] & 3;
+		int y = (index_positions[region] >> 2) & 3;
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+
+		// swap RGB
+		if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+			// the loop variables below shadow the outer x and y only inside these loops; each index i becomes (NINDICES - 1 - i)
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x];
+		}
+
+		// swap A (x and y here are the outer anchor coordinates again)
+		if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p)
+{	// always reports a fit; neither argument is inspected
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out)
+{
+	// ignore shapeindex
+	out.write(p.mode, p.modebits);
+	out.write(rotatemode, ROTATEMODE_BITS);
+	out.write(indexmode, INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[0]);	// endpoint A of channel j ...
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[1]);	// ... immediately followed by endpoint B of the same channel
+		}
+	nvAssert (out.getptr() == 50);	// the header is expected to end at bit 50
+}
+
+static void read_header(Bits &in, IntEndptsRGBA endpts[NREGIONS], int &shapeindex, int &rotatemode, int &indexmode, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);	// value is not used below; presumably called to consume the mode bits (the assert below expects the cursor at modebits)
+
+	pat_index = 0;
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+	// the remaining fields are read in the same order write_header writes them
+	rotatemode = in.read(ROTATEMODE_BITS);
+	indexmode = in.read(INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (in.getptr() == 50);	// the header is expected to end at bit 50
+}
+
+static void write_indices(const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int shapeindex, int indexmode, Bits &out)
+{
+	// the indices we shorten is always index 0
+	// (its high bit is asserted to be 0 below, so it is written with one bit fewer)
+	// do the 2 bit indices first
+	nvAssert ((indices[INDEXARRAY_2BITS(indexmode)][0][0] & HIGH_INDEXBIT2) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3], INDEXBITS2 - (i==0?1:0));	// write i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0));	// write i..[2:0] or i..[1:0]
+}
+
+static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	// the indices we shorten is always index 0
+	// (it is read with one bit fewer than the others, mirroring write_indices)
+	// do the 2 bit indices first
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0));		// read i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0));		// read i..[2:0] or i..[1:0]
+}
+
+// Serialize one block into `block`: the header first, then the index planes; asserts that exactly AVPCL::BITSIZE bits were written.
+static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block)
+{
+	Bits writer(block, AVPCL::BITSIZE);
+
+	// header (mode, rotation, index mode, endpoints) followed by the two index planes
+	write_header(endpts, shapeindex, p, rotatemode, indexmode, writer);
+	write_indices(indices, shapeindex, indexmode, writer);
+	nvAssert(writer.getptr() == AVPCL::BITSIZE);
+}
+
+// Build the interpolated palettes for one region from its quantized endpoints.
+// Each channel's A/B endpoints are unquantized with that channel's precision, then lerped into
+// NINDICES_RGB(indexmode) RGB entries (palette_rgb) and NINDICES_A(indexmode) alpha entries (palette_a).
+static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec &region_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3])
+{
+	// palette sizes are selected by indexmode
+	const int count_rgb = NINDICES_RGB(indexmode);
+	const int count_a = NINDICES_A(indexmode);
+	int lo, hi;
+
+	// red
+	lo = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]);
+	hi = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].x = float(Utils::lerp(lo, hi, k, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	// green
+	lo = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]);
+	hi = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].y = float(Utils::lerp(lo, hi, k, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	// blue
+	lo = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]);
+	hi = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].z = float(Utils::lerp(lo, hi, k, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	// alpha has its own entry count, bias and denominator
+	lo = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]);
+	hi = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]);
+	for (int k = 0; k < count_a; ++k)
+		palette_a[k] = float(Utils::lerp(lo, hi, k, BIAS_A(indexmode), DENOM_A(indexmode)));
+}
+
+// Sign-extend the stored endpoint fields when the pattern has a non-zero transform_mode; endpts[0].A is left untouched.
+static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS])
+{
+	if (!p.transform_mode)
+		return;		// untransformed patterns need no sign extension
+	// NOTE(review): endpts[1] is written regardless of NREGIONS, and region 0's B uses nbitsizes[0] -- confirm both
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		endpts[0].B[ch] = SIGN_EXTEND(endpts[0].B[ch], p.chan[ch].nbitsizes[0]);
+		endpts[1].A[ch] = SIGN_EXTEND(endpts[1].A[ch], p.chan[ch].nbitsizes[1]);
+		endpts[1].B[ch] = SIGN_EXTEND(endpts[1].B[ch], p.chan[ch].nbitsizes[1]);
+	}
+}
+
+// Copy `in` to `out`, exchanging the w component of every texel with the component selected by rotatemode (none, x, y or z).
+static void rotate_tile(const Tile &in, int rotatemode, Tile &out)
+{
+	out.size_x = in.size_x;
+	out.size_y = in.size_y;
+
+	for (int row = 0; row < in.size_y; ++row)
+	for (int col = 0; col < in.size_x; ++col)
+	{
+		Vector4 &px = out.data[row][col];
+		px = in.data[row][col];
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: { const float keep = px.x; px.x = px.w; px.w = keep; } break;
+		case ROTATEMODE_RGBA_RABG: { const float keep = px.y; px.y = px.w; px.w = keep; } break;
+		case ROTATEMODE_RGBA_RGAB: { const float keep = px.z; px.z = px.w; px.w = keep; } break;
+		default: nvUnreachable();
+		}
+	}
+}
+
+// Decode one block into tile `t`: header, endpoints, palettes, index planes, then rotate_tile with the decoded rotatemode.
+void AVPCL::decompress_mode4(const char *block, Tile &t)
+{
+	Bits reader(block, AVPCL::BITSIZE);
+	Pattern p;
+	IntEndptsRGBA endpts[NREGIONS];
+	int shapeindex, pat_index, rotatemode, indexmode;
+
+	// header first, then bring the endpoints back to their untransformed form
+	read_header(reader, endpts, shapeindex, rotatemode, indexmode, p, pat_index);
+	sign_extend(p, endpts);
+	if (p.transform_mode)
+		transform_inverse(p.transform_mode, endpts);
+
+	// one RGB palette and one alpha palette per region, sized with NINDICES3 (could be nindices2)
+	Vector3 palette_rgb[NREGIONS][NINDICES3];
+	float palette_a[NREGIONS][NINDICES3];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized_rgb_a(endpts[r], pattern_precs[pat_index].region_precs[r], indexmode, &palette_rgb[r][0], &palette_a[r][0]);
+
+	// the index planes must end exactly at the end of the block
+	int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+	read_indices(reader, shapeindex, indexmode, indices);
+	nvAssert(reader.getptr() == AVPCL::BITSIZE);
+
+	// look every texel up in its region's palettes
+	Tile unrotated(t.size_x, t.size_y);
+	for (int row = 0; row < Tile::TILE_H; row++)
+	for (int col = 0; col < Tile::TILE_W; col++)
+	{
+		const int reg = REGION(col,row,shapeindex);
+		unrotated.data[row][col] = Vector4(palette_rgb[reg][indices[INDEXARRAY_RGB][row][col]], palette_a[reg][indices[INDEXARRAY_A][row][col]]);
+	}
+
+	rotate_tile(unrotated, rotatemode, t);
+}
+
+// Map `np` colors onto the palette generated from the quantized endpoints, choosing an RGB and an A index for each,
+// and return the summed error. Returns FLT_MAX (remaining indices set to -1) as soon as the running total exceeds
+// current_besterr, because the caller already holds a mapping at least that good. `importance` is not used here.
+static float map_colors(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec &region_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	Vector3 palette_rgb[NINDICES3];	// could be nindices2
+	float palette_a[NINDICES3];	// could be nindices2
+	float toterr = 0;
+
+	generate_palette_quantized_rgb_a(endpts, region_prec, indexmode, &palette_rgb[0], &palette_a[0]);
+
+	Vector3 rgb;
+	float a;
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		// with premultiplied alpha, pick out the component that holds alpha after rotation
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (colors[i]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (colors[i]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (colors[i]).z : (colors[i]).w;
+
+		rgb.x = (colors[i]).x;
+		rgb.y = (colors[i]).y;
+		rgb.z = (colors[i]).z;
+		a = (colors[i]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = Utils::metric1(a, palette_a[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					palette_alpha = palette_a[j];
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			toterr += besterr;
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+		else
+		{
+			// do RGB index
+			besterr = FLT_MAX;
+			int bestindex = 0;	// stays defined even if no palette entry compares below FLT_MAX (e.g. a NaN error)
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					bestindex = j;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0);
+			toterr += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+	}
+	return toterr;
+}
+
+// Assign an RGB and an A index to every texel of `tile` from its region's palette; toterr[region] receives the summed error per region.
+static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector3 rgb;
+	float a;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		rgb.x = (tile.data[y][x]).x;
+		rgb.y = (tile.data[y][x]).y;
+		rgb.z = (tile.data[y][x]).z;
+		a = (tile.data[y][x]).w;
+
+		// with premultiplied alpha, pick out the component that holds alpha after rotation
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = Utils::metric1(a, palette_a[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+					palette_alpha = palette_a[region][i];
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;
+		}
+		else
+		{
+			// do RGB index first as it has the alpha
+			besterr = FLT_MAX;
+			int bestindex = 0;	// stays defined even if no palette entry compares below FLT_MAX (e.g. a NaN error)
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+					bestindex = i;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0);
+			toterr[region] += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+	}
+}
+
+// Logarithmic search over one endpoint (A if do_b == 0, else B) of channel `ch`; new_endpts receives the moved endpoint.
+// Returns old_err, or a smaller error on success. `indices` is valid only if the result is below old_err; otherwise it holds -1's.
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, 
+						  float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (assumes prec >= 1 for the shift -- TODO confirm)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		int beststep = 0;	// signed move that gave the best error at this step size; only meaningful when improved
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					indices[j][i] = temp_indices[j][i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// Exhaustive search around opt_endpts for channel `ch`; the larger the error, the wider the search window.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan). Returns the best error; opt_endpts/indices change only on improvement.
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	const bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// best pair so far; starts at the current endpoints
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)	// NOTE(review): bhigh itself is never tried, unlike ahigh -- confirm intended
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)	// NOTE(review): same exclusive upper bound on b as above
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		// an improvement was found: publish the winning endpoints and their indices
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		for (int j=0; j<NINDEXARRAYS; ++j)
+		for (int i=0; i<np; ++i)
+			indices[j][i] = good_indices[j][i];
+	}
+
+	return best_err;
+}
+
+// Optimize one region's endpoints channel by channel: perturb A or B (whichever helps more), alternate until neither
+// improves, then run a small exhaustive search. Returns the best error found; opt_endpts receives the matching endpoints.
+static float optimize_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA &opt_endpts)
+{
+	float opt_err = orig_err;
+	opt_endpts = orig_endpts;
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA new_a, new_b;
+	IntEndptsRGBA new_endpt;
+	int do_b;
+	int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices1[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			float err = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = temp_indices0[j][i];
+				nvAssert (new_indices[j][i] != -1);	// check the indices just copied, not the stale orig_indices
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[INDEXARRAY_RGB][i] != new_indices[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != new_indices[INDEXARRAY_A][i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		float new_err = exhaustive(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				// remember the indices of the first improvement as the reference for later comparisons
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[j][i] = temp_indices0[j][i];
+					nvAssert (orig_indices[j][i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[INDEXARRAY_RGB][i] != temp_indices0[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != temp_indices0[INDEXARRAY_A][i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+	return opt_err;
+}
+
+// Per region: gather the region's texels and let optimize_one refine that region's endpoints.
+// opt_endpts/opt_err receive the optimized endpoints and error when that error is strictly smaller,
+// otherwise they are left equal to the caller's orig_endpts/orig_err.
+static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, int indexmode, const float orig_err[NREGIONS], 
+							const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+	float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA start_endpts, tuned_endpts;
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		// gather the texels, and their importance_map entries, that fall in this region
+		int np = 0;
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+		{
+			if (REGION(x, y, shapeindex) != region)
+				continue;
+
+			pixels[np] = tile.data[y][x];
+			importance[np] = tile.importance_map[y][x];
+			np++;
+		}
+
+		// default result: the caller's endpoints and error, unchanged
+		const float start_err = orig_err[region];
+		start_endpts = orig_endpts[region];
+		opt_endpts[region] = start_endpts;
+		opt_err[region] = start_err;
+
+		// start_endpts has not been modified, so start_err is still its error
+		const float tuned_err = optimize_one(pixels, importance, np, rotatemode, indexmode, start_err, start_endpts, pattern_prec.region_precs[region], tuned_endpts);
+
+		// keep the optimizer's output only on a strict improvement
+		if (tuned_err < start_err)
+		{
+			opt_err[region] = tuned_err;
+			opt_endpts[region] = tuned_endpts;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+// Quantize, optimize and emit a block for the first pattern whose endpoints fit; returns the total error of the block written to `block` (FLT_MAX if none fit).
+static float refine(const Tile &tile, int shapeindex_best, int rotatemode, int indexmode, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], opt_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(shapeindex_best, indexmode, orig_endpts, orig_indices);
+
+		if (patterns[sp].transform_mode)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transform_mode)
+				transform_inverse(patterns[sp].transform_mode, orig_endpts);
+
+			optimize_endpts(tile, shapeindex_best, rotatemode, indexmode, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			// recompute indices and per-region errors for the optimized endpoints
+			assign_indices(tile, shapeindex_best, rotatemode, indexmode, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(shapeindex_best, indexmode, opt_endpts, opt_indices);
+
+			if (patterns[sp].transform_mode)
+				transform_forward(patterns[sp].transform_mode, opt_endpts);
+
+			// sum the per-region errors so the two candidates can be compared as whole blocks
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, rotatemode, indexmode, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transform_mode)
+					transform_forward(patterns[sp].transform_mode, orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, rotatemode, indexmode, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 4).";
+	return FLT_MAX;
+}
+
+// Clamp each component of `v` to the [0, 255] range in place.
+// Components already in range (and NaNs, which fail both comparisons) are stored back unchanged.
+static void clamp(Vector4 &v)
+{
+	const float lo = 0.0f, hi = 255.0f;
+
+	v.x = (v.x < lo) ? lo : ((v.x > hi) ? hi : v.x);
+	v.y = (v.y < lo) ? lo : ((v.y > hi) ? hi : v.y);
+	v.z = (v.z < lo) ? lo : ((v.z > hi) ? hi : v.z);
+	v.w = (v.w < lo) ? lo : ((v.w > hi) ? hi : v.w);
+}
+
+// Compute rough floating-point endpoints for every region: RGB is fitted along the principal direction of the
+// region's colors, alpha is spanned by its min/max. Note the tile's channels may have been rotated by the caller.
+static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+		int np = 0;
+
+		// gather this region's texels; `mean` holds their running sum for now
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+		{
+			if (REGION(x,y,shapeindex) != region)
+				continue;
+
+			const Vector4 &texel = tile.data[y][x];
+			colors[np] = texel.xyz();
+			alphas[np] = texel.w;
+			mean += texel;
+			++np;
+		}
+
+		// an empty region gets (0,0,0,255) for both endpoints
+		if (np == 0)
+		{
+			const Vector4 opaque_black(0,0,0,255.0f);
+			endpts[region].A = opaque_black;
+			endpts[region].B = opaque_black;
+			continue;
+		}
+
+		// one texel: both endpoints sit on it; two texels: one endpoint on each
+		if (np <= 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[np-1], alphas[np-1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		const Vector3 center = mean.xyz();
+		const Vector3 axis = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// extent of the colors along the axis and of alpha, both measured from the mean
+		float lo_rgb = FLT_MAX, hi_rgb = -FLT_MAX;
+		float lo_a = FLT_MAX, hi_a = -FLT_MAX;
+		for (int i = 0; i < np; i++)
+		{
+			const float along = dot(colors[i]-center, axis);
+			if (along < lo_rgb) lo_rgb = along;
+			if (along > hi_rgb) hi_rgb = along;
+
+			const float da = alphas[i] - mean.w;
+			if (da < lo_a) lo_a = da;
+			if (da > hi_a) hi_a = da;
+		}
+
+		// the endpoints span the projections of all texels along the axis
+		endpts[region].A = mean + Vector4(lo_rgb*axis, lo_a);
+		endpts[region].B = mean + Vector4(hi_rgb*axis, hi_a);
+
+		// clamp them too: the actual endpoints need to be clamped, so the best shape has to be
+		// chosen based on clamped endpoints
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+}
+
+// Compress tile `t`: try every rotation mode and index mode, copy the lowest-error candidate into `block`, and return that error.
+float AVPCL::compress_mode4(const Tile &t, char *block)
+{
+	const int shape = 0;	// the shape index passed down is always 0 here
+	FltEndpts endpts[NREGIONS];
+	char candidate[AVPCL::BLOCKSIZE];
+	Tile rotated;
+	float msebest = FLT_MAX;
+
+	// both loops stop early once a zero-error encoding has been found
+	for (int rot = 0; rot < NROTATEMODES && msebest > 0; ++rot)
+	{
+		rotate_tile(t, rot, rotated);
+		rough(rotated, shape, endpts);
+		for (int im = 0; im < NINDEXMODES && msebest > 0; ++im)
+		{
+			const float mse = refine(rotated, shape, rot, im, endpts, candidate);
+			if (!(mse < msebest))
+				continue;
+			memcpy(block, candidate, sizeof(candidate));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode5.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode5.cpp
new file mode 100644
index 0000000..f1f1636
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode5.cpp
@@ -0,0 +1,1216 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x100000 2r 777x2 8x2 2bi 2bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include "nvmath/matrix.inl"
+#include "nvmath/fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <string.h>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// there are 2 index arrays. both are 2 bits wide in this mode: the *2 and *3 macro families below are defined to identical values, so the indexmode selectors do not change any size
+// array 0 is always the RGB array and array 1 is always the A array
+#define	NINDEXARRAYS	2
+#define	INDEXARRAY_RGB	0
+#define INDEXARRAY_A	1
+#define INDEXARRAY_2BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+#define INDEXARRAY_3BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+
+#define NINDICES3	4
+#define	INDEXBITS3	2
+#define	HIGH_INDEXBIT3	(1<<(INDEXBITS3-1))
+#define	DENOM3		(NINDICES3-1)
+#define	BIAS3		(DENOM3/2)
+
+#define NINDICES2	4
+#define	INDEXBITS2	2
+#define	HIGH_INDEXBIT2	(1<<(INDEXBITS2-1))
+#define	DENOM2		(NINDICES2-1)
+#define	BIAS2		(DENOM2/2)
+
+#define	NINDICES_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2)
+#define	INDEXBITS_RGB(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2)
+#define	HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2)
+#define	DENOM_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2)
+#define	BIAS_RGB(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2)
+
+#define	NINDICES_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3)
+#define	INDEXBITS_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3)
+#define	HIGH_INDEXBIT_A(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3)
+#define	DENOM_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3)
+#define	BIAS_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3)
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define NREGIONS	1			// keep the region stuff in just in case...
+
+// encoded index compression location: region 0 is always at 0,0.
+
+#define	NBITSIZES	2			// one endpoint pair
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern	// describes one header layout: per-channel endpoint widths plus the mode tag that introduces it
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit widths of endpoint A ([0]) and endpoint B ([1]) for each channel, as used by write_header/read_header
+	int transform_mode;		// bitmask: TRANSFORM_MODE_ALPHA (1) and/or TRANSFORM_MODE_RGB (2); a set bit means endpoint B of that group is stored as a delta from A
+	int mode;				// mode value written first in the block header
+	int modebits;			// number of bits used to write mode
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	TRANSFORM_MODE_ALPHA	1
+#define	TRANSFORM_MODE_RGB	2
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha	xfm	mode  mb encoding
+	7,7,		7,7,		7,7,		8,8,	0x0, 0x20, 6, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7,8,	7,7,7,8,
+};
+
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	// zero takes no bits at all, regardless of signedness
+	if (n == 0)
+		return 0;
+	int count = 0;
+	if (n < 0)
+	{
+		// negative values require signed storage: count the magnitude bits, then add the sign bit
+		nvAssert (issigned);
+		while (n < -1) { ++count; n >>= 1; }
+		return count + 1;
+	}
+	// positive: count the significant bits, plus one sign bit when stored as signed
+	while (n) { ++count; n >>= 1; }
+	return issigned ? count + 1 : count;
+}
+
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+
+static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	// delta-encode endpoint B against endpoint A of region 0, separately for the RGB group and for alpha
+	const bool xform_rgb = (transform_mode & TRANSFORM_MODE_RGB) != 0;
+	const bool xform_alpha = (transform_mode & TRANSFORM_MODE_ALPHA) != 0;
+	if (xform_rgb)
+	{
+		for (int c = CHANNEL_R; c < CHANNEL_A; ++c)
+			ep[0].B[c] -= ep[0].A[c];
+	}
+	if (xform_alpha)
+		ep[0].B[CHANNEL_A] -= ep[0].A[CHANNEL_A];
+}
+
+static void transform_inverse(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	// undo transform_forward: add endpoint A back onto the delta stored in endpoint B of region 0
+	const bool xform_rgb = (transform_mode & TRANSFORM_MODE_RGB) != 0;
+	const bool xform_alpha = (transform_mode & TRANSFORM_MODE_ALPHA) != 0;
+	if (xform_rgb)
+	{
+		for (int c = CHANNEL_R; c < CHANNEL_A; ++c)
+			ep[0].B[c] += ep[0].A[c];
+	}
+	if (xform_alpha)
+		ep[0].B[CHANNEL_A] += ep[0].A[CHANNEL_A];
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA q_endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)	// every endpoint channel is quantized at its own precision
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec.endpt_a_prec[0]);
+		q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec.endpt_a_prec[1]);
+		q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec.endpt_a_prec[2]);
+		q_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, prec.endpt_a_prec[3]);
+		q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec.endpt_b_prec[0]);
+		q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec.endpt_b_prec[1]);
+		q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec.endpt_b_prec[2]);
+		q_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, prec.endpt_b_prec[3]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_one and index_two have a 0 high-order bit
+// index_two is 0 at x=0 y=0 and 15 at x=3 y=3 so y = (index >> 2) & 3 and x = index & 3
+static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	int anchor_pos[NREGIONS];
+
+	anchor_pos[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int ax = anchor_pos[region] & 3;
+		const int ay = (anchor_pos[region] >> 2) & 3;
+		nvAssert(REGION(ax,ay,shapeindex) == region);		// double check the table
+
+		// RGB array: act only when the anchor pixel's index has its high bit set
+		if (indices[INDEXARRAY_RGB][ay][ax] & HIGH_INDEXBIT_RGB(indexmode))
+		{
+			// exchange the RGB endpoints, then mirror every RGB index of this region
+			for (int c=CHANNEL_R; c<=CHANNEL_B; ++c) { const int held = endpts[region].A[c]; endpts[region].A[c] = endpts[region].B[c]; endpts[region].B[c] = held; }
+
+			for (int py = 0; py < Tile::TILE_H; py++)
+			for (int px = 0; px < Tile::TILE_W; px++)
+				if (REGION(px,py,shapeindex) == region)
+					indices[INDEXARRAY_RGB][py][px] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][py][px];
+		}
+
+		// A array: same treatment for the alpha endpoint pair and the alpha indices
+		if (indices[INDEXARRAY_A][ay][ax] & HIGH_INDEXBIT_A(indexmode))
+		{
+			// only one channel to exchange here
+			const int held = endpts[region].A[CHANNEL_A];
+			endpts[region].A[CHANNEL_A] = endpts[region].B[CHANNEL_A];
+			endpts[region].B[CHANNEL_A] = held;
+
+			for (int py = 0; py < Tile::TILE_H; py++)
+			for (int px = 0; px < Tile::TILE_W; px++)
+				if (REGION(px,py,shapeindex) == region)
+					indices[INDEXARRAY_A][py][px] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][py][px];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p)	// always reports a fit; both parameters are ignored
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out)
+{
+	// layout: mode bits, rotation bits, then the A/B endpoint pair of each channel; shapeindex and indexmode are not written
+	out.write(p.mode, p.modebits);
+	out.write(rotatemode, ROTATEMODE_BITS);
+	for (int region = 0; region < NREGIONS; ++region)
+		for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		{
+			const ChanBits &widths = p.chan[ch];
+			out.write(endpts[region].A[ch], widths.nbitsizes[0]);
+			out.write(endpts[region].B[ch], widths.nbitsizes[1]);
+		}
+	nvAssert (out.getptr() == 66);	// the header is expected to end at bit 66
+}
+
+static void read_header(Bits &in, IntEndptsRGBA endpts[NREGIONS], int &shapeindex, int &rotatemode, int &indexmode, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);	// value is not used; the call is expected to consume the mode bits (see the getptr assert below)
+
+	pat_index = 0;	// there is only one pattern (NPATTERNS is 1)
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];	// hand the caller its own copy of the pattern
+
+	shapeindex = 0;		// we don't have any
+
+	rotatemode = in.read(ROTATEMODE_BITS);
+
+	indexmode = 0;		// we don't have any
+
+	// endpoints come back in the order write_header emits them: A then B for each channel
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (in.getptr() == 66);	// same header length that write_header asserts
+}
+
+static void write_indices(const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int shapeindex, int indexmode, Bits &out)
+{
+	// the indices we shorten is always index 0 (its high bit must be 0, so it is written with one bit less); shapeindex is unused
+
+	// first the array selected by INDEXARRAY_2BITS
+	nvAssert ((indices[INDEXARRAY_2BITS(indexmode)][0][0] & HIGH_INDEXBIT2) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3], INDEXBITS2 - (i==0?1:0));	// write i..[1:0] or i..[0]
+
+	// then the array selected by INDEXARRAY_3BITS (despite the name it is also 2 bits wide here: INDEXBITS3 is 2)
+	nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0));	// write i..[1:0] or i..[0]
+}
+
+static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	// the indices we shorten is always index 0 (read with one bit less); shapeindex is unused
+
+	// first the array selected by INDEXARRAY_2BITS, in the same order write_indices emits it
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0));		// read i..[1:0] or i..[0]
+
+	// then the array selected by INDEXARRAY_3BITS (also 2 bits wide here: INDEXBITS3 is 2)
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0));		// read i..[1:0] or i..[0]
+}
+
+static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);	// bit writer over the caller's block buffer
+
+	write_header(endpts, shapeindex, p, rotatemode, indexmode, out);
+
+	write_indices(indices, shapeindex, indexmode, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);	// header plus indices must fill the block exactly
+}
+
+static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec &region_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3])
+{
+	// R, G and B share one interpolation setup (entry count, bias, denominator); alpha has its own.
+	// every palette entry is Utils::lerp of the two unquantized endpoints of that channel.
+	// palette_rgb and palette_a are each declared with room for NINDICES3 entries.
+	const int count_rgb = NINDICES_RGB(indexmode), count_a = NINDICES_A(indexmode);
+	const int bias_rgb = BIAS_RGB(indexmode), bias_a = BIAS_A(indexmode);
+	const int denom_rgb = DENOM_RGB(indexmode), denom_a = DENOM_A(indexmode);
+
+	// red
+	const int r_lo = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]);
+	const int r_hi = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].x = float(Utils::lerp(r_lo, r_hi, k, bias_rgb, denom_rgb));
+
+	// green
+	const int g_lo = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]);
+	const int g_hi = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].y = float(Utils::lerp(g_lo, g_hi, k, bias_rgb, denom_rgb));
+
+	// blue
+	const int b_lo = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]);
+	const int b_hi = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+	for (int k = 0; k < count_rgb; ++k)
+		palette_rgb[k].z = float(Utils::lerp(b_lo, b_hi, k, bias_rgb, denom_rgb));
+
+	// alpha
+	const int a_lo = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]);
+	const int a_hi = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]);
+	for (int k = 0; k < count_a; ++k)
+		palette_a[k] = float(Utils::lerp(a_lo, a_hi, k, bias_a, denom_a));
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS])
+{
+	// only transformed (delta-encoded) endpoints are signed; without a transform nothing needs extending
+	if (!p.transform_mode)
+		return;
+
+	// endpts[0].A is stored untransformed and is always positive, so only B is extended.
+	// NREGIONS is 1 in this mode, so there is no endpts[1] to touch.
+	for (int i=0; i<NCHANNELS_RGBA; ++i)
+	{
+		endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[1]);	// B is written/read with nbitsizes[1]
+	}
+}
+
+static void rotate_tile(const Tile &in, int rotatemode, Tile &out)
+{
+	// copies in to out, exchanging the w component with x, y or z as selected by rotatemode
+	out.size_x = in.size_x;
+	out.size_y = in.size_y;
+
+	for (int row = 0; row < in.size_y; ++row)
+	for (int col = 0; col < in.size_x; ++col)
+	{
+		out.data[row][col] = in.data[row][col];
+		const float kept_w = (out.data[row][col]).w;
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: (out.data[row][col]).w = (out.data[row][col]).x; (out.data[row][col]).x = kept_w; break;
+		case ROTATEMODE_RGBA_RABG: (out.data[row][col]).w = (out.data[row][col]).y; (out.data[row][col]).y = kept_w; break;
+		case ROTATEMODE_RGBA_RGAB: (out.data[row][col]).w = (out.data[row][col]).z; (out.data[row][col]).z = kept_w; break;
+		default: nvUnreachable();
+		}
+	}
+}
+
+void AVPCL::decompress_mode5(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);	// bit reader over the encoded block
+
+	Pattern p;
+	IntEndptsRGBA endpts[NREGIONS];
+	int shapeindex, pat_index, rotatemode, indexmode;
+
+	read_header(in, endpts, shapeindex, rotatemode, indexmode, p, pat_index);	// fills all of the above
+	
+	sign_extend(p, endpts);
+
+	if (p.transform_mode)
+		transform_inverse(p.transform_mode, endpts);	// turn stored deltas back into absolute endpoints
+
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized_rgb_a(endpts[region], pattern_precs[pat_index].region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+
+	int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indexmode, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);	// the whole block must have been consumed
+
+	Tile temp(t.size_x, t.size_y);
+
+	// lookup: each pixel combines the RGB palette entry and the alpha palette entry chosen by its two indices
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		temp.data[y][x] = Vector4(palette_rgb[REGION(x,y,shapeindex)][indices[INDEXARRAY_RGB][y][x]], palette_a[REGION(x,y,shapeindex)][indices[INDEXARRAY_A][y][x]]);
+
+	rotate_tile(temp, rotatemode, t);	// rotate_tile exchanges two components, so applying it again restores the channel order
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// we already have a candidate mapping when we call this function, thus an error. take an early exit if the accumulated error so far
+// exceeds what we already have
+static float map_colors(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec &region_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	Vector3 palette_rgb[NINDICES3];	// could be nindices2
+	float palette_a[NINDICES3];	// could be nindices2
+	float toterr = 0;
+	// note: importance is accepted but not used by this function
+	generate_palette_quantized_rgb_a(endpts, region_prec, indexmode, &palette_rgb[0], &palette_a[0]);
+
+	Vector3 rgb;
+	float a;
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (colors[i]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (colors[i]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (colors[i]).z : (colors[i]).w;
+
+		rgb.x = (colors[i]).x;
+		rgb.y = (colors[i]).y;
+		rgb.z = (colors[i]).z;
+		a = (colors[i]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = Utils::metric1(a, palette_a[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					palette_alpha = palette_a[j];
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			toterr += besterr;
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+		else
+		{
+			// do RGB index
+			besterr = FLT_MAX;
+			int bestindex = 0;	// keeps the palette lookup below in bounds even if no candidate compares below FLT_MAX (e.g. a NaN error)
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					bestindex = j;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0);
+			toterr += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec,
+						   int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector3 rgb;
+	float a;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		rgb.x = (tile.data[y][x]).x;
+		rgb.y = (tile.data[y][x]).y;
+		rgb.z = (tile.data[y][x]).z;
+		a = (tile.data[y][x]).w;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = Utils::metric1(a, palette_a[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+					palette_alpha = palette_a[region][i];
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;
+		}
+		else
+		{
+			// do RGB index first as it has the alpha
+			besterr = FLT_MAX;
+			int bestindex = 0;	// keeps the palette lookup below in bounds even if no candidate compares below FLT_MAX (e.g. a NaN error)
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+					bestindex = i;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0);
+			toterr[region] += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts,
+						  float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;	// only read when improved is true, i.e. after it has been assigned in the same step iteration
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (A when do_b is 0, otherwise B): the step halves each round
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;	// candidate falls outside the representable range
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;	// candidate falls outside the representable range
+			}
+
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					indices[j][i] = temp_indices[j][i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// indices stay at -1 unless an improvement is found (they are only valid when the return value is below orig_err)
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B (all four bounds are inclusive)
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// best pair so far; starts at the current endpoints
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b <= bhigh; ++b)	// bhigh is inclusive, like ahigh
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+
+			float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err)
+			{
+				amin = a;
+				bmin = b;
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)	// bhigh is inclusive, like ahigh
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+
+			float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err)
+			{
+				amin = a;
+				bmin = b;
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// publish the indices that go with the improved endpoints
+		for (int j=0; j<NINDEXARRAYS; ++j)
+		for (int i=0; i<np; ++i)
+			indices[j][i] = good_indices[j][i];
+	}
+
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA new_a, new_b;
+	IntEndptsRGBA new_endpt;
+	int do_b;
+	int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+		float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices1[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+			float err = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = temp_indices0[j][i];
+				nvAssert (new_indices[j][i] != -1);	// validate the indices just copied, not the older snapshot
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[INDEXARRAY_RGB][i] != new_indices[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != new_indices[INDEXARRAY_A][i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		float new_err = exhaustive(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				// first improvement of this pass: remember its indices as the reference
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[j][i] = temp_indices0[j][i];
+					nvAssert (orig_indices[j][i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[INDEXARRAY_RGB][i] != temp_indices0[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != temp_indices0[INDEXARRAY_A][i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, int indexmode, const float orig_err[NREGIONS], 
+							const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS])
+{
+	// Per-region endpoint refinement: gather each region's pixels and hand them to optimize_one,
+	// keeping its endpoints only when the error it reports beats the caller-supplied orig_err.
+	Vector4 region_pixels[Tile::TILE_TOTAL];
+	float region_importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA start_endpts, candidate_endpts;
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		// gather the pixels (and their importance weights) that the shape assigns to this region
+		int count = 0;
+		for (int y = 0; y < tile.size_y; ++y)
+		{
+			for (int x = 0; x < tile.size_x; ++x)
+			{
+				if (REGION(x, y, shapeindex) != region)
+					continue;
+				region_pixels[count] = tile.data[y][x];
+				region_importance[count] = tile.importance_map[y][x];
+				++count;
+			}
+		}
+
+		// default result: the unmodified endpoints and their known error
+		start_endpts = orig_endpts[region];
+		opt_endpts[region] = start_endpts;
+		opt_err[region] = orig_err[region];
+
+		// start_endpts is an unmodified copy, so orig_err[region] is still its error
+		float start_err = orig_err[region];
+		float candidate_err = optimize_one(region_pixels, region_importance, count, rotatemode, indexmode,
+										   start_err, start_endpts, pattern_prec.region_precs[region], candidate_endpts);
+
+		// adopt the optimized endpoints only on a strict improvement
+		if (candidate_err < orig_err[region])
+		{
+			opt_err[region] = candidate_err;
+			opt_endpts[region] = candidate_endpts;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, int rotatemode, int indexmode, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], opt_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+	// try each pattern in turn; the first one whose quantized endpoints fit is emitted into block and its total error is returned
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		// initial indices and per-region error for the unoptimized endpoints
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(shapeindex_best, indexmode, orig_endpts, orig_indices);
+
+		if (patterns[sp].transform_mode)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transform_mode)
+				transform_inverse(patterns[sp].transform_mode, orig_endpts);
+			// expected_opt_err receives optimize_endpts' per-region errors; it is only compared in the disabled asserts below
+			optimize_endpts(tile, shapeindex_best, rotatemode, indexmode, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			// recompute indices and per-region errors for the optimized endpoints
+			assign_indices(tile, shapeindex_best, rotatemode, indexmode, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(shapeindex_best, indexmode, opt_endpts, opt_indices);
+
+			if (patterns[sp].transform_mode)
+				transform_forward(patterns[sp].transform_mode, opt_endpts);
+			// total the per-region errors of both candidates
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, rotatemode, indexmode, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transform_mode)
+					transform_forward(patterns[sp].transform_mode, orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, rotatemode, indexmode, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 5).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	// saturate every component of v to the range [0, 255]
+	float *comps[4] = { &v.x, &v.y, &v.z, &v.w };
+	for (int c = 0; c < 4; ++c)
+	{
+		float &f = *comps[c];
+		if (f < 0.0f) f = 0.0f;
+		if (f > 255.0f) f = 255.0f;
+	}
+}
+
+// compute initial (float) endpoints for every region of the given shape: the "RGB" portion is spanned along the
+// principal direction of the region's colors, the "A" portion by its min/max. Note these channels may have been rotated.
+static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+		// gather the region's pixels and accumulate their sum (it is divided by np further down)
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases: with fewer than 3 pixels the endpoints are set directly (and are not clamped)
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		float mina = FLT_MAX, maxa = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+			// alpha is handled separately: track its signed offset from the mean alpha
+			dp = alphas[i] - mean.w;
+			if (dp < mina) mina = dp;
+			if (dp > maxa) maxa = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + Vector4(minp*direction, mina);
+		endpts[region].B = mean + Vector4(maxp*direction, maxa);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+}
+
+float AVPCL::compress_mode5(const Tile &t, char *block)
+{
+	// Encodes tile t as a mode 5 block: every rotation is tried and the encoding with the
+	// smallest error reported by refine is copied into block. Returns that smallest error.
+	const int shape = 0;
+	const int indexmode = 0;	// only index mode 0 is tried; the NINDEXMODES sweep is disabled
+	FltEndpts endpts[NREGIONS];
+	char candidate[AVPCL::BLOCKSIZE];
+	Tile rotated;
+	float best = FLT_MAX;
+
+	int r = 0;
+	while (r < NROTATEMODES && best > 0)	// stop early once a zero-error encoding has been found
+	{
+		rotate_tile(t, r, rotated);
+		rough(rotated, shape, endpts);
+		const float mse = refine(rotated, shape, r, indexmode, endpts, candidate);
+		if (mse < best)
+		{
+			best = mse;
+			memcpy(block, candidate, sizeof(candidate));
+		}
+		++r;
+	}
+	return best;
+}
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode6.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode6.cpp
new file mode 100644
index 0000000..38e3a25
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode6.cpp
@@ -0,0 +1,1055 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x1000000 7777.1x2 4bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include "nvmath/matrix.inl"
+#include "nvmath/fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <string.h>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	16		// palette entries per region
+#define	INDEXBITS	4		// bits per stored index (NINDICES == 1<<INDEXBITS)
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))	// top bit of an index
+#define	DENOM		(NINDICES-1)	// passed to Utils::lerp together with BIAS when building the palette
+#define	BIAS		(DENOM/2)
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,	// the only shape: no bits set, so REGION() below is 0 for every pixel
+};
+// region of pixel (x,y): bit (15 - x - 4*y) of the shape mask, i.e. 0 or 1
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define	NREGIONS	1
+
+#define	NBITSIZES	(NREGIONS*2)	// one A and one B bit size per region
+#define	ABITINDEX(region)	(2*(region)+0)	// slot of the region's endpoint A in ChanBits::nbitsizes
+#define	BBITINDEX(region)	(2*(region)+1)	// slot of the region's endpoint B in ChanBits::nbitsizes
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel, indexed with ABITINDEX(region) / BBITINDEX(region)
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int mode;				// associated mode value (the first field write_header emits)
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode (an empty string in the table below)
+};
+
+#define	NPATTERNS 1
+// the single pattern of this mode: 7 bits for endpoint A and for endpoint B in every channel, mode value 0x40 in 7 mode bits
+static Pattern patterns[NPATTERNS] =
+{
+	// red	green	blue	alpha	mode  mb verilog
+	7,7,	7,7,	7,7,	7,7,	0x40, 7, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];	// precision (bits) of endpoint A, per channel
+	int endpt_b_prec[NCHANNELS_RGBA];	// precision (bits) of endpoint B, per channel
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];	// one entry per region
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7,7,	7,7,7,7,	// endpt_a_prec[r,g,b,a], then endpt_b_prec[r,g,b,a]
+};
+
+// Number of bits needed to store n: 0 for n == 0; for n > 0 the magnitude bits plus one sign bit
+// when issigned; for n < 0 (asserted to be a signed request) the two's-complement width.
+static int nbits(int n, bool issigned)
+{
+	if (n == 0)
+		return 0;
+
+	int count = 0;
+	if (n < 0)
+	{
+		nvAssert (issigned);
+		while (n < -1) { n >>= 1; ++count; }	// relies on >> of a negative int shifting in sign bits
+		return count + 1;
+	}
+
+	while (n != 0) { n >>= 1; ++count; }
+	return issigned ? count + 1 : count;
+}
+
+/*
+we're using this table to assign lsbs
+abgr	>=2	correct
+0000	0	0
+0001	0	0
+0010	0	0
+0011	1	x1
+0100	0	0
+0101	1	x1
+0110	1	x1
+0111	1	1
+1000	0	0
+1001	1	x0
+1010	1	x0
+1011	1	1
+1100	1	x0
+1101	1	1
+1110	1	1
+1111	1	1
+
+we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8.
+I choose to assign the lsbs so that the rgb channels are as good as possible.
+*/
+
+// 8888 -> 7777.1: drop each channel's low bit and give each endpoint one shared lsb instead,
+// set when at least two of its three RGB low bits were set (the "correct" column above; alpha does not vote)
+static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts)
+{
+	int rgb_ones = 0;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		if (ch != CHANNEL_A)
+			rgb_ones += endpts.A[ch] & 1;
+		compr_endpts.A[ch] = endpts.A[ch] >> 1;
+		nvAssert (compr_endpts.A[ch] < 128);
+	}
+	compr_endpts.a_lsb = (rgb_ones >= 2);
+	// same vote for endpoint B
+	rgb_ones = 0;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		if (ch != CHANNEL_A)
+			rgb_ones += endpts.B[ch] & 1;
+		compr_endpts.B[ch] = endpts.B[ch] >> 1;
+		nvAssert (compr_endpts.B[ch] < 128);
+	}
+	compr_endpts.b_lsb = (rgb_ones >= 2);
+}
+
+static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts)
+{
+	// 7777.1 -> 8888: append the endpoint's shared lsb below each stored channel value
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		endpts.A[ch] = (compr_endpts.A[ch] << 1) | compr_endpts.a_lsb;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		endpts.B[ch] = (compr_endpts.B[ch] << 1) | compr_endpts.b_lsb;
+}
+
+static void uncompress_endpoints(const IntEndptsRGBA_2 compr_endpts[NREGIONS], IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int region = NREGIONS; region-- > 0; )	// expand every region's endpoint pair
+		uncompress_one(compr_endpts[region], endpts[region]);
+}
+
+static void compress_endpoints(const IntEndptsRGBA endpts[NREGIONS], IntEndptsRGBA_2 compr_endpts[NREGIONS])
+{
+	for (int region = NREGIONS; region-- > 0; )	// fold every region's endpoint pair down to 7777.1
+		compress_one(endpts[region], compr_endpts[region]);
+}
+
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA_2 q_endpts[NREGIONS])
+{
+	// Quantize each float endpoint channel in uncompressed space (pattern precision + 1 bits),
+	// then fold the result down to the 7777.1 representation with compress_one.
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		const float a_vals[NCHANNELS_RGBA] = { endpts[region].A.x, endpts[region].A.y, endpts[region].A.z, endpts[region].A.w };
+		const float b_vals[NCHANNELS_RGBA] = { endpts[region].B.x, endpts[region].B.y, endpts[region].B.z, endpts[region].B.w };
+		IntEndptsRGBA full;
+
+		for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+			full.A[ch] = Utils::quantize(a_vals[ch], prec.endpt_a_prec[ch]+1);
+		for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+			full.B[ch] = Utils::quantize(b_vals[ch], prec.endpt_b_prec[ch]+1);
+
+		compress_one(full, q_endpts[region]);
+	}
+}
+
+// make the index at each region's anchor pixel have a 0 high-order bit (write_indices stores pixel 0 without
+// that bit): when the bit is set, exchange the region's two endpoints and mirror all of the region's indices
+static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	// anchor pixel of each region as a linear position: x = pos & 3, y = (pos >> 2) & 3
+	int anchor_pos[NREGIONS];
+	anchor_pos[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const int ax = anchor_pos[region] & 3;
+		const int ay = (anchor_pos[region] >> 2) & 3;
+		nvAssert(REGION(ax,ay,shapeindex) == region);		// double check the table
+		if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+			continue;		// high bit already clear, nothing to do for this region
+
+		// exchange endpoint A with endpoint B, including the shared lsbs
+		IntEndptsRGBA_2 &e = endpts[region];
+		for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		{
+			const int saved = e.A[ch]; e.A[ch] = e.B[ch]; e.B[ch] = saved;
+		}
+		const int saved_lsb = e.a_lsb; e.a_lsb = e.b_lsb; e.b_lsb = saved_lsb;
+
+		// mirror the region's indices to go with the exchanged endpoints
+		for (int py = 0; py < Tile::TILE_H; py++)
+		for (int px = 0; px < Tile::TILE_W; px++)
+			if (REGION(px,py,shapeindex) == region)
+				indices[py][px] = NINDICES - 1 - indices[py][px];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;	// always reports a fit: neither endpts nor p is examined here
+}
+
+static void write_header(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	// Header layout: the mode bits, then for each channel the A and B values of every region,
+	// then each region's two shared lsbs. shapeindex is not written.
+	out.write(p.mode, p.modebits);
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			out.write(endpts[region].A[ch], p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			out.write(endpts[region].B[ch], p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		out.write(endpts[region].a_lsb, 1);
+		out.write(endpts[region].b_lsb, 1);
+	}
+	// with the pattern table above: 7 mode bits + 8 seven-bit endpoint fields + 2 lsbs
+	nvAssert (out.getptr() == 65);
+}
+
+static void read_header(Bits &in, IntEndptsRGBA_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	// Mirror of write_header. The value returned by getmode is not used afterwards; the call is kept
+	// because the assert below expects the read position to sit just past the mode bits.
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;		// NPATTERNS is 1, so there is nothing to select
+	shapeindex = 0;		// we don't have any
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+	p = patterns[pat_index];
+
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			endpts[region].A[ch] = in.read(p.chan[ch].nbitsizes[ABITINDEX(region)]);
+			endpts[region].B[ch] = in.read(p.chan[ch].nbitsizes[BBITINDEX(region)]);
+		}
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		endpts[region].a_lsb = in.read(1);
+		endpts[region].b_lsb = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 65);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// pixel 0's index is stored without its high bit, which therefore has to be clear
+	nvAssert ((indices[0][0] & HIGH_INDEXBIT) == 0);
+	out.write(indices[0][0], INDEXBITS-1);	// write bits [2:0]
+
+	// every remaining pixel, in row-major order, keeps all of its bits
+	for (int pos = 1; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		const int row = pos >> 2, col = pos & 3;
+		out.write(indices[row][col], INDEXBITS);	// write bits [3:0]
+	}
+
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// pixel 0 was stored without its high bit, so only INDEXBITS-1 bits are read for it
+	indices[0][0] = in.read(INDEXBITS-1);	// read bits [2:0]
+	// all other pixels, in row-major order, were stored in full
+	for (int pos = 1; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		const int row = pos >> 2, col = pos & 3;
+		indices[row][col] = in.read(INDEXBITS);	// read bits [3:0]
+	}
+}
+
+static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+	// serialize the header (mode, endpoints, lsbs) ...
+	write_header(endpts, shapeindex, p, out);
+	// ... followed by the per-pixel indices
+	write_indices(indices, shapeindex, out);
+	// header and indices together must fill the block exactly
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	// Builds the NINDICES-entry palette of one region: the 7777.1 endpoints are expanded with
+	// uncompress_one, each channel is unquantized, and entry i of that channel is
+	// Utils::lerp(a, b, i, BIAS, DENOM) converted to float.
+
+	IntEndptsRGBA full;
+	uncompress_one(endpts_2, full);
+
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// scale this channel's endpoints; +1 since we are in uncompressed space
+		const int lo = Utils::unquantize(full.A[ch], region_prec.endpt_a_prec[ch]+1);
+		const int hi = Utils::unquantize(full.B[ch], region_prec.endpt_b_prec[ch]+1);
+
+		// interpolate
+		for (int i = 0; i < NINDICES; ++i)
+		{
+			const float value = float(Utils::lerp(lo, hi, i, BIAS, DENOM));
+			switch (ch)
+			{
+			case 0:
+				palette[i].x = value;
+				break;
+			case 1:
+				palette[i].y = value;
+				break;
+			case 2:
+				palette[i].z = value;
+				break;
+			default:
+				palette[i].w = value;
+				break;
+			}
+		}
+	}
+}
+
+void AVPCL::decompress_mode6(const char *block, Tile &t)
+{
+	// Decodes one mode 6 block into t.data: read the header, build each region's palette,
+	// read the indices, then look every pixel up in the palette of its region.
+	Bits in(block, AVPCL::BITSIZE);
+
+	IntEndptsRGBA_2 endpts[NREGIONS];
+	Pattern p;
+	int shapeindex, pat_index;
+	read_header(in, endpts, shapeindex, p, pat_index);
+
+	const PatternPrec &prec = pattern_precs[pat_index];
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized(endpts[region], prec.region_precs[region], palette[region]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+	read_indices(in, shapeindex, indices);
+	nvAssert(in.getptr() == AVPCL::BITSIZE);	// the whole block must have been consumed
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(colors[i], palette[j]) :
+									     Utils::metric4premult(colors[i], palette[j]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// for every pixel pick the palette entry of its region with the smallest error; only the
+	// tile.size_x by tile.size_y pixels are visited, other indices[][] entries are left untouched
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) :
+										 Utils::metric4premult(tile.data[y][x], palette[region][i]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// perturb one endpoint (A when do_b == 0, otherwise B) of channel ch, looking for a lower error than old_err.
+// returns old_err with indices[] full of -1's when nothing improved; otherwise returns the smaller error, with the moved endpoint in new_endpts and matching indices[]
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts,
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// old_endpts:  the endpoints we start from (never modified)
+	// new_endpts:  the best perturbed endpoints found so far (output)
+	// temp_endpts: the candidate currently being evaluated
+
+	IntEndptsRGBA_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;					// only read after a trial in the same round has assigned it
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// halve the step every round, trying -step and +step from the best position found so far
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+			// min_err is passed as the cutoff, so map_colors gives up as soon as this candidate cannot win
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if the value returned is less than orig_err, opt_endpts and indices[] are updated; otherwise indices[] holds -1's
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// the relative ordering of A and B is preserved below, so there is no need to swap them.
+	// alow..ahigh and blow..bhigh are inclusive ranges: both scans must visit their upper bound
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// only read after an improvement was found
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b <= bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// we actually improved: publish the indices that go with the new endpoints
+		// (orig_err is a by-value parameter, so there is no point in updating it)
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+	// returns the lowest error found (never above orig_err); opt_endpts receives the endpoints that go with it
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA_2 new_a, new_b;
+	IntEndptsRGBA_2 new_endpt;
+	int do_b;						// which endpoint to perturb next: 0 = A, 1 = B
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed since this channel's first improvement; if so, revisit every channel
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS])
+{
+	// Per region: gather the region's texels, then for every lsb combination measure the starting endpoints and let
+	// optimize_one() search from there. If nothing beats orig_err[region], the outputs repeat the caller's values.
+	Vector4 region_colors[Tile::TILE_TOTAL];
+	float region_weights[Tile::TILE_TOTAL];
+	int scratch_indices[Tile::TILE_TOTAL];	// written by map_colors, never read here
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		// gather colors and importance weights of the texels that belong to this region
+		int count = 0;
+
+		for (int y = 0; y < tile.size_y; ++y)
+		for (int x = 0; x < tile.size_x; ++x)
+		{
+			if (REGION(x, y, shapeindex) != region)
+				continue;
+			region_colors[count] = tile.data[y][x];
+			region_weights[count] = tile.importance_map[y][x];
+			++count;
+		}
+
+		// start from the caller's endpoints and error
+		IntEndptsRGBA_2 seed = orig_endpts[region];
+		IntEndptsRGBA_2 refined;
+		float lowest_err = orig_err[region];
+
+		opt_endpts[region] = seed;
+		opt_err[region] = lowest_err;
+
+		// walk every (a_lsb, b_lsb) combination
+		for (int lsbmode = 0; lsbmode < NLSBMODES; ++lsbmode)
+		{
+			seed.a_lsb = lsbmode & 1;
+			seed.b_lsb = (lsbmode >> 1) & 1;
+
+			// FLT_MAX as the cutoff: the seed's true error is wanted, so no early out
+			float seed_err = map_colors(region_colors, region_weights, count, seed, pattern_prec.region_precs[region], FLT_MAX, scratch_indices);
+			float refined_err = optimize_one(region_colors, region_weights, count, seed_err, seed, pattern_prec.region_precs[region], refined);
+
+			// only a strict improvement replaces the best so far
+			if (refined_err < lowest_err)
+			{
+				lowest_err = refined_err;
+				opt_err[region] = refined_err;
+				opt_endpts[region] = refined;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+
+     simplify the above given that there is no transform now and that endpoints will always fit
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+	// Encodes the tile into block for the given shape and returns the summed per-region error of the candidate that was emitted.
+	for (int sp = 0; sp < NPATTERNS; ++sp)	// both branches at the bottom return, so only the first pattern is ever tried
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);	// baseline candidate: the quantized input endpoints
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+
+		optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);	// expected_opt_err is only consulted by the disabled assert below
+
+		assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);	// opt_err measured here is what the comparison below uses
+		// (nreed) Commented out asserts because they go off all the time...not sure why
+		//for (int i=0; i<NREGIONS; ++i)
+		//	nvAssert(expected_opt_err[i] == opt_err[i]);
+		swap_indices(opt_endpts, opt_indices, shapeindex_best);
+
+		orig_toterr = opt_toterr = 0;
+		for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+		//nvAssert(opt_toterr <= orig_toterr);
+		// emit the optimized candidate only when it is strictly better; ties keep the baseline
+		if (opt_toterr < orig_toterr)
+		{
+			emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+			return opt_toterr;
+		}
+		else
+		{
+			emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+			return orig_toterr;
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 6).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	// saturate every component of v to the range [0, 255]
+	float *comps[4] = { &v.x, &v.y, &v.z, &v.w };
+	for (int c = 0; c < 4; ++c)
+	{
+		float &f = *comps[c];
+		if (f < 0.0f) f = 0.0f;
+		if (f > 255.0f) f = 255.0f;
+	}
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{	// palette[r][k] = Utils::lerp(A, B, k, 0, DENOM), computed straight from the float endpoints of region r
+	for (int r = 0; r < NREGIONS; ++r)
+		for (int k = 0; k < NINDICES; ++k)
+			palette[r][k] = Utils::lerp(endpts[r].A, endpts[r].B, k, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	// every texel inside size_x/size_y contributes the error of the best entry of its region's palette
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+
+		besterr = Utils::metric4(tile.data[y][x], palette[region][0]);
+
+		for (int i = 1; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{	// Fills endpts with unquantized endpoints for each region of the shape and returns map_colors() for them.
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector4 colors[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+		// collect the region's colors; mean holds their sum until it is divided by np further down
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);	// despite the name, w is 255
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+		// note: the simple cases above skip the clamp() applied at the end of the general case
+		mean /= float(np);
+
+		Vector4 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean, direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{	// exchange slots i and j in both parallel arrays, so every list1 value keeps its list2 companion
+	const float held1 = list1[i]; const int held2 = list2[i];
+	list1[i] = list1[j]; list2[i] = list2[j];
+	list1[j] = held1; list2[j] = held2;
+}
+
+float AVPCL::compress_mode6(const Tile &t, char *block)
+{
+	// How many of the roughly-ranked shapes get the expensive refine() pass. Sensible values are
+	// 1, NSHAPES/4 and NSHAPES; NSHAPES/4 (plus 3 or 4 more to be thorough) already catches nearly all cases.
+	const int NITEMS=1;
+
+	FltEndpts candidates[NSHAPES][NREGIONS];	// rough endpoints, one row per shape
+	float rough_err[NSHAPES];					// rough error per slot (reordered below)
+	int order[NSHAPES];							// shape number per slot, kept parallel to rough_err
+	char scratch[AVPCL::BLOCKSIZE];
+	float best = FLT_MAX;
+
+	// estimate every shape
+	for (int shape = 0; shape < NSHAPES; ++shape)
+	{
+		order[shape] = shape;
+		rough_err[shape] = rough(t, shape, candidates[shape]);
+	}
+
+	// partial selection: afterwards the NITEMS smallest rough errors sit, ascending, in the first slots
+	for (int head = 0; head < NITEMS; ++head)
+		for (int probe = head + 1; probe < NSHAPES; ++probe)
+			if (rough_err[probe] < rough_err[head])
+				swap(rough_err, order, head, probe);
+
+	// refine the front-runners in rank order, stopping early once the best error has reached zero
+	for (int rank = 0; rank < NITEMS && best > 0; ++rank)
+	{
+		const int shape = order[rank];
+		const float err = refine(t, shape, candidates[shape], scratch);
+
+		if (err < best)
+		{
+			memcpy(block, scratch, sizeof(scratch));
+			best = err;
+		}
+	}
+	return best;
+}
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode7.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode7.cpp
new file mode 100644
index 0000000..441c4ac
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_mode7.cpp
@@ -0,0 +1,1094 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg (jacob@cs.lth.se) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10000000 5555.1x4 64p 2bi (30b)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/debug.h"
+#include "nvmath/vector.inl"
+#include "nvmath/matrix.inl"
+#include "nvmath/fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <string.h>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bit widths for one channel, indexed by ABITINDEX(region) / BBITINDEX(region)
+};
+
+struct Pattern	// describes how one block layout is serialized (see write_header / read_header)
+{
+	ChanBits chan[NCHANNELS_RGBA];//  per-channel endpoint bit widths, used as the field sizes when writing/reading endpoints
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value; written first, using modebits bits
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+#define	NREGIONS  2
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha		xfm	mode  mb	(per channel: A,B widths of region 0, then A,B widths of region 1)
+	5,5,5,5,	5,5,5,5,	5,5,5,5,	5,5,5,5,	0,	0x80, 8, "",
+};
+
+struct RegionPrec	// per-channel endpoint precision for one region; callers add 1 when working in uncompressed (lsb included) space
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];	// precision of endpoint A, one entry per channel
+	int endpt_b_prec[NCHANNELS_RGBA];	// precision of endpoint B, one entry per channel
+};
+
+struct PatternPrec	// the precisions of all regions of one pattern
+{
+	RegionPrec region_precs[NREGIONS];	// indexed by region
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5,5,  5,5,5,5,  5,5,5,5,  5,5,5,5,	// region 0: A rgba, B rgba; region 1: A rgba, B rgba
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	// zero needs no bits at all, signed or not
+	if (n == 0)
+		return 0;
+
+	int count = 0;
+	if (n < 0)
+	{
+		// negative input requires issigned; shift until only -1 remains, then add the sign bit
+		nvAssert (issigned);
+		while (n < -1) { n >>= 1; ++count; }
+		return count + 1;
+	}
+	while (n != 0) { n >>= 1; ++count; }	// positive: shift until nothing is left ...
+	return issigned ? count + 1 : count;	// ... and add one more bit when signed
+}
+
+static void transform_forward(IntEndptsRGBA_2 ep[NREGIONS])
+{
+	nvUnreachable();	// stub: the single entry of patterns[] has transformed == 0, so presumably this is never reached
+}
+
+static void transform_inverse(IntEndptsRGBA_2 ep[NREGIONS])
+{
+	nvUnreachable();	// stub: decompress_mode7 calls this only when p.transformed is set, and patterns[] has transformed == 0
+}
+
+/*
+we're using this table to assign lsbs
+abgr	>=2	correct
+0000	0	0
+0001	0	0
+0010	0	0
+0011	1	x1
+0100	0	0
+0101	1	x1
+0110	1	x1
+0111	1	1
+1000	0	0
+1001	1	x0
+1010	1	x0
+1011	1	1
+1100	1	x0
+1101	1	1
+1110	1	1
+1111	1	1
+
+we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8.
+I choose to assign the lsbs so that the rgb channels are as good as possible.
+*/
+
+// 6666 ->5555.1, use the "correct" column above to assign the lsb
+static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts)
+{
+	// Drop the low bit of every channel and replace it by one shared lsb per endpoint.
+	// The shared lsb is 1 when at least two of the dropped r, g, b bits were 1; alpha is not counted.
+	int ones_a = 0;
+	int ones_b = 0;
+
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		const int full_a = endpts.A[ch];
+		const int full_b = endpts.B[ch];
+
+		if (ch != CHANNEL_A) { ones_a += full_a & 1; ones_b += full_b & 1; }
+
+		compr_endpts.A[ch] = full_a >> 1;
+		compr_endpts.B[ch] = full_b >> 1;
+		nvAssert (compr_endpts.A[ch] < 32);
+		nvAssert (compr_endpts.B[ch] < 32);
+	}
+
+	compr_endpts.a_lsb = ones_a >= 2;
+	compr_endpts.b_lsb = ones_b >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts)
+{
+	// widen each channel by one bit, filling the new low bit with the endpoint's shared lsb
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		endpts.A[ch] = (compr_endpts.A[ch] << 1) | compr_endpts.a_lsb;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		endpts.B[ch] = (compr_endpts.B[ch] << 1) | compr_endpts.b_lsb;
+}
+static void uncompress_endpoints(const IntEndptsRGBA_2 compr_endpts[NREGIONS], IntEndptsRGBA endpts[NREGIONS])
+{	// region-by-region wrapper around uncompress_one
+	int region = 0;
+	while (region < NREGIONS) { uncompress_one(compr_endpts[region], endpts[region]); ++region; }
+}
+
+static void compress_endpoints(const IntEndptsRGBA endpts[NREGIONS], IntEndptsRGBA_2 compr_endpts[NREGIONS])
+{	// region-by-region wrapper around compress_one
+	int region = 0;
+	while (region < NREGIONS) { compress_one(endpts[region], compr_endpts[region]); ++region; }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA_2 q_endpts[NREGIONS])
+{
+	// Quantize to one bit more than the pattern precision ("uncompressed space"),
+	// then let compress_one() turn the extra low bits into the shared per-endpoint lsbs.
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		const RegionPrec &prec = pattern_prec.region_precs[region];
+		const Vector4 &fa = endpts[region].A;
+		const Vector4 &fb = endpts[region].B;
+		const float a_in[NCHANNELS_RGBA] = { fa.x, fa.y, fa.z, fa.w };
+		const float b_in[NCHANNELS_RGBA] = { fb.x, fb.y, fb.z, fb.w };
+		IntEndptsRGBA wide;
+		for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+			wide.A[ch] = Utils::quantize(a_in[ch], prec.endpt_a_prec[ch]+1);
+		for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+			wide.B[ch] = Utils::quantize(b_in[ch], prec.endpt_b_prec[ch]+1);
+		compress_one(wide, q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that each region's anchor index (the one stored with one bit fewer) has a 0 high-order bit
+// anchor positions run from 0 at x=0 y=0 to 15 at x=3 y=3, so y = (position >> 2) & 3 and x = position & 3
+static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		// the region's anchor texel: write_indices stores its index with one bit fewer, so the high bit must be 0
+		const int anchor = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+		const int ax = POS_TO_X(anchor);
+		const int ay = POS_TO_Y(anchor);
+		nvAssert(REGION(ax,ay,shapeindex) == region);		// double check the table
+
+		if ((indices[ay][ax] & HIGH_INDEXBIT) == 0)
+			continue;	// nothing to do for this region
+		// exchange A and B (channel values and lsbs) ...
+		IntEndptsRGBA_2 &ep = endpts[region];
+		for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+		{
+			const int keep = ep.A[ch]; ep.A[ch] = ep.B[ch]; ep.B[ch] = keep;
+		}
+		const int keep_lsb = ep.a_lsb; ep.a_lsb = ep.b_lsb; ep.b_lsb = keep_lsb;
+
+		// ... and mirror every index of the region so each texel still refers to the same end of the reversed pair
+		for (int y = 0; y < Tile::TILE_H; y++)
+		for (int x = 0; x < Tile::TILE_W; x++)
+			if (REGION(x,y,shapeindex) == region) indices[y][x] = NINDICES - 1 - indices[y][x];
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;	// unconditional: neither endpts nor p is examined
+}
+
+static void write_header(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	// layout: mode bits, shape index, endpoints channel by channel (A then B for each region), then the lsbs
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		const ChanBits &widths = p.chan[ch];
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			out.write(endpts[region].A[ch], widths.nbitsizes[ABITINDEX(region)]);
+			out.write(endpts[region].B[ch], widths.nbitsizes[BBITINDEX(region)]);
+		}
+	}
+
+	for (int region = 0; region < NREGIONS; ++region) { out.write(endpts[region].a_lsb, 1); out.write(endpts[region].b_lsb, 1); }
+
+	nvAssert (out.getptr() == 98);	// the header is expected to end at bit 98
+}
+
+static void read_header(Bits &in, IntEndptsRGBA_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	// the value returned by getmode() is not needed here: this mode has exactly one pattern
+	(void) AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);	// the read pointer is expected right behind the mode bits
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	// endpoints arrive channel by channel, A before B for each region
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		const ChanBits &widths = p.chan[ch];
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			endpts[region].A[ch] = in.read(widths.nbitsizes[ABITINDEX(region)]);
+			endpts[region].B[ch] = in.read(widths.nbitsizes[BBITINDEX(region)]);
+		}
+	}
+
+	for (int region = 0; region < NREGIONS; ++region) { endpts[region].a_lsb = in.read(1); endpts[region].b_lsb = in.read(1); }
+	nvAssert (in.getptr() == 98);
+}
+
+// WORK PLACEHOLDER -- keep it simple for now
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	// Texels are written in row-major order with INDEXBITS bits each, except each region's anchor
+	// texel (SHAPEINDEX_TO_COMPRESSED_INDICES), which gets one bit fewer.
+	int anchors[NREGIONS];
+	for (int region = 0; region < NREGIONS; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			if (anchors[region] == pos) { width = INDEXBITS - 1; break; }
+		}
+
+		const int x = POS_TO_X(pos), y = POS_TO_Y(pos);
+		out.write(indices[y][x], width);
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// Mirror of write_indices: row-major texel order, INDEXBITS bits each, except each region's
+	// anchor texel (SHAPEINDEX_TO_COMPRESSED_INDICES), which is read with one bit fewer.
+	int anchors[NREGIONS];
+	for (int region = 0; region < NREGIONS; ++region)
+		anchors[region] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int width = INDEXBITS;
+		for (int region = 0; region < NREGIONS; ++region)
+		{
+			if (anchors[region] == pos) { width = INDEXBITS - 1; break; }
+		}
+
+		const int x = POS_TO_X(pos), y = POS_TO_Y(pos);
+		indices[y][x] = in.read(width);
+	}
+}
+
+static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	// serialize header, then indices, into block; together they are expected to fill exactly AVPCL::BITSIZE bits
+	Bits bitstream(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, bitstream);
+	write_indices(indices, shapeindex, bitstream);
+
+	nvAssert(bitstream.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGBA endpts;
+	// Builds the NINDICES-entry palette of one region from its compressed endpoints, one channel at a time.
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need a IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+	// same steps for channel 1 -> palette[].y
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+	// same steps for channel 2 -> palette[].z
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+	// same steps for channel 3 -> palette[].w
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = float(Utils::lerp(a, b, i, BIAS, DENOM));
+}
+
+// sign extend but only if it was transformed
+static void sign_extend(Pattern &p, IntEndptsRGBA_2 endpts[NREGIONS])
+{
+	nvUnreachable();	// stub: decompress_mode7 calls this only when p.transformed is set, and patterns[] has transformed == 0
+}
+
+void AVPCL::decompress_mode7(const char *block, Tile &t)
+{
+	// Decode order: header (endpoints, shape), per-region palettes, per-texel indices, then the palette lookup.
+	Bits bits(block, AVPCL::BITSIZE);
+
+	Pattern pattern;
+	IntEndptsRGBA_2 endpoints[NREGIONS];
+	int shape, pattern_index;
+	read_header(bits, endpoints, shape, pattern, pattern_index);
+
+	// only transformed patterns get their endpoints sign-extended and inverse-transformed
+	if (pattern.transformed)
+	{
+		sign_extend(pattern, endpoints);
+		transform_inverse(endpoints);
+	}
+
+	const PatternPrec &precs = pattern_precs[pattern_index];
+	Vector4 palettes[NREGIONS][NINDICES];
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized(endpoints[region], precs.region_precs[region], palettes[region]);
+
+	int texel_index[Tile::TILE_H][Tile::TILE_W];
+	read_indices(bits, shape, texel_index);
+	nvAssert(bits.getptr() == AVPCL::BITSIZE);
+
+	// every texel takes the entry of its region's palette selected by its index
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palettes[REGION(x,y,shape)][texel_index[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	// note: importance is accepted but not used by this overload; every color is weighted equally
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(colors[i], palette[j]) :
+									     Utils::metric4premult(colors[i], palette[j]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;	// signals "worse than current_err"; indices from i onward are -1
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	// texels outside size_x/size_y are skipped below but still read by swap_indices/write_indices: give them index 0
+	for (int y = 0; y < Tile::TILE_H; y++) for (int x = 0; x < Tile::TILE_W; x++) indices[y][x] = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) :
+										 Utils::metric4premult(tile.data[y][x], palette[region][i]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;	// per-region sum of each texel's best error
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts,
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+	// on return new_endpts is old_endpts with channel ch of one endpoint (A if do_b == 0, else B) moved to the best value found
+	IntEndptsRGBA_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;		// only read when improved is true, and every path that sets improved also sets beststep
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;	// candidate outside [0, 2^prec): skip it
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;	// candidate outside [0, 2^prec): skip it
+			}
+
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+	// Tries every (A, B) pair of channel ch in a window around opt_endpts; opt_endpts and indices change only on improvement.
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;	// stays -1 unless an improvement is found
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;	// the thresholds below are stated for np == TILE_TOTAL
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B (all four bounds are inclusive)
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	const bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin = opt_endpts.A[ch], bmin = opt_endpts.B[ch];	// replaced only when a better pair is found
+
+	if (a_le_b)
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b <= bhigh; ++b)	// bhigh is inclusive, exactly like ahigh
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b <= bhigh; ++b)	// bhigh is inclusive, exactly like ahigh
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// orig_err is a by-value parameter, so the improved error is reported through the return value only
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+	// opt_endpts starts as a copy of orig_endpts and is only ever changed together with a strictly lower opt_err
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA_2 new_a, new_b;
+	IntEndptsRGBA_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;	// no improvement on this channel: go to the next one
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;	// no improvement on this channel: go to the next one
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;	// equals orig_err when nothing improved
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS],
+							IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS])
+{
+	// Per region: gather its pixels, try every lsb mode, and keep whichever endpoints give the lowest error.
+	Vector4 region_pixels[Tile::TILE_TOTAL];
+	float region_weights[Tile::TILE_TOTAL];
+	int scratch_indices[Tile::TILE_TOTAL];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		// gather the texels (and their importance weights) that the shape assigns to this region
+		int count = 0;
+		for (int y = 0; y < tile.size_y; ++y)
+		{
+			for (int x = 0; x < tile.size_x; ++x)
+			{
+				if (REGION(x, y, shapeindex) != region)
+					continue;
+				region_pixels[count] = tile.data[y][x];
+				region_weights[count] = tile.importance_map[y][x];
+				++count;
+			}
+		}
+
+		// until something better turns up, the result is simply the input
+		IntEndptsRGBA_2 candidate = orig_endpts[region];
+		opt_endpts[region] = candidate;
+		opt_err[region] = orig_err[region];
+		float lowest_err = orig_err[region];
+
+		// walk through all lsb modes looking for better endpoints
+		for (int lsbmode = 0; lsbmode < NLSBMODES; ++lsbmode)
+		{
+			candidate.a_lsb = lsbmode & 1;
+			candidate.b_lsb = (lsbmode >> 1) & 1;
+
+			// exact error of the candidate: FLT_MAX as the cutoff keeps map_colors from stopping early
+			float candidate_err = map_colors(region_pixels, region_weights, count, candidate, pattern_prec.region_precs[region], FLT_MAX, scratch_indices);
+
+			// refine the candidate
+			IntEndptsRGBA_2 improved;
+			float improved_err = optimize_one(region_pixels, region_weights, count, candidate_err, candidate, pattern_prec.region_precs[region], improved);
+
+			// keep it only when it beats the best seen so far
+			if (improved_err < lowest_err)
+			{
+				lowest_err = opt_err[region] = improved_err;
+				opt_endpts[region] = improved;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)	// patterns are tried in order; the first whose endpoints fit emits the block and returns
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);	// undo the transform so optimization works on the untransformed endpoints
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }	// total error over all regions, before and after optimizing
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);	// re-apply the transform that was inverted before optimizing
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 7).";
+	return FLT_MAX;	// only reached when no pattern fit; block has not been written in that case
+}
+
+static void clamp(Vector4 &v)
+{
+	// Saturate every component of v to the range [0, 255], in place.
+	const float lo = 0.0f;
+	const float hi = 255.0f;
+
+	v.x = (v.x < lo) ? lo : ((v.x > hi) ? hi : v.x);
+	v.y = (v.y < lo) ? lo : ((v.y > hi) ? hi : v.y);
+	v.z = (v.z < lo) ? lo : ((v.z > hi) ? hi : v.z);
+	v.w = (v.w < lo) ? lo : ((v.w > hi) ? hi : v.w);
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	// palette[r][k] is step k (out of DENOM) of the interpolation from endpoint A to endpoint B of region r
+	for (int slot = 0; slot < NREGIONS * NINDICES; ++slot)
+		palette[slot / NINDICES][slot % NINDICES] = Utils::lerp(endpts[slot / NINDICES].A, endpts[slot / NINDICES].B, slot % NINDICES, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+// (toterr is the sum, over the tile's pixels, of the smallest Utils::metric4 distance found in that pixel's region palette)
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float besterr = FLT_MAX;	// smallest distance seen so far for this pixel
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			const float err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])	// fills endpts with a first estimate per region and returns its map_colors error
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;							// number of pixels the shape assigns to this region
+		Vector4 colors[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);				// accumulates the sum first; divided by np further down
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases: with fewer than three pixels the endpoints are taken directly, no fit is needed
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);	// note: x/y/z are zero but w is 255
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector4 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean, direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	const float held_value = list1[i]; const int held_tag = list2[i];	// entries i and j trade places in both parallel arrays
+	list1[i] = list1[j]; list2[i] = list2[j]; list1[j] = held_value; list2[j] = held_tag;
+}
+
+float AVPCL::compress_mode7(const Tile &t, char *block)
+{
+	// How many roughly-ranked shapes get the expensive refinement. Sensible settings are 1, NSHAPES/4 and NSHAPES;
+	// NSHAPES/4 catches nearly every case, and a few more (say 3 or 4) squeeze out the last bit.
+	const int NITEMS = NSHAPES / 4;
+
+	FltEndpts rough_endpts[NSHAPES][NREGIONS];	// rough endpoint estimate for every shape
+	float rough_err[NSHAPES];					// rough error per shape; reordered by the partial sort below
+	int order[NSHAPES];							// shape numbers, kept in step with rough_err
+	char candidate[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int shape = 0; shape < NSHAPES; ++shape)
+	{
+		order[shape] = shape;
+		rough_err[shape] = rough(t, shape, rough_endpts[shape]);
+	}
+
+	// partial exchange sort: only the first NITEMS positions need to end up holding the smallest rough errors
+	for (int i = 0; i < NITEMS; ++i)
+	{
+		for (int j = i + 1; j < NSHAPES; ++j)
+			if (rough_err[i] > rough_err[j])
+				swap(rough_err, order, i, j);
+	}
+
+	// refine the best-ranked shapes; the loop ends early once msebest is no longer above zero
+	for (int rank = 0; rank < NITEMS && msebest > 0; ++rank)
+	{
+		const int shape = order[rank];
+		const float mse = refine(t, shape, rough_endpts[shape], candidate);
+		if (!(mse < msebest))
+			continue;
+		memcpy(block, candidate, sizeof(candidate));
+		msebest = mse;
+	}
+
+	return msebest;
+}
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.cpp b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.cpp
new file mode 100644
index 0000000..af99711
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.cpp
@@ -0,0 +1,389 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "avpcl_utils.h"
+#include "avpcl.h"
+#include "nvmath/vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+static const int denom7_weights[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static const int denom15_weights[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+int Utils::lerp(int a, int b, int i, int bias, int denom)	// interpolate from a (i == 0) to b (i == denom)
+{
+#ifdef	USE_ZOH_INTERP
+	nvAssert (denom == 3 || denom == 7 || denom == 15);
+	nvAssert (i >= 0 && i <= denom);
+	nvAssert (bias >= 0 && bias <= denom/2);	// bias is only checked on this path, it does not enter the result
+	nvAssert (a >= 0 && b >= 0);
+
+	int round = 0;	// rounding term added before the >> 6 (divide by 64) below
+#ifdef	USE_ZOH_INTERP_ROUNDED
+	round = 32;
+#endif
+
+	switch (denom)
+	{
+	case 3:	denom *= 5; i *= 5;	// fall through to case 15 (step i of 3 is looked up as step 5*i of 15)
+	case 15:return (a*denom15_weights[denom-i] + b*denom15_weights[i] + round) >> 6;
+	case 7:	return (a*denom7_weights[denom-i] + b*denom7_weights[i] + round) >> 6;
+	default: nvUnreachable(); return 0;
+	}
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+
+Vector4 Utils::lerp(Vector4::Arg a, Vector4::Arg b, int i, int bias, int denom)	// Vector4 counterpart of the integer lerp
+{
+#ifdef	USE_ZOH_INTERP
+	nvAssert (denom == 3 || denom == 7 || denom == 15);
+	nvAssert (i >= 0 && i <= denom);
+	nvAssert (bias >= 0 && bias <= denom/2);	// bias is only checked on this path, it does not enter the result
+//	nvAssert (a >= 0 && b >= 0);
+
+	// no need to bias these as this is an exact (floating-point) division
+
+	switch (denom)
+	{
+	case 3:	denom *= 5; i *= 5;	// fall through to case 15 (step i of 3 is looked up as step 5*i of 15)
+	case 15:return (a*float(denom15_weights[denom-i]) + b*float(denom15_weights[i])) / 64.0f;
+	case 7:	return (a*float(denom7_weights[denom-i]) + b*float(denom7_weights[i])) / 64.0f;
+	default: nvUnreachable(); return Vector4(0);
+	}
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+
+
+int Utils::unquantize(int q, int prec)	// expand a prec-bit quantized value q to the 8-bit range
+{
+	int unq;
+
+	nvAssert (prec > 3);	// we only want to do one replicate
+
+#ifdef USE_ZOH_QUANT
+	if (prec >= 8)
+		unq = q;	// already at (or above) 8 bits: passed through unchanged
+	else if (q == 0) 
+		unq = 0;
+	else if (q == ((1<<prec)-1)) 
+		unq = 255;	// the largest code maps to exactly 255
+	else
+		unq = (q * 256 + 128) >> prec;
+#else
+	// avpcl unquantizer -- bit replicate
+	unq = (q << (8-prec)) | (q >> (2*prec-8));	// NOTE(review): both shift counts assume 4 <= prec <= 8; only prec > 3 is asserted above
+#endif
+
+	return unq;
+}
+
+// quantize to the best value -- i.e., minimize unquantize error; returns a code q with 0 <= q < (1 << prec)
+int Utils::quantize(float value, int prec)
+{
+	int q, unq;
+
+	nvAssert (prec > 3);	// we only want to do one replicate
+
+	unq = (int)floor(value + 0.5f);	// round value to the nearest integer
+	nvAssert (unq <= 255);
+
+#ifdef USE_ZOH_QUANT
+	q = (prec >= 8) ? unq : (unq << prec) / 256;
+#else
+	// avpcl quantizer -- scale properly for best possible bit-replicated result
+	q = (unq * ((1<<prec)-1) + 127)/255;
+#endif
+
+	nvAssert (q >= 0 && q < (1 << prec));
+
+	return q;
+}
+
+float Utils::metric4(Vector4::Arg a, Vector4::Arg b)
+{
+	// Squared length of a - b. With either nonuniform flag set, the x, y and z differences are
+	// first scaled by per-channel weights; the w difference is never scaled.
+	Vector4 diff = a - b;
+
+	if (!(AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati))
+		return lengthSquared(diff);
+
+	// flag_nonuniform takes precedence over flag_nonuniform_ati when both are set
+	float wx = 0.3086f;
+	float wy = 0.6094f;
+	float wz = 0.0820f;
+	if (AVPCL::flag_nonuniform)
+	{
+		wx = 0.299f;
+		wy = 0.587f;
+		wz = 0.114f;
+	}
+
+	diff.x *= wx;
+	diff.y *= wy;
+	diff.z *= wz;
+	return lengthSquared(diff);
+}
+
+// WORK -- implement rotatemode for the below -- that changes where the rwt, gwt, and bwt's go.
+float Utils::metric3(Vector3::Arg a, Vector3::Arg b, int rotatemode)
+{
+	Vector3 err = a - b;
+	// returns the squared length of err; x/y/z are weighted first when a nonuniform flag is set
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/	// plain else: the weights are assigned on every path
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode: the slot that holds alpha gets weight 1
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: rwt = 1.0f; break;
+		case ROTATEMODE_RGBA_RABG: gwt = 1.0f; break;
+		case ROTATEMODE_RGBA_RGAB: bwt = 1.0f; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric1(const float a, const float b, int rotatemode)
+{
+	float err = a - b;
+	// returns err squared; with a nonuniform flag set, err is first scaled by the weight rotatemode selects
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt, awt = 1.0f;	// awt has a defined value even if rotatemode is out of range
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/	// plain else: the weights are assigned on every path
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode: the scalar takes the weight of the channel it was swapped with
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
+
+float Utils::premult(float r, float a)
+{
+	// both arguments are whole numbers carried in floats; the assert below checks exactly that
+	const int R = int(r);
+	const int A = int(a);
+	nvAssert ((R==r) && (A==a));
+	const int scaled = R*A + 127;
+	return float(scaled/255);
+}
+
+static void premult4(Vector4& rgba)
+{
+	// premultiply x, y and z by w, in that order; w itself is left untouched
+	float *const channels[3] = { &rgba.x, &rgba.y, &rgba.z };
+	for (int c = 0; c < 3; ++c) *channels[c] = Utils::premult(*channels[c], rgba.w);
+}
+
+static void premult3(Vector3& rgb, float a)
+{
+	// premultiply x, y and z, in that order, by the separately supplied alpha
+	float *const channels[3] = { &rgb.x, &rgb.y, &rgb.z };
+	for (int c = 0; c < 3; ++c) *channels[c] = Utils::premult(*channels[c], a);
+}
+
+float Utils::metric4premult(Vector4::Arg a, Vector4::Arg b)
+{
+	// Squared length of the difference of the two colours after each has had x/y/z premultiplied
+	// by its own w. With either nonuniform flag set, the x/y/z differences are weighted first.
+	Vector4 lhs = a;
+	Vector4 rhs = b;
+	premult4(lhs);
+	premult4(rhs);
+
+	Vector4 diff = lhs - rhs;
+
+	if (!(AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati))
+		return lengthSquared(diff);
+
+	// flag_nonuniform takes precedence over flag_nonuniform_ati when both are set
+	float wx = 0.3086f;
+	float wy = 0.6094f;
+	float wz = 0.0820f;
+	if (AVPCL::flag_nonuniform)
+	{
+		wx = 0.299f;
+		wy = 0.587f;
+		wz = 0.114f;
+	}
+
+	diff.x *= wx;
+	diff.y *= wy;
+	diff.z *= wz;
+	return lengthSquared(diff);
+}
+
+float Utils::metric3premult_alphaout(Vector3::Arg rgb0, float a0, Vector3::Arg rgb1, float a1)
+{
+	// Squared length of the difference of the two colours after premultiplying rgb0 by a0 and
+	// rgb1 by a1. With either nonuniform flag set, the x/y/z differences are weighted first.
+	Vector3 lhs = rgb0;
+	Vector3 rhs = rgb1;
+	premult3(lhs, a0);
+	premult3(rhs, a1);
+
+	Vector3 diff = lhs - rhs;
+
+	if (!(AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati))
+		return lengthSquared(diff);
+
+	// flag_nonuniform takes precedence over flag_nonuniform_ati when both are set
+	float wx = 0.3086f;
+	float wy = 0.6094f;
+	float wz = 0.0820f;
+	if (AVPCL::flag_nonuniform)
+	{
+		wx = 0.299f;
+		wy = 0.587f;
+		wz = 0.114f;
+	}
+
+	diff.x *= wx;
+	diff.y *= wy;
+	diff.z *= wz;
+	return lengthSquared(diff);
+}
+
+float Utils::metric3premult_alphain(Vector3::Arg rgb0, Vector3::Arg rgb1, int rotatemode)	// premultiplied metric for vectors where one slot holds alpha
+{
+	Vector3 pma = rgb0, pmb = rgb1;
+
+	switch(rotatemode)	// rotatemode says which component holds alpha; the other two are premultiplied by it
+	{
+	case ROTATEMODE_RGBA_RGBA:
+		// this function isn't supposed to be called for this rotatemode
+		nvUnreachable();
+		break;
+	case ROTATEMODE_RGBA_AGBR:	// alpha is in x
+		pma.y = premult(pma.y, pma.x);
+		pma.z = premult(pma.z, pma.x);
+		pmb.y = premult(pmb.y, pmb.x);
+		pmb.z = premult(pmb.z, pmb.x);
+		break;
+	case ROTATEMODE_RGBA_RABG:	// alpha is in y
+		pma.x = premult(pma.x, pma.y);
+		pma.z = premult(pma.z, pma.y);
+		pmb.x = premult(pmb.x, pmb.y);
+		pmb.z = premult(pmb.z, pmb.y);
+		break;
+	case ROTATEMODE_RGBA_RGAB:	// alpha is in z
+		pma.x = premult(pma.x, pma.z);
+		pma.y = premult(pma.y, pma.z);
+		pmb.x = premult(pmb.x, pmb.z);
+		pmb.y = premult(pmb.y, pmb.z);
+		break;
+	default: nvUnreachable();
+	}
+
+	Vector3 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away (note: unlike metric3, the weights are not adjusted by rotatemode here)
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode)
+{
+	float err = premult(rgb0, a0) - premult(rgb1, a1);
+	// returns err squared; with a nonuniform flag set, err is first scaled by the weight rotatemode selects
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt, awt = 1.0f;	// awt has a defined value even if rotatemode is out of range
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/	// plain else: the weights are assigned on every path
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode: the scalar takes the weight of the channel it was swapped with
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.h b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.h
new file mode 100644
index 0000000..cb546d5
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/avpcl_utils.h
@@ -0,0 +1,61 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#ifndef _AVPCL_UTILS_H
+#define _AVPCL_UTILS_H
+
+#include "nvmath/vector.h"
+
+namespace AVPCL {
+
+inline int SIGN_EXTEND(int x, int nb) { return ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x)); }
+
+static const int INDEXMODE_BITS				= 1;		// 2 different index modes
+static const int NINDEXMODES				= (1<<(INDEXMODE_BITS));	// number of index modes
+static const int INDEXMODE_ALPHA_IS_3BITS	= 0;		// NOTE(review): presumably alpha indices take 3 bits in this mode -- confirm at the use sites
+static const int INDEXMODE_ALPHA_IS_2BITS	= 1;		// NOTE(review): presumably alpha indices take 2 bits in this mode -- confirm at the use sites
+
+static const int ROTATEMODE_BITS		= 2;		// 4 different rotate modes
+static const int NROTATEMODES			= (1<<(ROTATEMODE_BITS));	// number of rotate modes
+static const int ROTATEMODE_RGBA_RGBA	= 0;		// no swap: Utils::metric1 weighs the scalar channel by 1
+static const int ROTATEMODE_RGBA_AGBR	= 1;		// alpha swapped with red: Utils::metric1 uses the red weight
+static const int ROTATEMODE_RGBA_RABG	= 2;		// alpha swapped with green: Utils::metric1 uses the green weight
+static const int ROTATEMODE_RGBA_RGAB	= 3;		// alpha swapped with blue: Utils::metric1 uses the blue weight
+
+class Utils	// static helpers only: error metrics, premultiplication, (un)quantization and interpolation
+{
+public:
+	// error metrics: squared differences; x/y/z are weighted when AVPCL::flag_nonuniform or flag_nonuniform_ati is set
+	static float metric4(nv::Vector4::Arg a, nv::Vector4::Arg b);
+	static float metric3(nv::Vector3::Arg a, nv::Vector3::Arg b, int rotatemode);
+	static float metric1(float a, float b, int rotatemode);
+	// the same metrics, computed after the colour values have been premultiplied by alpha
+	static float metric4premult(nv::Vector4::Arg rgba0, nv::Vector4::Arg rgba1);
+	static float metric3premult_alphaout(nv::Vector3::Arg rgb0, float a0, nv::Vector3::Arg rgb1, float a1);
+	static float metric3premult_alphain(nv::Vector3::Arg rgb0, nv::Vector3::Arg rgb1, int rotatemode);
+	static float metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode);
+	// (r*a + 127)/255 in integer arithmetic; both arguments must hold whole numbers
+	static float premult(float r, float a);
+
+	// quantization and unquantization (prec is the number of bits of the quantized value)
+	static int unquantize(int q, int prec);
+	static int quantize(float value, int prec);
+
+	// lerping: step i out of denom on the way from a to b
+	static int lerp(int a, int b, int i, int bias, int denom);
+	static nv::Vector4 lerp(nv::Vector4::Arg a, nv::Vector4::Arg b, int i, int bias, int denom);
+};
+
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/bits.h b/3rdparty/bimg/3rdparty/nvtt/bc7/bits.h
new file mode 100644
index 0000000..782f655
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/bits.h
@@ -0,0 +1,76 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_BITS_H
+#define _AVPCL_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/debug.h"
+
+namespace AVPCL {
+
+class Bits	// bit-granular reader/writer over a caller-owned byte buffer; bits are packed least-significant first within each byte
+{
+public:
+	// writable stream over data (starts empty) / read-only stream over data (availdatabits bits available)
+	Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; cbits = data; maxbits = maxdatabits; readonly = 0;}
+	Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; bits = 0; cbits = data; maxbits = availdatabits; readonly = 1;}
+
+	void write(int value, int nbits) {	// append the low nbits bits of value, lowest bit first
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	int read(int nbits) { 	// read nbits bits; the first bit read becomes bit 0 of the result
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	int getptr() { return bptr; }
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	int getsize() { return bend; }
+
+private:
+	int	bptr;		// next bit to read or write
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream (null for a read-only stream)
+	const char *cbits;	// ptr to const user bit stream
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	int readone() {	// returns the next bit (0 or 1); returns 0 without advancing once the end is reached
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	void writeone(int bit) {	// stores bit&1 at bptr; ignored for a read-only or full stream
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		if (readonly || bptr >= maxbits) return;	// never write through the null 'bits' of a read-only stream
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/endpts.h b/3rdparty/bimg/3rdparty/nvtt/bc7/endpts.h
new file mode 100644
index 0000000..4f42a16
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/endpts.h
@@ -0,0 +1,81 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_ENDPTS_H
+#define _AVPCL_ENDPTS_H
+
+// endpoint definitions and routines to search through endpoint space
+
+#include "nvmath/vector.h"
+
+namespace AVPCL {
+
+static const int NCHANNELS_RGB	= 3;	// length of the A[]/B[] arrays in the RGB endpoint structs below
+static const int NCHANNELS_RGBA	= 4;	// length of the A[]/B[] arrays in the RGBA endpoint structs below
+static const int CHANNEL_R		= 0;	// NOTE(review): CHANNEL_* are presumably the indices into those A[]/B[] arrays -- confirm at the use sites
+static const int CHANNEL_G		= 1;
+static const int CHANNEL_B		= 2;
+static const int CHANNEL_A		= 3;
+
+struct FltEndpts	// a pair of floating-point (unquantized) endpoints
+{
+	nv::Vector4	A;
+	nv::Vector4	B;
+};
+
+struct IntEndptsRGB	// integer endpoint pair, three channels, no separate lsb
+{
+	int		A[NCHANNELS_RGB];	// endpoint A, one value per channel
+	int		B[NCHANNELS_RGB];	// endpoint B, one value per channel
+};
+
+struct IntEndptsRGB_1	// integer endpoint pair, three channels, one lsb shared by both endpoints
+{
+	int		A[NCHANNELS_RGB];	// endpoint A, one value per channel
+	int		B[NCHANNELS_RGB];	// endpoint B, one value per channel
+	int		lsb;				// shared lsb for A and B
+};
+
+struct IntEndptsRGB_2	// integer endpoint pair, three channels, one lsb per endpoint
+{
+	int		A[NCHANNELS_RGB];	// endpoint A, one value per channel
+	int		B[NCHANNELS_RGB];	// endpoint B, one value per channel
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+
+struct IntEndptsRGBA	// integer endpoint pair, four channels, no separate lsb
+{
+	int		A[NCHANNELS_RGBA];	// endpoint A, one value per channel
+	int		B[NCHANNELS_RGBA];	// endpoint B, one value per channel
+};
+
+struct IntEndptsRGBA_2	// integer endpoint pair, four channels, one lsb per endpoint
+{
+	int		A[NCHANNELS_RGBA];	// endpoint A, one value per channel
+	int		B[NCHANNELS_RGBA];	// endpoint B, one value per channel
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+struct IntEndptsRGBA_2a	// integer endpoint pair, four channels; each lsb applies to the RGB channels of its endpoint only
+{
+	int		A[NCHANNELS_RGBA];	// endpoint A, one value per channel
+	int		B[NCHANNELS_RGBA];	// endpoint B, one value per channel
+	int		a_lsb;				// lsb for RGB channels of A
+	int		b_lsb;				// lsb for RGB channels of B
+};
+
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/shapes_three.h b/3rdparty/bimg/3rdparty/nvtt/bc7/shapes_three.h
new file mode 100644
index 0000000..dc95ba5
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/shapes_three.h
@@ -0,0 +1,132 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef	_AVPCL_SHAPES_THREE_H
+#define _AVPCL_SHAPES_THREE_H
+
+// shapes for 3 regions
+
+#define NREGIONS 3
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 	// region number of each of the 16 texels of every shape, in the layout REGION() indexes: 4 shapes side by side per row, 4 rows per group
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 2, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   2, 0, 0, 1,   0, 0, 2, 2,   
+0, 2, 2, 1,   2, 2, 1, 1,   2, 2, 1, 1,   0, 0, 1, 1,   
+2, 2, 2, 2,   2, 2, 2, 1,   2, 2, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 2,   
+0, 0, 0, 0,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 1, 2,   
+1, 1, 1, 1,   1, 1, 1, 1,   2, 2, 2, 2,   0, 0, 1, 2,   
+2, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 2,   0, 0, 1, 2,   
+
+0, 1, 1, 2,   0, 1, 2, 2,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   0, 1, 1, 2,   2, 0, 0, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 1, 2, 2,   2, 2, 0, 0,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 2, 2, 2,   2, 2, 2, 0,   
+
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+0, 1, 1, 2,   2, 0, 0, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+1, 1, 2, 2,   2, 2, 0, 0,   1, 1, 2, 2,   1, 1, 1, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   1, 1, 0, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+
+0, 1, 2, 2,   0, 0, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 2, 2,   0, 0, 1, 2,   1, 2, 2, 1,   0, 1, 1, 0,   
+0, 0, 1, 1,   1, 1, 2, 2,   1, 2, 2, 1,   1, 2, 2, 1,   
+0, 0, 0, 0,   2, 2, 2, 2,   0, 1, 1, 0,   1, 2, 2, 1,   
+
+0, 0, 2, 2,   0, 1, 1, 0,   0, 0, 1, 1,   0, 0, 0, 0,   
+1, 1, 0, 2,   0, 1, 1, 0,   0, 1, 2, 2,   2, 0, 0, 0,   
+1, 1, 0, 2,   2, 0, 0, 2,   0, 1, 2, 2,   2, 2, 1, 1,   
+0, 0, 2, 2,   2, 2, 2, 2,   0, 0, 1, 1,   2, 2, 2, 1,   
+
+0, 0, 0, 0,   0, 2, 2, 2,   0, 0, 1, 1,   0, 1, 2, 0,   
+0, 0, 0, 2,   0, 0, 2, 2,   0, 0, 1, 2,   0, 1, 2, 0,   
+1, 1, 2, 2,   0, 0, 1, 2,   0, 0, 2, 2,   0, 1, 2, 0,   
+1, 2, 2, 2,   0, 0, 1, 1,   0, 2, 2, 2,   0, 1, 2, 0,   
+
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+1, 1, 1, 1,   1, 2, 0, 1,   2, 0, 1, 2,   2, 2, 0, 0,   
+2, 2, 2, 2,   2, 0, 1, 2,   1, 2, 0, 1,   1, 1, 2, 2,   
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+1, 1, 2, 2,   0, 1, 0, 1,   0, 0, 0, 0,   1, 1, 2, 2,   
+2, 2, 0, 0,   2, 2, 2, 2,   2, 1, 2, 1,   0, 0, 2, 2,   
+0, 0, 1, 1,   2, 2, 2, 2,   2, 1, 2, 1,   1, 1, 2, 2,   
+
+0, 0, 2, 2,   0, 2, 2, 0,   0, 1, 0, 1,   0, 0, 0, 0,   
+0, 0, 1, 1,   1, 2, 2, 1,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 2, 2,   0, 2, 2, 0,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 1, 1,   1, 2, 2, 1,   0, 1, 0, 1,   2, 1, 2, 1,   
+
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   0, 0, 0, 0,   
+0, 1, 0, 1,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   2, 1, 1, 2,   
+2, 2, 2, 2,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+
+0, 2, 2, 2,   0, 0, 0, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   2, 1, 1, 2,   
+0, 2, 2, 2,   0, 0, 0, 2,   2, 2, 2, 2,   2, 1, 1, 2,   
+
+0, 1, 1, 0,   0, 0, 2, 2,   0, 0, 2, 2,   0, 0, 0, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 2, 2,   0, 0, 2, 2,   2, 1, 1, 2,   
+
+0, 0, 0, 2,   0, 2, 2, 2,   0, 1, 0, 1,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 0, 1, 1,   
+0, 0, 0, 2,   0, 2, 2, 2,   2, 2, 2, 2,   2, 2, 0, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 0,
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]	// region of texel (x,y) in shape si: si&3 picks the 4-wide column group, si>>2 the 64-entry group of four rows, y the row inside it
+
+static int shapeindex_to_compressed_indices[NSHAPES*3] = 	// three entries per shape, one per region; read through SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)
+{
+	0, 3,15,  0, 3, 8,  0,15, 8,  0,15, 3,
+	0, 8,15,  0, 3,15,  0,15, 3,  0,15, 8,
+	0, 8,15,  0, 8,15,  0, 6,15,  0, 6,15,
+	0, 6,15,  0, 5,15,  0, 3,15,  0, 3, 8,
+
+	0, 3,15,  0, 3, 8,  0, 8,15,  0,15, 3,
+	0, 3,15,  0, 3, 8,  0, 6,15,  0,10, 8,
+	0, 5, 3,  0, 8,15,  0, 8, 6,  0, 6,10,
+	0, 8,15,  0, 5,15,  0,15,10,  0,15, 8,
+
+	0, 8,15,  0,15, 3,  0, 3,15,  0, 5,10,
+	0, 6,10,  0,10, 8,  0, 8, 9,  0,15,10,
+	0,15, 6,  0, 3,15,  0,15, 8,  0, 5,15,
+	0,15, 3,  0,15, 6,  0,15, 6,  0,15, 8,
+
+	0, 3,15,  0,15, 3,  0, 5,15,  0, 5,15,
+	0, 5,15,  0, 8,15,  0, 5,15,  0,10,15,
+	0, 5,15,  0,10,15,  0, 8,15,  0,13,15,
+	0,15, 3,  0,12,15,  0, 3,15,  0, 3, 8
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*3+(region)]	// table entry for region (0..2) of shape si
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/shapes_two.h b/3rdparty/bimg/3rdparty/nvtt/bc7/shapes_two.h
new file mode 100644
index 0000000..853d557
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/shapes_two.h
@@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_SHAPES_TWO_H
+#define _AVPCL_SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+static int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/bc7/tile.h b/3rdparty/bimg/3rdparty/nvtt/bc7/tile.h
new file mode 100644
index 0000000..730d9ba
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/bc7/tile.h
@@ -0,0 +1,41 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_TILE_H
+#define _AVPCL_TILE_H
+
+#include "nvmath/vector.h"
+#include <math.h>
+#include "avpcl_utils.h"
+
+namespace AVPCL {
+
+// extract a tile of pixels from an array
+
+class Tile
+{
+public:
+	static const int TILE_H = 4;						// rows in data / importance_map
+	static const int TILE_W = 4;						// columns in data / importance_map
+	static const int TILE_TOTAL = TILE_H * TILE_W;		// element count of one tile
+	nv::Vector4 data[TILE_H][TILE_W];
+	float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;									// actual size of tile
+
+	Tile() {}											// size_x / size_y are left unset
+	~Tile() {}
+	Tile(int xs, int ys) : size_x(xs), size_y(ys) {}	// stores the size; arrays are not filled here
+};
+
+}
+
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/array.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/array.h
new file mode 100644
index 0000000..f4460f3
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/array.h
@@ -0,0 +1,181 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_ARRAY_H
+#define NV_CORE_ARRAY_H
+
+/*
+This array class requires the elements to be relocatable; it uses memmove and realloc. Ideally I should be
+using swap, but I honestly don't care. The only thing that you should be aware of is that internal pointers
+are not supported.
+
+Note also that push_back and resize do not support inserting elements that are in the same
+container. This is forbidden to prevent an extra copy.
+*/
+
+
+#include "memory.h"
+#include "debug.h"
+#include "foreach.h" // pseudoindex
+
+
+namespace nv 
+{
+    class Stream;
+
+    /**
+    * Replacement for std::vector that is easier to debug and provides
+    * some nice foreach enumerators. 
+    */
+    template<typename T>
+    class NVCORE_CLASS Array {
+    public:
+        typedef uint size_type;
+
+        // Default constructor.
+        NV_FORCEINLINE Array() : m_buffer(NULL), m_capacity(0), m_size(0) {}
+
+        // Copy constructor.
+        NV_FORCEINLINE Array(const Array & a) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(a.m_buffer, a.m_size);
+        }
+
+        // Constructor that initializes the vector with the given elements.
+        NV_FORCEINLINE Array(const T * ptr, uint num) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(ptr, num);
+        }
+
+        // Allocate array.
+        NV_FORCEINLINE explicit Array(uint capacity) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            setArrayCapacity(capacity);
+        }
+
+        // Destructor.
+        NV_FORCEINLINE ~Array() {
+            clear();
+            free<T>(m_buffer);
+        }
+
+
+        /// Const element access.
+        NV_FORCEINLINE const T & operator[]( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        NV_FORCEINLINE const T & at( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Element access.
+        NV_FORCEINLINE T & operator[] ( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        NV_FORCEINLINE T & at( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint size() const { return m_size; }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint count() const { return m_size; }
+
+        /// Get vector capacity.
+        NV_FORCEINLINE uint capacity() const { return m_capacity; }
+
+        /// Get const vector pointer.
+        NV_FORCEINLINE const T * buffer() const { return m_buffer; }
+
+        /// Get vector pointer.
+        NV_FORCEINLINE T * buffer() { return m_buffer; }
+
+        /// Provide begin/end pointers for C++11 range-based for loops.
+        NV_FORCEINLINE T * begin() { return m_buffer; }
+        NV_FORCEINLINE T * end() { return m_buffer + m_size; }
+        NV_FORCEINLINE const T * begin() const { return m_buffer; }
+        NV_FORCEINLINE const T * end() const { return m_buffer + m_size; }
+
+        /// Is vector empty.
+        NV_FORCEINLINE bool isEmpty() const { return m_size == 0; }
+
+        /// Is a null vector.
+        NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; }
+
+
+        T & append();
+        void push_back( const T & val );
+        void pushBack( const T & val );
+        Array<T> & append( const T & val );
+        Array<T> & operator<< ( T & t );
+        void pop_back();
+        void popBack(uint count = 1);
+        void popFront(uint count = 1);
+        const T & back() const;
+        T & back();
+        const T & front() const;
+        T & front();
+        bool contains(const T & e) const;
+        bool find(const T & element, uint * indexPtr) const;
+        bool find(const T & element, uint begin, uint end, uint * indexPtr) const;
+        void removeAt(uint index);
+        bool remove(const T & element);
+        void insertAt(uint index, const T & val = T());
+        void append(const Array<T> & other);
+        void append(const T other[], uint count);
+        void replaceWithLast(uint index);
+        void resize(uint new_size);
+        void resize(uint new_size, const T & elem);
+        void fill(const T & elem);
+        void clear();
+        void shrink();
+        void reserve(uint desired_size);
+        void copy(const T * data, uint count);
+        Array<T> & operator=( const Array<T> & a );
+        T * release();
+
+
+        // Array enumerator.
+        typedef uint PseudoIndex;
+
+        NV_FORCEINLINE PseudoIndex start() const { return 0; }
+        NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; }
+        NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; }
+
+#if NV_CC_MSVC
+        NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) {
+            return m_buffer[i(this)];
+        }
+        NV_FORCEINLINE const T & operator[]( const PseudoIndexWrapper & i ) const {
+            return m_buffer[i(this)];
+        }
+#endif
+
+        // Friends.
+        template <typename Typ> 
+        friend Stream & operator<< ( Stream & s, Array<Typ> & p );
+
+        template <typename Typ>
+        friend void swap(Array<Typ> & a, Array<Typ> & b);
+
+
+    protected:
+
+        void setArraySize(uint new_size);
+        void setArrayCapacity(uint new_capacity);
+
+        T * m_buffer;
+        uint m_capacity;
+        uint m_size;
+
+    };
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/array.inl b/3rdparty/bimg/3rdparty/nvtt/nvcore/array.inl
new file mode 100644
index 0000000..2138b3a
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/array.inl
@@ -0,0 +1,437 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_ARRAY_INL
+#define NV_CORE_ARRAY_INL
+
+#include "array.h"
+
+#include "stream.h"
+#include "utils.h" // swap
+
+#include <string.h>	// memmove
+#include <new> // for placement new
+
+
+
+namespace nv 
+{
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::append()
+    {
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        construct_range(m_buffer, new_size, old_size);
+
+        return m_buffer[old_size]; // Return reference to last element.
+    }
+
+    // Push an element at the end of the vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::push_back( const T & val )
+    {
+#if 1
+        nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size);
+
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        construct_range(m_buffer, new_size, old_size, val);
+#else
+        uint new_size = m_size + 1;
+
+        if (new_size > m_capacity)
+        {
+            // @@ Is there any way to avoid this copy?
+            // @@ Can we create a copy without side effects? Ie. without calls to constructor/destructor. Use alloca + memcpy?
+            // @@ Assert instead of copy?
+            const T copy(val);	// create a copy in case value is inside of this array.
+
+            setArraySize(new_size);
+
+            new (m_buffer+new_size-1) T(copy);
+        }
+        else
+        {
+            m_size = new_size;
+            new(m_buffer+new_size-1) T(val);
+        }
+#endif // 0/1
+    }
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pushBack( const T & val )
+    {
+        push_back(val);
+    }
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::append( const T & val )
+    {
+        push_back(val);
+        return *this;
+    }
+
+    // Qt like push operator.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator<< ( T & t )
+    {
+        push_back(t);
+        return *this;
+    }
+
+    // Pop the element at the end of the vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pop_back()
+    {
+        nvDebugCheck( m_size > 0 );
+        resize( m_size - 1 );
+    }
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popBack(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        resize(m_size - count);
+    }
+
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popFront(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        //resize(m_size - count);
+
+        if (m_size == count) {
+            clear();
+        }
+        else {
+            destroy_range(m_buffer, 0, count);
+
+            memmove(m_buffer, m_buffer + count, sizeof(T) * (m_size - count));
+
+            m_size -= count;
+        }
+
+    }
+
+
+    // Get back element.
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::back() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get back element.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::back()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get front element.
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::front() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Get front element.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::front()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Check if the given element is contained in the array.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::contains(const T & e) const
+    {
+        return find(e, NULL);
+    }
+
+    // Return true if element found.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint * indexPtr) const
+    {
+        return find(element, 0, m_size, indexPtr);
+    }
+
+    // Return true if element found within the given range.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint begin, uint end, uint * indexPtr) const
+    {
+        return ::nv::find(element, m_buffer, begin, end, indexPtr);
+    }
+
+
+    // Remove the element at the given index and shift the tail down by one. This is an expensive operation!
+    template <typename T>
+    void Array<T>::removeAt(uint index)
+    {
+        nvDebugCheck(index < m_size);   // index is unsigned, so only the upper bound needs checking
+
+        if (m_size == 1) {
+            clear();
+        }
+        else {
+            m_buffer[index].~T();
+
+            memmove(m_buffer+index, m_buffer+index+1, sizeof(T) * (m_size - 1 - index));
+            m_size--;
+        }
+    }
+
+    // Remove the first instance of the given element.
+    template <typename T>
+    bool Array<T>::remove(const T & element)
+    {
+        uint index;
+        if (find(element, &index)) {
+            removeAt(index);
+            return true;
+        }
+        return false;
+    }
+
+    // Insert the given element at the given index shifting all the elements up.
+    template <typename T>
+    void Array<T>::insertAt(uint index, const T & val/*=T()*/)
+    {
+        nvDebugCheck( index <= m_size );    // unsigned index: only the upper bound matters; index == m_size appends
+
+        setArraySize(m_size + 1);
+
+        if (index < m_size - 1) {
+            memmove(m_buffer+index+1, m_buffer+index, sizeof(T) * (m_size - 1 - index));
+        }
+
+        // Copy-construct into the newly opened slot.
+        new(m_buffer+index) T(val);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::append(const Array<T> & other)
+    {
+        append(other.m_buffer, other.m_size);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    void Array<T>::append(const T other[], uint count)
+    {
+        if (count > 0) {
+            const uint old_size = m_size;
+
+            setArraySize(m_size + count);
+
+            for (uint i = 0; i < count; i++ ) {
+                new(m_buffer + old_size + i) T(other[i]);
+            }
+        }
+    }
+
+
+    // Remove the given element by replacing it with the last one.
+    template <typename T> 
+    void Array<T>::replaceWithLast(uint index)
+    {
+        nvDebugCheck( index < m_size );
+        nv::swap(m_buffer[index], back());      // @@ Is this OK when index == size-1?
+        (m_buffer+m_size-1)->~T();
+        m_size--;
+    }
+
+    // Resize the vector preserving existing elements.
+    template <typename T> 
+    void Array<T>::resize(uint new_size)
+    {
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call default constructors
+        construct_range(m_buffer, new_size, old_size);
+    }
+
+
+    // Resize the vector preserving existing elements and initializing the
+    // new ones with the given value. elem must not be an element of this array.
+    template <typename T>
+    void Array<T>::resize(uint new_size, const T & elem)
+    {
+        nvDebugCheck(&elem < m_buffer || &elem >= m_buffer+m_size);    // same half-open range test as push_back
+
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call copy constructors
+        construct_range(m_buffer, new_size, old_size, elem);
+    }
+
+    // Fill array with the given value (assigns elem to each of the m_size elements).
+    template <typename T>
+    void Array<T>::fill(const T & elem)
+    {
+        for (uint i = 0; i < m_size; i++) { m_buffer[i] = elem; }   // an unqualified fill(...) call would name this member
+    }
+
+    // Clear the buffer.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::clear()
+    {
+        nvDebugCheck(isValidPtr(m_buffer));
+
+        // Destruct old elements
+        destroy_range(m_buffer, 0, m_size);
+
+        m_size = 0;
+    }
+
+    // Shrink the allocated vector.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::shrink()
+    {
+        if (m_size < m_capacity) {
+            setArrayCapacity(m_size);
+        }
+    }
+
+    // Preallocate space.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::reserve(uint desired_size)
+    {
+        if (desired_size > m_capacity) {
+            setArrayCapacity(desired_size);
+        }
+    }
+
+    // Copy elements to this array. Resizes it if needed.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::copy(const T * data, uint count)
+    {
+#if 1   // More simple, but maybe not be as efficient?
+        destroy_range(m_buffer, 0, m_size);
+
+        setArraySize(count);
+
+        construct_range(m_buffer, count, 0, data);
+#else
+        const uint old_size = m_size;
+
+        destroy_range(m_buffer, count, old_size);
+
+        setArraySize(count);
+
+        copy_range(m_buffer, data, old_size);
+
+        construct_range(m_buffer, count, old_size, data);
+#endif
+    }
+
+    // Assignment operator. Replaces the contents with copies of a's elements; self-assignment is a no-op.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator=( const Array<T> & a )
+    {
+        if (this != &a) { copy(a.m_buffer, a.m_size); }   // copy() destroys our elements before reading the source
+        return *this;
+    }
+
+    // Release ownership of allocated memory and returns pointer to it.
+    template <typename T>
+    T * Array<T>::release() {
+        T * tmp = m_buffer;
+        m_buffer = NULL;
+        m_capacity = 0;
+        m_size = 0;
+        return tmp;
+    }
+
+
+
+    // Change array size.
+    template <typename T> 
+    inline void Array<T>::setArraySize(uint new_size) {
+        m_size = new_size;
+
+        if (new_size > m_capacity) {
+            uint new_buffer_size;
+            if (m_capacity == 0) {
+                // first allocation is exact
+                new_buffer_size = new_size;
+            }
+            else {
+                // following allocations grow array by 25%
+                new_buffer_size = new_size + (new_size >> 2);
+            }
+
+            setArrayCapacity( new_buffer_size );
+        }
+    }
+
+    // Change array capacity.
+    template <typename T> 
+    inline void Array<T>::setArrayCapacity(uint new_capacity) {
+        nvDebugCheck(new_capacity >= m_size);
+
+        if (new_capacity == 0) {
+            // free the buffer.
+            if (m_buffer != NULL) {
+                free<T>(m_buffer);
+                m_buffer = NULL;
+            }
+        }
+        else {
+            // realloc the buffer
+            m_buffer = realloc<T>(m_buffer, new_capacity);
+        }
+
+        m_capacity = new_capacity;
+    }
+
+    // Array serialization.
+    template <typename Typ> 
+    inline Stream & operator<< ( Stream & s, Array<Typ> & p )
+    {
+        if (s.isLoading()) {
+            uint size;
+            s << size;
+            p.resize( size );
+        }
+        else {
+            s << p.m_size;
+        }
+
+        for (uint i = 0; i < p.m_size; i++) {
+            s << p.m_buffer[i];
+        }
+
+        return s;
+    }
+
+    // Swap the members of the two given vectors.
+    template <typename Typ>
+    inline void swap(Array<Typ> & a, Array<Typ> & b)
+    {
+        nv::swap(a.m_buffer, b.m_buffer);
+        nv::swap(a.m_capacity, b.m_capacity);
+        nv::swap(a.m_size, b.m_size);
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_INL
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/debug.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/debug.h
new file mode 100644
index 0000000..61fbd2f
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/debug.h
@@ -0,0 +1,216 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_DEBUG_H
+#define NV_CORE_DEBUG_H
+
+#include "nvcore.h"
+
+#include <stdarg.h> // va_list
+
+
+// Make sure we are using our assert.
+#undef assert
+
+#define NV_ABORT_DEBUG      1
+#define NV_ABORT_IGNORE     2
+#define NV_ABORT_EXIT       3
+
+#define nvNoAssert(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    (void)sizeof(exp); \
+    NV_MULTI_LINE_MACRO_END
+
+#if NV_NO_ASSERT
+
+#   define nvAssert(exp) nvNoAssert(exp)
+#   define nvCheck(exp) nvNoAssert(exp)
+#   define nvDebugAssert(exp) nvNoAssert(exp)
+#   define nvDebugCheck(exp) nvNoAssert(exp)
+#   define nvDebugBreak() nvNoAssert(0)
+
+#else // NV_NO_ASSERT
+
+#   if NV_CC_MSVC
+        // @@ Does this work in msvc-6 and earlier?
+#       define nvDebugBreak()       __debugbreak()
+//#       define nvDebugBreak()        __asm { int 3 }
+#   elif NV_OS_ORBIS
+#       define nvDebugBreak()       __debugbreak()
+#   elif NV_CC_GNUC
+#       define nvDebugBreak()       __builtin_trap()
+#   else
+#       error "No nvDebugBreak()!"
+#   endif
+
+/*
+#   elif NV_CC_GNUC || NV_CPU_PPC && NV_OS_DARWIN
+        // @@ Use __builtin_trap() on GCC
+#       define nvDebugBreak()       __asm__ volatile ("trap")
+#   elif (NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64) && NV_OS_DARWIN
+#       define nvDebugBreak()       __asm__ volatile ("int3")
+#   elif NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64
+#       define nvDebugBreak()       __asm__ ( "int %0" : :"I"(3) )
+#   else
+#       include <signal.h>
+#       define nvDebugBreak()       raise(SIGTRAP)
+#   endif
+*/
+
+#define nvDebugBreakOnce() \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    static bool firstTime = true; \
+    if (firstTime) { firstTime = false; nvDebugBreak(); } \
+    NV_MULTI_LINE_MACRO_END
+
+#define nvAssertMacro(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    if (!(exp)) { \
+        if (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) { \
+            nvDebugBreak(); \
+        } \
+    } \
+    NV_MULTI_LINE_MACRO_END
+
+// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care
+#define nvAssertMacroWithIgnoreAll(exp,...) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+        static bool ignoreAll = false; \
+        if (!ignoreAll && !(exp)) { \
+            int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \
+            if (result == NV_ABORT_DEBUG) { \
+                nvDebugBreak(); \
+            } else if (result == NV_ABORT_IGNORE) { \
+                ignoreAll = true; \
+            } \
+        } \
+    NV_MULTI_LINE_MACRO_END
+
+// Interesting assert macro from Insomniac:
+// http://www.gdcvault.com/play/1015319/Developing-Imperfect-Software-How-to
+// Used as follows:
+// if (nvCheck(i < count)) {
+//     normal path
+// } else {
+//     fixup code.
+// }
+// This style of macro could be combined with __builtin_expect to let the compiler know failure is unlikely.
+#define nvCheckMacro(exp) \
+    (\
+        (exp) ? true : ( \
+            (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) ? (nvDebugBreak(), true) : ( false ) \
+        ) \
+    )
+
+
+#define nvAssert(exp)    nvAssertMacro(exp)
+#define nvCheck(exp)     nvAssertMacro(exp)
+
+#if defined(_DEBUG)
+#   define nvDebugAssert(exp)   nvAssertMacro(exp)
+#   define nvDebugCheck(exp)    nvAssertMacro(exp)
+#else // _DEBUG
+#   define nvDebugAssert(exp)   nvNoAssert(exp)
+#   define nvDebugCheck(exp)    nvNoAssert(exp)
+#endif // _DEBUG
+
+#endif // NV_NO_ASSERT
+
+// Use nvAssume for very simple expressions only: nvAssume(0), nvAssume(value == true), etc.
+/*#if !defined(_DEBUG)
+#   if NV_CC_MSVC
+#       define nvAssume(exp)    __assume(exp)
+#   else
+#       define nvAssume(exp)    nvCheck(exp)
+#   endif
+#else
+#   define nvAssume(exp)    nvCheck(exp)
+#endif*/
+
+#if defined(_DEBUG) // debug: assert, then hint; wrapped so the macro expands to exactly one statement
+#  if NV_CC_MSVC
+#   define nvUnreachable() NV_MULTI_LINE_MACRO_BEGIN nvAssert(0 && "unreachable"); __assume(0); NV_MULTI_LINE_MACRO_END
+#  else
+#   define nvUnreachable() NV_MULTI_LINE_MACRO_BEGIN nvAssert(0 && "unreachable"); __builtin_unreachable(); NV_MULTI_LINE_MACRO_END
+#  endif
+#else // !_DEBUG: only the compiler hint remains
+#  if NV_CC_MSVC
+#   define nvUnreachable() __assume(0)
+#  else
+#   define nvUnreachable() __builtin_unreachable()
+#  endif
+#endif
+
+
+#define nvError(x)      nvAbort(x, __FILE__, __LINE__, __FUNC__)
+#define nvWarning(x)    nvDebugPrint("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x))
+
+#ifndef NV_DEBUG_PRINT
+#define NV_DEBUG_PRINT 1 //defined(_DEBUG)
+#endif
+
+#if NV_DEBUG_PRINT
+#define nvDebug(...)    nvDebugPrint(__VA_ARGS__)
+#else
+#if NV_CC_MSVC
+#define nvDebug(...)    __noop(__VA_ARGS__)
+#else
+#define nvDebug(...)    ((void)0) // Non-msvc platforms do not evaluate arguments?
+#endif
+#endif
+
+
+NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6)));
+NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
+
+namespace nv
+{
+    inline bool isValidPtr(const void * ptr) {   // heuristic sanity check; NULL is accepted on every path
+    #if NV_CPU_X86_64
+        if (ptr == NULL) return true;
+        if (reinterpret_cast<uint64>(ptr) < 0x10000ULL) return false;
+        if (reinterpret_cast<uint64>(ptr) >= 0x000007FFFFFEFFFFULL) return false;
+    #else   // uintptr_t keeps the cast well-formed when pointers are wider than 32 bits
+	    if (reinterpret_cast<uintptr_t>(ptr) == 0xcccccccc) return false;
+	    if (reinterpret_cast<uintptr_t>(ptr) == 0xcdcdcdcd) return false;
+	    if (reinterpret_cast<uintptr_t>(ptr) == 0xdddddddd) return false;
+	    if (reinterpret_cast<uintptr_t>(ptr) == 0xffffffff) return false;
+    #endif
+        return true;
+    }
+
+    // Message handler interface.
+    struct MessageHandler {
+        virtual void log(const char * str, va_list arg) = 0;
+        virtual ~MessageHandler() {}
+    };
+
+    // Assert handler interface.
+    struct AssertHandler {
+        virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0;
+        virtual ~AssertHandler() {}
+    };
+
+
+    namespace debug
+    {
+        NVCORE_API void dumpInfo();
+        NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 );
+
+        NVCORE_API void setMessageHandler( MessageHandler * messageHandler );
+        NVCORE_API void resetMessageHandler();
+
+        NVCORE_API void setAssertHandler( AssertHandler * assertHanlder );
+        NVCORE_API void resetAssertHandler();
+
+        NVCORE_API void enableSigHandler(bool interactive);
+        NVCORE_API void disableSigHandler();
+
+        NVCORE_API bool isDebuggerPresent();
+        NVCORE_API bool attachToDebugger();
+
+        NVCORE_API void terminate(int code);
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_DEBUG_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnucdarwin.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnucdarwin.h
new file mode 100644
index 0000000..968f4bc
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnucdarwin.h
@@ -0,0 +1,57 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+#ifndef __STDC_VERSION__
+#	define __STDC_VERSION__ 0
+#endif // __STDC_VERSION__
+
+// Function linkage: DLL_IMPORT expands to nothing; exports are marked visibility("default") on GCC 4 and later.
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#	define DLL_EXPORT __attribute__((visibility("default")))
+#	define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#	define DLL_EXPORT
+#	define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes: the cdecl/stdcall attributes are only emitted when NV_CPU_X86 is set.
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall)) // NOTE(review): not guarded by NV_CPU_X86 like the two macros above.
+#define NV_FORCEINLINE	inline
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly. Every path yields __PRETTY_FUNCTION__ except __STDC_VERSION__ < 199901L combined with __GNUC__ < 2, which gets the placeholder string.
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__ // Maps the C99 keyword spelling onto the GCC extension.
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnuclinux.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnuclinux.h
new file mode 100644
index 0000000..117d342
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnuclinux.h
@@ -0,0 +1,63 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+#ifndef __STDC_VERSION__
+#	define __STDC_VERSION__ 0
+#endif
+
+// Function linkage: DLL_IMPORT expands to nothing; exports are marked visibility("default") on GCC 4 and later.
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#   define DLL_EXPORT   __attribute__((visibility("default")))
+#   define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#   define DLL_EXPORT
+#   define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes: the cdecl/stdcall attributes are only emitted when NV_CPU_X86 is set.
+#if NV_CPU_X86
+#   define NV_CDECL     __attribute__((cdecl))
+#   define NV_STDCALL   __attribute__((stdcall))
+#else
+#   define NV_CDECL 
+#   define NV_STDCALL
+#endif
+
+#define NV_FASTCALL     __attribute__((fastcall)) // NOTE(review): not guarded by NV_CPU_X86 like the two macros above.
+//#if __GNUC__ > 3
+// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :(
+#define NV_FORCEINLINE  inline
+//#else
+// Some compilers complain that inline and always_inline are redundant.
+//#define NV_FORCEINLINE  __attribute__((always_inline))
+//#endif
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL __thread // GCC thread-local storage qualifier.
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly. Every path yields __PRETTY_FUNCTION__ except __STDC_VERSION__ < 199901L combined with __GNUC__ < 2, which gets the placeholder string.
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ < 199901L
+#   if __GNUC__ >= 2
+#       define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__
+#   else
+#       define __FUNC__ "<unknown>"
+#   endif
+#else
+#   define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__ // Maps the C99 keyword spelling onto the GCC extension.
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnucwin32.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnucwin32.h
new file mode 100644
index 0000000..68465c8
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/defsgnucwin32.h
@@ -0,0 +1,65 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+//#include <cstddef> // size_t, NULL
+
+// Function linkage: MinGW uses the Windows import/export decorations.
+#define DLL_IMPORT	__declspec(dllimport)
+#define DLL_EXPORT	__declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes: the cdecl/stdcall attributes are only emitted when NV_CPU_X86 is set.
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	inline
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL __thread // Was missing here although the other platform headers define it; occupies a former blank line so the hunk size is unchanged.
+#if __GNUC__ > 2
+#define NV_PURE		__attribute__((pure))
+#define NV_CONST	__attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly. Every path yields __PRETTY_FUNCTION__ except __STDC_VERSION__ < 199901L combined with __GNUC__ < 2, which gets the placeholder string.
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict	__restrict__
+
+/*
+// Type definitions
+typedef unsigned char		uint8;
+typedef signed char			int8;
+
+typedef unsigned short		uint16;
+typedef signed short		int16;
+
+typedef unsigned int		uint32;
+typedef signed int			int32;
+
+typedef unsigned long long	uint64;
+typedef signed long long	int64;
+
+// Aliases
+typedef uint32				uint;
+*/
+
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/defsvcwin32.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/defsvcwin32.h
new file mode 100644
index 0000000..a6c6bf9
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/defsvcwin32.h
@@ -0,0 +1,94 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+// Function linkage
+#define DLL_IMPORT __declspec(dllimport)
+#define DLL_EXPORT __declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#define NV_CDECL        __cdecl
+#define NV_STDCALL      __stdcall
+#define NV_FASTCALL     __fastcall
+#define NV_DEPRECATED
+
+#define NV_PURE
+#define NV_CONST
+
+// Set standard function names: map the standard spellings onto the underscore-prefixed CRT functions of older toolchains.
+#if _MSC_VER < 1900
+#   define snprintf _snprintf // NOTE: _snprintf does not NUL-terminate when the output is truncated.
+#endif
+#if _MSC_VER < 1500
+#   define vsnprintf _vsnprintf
+#endif
+#if _MSC_VER < 1700
+#   define strtoll _strtoi64
+#   define strtoull _strtoui64
+#endif
+#define chdir _chdir
+#define getcwd _getcwd 
+
+#if _MSC_VER < 1800 // Not sure what version introduced this.
+#define va_copy(a, b) ((a) = (b)) // Fully parenthesized so the expansion stays one expression in any context.
+#endif
+
+#if !defined restrict
+#define restrict
+#endif
+
+// Ignore gcc attributes.
+#define __attribute__(X)
+
+#if !defined __FUNC__
+#define __FUNC__ __FUNCTION__ 
+#endif
+
+#define NV_NOINLINE __declspec(noinline)
+#define NV_FORCEINLINE inline
+
+#define NV_THREAD_LOCAL __declspec(thread)
+
+/*
+// Type definitions
+typedef unsigned char       uint8;
+typedef signed char         int8;
+
+typedef unsigned short      uint16;
+typedef signed short        int16;
+
+typedef unsigned int        uint32;
+typedef signed int          int32;
+
+typedef unsigned __int64    uint64;
+typedef signed __int64      int64;
+
+// Aliases
+typedef uint32              uint;
+*/
+
+// Unwanted VC++ warnings to disable.
+/*
+#pragma warning(disable : 4244)     // conversion to float, possible loss of data
+#pragma warning(disable : 4245)     // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch
+#pragma warning(disable : 4100)     // unreferenced formal parameter
+#pragma warning(disable : 4514)     // unreferenced inline function has been removed
+#pragma warning(disable : 4710)     // inline function not expanded
+#pragma warning(disable : 4127)     // Conditional expression is constant
+#pragma warning(disable : 4305)     // truncation from 'const double' to 'float'
+#pragma warning(disable : 4505)     // unreferenced local function has been removed
+
+#pragma warning(disable : 4702)     // unreachable code in inline expanded function
+#pragma warning(disable : 4711)     // function selected for automatic inlining
+#pragma warning(disable : 4725)     // Pentium fdiv bug
+
+#pragma warning(disable : 4786)     // Identifier was truncated and cannot be debugged.
+
+#pragma warning(disable : 4675)     // resolved overload was found by argument-dependent lookup
+*/
+
+#pragma warning(1 : 4705)     // Promote C4705 to level 1. NOTE(review): MSVC documents C4705 as "statement has no effect", not unused locals -- confirm.
+#pragma warning(1 : 4555)     // Promote C4555 (expression has no effect) to level 1.
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/foreach.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/foreach.h
new file mode 100644
index 0000000..71b19f7
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/foreach.h
@@ -0,0 +1,68 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_FOREACH_H
+#define NV_CORE_FOREACH_H
+
+/*
+These foreach macros are very non-standard and somewhat confusing, but I like them.
+*/
+
+#include "nvcore.h"
+
+#if NV_CC_GNUC // If typeof or decltype is available:
+#if !NV_CC_CPP11
+#   define NV_DECLTYPE typeof // Using a non-standard extension over typeof that behaves as C++11 decltype
+#else
+#   define NV_DECLTYPE decltype
+#endif
+
+/*
+Ideally we would like to write this:
+
+#define NV_FOREACH(i, container) \
+    for(NV_DECLTYPE(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+But gcc versions prior to 4.7 required an intermediate type. See:
+https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709
+Caution: the expansion below is two statements (a typedef, then the for), so it must not be the unbraced body of an if/else/loop. The typedef name embeds __LINE__. */
+
+#define NV_FOREACH(i, container) \
+    typedef NV_DECLTYPE(container) NV_STRING_JOIN2(cont,__LINE__); \
+    for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+#else // If typeof not available:
+
+#include <new> // placement new
+
+struct PseudoIndexWrapper { // Type-erased holder for a container's PseudoIndex, used by NV_FOREACH when typeof/decltype is unavailable.
+    template <typename T>
+    PseudoIndexWrapper(const T & container) {
+        nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory));
+        new (memory) typename T::PseudoIndex(container.start());
+    }
+    // PseudoIndex cannot have a dtor! (Nothing ever destroys the object constructed in 'memory'.)
+
+    template <typename T> typename T::PseudoIndex & operator()(const T * /*container*/) {
+        return *reinterpret_cast<typename T::PseudoIndex *>(memory);
+    }
+    template <typename T> const typename T::PseudoIndex & operator()(const T * /*container*/) const {
+        return *reinterpret_cast<const typename T::PseudoIndex *>(memory);
+    }
+
+    uint32 memory[1];	// Still 4 bytes, but 4-byte aligned (a uint8[4] buffer carries no alignment guarantee). Increase the size if we have bigger enumerators.
+};
+
+#define NV_FOREACH(i, container) \
+    for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container))))
+
+#endif
+
+// Declare foreach keyword. Define NV_NO_USE_KEYWORDS before including this header to keep these lowercase macro names out of scope.
+#if !defined NV_NO_USE_KEYWORDS
+#   define foreach NV_FOREACH
+#   define foreach_index NV_FOREACH
+#endif
+
+
+#endif // NV_CORE_FOREACH_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/hash.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/hash.h
new file mode 100644
index 0000000..a8b0b2c
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/hash.h
@@ -0,0 +1,83 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_CORE_HASH_H
+#define NV_CORE_HASH_H
+
+#include "nvcore.h"
+
+namespace nv
+{
+    inline uint sdbmHash(const void * data_in, uint size, uint h = 5381)
+    {
+        // Per byte: h = h * 65599 + byte, spelled with shifts (65599 == (1 << 16) + (1 << 6) - 1).
+        const uint8 * const bytes = static_cast<const uint8 *>(data_in);
+        for (uint n = 0; n != size; ++n) {
+            h = (h << 16) + (h << 6) - h + uint(bytes[n]);
+        }
+        return h;
+    }
+
+    // Hashes 'count' floats by bit pattern, folding -0.0f onto +0.0f first. Note that NaN is not handled properly.
+    inline uint sdbmFloatHash(const float * f, uint count, uint h = 5381)
+    {
+        union FloatBits { float f; uint32 i; };
+        for (const float * it = f, * const last = f + count; it != last; ++it) {
+            FloatBits bits = { *it };
+            if (bits.i == 0x80000000) bits.i = 0;
+            h = sdbmHash(&bits, 4, h);
+        }
+        return h;
+    }
+
+
+    template <typename T>
+    inline uint hash(const T & t, uint h = 5381) // Generic case: hash the raw bytes of the object.
+    {
+        return sdbmHash(static_cast<const void *>(&t), uint(sizeof(T)), h);
+    }
+
+    template <>
+    inline uint hash(const float & f, uint h) // float goes through sdbmFloatHash instead of the raw-byte path.
+    {
+        return sdbmFloatHash(&f, 1u, h);
+    }
+
+
+    // Functors for hash table: both defer to the free hash() and to operator== of Key.
+    template <typename Key> struct Hash
+    {
+        uint operator()(const Key & key) const {
+            return hash(key);
+        }
+    };
+
+    template <typename Key> struct Equal
+    {
+        bool operator()(const Key & lhs, const Key & rhs) const {
+            return lhs == rhs;
+        }
+    };
+
+
+    // @@ Move to Utils.h? Plain two-member aggregate with member-wise equality.
+    template <typename T1, typename T2>
+    struct Pair {
+        T1 first;
+        T2 second;
+    };
+
+    template <typename T1, typename T2>
+    bool operator==(const Pair<T1,T2> & a, const Pair<T1,T2> & b) {
+        return a.first == b.first && a.second == b.second;
+    }
+
+    template <typename T1, typename T2>
+    uint hash(const Pair<T1,T2> & p, uint h = 5381) {
+        return hash(p.second, hash(p.first, h)); // Chain the caller's seed through both members; it used to be dropped in favour of the default. Results with the default seed are unchanged.
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_HASH_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/memory.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/memory.h
new file mode 100644
index 0000000..b332fab
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/memory.h
@@ -0,0 +1,30 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_MEMORY_H
+#define NV_CORE_MEMORY_H
+
+#include "nvcore.h"
+#include <stdlib.h> // malloc, realloc, free
+#include <string.h> // memset
+namespace nv {
+
+    // C++ helpers: typed wrappers over the C allocator. 'count' is a number of T elements, not bytes.
+    template <typename T> inline T * malloc(size_t count) {
+        return static_cast<T *>(::malloc(count * sizeof(T)));
+    }
+
+    template <typename T> inline T * realloc(T * ptr, size_t count) {
+        return static_cast<T *>(::realloc(ptr, count * sizeof(T)));
+    }
+
+    template <typename T> inline void free(const T * ptr) {
+        ::free((void *)ptr); // The C-style cast strips const so pointers-to-const can be released too.
+    }
+
+    template <typename T> inline void zero(T & data) {
+        memset(&data, 0, sizeof(data)); // Byte-wise clear of the whole object.
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_MEMORY_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/nvcore.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/nvcore.h
new file mode 100644
index 0000000..689feff
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/nvcore.h
@@ -0,0 +1,363 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_H
+#define NV_CORE_H
+
+#define NVCORE_SHARED 0 // Hard-wired to a static build, so the DLL_* branch below is never taken.
+#define NV_NO_ASSERT 0
+
+// Function linkage: export when building the shared library (NVCORE_EXPORTS), import when consuming it, nothing for static builds.
+#if NVCORE_SHARED
+#ifdef NVCORE_EXPORTS
+#define NVCORE_API DLL_EXPORT
+#define NVCORE_CLASS DLL_EXPORT_CLASS
+#else
+#define NVCORE_API DLL_IMPORT
+#define NVCORE_CLASS DLL_IMPORT
+#endif
+#else // NVCORE_SHARED
+#define NVCORE_API
+#define NVCORE_CLASS
+#endif // NVCORE_SHARED
+
+// Platform definitions: translate POSH_OS_* into NV_OS_* flags. This is a first-match #elif ladder, so the order of the tests matters (e.g. MinGW is classified before the plain Win64/Win32 tests).
+#include "posh.h"
+
+#define NV_OS_STRING POSH_OS_STRING
+
+#if defined POSH_OS_LINUX
+#   define NV_OS_LINUX 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_ORBIS
+#   define NV_OS_ORBIS 1
+#elif defined POSH_OS_FREEBSD
+#   define NV_OS_FREEBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_OPENBSD
+#   define NV_OS_OPENBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_CYGWIN32
+#   define NV_OS_CYGWIN 1
+#elif defined POSH_OS_MINGW
+#   define NV_OS_MINGW 1
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_OSX
+#   define NV_OS_DARWIN 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_IOS
+#   define NV_OS_DARWIN 1 //ACS should we keep this on IOS?
+#   define NV_OS_UNIX 1
+#   define NV_OS_IOS 1
+#elif defined POSH_OS_UNIX
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_WIN64
+#   define NV_OS_WIN32 1
+#   define NV_OS_WIN64 1
+#elif defined POSH_OS_WIN32
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_XBOX
+#   define NV_OS_XBOX 1
+#else
+#   error "Unsupported OS"
+#endif
+
+#ifndef NV_OS_WIN32 // From here on: every NV_OS_* flag not set by the ladder above defaults to 0, so each can be tested with a plain #if.
+#	define NV_OS_WIN32  0
+#endif // NV_OS_WIN32
+
+#ifndef NV_OS_WIN64
+#	define NV_OS_WIN64  0
+#endif // NV_OS_WIN64
+
+#ifndef NV_OS_MINGW
+#	define NV_OS_MINGW  0
+#endif // NV_OS_MINGW
+
+#ifndef NV_OS_CYGWIN
+#	define NV_OS_CYGWIN 0
+#endif // NV_OS_CYGWIN
+
+#ifndef NV_OS_LINUX
+#	define NV_OS_LINUX  0
+#endif // NV_OS_LINUX
+
+#ifndef NV_OS_FREEBSD
+#	define NV_OS_FREEBSD 0
+#endif // NV_OS_FREEBSD
+
+#ifndef NV_OS_OPENBSD
+#	define NV_OS_OPENBSD 0
+#endif // NV_OS_OPENBSD
+
+#ifndef NV_OS_UNIX
+#	define NV_OS_UNIX   0
+#endif // NV_OS_UNIX
+
+#ifndef NV_OS_DARWIN
+#	define NV_OS_DARWIN 0
+#endif // NV_OS_DARWIN
+
+#ifndef NV_OS_XBOX
+#	define NV_OS_XBOX   0
+#endif // NV_OS_XBOX
+
+#ifndef NV_OS_ORBIS
+#	define NV_OS_ORBIS  0
+#endif // NV_OS_ORBIS
+
+#ifndef NV_OS_IOS
+#	define NV_OS_IOS    0
+#endif // NV_OS_IOS
+
+// Threading: selects pthreads on Unix-like targets and records whether a thread-local storage qualifier is available.
+// some platforms don't implement __thread or similar for thread-local-storage
+#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios?
+#   define NV_OS_USE_PTHREAD 1
+#   if NV_OS_DARWIN || NV_OS_IOS
+#       define NV_OS_HAS_TLS_QUALIFIER 0
+#   else
+#       define NV_OS_HAS_TLS_QUALIFIER 1
+#   endif
+#else
+#   define NV_OS_USE_PTHREAD 0
+#   define NV_OS_HAS_TLS_QUALIFIER 1
+#endif
+
+
+// CPUs: the ladder sets exactly one NV_CPU_* flag to 1; the #ifndef blocks after it default the rest to 0.
+
+#define NV_CPU_STRING   POSH_CPU_STRING
+
+#if defined POSH_CPU_X86_64
+//#   define NV_CPU_X86 1
+#   define NV_CPU_X86_64 1
+#elif defined POSH_CPU_X86
+#   define NV_CPU_X86 1
+#elif defined POSH_CPU_PPC
+#   define NV_CPU_PPC 1
+#elif defined POSH_CPU_STRONGARM
+#   define NV_CPU_ARM 1
+#elif defined POSH_CPU_AARCH64
+#   define NV_CPU_AARCH64 1
+#else
+#   error "Unsupported CPU"
+#endif
+
+#ifndef NV_CPU_X86
+#	define NV_CPU_X86     0
+#endif // NV_CPU_X86
+
+#ifndef NV_CPU_X86_64
+#	define NV_CPU_X86_64  0
+#endif // NV_CPU_X86_64
+
+#ifndef NV_CPU_PPC
+#	define NV_CPU_PPC     0
+#endif // NV_CPU_PPC
+
+#ifndef NV_CPU_ARM
+#	define NV_CPU_ARM     0
+#endif // NV_CPU_ARM
+
+#ifndef NV_CPU_AARCH64
+#	define NV_CPU_AARCH64 0
+#endif // NV_CPU_AARCH64
+
+// Compiler: clang is tested before gcc and msvc; the first matching POSH_COMPILER_* wins.
+
+#if defined POSH_COMPILER_CLANG
+#   define NV_CC_CLANG  1
+#   define NV_CC_GNUC   1    // Clang is compatible with GCC.
+#   define NV_CC_STRING "clang" // The diagnostic pragmas below are not wrapped in push/pop, so they stay in effect after this header.
+#	pragma clang diagnostic ignored "-Wmissing-braces"
+#	pragma clang diagnostic ignored "-Wshadow"
+#	pragma clang diagnostic ignored "-Wunused-local-typedef"
+#	pragma clang diagnostic ignored "-Wunused-function"
+#	pragma clang diagnostic ignored "-Wunused-variable"
+#	pragma clang diagnostic ignored "-Wunused-parameter"
+#	pragma clang diagnostic ignored "-Wsometimes-uninitialized"
+#elif defined POSH_COMPILER_GCC
+#   define NV_CC_GNUC   1
+#   define NV_CC_STRING "gcc" // The diagnostic pragmas below are not wrapped in push/pop, so they stay in effect after this header.
+#	pragma GCC diagnostic ignored "-Wshadow"
+#	pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#	pragma GCC diagnostic ignored "-Wunused-function"
+#	pragma GCC diagnostic ignored "-Wunused-but-set-variable"
+#	pragma GCC diagnostic ignored "-Wunused-variable"
+#	pragma GCC diagnostic ignored "-Wunused-parameter"
+#	pragma GCC diagnostic ignored "-Warray-bounds"
+#elif defined POSH_COMPILER_MSVC
+#   define NV_CC_MSVC   1
+#   define NV_CC_STRING "msvc"
+#else
+#   error "Unsupported compiler"
+#endif
+
+#ifndef NV_CC_GNUC
+#	define NV_CC_GNUC  0
+#endif // NV_CC_GNUC
+
+#ifndef NV_CC_MSVC
+#	define NV_CC_MSVC  0
+#endif // NV_CC_MSVC
+
+#ifndef NV_CC_CLANG
+#	define NV_CC_CLANG 0
+#endif // NV_CC_CLANG
+
+#if NV_CC_MSVC
+#define NV_CC_CPP11 (__cplusplus > 199711L || _MSC_VER >= 1800) // Visual Studio 2013 has all the features we use, but doesn't advertise full C++11 support yet.
+#else
+// @@ IC: This works in CLANG, about GCC?
+// @@ ES: Doesn't work in gcc. These 3 features are available in GCC >= 4.4.
+#ifdef __clang__
+#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert))
+#elif defined __GNUC__ 
+#define NV_CC_CPP11 ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)) // NOTE(review): tests the GCC version only, not the selected -std mode.
+#endif
+#endif
+
+// Endianness: forwarded unchanged from posh.h.
+#define NV_LITTLE_ENDIAN    POSH_LITTLE_ENDIAN
+#define NV_BIG_ENDIAN       POSH_BIG_ENDIAN
+#define NV_ENDIAN_STRING    POSH_ENDIAN_STRING
+
+
+// Type definitions: fixed-width integer names built on the posh typedefs.
+typedef posh_u8_t   uint8;
+typedef posh_i8_t   int8;
+
+typedef posh_u16_t  uint16;
+typedef posh_i16_t  int16;
+
+typedef posh_u32_t  uint32;
+typedef posh_i32_t  int32;
+
+typedef posh_u64_t  uint64;
+typedef posh_i64_t  int64;
+
+// Aliases: 'uint' is always the 32-bit unsigned type.
+typedef uint32      uint;
+
+
+// Version string: "<os>/<compiler>/<cpu>/<endian>-endian - <date>-<time>", assembled by string-literal concatenation at compile time.
+#define NV_VERSION_STRING \
+    NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \
+    NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__
+
+
+// Disable copy constructor and assignment operator. The pre-C++11 form switches the class to 'private:'; the C++11 form leaves the access level untouched. Neither form ends with a semicolon.
+#if NV_CC_CPP11
+#define NV_FORBID_COPY(C) \
+    C( const C & ) = delete; \
+    C &operator=( const C & ) = delete
+#else
+#define NV_FORBID_COPY(C) \
+    private: \
+    C( const C & ); \
+    C &operator=( const C & )
+#endif
+
+// Disable dynamic allocation on the heap by declaring operator new / new[] private and leaving them undefined.
+// See Prohibiting Heap-Based Objects in More Effective C++.
+#define NV_FORBID_HEAPALLOC() \
+    private: \
+    void *operator new(size_t size); \
+    void *operator new[](size_t size)
+
+// String concatenation macros. Each public macro forwards to a NV_DO_* / NV_STRING2 helper so that arguments such as __LINE__ are expanded before ## or # is applied.
+#define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
+#define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+#define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3)
+#define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3
+#define NV_STRING2(x) #x
+#define NV_STRING(x) NV_STRING2(x)
+
+#if NV_CC_MSVC // do { ... } while(false) wrapper for multi-statement macros; the MSVC variant also disables warning 4127 around the loop condition.
+#define NV_MULTI_LINE_MACRO_BEGIN do {  
+#define NV_MULTI_LINE_MACRO_END \
+    __pragma(warning(push)) \
+    __pragma(warning(disable:4127)) \
+    } while(false) \
+    __pragma(warning(pop))  
+#else
+#define NV_MULTI_LINE_MACRO_BEGIN do {
+#define NV_MULTI_LINE_MACRO_END } while(false)
+#endif
+
+#if NV_CC_CPP11
+#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed")
+#else // A false condition must yield a negative array size: a size of 0 is accepted by GCC as an extension and would let the check pass silently.
+#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x) ? 1 : -1]
+#endif
+#define NV_COMPILER_CHECK(x) nvStaticCheck(x)   // I like this name best.
+
+// Make sure type definitions are fine: every fixed-width typedef must have exactly the advertised size (the 64-bit pair used to be a duplicate of the 32-bit checks).
+NV_COMPILER_CHECK(sizeof(int8) == 1);
+NV_COMPILER_CHECK(sizeof(uint8) == 1);
+NV_COMPILER_CHECK(sizeof(int16) == 2);
+NV_COMPILER_CHECK(sizeof(uint16) == 2);
+NV_COMPILER_CHECK(sizeof(int32) == 4);
+NV_COMPILER_CHECK(sizeof(uint32) == 4);
+NV_COMPILER_CHECK(sizeof(int64) == 8);
+NV_COMPILER_CHECK(sizeof(uint64) == 8);
+
+
+#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) // Element count of a true array; the result is meaningless if x is a pointer.
+
+#if 0 // Disabled in The Witness.
+#if NV_CC_MSVC
+#define NV_MESSAGE(x) message(__FILE__ "(" NV_STRING(__LINE__) ") : " x)
+#else
+#define NV_MESSAGE(x) message(x)
+#endif
+#else
+#define NV_MESSAGE(x) 
+#endif
+
+
+// Startup initialization macro: defines an object in an unnamed namespace whose constructor runs some_code. The type and instance names embed __LINE__, so use it at most once per source line.
+#define NV_AT_STARTUP(some_code) \
+    namespace { \
+        static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \
+            NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \
+        } \
+        NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \
+    }
+
+// Tell the compiler that a parameter is intentionally unused, to suppress compiler warnings. The void cast has no side effects and, unlike self-assignment, also works for const and non-assignable objects.
+#define NV_UNUSED(a) ((void)(a))
+
+// Null index. @@ Move this somewhere else... it's only used by nvmesh.
+//const unsigned int NIL = unsigned int(~0);
+//#define NIL uint(~0)
+
+// Null pointer: fallback definition, used only when no earlier header has defined NULL.
+#ifndef NULL
+#define NULL 0
+#endif
+
+// Platform includes: pull in the compiler/OS specific definitions header. The FreeBSD and OpenBSD builds reuse the Darwin header.
+#if NV_CC_MSVC
+#   if NV_OS_WIN32
+#       include "defsvcwin32.h"
+#   elif NV_OS_XBOX
+#       include "defsvcxbox.h"
+#   else
+#       error "MSVC: Platform not supported"
+#   endif
+#elif NV_CC_GNUC
+#   if NV_OS_LINUX
+#       include "defsgnuclinux.h"
+#   elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
+#       include "defsgnucdarwin.h"
+#   elif NV_OS_MINGW
+#       include "defsgnucwin32.h"
+#   elif NV_OS_CYGWIN
+#       error "GCC: Cygwin not supported"
+#   else
+#       error "GCC: Platform not supported"
+#   endif
+#endif
+
+#endif // NV_CORE_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/posh.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/posh.h
new file mode 100644
index 0000000..45d2d9e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/posh.h
@@ -0,0 +1,1030 @@
+/**
+@file posh.h
+@author Brian Hook
+@version 1.3.001
+
+Header file for POSH, the Portable Open Source Harness project.
+
+NOTE: Unlike most header files, this one is designed to be included
+multiple times, which is why it does not have the @#ifndef/@#define
+preamble.
+
+POSH relies on environment specified preprocessor symbols in order
+to infer as much as possible about the target OS/architecture and
+the host compiler capabilities.
+
+NOTE: POSH is simple and focused. It attempts to provide basic
+functionality and information, but it does NOT attempt to emulate
+missing functionality.  I am also not willing to make POSH dirty
+and hackish to support truly ancient and/or outmoded and/or bizarre
+technologies such as non-ANSI compilers, systems with non-IEEE
+floating point formats, segmented 16-bit operating systems, etc.
+
+Please refer to the accompanying HTML documentation or visit
+http://www.poshlib.org for more information on how to use POSH.
+
+LICENSE:
+
+Copyright (c) 2004, Brian Hook
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * The names of this package'ss contributors contributors may not
+      be used to endorse or promote products derived from this
+      software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REVISION:
+
+I've been lax about revision histories, so this starts at, um, 1.3.001.
+Sorry for any inconveniences.
+
+1.3.001 - 2/23/2006 - Incorporated fix for bug reported by Bill Cary,
+                      where I was not detecting Visual Studio
+                      compilation on x86-64 systems.  Added check for
+                      _M_X64 which should fix that.
+
+*/
+/*
+I have yet to find an authoritative reference on preprocessor
+symbols, but so far this is what I've gleaned:
+
+GNU GCC/G++:
+   - __GNUC__: GNU C version
+   - __GNUG__: GNU C++ compiler
+   - __sun__ : on Sun platforms
+   - __svr4__: on Solaris and other SysV R4 platforms
+   - __mips__: on MIPS processor platforms
+   - __sparc_v9__: on Sparc 64-bit CPUs
+   - __sparcv9: 64-bit Solaris
+   - __MIPSEL__: mips processor, compiled for little endian
+   - __MIPSEB__: mips processor, compiled for big endian
+   - _R5900: MIPS/Sony/Toshiba R5900 (PS2)
+   - mc68000: 68K
+   - m68000: 68K
+   - m68k: 68K
+   - __palmos__: PalmOS
+
+Intel C/C++ Compiler:
+   - __ECC      : compiler version, IA64 only
+   - __EDG__
+   - __ELF__
+   - __GXX_ABI_VERSION
+   - __i386     : IA-32 only
+   - __i386__   : IA-32 only
+   - i386       : IA-32 only
+   - __ia64     : IA-64 only
+   - __ia64__   : IA-64 only
+   - ia64       : IA-64 only
+   - __ICC      : IA-32 only
+   - __INTEL_COMPILER : IA-32 or IA-64, newer versions only
+
+Apple's C/C++ Compiler for OS X:
+   - __APPLE_CC__
+   - __APPLE__
+   - __BIG_ENDIAN__
+   - __APPLE__
+   - __ppc__
+   - __MACH__
+
+DJGPP:
+   - __MSDOS__
+   - __unix__
+   - __unix
+   - __GNUC__
+   - __GO32
+   - DJGPP
+   - __i386, __i386, i386
+
+Cray's C compiler:
+   - _ADDR64: if 64-bit pointers
+   - _UNICOS: 
+   - __unix:
+
+SGI's CC compiler predefines the following (and more) with -ansi:
+   - __sgi
+   - __unix
+   - __host_mips
+   - _SYSTYPE_SVR4
+   - __mips
+   - _MIPSEB
+   - anyone know if there is a predefined symbol for the compiler?!
+
+MinGW:
+   - as GnuC but also defines _WIN32, __WIN32, WIN32, _X86_, __i386, __i386__, and several others
+   - __MINGW32__
+
+Cygwin:
+   - as Gnu C, but also
+   - __unix__
+   - __CYGWIN32__
+
+Microsoft Visual Studio predefines the following:
+   - _MSC_VER
+   - _WIN32: on Win32
+   - _M_IX86 (on x86 systems)
+   - _M_X64: on x86-64 systems
+   - _M_ALPHA (on DEC AXP systems)
+   - _SH3: WinCE, Hitachi SH-3
+   - _MIPS: WinCE, MIPS
+   - _ARM: WinCE, ARM
+
+Sun's C Compiler:
+   - sun and _sun
+   - unix and _unix
+   - sparc and _sparc (SPARC systems only)
+   - i386 and _i386 (x86 systems only)
+   - __SVR4 (Solaris only)
+   - __sparcv9: 64-bit solaris
+   - __SUNPRO_C
+   - _LP64: defined in 64-bit LP64 mode, but only if <sys/types.h> is included
+
+Borland C/C++ predefines the following:
+   - __BORLANDC__:
+
+DEC/Compaq C/C++ on Alpha:
+   - __alpha
+   - __arch64__
+   - __unix__ (on Tru64 Unix)
+   - __osf__
+   - __DECC
+   - __DECCXX (C++ compilation)
+   - __DECC_VER
+   - __DECCXX_VER
+
+IBM's AIX compiler:
+   - __64BIT__ if 64-bit mode
+   - _AIX
+   - __IBMC__: C compiler version
+   - __IBMCPP__: C++ compiler version
+   - _LONG_LONG: compiler allows long long
+
+Watcom:
+   - __WATCOMC__
+   - __DOS__ : if targeting DOS
+   - __386__ : if 32-bit support
+   - __WIN32__ : if targeting 32-bit Windows
+
+HP-UX C/C++ Compiler:
+   - __hpux
+   - __unix
+   - __hppa (on PA-RISC)
+   - __LP64__: if compiled in 64-bit mode
+
+Metrowerks:
+   - __MWERKS__
+   - __powerpc__
+   - _powerc
+   - __MC68K__
+   - macintosh when compiling for MacOS
+   - __INTEL__ for x86 targets
+   - __POWERPC__
+
+LLVM:
+   - __llvm__
+   - __clang__
+*/
+
+/*
+** ----------------------------------------------------------------------------
+** Include <limits.h> optionally
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_USE_LIMITS_H
+#  include <limits.h>
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine compilation environment
+** ----------------------------------------------------------------------------
+*/
+#if defined __ECC || defined __ICC || defined __INTEL_COMPILER /* The tests in this section are independent #if blocks, not an #elif chain, so more than one POSH_COMPILER_* flag can end up defined. */
+#  define POSH_COMPILER_STRING "Intel C/C++"
+#  define POSH_COMPILER_INTEL 1
+#endif
+
+#if ( defined __host_mips || defined __sgi ) && !defined __GNUC__
+#  define POSH_COMPILER_STRING    "MIPSpro C/C++"
+#  define POSH_COMPILER_MIPSPRO 1 
+#endif
+
+#if defined __hpux && !defined __GNUC__
+#  define POSH_COMPILER_STRING "HP-UX CC"
+#  define POSH_COMPILER_HPCC 1 
+#endif
+
+#if defined __clang__
+#  define POSH_COMPILER_STRING "Clang"
+#  define POSH_COMPILER_CLANG 1
+#endif
+
+#if defined __GNUC__ && !defined __clang__ /* Clang is excluded here so it is not also reported as GCC. */
+#  define POSH_COMPILER_STRING "Gnu GCC"
+#  define POSH_COMPILER_GCC 1
+#endif
+
+#if defined __APPLE_CC__
+   /* we don't define the compiler string here, let it be GNU */
+#  define POSH_COMPILER_APPLECC 1
+#endif
+
+#if defined __IBMC__ || defined __IBMCPP__
+#  define POSH_COMPILER_STRING "IBM C/C++"
+#  define POSH_COMPILER_IBM 1
+#endif
+
+#if defined _MSC_VER
+#  define POSH_COMPILER_STRING "Microsoft Visual C++"
+#  define POSH_COMPILER_MSVC 1
+#endif
+
+#if defined __SUNPRO_C
+#  define POSH_COMPILER_STRING "Sun Pro" 
+#  define POSH_COMPILER_SUN 1
+#endif
+
+#if defined __BORLANDC__
+#  define POSH_COMPILER_STRING "Borland C/C++"
+#  define POSH_COMPILER_BORLAND 1
+#endif
+
+#if defined __MWERKS__
+#  define POSH_COMPILER_STRING     "MetroWerks CodeWarrior"
+#  define POSH_COMPILER_METROWERKS 1
+#endif
+
+#if defined __DECC || defined __DECCXX
+#  define POSH_COMPILER_STRING "Compaq/DEC C/C++"
+#  define POSH_COMPILER_DEC 1
+#endif
+
+#if defined __WATCOMC__
+#  define POSH_COMPILER_STRING "Watcom C/C++"
+#  define POSH_COMPILER_WATCOM 1
+#endif
+
+#if !defined POSH_COMPILER_STRING /* none of the tests above matched */
+#  define POSH_COMPILER_STRING "Unknown compiler"
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine target operating system
+** ----------------------------------------------------------------------------
+*/
+#if defined linux || defined __linux__
+#  define POSH_OS_LINUX 1 
+#  define POSH_OS_STRING "Linux"
+#endif
+
+#if defined __FreeBSD__
+#  define POSH_OS_FREEBSD 1 
+#  define POSH_OS_STRING "FreeBSD"
+#endif
+
+#if defined __OpenBSD__
+#  define POSH_OS_OPENBSD 1
+#  define POSH_OS_STRING "OpenBSD"
+#endif
+
+#if defined __CYGWIN32__
+#  define POSH_OS_CYGWIN32 1
+#  define POSH_OS_STRING "Cygwin"
+#endif
+
+#if defined GEKKO /* GEKKO selects the GameCube target */
+#  define POSH_OS_GAMECUBE 1
+#  define __powerpc__ /* makes the PowerPC test in the CPU detection section match */
+#  define POSH_OS_STRING "GameCube"
+#endif
+
+#if defined __MINGW32__
+#  define POSH_OS_MINGW 1
+#  define POSH_OS_STRING "MinGW"
+#endif
+
+#if defined GO32 && defined DJGPP && defined __MSDOS__ 
+#  define POSH_OS_GO32 1
+#  define POSH_OS_STRING "GO32/MS-DOS"
+#endif
+
+/* NOTE: make sure you use /bt=DOS if compiling for 32-bit DOS,
+   otherwise Watcom assumes host=target */
+#if defined __WATCOMC__  && defined __386__ && defined __DOS__
+#  define POSH_OS_DOS32 1
+#  define POSH_OS_STRING "DOS/32-bit"
+#endif
+
+#if defined _UNICOS
+#  define POSH_OS_UNICOS 1
+#  define POSH_OS_STRING "UNICOS"
+#endif
+
+#if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx
+#  define POSH_OS_OSX 1
+#  define POSH_OS_STRING "MacOS X"
+#endif
+
+#if defined __sun__ || defined sun || defined __sun || defined __solaris__
+#  if defined __SVR4 || defined __svr4__ || defined __solaris__
+#     define POSH_OS_STRING "Solaris"
+#     define POSH_OS_SOLARIS 1
+#  endif
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "SunOS"
+#     define POSH_OS_SUNOS 1
+#  endif
+#endif
+
+#if defined __sgi__ || defined sgi || defined __sgi
+#  define POSH_OS_IRIX 1
+#  define POSH_OS_STRING "Irix"
+#endif
+
+#if defined __hpux__ || defined __hpux
+#  define POSH_OS_HPUX 1
+#  define POSH_OS_STRING "HP-UX"
+#endif
+
+#if defined _AIX
+#  define POSH_OS_AIX 1
+#  define POSH_OS_STRING "AIX"
+#endif
+
+#if ( defined __alpha && defined __osf__ )
+#  define POSH_OS_TRU64 1
+#  define POSH_OS_STRING "Tru64"
+#endif
+
+#if defined __BEOS__ || defined __beos__
+#  define POSH_OS_BEOS 1
+#  define POSH_OS_STRING "BeOS"
+#endif
+
+#if defined amiga || defined amigados || defined AMIGA || defined _AMIGA
+#  define POSH_OS_AMIGA 1
+#  define POSH_OS_STRING "Amiga"
+#endif
+
+#if defined __unix__
+#  define POSH_OS_UNIX 1 
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "Unix-like(generic)"
+#  endif
+#endif
+
+#if defined _WIN32_WCE
+#  define POSH_OS_WINCE 1
+#  define POSH_OS_STRING "Windows CE"
+#endif
+
+#if defined _XBOX || defined _XBOX_VER
+#  define POSH_OS_XBOX 1
+#  define POSH_OS_STRING "XBOX"
+#endif
+
+#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__
+#  define POSH_OS_WIN32 1
+#  if !defined POSH_OS_XBOX
+#     if defined _WIN64
+#        define POSH_OS_WIN64 1
+#        if !defined POSH_OS_STRING
+#           define POSH_OS_STRING "Win64"
+#        endif // !defined POSH_OS_STRING
+#     else
+#        if !defined POSH_OS_STRING
+#           define POSH_OS_STRING "Win32"
+#        endif
+#     endif
+#  endif
+#endif
+
+#if defined __palmos__
+#  define POSH_OS_PALM 1
+#  define POSH_OS_STRING "PalmOS"
+#endif
+
+#if defined THINK_C || defined macintosh
+#  define POSH_OS_MACOS 1
+#  define POSH_OS_STRING "MacOS"
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Determine target CPU
+** -----------------------------------------------------------------------------
+*/
+
+#if defined GEKKO
+#  define POSH_CPU_PPC750 1
+#  define POSH_CPU_STRING "IBM PowerPC 750 (NGC)"
+#endif
+
+#if defined mc68000 || defined m68k || defined __MC68K__ || defined m68000
+#  define POSH_CPU_68K 1
+#  define POSH_CPU_STRING "MC68000"
+#endif
+
+#if defined __PPC__ || defined __POWERPC__  || defined powerpc || defined _POWER || defined __ppc__ || defined __powerpc__ || defined _M_PPC
+#  define POSH_CPU_PPC 1
+#  if !defined POSH_CPU_STRING
+#    if defined __powerpc64__
+#       define POSH_CPU_STRING "PowerPC64"
+#    else
+#       define POSH_CPU_STRING "PowerPC"
+#    endif
+#  endif
+#endif
+
+#if defined _CRAYT3E || defined _CRAYMPP
+#  define POSH_CPU_CRAYT3E 1 /* target processor is a DEC Alpha 21164 used in a Cray T3E*/
+#  define POSH_CPU_STRING "Cray T3E (Alpha 21164)"
+#endif
+
+#if defined CRAY || defined _CRAY && !defined _CRAYT3E
+#  error Non-AXP Cray systems not supported
+#endif
+
+#if defined _SH3
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_STRING "Hitachi SH-3"
+#endif
+
+#if defined __sh4__ || defined __SH4__
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_SH4 1
+#  define POSH_CPU_STRING "Hitachi SH-4"
+#endif
+
+#if defined __sparc__ || defined __sparc
+#  if defined __arch64__ || defined __sparcv9 || defined __sparc_v9__
+#     define POSH_CPU_SPARC64 1 
+#     define POSH_CPU_STRING "Sparc/64"
+#  else
+#     define POSH_CPU_STRING "Sparc/32"
+#  endif
+#  define POSH_CPU_SPARC 1
+#endif
+
+#if defined ARM || defined __arm__ || defined _ARM
+#  define POSH_CPU_STRONGARM 1
+#  define POSH_CPU_STRING "ARM"
+#endif
+
+#if defined __aarch64__
+#  define POSH_CPU_AARCH64 1
+#  define POSH_CPU_STRING "ARM64"
+#endif
+
+#if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS
+#  define POSH_CPU_MIPS 1 
+#  if defined _R5900
+#    define POSH_CPU_STRING "MIPS R5900 (PS2)"
+#  else
+#    define POSH_CPU_STRING "MIPS"
+#  endif
+#endif
+
+#if defined __ia64 || defined _M_IA64 || defined __ia64__ 
+#  define POSH_CPU_IA64 1
+#  define POSH_CPU_STRING "IA64"
+#endif
+
+#if defined __X86__ || defined __i386__ || defined i386 || defined _M_IX86 || defined __386__ || defined __x86_64__ || defined _M_X64
+#  define POSH_CPU_X86 1
+#  if defined __x86_64__ || defined _M_X64
+#     define POSH_CPU_X86_64 1 
+#  endif
+#  if defined POSH_CPU_X86_64
+#     define POSH_CPU_STRING "AMD x86-64"
+#  else
+#     define POSH_CPU_STRING "Intel 386+"
+#  endif
+#endif
+
+#if defined __alpha || defined alpha || defined _M_ALPHA || defined __alpha__
+#  define POSH_CPU_AXP 1
+#  define POSH_CPU_STRING "AXP"
+#endif
+
+#if defined __hppa || defined hppa
+#  define POSH_CPU_HPPA 1
+#  define POSH_CPU_STRING "PA-RISC"
+#endif
+
+#if !defined POSH_CPU_STRING
+#  error POSH cannot determine target CPU
+#  define POSH_CPU_STRING "Unknown" /* this is here for Doxygen's benefit */
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Attempt to autodetect building for embedded on Sony PS2
+** -----------------------------------------------------------------------------
+*/
+#if !defined POSH_OS_STRING
+#  if !defined FORCE_DOXYGEN
+#    define POSH_OS_EMBEDDED 1 
+#  endif
+#  if defined _R5900
+#     define POSH_OS_STRING "Sony PS2(embedded)"
+#  else
+#     define POSH_OS_STRING "Embedded/Unknown"
+#  endif
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Handle cdecl, stdcall, fastcall, etc.
+** ---------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 && !defined POSH_CPU_X86_64 /* 32-bit x86 only; every other target gets the empty definitions below */
+#  if defined __GNUC__
+#     define POSH_CDECL __attribute__((cdecl))
+#     define POSH_STDCALL __attribute__((stdcall))
+#     define POSH_FASTCALL __attribute__((fastcall))
+#  elif ( defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ || defined __MWERKS__ )
+#     define POSH_CDECL    __cdecl
+#     define POSH_STDCALL  __stdcall
+#     define POSH_FASTCALL __fastcall
+#  endif /* NOTE(review): a 32-bit x86 compiler not listed above ends up with none of the three macros defined */
+#else
+#  define POSH_CDECL    
+#  define POSH_STDCALL  
+#  define POSH_FASTCALL 
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Define POSH_IMPORTEXPORT signature based on POSH_DLL and POSH_BUILDING_LIB
+** ---------------------------------------------------------------------------
+*/
+
+/*
+** We undefine this so that multiple inclusions will work
+*/
+#if defined POSH_IMPORTEXPORT
+#  undef POSH_IMPORTEXPORT
+#endif
+
+#if defined POSH_DLL
+#   if defined POSH_OS_WIN32
+#      if defined _MSC_VER 
+#         if ( _MSC_VER >= 800 )
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif  /* defined _MSC_VER */
+#      if defined __BORLANDC__
+#         if ( __BORLANDC__ >= 0x500 )
+#            if defined POSH_BUILDING_LIB 
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif /* defined __BORLANDC__ */
+       /* for all other compilers, we're just making a blanket assumption */
+#      if defined __GNUC__ || defined __WATCOMC__ || defined __MWERKS__
+#         if defined POSH_BUILDING_LIB
+#            define POSH_IMPORTEXPORT __declspec( dllexport )
+#         else
+#            define POSH_IMPORTEXPORT __declspec( dllimport )
+#         endif
+#      endif /* all other compilers */
+#      if !defined POSH_IMPORTEXPORT
+#         error Building DLLs not supported on this compiler (poshlib@poshlib.org if you know how)
+#      endif
+#   endif /* defined POSH_OS_WIN32 */
+#endif
+
+/* On pretty much everything else, we can thankfully just ignore this */
+#if !defined POSH_IMPORTEXPORT
+#  define POSH_IMPORTEXPORT
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_DLL    
+#  define POSH_BUILDING_LIB
+#  undef POSH_DLL
+#  undef POSH_BUILDING_LIB
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** (Re)define POSH_PUBLIC_API export signature 
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_PUBLIC_API
+#  undef POSH_PUBLIC_API
+#endif
+
+#if ( ( defined _MSC_VER ) && ( _MSC_VER < 800 ) ) || ( defined __BORLANDC__ && ( __BORLANDC__ < 0x500 ) )
+#  define POSH_PUBLIC_API(rtype) extern rtype POSH_IMPORTEXPORT 
+#else
+#  define POSH_PUBLIC_API(rtype) extern POSH_IMPORTEXPORT rtype
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Try to infer endianness.  Basically we just go through the CPUs we know are
+** little endian, and assume anything that isn't one of those is big endian.
+** As a sanity check, we also do this with operating systems we know are
+** little endian, such as Windows.  Some processors are bi-endian, such as 
+** the MIPS series, so we have to be careful about those.
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_CPU_AARCH64 || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__
+#  define POSH_ENDIAN_STRING "little"
+#  define POSH_LITTLE_ENDIAN 1
+#else
+#  define POSH_ENDIAN_STRING "big"
+#  define POSH_BIG_ENDIAN 1
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_LITTLE_ENDIAN
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Cross-platform compile time assertion macro
+** ----------------------------------------------------------------------------
+*/
+#define POSH_COMPILE_TIME_ASSERT(name, x) typedef int _POSH_dummy_ ## name[(x) ? 1 : -1 ]
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit Integer
+**
+** We don't require 64-bit support, nor do we emulate its functionality, we
+** simply export it if it's available.  Since we can't count on <limits.h>
+** for 64-bit support, we ignore the POSH_USE_LIMITS_H directive.
+** ----------------------------------------------------------------------------
+*/
+#if defined ( __LP64__ ) || defined ( __powerpc64__ ) || defined POSH_CPU_SPARC64
+#  define POSH_64BIT_INTEGER 1 /* this branch uses plain long as the 64-bit type */
+typedef long posh_i64_t;
+typedef unsigned long posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)(x)) /* argument parenthesised so the cast covers the whole expression */
+#  define POSH_U64( x ) ((posh_u64_t)(x))
+#  define POSH_I64_PRINTF_PREFIX "l"
+#elif defined _MSC_VER || defined __BORLANDC__ || defined __WATCOMC__ || ( defined __alpha && defined __DECC )
+#  define POSH_64BIT_INTEGER 1 /* this branch uses the __int64 extension; x must be a literal (token pasting) */
+typedef __int64 posh_i64_t;
+typedef unsigned __int64 posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)(x##i64))
+#  define POSH_U64( x ) ((posh_u64_t)(x##ui64))
+#  define POSH_I64_PRINTF_PREFIX "I64"
+#elif defined __GNUC__ || defined __MWERKS__ || defined __SUNPRO_C || defined __SUNPRO_CC || defined __APPLE_CC__ || defined POSH_OS_IRIX || defined _LONG_LONG || defined _CRAYC
+#  define POSH_64BIT_INTEGER 1 /* this branch uses long long; x must be a literal (token pasting) */
+typedef long long posh_i64_t;
+typedef unsigned long long posh_u64_t;
+#  define POSH_U64( x ) ((posh_u64_t)(x##ULL)) /* unsigned suffix so large decimal literals are not typed as signed */
+#  define POSH_I64( x ) ((posh_i64_t)(x##LL))
+#  define POSH_I64_PRINTF_PREFIX "ll"
+#endif
+
+/* hack */
+/*#ifdef __MINGW32__
+#undef POSH_I64
+#undef POSH_U64
+#undef POSH_I64_PRINTF_PREFIX
+#define POSH_I64( x ) ((posh_i64_t)x)
+#define POSH_U64( x ) ((posh_u64_t)x)
+#define POSH_I64_PRINTF_PREFIX "I64"
+#endif*/
+
+#ifdef FORCE_DOXYGEN /* placeholder declarations seen only when FORCE_DOXYGEN is defined */
+typedef long long posh_i64_t;
+typedef unsigned long long posh_u64_t;
+#  define POSH_64BIT_INTEGER
+#  define POSH_I64_PRINTF_PREFIX
+#  define POSH_I64(x)
+#  define POSH_U64(x)
+#endif
+
+/** Minimum value for a 64-bit signed integer */
+#define POSH_I64_MIN  POSH_I64(0x8000000000000000)
+/** Maximum value for a 64-bit signed integer */
+#define POSH_I64_MAX  POSH_I64(0x7FFFFFFFFFFFFFFF)
+/** Minimum value for a 64-bit unsigned integer */
+#define POSH_U64_MIN  POSH_U64(0)
+/** Maximum value for a 64-bit unsigned integer */
+#define POSH_U64_MAX  POSH_U64(0xFFFFFFFFFFFFFFFF)
+
+/* ----------------------------------------------------------------------------
+** Basic Sized Types
+**
+** These types are expected to be EXACTLY sized so you can use them for
+** serialization.
+** ----------------------------------------------------------------------------
+*/
+#define POSH_FALSE 0 
+#define POSH_TRUE  1 
+
+typedef int            posh_bool_t;
+typedef unsigned char  posh_byte_t;
+
+/* NOTE: These assume that CHAR_BIT is 8!! */
+typedef unsigned char  posh_u8_t;
+typedef signed char    posh_i8_t;
+
+#if defined POSH_USE_LIMITS_H
+#  if CHAR_BIT > 8
+#    error This machine uses 9-bit characters.  This is a warning, you can comment this out now.
+#  endif /* CHAR_BIT > 8 */
+
+/* 16-bit: accept short only when <limits.h> says it is exactly 16 bits wide */
+#  if ( USHRT_MAX == 65535 ) 
+   typedef unsigned short posh_u16_t;
+   typedef short          posh_i16_t;
+#  else
+   /* Yes, in theory there could still be a 16-bit character type and shorts are
+      32-bits in size...if you find such an architecture, let me know =P */
+#    error No 16-bit type found
+#  endif
+
+/* 32-bit: prefer int, fall back to long, otherwise stop the build */
+#  if ( INT_MAX == 2147483647 )
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  elif ( LONG_MAX == 2147483647 )
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  else
+#    error No 32-bit type found
+#  endif
+
+#else /* POSH_USE_LIMITS_H not defined: widths are assumed and verified by the sanity checks */
+
+  typedef unsigned short posh_u16_t;
+  typedef short          posh_i16_t;
+
+#  if !defined POSH_OS_PALM
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  else /* PalmOS: long is used as the 32-bit type */
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  endif
+#endif
+
+/** Minimum value for a byte */
+#define POSH_BYTE_MIN    0
+/** Maximum value for an 8-bit unsigned value */
+#define POSH_BYTE_MAX    255
+/** Minimum value for a 16-bit signed value */
+#define POSH_I16_MIN     ( ( posh_i16_t ) 0x8000 )
+/** Maximum value for a 16-bit signed value */
+#define POSH_I16_MAX     ( ( posh_i16_t ) 0x7FFF ) 
+/** Minimum value for a 16-bit unsigned value */
+#define POSH_U16_MIN     0
+/** Maximum value for a 16-bit unsigned value */
+#define POSH_U16_MAX     ( ( posh_u16_t ) 0xFFFF )
+/** Minimum value for a 32-bit signed value */
+#define POSH_I32_MIN     ( ( posh_i32_t ) 0x80000000 )
+/** Maximum value for a 32-bit signed value */
+#define POSH_I32_MAX     ( ( posh_i32_t ) 0x7FFFFFFF )
+/** Minimum value for a 32-bit unsigned value */
+#define POSH_U32_MIN     0
+/** Maximum value for a 32-bit unsigned value */
+#define POSH_U32_MAX     ( ( posh_u32_t ) 0xFFFFFFFF )
+
+/*
+** ----------------------------------------------------------------------------
+** Sanity checks on expected sizes
+** ----------------------------------------------------------------------------
+*/
+#if !defined FORCE_DOXYGEN
+
+POSH_COMPILE_TIME_ASSERT(posh_byte_t, sizeof(posh_byte_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u8_t, sizeof(posh_u8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_i8_t, sizeof(posh_i8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u16_t, sizeof(posh_u16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_i16_t, sizeof(posh_i16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_u32_t, sizeof(posh_u32_t) == 4);
+POSH_COMPILE_TIME_ASSERT(posh_i32_t, sizeof(posh_i32_t) == 4);
+
+#if !defined POSH_NO_FLOAT
+   POSH_COMPILE_TIME_ASSERT(posh_testfloat_t, sizeof(float)==4 );
+   POSH_COMPILE_TIME_ASSERT(posh_testdouble_t, sizeof(double)==8);
+#endif
+
+#if defined POSH_64BIT_INTEGER
+   POSH_COMPILE_TIME_ASSERT(posh_u64_t, sizeof(posh_u64_t) == 8);
+   POSH_COMPILE_TIME_ASSERT(posh_i64_t, sizeof(posh_i64_t) == 8);
+#endif
+
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit pointer support
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_AXP && ( defined POSH_OS_TRU64 || defined POSH_OS_LINUX )
+#  define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_CPU_X86_64 && defined POSH_OS_LINUX
+#  define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_CPU_SPARC64 || defined POSH_OS_WIN64 || defined __64BIT__ || defined __LP64 || defined _LP64 || defined __LP64__ || defined _ADDR64 || defined _CRAYC
+#   define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_64BIT_POINTER
+   POSH_COMPILE_TIME_ASSERT( posh_64bit_pointer, sizeof( void * ) == 8 );
+#elif !defined FORCE_DOXYGEN
+/* if this assertion is hit then you're on a system that either has 64-bit
+   addressing and we didn't catch it, or you're on a system with 16-bit
+   pointers.  In the latter case, POSH doesn't actually care, we're just
+   triggering this assertion to make sure you're aware of the situation,
+   so feel free to delete it.
+
+   If this assertion is triggered on a known 32 or 64-bit platform, 
+   please let us know (poshlib@poshlib.org) */
+   POSH_COMPILE_TIME_ASSERT( posh_32bit_pointer, sizeof( void * ) == 4 );
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_64BIT_POINTER
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** POSH Utility Functions
+**
+** These are optional POSH utility functions that are not required if you don't
+** need anything except static checking of your host and target environment.
+** 
+** These functions are NOT wrapped with POSH_PUBLIC_API because I didn't want
+** to enforce their export if your own library is only using them internally.
+** ----------------------------------------------------------------------------
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+const char *POSH_GetArchString( void );
+
+#if !defined POSH_NO_FLOAT
+
+posh_u32_t  POSH_LittleFloatBits( float f );
+posh_u32_t  POSH_BigFloatBits( float f );
+float       POSH_FloatFromLittleBits( posh_u32_t bits );
+float       POSH_FloatFromBigBits( posh_u32_t bits );
+
+void        POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] );
+double      POSH_DoubleFromBits( const posh_byte_t src[ 8 ] );
+
+/* unimplemented
+float      *POSH_WriteFloatToLittle( void *dst, float f );
+float      *POSH_WriteFloatToBig( void *dst, float f );
+float       POSH_ReadFloatFromLittle( const void *src );
+float       POSH_ReadFloatFromBig( const void *src );
+
+double     *POSH_WriteDoubleToLittle( void *dst, double d );
+double     *POSH_WriteDoubleToBig( void *dst, double d );
+double      POSH_ReadDoubleFromLittle( const void *src );
+double      POSH_ReadDoubleFromBig( const void *src );
+*/
+#endif /* !defined POSH_NO_FLOAT */
+
+#if defined FORCE_DOXYGEN
+#  define POSH_NO_FLOAT
+#  undef  POSH_NO_FLOAT
+#endif
+
+extern posh_u16_t  POSH_SwapU16( posh_u16_t u );
+extern posh_i16_t  POSH_SwapI16( posh_i16_t u );
+extern posh_u32_t  POSH_SwapU32( posh_u32_t u );
+extern posh_i32_t  POSH_SwapI32( posh_i32_t u );
+
+#if defined POSH_64BIT_INTEGER
+
+extern posh_u64_t  POSH_SwapU64( posh_u64_t u );
+extern posh_i64_t  POSH_SwapI64( posh_i64_t u );
+
+#endif /*POSH_64BIT_INTEGER */
+
+extern posh_u16_t *POSH_WriteU16ToLittle( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToLittle( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value );
+
+extern posh_u16_t *POSH_WriteU16ToBig( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToBig( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value );
+
+extern posh_u16_t  POSH_ReadU16FromLittle( const void *src );
+extern posh_i16_t  POSH_ReadI16FromLittle( const void *src );
+extern posh_u32_t  POSH_ReadU32FromLittle( const void *src );
+extern posh_i32_t  POSH_ReadI32FromLittle( const void *src );
+
+extern posh_u16_t  POSH_ReadU16FromBig( const void *src );
+extern posh_i16_t  POSH_ReadI16FromBig( const void *src );
+extern posh_u32_t  POSH_ReadU32FromBig( const void *src );
+extern posh_i32_t  POSH_ReadI32FromBig( const void *src );
+
+#if defined POSH_64BIT_INTEGER
+extern posh_u64_t *POSH_WriteU64ToLittle( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value );
+extern posh_u64_t *POSH_WriteU64ToBig( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value );
+
+extern posh_u64_t  POSH_ReadU64FromLittle( const void *src );
+extern posh_i64_t  POSH_ReadI64FromLittle( const void *src );
+extern posh_u64_t  POSH_ReadU64FromBig( const void *src );
+extern posh_i64_t  POSH_ReadI64FromBig( const void *src );
+#endif /* POSH_64BIT_INTEGER */
+
+#if defined POSH_LITTLE_ENDIAN
+
+#  define POSH_LittleU16(x) (x)
+#  define POSH_LittleU32(x) (x)
+#  define POSH_LittleI16(x) (x)
+#  define POSH_LittleI32(x) (x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) (x)
+#    define POSH_LittleI64(x) (x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#  define POSH_BigU16(x) POSH_SwapU16(x)
+#  define POSH_BigU32(x) POSH_SwapU32(x)
+#  define POSH_BigI16(x) POSH_SwapI16(x)
+#  define POSH_BigI32(x) POSH_SwapI32(x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) POSH_SwapU64(x)
+#    define POSH_BigI64(x) POSH_SwapI64(x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#else
+
+#  define POSH_BigU16(x) (x)
+#  define POSH_BigU32(x) (x)
+#  define POSH_BigI16(x) (x)
+#  define POSH_BigI32(x) (x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) (x)
+#    define POSH_BigI64(x) (x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#  define POSH_LittleU16(x) POSH_SwapU16(x)
+#  define POSH_LittleU32(x) POSH_SwapU32(x)
+#  define POSH_LittleI16(x) POSH_SwapI16(x)
+#  define POSH_LittleI32(x) POSH_SwapI32(x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) POSH_SwapU64(x)
+#    define POSH_LittleI64(x) POSH_SwapI64(x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/stdstream.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/stdstream.h
new file mode 100644
index 0000000..4f0a10a
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/stdstream.h
@@ -0,0 +1,459 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "nvcore.h"
+#include "stream.h"
+#include "array.h"
+
+#include <stdio.h> // fopen
+#include <string.h> // memcpy
+
+namespace nv
+{
+
+    // Opens fileName with the given stdio mode; yields NULL when the open fails.
+    inline FILE * fileOpen(const char * fileName, const char * mode)
+    {
+        nvCheck(fileName != NULL);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+        // Newer MSVC: go through fopen_s and map a non-zero error code to NULL.
+        FILE * handle;
+        const bool opened = (fopen_s(&handle, fileName, mode) == 0);
+        return opened ? handle : NULL;
+#else
+        // Everywhere else the result of plain fopen is returned unchanged.
+        return fopen(fileName, mode);
+#endif
+    }
+
+
+    /// Base class for stdio-backed streams: holds the FILE handle and implements positioning and error queries.
+    class NVCORE_CLASS StdStream : public Stream
+    {
+        NV_FORBID_COPY(StdStream);
+    public:
+
+        /// Ctor. Stores fp as given (it may be NULL); autoclose decides whether the destructor closes it.
+        StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { }
+
+        /// Dtor. Closes the handle only if it is non-NULL and autoclose was set.
+        virtual ~StdStream()
+        {
+            if( m_fp != NULL && m_autoclose ) {
+#if NV_OS_WIN32
+                _fclose_nolock( m_fp );
+#else
+                fclose( m_fp );
+#endif
+            }
+        }
+
+
+        /** @name Stream implementation. */
+        //@{
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(m_fp != NULL);
+            nvDebugCheck(pos <= size());
+#if NV_OS_WIN32
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+        }
+
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return _ftell_nolock(m_fp);
+#else
+            return (uint)ftell(m_fp);
+#endif
+        }
+
+        virtual uint size() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            uint pos = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            uint end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            uint pos = (uint)ftell(m_fp);   // remember the current position
+            fseek(m_fp, 0, SEEK_END);       // jump to the end to measure the file
+            uint end = (uint)ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);     // restore the position saved above
+#endif
+            return end;
+        }
+
+        virtual bool isError() const
+        {
+            return m_fp == NULL || ferror( m_fp ) != 0;
+        }
+
+        virtual void clearError()
+        {
+            nvDebugCheck(m_fp != NULL);
+            clearerr(m_fp);
+        }
+
+        // @@ The original implementation used feof, which only returns true after an attempt to read *past* the end of the stream.
+        // That is, after reading the last byte of a file isAtEnd would still return false, even though the stream pointer is at the file end. That was not the intent and was inconsistent with the MemoryStream implementation, so
+        // this implementation uses ftell and fseek to compare the current position with the end of the file.
+        virtual bool isAtEnd() const
+        {
+            if (m_fp == NULL) return true;
+            //nvDebugCheck(m_fp != NULL);
+            //return feof( m_fp ) != 0;
+#if NV_OS_WIN32
+            uint pos = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            uint end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            uint pos = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            uint end = (uint)ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+            return pos == end;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const { return true; }
+        //@}
+
+    protected:
+
+        FILE * m_fp;        // wrapped stdio handle; NULL is reported as an error state by isError()
+        bool m_autoclose;   // when true, the destructor closes m_fp
+
+    };
+
+
+    /// Output stream that writes to a stdio FILE handle.
+    class NVCORE_CLASS StdOutputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdOutputStream);
+    public:
+
+        /// Construct stream by file name; the file is opened in "wb" mode and closed by the destructor.
+        StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { }
+
+        /// Construct stream from an existing handle; autoclose decides whether the destructor closes it.
+        StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Write len bytes from data; returns the number of bytes actually written.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fwrite_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fwrite_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // Byte-wise unlocked path: stop at the first failed putc_unlocked so the
+            // result matches the byte count the fwrite-based branches would report.
+            uint written = 0;
+            while (written < len && putc_unlocked(((char *)data)[written], m_fp) != EOF) written++;
+            return written;
+#else
+            return (uint)fwrite(data, 1, len, m_fp);
+#endif
+        }
+
+        virtual bool isLoading() const
+        {
+            return false;
+        }
+
+        virtual bool isSaving() const
+        {
+            return true;
+        }
+        //@}
+
+    };
+
+
+    /// Input stream that reads from a stdio FILE handle.
+    class NVCORE_CLASS StdInputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdInputStream);
+    public:
+
+        /// Construct stream by file name; the file is opened in "rb" mode and closed by the destructor.
+        StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { }
+
+        /// Construct stream from an existing handle; by default the destructor closes it.
+        StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read up to len bytes into data; returns the number of bytes actually read.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fread_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fread_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // Byte-wise unlocked path: stop at EOF or error so no EOF marker is stored
+            // as data and the result is a short count, as with the fread-based branches.
+            uint got = 0;
+            for (int c; got < len && (c = getc_unlocked(m_fp)) != EOF; got++) ((char *)data)[got] = (char)c;
+            return got;
+#else
+            return (uint)fread(data, 1, len, m_fp);
+#endif
+            
+        }
+
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+    };
+
+
+
+    /// Read-only stream over a caller-supplied block of memory; the bytes are neither copied nor freed here.
+    class NVCORE_CLASS MemoryInputStream : public Stream
+    {
+        NV_FORBID_COPY(MemoryInputStream);
+    public:
+
+        /// Ctor. Starts reading at mem; size is the number of readable bytes.
+        MemoryInputStream( const uint8 * mem, uint size ) : m_mem(mem), m_ptr(mem), m_size(size) { }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read data. Copies at most len bytes, clamped to what is left, and returns the count copied.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(!isError());
+
+            uint left = m_size - tell();   // bytes remaining after the read cursor
+            if (len > left) len = left;
+
+            memcpy( data, m_ptr, len );
+            m_ptr += len;
+
+            return len;
+        }
+
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(!isError());
+            m_ptr = m_mem + pos;
+            nvDebugCheck(!isError());   // checked in debug builds only: pos must not exceed m_size
+        }
+
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_ptr >= m_mem);
+            return uint(m_ptr - m_mem);
+        }
+
+        virtual uint size() const
+        {
+            return m_size;
+        }
+
+        virtual bool isError() const
+        {
+            return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem;
+        }
+
+        virtual void clearError()
+        {
+            // Nothing to do: the error state is derived from the pointers, not stored.
+        }
+
+        virtual bool isAtEnd() const
+        {
+            return m_ptr == m_mem + m_size;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const
+        {
+            return true;
+        }
+
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+
+        const uint8 * ptr() const { return m_ptr; }   // current read cursor
+
+
+    private:
+
+        const uint8 * m_mem;   // start of the block
+        const uint8 * m_ptr;   // read cursor
+        uint m_size;           // readable bytes starting at m_mem
+
+    };
+
+
+    /// Buffer output stream.
+    class NVCORE_CLASS BufferOutputStream : public Stream
+    {
+        NV_FORBID_COPY(BufferOutputStream);
+    public:
+
+        BufferOutputStream(Array<uint8> & buffer) : m_buffer(buffer) { }
+
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            m_buffer.append((uint8 *)data, len);
+            return len;
+        }
+
+        virtual void seek( uint /*pos*/ ) { /*Not implemented*/ }
+        virtual uint tell() const { return m_buffer.size(); }
+        virtual uint size() const { return m_buffer.size(); }
+
+        virtual bool isError() const { return false; }
+        virtual void clearError() {}
+
+        virtual bool isAtEnd() const { return true; }
+        virtual bool isSeekable() const { return false; }
+        virtual bool isLoading() const { return false; }
+        virtual bool isSaving() const { return true; }
+
+    private:
+        Array<uint8> & m_buffer;
+    };
+
+
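+    /// Protected stream: forwards every call to a wrapped stream and executes 'throw;' if serialize() or seek() leaves it in an error state.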
+    class NVCORE_CLASS ProtectedStream : public Stream
+    {
+        NV_FORBID_COPY(ProtectedStream);
+    public:
+
+        /// Ctor.
+        ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false)
+        { 
+        }
+
+        /// Ctor.
+        ProtectedStream( Stream * s, bool autodelete = true ) : 
+        m_s(s), m_autodelete(autodelete) 
+        {
+            nvDebugCheck(m_s != NULL);
+        }
+
+        /// Dtor.
+        virtual ~ProtectedStream()
+        {
+            if( m_autodelete ) {
+                delete m_s;
+            }
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            len = m_s->serialize( data, len );
+
+            if( m_s->isError() ) {
+                throw;
+            }
+
+            return len;
+        }
+
+        virtual void seek( uint pos )
+        {
+            m_s->seek( pos );
+
+            if( m_s->isError() ) {
+                throw;
+            }
+        }
+
+        virtual uint tell() const
+        {
+            return m_s->tell();
+        }
+
+        virtual uint size() const
+        {
+            return m_s->size();
+        }
+
+        virtual bool isError() const
+        {
+            return m_s->isError();
+        }
+
+        virtual void clearError()
+        {
+            m_s->clearError();
+        }
+
+        virtual bool isAtEnd() const
+        {
+            return m_s->isAtEnd();
+        }
+
+        virtual bool isSeekable() const
+        {
+            return m_s->isSeekable();
+        }
+
+        virtual bool isLoading() const
+        {
+            return m_s->isLoading();
+        }
+
+        virtual bool isSaving() const
+        {
+            return m_s->isSaving();
+        }
+        //@}
+
+
+    private:
+
+        Stream * const m_s;
+        bool const m_autodelete;
+
+    };
+
+} // nv namespace
+
+
+//#endif // NV_CORE_STDSTREAM_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/stream.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/stream.h
new file mode 100644
index 0000000..9252d9e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/stream.h
@@ -0,0 +1,163 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_STREAM_H
+#define NV_CORE_STREAM_H
+
+#include "nvcore.h"
+#include "debug.h"
+
+namespace nv
+{
+
+    /// Base stream class.
+    class NVCORE_CLASS Stream {
+    public:
+
+        enum ByteOrder {
+            LittleEndian = false,
+            BigEndian = true,
+        };
+
+        /// Get the byte order of the system.
+        static ByteOrder getSystemByteOrder() { 
+#if NV_LITTLE_ENDIAN
+            return LittleEndian;
+#else
+            return BigEndian;
+#endif
+        }
+
+
+        /// Ctor.
+        Stream() : m_byteOrder(LittleEndian) { }
+
+        /// Virtual destructor.
+        virtual ~Stream() {}
+
+        /// Set byte order.
+        void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
+
+        /// Get byte order.
+        ByteOrder byteOrder() const { return m_byteOrder; }
+
+
+        /// Serialize the given data.
+        virtual uint serialize( void * data, uint len ) = 0;
+
+        /// Move to the given position in the archive.
+        virtual void seek( uint pos ) = 0;
+
+        /// Return the current position in the archive.
+        virtual uint tell() const = 0;
+
+        /// Return the current size of the archive.
+        virtual uint size() const = 0;
+
+        /// Determine if there has been any error.
+        virtual bool isError() const = 0;
+
+        /// Clear errors.
+        virtual void clearError() = 0;
+
+        /// Return true if the stream is at the end.
+        virtual bool isAtEnd() const = 0;
+
+        /// Return true if the stream is seekable.
+        virtual bool isSeekable() const = 0;
+
+        /// Return true if this is an input stream.
+        virtual bool isLoading() const = 0;
+
+        /// Return true if this is an output stream.
+        virtual bool isSaving() const = 0;
+
+
+        void advance(uint offset) { seek(tell() + offset); }
+
+
+        // friends	
+        friend Stream & operator<<( Stream & s, bool & c ) {
+#if NV_OS_DARWIN && !NV_CC_CPP11
+            nvStaticCheck(sizeof(bool) == 4);
+            uint8 b = c ? 1 : 0;
+            s.serialize( &b, 1 );
+            c = (b == 1);
+#else
+            nvStaticCheck(sizeof(bool) == 1);
+            s.serialize( &c, 1 );
+#endif
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, char & c ) {
+            nvStaticCheck(sizeof(char) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, uint8 & c ) {
+            nvStaticCheck(sizeof(uint8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, int8 & c ) {
+            nvStaticCheck(sizeof(int8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, uint16 & c ) {
+            nvStaticCheck(sizeof(uint16) == 2);
+            return s.byteOrderSerialize( &c, 2 );
+        }
+        friend Stream & operator<<( Stream & s, int16 & c ) {
+            nvStaticCheck(sizeof(int16) == 2);
+            return s.byteOrderSerialize( &c, 2 );
+        }
+        friend Stream & operator<<( Stream & s, uint32 & c ) {
+            nvStaticCheck(sizeof(uint32) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, int32 & c ) {
+            nvStaticCheck(sizeof(int32) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, uint64 & c ) {
+            nvStaticCheck(sizeof(uint64) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+        friend Stream & operator<<( Stream & s, int64 & c ) {
+            nvStaticCheck(sizeof(int64) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+        friend Stream & operator<<( Stream & s, float & c ) {
+            nvStaticCheck(sizeof(float) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, double & c ) {
+            nvStaticCheck(sizeof(double) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+
+    protected:
+
+        /// Serialize @a len bytes at @a v, reversing their order when stream and system byte order differ.
+        Stream & byteOrderSerialize( void * v, uint len ) {
+            if( m_byteOrder != getSystemByteOrder() ) {
+                uint8 * const bytes = (uint8 *)v;
+                for( uint remaining = len; remaining != 0; remaining-- ) {
+                    serialize( bytes + remaining - 1, 1 );
+                }
+                return *this;
+            }
+            serialize( v, len );
+            return *this;
+        }
+
+
+    private:
+
+        ByteOrder m_byteOrder;
+
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STREAM_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/strlib.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/strlib.h
new file mode 100644
index 0000000..80a957c
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/strlib.h
@@ -0,0 +1,429 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_STRING_H
+#define NV_CORE_STRING_H
+
+#include "debug.h"
+#include "hash.h" // hash
+
+//#include <string.h> // strlen, etc.
+
+#if NV_OS_WIN32
+#define NV_PATH_SEPARATOR '\\'
+#else
+#define NV_PATH_SEPARATOR '/'
+#endif
+
+namespace nv
+{
+
+    NVCORE_API uint strHash(const char * str, uint h) NV_PURE;
+
+    /// String hash based on Bernstein's hash: every character up to the
+    /// terminating NUL is folded into the running value as h = (33 * h) ^ c.
+    inline uint strHash(const char * data, uint h = 5381)
+    {
+        // @a h is both the seed and the accumulator.
+        for (const char * cursor = data; *cursor != 0; ++cursor) {
+            h = (33 * h) ^ uint(*cursor);
+        }
+        return h;
+    }
+
+    template <> struct Hash<const char *> {
+        uint operator()(const char * str) const { return strHash(str); }
+    };
+
+    NVCORE_API uint strLen(const char * str) NV_PURE;                       // Asserts on NULL strings.
+
+    NVCORE_API int strDiff(const char * s1, const char * s2) NV_PURE;       // Asserts on NULL strings.
+    NVCORE_API int strCaseDiff(const char * s1, const char * s2) NV_PURE;   // Asserts on NULL strings.
+    NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE;     // Accepts NULL strings.
+    NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings.
+
+    template <> struct Equal<const char *> {
+        bool operator()(const char * a, const char * b) const { return strEqual(a, b); }
+    };
+
+    NVCORE_API bool strBeginsWith(const char * dst, const char * prefix) NV_PURE;
+    NVCORE_API bool strEndsWith(const char * dst, const char * suffix) NV_PURE;
+
+
+    NVCORE_API void strCpy(char * dst, uint size, const char * src);
+    NVCORE_API void strCpy(char * dst, uint size, const char * src, uint len);
+    NVCORE_API void strCat(char * dst, uint size, const char * src);
+
+    NVCORE_API const char * strSkipWhiteSpace(const char * str);
+    NVCORE_API char * strSkipWhiteSpace(char * str);
+
+    NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE;
+
+    NVCORE_API bool isNumber(const char * str) NV_PURE;
+
+    /* @@ Implement these two functions and modify StringBuilder to use them?
+    NVCORE_API void strFormat(const char * dst, const char * fmt, ...);
+    NVCORE_API void strFormatList(const char * dst, const char * fmt, va_list arg);
+
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) __attribute__((format (printf, 2, 3)));
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        strFormatList(buffer, count, fmt, args);
+        va_end(args);
+    }
+    template <size_t count> void strFormatListSafe(char (&buffer)[count], const char *fmt, va_list arg) {
+        va_list tmp;
+        va_copy(tmp, args);
+        strFormatList(buffer, count, fmt, tmp);
+        va_end(tmp);
+    }*/
+
+    template <int count> void strCpySafe(char (&buffer)[count], const char *src) {
+        strCpy(buffer, count, src);
+    }
+
+    template <int count> void strCatSafe(char (&buffer)[count], const char * src) {
+        strCat(buffer, count, src);
+    }
+
+
+
+    /// String builder.
+    class NVCORE_CLASS StringBuilder
+    {
+    public:
+
+        StringBuilder();
+        explicit StringBuilder( uint size_hint );
+        StringBuilder(const char * str);
+        StringBuilder(const char * str, uint len);
+        StringBuilder(const StringBuilder & other);
+
+        ~StringBuilder();
+
+        StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+        StringBuilder & formatList( const char * format, va_list arg );
+
+        StringBuilder & append(const char * str);
+		StringBuilder & append(const char * str, uint len);
+        StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3)));
+        StringBuilder & appendFormatList(const char * format, va_list arg);
+
+        StringBuilder & appendSpace(uint n);
+
+        StringBuilder & number( int i, int base = 10 );
+        StringBuilder & number( uint i, int base = 10 );
+
+        StringBuilder & reserve(uint size_hint);
+        StringBuilder & copy(const char * str);
+        StringBuilder & copy(const char * str, uint len);
+        StringBuilder & copy(const StringBuilder & str);
+
+        StringBuilder & toLower();
+        StringBuilder & toUpper();
+
+        bool endsWith(const char * str) const;
+        bool beginsWith(const char * str) const;
+
+        char * reverseFind(char c);
+
+        void reset();
+        bool isNull() const { return m_size == 0; }
+
+        // const char * accessors
+        //operator const char * () const { return m_str; }
+        //operator char * () { return m_str; }
+        const char * str() const { return m_str; }
+        char * str() { return m_str; }
+
+        char * release();
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const StringBuilder & s ) {
+            return copy(s);
+        }
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const char * s ) {
+            return copy(s);
+        }
+
+        /// Equal operator: exact comparison via strEqual (accepts NULL), not strMatch pattern matching.
+        bool operator==( const StringBuilder & s ) const {
+            return strEqual(s.m_str, m_str);
+        }
+
+        /// Return the exact length.
+        uint length() const { return isNull() ? 0 : strLen(m_str); }
+
+        /// Return the size of the string container.
+        uint capacity() const { return m_size; }
+
+        /// Return the hash of the string.
+        uint hash() const { return isNull() ? 0 : strHash(m_str); }
+
+        // Swap strings.
+        friend void swap(StringBuilder & a, StringBuilder & b);
+
+    protected:
+
+        /// Size of the string container.
+        uint m_size;
+
+        /// String.
+        char * m_str;
+
+    };
+
+
+    /// Path string. @@ This should be called PathBuilder.
+    class NVCORE_CLASS Path : public StringBuilder
+    {
+    public:
+        Path() : StringBuilder() {}
+        explicit Path(int size_hint) : StringBuilder(size_hint) {}
+        Path(const char * str) : StringBuilder(str) {}
+        Path(const Path & path) : StringBuilder(path) {}
+
+        const char * fileName() const;
+        const char * extension() const;
+
+        void translatePath(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void appendSeparator(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void stripFileName();
+        void stripExtension();
+
+        // statics
+        NVCORE_API static char separator();
+        NVCORE_API static const char * fileName(const char *);
+        NVCORE_API static const char * extension(const char *);
+
+        NVCORE_API static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR);
+    };
+
+
+    /// String class.
+    class NVCORE_CLASS String
+    {
+    public:
+
+        /// Constructs a null string. @sa isNull()
+        String()
+        {
+            data = NULL;
+        }
+
+        /// Constructs a shared copy of str.
+        String(const String & str)
+        {
+            data = str.data;
+            if (data != NULL) addRef();
+        }
+
+        /// Constructs a shared string from a standard string.
+        String(const char * str)
+        {
+            setString(str);
+        }
+
+        /// Constructs a shared string from a standard string.
+        String(const char * str, int length)
+        {
+            setString(str, length);
+        }
+
+        /// Constructs a shared string from a StringBuilder.
+        String(const StringBuilder & str)
+        {
+            setString(str);
+        }
+
+        /// Dtor.
+        ~String()
+        {
+            release();
+        }
+
+        String clone() const;
+
+        /// Release the current string and allocate a new one.
+        const String & operator=( const char * str )
+        {
+            release();
+            setString( str );
+            return *this;
+        }
+
+        /// Release the current string and allocate a new one.
+        const String & operator=( const StringBuilder & str )
+        {
+            release();
+            setString( str );
+            return *this;
+        }
+
+        /// Implement value semantics.
+        String & operator=( const String & str )
+        {
+            if (str.data != data)
+            {
+                release();
+                data = str.data;
+                addRef();
+            }
+            return *this;
+        }
+
+        /// Equal operator: exact comparison via strEqual, which accepts NULL (null strings).
+        bool operator==( const String & str ) const
+        {
+            return strEqual(str.data, data);
+        }
+
+        /// Equal operator: exact comparison via strEqual, which accepts NULL (null strings).
+        bool operator==( const char * str ) const
+        {
+            return strEqual(str, data);
+        }
+
+        /// Not equal operator: negation of the exact strEqual comparison.
+        bool operator!=( const String & str ) const
+        {
+            return !strEqual(str.data, data);
+        }
+
+        /// Not equal operator: negation of the exact strEqual comparison.
+        bool operator!=( const char * str ) const
+        {
+            return !strEqual(str, data);
+        }
+
+        /// Returns true if this string is the null string.
+        bool isNull() const { return data == NULL; }
+
+        /// Return the exact length.
+        uint length() const { nvDebugCheck(data != NULL); return strLen(data); }
+
+        /// Return the hash of the string.
+        uint hash() const { nvDebugCheck(data != NULL); return strHash(data); }
+
+        /// const char * cast operator.
+        operator const char * () const { return data; }
+
+        /// Get string pointer.
+        const char * str() const { return data; }
+
+
+    private:
+
+        // Add reference count.
+        void addRef();
+
+        // Decrease reference count.
+        void release();
+
+        uint16 getRefCount() const
+        {
+            nvDebugCheck(data != NULL);
+            return *reinterpret_cast<const uint16 *>(data - 2);
+        }
+
+        void setRefCount(uint16 count) {
+            nvDebugCheck(data != NULL);
+            nvCheck(count < 0xFFFF);
+            *reinterpret_cast<uint16 *>(const_cast<char *>(data - 2)) = uint16(count);
+        }
+
+        void setData(const char * str) {
+            data = str + 2;
+        }
+
+        void allocString(const char * str)
+        {
+            allocString(str, strLen(str));
+        }
+
+        void allocString(const char * str, uint length);
+
+        void setString(const char * str);
+        void setString(const char * str, uint length);
+        void setString(const StringBuilder & str);
+
+        // Swap strings.
+        friend void swap(String & a, String & b);
+
+    private:
+
+        const char * data;
+
+    };
+
+    template <> struct Hash<String> {
+        uint operator()(const String & str) const { return str.hash(); }
+    };
+
+
+    // Like AutoPtr, but for const char strings.
+    class AutoString
+    {
+        NV_FORBID_COPY(AutoString);
+        NV_FORBID_HEAPALLOC();
+    public:
+
+        // Ctor.
+        AutoString(const char * p = NULL) : m_ptr(p) { }
+
+#if NV_CC_CPP11
+        // Move ctor.
+        AutoString(AutoString && ap) : m_ptr(ap.m_ptr) { ap.m_ptr = NULL; }
+#endif
+        
+        // Dtor. Deletes owned pointer.
+        ~AutoString() {
+            delete [] m_ptr;
+            m_ptr = NULL;
+        }
+
+        // Delete owned pointer and assign new one.
+        void operator=(const char * p) {
+            if (p != m_ptr) 
+            {
+                delete [] m_ptr;
+                m_ptr = p;
+            }
+        }
+
+        // Get pointer.
+        const char * ptr() const { return m_ptr; }
+        operator const char *() const { return m_ptr; }
+
+        // Relinquish ownership of the underlying pointer and returns that pointer.
+        const char * release() {
+            const char * tmp = m_ptr;
+            m_ptr = NULL;
+            return tmp;
+        }
+
+        // comparison operators.
+        friend bool operator == (const AutoString & ap, const char * const p) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const AutoString & ap, const char * const p) {
+            return (ap.ptr() != p);
+        }
+        friend bool operator == (const char * const p, const AutoString & ap) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const char * const p, const AutoString & ap) {
+            return (ap.ptr() != p);
+        }
+
+    private:
+        const char * m_ptr;
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STRING_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvcore/utils.h b/3rdparty/bimg/3rdparty/nvtt/nvcore/utils.h
new file mode 100644
index 0000000..364b629
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvcore/utils.h
@@ -0,0 +1,281 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_CORE_UTILS_H
+#define NV_CORE_UTILS_H
+
+#include "debug.h" // nvdebugcheck
+
+#include <new> // for placement new
+
+
+// Just in case. Grrr.
+#undef min
+#undef max
+
+#define NV_INT8_MIN    (-128)
+#define NV_INT8_MAX    127
+#define NV_UINT8_MAX    255
+#define NV_INT16_MIN    (-32767-1)
+#define NV_INT16_MAX    32767
+#define NV_UINT16_MAX   0xffff
+#define NV_INT32_MIN    (-2147483647-1)
+#define NV_INT32_MAX    2147483647
+#define NV_UINT32_MAX   0xffffffff
+#define NV_INT64_MAX    POSH_I64(9223372036854775807)
+#define NV_INT64_MIN    (-POSH_I64(9223372036854775807)-1)
+#define NV_UINT64_MAX   POSH_U64(0xffffffffffffffff)
+
+#define NV_HALF_MAX     65504.0F
+#define NV_FLOAT_MAX    3.402823466e+38F
+
+#define NV_INTEGER_TO_FLOAT_MAX  16777216     // 2^24: largest integer such that it and all smaller integers can be stored in a 32bit float.
+
+
+namespace nv
+{
+    // Less error prone than casting. From CB:
+    // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html
+
+    // These intentionally look like casts.
+
+    // uint32 casts:
+    template <typename T> inline uint32 U32(T x) { return x; }
+    template <> inline uint32 U32<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; }
+    template <> inline uint32 U32<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; }
+    //template <> inline uint32 U32<uint32>(uint32 x) { return x; }
+    template <> inline uint32 U32<int32>(int32 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32<uint16>(uint16 x) { return x; }
+    template <> inline uint32 U32<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32<uint8>(uint8 x) { return x; }
+    template <> inline uint32 U32<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+
+    // int32 casts:
+    template <typename T> inline int32 I32(T x) { return x; }
+    template <> inline int32 I32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    template <> inline int32 I32<int64>(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_INT32_MAX); return (int32)x; }
+    template <> inline int32 I32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    //template <> inline int32 I32<int32>(int32 x) { return x; }
+    //template <> inline int32 I32<uint16>(uint16 x) { return x; }
+    //template <> inline int32 I32<int16>(int16 x) { return x; }
+    //template <> inline int32 I32<uint8>(uint8 x) { return x; }
+    //template <> inline int32 I32<int8>(int8 x) { return x; }
+
+    // uint16 casts:
+    template <typename T> inline uint16 U16(T x) { return x; }
+    template <> inline uint16 U16<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<uint32>(uint32 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<int32>(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    //template <> inline uint16 U16<uint16>(uint16 x) { return x; }
+    template <> inline uint16 U16<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+    //template <> inline uint16 U16<uint8>(uint8 x) { return x; }
+    template <> inline uint16 U16<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+
+    // int16 casts:
+    template <typename T> inline int16 I16(T x) { return x; }
+    template <> inline int16 I16<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int64>(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int32>(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    //template <> inline int16 I16<int16>(int16 x) { return x; }
+    //template <> inline int16 I16<uint8>(uint8 x) { return x; }
+    //template <> inline int16 I16<int8>(int8 x) { return x; }
+
+    // uint8 casts:
+    template <typename T> inline uint8 U8(T x) { return x; }
+    template <> inline uint8 U8<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<uint32>(uint32 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int32>(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<uint16>(uint16 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int16>(int16 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    //template <> inline uint8 U8<uint8>(uint8 x) { return x; }
+    template <> inline uint8 U8<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint8)x; }
+    //template <> inline uint8 U8<float>(int8 x) { nvDebugCheck(x >= 0.0f && x <= 255.0f); return (uint8)x; }
+
+    // int8 casts:
+    template <typename T> inline int8 I8(T x) { return x; }
+    template <> inline int8 I8<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int64>(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int32>(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int16>(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint8>(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    //template <> inline int8 I8<int8>(int8 x) { return x; }
+
+    // float casts:
+    template <typename T> inline float F32(T x) { return x; }
+    template <> inline float F32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<int64>(int64 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<int32>(int32 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    // The compiler should not complain about these conversions:
+    //template <> inline float F32<uint16>(uint16 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<int16>(int16 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<uint8>(uint8 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<int8>(int8 x) { nvDebugCheck(return (float)x; }
+
+
+    /// Swap two values.
+    template <typename T> 
+    inline void swap(T & a, T & b)
+    {
+        T temp(a);
+        a = b; 
+        b = temp;
+    }
+
+    /// Return the maximum of the two arguments. For floating point values, it returns the second value if the first is NaN.
+    template <typename T> 
+    //inline const T & max(const T & a, const T & b)
+    inline T max(const T & a, const T & b)
+    {
+        return (b < a) ? a : b;
+    }
+
+	/// Return the maximum of the four arguments.
+	template <typename T> 
+	//inline const T & max4(const T & a, const T & b, const T & c)
+	inline T max4(const T & a, const T & b, const T & c, const T & d)
+	{
+		return max(max(a, b), max(c, d));
+	}
+
+    /// Return the maximum of the three arguments.
+    template <typename T> 
+    //inline const T & max3(const T & a, const T & b, const T & c)
+    inline T max3(const T & a, const T & b, const T & c)
+    {
+        return max(a, max(b, c));
+    }
+
+    /// Return the minimum of two values.
+    template <typename T> 
+    //inline const T & min(const T & a, const T & b)
+    inline T min(const T & a, const T & b)
+    {
+        return (a < b) ? a : b;
+    }
+
+    /// Return the minimum of the three arguments.
+    template <typename T> 
+    //inline const T & min3(const T & a, const T & b, const T & c)
+    inline T min3(const T & a, const T & b, const T & c)
+    {
+        return min(a, min(b, c));
+    }
+
+    /// Clamp between two values.
+    template <typename T> 
+    //inline const T & clamp(const T & x, const T & a, const T & b)
+    inline T clamp(const T & x, const T & a, const T & b)
+    {
+        return min(max(x, a), b);
+    }
+
+    /** Return the next power of two. 
+    * @see http://graphics.stanford.edu/~seander/bithacks.html
+    * @warning Behaviour for 0 is undefined.
+    * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
+    * @note nextPowerOfTwo(x) = 2 << log2(x-1)
+    */
+    inline uint nextPowerOfTwo( uint x )
+    {
+        nvDebugCheck( x != 0 );
+#if 1	// On modern CPUs this is supposed to be as fast as using the bsr instruction.
+        x--;
+        x |= x >> 1;
+        x |= x >> 2;
+        x |= x >> 4;
+        x |= x >> 8;
+        x |= x >> 16;
+        return x+1;	
+#else
+        uint p = 1;
+        while( x > p ) {
+            p += p;
+        }
+        return p;
+#endif
+    }
+
+    /// Return true if @a n is a power of two. Note that this also returns true for n == 0.
+    inline bool isPowerOfTwo( uint n )
+    {
+        return (n & (n-1)) == 0;
+    }
+
+
+    // @@ Move this to utils?
+    /// Delete all the elements of a container.
+    template <typename T>
+    void deleteAll(T & container)
+    {
+        for (typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i))
+        {
+            delete container[i];
+        }
+    }
+
+
+
+    // @@ Specialize these methods for numeric, pointer, and pod types.
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T; // placement new
+        }
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T & elem) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T(elem); // placement new
+        }
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T(src[i]); // placement new
+        }
+    }
+
+    template <typename T>
+    void destroy_range(T * restrict ptr, uint new_size, uint old_size) {
+        for (uint i = new_size; i < old_size; i++) {
+            (ptr+i)->~T(); // Explicit call to the destructor
+        }
+    }
+
+    template <typename T>
+    void fill(T * restrict dst, uint count, const T & value) {
+        for (uint i = 0; i < count; i++) {
+            dst[i] = value;
+        }
+    }
+
+    template <typename T>
+    void copy_range(T * restrict dst, const T * restrict src, uint count) {
+        for (uint i = 0; i < count; i++) {
+            dst[i] = src[i];
+        }
+    }
+
+    /// Linear search of ptr[begin, end); on a hit, writes the position to @a index unless it is NULL.
+    template <typename T>
+    bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) {
+        uint pos = begin;
+        // Advance to the first match, or to end when there is none.
+        while (pos < end && !(ptr[pos] == element)) pos++;
+        if (pos >= end) return false;
+        if (index != NULL) *index = pos;
+        return true;
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_UTILS_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.cpp b/3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.cpp
new file mode 100644
index 0000000..ba01b1f
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.cpp
@@ -0,0 +1,1200 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#include "fitting.h"
+#include "vector.inl"
+#include "plane.inl"
+#include "matrix.inl"
+
+#include "nvcore/array.inl"
+#include "nvcore/utils.h" // max, swap
+
+using namespace nv;
+
+// @@ Move to EigenSolver.h
+
+// @@ We should be able to do something cheaper...
+// Returns the row of the packed symmetric 3x3 matrix (xx, xy, xz, yy, yz, zz)
+// that has the largest squared length.
+static Vector3 estimatePrincipalComponent(const float * __restrict matrix)
+{
+	const Vector3 rows[3] = {
+		Vector3(matrix[0], matrix[1], matrix[2]),
+		Vector3(matrix[1], matrix[3], matrix[4]),
+		Vector3(matrix[2], matrix[4], matrix[5]) };
+	const float len0 = lengthSquared(rows[0]);
+	const float len1 = lengthSquared(rows[1]);
+	const float len2 = lengthSquared(rows[2]);
+	// The comparisons are strict, so on a tie the later row is the one returned.
+	return (len0 > len1 && len0 > len2) ? rows[0] : ((len1 > len2) ? rows[1] : rows[2]);
+}
+
+
+// Approximates the dominant eigenvector of a packed symmetric 3x3 matrix
+// (xx, xy, xz, yy, yz, zz) with a fixed number of power-iteration steps.
+// Returns the zero vector when all three diagonal entries are zero.
+static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
+{
+    const bool zeroDiagonal = (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0);
+    if (zeroDiagonal) return Vector3(0.0f);
+
+    Vector3 guess = estimatePrincipalComponent(matrix);
+
+    const int kSteps = 8;
+    for (int step = 0; step < kSteps; ++step)
+    {
+        // Multiply the current guess by the matrix.
+        const float mx = guess.x * matrix[0] + guess.y * matrix[1] + guess.z * matrix[2];
+        const float my = guess.x * matrix[1] + guess.y * matrix[3] + guess.z * matrix[4];
+        const float mz = guess.x * matrix[2] + guess.y * matrix[4] + guess.z * matrix[5];
+
+        // Rescale by the largest signed component (not the largest magnitude).
+        guess = Vector3(mx, my, mz) / max(max(mx, my), mz);
+    }
+    return guess;
+}
+
+
+// Arithmetic mean of the n points; divides by float(n) without checking for n == 0.
+Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
+{
+    Vector3 sum(0.0f);
+    int idx = 0;
+    while (idx < n)
+    {
+        sum += points[idx++];
+    }
+    sum /= float(n);
+    return sum;
+}
+
+// Weighted mean: sum(weights[i] * points[i]) / sum(weights[i]). metric is not used here.
+Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float weightSum = 0.0f;
+    Vector3 weightedSum(0.0f);
+    for (int idx = 0; idx < n; ++idx)
+    {
+        weightSum += weights[idx];
+        weightedSum += weights[idx]*points[idx];
+    }
+    // No guard against a zero total weight.
+    weightedSum /= weightSum;
+    return weightedSum;
+}
+
+// Arithmetic mean of the n points; divides by float(n) without checking for n == 0.
+Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points)
+{
+    Vector4 sum(0.0f);
+    int idx = 0;
+    while (idx < n)
+    {
+        sum += points[idx++];
+    }
+    sum /= float(n);
+    return sum;
+}
+
+// Weighted mean: sum(weights[i] * points[i]) / sum(weights[i]). metric is not used here.
+Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric)
+{
+    float weightSum = 0.0f;
+    Vector4 weightedSum(0.0f);
+    for (int idx = 0; idx < n; ++idx)
+    {
+        weightSum += weights[idx];
+        weightedSum += weights[idx]*points[idx];
+    }
+    // No guard against a zero total weight.
+    weightedSum /= weightSum;
+    return weightedSum;
+}
+
+
+
+// Accumulates the (unnormalized) covariance of the n points about their centroid
+// into covariance[0..5], packed as (xx, xy, xz, yy, yz, zz). Returns the centroid.
+Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance)
+{
+    Vector3 center = computeCentroid(n, points);
+
+    // Start from an all-zero packed matrix.
+    for (int k = 0; k < 6; ++k)
+    {
+        covariance[k] = 0.0f;
+    }
+
+    for (int idx = 0; idx < n; ++idx)
+    {
+        Vector3 d = points[idx] - center;
+
+        covariance[0] += d.x * d.x;
+        covariance[1] += d.x * d.y;
+        covariance[2] += d.x * d.z;
+        covariance[3] += d.y * d.y;
+        covariance[4] += d.y * d.z;
+        covariance[5] += d.z * d.z;
+    }
+    return center;
+}
+
+// Weighted covariance about the weighted centroid, written to covariance[0..5]
+// packed as (xx, xy, xz, yy, yz, zz). Returns the weighted centroid.
+Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance)
+{
+    Vector3 center = computeCentroid(n, points, weights, metric);
+
+    for (int k = 0; k < 6; ++k)
+    {
+        covariance[k] = 0.0f;
+    }
+
+    for (int idx = 0; idx < n; ++idx)
+    {
+        // offset: metric-scaled distance from the centroid; weighted: offset times the point's weight.
+        Vector3 offset = (points[idx] - center) * metric;
+        Vector3 weighted = weights[idx]*offset;
+
+        covariance[0] += offset.x * weighted.x;
+        covariance[1] += offset.x * weighted.y;
+        covariance[2] += offset.x * weighted.z;
+        covariance[3] += offset.y * weighted.y;
+        covariance[4] += offset.y * weighted.z;
+        covariance[5] += offset.z * weighted.z;
+    }
+    return center;
+}
+
+// Accumulates the (unnormalized) covariance of the n points about their centroid into
+// covariance[0..9], packed as (xx, xy, xz, xw, yy, yz, yw, zz, zw, ww). Returns the centroid.
+Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, float *__restrict covariance)
+{
+    Vector4 center = computeCentroid(n, points);
+
+    // Start from an all-zero packed matrix.
+    for (int k = 0; k < 10; ++k)
+    {
+        covariance[k] = 0.0f;
+    }
+
+    for (int idx = 0; idx < n; ++idx)
+    {
+        Vector4 d = points[idx] - center;
+
+        covariance[0] += d.x * d.x;
+        covariance[1] += d.x * d.y;
+        covariance[2] += d.x * d.z;
+        covariance[3] += d.x * d.w;
+
+        covariance[4] += d.y * d.y;
+        covariance[5] += d.y * d.z;
+        covariance[6] += d.y * d.w;
+
+        covariance[7] += d.z * d.z;
+        covariance[8] += d.z * d.w;
+
+        covariance[9] += d.w * d.w;
+    }
+    return center;
+}
+
+// Weighted covariance about the weighted centroid, written to covariance[0..9] packed as
+// (xx, xy, xz, xw, yy, yz, yw, zz, zw, ww). Returns the weighted centroid.
+Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric, float *__restrict covariance)
+{
+    Vector4 center = computeCentroid(n, points, weights, metric);
+
+    // Start from an all-zero packed matrix.
+    for (int k = 0; k < 10; ++k)
+    {
+        covariance[k] = 0.0f;
+    }
+
+    for (int idx = 0; idx < n; ++idx)
+    {
+        Vector4 offset = (points[idx] - center) * metric;
+        Vector4 weighted = weights[idx]*offset;
+
+        covariance[0] += offset.x * weighted.x;
+        covariance[1] += offset.x * weighted.y;
+        covariance[2] += offset.x * weighted.z;
+        covariance[3] += offset.x * weighted.w;
+
+        covariance[4] += offset.y * weighted.y;
+        covariance[5] += offset.y * weighted.z;
+        covariance[6] += offset.y * weighted.w;
+
+        covariance[7] += offset.z * weighted.z;
+        covariance[8] += offset.z * weighted.w;
+
+        covariance[9] += offset.w * weighted.w;
+    }
+    return center;
+}
+
+
+
+// Principal axis of the points: packed covariance handed to the power-method solver.
+Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points)
+{
+    float packedCovariance[6];
+    computeCovariance(n, points, packedCovariance); // the returned centroid is not needed
+    return firstEigenVector_PowerMethod(packedCovariance);
+}
+
+// Principal axis of the weighted points: packed weighted covariance handed to the power-method solver.
+Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float packedCovariance[6];
+    computeCovariance(n, points, weights, metric, packedCovariance); // the returned centroid is not needed
+    return firstEigenVector_PowerMethod(packedCovariance);
+}
+
+
+
+// Eigenvector for the largest eigenvalue of a packed symmetric 3x3 matrix
+// (xx, xy, xz, yy, yz, zz). Yields the zero vector when the diagonal is all
+// zero or when the eigen solver reports failure.
+static inline Vector3 firstEigenVector_EigenSolver3(const float *__restrict matrix)
+{
+    const bool zeroDiagonal = (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0);
+    if (zeroDiagonal) return Vector3(0.0f);
+
+    float values[3];
+    Vector3 vectors[3];
+    const bool solved = nv::Fit::eigenSolveSymmetric3(matrix, values, vectors);
+
+    // The solver orders its output by decreasing eigenvalue, so slot 0 is the
+    // principal direction.
+    return solved ? vectors[0] : Vector3(0.0f);
+}
+
+// Principal axis of the points: packed covariance handed to the 3x3 eigen solver.
+Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points)
+{
+    float packedCovariance[6];
+    computeCovariance(n, points, packedCovariance); // the returned centroid is not needed
+    return firstEigenVector_EigenSolver3(packedCovariance);
+}
+
+// Principal axis of the weighted points: packed weighted covariance handed to the 3x3 eigen solver.
+Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float packedCovariance[6];
+    computeCovariance(n, points, weights, metric, packedCovariance); // the returned centroid is not needed
+    return firstEigenVector_EigenSolver3(packedCovariance);
+}
+
+
+
+// Eigenvector for the largest eigenvalue of a packed symmetric 4x4 matrix
+// (xx, xy, xz, xw, yy, yz, yw, zz, zw, ww). Yields the zero vector when the
+// diagonal is all zero or when the eigen solver reports failure.
+static inline Vector4 firstEigenVector_EigenSolver4(const float *__restrict matrix)
+{
+    const bool zeroDiagonal = (matrix[0] == 0 && matrix[4] == 0 && matrix[7] == 0 && matrix[9] == 0);
+    if (zeroDiagonal) return Vector4(0.0f);
+
+    float values[4];
+    Vector4 vectors[4];
+    const bool solved = nv::Fit::eigenSolveSymmetric4(matrix, values, vectors);
+
+    // The solver orders its output by decreasing eigenvalue, so slot 0 is the
+    // principal direction.
+    return solved ? vectors[0] : Vector4(0.0f);
+}
+
+// Principal axis of the points: packed covariance handed to the 4x4 eigen solver.
+Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points)
+{
+    float packedCovariance[10];
+    computeCovariance(n, points, packedCovariance); // the returned centroid is not needed
+    return firstEigenVector_EigenSolver4(packedCovariance);
+}
+
+// Principal axis of the weighted points: packed weighted covariance handed to the 4x4 eigen solver.
+Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric)
+{
+    float packedCovariance[10];
+    computeCovariance(n, points, weights, metric, packedCovariance); // the returned centroid is not needed
+    return firstEigenVector_EigenSolver4(packedCovariance);
+}
+
+
+
+void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R);
+
+// Principal direction of the n points, taken from the SVD of the zero-padded point matrix.
+Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points)
+{
+	// One point per row of a square matrix that is at least 3 wide, so that
+	// fewer than 3 points can no longer index past the end of Q or R.
+	const int dim = (n > 3) ? n : 3;
+    Array<float> Q; Q.resize(dim*dim, 0.0f);
+	for (int i = 0; i < n; ++i)
+	{
+		Q[i*dim+0] = points[i].x;
+		Q[i*dim+1] = points[i].y;
+		Q[i*dim+2] = points[i].z;
+	}
+	// Alloc space for the SVD outputs
+    Array<float> diag; diag.resize(dim, 0.0f);
+    Array<float> R; R.resize(dim*dim, 0.0f);
+	ArvoSVD(dim, dim, &Q[0], &diag[0], &R[0]);
+	// ArvoSVD sorts by descending singular value, so the first row of R is the principal one.
+	return Vector3(R[0], R[1], R[2]);
+}
+
+// Principal direction of the n points, taken from the SVD of the zero-padded point matrix.
+Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points)
+{
+	// One point per row of a square matrix that is at least 4 wide, so that
+	// fewer than 4 points can no longer index past the end of Q or R.
+	const int dim = (n > 4) ? n : 4;
+    Array<float> Q; Q.resize(dim*dim, 0.0f);
+	for (int i = 0; i < n; ++i)
+	{
+		Q[i*dim+0] = points[i].x;
+		Q[i*dim+1] = points[i].y;
+		Q[i*dim+2] = points[i].z;
+		Q[i*dim+3] = points[i].w;
+	}
+	// Alloc space for the SVD outputs
+    Array<float> diag; diag.resize(dim, 0.0f);
+    Array<float> R; R.resize(dim*dim, 0.0f);
+	ArvoSVD(dim, dim, &Q[0], &diag[0], &R[0]);
+	// ArvoSVD sorts by descending singular value, so the first row of R is the principal one.
+	return Vector4(R[0], R[1], R[2], R[3]);
+}
+
+
+
+// Plane fitted to the n points: it passes through their centroid and its normal
+// is the eigenvector of the covariance matrix with the smallest eigenvalue.
+// Falls back to the plane with normal (0, 0, 1) through the centroid when the
+// covariance diagonal is all zero or the eigen solver fails.
+Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
+{
+    float packedCovariance[6];
+    Vector3 center = computeCovariance(n, points, packedCovariance);
+
+    const bool degenerate =
+        (packedCovariance[0] == 0 && packedCovariance[3] == 0 && packedCovariance[5] == 0);
+
+    float values[3];
+    Vector3 vectors[3];
+    if (degenerate || !eigenSolveSymmetric3(packedCovariance, values, vectors))
+    {
+        return Plane(Vector3(0, 0, 1), center);
+    }
+
+    return Plane(vectors[2], center); // the solver sorts descending, so index 2 is the smallest
+}
+
+// True when the smallest eigenvalue of the points' covariance matrix is below
+// epsilon; false when it is not, or when the eigen solver fails.
+bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
+{
+    float packedCovariance[6];
+    computeCovariance(n, points, packedCovariance);
+
+    float values[3];
+    Vector3 vectors[3];
+    const bool solved = eigenSolveSymmetric3(packedCovariance, values, vectors);
+
+    // Eigenvalues come back in decreasing order, so index 2 is the smallest.
+    return solved && values[2] < epsilon;
+}
+
+
+
+// Tridiagonal solver from Charles Bloom. 
+// Householder transforms followed by QL decomposition. 
+// Seems to be based on the code from Numerical Recipes in C.
+
+static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd);
+static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd);
+
+// Eigen-decomposition of a symmetric 3x3 matrix given as its packed upper
+// triangle (m00, m01, m02, m11, m12, m22). On success returns true with the
+// eigenvalues in decreasing order and eigenVectors[i] paired with eigenValues[i].
+// If the QL iteration does not converge, zeroes both outputs and returns false.
+bool nv::Fit::eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    // Expand the packed triangle into a full symmetric matrix.
+    float work[3][3];
+    work[0][0] = matrix[0];
+    work[1][1] = matrix[3];
+    work[2][2] = matrix[5];
+    work[1][0] = work[0][1] = matrix[1];
+    work[2][0] = work[0][2] = matrix[2];
+    work[2][1] = work[1][2] = matrix[4];
+
+    float diag[3];
+    float subd[3];
+    EigenSolver3_Tridiagonal(work, diag, subd);
+    const bool converged = EigenSolver3_QLAlgorithm(work, diag, subd);
+    if (!converged)
+    {
+        for (int k = 0; k < 3; ++k) {
+            eigenValues[k] = 0;
+            eigenVectors[k] = Vector3(0);
+        }
+        return false;
+    }
+
+    // Column c of work becomes eigenVectors[c]; diag[c] is the matching eigenvalue.
+    for (int c = 0; c < 3; ++c)
+    {
+        eigenValues[c] = (float)diag[c];
+        for (int r = 0; r < 3; ++r)
+        {
+            eigenVectors[c].component[r] = (float) work[r][c];
+        }
+    }
+
+    // Three conditional exchanges order the values from largest to smallest.
+    if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[0], eigenValues[2]);
+        swap(eigenVectors[0], eigenVectors[2]);
+    }
+    if (eigenValues[1] > eigenValues[0])
+    {
+        swap(eigenValues[0], eigenValues[1]);
+        swap(eigenVectors[0], eigenVectors[1]);
+    }
+    if (eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[1], eigenValues[2]);
+        swap(eigenVectors[1], eigenVectors[2]);
+    }
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2]);
+    return true;
+}
+
+static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd)
+{
+    // Householder reduction T = Q^t M Q
+    //   Input:
+    //     mat, symmetric 3x3 matrix M
+    //   Output:
+    //     mat, orthogonal matrix Q
+    //     diag, diagonal entries of T
+    //     subd, subdiagonal entries of T (T is symmetric)
+    const float epsilon = 1e-08f; // absolute threshold below which mat[0][2] is treated as zero
+
+    float a = mat[0][0];
+    float b = mat[0][1];
+    float c = mat[0][2];
+    float d = mat[1][1];
+    float e = mat[1][2];
+    float f = mat[2][2];
+
+    diag[0] = a;
+    subd[2] = 0.f; // only subd[0] and subd[1] carry data; the third slot is zeroed
+    if (fabsf(c) >= epsilon)
+    {
+        const float ell = sqrtf(b*b+c*c); // length of the (b, c) pair from the first row
+        b /= ell; // from here on b and c hold the normalized pair
+        c /= ell;
+        const float q = 2*b*e+c*(f-d);
+        diag[1] = d+c*q;
+        diag[2] = f-c*q;
+        subd[0] = ell;
+        subd[1] = e-b*q;
+        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0; // Q leaves the first axis untouched
+        mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c;
+        mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b;
+    }
+    else // mat[0][2] is negligible: M is used as-is and Q is the identity
+    {
+        diag[1] = d;
+        diag[2] = f;
+        subd[0] = b;
+        subd[1] = e;
+        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
+        mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0;
+        mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1;
+    }
+}
+
+static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    // diag and subd are updated in place; every rotation is also applied to the
+    // columns of mat. Returns false if some index ell uses up all maxiter passes.
+    const int maxiter = 32; // cap on the number of passes per index ell
+
+    for (int ell = 0; ell < 3; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            int m;
+            for (m = ell; m <= 1; m++) // find the first negligible subd[m]; m ends at 2 if there is none
+            {
+                float dd = fabsf(diag[m]) + fabsf(diag[m+1]);
+                if ( fabsf(subd[m]) + dd == dd ) // subd[m] too small to change dd in float arithmetic
+                    break;
+            }
+            if ( m == ell ) // subd[ell] is negligible: nothing left to do for this index
+                break;
+
+            float g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            float r = sqrtf(g*g+1);
+            if ( g < 0 ) // choose the sign so g and r are not subtracted from one another
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            float s = 1, c = 1, p = 0;
+            for (int i = m-1; i >= ell; i--) // sweep from row m-1 back up to row ell
+            {
+                float f = s*subd[i], b = c*subd[i];
+                if ( fabsf(f) >= fabsf(g) ) // divide by the larger of f and g
+                {
+                    c = g/f;
+                    r = sqrtf(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrtf(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                for (int k = 0; k < 3; k++) // apply the same (c, s) rotation to columns i and i+1 of mat
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter ) // the loop above ran out of passes without breaking
+            // should not get here under normal circumstances
+            return false;
+    }
+    return true;
+}
+
+
+
+// Tridiagonal solver for 4x4 symmetric matrices.
+
+static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd);
+static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd);
+
+// Eigen-decomposition of a symmetric 4x4 matrix given as its packed upper triangle
+// (m00, m01, m02, m03, m11, m12, m13, m22, m23, m33). On success returns true with the
+// eigenvalues in decreasing order and eigenVectors[i] paired with eigenValues[i].
+// If the QL iteration does not converge, zeroes both outputs and returns false.
+bool nv::Fit::eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    float subd[4];
+    float diag[4];
+    float work[4][4];
+
+    work[0][0] = matrix[0];
+    work[0][1] = work[1][0] = matrix[1];
+    work[0][2] = work[2][0] = matrix[2];
+    work[0][3] = work[3][0] = matrix[3];
+    work[1][1] = matrix[4];
+    work[1][2] = work[2][1] = matrix[5];
+    work[1][3] = work[3][1] = matrix[6];
+    work[2][2] = matrix[7];
+    work[2][3] = work[3][2] = matrix[8];
+    work[3][3] = matrix[9];
+
+    EigenSolver4_Tridiagonal(work, diag, subd);
+    if (!EigenSolver4_QLAlgorithm(work, diag, subd))
+    {
+        for (int i = 0; i < 4; i++) {
+            eigenValues[i] = 0;
+            eigenVectors[i] = Vector4(0);
+        }
+        return false;
+    }
+
+    for (int i = 0; i < 4; i++) {
+        eigenValues[i] = (float)diag[i];
+    }
+    // eigenvectors are the columns; make them the rows
+    for (int i = 0; i < 4; i++)
+    {
+        for (int j = 0; j < 4; j++)
+        {
+            eigenVectors[j].component[i] = (float) work[i][j];
+        }
+    }
+
+    // sort by eigenvalue, largest first, keeping each vector with its value
+	for (int i = 0; i < 3; ++i)
+	{
+		for (int j = i+1; j < 4; ++j)
+		{
+			if (eigenValues[j] > eigenValues[i])
+			{
+				swap(eigenValues[i], eigenValues[j]);
+				swap(eigenVectors[i], eigenVectors[j]);
+			}
+		}
+	}
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2] && eigenValues[0] >= eigenValues[3]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2] && eigenValues[1] >= eigenValues[3]);
+    nvDebugCheck(eigenValues[2] >= eigenValues[3]); // was [2] >= [2], which can never fail
+    return true;
+}
+
+inline float signNonzero(float x) { // +1 when x >= 0 (zero included), -1 otherwise; never 0
+	if (x >= 0.0f) return 1.0f;
+	return -1.0f;
+}
+
+static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd)
+{
+    // Householder reduction T = Q^t M Q
+    //   Input:
+    //     mat, symmetric 4x4 matrix M
+    //   Output:
+    //     mat, orthogonal matrix Q
+    //     diag, diagonal entries of T
+    //     subd, subdiagonal entries of T (T is symmetric)
+	static const int n = 4;
+
+	// Set epsilon relative to size of elements in matrix. The maximum must start at 0:
+	// seeding it with FLT_MAX made epsilon huge, so every column below was skipped.
+	static const float relEpsilon = 1e-6f;
+	float maxElement = 0.0f;
+	for (int i = 0; i < n; ++i)
+		for (int j = 0; j < n; ++j)
+			maxElement = max(maxElement, fabsf(mat[i][j]));
+	float epsilon = relEpsilon * maxElement;
+
+	// Iterative algorithm, works for any size of matrix but might be slower than
+	// a closed-form solution for symmetric 4x4 matrices.  Based on this article:
+	// http://en.wikipedia.org/wiki/Householder_transformation#Tridiagonalization
+	Matrix A, Q(identity);
+	memcpy(&A, mat, sizeof(float)*n*n); // M is symmetric, so the element order of the copy does not matter
+
+	// We proceed from left to right, making the off-tridiagonal entries zero in
+	// one column of the matrix at a time.
+	for (int k = 0; k < n - 2; ++k)
+	{
+		float sum = 0.0f;
+		for (int j = k+1; j < n; ++j)
+			sum += A(j,k)*A(j,k);
+		float alpha = -signNonzero(A(k+1,k)) * sqrtf(sum);
+		float r = sqrtf(0.5f * (alpha*alpha - A(k+1,k)*alpha));
+
+		// If r is zero, skip this column - already in tridiagonal form
+		if (fabsf(r) < epsilon)
+			continue;
+
+		float v[n] = {};
+		v[k+1] = 0.5f * (A(k+1,k) - alpha) / r;
+		for (int j = k+2; j < n; ++j)
+			v[j] = 0.5f * A(j,k) / r;
+
+		Matrix P(identity);
+		for (int i = 0; i < n; ++i)
+			for (int j = 0; j < n; ++j)
+				P(i,j) -= 2.0f * v[i] * v[j];
+
+		A = mul(mul(P, A), P);
+		Q = mul(Q, P);
+	}
+
+	nvDebugCheck(fabsf(A(2,0)) <= epsilon); // "<=" so an all-zero input (epsilon == 0) passes
+	nvDebugCheck(fabsf(A(0,2)) <= epsilon);
+	nvDebugCheck(fabsf(A(3,0)) <= epsilon);
+	nvDebugCheck(fabsf(A(0,3)) <= epsilon);
+	nvDebugCheck(fabsf(A(3,1)) <= epsilon);
+	nvDebugCheck(fabsf(A(1,3)) <= epsilon);
+	for (int i = 0; i < n; ++i)
+		diag[i] = A(i,i);
+	for (int i = 0; i < n - 1; ++i)
+		subd[i] = A(i+1,i);
+	subd[n-1] = 0.0f;
+	// Element-wise copy: mat[i][j] must be Q(i,j) whatever Matrix's storage order is.
+	for (int i = 0; i < n; ++i)
+		for (int j = 0; j < n; ++j)
+			mat[i][j] = Q(i,j);
+}
+
+static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    // diag and subd are updated in place; every rotation is also applied to the
+    // columns of mat. Returns false if some index ell uses up all maxiter passes.
+    const int maxiter = 32; // cap on the number of passes per index ell
+
+    for (int ell = 0; ell < 4; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            int m;
+            for (m = ell; m < 3; m++) // find the first negligible subd[m]; m ends at 3 if there is none
+            {
+                float dd = fabsf(diag[m]) + fabsf(diag[m+1]);
+                if ( fabsf(subd[m]) + dd == dd ) // subd[m] too small to change dd in float arithmetic
+                    break;
+            }
+            if ( m == ell ) // subd[ell] is negligible: nothing left to do for this index
+                break;
+
+            float g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            float r = sqrtf(g*g+1);
+            if ( g < 0 ) // choose the sign so g and r are not subtracted from one another
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            float s = 1, c = 1, p = 0;
+            for (int i = m-1; i >= ell; i--) // sweep from row m-1 back up to row ell
+            {
+                float f = s*subd[i], b = c*subd[i];
+                if ( fabsf(f) >= fabsf(g) ) // divide by the larger of f and g
+                {
+                    c = g/f;
+                    r = sqrtf(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrtf(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                for (int k = 0; k < 4; k++) // apply the same (c, s) rotation to columns i and i+1 of mat
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter ) // the loop above ran out of passes without breaking
+            // should not get here under normal circumstances
+            return false;
+    }
+    return true;
+}
+
+
+
+// Clusters n weighted points into up to 4 clusters. The centres are seeded along the
+// principal axis of the weighted covariance and then refined until no centre changes.
+// Writes cluster[0..3] and returns how many clusters received non-zero total weight.
+// Returns 0 without touching cluster when n <= 0.
+int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
+{
+    if (n <= 0) return 0; // nothing to cluster; points[0] below would be out of bounds
+
+    // Compute principal component.
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
+    Vector3 principal = firstEigenVector_PowerMethod(matrix);
+
+    // Pick initial solution: the extreme projections onto the principal axis.
+    float mindps, maxdps;
+    mindps = maxdps = dot(points[0] - centroid, principal);
+    for (int i = 1; i < n; ++i)
+    {
+        float dps = dot(points[i] - centroid, principal);
+        if (dps < mindps) {
+            mindps = dps;
+        }
+        else if (dps > maxdps) { // was a bare else, which overwrote maxdps with any non-minimal value
+            maxdps = dps;
+        }
+    }
+
+    cluster[0] = centroid + mindps * principal;
+    cluster[1] = centroid + maxdps * principal;
+    cluster[2] = (2.0f * cluster[0] + cluster[1]) / 3.0f;
+    cluster[3] = (2.0f * cluster[1] + cluster[0]) / 3.0f;
+
+    // Now we have to iteratively refine the clusters.
+    while (true)
+    {
+        Vector3 newCluster[4] = { Vector3(0.0f), Vector3(0.0f), Vector3(0.0f), Vector3(0.0f) };
+        float total[4] = {0, 0, 0, 0};
+
+        for (int i = 0; i < n; ++i)
+        {
+            // Find nearest cluster.
+            int nearest = 0;
+            float mindist = FLT_MAX;
+            for (int j = 0; j < 4; j++)
+            {
+                float dist = lengthSquared((cluster[j] - points[i]) * metric);
+                if (dist < mindist)
+                {
+                    mindist = dist;
+                    nearest = j;
+                }
+            }
+
+            newCluster[nearest] += weights[i] * points[i];
+            total[nearest] += weights[i];
+        }
+
+        for (int j = 0; j < 4; j++)
+        {
+            if (total[j] != 0)
+                newCluster[j] /= total[j];
+        }
+
+        // Converged once every centre is unchanged; report the number of non-empty clusters.
+        if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) &&
+            equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
+        {
+            return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
+        }
+
+        cluster[0] = newCluster[0];
+        cluster[1] = newCluster[1];
+        cluster[2] = newCluster[2];
+        cluster[3] = newCluster[3];
+
+        // Sort clusters by weight.
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
+            {
+                swap( total[j], total[j - 1] );
+                swap( cluster[j], cluster[j - 1] );
+            }
+        }
+    }
+}
+
+
+
+// Adaptation of James Arvo's SVD code, as found in ZOH.
+
+inline float Sqr(float x) { const float squared = x * x; return squared; } // x squared
+
+// sqrt(a*a + b*b), evaluated by factoring out the larger magnitude so that the
+// value being squared is a ratio no greater than one. Returns 0 when both are 0.
+inline float svd_pythag( float a, float b )
+{
+	const float absA = fabsf(a);
+	const float absB = fabsf(b);
+	if( absA > absB ) return absA * sqrtf( 1.0f + Sqr( absB / absA ) );
+	if( absB > 0.0f ) return absB * sqrtf( 1.0f + Sqr( absA / absB ) );
+	return 0.0f;
+}
+
+// Magnitude of a carrying the sign of b; b >= 0 (zero included) counts as positive.
+inline float SameSign( float a, float b )
+{
+	const float magnitude = fabsf( a );
+	if( b >= 0.0f ) return magnitude;
+	return -magnitude;
+}
+
+// Singular value decomposition of the rows x cols row-major matrix in Q (adapted from James Arvo's code).
+// Q is overwritten in place, diag (cols entries) receives the singular values in descending order,
+// and R (cols x cols, row-major) has its rows reordered to match. Returns early, without sorting,
+// if a singular value has still not converged after MaxIterations passes.
+void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R)
+{
+	static const int MaxIterations = 30;
+	int    i, j, k, l, p, q, iter;
+	float  c, f, h, s, x, y, z;
+	float  norm  = 0.0f;
+	float  g     = 0.0f;
+	float  scale = 0.0f;
+    Array<float> temp; temp.resize(cols, 0.0f);
+
+	for( i = 0; i < cols; i++ )
+	{
+		temp[i] = scale * g;
+		scale   = 0.0f;
+		g       = 0.0f;
+		s       = 0.0f;
+		l       = i + 1;
+
+		if( i < rows )
+		{
+			for( k = i; k < rows; k++ ) scale += fabsf( Q[k*cols+i] );
+			if( scale != 0.0f )
+			{
+				for( k = i; k < rows; k++ )
+				{
+					Q[k*cols+i] /= scale;
+					s += Sqr( Q[k*cols+i] );
+				}
+				f = Q[i*cols+i];
+				g = -SameSign( sqrtf(s), f );
+				h = f * g - s;
+				Q[i*cols+i] = f - g;
+				if( i != cols - 1 )
+				{
+					for( j = l; j < cols; j++ )
+					{
+						s = 0.0f;
+						for( k = i; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j];
+						f = s / h;
+						for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i];
+					}
+				}
+				for( k = i; k < rows; k++ ) Q[k*cols+i] *= scale;
+			}
+		}
+
+		diag[i] = scale * g;
+		g       = 0.0f;
+		s       = 0.0f;
+		scale   = 0.0f;
+
+		if( i < rows && i != cols - 1 )
+		{
+			for( k = l; k < cols; k++ ) scale += fabsf( Q[i*cols+k] );
+			if( scale != 0.0f )
+			{
+				for( k = l; k < cols; k++ )
+				{
+					Q[i*cols+k] /= scale;
+					s += Sqr( Q[i*cols+k] );
+				}
+				f = Q[i*cols+l];
+				g = -SameSign( sqrtf(s), f );
+				h = f * g - s;
+				Q[i*cols+l] = f - g;
+				for( k = l; k < cols; k++ ) temp[k] = Q[i*cols+k] / h;
+				if( i != rows - 1 )
+				{
+					for( j = l; j < rows; j++ )
+					{
+						s = 0.0f;
+						for( k = l; k < cols; k++ ) s += Q[j*cols+k] * Q[i*cols+k];
+						for( k = l; k < cols; k++ ) Q[j*cols+k] += s * temp[k];
+					}
+				}
+				for( k = l; k < cols; k++ ) Q[i*cols+k] *= scale;
+			}
+		}
+		norm = max( norm, fabsf( diag[i] ) + fabsf( temp[i] ) );
+	}
+
+	// Fill R, working from the last row back to the first.
+	for( i = cols - 1; i >= 0; i-- )
+	{
+		if( i < cols - 1 )
+		{
+			if( g != 0.0f )
+			{
+				for( j = l; j < cols; j++ ) R[i*cols+j] = ( Q[i*cols+j] / Q[i*cols+l] ) / g;
+				for( j = l; j < cols; j++ )
+				{
+					s = 0.0f;
+					for( k = l; k < cols; k++ ) s += Q[i*cols+k] * R[j*cols+k];
+					for( k = l; k < cols; k++ ) R[j*cols+k] += s * R[i*cols+k];
+				}
+			}
+			for( j = l; j < cols; j++ )
+			{
+				R[i*cols+j] = 0.0f;
+				R[j*cols+i] = 0.0f;
+			}
+		}
+		R[i*cols+i] = 1.0f;
+		g = temp[i];
+		l = i;
+	}
+
+	// Rewrite Q in place, again from the last column back to the first.
+	for( i = cols - 1; i >= 0; i-- )
+	{
+		l = i + 1;
+		g = diag[i];
+		if( i < cols - 1 ) for( j = l; j < cols; j++ ) Q[i*cols+j] = 0.0f;
+		if( g != 0.0f )
+		{
+			g = 1.0f / g;
+			if( i != cols - 1 )
+			{
+				for( j = l; j < cols; j++ )
+				{
+					s = 0.0f;
+					for( k = l; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j];
+					f = ( s / Q[i*cols+i] ) * g;
+					for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i];
+				}
+			}
+			for( j = i; j < rows; j++ ) Q[j*cols+i] *= g;
+		}
+		else
+		{
+			for( j = i; j < rows; j++ ) Q[j*cols+i] = 0.0f;
+		}
+		Q[i*cols+i] += 1.0f;
+	}
+
+	// Iterate on each diag[k] until the entries of temp are negligible next to norm.
+	for( k = cols - 1; k >= 0; k-- )
+	{
+		for( iter = 1; iter <= MaxIterations; iter++ )
+		{
+			int jump;
+
+			for( l = k; l >= 0; l-- )
+			{
+				q = l - 1;
+				if( fabsf( temp[l] ) + norm == norm ) { jump = 1; break; }
+				if( fabsf( diag[q] ) + norm == norm ) { jump = 0; break; }
+			}
+
+			if( !jump )
+			{
+				c = 0.0f;
+				s = 1.0f;
+				for( i = l; i <= k; i++ )
+				{
+					f = s * temp[i];
+					temp[i] *= c;
+					if( fabsf( f ) + norm == norm ) break;
+					g = diag[i];
+					h = svd_pythag( f, g );
+					diag[i] = h;
+					h = 1.0f / h;
+					c = g * h;
+					s = -f * h;
+					for( j = 0; j < rows; j++ )
+					{
+						y = Q[j*cols+q];
+						z = Q[j*cols+i];
+						Q[j*cols+q] = y * c + z * s;
+						Q[j*cols+i] = z * c - y * s;
+					}
+				}
+			}
+
+			z = diag[k];
+			if( l == k )
+			{
+				if( z < 0.0f )
+				{
+					diag[k] = -z;
+					for( j = 0; j < cols; j++ ) R[k*cols+j] *= -1.0f;
+				}
+				break;
+			}
+			if( iter >= MaxIterations ) return;
+			x = diag[l];
+			q = k - 1;
+			y = diag[q];
+			g = temp[q];
+			h = temp[k];
+			f = ( ( y - z ) * ( y + z ) + ( g - h ) * ( g + h ) ) / ( 2.0f * h * y );
+			g = svd_pythag( f, 1.0f );
+			f = ( ( x - z ) * ( x + z ) + h * ( ( y / ( f + SameSign( g, f ) ) ) - h ) ) / x;
+			c = 1.0f;
+			s = 1.0f;
+			for( j = l; j <= q; j++ )
+			{
+				i = j + 1;
+				g = temp[i];
+				y = diag[i];
+				h = s * g;
+				g = c * g;
+				z = svd_pythag( f, h );
+				temp[j] = z;
+				c = f / z;
+				s = h / z;
+				f = x * c + g * s;
+				g = g * c - x * s;
+				h = y * s;
+				y = y * c;
+				for( p = 0; p < cols; p++ )
+				{
+					x = R[j*cols+p];
+					z = R[i*cols+p];
+					R[j*cols+p] = x * c + z * s;
+					R[i*cols+p] = z * c - x * s;
+				}
+				z = svd_pythag( f, h );
+				diag[j] = z;
+				if( z != 0.0f )
+				{
+					z = 1.0f / z;
+					c = f * z;
+					s = h * z;
+				}
+				f = c * g + s * y;
+				x = c * y - s * g;
+				for( p = 0; p < rows; p++ )
+				{
+					y = Q[p*cols+j];
+					z = Q[p*cols+i];
+					Q[p*cols+j] = y * c + z * s;
+					Q[p*cols+i] = z * c - y * s;
+				}
+			}
+			temp[l] = 0.0f;
+			temp[k] = f;
+			diag[k] = x;
+		}
+	}
+
+	// Sort the singular values into descending order.
+	for( i = 0; i < cols - 1; i++ )
+	{
+		float biggest = diag[i];  // Biggest singular value so far.
+		int   bindex  = i;        // The row/col it occurred in.
+		for( j = i + 1; j < cols; j++ )
+		{
+			if( diag[j] > biggest )
+			{
+				biggest = diag[j];
+				bindex  = j;
+			}
+		}
+		if( bindex != i )  // Need to swap rows and columns.
+		{
+			// Swap columns in Q.
+			for (int j = 0; j < rows; ++j)
+				swap(Q[j*cols+i], Q[j*cols+bindex]);
+			// Swap rows in R. R is cols x cols, so a row has cols entries (this loop used rows).
+			for (int j = 0; j < cols; ++j)
+				swap(R[i*cols+j], R[bindex*cols+j]);
+
+			// Swap elements in diag.
+			swap(diag[i], diag[bindex]);
+		}
+	}
+}
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.h b/3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.h
new file mode 100644
index 0000000..e835045
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/fitting.h
@@ -0,0 +1,49 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_MATH_FITTING_H
+#define NV_MATH_FITTING_H
+
+#include "vector.h"
+#include "plane.h"
+
+namespace nv
+{
+    namespace Fit
+    {
+        Vector3 computeCentroid(int n, const Vector3 * points);
+        Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector4 computeCentroid(int n, const Vector4 * points);
+        Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance);
+
+        Vector4 computeCovariance(int n, const Vector4 * points, float * covariance);
+        Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance);
+
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+		Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points);
+        Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points);
+        Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points);
+
+        Plane bestPlane(int n, const Vector3 * points);
+        bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
+
+        bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
+        bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]);
+
+        // Returns number of clusters [1-4].
+        int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_FITTING_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.h b/3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.h
new file mode 100644
index 0000000..901a982
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.h
@@ -0,0 +1,112 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_MATRIX_H
+#define NV_MATH_MATRIX_H
+
+#include "vector.h"
+
+// - Matrices are stored in memory in *column major* order.
+// - Points are to be thought of as column vectors.
+// - Transformation of a point p by a matrix M is: p' = M * p
+
+namespace nv
+{
+    enum identity_t { identity };
+
+    // 3x3 matrix.
+    class NVMATH_CLASS Matrix3
+    {
+    public:
+        Matrix3();
+        explicit Matrix3(float f);
+        explicit Matrix3(identity_t);
+        Matrix3(const Matrix3 & m);
+        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);
+
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+
+        Vector3 row(uint i) const;
+        Vector3 column(uint i) const;
+
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator+=(const Matrix3 & m);
+        void operator-=(const Matrix3 & m);
+
+        void scale(float s);
+        void scale(Vector3::Arg s);
+        float determinant() const;
+
+    private:
+        float m_data[9];
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x);
+
+
+    // 4x4 matrix.
+    class NVMATH_CLASS Matrix
+    {
+    public:
+        typedef Matrix const & Arg;
+
+        Matrix();
+        explicit Matrix(float f);
+        explicit Matrix(identity_t);
+        Matrix(const Matrix3 & m);
+        Matrix(const Matrix & m);
+        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
+        //explicit Matrix(const float m[]);	// m is assumed to contain 16 elements
+
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+        const float * ptr() const;
+
+        Vector4 row(uint i) const;
+        Vector4 column(uint i) const;
+
+        void zero();
+        void identity();
+
+        void scale(float s);
+        void scale(Vector3::Arg s);
+        void translate(Vector3::Arg t);
+        void rotate(float theta, float v0, float v1, float v2);
+        float determinant() const;
+
+        void operator+=(const Matrix & m);
+        void operator-=(const Matrix & m);
+
+        void apply(Matrix::Arg m);
+
+    private:
+        float m_data[16];
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Compute inverse using LU decomposition.
+    extern Matrix inverseLU(const Matrix & m);
+
+    // Compute inverse using Gaussian elimination and partial pivoting.
+    extern Matrix inverse(const Matrix & m);
+    extern Matrix3 inverse(const Matrix3 & m);
+
+} // nv namespace
+
+#endif // NV_MATH_MATRIX_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.inl b/3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.inl
new file mode 100644
index 0000000..0996a4f
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/matrix.inl
@@ -0,0 +1,1274 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#pragma once
+#ifndef NV_MATH_MATRIX_INL
+#define NV_MATH_MATRIX_INL
+
+#include "matrix.h"
+
+namespace nv
+{
+    inline Matrix3::Matrix3() {}
+    
+    inline Matrix3::Matrix3(float f)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] = f;
+        }
+    }
+
+    inline Matrix3::Matrix3(identity_t)
+    {
+        for(int i = 0; i < 3; i++) {
+            for(int j = 0; j < 3; j++) {
+                m_data[3*j+i] = (i == j) ? 1.0f : 0.0f;
+            }
+        }
+    }
+
+    inline Matrix3::Matrix3(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] = m.m_data[i];
+        }
+    }
+    
+    inline Matrix3::Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2)
+    {
+        m_data[0] = v0.x; m_data[1] = v0.y; m_data[2] = v0.z;
+        m_data[3] = v1.x; m_data[4] = v1.y; m_data[5] = v1.z;
+        m_data[6] = v2.x; m_data[7] = v2.y; m_data[8] = v2.z;
+    }
+
+    inline float Matrix3::data(uint idx) const
+    {
+        nvDebugCheck(idx < 9);
+        return m_data[idx];
+    }
+    inline float & Matrix3::data(uint idx)
+    {
+        nvDebugCheck(idx < 9);
+        return m_data[idx];
+    }
+    inline float Matrix3::get(uint row, uint col) const
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+    inline float Matrix3::operator()(uint row, uint col) const
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+    inline float & Matrix3::operator()(uint row, uint col)
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+
+    inline Vector3 Matrix3::row(uint i) const
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(get(i, 0), get(i, 1), get(i, 2));
+    }
+    inline Vector3 Matrix3::column(uint i) const
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(get(0, i), get(1, i), get(2, i));
+    }
+
+    inline void Matrix3::operator*=(float s)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] *= s;
+        }
+    }
+
+    inline void Matrix3::operator/=(float s)
+    {
+        float is = 1.0f /s;
+        for(int i = 0; i < 9; i++) {
+            m_data[i] *= is;
+        }
+    }
+
+    inline void Matrix3::operator+=(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] += m.m_data[i];
+        }
+    }
+
+    inline void Matrix3::operator-=(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] -= m.m_data[i];
+        }
+    }
+
+    inline Matrix3 operator+(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m = a;
+        m += b;
+        return m;
+    }
+
+    inline Matrix3 operator-(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m = a;
+        m -= b;
+        return m;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, float s)
+    {
+        Matrix3 m = a;
+        m *= s;
+        return m;
+    }
+
+    inline Matrix3 operator*(float s, const Matrix3 & a)
+    {
+        Matrix3 m = a;
+        m *= s;
+        return m;
+    }
+
+    inline Matrix3 operator/(const Matrix3 & a, float s)
+    {
+        Matrix3 m = a;
+        m /= s;
+        return m;
+    }
+
+    inline Matrix3 mul(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m;
+
+        for(int i = 0; i < 3; i++) {
+            const float ai0 = a(i,0), ai1 = a(i,1), ai2 = a(i,2);
+            m(i, 0) = ai0 * b(0,0) + ai1 * b(1,0) + ai2 * b(2,0);
+            m(i, 1) = ai0 * b(0,1) + ai1 * b(1,1) + ai2 * b(2,1);
+            m(i, 2) = ai0 * b(0,2) + ai1 * b(1,2) + ai2 * b(2,2);
+        }
+
+        return m;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, const Matrix3 & b)
+    {
+        return mul(a, b);
+    }
+
+    // Transform the given 3d vector with the given matrix.
+    inline Vector3 transform(const Matrix3 & m, const Vector3 & p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2));
+    }
+
+    inline void Matrix3::scale(float s)
+    {
+        for (int i = 0; i < 9; i++) {
+            m_data[i] *= s;
+        }
+    }
+
+    inline void Matrix3::scale(Vector3::Arg s)
+    {
+        m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x;
+        m_data[3] *= s.y; m_data[4] *= s.y; m_data[5] *= s.y;
+        m_data[6] *= s.z; m_data[7] *= s.z; m_data[8] *= s.z;
+    }
+
+    inline float Matrix3::determinant() const
+    {
+        return 
+            get(0,0) * get(1,1) * get(2,2) + 
+            get(0,1) * get(1,2) * get(2,0) + 
+            get(0,2) * get(1,0) * get(2,1) -
+            get(0,2) * get(1,1) * get(2,0) - 
+            get(0,1) * get(1,0) * get(2,2) -
+            get(0,0) * get(1,2) * get(2,1);
+    }
+
+    // Inverse via the adjugate (Cramer's rule). Returns the all-zero matrix when det compares equal to 0.
+    inline Matrix3 inverseCramer(const Matrix3 & m)
+    {
+        const float det = m.determinant();
+        if (equal(det, 0.0f, 0.0f)) {
+            return Matrix3(0);
+        }
+
+        Matrix3 r;
+        // Adjugate = transposed cofactor matrix: the cofactor of element (i,j) is stored at (j,i).
+        r.data(0) =  - m.data(5) * m.data(7) + m.data(4) * m.data(8);
+        r.data(3) =  + m.data(5) * m.data(6) - m.data(3) * m.data(8);
+        r.data(6) =  - m.data(4) * m.data(6) + m.data(3) * m.data(7);
+
+        r.data(1) =  + m.data(2) * m.data(7) - m.data(1) * m.data(8);
+        r.data(4) =  - m.data(2) * m.data(6) + m.data(0) * m.data(8);
+        r.data(7) =  + m.data(1) * m.data(6) - m.data(0) * m.data(7);
+
+        r.data(2) =  - m.data(2) * m.data(4) + m.data(1) * m.data(5);
+        r.data(5) =  + m.data(2) * m.data(3) - m.data(0) * m.data(5);
+        r.data(8) =  - m.data(1) * m.data(3) + m.data(0) * m.data(4);
+
+        r.scale(1.0f / det);
+
+        return r;
+    }
+
+
+
+    inline Matrix::Matrix()
+    {
+    }
+
+    inline Matrix::Matrix(float f)  // Construct a 4x4 matrix with every element set to f.
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = f;  // Use the argument (not a hard-coded 0), as Matrix3(float) does.
+        }
+    }
+
+    inline Matrix::Matrix(identity_t)
+    {
+        for(int i = 0; i < 4; i++) {
+            for(int j = 0; j < 4; j++) {
+                m_data[4*j+i] = (i == j) ? 1.0f : 0.0f;
+            }
+        }
+    }
+
+    inline Matrix::Matrix(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = m.m_data[i];
+        }
+    }
+
+    inline Matrix::Matrix(const Matrix3 & m)  // Copy a 3x3 matrix into the upper-left block of this 4x4 matrix.
+    {
+        for(int i = 0; i < 3; i++) {
+            for(int j = 0; j < 3; j++) {
+                operator()(i, j) = m.get(i, j);
+            }
+        }
+        for(int i = 0; i < 4; i++) {  // Zero the whole last row and the whole last column.
+            operator()(3, i) = 0;
+            operator()(i, 3) = 0;  // NOTE(review): element (3,3) also ends up 0, not 1 -- confirm this is intended.
+        }
+    }
+
+    inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
+    {
+        m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w;
+        m_data[ 4] = v1.x; m_data[ 5] = v1.y; m_data[ 6] = v1.z; m_data[ 7] = v1.w;
+        m_data[ 8] = v2.x; m_data[ 9] = v2.y; m_data[10] = v2.z; m_data[11] = v2.w;
+        m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
+    }
+
+    /*inline Matrix::Matrix(const float m[])
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = m[i];
+        }
+    }*/
+
+
+    // Accessors
+    inline float Matrix::data(uint idx) const
+    {
+        nvDebugCheck(idx < 16);
+        return m_data[idx];
+    }
+    inline float & Matrix::data(uint idx)
+    {
+        nvDebugCheck(idx < 16);
+        return m_data[idx];
+    }
+    inline float Matrix::get(uint row, uint col) const
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+    inline float Matrix::operator()(uint row, uint col) const
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+    inline float & Matrix::operator()(uint row, uint col)
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+
+    inline const float * Matrix::ptr() const
+    {
+        return m_data;
+    }
+
+    inline Vector4 Matrix::row(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        return Vector4(get(i, 0), get(i, 1), get(i, 2), get(i, 3));
+    }
+
+    inline Vector4 Matrix::column(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        return Vector4(get(0, i), get(1, i), get(2, i), get(3, i));
+    }
+
+    inline void Matrix::zero()
+    {
+        m_data[0] = 0; m_data[1] = 0; m_data[2] = 0; m_data[3] = 0;
+        m_data[4] = 0; m_data[5] = 0; m_data[6] = 0; m_data[7] = 0;
+        m_data[8] = 0; m_data[9] = 0; m_data[10] = 0; m_data[11] = 0;
+        m_data[12] = 0; m_data[13] = 0; m_data[14] = 0; m_data[15] = 0;
+    }
+
+    inline void Matrix::identity()
+    {
+        m_data[0] = 1; m_data[1] = 0; m_data[2] = 0; m_data[3] = 0;
+        m_data[4] = 0; m_data[5] = 1; m_data[6] = 0; m_data[7] = 0;
+        m_data[8] = 0; m_data[9] = 0; m_data[10] = 1; m_data[11] = 0;
+        m_data[12] = 0; m_data[13] = 0; m_data[14] = 0; m_data[15] = 1;
+    }
+
+    // Apply scale.
+    inline void Matrix::scale(float s)
+    {
+        m_data[0] *= s; m_data[1] *= s; m_data[2] *= s; m_data[3] *= s;
+        m_data[4] *= s; m_data[5] *= s; m_data[6] *= s; m_data[7] *= s;
+        m_data[8] *= s; m_data[9] *= s; m_data[10] *= s; m_data[11] *= s;
+        m_data[12] *= s; m_data[13] *= s; m_data[14] *= s; m_data[15] *= s;
+    }
+
+    // Apply scale.
+    inline void Matrix::scale(Vector3::Arg s)
+    {
+        m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x; m_data[3] *= s.x;
+        m_data[4] *= s.y; m_data[5] *= s.y; m_data[6] *= s.y; m_data[7] *= s.y;
+        m_data[8] *= s.z; m_data[9] *= s.z; m_data[10] *= s.z; m_data[11] *= s.z;
+    }
+
+    // Apply translation.
+    inline void Matrix::translate(Vector3::Arg t)
+    {
+        m_data[12] = m_data[0] * t.x + m_data[4] * t.y + m_data[8]  * t.z + m_data[12];
+        m_data[13] = m_data[1] * t.x + m_data[5] * t.y + m_data[9]  * t.z + m_data[13];
+        m_data[14] = m_data[2] * t.x + m_data[6] * t.y + m_data[10] * t.z + m_data[14];
+        m_data[15] = m_data[3] * t.x + m_data[7] * t.y + m_data[11] * t.z + m_data[15];
+    }
+
+    Matrix rotation(float theta, float v0, float v1, float v2);
+
+    // Apply rotation.
+    inline void Matrix::rotate(float theta, float v0, float v1, float v2)
+    {
+        Matrix R(rotation(theta, v0, v1, v2));
+        apply(R);
+    }
+
+    // Apply transform.
+    inline void Matrix::apply(Matrix::Arg m)
+    {
+        nvDebugCheck(this != &m);
+
+        for(int i = 0; i < 4; i++) {
+            const float ai0 = get(i,0), ai1 = get(i,1), ai2 = get(i,2), ai3 = get(i,3);
+            m_data[0 + i] = ai0 * m(0,0) + ai1 * m(1,0) + ai2 * m(2,0) + ai3 * m(3,0);
+            m_data[4 + i] = ai0 * m(0,1) + ai1 * m(1,1) + ai2 * m(2,1) + ai3 * m(3,1);
+            m_data[8 + i] = ai0 * m(0,2) + ai1 * m(1,2) + ai2 * m(2,2) + ai3 * m(3,2);
+            m_data[12+ i] = ai0 * m(0,3) + ai1 * m(1,3) + ai2 * m(2,3) + ai3 * m(3,3);
+        }
+    }
+
+    // Get scale matrix.
+    inline Matrix scale(Vector3::Arg s)
+    {
+        Matrix m(identity);
+        m(0,0) = s.x;
+        m(1,1) = s.y;
+        m(2,2) = s.z;
+        return m;
+    }
+
+    // Get scale matrix.
+    inline Matrix scale(float s)
+    {
+        Matrix m(identity);
+        m(0,0) = m(1,1) = m(2,2) = s;
+        return m;
+    }
+
+    // Get translation matrix.
+    inline Matrix translation(Vector3::Arg t)
+    {
+        Matrix m(identity);
+        m(0,3) = t.x;
+        m(1,3) = t.y;
+        m(2,3) = t.z;
+        return m;
+    }
+
+    // Get rotation matrix for angle theta (passed straight to cosf/sinf) about the axis (v0, v1, v2).
+    inline Matrix rotation(float theta, float v0, float v1, float v2)
+    {
+        float cost = cosf(theta);
+        float sint = sinf(theta);
+
+        Matrix m(identity);
+
+        if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+            m(1,1) = cost; m(2,1) = -sint;
+            m(1,2) = sint; m(2,2) = cost;
+        }
+        else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+            m(0,0) = cost; m(2,0) = sint;
+            m(0,2) = -sint; m(2,2) = cost;  // (0,2) mirrors (2,0), matching the general-axis branch below.
+        }
+        else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+            m(0,0) = cost; m(1,0) = -sint;
+            m(0,1) = sint; m(1,1) = cost;
+        } 
+        else {
+            // Scale the axis to unit length first so the squared terms below are those of the unit axis.
+            // A zero-length axis is not handled: iscale is then not finite.
+            float iscale = 1.0f / sqrtf(v0 * v0 + v1 * v1 + v2 * v2);
+            v0 *= iscale;
+            v1 *= iscale;
+            v2 *= iscale;
+            float a2 = v0 * v0;
+            float b2 = v1 * v1;
+            float c2 = v2 * v2;
+
+            float abm, acm, bcm;
+            float mcos, asin, bsin, csin;
+            mcos = 1.0f - cost;
+            abm = v0 * v1 * mcos;
+            acm = v0 * v2 * mcos;
+            bcm = v1 * v2 * mcos;
+            asin = v0 * sint;
+            bsin = v1 * sint;
+            csin = v2 * sint;
+            m(0,0) = a2 * mcos + cost;
+            m(1,0) = abm - csin;
+            m(2,0) = acm + bsin;
+            m(0,1) = abm + csin;  // Upper-left 3x3 block only; the last row/column stay as in identity.
+            m(1,1) = b2 * mcos + cost;
+            m(2,1) = bcm - asin;
+            m(0,2) = acm - bsin;
+            m(1,2) = bcm + asin;
+            m(2,2) = c2 * mcos + cost;
+        }
+        return m;
+    }
+
+    //Matrix rotation(float yaw, float pitch, float roll);
+    //Matrix skew(float angle, Vector3::Arg v1, Vector3::Arg v2);
+
+    // Get frustum matrix.
+    inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
+    {
+        Matrix m(0.0f);
+
+        float doubleznear = 2.0f * zNear;
+        float one_deltax = 1.0f / (xmax - xmin);
+        float one_deltay = 1.0f / (ymax - ymin);
+        float one_deltaz = 1.0f / (zFar - zNear);
+
+        m(0,0) = doubleznear * one_deltax;
+        m(1,1) = doubleznear * one_deltay;
+        m(0,2) = (xmax + xmin) * one_deltax;
+        m(1,2) = (ymax + ymin) * one_deltay;
+        m(2,2) = -(zFar + zNear) * one_deltaz;
+        m(3,2) = -1.0f;
+        m(2,3) = -(zFar * doubleznear) * one_deltaz;
+
+        return m;
+    }
+
+    // Get inverse frustum matrix.
+    inline Matrix frustumInverse(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
+    {
+        Matrix m(0.0f);
+
+        float one_doubleznear = 1.0f / (2.0f * zNear);
+        float one_doubleznearzfar = 1.0f / (2.0f * zNear * zFar);
+
+        m(0,0) = (xmax - xmin) * one_doubleznear;
+        m(0,3) = (xmax + xmin) * one_doubleznear;
+        m(1,1) = (ymax - ymin) * one_doubleznear;
+        m(1,3) = (ymax + ymin) * one_doubleznear;
+        m(2,3) = -1;
+        m(3,2) = -(zFar - zNear) * one_doubleznearzfar;
+        m(3,3) = (zFar + zNear) * one_doubleznearzfar;
+
+        return m;
+    }
+
+    // Get infinite frustum matrix.
+    inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear)
+    {
+        Matrix m(0.0f);
+
+        float doubleznear = 2.0f * zNear;
+        float one_deltax = 1.0f / (xmax - xmin);
+        float one_deltay = 1.0f / (ymax - ymin);
+        float nudge = 1.0; // 0.999;
+
+        m(0,0) = doubleznear * one_deltax;
+        m(1,1) = doubleznear * one_deltay;
+        m(0,2) = (xmax + xmin) * one_deltax;
+        m(1,2) = (ymax + ymin) * one_deltay;
+        m(2,2) = -1.0f * nudge;
+        m(3,2) = -1.0f;
+        m(2,3) = -doubleznear * nudge;
+
+        return m;
+    }
+
+    // Get perspective matrix: builds a symmetric view volume and forwards it to frustum().
+    inline Matrix perspective(float fovy, float aspect, float zNear, float zFar)
+    {
+        float xmax = zNear * tanf(fovy / 2);  // NOTE(review): fovy sets the half-extent along x here, not y -- confirm intended.
+        float xmin = -xmax;
+
+        float ymax = xmax / aspect;  // y half-extent is the x half-extent divided by aspect.
+        float ymin = -ymax;
+
+        return frustum(xmin, xmax, ymin, ymax, zNear, zFar);	
+    }
+
+    // Get inverse perspective matrix.
+    inline Matrix perspectiveInverse(float fovy, float aspect, float zNear, float zFar)
+    {
+        float xmax = zNear * tanf(fovy / 2);
+        float xmin = -xmax;
+
+        float ymax = xmax / aspect;
+        float ymin = -ymax;
+
+        return frustumInverse(xmin, xmax, ymin, ymax, zNear, zFar);	
+    }
+
+    // Get infinite perspective matrix.
+    inline Matrix perspective(float fovy, float aspect, float zNear)
+    {
+        float x = zNear * tanf(fovy / 2);
+        float y = x / aspect;
+        return frustum( -x, x, -y, y, zNear );	
+    }
+
+    // Get matrix determinant.
+    inline float Matrix::determinant() const
+    {
+        return 
+            m_data[3] * m_data[6] * m_data[ 9] * m_data[12] - m_data[2] * m_data[7] * m_data[ 9] * m_data[12] - m_data[3] * m_data[5] * m_data[10] * m_data[12] + m_data[1] * m_data[7] * m_data[10] * m_data[12] +
+            m_data[2] * m_data[5] * m_data[11] * m_data[12] - m_data[1] * m_data[6] * m_data[11] * m_data[12] - m_data[3] * m_data[6] * m_data[ 8] * m_data[13] + m_data[2] * m_data[7] * m_data[ 8] * m_data[13] +
+            m_data[3] * m_data[4] * m_data[10] * m_data[13] - m_data[0] * m_data[7] * m_data[10] * m_data[13] - m_data[2] * m_data[4] * m_data[11] * m_data[13] + m_data[0] * m_data[6] * m_data[11] * m_data[13] +
+            m_data[3] * m_data[5] * m_data[ 8] * m_data[14] - m_data[1] * m_data[7] * m_data[ 8] * m_data[14] - m_data[3] * m_data[4] * m_data[ 9] * m_data[14] + m_data[0] * m_data[7] * m_data[ 9] * m_data[14] +
+            m_data[1] * m_data[4] * m_data[11] * m_data[14] - m_data[0] * m_data[5] * m_data[11] * m_data[14] - m_data[2] * m_data[5] * m_data[ 8] * m_data[15] + m_data[1] * m_data[6] * m_data[ 8] * m_data[15] +
+            m_data[2] * m_data[4] * m_data[ 9] * m_data[15] - m_data[0] * m_data[6] * m_data[ 9] * m_data[15] - m_data[1] * m_data[4] * m_data[10] * m_data[15] + m_data[0] * m_data[5] * m_data[10] * m_data[15];
+    }
+
+    inline Matrix transpose(Matrix::Arg m)
+    {
+        Matrix r;
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                r(i, j) = m(j, i);
+            }
+        }
+        return r;
+    }
+
+    // Inverse using Cramer's rule.
+    inline Matrix inverseCramer(Matrix::Arg m)
+    {
+        Matrix r;
+        r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15);
+        r.data( 1) = m.data(3)*m.data(10)*m.data(13) - m.data(2)*m.data(11)*m.data(13) - m.data(3)*m.data(9)*m.data(14) + m.data(1)*m.data(11)*m.data(14) + m.data(2)*m.data(9)*m.data(15) - m.data(1)*m.data(10)*m.data(15);
+        r.data( 2) = m.data(2)*m.data( 7)*m.data(13) - m.data(3)*m.data( 6)*m.data(13) + m.data(3)*m.data(5)*m.data(14) - m.data(1)*m.data( 7)*m.data(14) - m.data(2)*m.data(5)*m.data(15) + m.data(1)*m.data( 6)*m.data(15);
+        r.data( 3) = m.data(3)*m.data( 6)*m.data( 9) - m.data(2)*m.data( 7)*m.data( 9) - m.data(3)*m.data(5)*m.data(10) + m.data(1)*m.data( 7)*m.data(10) + m.data(2)*m.data(5)*m.data(11) - m.data(1)*m.data( 6)*m.data(11);
+        r.data( 4) = m.data(7)*m.data(10)*m.data(12) - m.data(6)*m.data(11)*m.data(12) - m.data(7)*m.data(8)*m.data(14) + m.data(4)*m.data(11)*m.data(14) + m.data(6)*m.data(8)*m.data(15) - m.data(4)*m.data(10)*m.data(15);
+        r.data( 5) = m.data(2)*m.data(11)*m.data(12) - m.data(3)*m.data(10)*m.data(12) + m.data(3)*m.data(8)*m.data(14) - m.data(0)*m.data(11)*m.data(14) - m.data(2)*m.data(8)*m.data(15) + m.data(0)*m.data(10)*m.data(15);
+        r.data( 6) = m.data(3)*m.data( 6)*m.data(12) - m.data(2)*m.data( 7)*m.data(12) - m.data(3)*m.data(4)*m.data(14) + m.data(0)*m.data( 7)*m.data(14) + m.data(2)*m.data(4)*m.data(15) - m.data(0)*m.data( 6)*m.data(15);
+        r.data( 7) = m.data(2)*m.data( 7)*m.data( 8) - m.data(3)*m.data( 6)*m.data( 8) + m.data(3)*m.data(4)*m.data(10) - m.data(0)*m.data( 7)*m.data(10) - m.data(2)*m.data(4)*m.data(11) + m.data(0)*m.data( 6)*m.data(11);
+        r.data( 8) = m.data(5)*m.data(11)*m.data(12) - m.data(7)*m.data( 9)*m.data(12) + m.data(7)*m.data(8)*m.data(13) - m.data(4)*m.data(11)*m.data(13) - m.data(5)*m.data(8)*m.data(15) + m.data(4)*m.data( 9)*m.data(15);
+        r.data( 9) = m.data(3)*m.data( 9)*m.data(12) - m.data(1)*m.data(11)*m.data(12) - m.data(3)*m.data(8)*m.data(13) + m.data(0)*m.data(11)*m.data(13) + m.data(1)*m.data(8)*m.data(15) - m.data(0)*m.data( 9)*m.data(15);
+        r.data(10) = m.data(1)*m.data( 7)*m.data(12) - m.data(3)*m.data( 5)*m.data(12) + m.data(3)*m.data(4)*m.data(13) - m.data(0)*m.data( 7)*m.data(13) - m.data(1)*m.data(4)*m.data(15) + m.data(0)*m.data( 5)*m.data(15);
+        r.data(11) = m.data(3)*m.data( 5)*m.data( 8) - m.data(1)*m.data( 7)*m.data( 8) - m.data(3)*m.data(4)*m.data( 9) + m.data(0)*m.data( 7)*m.data( 9) + m.data(1)*m.data(4)*m.data(11) - m.data(0)*m.data( 5)*m.data(11);
+        r.data(12) = m.data(6)*m.data( 9)*m.data(12) - m.data(5)*m.data(10)*m.data(12) - m.data(6)*m.data(8)*m.data(13) + m.data(4)*m.data(10)*m.data(13) + m.data(5)*m.data(8)*m.data(14) - m.data(4)*m.data( 9)*m.data(14);
+        r.data(13) = m.data(1)*m.data(10)*m.data(12) - m.data(2)*m.data( 9)*m.data(12) + m.data(2)*m.data(8)*m.data(13) - m.data(0)*m.data(10)*m.data(13) - m.data(1)*m.data(8)*m.data(14) + m.data(0)*m.data( 9)*m.data(14);
+        r.data(14) = m.data(2)*m.data( 5)*m.data(12) - m.data(1)*m.data( 6)*m.data(12) - m.data(2)*m.data(4)*m.data(13) + m.data(0)*m.data( 6)*m.data(13) + m.data(1)*m.data(4)*m.data(14) - m.data(0)*m.data( 5)*m.data(14);
+        r.data(15) = m.data(1)*m.data( 6)*m.data( 8) - m.data(2)*m.data( 5)*m.data( 8) + m.data(2)*m.data(4)*m.data( 9) - m.data(0)*m.data( 6)*m.data( 9) - m.data(1)*m.data(4)*m.data(10) + m.data(0)*m.data( 5)*m.data(10);
+        r.scale(1.0f / m.determinant());
+        return r;
+    }
+
+    inline Matrix isometryInverse(Matrix::Arg m)
+    {
+        Matrix r(identity);
+
+        // transposed 3x3 upper left matrix
+        for (int i = 0; i < 3; i++)
+        {
+            for (int j = 0; j < 3; j++)
+            {
+                r(i, j) = m(j, i);
+            }
+        }
+
+        // translate by the negative offsets
+        r.translate(-Vector3(m.data(12), m.data(13), m.data(14)));
+
+        return r;
+    }
+
+    // Transform the given 3d point with the given matrix.
+    inline Vector3 transformPoint(Matrix::Arg m, Vector3::Arg p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + m(0,3),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + m(1,3),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + m(2,3));
+    }
+
+    // Transform the given 3d vector with the given matrix.
+    inline Vector3 transformVector(Matrix::Arg m, Vector3::Arg p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2));
+    }
+
+    // Transform the given 4d vector with the given matrix.
+    inline Vector4 transform(Matrix::Arg m, Vector4::Arg p)
+    {
+        return Vector4(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + p.w * m(0,3),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + p.w * m(1,3),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + p.w * m(2,3),
+            p.x * m(3,0) + p.y * m(3,1) + p.z * m(3,2) + p.w * m(3,3));
+    }
+
+    inline Matrix mul(Matrix::Arg a, Matrix::Arg b)
+    {
+        // Copies a, then applies b: apply() sets element (i,j) to (row i of this) . (column j of its argument), i.e. a * b.
+        Matrix m = a;
+        m.apply(b);
+        return m;
+    }
+
+    inline void Matrix::operator+=(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] += m.m_data[i];
+        }
+    }
+
+    inline void Matrix::operator-=(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] -= m.m_data[i];
+        }
+    }
+
+    inline Matrix operator+(const Matrix & a, const Matrix & b)
+    {
+        Matrix m = a;
+        m += b;
+        return m;
+    }
+
+    inline Matrix operator-(const Matrix & a, const Matrix & b)
+    {
+        Matrix m = a;
+        m -= b;
+        return m;
+    }
+
+
+} // nv namespace
+
+
+#if 0 // old code.
+/** @name Special matrices. */
+//@{
+/** Generate a translation matrix. */
+void TranslationMatrix(const Vec3 & v) {
+    data[0] = 1; data[1] = 0; data[2] = 0; data[3] = 0;
+    data[4] = 0; data[5] = 1; data[6] = 0; data[7] = 0;
+    data[8] = 0; data[9] = 0; data[10] = 1; data[11] = 0;
+    data[12] = v.x; data[13] = v.y; data[14] = v.z; data[15] = 1;
+}
+
+/** Rotate theta degrees around v. */
+void RotationMatrix( float theta, float v0, float v1, float v2 ) {
+    float cost = cos(theta);
+    float sint = sin(theta);
+
+    if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+        data[0] = 1.0f;	data[1] = 0.0f;	data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = cost;	data[6] = -sint;data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = sint;	data[10] = cost;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+        data[0] = cost;	data[1] = 0.0f;	data[2] = sint;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = 1.0f;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = -sint;data[9] = 0.0f;data[10] = cost;	data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+        data[0] = cost;	data[1] = -sint;data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = sint; data[5] = cost;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = 0.0f;	data[10] = 1.0f;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    } 
+    else {
+        //we need scale a,b,c to unit length.
+        float a2, b2, c2;
+        a2 = v0 * v0;
+        b2 = v1 * v1;
+        c2 = v2 * v2;
+
+        float iscale = 1.0f / sqrtf(a2 + b2 + c2);
+        v0 *= iscale;
+        v1 *= iscale;
+        v2 *= iscale;
+
+        float abm, acm, bcm;
+        float mcos, asin, bsin, csin;
+        mcos = 1.0f - cost;
+        abm = v0 * v1 * mcos;
+        acm = v0 * v2 * mcos;
+        bcm = v1 * v2 * mcos;
+        asin = v0 * sint;
+        bsin = v1 * sint;
+        csin = v2 * sint;
+        data[0] = a2 * mcos + cost;
+        data[1] = abm - csin;
+        data[2] = acm + bsin;
+        data[3] = abm + csin;
+        data[4] = 0.0f;
+        data[5] = b2 * mcos + cost;
+        data[6] = bcm - asin;
+        data[7] = acm - bsin;
+        data[8] = 0.0f;
+        data[9] = bcm + asin;
+        data[10] = c2 * mcos + cost;
+        data[11] = 0.0f;
+        data[12] = 0.0f;
+        data[13] = 0.0f;
+        data[14] = 0.0f;
+        data[15] = 1.0f;
+    }
+}
+
+/*
+void SkewMatrix(float angle, const Vec3 & v1, const Vec3 & v2) {
+v1.Normalize();
+v2.Normalize();
+
+Vec3 v3;
+v3.Cross(v1, v2);
+v3.Normalize();
+
+// Get skew factor.
+float costheta = Vec3DotProduct(v1, v2);
+float sintheta = Real.Sqrt(1 - costheta * costheta);
+float skew = tan(Trig.DegreesToRadians(angle) + acos(sintheta)) * sintheta - costheta;
+
+// Build orthonormal matrix.
+v1 = FXVector3.Cross(v3, v2);
+v1.Normalize();
+
+Matrix R = Matrix::Identity;
+R[0, 0] = v3.X; // Not sure this is in the correct order...
+R[1, 0] = v3.Y;
+R[2, 0] = v3.Z;
+R[0, 1] = v1.X;
+R[1, 1] = v1.Y;
+R[2, 1] = v1.Z;
+R[0, 2] = v2.X;
+R[1, 2] = v2.Y;
+R[2, 2] = v2.Z;
+
+// Build skew matrix.
+Matrix S = Matrix::Identity;
+S[2, 1] = -skew;
+
+// Return skew transform.
+return R * S * R.Transpose;	// Not sure this is in the correct order...
+}
+*/
+
+/**
+* Generate rotation matrix for the euler angles. This is the same as computing
+* 3 rotation matrices and multiplying them together in our custom order.
+*
+* @todo Have to recompute this code for our new convention.
+**/
+void RotationMatrix( float yaw, float pitch, float roll ) {
+    float sy = sin(yaw+ToRadian(90));
+    float cy = cos(yaw+ToRadian(90));
+    float sp = sin(pitch-ToRadian(90));
+    float cp = cos(pitch-ToRadian(90));
+    float sr = sin(roll);
+    float cr = cos(roll);
+
+    data[0] = cr*cy + sr*sp*sy;
+    data[1] = cp*sy;
+    data[2] = -sr*cy + cr*sp*sy;
+    data[3] = 0;
+
+    data[4] = -cr*sy + sr*sp*cy;
+    data[5] = cp*cy;
+    data[6] = sr*sy + cr*sp*cy;
+    data[7] = 0;
+
+    data[8] = sr*cp;
+    data[9] = -sp;
+    data[10] = cr*cp;
+    data[11] = 0;
+
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = 0;
+    data[15] = 1;
+}
+
+/** Create a frustum matrix with the near and far planes at zNear and zFar. */
+void Frustum( float xmin, float xmax, float ymin, float ymax, float zNear, float zFar ) {
+    float one_deltax, one_deltay, one_deltaz, doubleznear;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    one_deltaz = 1.0f / (zFar - zNear);
+
+    data[0] = (float)(doubleznear * one_deltax);
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+    data[4] = 0.0f;
+    data[5] = (float)(doubleznear * one_deltay);
+    data[6] = 0.f;
+    data[7] = 0.f;
+    data[8] = (float)((xmax + xmin) * one_deltax);
+    data[9] = (float)((ymax + ymin) * one_deltay);
+    data[10] = (float)(-(zFar + zNear) * one_deltaz);
+    data[11] = -1.f;
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = (float)(-(zFar * doubleznear) * one_deltaz);
+    data[15] = 0.f;
+}
+
+/** Create a frustum matrix with the far plane at the infinity. */
+void FrustumInf( float xmin, float xmax, float ymin, float ymax, float zNear ) {
+    float one_deltax, one_deltay, doubleznear, nudge;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    nudge = 1.0; // 0.999;
+
+    data[0] = doubleznear * one_deltax;
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+
+    data[4] = 0.0f;
+    data[5] = doubleznear * one_deltay;
+    data[6] = 0.f;
+    data[7] = 0.f;
+
+    data[8] = (xmax + xmin) * one_deltax;
+    data[9] = (ymax + ymin) * one_deltay;
+    data[10] = -1.0f * nudge;
+    data[11] = -1.0f;
+
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = -doubleznear * nudge;
+    data[15] = 0.f;
+}
+
+/** Create an inverse frustum matrix with the far plane at the infinity. */
+void FrustumInfInv( float left, float right, float bottom, float top, float zNear ) {
+    // this matrix is wrong (not really tested), I think it should be transposed.
+    data[0] = (right - left) / (2 * zNear);
+    data[1] = 0;
+    data[2] = 0;
+    data[3] = (right + left) / (2 * zNear);
+    data[4] = 0;
+    data[5] = (top - bottom) / (2 * zNear);
+    data[6] = 0;
+    data[7] = (top + bottom) / (2 * zNear);
+    data[8] = 0;
+    data[9] = 0;
+    data[10] = 0;
+    data[11] = -1;
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = -1 / (2 * zNear);
+    data[15] = 1 / (2 * zNear);
+}
+
+/** Create an homogeneous projection matrix. */
+void Perspective( float fov, float aspect, float zNear, float zFar ) {
+    float xmin, xmax, ymin, ymax;
+
+    xmax = zNear * tan( fov/2 );
+    xmin = -xmax;
+
+    ymax = xmax / aspect;
+    ymin = -ymax;
+
+    Frustum(xmin, xmax, ymin, ymax, zNear, zFar);
+}
+
+/** Create a projection matrix with the far plane at the infinity. */
+void PerspectiveInf( float fov, float aspect, float zNear ) {
+    float x = zNear * tan( fov/2 );
+    float y = x / aspect;
+    FrustumInf( -x, x, -y, y, zNear );
+}
+
+/** Create an inverse projection matrix with far plane at the infinity. */
+void PerspectiveInfInv( float fov, float aspect, float zNear ) {
+    float x = zNear * tan( fov/2 );
+    float y = x / aspect;
+    FrustumInfInv( -x, x, -y, y, zNear );
+}
+
+/** Build bone matrix from quaternion and offset. */
+void BoneMatrix(const Quat & q, const Vec3 & offset) {
+    float x2, y2, z2, xx, xy, xz, yy, yz, zz, wx, wy, wz;
+
+    // calculate coefficients
+    x2 = q.x + q.x;
+    y2 = q.y + q.y;
+    z2 = q.z + q.z;
+
+    xx = q.x * x2;   xy = q.x * y2;   xz = q.x * z2;
+    yy = q.y * y2;   yz = q.y * z2;   zz = q.z * z2;
+    wx = q.w * x2;   wy = q.w * y2;   wz = q.w * z2;
+
+    data[0] = 1.0f - (yy + zz); 	
+    data[1] = xy - wz;
+    data[2] = xz + wy;		
+    data[3] = 0.0f;
+
+    data[4] = xy + wz;		
+    data[5] = 1.0f - (xx + zz);
+    data[6] = yz - wx;		
+    data[7] = 0.0f;
+
+    data[8] = xz - wy;		
+    data[9] = yz + wx;
+    data[10] = 1.0f - (xx + yy);		
+    data[11] = 0.0f;
+
+    data[12] = offset.x;
+    data[13] = offset.y;
+    data[14] = offset.z;			
+    data[15] = 1.0f;
+}
+
+//@}
+
+
+/** @name Transformations: */
+//@{
+
+/** Apply a general scale. */
+void Scale( float x, float y, float z ) {
+    data[0] *= x;	data[4] *= y;	data[8]  *= z;
+    data[1] *= x;	data[5] *= y;	data[9]  *= z;
+    data[2] *= x;	data[6] *= y;	data[10] *= z;
+    data[3] *= x;	data[7] *= y;	data[11] *= z;
+}
+
+/** Apply a rotation of theta radians around the axis v. */
+void Rotate( float theta, const Vec3 & v ) {
+    Matrix b;
+    b.RotationMatrix( theta, v[0], v[1], v[2] );
+    Multiply4x3( b );
+}
+
+/** Apply a rotation of theta radians around the axis (v0, v1, v2). */
+void Rotate( float theta, float v0, float v1, float v2 ) {
+    Matrix b;
+    b.RotationMatrix( theta, v0, v1, v2 );
+    Multiply4x3( b );
+}
+
+/**
+* Translate the matrix by t. This is the same as multiplying by a
+* translation matrix with the given offset.
+* this = T * this
+*/
+void Translate( const Vec3 &t ) {
+    data[12] = data[0] * t.x + data[4] * t.y + data[8]  * t.z + data[12];
+    data[13] = data[1] * t.x + data[5] * t.y + data[9]  * t.z + data[13];
+    data[14] = data[2] * t.x + data[6] * t.y + data[10] * t.z + data[14];
+    data[15] = data[3] * t.x + data[7] * t.y + data[11] * t.z + data[15];
+}
+
+/** 
+* Translate the matrix by x, y, z. This is the same as multiplying by a 
+* translation matrix with the given offsets.
+*/
+void Translate( float x, float y, float z ) {
+    data[12] = data[0] * x + data[4] * y + data[8]  * z + data[12];
+    data[13] = data[1] * x + data[5] * y + data[9]  * z + data[13];
+    data[14] = data[2] * x + data[6] * y + data[10] * z + data[14];
+    data[15] = data[3] * x + data[7] * y + data[11] * z + data[15];
+}
+
+/** Compute the transposed matrix. */
+void Transpose() {
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+    piSwap(data[3], data[12]);
+    piSwap(data[7], data[13]);
+    piSwap(data[11], data[14]);
+}
+
+/** Compute the inverse of a rigid-body/isometry/orthonormal matrix. */
+void IsometryInverse() {
+    // transposed 3x3 upper left matrix
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+
+    // translate by the negative offsets
+    Vec3 v(-data[12], -data[13], -data[14]);
+    data[12] = data[13] = data[14] = 0;
+    Translate(v);
+}
+
+/** Compute the inverse of the affine portion of this matrix. */
+void AffineInverse() {
+    data[12] = data[13] = data[14] = 0;
+    Transpose();
+}
+//@}
+
+/** @name Matrix operations: */
+//@{
+
+/** Return the determinant of this matrix. */
+float Determinant() const {
+    return	data[0] * data[5] * data[10] * data[15] + 
+        data[1] * data[6] * data[11] * data[12] +
+        data[2] * data[7] * data[ 8] * data[13] +
+        data[3] * data[4] * data[ 9] * data[14] -
+        data[3] * data[6] * data[ 9] * data[12] -
+        data[2] * data[5] * data[ 8] * data[15] -
+        data[1] * data[4] * data[11] * data[14] -
+        data[0] * data[7] * data[10] * data[12];
+}
+
+
+/** Standard matrix product: this *= B. */
+void Multiply4x4( const Matrix & restrict B ) {
+    Multiply4x4(*this, B);
+}
+
+/** Standard matrix product: this = A * B. this != B*/
+void Multiply4x4( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 4; i++) {
+        const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+
+    /* Unrolled but does not allow this == A
+    data[0] = A.data[0] * B.data[0] + A.data[4] * B.data[1] + A.data[8] * B.data[2] + A.data[12] * B.data[3];
+    data[1] = A.data[1] * B.data[0] + A.data[5] * B.data[1] + A.data[9] * B.data[2] + A.data[13] * B.data[3];
+    data[2] = A.data[2] * B.data[0] + A.data[6] * B.data[1] + A.data[10] * B.data[2] + A.data[14] * B.data[3];
+    data[3] = A.data[3] * B.data[0] + A.data[7] * B.data[1] + A.data[11] * B.data[2] + A.data[15] * B.data[3];
+    data[4] = A.data[0] * B.data[4] + A.data[4] * B.data[5] + A.data[8] * B.data[6] + A.data[12] * B.data[7];
+    data[5] = A.data[1] * B.data[4] + A.data[5] * B.data[5] + A.data[9] * B.data[6] + A.data[13] * B.data[7];
+    data[6] = A.data[2] * B.data[4] + A.data[6] * B.data[5] + A.data[10] * B.data[6] + A.data[14] * B.data[7];
+    data[7] = A.data[3] * B.data[4] + A.data[7] * B.data[5] + A.data[11] * B.data[6] + A.data[15] * B.data[7];
+    data[8] = A.data[0] * B.data[8] + A.data[4] * B.data[9] + A.data[8] * B.data[10] + A.data[12] * B.data[11];
+    data[9] = A.data[1] * B.data[8] + A.data[5] * B.data[9] + A.data[9] * B.data[10] + A.data[13] * B.data[11];
+    data[10]= A.data[2] * B.data[8] + A.data[6] * B.data[9] + A.data[10] * B.data[10] + A.data[14] * B.data[11];
+    data[11]= A.data[3] * B.data[8] + A.data[7] * B.data[9] + A.data[11] * B.data[10] + A.data[15] * B.data[11];
+    data[12]= A.data[0] * B.data[12] + A.data[4] * B.data[13] + A.data[8] * B.data[14] + A.data[12] * B.data[15];
+    data[13]= A.data[1] * B.data[12] + A.data[5] * B.data[13] + A.data[9] * B.data[14] + A.data[13] * B.data[15];
+    data[14]= A.data[2] * B.data[12] + A.data[6] * B.data[13] + A.data[10] * B.data[14] + A.data[14] * B.data[15];
+    data[15]= A.data[3] * B.data[12] + A.data[7] * B.data[13] + A.data[11] * B.data[14] + A.data[15] * B.data[15];
+    */
+}
+
+/** Standard matrix product: this *= B. */
+void Multiply4x3( const Matrix & restrict B ) {
+    Multiply4x3(*this, B);
+}
+
+/** Standard product of matrices, where the last row is [0 0 0 1]. */
+void Multiply4x3( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 3; i++) {
+        const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+    data[3] = 0.0f; data[7] = 0.0f; data[11] = 0.0f; data[15] = 1.0f;
+
+    /* Unrolled but does not allow this == A
+    data[0] = a.data[0] * b.data[0] + a.data[4] * b.data[1] + a.data[8] * b.data[2] + a.data[12] * b.data[3];
+    data[1] = a.data[1] * b.data[0] + a.data[5] * b.data[1] + a.data[9] * b.data[2] + a.data[13] * b.data[3];
+    data[2] = a.data[2] * b.data[0] + a.data[6] * b.data[1] + a.data[10] * b.data[2] + a.data[14] * b.data[3];
+    data[3] = 0.0f;
+    data[4] = a.data[0] * b.data[4] + a.data[4] * b.data[5] + a.data[8] * b.data[6] + a.data[12] * b.data[7];
+    data[5] = a.data[1] * b.data[4] + a.data[5] * b.data[5] + a.data[9] * b.data[6] + a.data[13] * b.data[7];
+    data[6] = a.data[2] * b.data[4] + a.data[6] * b.data[5] + a.data[10] * b.data[6] + a.data[14] * b.data[7];
+    data[7] = 0.0f;
+    data[8] = a.data[0] * b.data[8] + a.data[4] * b.data[9] + a.data[8] * b.data[10] + a.data[12] * b.data[11];
+    data[9] = a.data[1] * b.data[8] + a.data[5] * b.data[9] + a.data[9] * b.data[10] + a.data[13] * b.data[11];
+    data[10]= a.data[2] * b.data[8] + a.data[6] * b.data[9] + a.data[10] * b.data[10] + a.data[14] * b.data[11];
+    data[11]= 0.0f;
+    data[12]= a.data[0] * b.data[12] + a.data[4] * b.data[13] + a.data[8] * b.data[14] + a.data[12] * b.data[15];
+    data[13]= a.data[1] * b.data[12] + a.data[5] * b.data[13] + a.data[9] * b.data[14] + a.data[13] * b.data[15];
+    data[14]= a.data[2] * b.data[12] + a.data[6] * b.data[13] + a.data[10] * b.data[14] + a.data[14] * b.data[15];
+    data[15]= 1.0f;
+    */
+}
+//@}
+
+
+/** @name Vector operations: */
+//@{
+
+/** Transform 3d vector (w=0). */
+void TransformVec3(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10];
+}
+/** Transform 3d vector by the transpose (w=0). */
+void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2];
+    dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6];
+    dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10];
+}
+
+/** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. */
+void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+}
+
+/** Transform a point, normalize it, and return w. */
+float TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    float w;
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]);
+    *dest *= w;
+    return w;
+}
+
+/** Transform a point and return w. */
+float TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+
+/** Transform a normalized 3d point by a 4d matrix and return the resulting 4d vector. */
+void TransformVec4(const Vec3 & orig, Vec4 * dest) const {
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+//@}
+
+/** @name Matrix analysis. */
+//@{
+
+/** Get the ZYZ euler angles from the matrix. Assumes the matrix is orthonormal. */
+void GetEulerAnglesZYZ(float * s, float * t, float * r) const {
+    if( GetElem(2,2) < 1.0f ) {
+        if( GetElem(2,2) > -1.0f ) {
+            // 	cs*ct*cr-ss*sr 		-ss*ct*cr-cs*sr		st*cr
+            //	cs*ct*sr+ss*cr		-ss*ct*sr+cs*cr		st*sr
+            //	-cs*st				ss*st				ct
+            *s = atan2(GetElem(1,2), -GetElem(0,2));
+            *t = acos(GetElem(2,2));
+            *r = atan2(GetElem(2,1), GetElem(2,0));		
+        }
+        else {
+            // 	-c(s-r)	 	s(s-r)		0
+            //	s(s-r)		c(s-r)		0
+            //	0			0			-1
+            *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r
+            *t = PI;
+            *r = 0;
+        }
+    }
+    else {
+        // 	c(s+r)		-s(s+r)		0
+        //	s(s+r)		c(s+r)		0
+        //	0			0			1
+        *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r
+        *t = 0;
+        *r = 0;
+    }
+}
+
+//@}
+
+MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m );
+
+/** Print to debug output. */
+void Print() const {
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] );
+}
+
+
+public:
+
+    float data[16];
+
+};
+#endif
+
+
+#endif // NV_MATH_MATRIX_INL
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/nvmath.h b/3rdparty/bimg/3rdparty/nvtt/nvmath/nvmath.h
new file mode 100644
index 0000000..94f7ec7
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/nvmath.h
@@ -0,0 +1,61 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_H
+#define NV_MATH_H
+
+#include <cmath>
+#include <float.h>  // finite, isnan
+
+#include "nvcore/utils.h"   // max, clamp
+
+#define NVMATH_API
+#define NVMATH_CLASS
+
+#define PI                  float(3.1415926535897932384626433833)
+#define NV_EPSILON          (0.0001f)
+#define NV_NORMAL_EPSILON   (0.001f)
+
+namespace nv
+{
+    inline float toRadian(float degree) { return (PI / 180.0f) * degree; }  // degrees -> radians
+    inline float toDegree(float radian) { return (180.0f / PI) * radian; }  // radians -> degrees
+
+    // Robust floating point comparisons:
+    // http://realtimecollisiondetection.net/blog/?p=89
+    inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON)
+    {
+        // Tolerance is epsilon scaled by max(1, |f0|, |f1|): absolute near zero, relative for large values.
+        return fabsf(f0-f1) <= epsilon * max3(1.0f, fabsf(f0), fabsf(f1));
+    }
+
+    inline bool isZero(const float f, const float epsilon = NV_EPSILON)
+    {   // True when |f| does not exceed epsilon (absolute tolerance); false for NaN.
+        return epsilon >= fabsf(f);
+    }
+
+    inline bool isFinite(const float f)
+    {
+#if defined(_MSC_VER) && _MSC_VER <= 1800
+		// std::isfinite is avoided on these MSVC versions; _finite comes from <float.h> (nonzero = finite).
+		return _finite(f) != 0;
+#else
+		return std::isfinite(f);
+#endif // defined(_MSC_VER) && _MSC_VER <= 1800
+    }
+
+    // Rewrites every -0.0f among the first n entries of fp as +0.0f; other values are left untouched.
+    inline void floatCleanup(float * fp, int n)
+    {
+        for (int k = 0; k < n; ++k) {
+            // Inspect the raw bits: 0x80000000 is the sign bit alone, i.e. negative zero.
+            union { float f; uint32 i; } bits = { fp[k] };
+            if (bits.i == 0x80000000) fp[k] = 0.0f;
+        }
+    }
+
+    inline float saturate(float f) {
+        return clamp(f, float(0), float(1));    // limit f to the range [0, 1]
+    }
+}
+
+#endif // NV_MATH_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/plane.h b/3rdparty/bimg/3rdparty/nvtt/nvmath/plane.h
new file mode 100644
index 0000000..eb544b1
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/plane.h
@@ -0,0 +1,40 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#ifndef NV_MATH_PLANE_H
+#define NV_MATH_PLANE_H
+
+#include "nvmath.h"
+#include "vector.h"
+
+namespace nv
+{
+    class Matrix;
+    // Plane stored as four coefficients in a Vector4; the inline member definitions live in plane.inl.
+    class NVMATH_CLASS Plane
+    {
+    public:
+        Plane();                                                            // no explicit initializer for v
+        Plane(float x, float y, float z, float w);                          // v constructed from (x, y, z, w)
+        Plane(const Vector4 & v);                                           // copies the coefficient vector
+        Plane(const Vector3 & v, float d);                                  // v constructed from (v, d)
+        Plane(const Vector3 & normal, const Vector3 & point);               // v constructed from (normal, -dot(normal, point))
+        Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2);  // normal = cross(v1-v0, v2-v0), left unnormalized
+
+        const Plane & operator=(const Plane & v);                           // copies the coefficient vector, returns *this
+
+        Vector3 vector() const;     // returns v.xyz()
+        float offset() const;       // returns v.w
+
+        void operator*=(float s);   // applies v *= s
+
+        Vector4 v;                  // the four plane coefficients
+    };
+
+    Plane transformPlane(const Matrix &, const Plane &);
+
+    Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c);
+
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/plane.inl b/3rdparty/bimg/3rdparty/nvtt/nvmath/plane.inl
new file mode 100644
index 0000000..7baf804
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/plane.inl
@@ -0,0 +1,49 @@
+// This code is in the public domain -- Ignacio Castaño <castano@gmail.com>
+
+#pragma once
+#ifndef NV_MATH_PLANE_INL
+#define NV_MATH_PLANE_INL
+
+#include "plane.h"
+#include "vector.inl"
+
+namespace nv
+{
+    inline Plane::Plane() {}    // no explicit initializer for v
+    inline Plane::Plane(float x, float y, float z, float w) : v(x, y, z, w) {}    // coefficients passed straight to Vector4
+    inline Plane::Plane(const Vector4 & v) : v(v) {}    // member v is copied from the parameter of the same name
+    inline Plane::Plane(const Vector3 & v, float d) : v(v, d) {}    // built with Vector4(Vector3, float)
+    inline Plane::Plane(const Vector3 & normal, const Vector3 & point) : v(normal, -dot(normal, point)) {}    // last coefficient is -dot(normal, point)
+    inline Plane::Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2) {
+        // Normal from the two edges leaving v0 (left unnormalized); last coefficient is -dot(normal, v0).
+        const Vector3 normal = cross(v1 - v0, v2 - v0);
+        v = Vector4(normal, -dot(normal, v0));
+    }
+
+    inline const Plane & Plane::operator=(const Plane & p) { this->v = p.v; return *this; }  // copies the coefficients
+
+    inline Vector3 Plane::vector() const { return this->v.xyz(); }  // first three coefficients
+    inline float Plane::offset() const { return this->v.w; }  // fourth coefficient
+
+    // Divide all coefficients by the length of plane.vector(); if that length is within epsilon of zero, scale by 0 instead.
+    inline Plane normalize(const Plane & plane, float epsilon = NV_EPSILON)
+    {
+        const float normalLength = length(plane.vector());
+        if (isZero(normalLength, epsilon)) return Plane(plane.v * 0.0f);
+        return Plane(plane.v * (1.0f / normalLength));
+    }
+
+    // Signed plane equation value at point: dot(plane.vector(), point) + plane.offset().
+    inline float distance(const Plane & plane, const Vector3 & point)
+    {
+        return plane.offset() + dot(plane.vector(), point);
+    }
+
+    inline void Plane::operator*=(float s)
+    {   // Forwards to Vector4::operator*=(float) on the coefficient vector.
+        this->v *= s;
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_INL
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/vector.h b/3rdparty/bimg/3rdparty/nvtt/nvmath/vector.h
new file mode 100644
index 0000000..180cfab
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/vector.h
@@ -0,0 +1,148 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_VECTOR_H
+#define NV_MATH_VECTOR_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+    class NVMATH_CLASS Vector2
+    {
+    public:
+        typedef Vector2 const & Arg;    // parameter-passing type: const reference
+
+        Vector2();                      // leaves x and y uninitialized
+        explicit Vector2(float f);      // sets both x and y to f
+        Vector2(float x, float y);
+        Vector2(Vector2::Arg v);        // component-wise copy
+
+        //template <typename T> explicit Vector2(const T & v) : x(v.x), y(v.y) {}
+        //template <typename T> operator T() const { return T(x, y); }
+
+        const Vector2 & operator=(Vector2::Arg v);  // copies x and y, returns *this
+
+        const float * ptr() const;      // returns &x
+
+        void set(float x, float y);
+
+        Vector2 operator-() const;          // returns Vector2(-x, -y)
+        void operator+=(Vector2::Arg v);    // component-wise
+        void operator-=(Vector2::Arg v);    // component-wise
+        void operator*=(float s);           // multiplies x and y by s
+        void operator*=(Vector2::Arg v);    // component-wise product
+
+        friend bool operator==(Vector2::Arg a, Vector2::Arg b);    // exact comparison of x and y
+        friend bool operator!=(Vector2::Arg a, Vector2::Arg b);
+
+        union {                         // x, y are overlaid with component[]
+            struct {
+                float x, y;
+            };
+            float component[2];
+        };
+    };
+
+    class NVMATH_CLASS Vector3
+    {
+    public:
+        typedef Vector3 const & Arg;    // parameter-passing type: const reference
+
+        Vector3();
+        explicit Vector3(float x);      // NOTE(review): presumably broadcasts x to all three components -- confirm in vector.inl
+        //explicit Vector3(int x) : x(float(x)), y(float(x)), z(float(x)) {}
+        Vector3(float x, float y, float z);
+        Vector3(Vector2::Arg v, float z);
+        Vector3(Vector3::Arg v);
+
+        //template <typename T> explicit Vector3(const T & v) : x(v.x), y(v.y), z(v.z) {}
+        //template <typename T> operator T() const { return T(x, y, z); }
+
+        const Vector3 & operator=(Vector3::Arg v);
+
+        Vector2 xy() const;
+
+        const float * ptr() const;
+
+        void set(float x, float y, float z);
+
+        Vector3 operator-() const;
+        void operator+=(Vector3::Arg v);
+        void operator-=(Vector3::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator*=(Vector3::Arg v);
+        void operator/=(Vector3::Arg v);
+
+        friend bool operator==(Vector3::Arg a, Vector3::Arg b);
+        friend bool operator!=(Vector3::Arg a, Vector3::Arg b);
+
+        union {                         // x, y, z are overlaid with component[]
+            struct {
+                float x, y, z;
+            };
+            float component[3];
+        };
+    };
+
+    class NVMATH_CLASS Vector4
+    {
+    public:
+        typedef Vector4 const & Arg;    // parameter-passing type: const reference
+
+        Vector4();
+        explicit Vector4(float x);      // NOTE(review): presumably broadcasts x to all four components -- confirm in vector.inl
+        Vector4(float x, float y, float z, float w);
+        Vector4(Vector2::Arg v, float z, float w);
+        Vector4(Vector2::Arg v, Vector2::Arg u);
+        Vector4(Vector3::Arg v, float w);
+        Vector4(Vector4::Arg v);
+        // Vector4(const Quaternion & v);
+
+        //template <typename T> explicit Vector4(const T & v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+        //template <typename T> operator T() const { return T(x, y, z, w); }
+
+        const Vector4 & operator=(Vector4::Arg v);
+
+        Vector2 xy() const;
+        Vector2 zw() const;
+        Vector3 xyz() const;
+
+        const float * ptr() const;
+
+        void set(float x, float y, float z, float w);
+
+        Vector4 operator-() const;
+        void operator+=(Vector4::Arg v);
+        void operator-=(Vector4::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator*=(Vector4::Arg v);
+        void operator/=(Vector4::Arg v);
+
+        friend bool operator==(Vector4::Arg a, Vector4::Arg b);
+        friend bool operator!=(Vector4::Arg a, Vector4::Arg b);
+
+        union {                         // x, y, z, w are overlaid with component[]
+            struct {
+                float x, y, z, w;
+            };
+            float component[4];
+        };
+    };
+
+} // nv namespace
+
+// If we had these functions, they would be ambiguous, the compiler would not know which one to pick:
+//template <typename T> Vector2 to(const T & v) { return Vector2(v.x, v.y); }
+//template <typename T> Vector3 to(const T & v) { return Vector3(v.x, v.y, v.z); }
+//template <typename T> Vector4 to(const T & v) { return Vector4(v.x, v.y, v.z, v.w); }
+
+// We could use a cast operator so that we could infer the expected type, but that doesn't work the same way in all compilers and produces horrible error messages.
+
+// Instead we simply have explicit casts: to<T>(v) builds a T from v's components; the check requires sizeof(T) to match the nv vector.
+template <typename T> T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); }
+template <typename T> T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); }
+template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); }
+
+#endif // NV_MATH_VECTOR_H
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvmath/vector.inl b/3rdparty/bimg/3rdparty/nvtt/nvmath/vector.inl
new file mode 100644
index 0000000..8f1da1e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvmath/vector.inl
@@ -0,0 +1,921 @@
+// This code is in the public domain -- castanyo@yahoo.es
+
+#ifndef NV_MATH_VECTOR_INL
+#define NV_MATH_VECTOR_INL
+
+#include "vector.h"
+#include "nvcore/utils.h" // min, max
+#include "nvcore/hash.h" // hash
+
+namespace nv
+{
+
+    // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor.
+    //template <typename T> T to(Vector2::Arg v) { return T(v.x, v.y); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector3::Arg v) { return T(v.x, v.y, v.z); }
+
+    // Helpers to convert vector types. Assume T has x,y,z,w members and 4 argument constructor.
+    //template <typename T> T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); }
+
+
+    // Vector2
+    inline Vector2::Vector2() {}
+    inline Vector2::Vector2(float f) : x(f), y(f) {}
+    inline Vector2::Vector2(float x, float y) : x(x), y(y) {}
+    inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {}
+
+    inline const Vector2 & Vector2::operator=(Vector2::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        return *this;
+    }
+
+    inline const float * Vector2::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector2::set(float x, float y)
+    {
+        this->x = x;
+        this->y = y;
+    }
+
+    inline Vector2 Vector2::operator-() const
+    {
+        return Vector2(-x, -y);
+    }
+
+    inline void Vector2::operator+=(Vector2::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+    }
+
+    inline void Vector2::operator-=(Vector2::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+    }
+
+    inline void Vector2::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+    }
+
+    inline void Vector2::operator*=(Vector2::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+    }
+
+    inline bool operator==(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x == b.x && a.y == b.y; 
+    }
+    inline bool operator!=(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x != b.x || a.y != b.y; 
+    }
+
+
+    // Vector3
+    inline Vector3::Vector3() {}
+    inline Vector3::Vector3(float f) : x(f), y(f), z(f) {}
+    inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
+    inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {}
+    inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {}
+
+    inline const Vector3 & Vector3::operator=(Vector3::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        return *this;
+    }
+
+
+    inline Vector2 Vector3::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline const float * Vector3::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector3::set(float x, float y, float z)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+    }
+
+    inline Vector3 Vector3::operator-() const
+    {
+        return Vector3(-x, -y, -z);
+    }
+
+    inline void Vector3::operator+=(Vector3::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+    }
+
+    inline void Vector3::operator-=(Vector3::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+    }
+
+    inline void Vector3::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+    }
+
+    inline void Vector3::operator/=(float s)
+    {
+        const float inv = 1.0f / s; // divide once, then scale every component by the reciprocal
+        x *= inv;
+        y *= inv;
+        z *= inv;
+    }
+
+    inline void Vector3::operator*=(Vector3::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+    }
+
+    inline void Vector3::operator/=(Vector3::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+    }
+
+    inline bool operator==(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z; 
+    }
+    inline bool operator!=(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z; 
+    }
+
+
+    // Vector4
+    inline Vector4::Vector4() {}
+    inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {}
+    inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
+    inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {}
+    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+
+    inline const Vector4 & Vector4::operator=(const Vector4 & v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        w = v.w;
+        return *this;
+    }
+
+    inline Vector2 Vector4::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline Vector2 Vector4::zw() const
+    {
+        return Vector2(z, w);
+    }
+
+    inline Vector3 Vector4::xyz() const
+    {
+        return Vector3(x, y, z);
+    }
+
+    inline const float * Vector4::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector4::set(float x, float y, float z, float w)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+        this->w = w;
+    }
+
+    inline Vector4 Vector4::operator-() const
+    {
+        return Vector4(-x, -y, -z, -w);
+    }
+
+    inline void Vector4::operator+=(Vector4::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+        w += v.w;
+    }
+
+    inline void Vector4::operator-=(Vector4::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+        w -= v.w;
+    }
+
+    inline void Vector4::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+        w *= s;
+    }
+
+    inline void Vector4::operator/=(float s)
+    {
+        x /= s;
+        y /= s;
+        z /= s;
+        w /= s;
+    }
+
+    inline void Vector4::operator*=(Vector4::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+        w *= v.w;
+    }
+
+    inline void Vector4::operator/=(Vector4::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+        w /= v.w;
+    }
+
+    inline bool operator==(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; 
+    }
+    inline bool operator!=(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; 
+    }
+
+
+
+    // Functions
+
+
+    // Vector2
+
+    inline Vector2 add(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x + b.x, a.y + b.y);
+    }
+    inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector2 sub(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x - b.x, a.y - b.y);
+    }
+    inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, float s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, Vector2::Arg s)
+    {
+        return Vector2(v.x * s.x, v.y * s.y);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2)
+    {
+        return Vector2(v1.x*v2.x, v1.y*v2.y);
+    }
+
+    inline Vector2 operator*(float s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
+    }
+
+    inline float dot(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x * b.x + a.y * b.y;
+    }
+
+    inline float lengthSquared(Vector2::Arg v)
+    {
+        return v.x * v.x + v.y * v.y;
+    }
+
+    inline float length(Vector2::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector2::Arg a, Vector2::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float inverseLength(Vector2::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON)
+    {   // Returns v scaled by 1/length(v); a near-zero length is only reported through nvDebugCheck.
+        NV_UNUSED(epsilon); // presumably keeps epsilon referenced when nvDebugCheck compiles to nothing -- confirm
+        const float len = length(v);
+        nvDebugCheck(!isZero(len, epsilon));
+        const Vector2 unit = scale(v, 1.0f / len);
+        nvDebugCheck(isNormalized(unit));
+        return unit;
+    }
+
+    inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON)
+    {   // Unit-length copy of v, or fallback when isZero(length(v), epsilon) holds.
+        const float len = length(v);
+        if (!isZero(len, epsilon)) {
+            return scale(v, 1.0f / len);
+        }
+        return fallback;
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector2 normalizeFast(Vector2::Arg v)
+    {   // Adds a tiny bias to the length so the divisor is never zero; no branch on the length is needed.
+        const float tiny = 1.0e-037f;
+        const float biasedLength = tiny + length(v);
+        return scale(v, 1.0f / biasedLength);
+    }
+
+    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
+    }
+
+    inline Vector2 min(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(min(a.x, b.x), min(a.y, b.y));
+    }
+
+    inline Vector2 max(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(max(a.x, b.x), max(a.y, b.y));
+    }
+
+    inline Vector2 clamp(Vector2::Arg v, float min, float max)
+    {
+        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
+    }
+
+    inline Vector2 saturate(Vector2::Arg v)
+    {
+        return Vector2(saturate(v.x), saturate(v.y));
+    }
+
+    inline bool isFinite(Vector2::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y);
+    }
+
+    inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f))
+    {   // Returns fallback when v is not finite, otherwise a copy of v run through nv::floatCleanup.
+        if (!isFinite(v)) return fallback;
+        Vector2 cleaned(v);
+        nv::floatCleanup(cleaned.component, 2);
+        return cleaned;
+    }
+
+    // Note, this is the area scaled by 2!
+    inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
+    {   // 2D cross product of v0 and v1: the area scaled by 2, sign included (the 0.5f factor is left out).
+	    return (v0.x * v1.y - v0.y * v1.x); // * 0.5f;
+    }
+    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
+    {   // Area of triangle (a, b, c) scaled by 2, computed from the edge vectors a-c and b-c.
+        // IC: While it may be appealing to use the following expression:
+        //return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f;
+
+        // That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point
+        // numbers, and the result becomes very unstable and dependent on the order of the factors.
+
+        // Instead, it's preferable to subtract the vertices first, and multiply the resulting small values together. The result
+        // in this case is always much more accurate (as long as the triangle is small) and less dependent on the location of
+        // the triangle.
+
+        //return ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)); // * 0.5f;
+        return triangleArea(a-c, b-c);
+    }
+
+
+    template <>
+    inline uint hash(const Vector2 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 2, h);
+    }
+
+
+
+    // Vector3
+
+    inline Vector3 add(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+    inline Vector3 add(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x + b, a.y + b, a.z + b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b)
+    {
+        return add(a, b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, float b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector3 sub(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
+    }
+    inline Vector3 sub(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x - b, a.y - b, a.z - b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b)
+    {
+        return sub(a, b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, float b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector3 cross(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, float s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, Vector3::Arg s)
+    {
+        return Vector3(v.x * s.x, v.y * s.y, v.z * s.z);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(float s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s)
+    {
+        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
+    }*/
+
+    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z);
+    }
+
+    inline float dot(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z;
+    }
+
+    inline float lengthSquared(Vector3::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z;
+    }
+
+    inline float length(Vector3::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector3::Arg a, Vector3::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
+    {
+        return lengthSquared(a - b);
+    }
+
+    inline float inverseLength(Vector3::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON)
+    {   // Returns v scaled by 1/length(v); a near-zero length is only reported through nvDebugCheck.
+        NV_UNUSED(epsilon); // presumably keeps epsilon referenced when nvDebugCheck compiles to nothing -- confirm
+        const float len = length(v);
+        nvDebugCheck(!isZero(len, epsilon));
+        const Vector3 unit = scale(v, 1.0f / len);
+        nvDebugCheck(isNormalized(unit));
+        return unit;
+    }
+
+    inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON)
+    {   // Unit-length copy of v, or fallback when isZero(length(v), epsilon) holds.
+        const float len = length(v);
+        if (!isZero(len, epsilon)) {
+            return scale(v, 1.0f / len);
+        }
+        return fallback;
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector3 normalizeFast(Vector3::Arg v)
+    {   // Adds a tiny bias to the length so the divisor is never zero; no branch on the length is needed.
+        const float tiny = 1.0e-037f;
+        const float biasedLength = tiny + length(v);
+        return scale(v, 1.0f / biasedLength);
+    }
+
+    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
+    }
+
+    inline Vector3 min(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+    }
+
+    inline Vector3 max(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+    }
+
+    inline Vector3 clamp(Vector3::Arg v, float min, float max)
+    {
+        return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
+    }
+
+    inline Vector3 saturate(Vector3::Arg v)
+    {
+        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
+    }
+
+    inline Vector3 floor(Vector3::Arg v)
+    {
+        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
+    }
+
+    inline Vector3 ceil(Vector3::Arg v)
+    {
+        return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
+    }
+
+    inline bool isFinite(Vector3::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
+    }
+
+    inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f))
+    {   // Returns fallback when v is not finite, otherwise a copy of v run through nv::floatCleanup.
+        if (!isFinite(v)) return fallback;
+        Vector3 cleaned(v);
+        nv::floatCleanup(cleaned.component, 3);
+        return cleaned;
+    }
+
+    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
+    {   // Computes v - 2*dot(v, n)*n. NOTE(review): presumably n is expected to be unit length -- confirm with callers.
+	    return v - (2 * dot(v, n)) * n;
+    }
+
+    template <>
+    inline uint hash(const Vector3 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 3, h);
+    }
+
+
+    // Vector4
+
+    inline Vector4 add(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    }
+    inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector4 sub(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+    }
+    inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, float s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, Vector4::Arg s)
+    {
+        return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(float s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
+    {
+        return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
+    }*/
+
+    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
+    }
+
+    inline float dot(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+    }
+
+    inline float lengthSquared(Vector4::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
+    }
+
+    inline float length(Vector4::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float inverseLength(Vector4::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON)
+    {   // Returns v scaled by 1/length(v); a near-zero length is only reported through nvDebugCheck.
+        NV_UNUSED(epsilon); // presumably keeps epsilon referenced when nvDebugCheck compiles to nothing -- confirm
+        const float len = length(v);
+        nvDebugCheck(!isZero(len, epsilon));
+        const Vector4 unit = scale(v, 1.0f / len);
+        nvDebugCheck(isNormalized(unit));
+        return unit;
+    }
+
+    inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON)
+    {   // Unit-length copy of v, or fallback when isZero(length(v), epsilon) holds.
+        const float len = length(v);
+        if (!isZero(len, epsilon)) {
+            return scale(v, 1.0f / len);
+        }
+        return fallback;
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector4 normalizeFast(Vector4::Arg v)
+    {   // Adds a tiny bias to the length so the divisor is never zero; no branch on the length is needed.
+        const float tiny = 1.0e-037f;
+        const float biasedLength = tiny + length(v);
+        return scale(v, 1.0f / biasedLength);
+    }
+
+    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
+    }
+
+    inline Vector4 min(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+    }
+
+    inline Vector4 max(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+    }
+
+    inline Vector4 clamp(Vector4::Arg v, float min, float max)
+    {
+        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
+    }
+
+    inline Vector4 saturate(Vector4::Arg v)
+    {
+        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
+    }
+
+    inline bool isFinite(Vector4::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
+    }
+
+    inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f))
+    {   // Returns fallback when v is not finite, otherwise a copy of v run through nv::floatCleanup.
+        if (!isFinite(v)) return fallback;
+        Vector4 cleaned(v);
+        nv::floatCleanup(cleaned.component, 4);
+        return cleaned;
+    }
+
+    template <>
+    inline uint hash(const Vector4 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 4, h);
+    }
+
+
+#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float
+
+    //int:
+
+    inline Vector2 scale(Vector2::Arg v, int s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(int s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, int s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(int s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, int s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(int s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    //double:
+
+    inline Vector3 operator*(Vector3::Arg v, double s)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator*(double s, Vector3::Arg v)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, double s)
+    {
+        return scale(v, 1.f/((float)s));
+    }    
+        
+#endif //NV_OS_IOS
+
+} // nv namespace
+
+#endif // NV_MATH_VECTOR_INL
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvtt.cpp b/3rdparty/bimg/3rdparty/nvtt/nvtt.cpp
new file mode 100644
index 0000000..51a2bce
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvtt.cpp
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "nvtt.h"
+
+#include <string.h>
+#include <bx/uint32_t.h>
+
+#include "bc6h/zoh.h"
+#include "bc7/avpcl.h"
+#include "nvmath/vector.inl"
+
+NVCORE_API int nvAbort(const char *, const char *, int , const char *, const char *, ...)
+{
+	abort(); // every argument is ignored; the process is terminated right away
+	return 0; // never reached, present only to satisfy the int return type
+}
+
+namespace nvtt
+{
+	using namespace nv;
+
+	void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output)
+	{
+		// Encodes 4-float pixels (_stride = source row pitch in bytes) into one 16-byte BC6H block per 4x4 tile; alpha is unused.
+		const uint8_t* src = (const uint8_t*)_input;
+		char* dst = (char*)_output;
+		const uint32_t blocksPerRow = (_width + 3) / 4; // blocks are written row-major; NOTE(review): full 4x4 tiles are read
+		for (uint32_t yy = 0; yy < _height; yy += 4)
+		{
+			for (uint32_t xx = 0; xx < _width; xx += 4)
+			{
+				ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;
+				ZOH::Tile zohTile(4, 4);
+				memset(zohTile.data, 0, sizeof(zohTile.data) );
+				memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map) );
+
+				for (uint32_t blockY = 0; blockY < 4; ++blockY)
+				{
+					// Rows of a tile lie a full image row apart in the source, so each row is located through _stride.
+					const Vector4* row = (const Vector4*)&src[(yy + blockY)*_stride + xx*sizeof(float)*4];
+					for (uint32_t blockX = 0; blockX < 4; ++blockX)
+					{
+						Vector4 color = row[blockX];
+						uint16 rHalf = bx::halfFromFloat(color.x);
+						uint16 gHalf = bx::halfFromFloat(color.y);
+						uint16 bHalf = bx::halfFromFloat(color.z);
+						zohTile.data[blockY][blockX].x = ZOH::Tile::half2float(rHalf);
+						zohTile.data[blockY][blockX].y = ZOH::Tile::half2float(gHalf);
+						zohTile.data[blockY][blockX].z = ZOH::Tile::half2float(bHalf);
+						zohTile.importance_map[blockY][blockX] = 1.0f;
+					}
+				}
+
+				ZOH::compress(zohTile, &dst[( (yy/4)*blocksPerRow + xx/4) * 16]);
+			}
+		}
+	}
+
+	void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output)
+	{
+		// Encodes 4-float RGBA pixels (_stride = source row pitch in bytes) into one 16-byte BC7 block per 4x4 tile.
+		const uint8_t* src = (const uint8_t*)_input;
+		char* dst = (char*)_output;
+		const uint32_t blocksPerRow = (_width + 3) / 4; // blocks are written row-major; NOTE(review): full 4x4 tiles are read
+		for (uint32_t yy = 0; yy < _height; yy += 4)
+		{
+			for (uint32_t xx = 0; xx < _width; xx += 4)
+			{
+				AVPCL::mode_rgb     = false;
+				AVPCL::flag_premult = false;
+				AVPCL::flag_nonuniform     = false;
+				AVPCL::flag_nonuniform_ati = false;
+
+				AVPCL::Tile avpclTile(4, 4);
+				memset(avpclTile.data, 0, sizeof(avpclTile.data) );
+				for (uint32_t blockY = 0; blockY < 4; ++blockY)
+				{
+					const Vector4* row = (const Vector4*)&src[(yy + blockY)*_stride + xx*sizeof(float)*4]; // tile rows are _stride bytes apart
+					for (uint32_t blockX = 0; blockX < 4; ++blockX)
+					{
+						Vector4 color = row[blockX];
+						avpclTile.data[blockY][blockX] = color * 255.0f; // presumably maps [0,1] input to the encoder's 0..255 range
+						avpclTile.importance_map[blockY][blockX] = 1.0f;
+					}
+				}
+
+				AVPCL::compress(avpclTile, &dst[( (yy/4)*blocksPerRow + xx/4) * 16]);
+			}
+		}
+	}
+
+} //namespace nvtt
diff --git a/3rdparty/bimg/3rdparty/nvtt/nvtt.h b/3rdparty/bimg/3rdparty/nvtt/nvtt.h
new file mode 100644
index 0000000..a37c7cf
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/nvtt/nvtt.h
@@ -0,0 +1,13 @@
+#ifndef NVTT_H
+#define NVTT_H
+
+#include <stdint.h>
+
+namespace nvtt
+{
+void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output);
+void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output);
+// Both read 4-float pixels from _input (_stride is a byte offset per source row) and write 16-byte blocks to _output.
+} // namespace nvtt
+
+#endif // NVTT_H
diff --git a/3rdparty/bimg/3rdparty/pvrtc/AlphaBitmap.h b/3rdparty/bimg/3rdparty/pvrtc/AlphaBitmap.h
new file mode 100644
index 0000000..4197332
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/AlphaBitmap.h
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "Bitmap.h"
+
+namespace Javelin {
+
+class AlphaBitmap : public Bitmap {
+public:
+    AlphaBitmap() {}
+    // Forwards w and h to Bitmap with a third argument of 1 -- presumably bytes per pixel; confirm in Bitmap.h.
+    AlphaBitmap(int w, int h)
+        : Bitmap(w, h, 1) {
+    }
+    // Const and mutable accessors for the data pointer.
+    const unsigned char *GetData() const { return data; }
+
+    unsigned char *GetData() { return data; }
+};
+
+}
diff --git a/3rdparty/bimg/3rdparty/pvrtc/BitScale.cpp b/3rdparty/bimg/3rdparty/pvrtc/BitScale.cpp
new file mode 100644
index 0000000..3e74193
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/BitScale.cpp
@@ -0,0 +1,183 @@
+#include "BitScale.h"
+
+const uint8_t Javelin::Data::BITSCALE_5_TO_8[32] = {
+ 0, 8, 16, 24, 32, 41, 49, 57, 65, 74,
+ 82, 90, 98, 106, 115, 123, 131, 139, 148, 156,
+ 164, 172, 180, 189, 197, 205, 213, 222, 230, 238,
+ 246, 255};
+
+const uint8_t Javelin::Data::BITSCALE_4_TO_8[16] = {
+ 0, 17, 34, 51, 68, 85, 102, 119, 136, 153,
+ 170, 187, 204, 221, 238, 255};
+
+const uint8_t Javelin::Data::BITSCALE_3_TO_8[8] = {
+ 0, 36, 72, 109, 145, 182, 218, 255};
+
+const uint8_t Javelin::Data::BITSCALE_8_TO_5_FLOOR[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
+ 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
+ 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
+ 3, 3, 3, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 7, 7,
+ 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 9, 9, 9, 9, 9,
+ 9, 9, 9, 10, 10, 10, 10, 10, 10, 10,
+ 10, 11, 11, 11, 11, 11, 11, 11, 11, 12,
+ 12, 12, 12, 12, 12, 12, 12, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 14, 14, 14, 14,
+ 14, 14, 14, 14, 15, 15, 15, 15, 15, 15,
+ 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
+ 17, 17, 17, 17, 17, 17, 17, 17, 17, 18,
+ 18, 18, 18, 18, 18, 18, 18, 19, 19, 19,
+ 19, 19, 19, 19, 19, 20, 20, 20, 20, 20,
+ 20, 20, 20, 21, 21, 21, 21, 21, 21, 21,
+ 21, 22, 22, 22, 22, 22, 22, 22, 22, 22,
+ 23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
+ 24, 24, 24, 24, 24, 24, 25, 25, 25, 25,
+ 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
+ 26, 26, 26, 27, 27, 27, 27, 27, 27, 27,
+ 27, 28, 28, 28, 28, 28, 28, 28, 28, 29,
+ 29, 29, 29, 29, 29, 29, 29, 30, 30, 30,
+ 30, 30, 30, 30, 30, 31};
+
+const uint8_t Javelin::Data::BITSCALE_8_TO_4_FLOOR[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 15};
+
+const uint8_t Javelin::Data::BITSCALE_8_TO_3_FLOOR[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 7};
+
+const uint8_t Javelin::Data::BITSCALE_8_TO_5_CEIL[256] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 2,
+ 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
+ 3, 3, 3, 3, 3, 4, 4, 4, 4, 4,
+ 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7, 8, 8,
+ 8, 8, 8, 8, 8, 8, 9, 9, 9, 9,
+ 9, 9, 9, 9, 9, 10, 10, 10, 10, 10,
+ 10, 10, 10, 11, 11, 11, 11, 11, 11, 11,
+ 11, 12, 12, 12, 12, 12, 12, 12, 12, 13,
+ 13, 13, 13, 13, 13, 13, 13, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 15, 15, 15, 15,
+ 15, 15, 15, 15, 16, 16, 16, 16, 16, 16,
+ 16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
+ 18, 18, 18, 18, 18, 18, 18, 18, 18, 19,
+ 19, 19, 19, 19, 19, 19, 19, 20, 20, 20,
+ 20, 20, 20, 20, 20, 21, 21, 21, 21, 21,
+ 21, 21, 21, 22, 22, 22, 22, 22, 22, 22,
+ 22, 23, 23, 23, 23, 23, 23, 23, 23, 23,
+ 24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
+ 25, 25, 25, 25, 25, 25, 26, 26, 26, 26,
+ 26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
+ 27, 27, 27, 28, 28, 28, 28, 28, 28, 28,
+ 28, 29, 29, 29, 29, 29, 29, 29, 29, 30,
+ 30, 30, 30, 30, 30, 30, 30, 31, 31, 31,
+ 31, 31, 31, 31, 31, 31};
+
+const uint8_t Javelin::Data::BITSCALE_8_TO_4_CEIL[256] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 9, 9, 9,
+ 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+ 9, 9, 9, 9, 10, 10, 10, 10, 10, 10,
+ 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+ 10, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+ 11, 11, 11, 11, 11, 11, 11, 11, 12, 12,
+ 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12, 13, 13, 13, 13, 13,
+ 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+ 13, 13, 14, 14, 14, 14, 14, 14, 14, 14,
+ 14, 14, 14, 14, 14, 14, 14, 14, 14, 15,
+ 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+ 15, 15, 15, 15, 15, 15};
+
+const uint8_t Javelin::Data::BITSCALE_8_TO_3_CEIL[256] = {
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7};
+
diff --git a/3rdparty/bimg/3rdparty/pvrtc/BitScale.h b/3rdparty/bimg/3rdparty/pvrtc/BitScale.h
new file mode 100644
index 0000000..b600fe9
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/BitScale.h
@@ -0,0 +1,28 @@
+//============================================================================
+
+#pragma once
+
+#include <stdint.h>
+
+//============================================================================
+
+namespace Javelin
+{
+  namespace Data
+  {
+//============================================================================
+    // Channel bit-depth rescaling tables indexed by the source value; FLOOR/CEIL presumably names the rounding direction (TODO confirm against BitScale.cpp).
+    extern const uint8_t BITSCALE_5_TO_8[32];
+    extern const uint8_t BITSCALE_4_TO_8[16];
+    extern const uint8_t BITSCALE_3_TO_8[8];
+    extern const uint8_t BITSCALE_8_TO_5_FLOOR[256];
+    extern const uint8_t BITSCALE_8_TO_4_FLOOR[256];
+    extern const uint8_t BITSCALE_8_TO_3_FLOOR[256];
+    extern const uint8_t BITSCALE_8_TO_5_CEIL[256];
+    extern const uint8_t BITSCALE_8_TO_4_CEIL[256];
+    extern const uint8_t BITSCALE_8_TO_3_CEIL[256];
+    
+//============================================================================
+  } // namespace Data
+} // namespace Javelin
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/BitUtility.h b/3rdparty/bimg/3rdparty/pvrtc/BitUtility.h
new file mode 100644
index 0000000..588ff3e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/BitUtility.h
@@ -0,0 +1,19 @@
+#pragma once
+
+namespace Javelin {
+
+class BitUtility {
+public:
+    // True when exactly one bit of x is set; 0 is not a power of two.
+    static bool IsPowerOf2(unsigned int x) {
+        return x != 0 && (x & (x - 1)) == 0;
+    }
+    // Rotates value right by shift bits; shift is reduced modulo the bit width.
+    static unsigned int RotateRight(unsigned int value, unsigned int shift) {
+        shift &= sizeof(value) * 8 - 1;
+        if (shift == 0) return value;   // a full-width left shift below would be undefined
+        return (value >> shift) | (value << (sizeof(value) * 8 - shift));
+    }
+};
+
+}
diff --git a/3rdparty/bimg/3rdparty/pvrtc/Bitmap.h b/3rdparty/bimg/3rdparty/pvrtc/Bitmap.h
new file mode 100644
index 0000000..409ef1e
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/Bitmap.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "Point2.h"
+
+namespace Javelin {
+
+class Bitmap {
+public:
+    int width;            // width passed to the constructor
+    int height;           // height passed to the constructor
+    unsigned char *data;  // buffer owned by this object; released in the destructor
+
+    // Empty bitmap. data is null so that destroying a default-constructed
+    // bitmap is safe (delete[] on a null pointer is a no-op).
+    Bitmap() : width(0), height(0), data(0) {}
+    // Allocates an uninitialised buffer of w * h * bytesPerPixel bytes.
+    Bitmap(int w, int h, int bytesPerPixel)
+        : width(w)
+        , height(h)
+        , data(new unsigned char[w * h * bytesPerPixel]) {
+    }
+    // NOTE: no copy constructor/assignment is declared, so a copy would share
+    // data and free it twice; pass Bitmap objects by reference.
+    virtual ~Bitmap() {
+        delete [] data;
+    }
+
+    Point2<int> GetSize() const { return Point2<int>(width, height); }
+    int GetArea() const { return width * height; }
+    int GetBitmapWidth() const { return width; }
+    int GetBitmapHeight() const { return height; }
+
+    const unsigned char *GetRawData() const { return data; }
+};
+
+}
diff --git a/3rdparty/bimg/3rdparty/pvrtc/ColorRgba.h b/3rdparty/bimg/3rdparty/pvrtc/ColorRgba.h
new file mode 100644
index 0000000..e3ec6aa
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/ColorRgba.h
@@ -0,0 +1,152 @@
+#pragma once
+
+namespace Javelin {
+
+// RGB triple with channels of type T. Arithmetic results are widened to int.
+template<typename T>
+class ColorRgb {
+public:
+    T r;
+    T g;
+    T b;
+
+    ColorRgb()
+        : r(0)
+        , g(0)
+        , b(0) {
+    }
+
+    ColorRgb(T red, T green, T blue)
+        : r(red)
+        , g(green)
+        , b(blue) {
+    }
+
+    ColorRgb(const ColorRgb<T> &x)
+        : r(x.r)
+        , g(x.g)
+        , b(x.b) {
+    }
+    // Scales every channel by x. Const, like the other operators, so it works on const colours.
+    ColorRgb<int> operator *(int x) const {
+        return ColorRgb<int>(r * x, g * x, b * x);
+    }
+    // Channel-wise sum.
+    ColorRgb<int> operator +(const ColorRgb<T> &x) const {
+        return ColorRgb<int>(r + (int)x.r, g + (int)x.g, b + (int)x.b);
+    }
+    // Channel-wise difference (this - x).
+    ColorRgb<int> operator -(const ColorRgb<T> &x) const {
+        return ColorRgb<int>(r - (int)x.r, g - (int)x.g, b - (int)x.b);
+    }
+    // Dot product of the two colours.
+    int operator %(const ColorRgb<T> &x) const {
+        return r * (int)x.r + g * (int)x.g + b * (int)x.b;
+    }
+
+    bool operator ==(const ColorRgb<T> &x) const {
+        return r == x.r && g == x.g && b == x.b;
+    }
+
+    bool operator !=(const ColorRgb<T> &x) const {
+        return r != x.r || g != x.g || b != x.b;
+    }
+    // Lowers each channel to the matching channel of x when that one is smaller.
+    void SetMin(const ColorRgb<T> &x) {
+        if (x.r < r) {
+            r = x.r;
+        }
+        if (x.g < g) {
+            g = x.g;
+        }
+        if (x.b < b) {
+            b = x.b;
+        }
+    }
+    // Raises each channel to the matching channel of x when that one is larger.
+    void SetMax(const ColorRgb<T> &x) {
+        if (x.r > r) {
+            r = x.r;
+        }
+        if (x.g > g) {
+            g = x.g;
+        }
+        if (x.b > b) {
+            b = x.b;
+        }
+    }
+};
+
+// RGBA colour: ColorRgb<T> plus an alpha channel. Arithmetic results use int channels.
+template<typename T>
+class ColorRgba : public ColorRgb<T> {
+public:
+    T a;
+    ColorRgba() :
+        a(0) {
+    }
+
+    ColorRgba(T red, T green, T blue, T alpha)
+        : ColorRgb<T>(red, green, blue)
+        , a(alpha) {
+    }
+
+    ColorRgba(const ColorRgba<T> &x)
+        : ColorRgb<T>(x.r, x.g, x.b)
+        , a(x.a) {
+    }
+    // The operators build ColorRgba<int> (their declared return type) so they also compile when T is not int.
+    ColorRgba<int> operator *(int x) const {
+        return ColorRgba<int>(ColorRgb<T>::r * x,
+                              ColorRgb<T>::g * x,
+                              ColorRgb<T>::b * x,
+                              a * x);
+    }
+
+    ColorRgba<int> operator +(const ColorRgba<T> &x) const {
+        return ColorRgba<int>(ColorRgb<T>::r + (int)x.r,
+                              ColorRgb<T>::g + (int)x.g,
+                              ColorRgb<T>::b + (int)x.b,
+                              a + (int)x.a);
+    }
+
+    ColorRgba<int> operator -(const ColorRgba<T> &x) const {
+        return ColorRgba<int>(ColorRgb<T>::r - (int)x.r,
+                              ColorRgb<T>::g - (int)x.g,
+                              ColorRgb<T>::b - (int)x.b,
+                              a - (int)x.a);
+    }
+
+    int operator %(const ColorRgba<T> &x) const {
+        return ColorRgb<T>::r * (int)x.r +
+               ColorRgb<T>::g * (int)x.g +
+               ColorRgb<T>::b * (int)x.b +
+               a * (int)x.a;
+    }
+
+    bool operator ==(const ColorRgba<T> &x) const {
+        return ColorRgb<T>::r == x.r && ColorRgb<T>::g == x.g &&
+               ColorRgb<T>::b == x.b && a == x.a;
+    }
+
+    bool operator !=(const ColorRgba<T> &x) const {
+        return ColorRgb<T>::r != x.r || ColorRgb<T>::g != x.g ||
+               ColorRgb<T>::b != x.b || a != x.a;
+    }
+
+    void SetMin(const ColorRgba<T> &x) {
+        ColorRgb<T>::SetMin(x);
+        if (x.a < a) {
+            a = x.a;
+        }
+    }
+
+    void SetMax(const ColorRgba<T> &x) {
+        ColorRgb<T>::SetMax(x);
+        if (x.a > a) {
+            a = x.a;
+        }
+    }
+};
+
+}
diff --git a/3rdparty/bimg/3rdparty/pvrtc/Interval.h b/3rdparty/bimg/3rdparty/pvrtc/Interval.h
new file mode 100644
index 0000000..a7252e8
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/Interval.h
@@ -0,0 +1,21 @@
+#pragma once
+
+namespace Javelin {
+
+// Pair of bounds that grows to enclose values; T must provide SetMin() and SetMax().
+template<typename T>
+class Interval {
+public:
+    T min;
+    T max;
+    // Both bounds are default-initialised; callers assign them before use.
+    Interval() {}
+
+    // Folds x into both bounds and returns this interval so calls can be chained.
+    Interval &operator|=(const T &x) {
+        this->min.SetMin(x);
+        this->max.SetMax(x);
+        return *this;
+    }
+};
+
+}
diff --git a/3rdparty/bimg/3rdparty/pvrtc/LICENSE.TXT b/3rdparty/bimg/3rdparty/pvrtc/LICENSE.TXT
new file mode 100644
index 0000000..974fc09
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/LICENSE.TXT
@@ -0,0 +1,25 @@
+Copyright © 2014, Jeffrey Lim. All Rights Reserved.
+
+Redistribution and use in source and binary forms, with or without 
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, 
+   this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, 
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 
+FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR 
+BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+POSSIBILITY OF SUCH DAMAGE.
diff --git a/3rdparty/bimg/3rdparty/pvrtc/MortonTable.cpp b/3rdparty/bimg/3rdparty/pvrtc/MortonTable.cpp
new file mode 100644
index 0000000..29a5af6
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/MortonTable.cpp
@@ -0,0 +1,43 @@
+//============================================================================
+
+#include "MortonTable.h"
+
+//============================================================================
+
+const unsigned short Javelin::Data::MORTON_TABLE[256] = // entry i spreads the bits of i apart: bit k of i becomes bit 2k
+{
+    0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
+    0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
+    0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
+    0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
+    0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
+    0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
+    0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
+    0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
+    0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
+    0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
+    0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
+    0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
+    0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
+    0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
+    0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
+    0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
+    0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
+    0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
+    0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
+    0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
+    0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
+    0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
+    0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
+    0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
+    0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
+    0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
+    0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
+    0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
+    0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
+    0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
+    0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
+    0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555
+};
+
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/MortonTable.h b/3rdparty/bimg/3rdparty/pvrtc/MortonTable.h
new file mode 100644
index 0000000..7a27e59
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/MortonTable.h
@@ -0,0 +1,18 @@
+//============================================================================
+
+#pragma once
+
+//============================================================================
+
+namespace Javelin
+{
+	namespace Data
+	{
+//============================================================================
+		// Bit-spreading table used to build Morton (interleaved) indices; defined in MortonTable.cpp.
+		extern const unsigned short MORTON_TABLE[256];
+		
+//============================================================================
+	} // namespace Data
+} // namespace Javelin
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/Point2.h b/3rdparty/bimg/3rdparty/pvrtc/Point2.h
new file mode 100644
index 0000000..89fa4b6
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/Point2.h
@@ -0,0 +1,17 @@
+#pragma once
+
+namespace Javelin {
+
+// 2D coordinate / size pair with components of type T.
+template<typename T>
+class Point2 {
+public:
+    T x;
+    T y;
+
+    // Parameters are of type T (rather than int) so that non-int
+    // instantiations are not silently truncated on construction.
+    Point2(T a, T b) : x(a), y(b) {}
+};
+
+}
diff --git a/3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.cpp b/3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.cpp
new file mode 100644
index 0000000..d8a36b3
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.cpp
@@ -0,0 +1,144 @@
+//============================================================================
+
+#include "PvrTcDecoder.h"
+#include "PvrTcPacket.h"
+
+#include "MortonTable.h"
+#include <assert.h>
+
+//============================================================================
+
+using namespace Javelin;
+using Data::MORTON_TABLE;
+
+//============================================================================
+
+inline unsigned PvrTcDecoder::GetMortonNumber(int x, int y)
+{   // Interleaves x (odd bits) and y (even bits). Entries are cast to unsigned first: they would otherwise promote to int and "<< 17" could overflow.
+    return (unsigned)MORTON_TABLE[x >> 8] << 17 | (unsigned)MORTON_TABLE[y >> 8] << 16 | (unsigned)MORTON_TABLE[x & 0xFF] << 1 | (unsigned)MORTON_TABLE[y & 0xFF];
+}
+
+//============================================================================
+
+void PvrTcDecoder::DecodeRgb4Bpp(ColorRgb<unsigned char>* result, const Point2<int>& size, const void* data)
+{
+    assert(size.x == size.y);
+	// Decodes a square PVRTC 4bpp image into result, written row-major with size.x pixels per row.
+	const int blocks = size.x / 4;
+	const int blockMask = blocks-1;
+    const PvrTcPacket* packets = static_cast<const PvrTcPacket*>(data);
+    // blockMask wraps neighbour coordinates at the image edges; this only works for a power-of-two block count.
+    for(int y = 0; y < blocks; ++y)
+    {
+        for(int x = 0; x < blocks; ++x)
+        {
+            const PvrTcPacket* packet = packets + GetMortonNumber(x, y);
+            
+            unsigned mod = packet->modulationData;
+			const unsigned char (*weights)[4] = PvrTcPacket::WEIGHTS + 4*packet->usePunchthroughAlpha;
+            const unsigned char (*factor)[4] = PvrTcPacket::BILINEAR_FACTORS;
+			// mod is consumed two bits per pixel and factor advances one BILINEAR_FACTORS row per pixel.
+			for(int py = 0; py < 4; ++py)
+			{
+				const int yOffset = (py < 2) ? -1 : 0;
+				const int y0 = (y + yOffset) & blockMask;
+				const int y1 = (y0+1) & blockMask;
+				
+				for(int px = 0; px < 4; ++px)
+				{
+					const int xOffset = (px < 2) ? -1 : 0;
+					const int x0 = (x + xOffset) & blockMask;
+					const int x1 = (x0+1) & blockMask;
+					// The 2x2 neighbourhood of packets whose colours are blended for this pixel.
+					const PvrTcPacket* p0 = packets + GetMortonNumber(x0, y0);
+					const PvrTcPacket* p1 = packets + GetMortonNumber(x1, y0);
+					const PvrTcPacket* p2 = packets + GetMortonNumber(x0, y1);
+					const PvrTcPacket* p3 = packets + GetMortonNumber(x1, y1);
+					
+					ColorRgb<int> ca = p0->GetColorRgbA() * (*factor)[0] +
+									   p1->GetColorRgbA() * (*factor)[1] +
+									   p2->GetColorRgbA() * (*factor)[2] +
+									   p3->GetColorRgbA() * (*factor)[3];
+					
+					ColorRgb<int> cb = p0->GetColorRgbB() * (*factor)[0] +
+									   p1->GetColorRgbB() * (*factor)[1] +
+									   p2->GetColorRgbB() * (*factor)[2] +
+									   p3->GetColorRgbB() * (*factor)[3];
+					// Mix the interpolated A and B colours using the weights selected by the pixel's two modulation bits.
+					const unsigned char* w = weights[mod&3];
+					ColorRgb<unsigned char> c;
+					c.r = (ca.r * w[0] + cb.r * w[1]) >> 7;
+					c.g = (ca.g * w[0] + cb.g * w[1]) >> 7;
+					c.b = (ca.b * w[0] + cb.b * w[1]) >> 7;
+					
+					result[(py+y*4)*size.x + (px+x*4)] = c;
+					mod >>= 2;
+					factor++;
+				}
+			}
+        }
+    }
+}
+
+void PvrTcDecoder::DecodeRgba4Bpp(ColorRgba<unsigned char>* result, const Point2<int>& size, const void* data)
+{
+    assert(size.x == size.y);
+    // Decodes a square PVRTC 4bpp image (with alpha) into result, written row-major with size.x pixels per row.
+	const int blocks = size.x / 4;
+	const int blockMask = blocks-1;
+    const PvrTcPacket* packets = static_cast<const PvrTcPacket*>(data);
+    // blockMask wraps neighbour coordinates at the image edges; this only works for a power-of-two block count.
+    for(int y = 0; y < blocks; ++y)
+    {
+        for(int x = 0; x < blocks; ++x)
+        {
+            const PvrTcPacket* packet = packets + GetMortonNumber(x, y);
+            
+            unsigned mod = packet->modulationData;
+            const unsigned char (*weights)[4] = PvrTcPacket::WEIGHTS + 4*packet->usePunchthroughAlpha;
+            const unsigned char (*factor)[4] = PvrTcPacket::BILINEAR_FACTORS;
+			// mod is consumed two bits per pixel and factor advances one BILINEAR_FACTORS row per pixel.
+			for(int py = 0; py < 4; ++py)
+			{
+				const int yOffset = (py < 2) ? -1 : 0;
+				const int y0 = (y + yOffset) & blockMask;
+				const int y1 = (y0+1) & blockMask;
+				
+				for(int px = 0; px < 4; ++px)
+				{
+					const int xOffset = (px < 2) ? -1 : 0;
+					const int x0 = (x + xOffset) & blockMask;
+					const int x1 = (x0+1) & blockMask;
+					// The 2x2 neighbourhood of packets whose colours are blended for this pixel.
+					const PvrTcPacket* p0 = packets + GetMortonNumber(x0, y0);
+					const PvrTcPacket* p1 = packets + GetMortonNumber(x1, y0);
+					const PvrTcPacket* p2 = packets + GetMortonNumber(x0, y1);
+					const PvrTcPacket* p3 = packets + GetMortonNumber(x1, y1);
+					
+					ColorRgba<int> ca = p0->GetColorRgbaA() * (*factor)[0] +
+									   	p1->GetColorRgbaA() * (*factor)[1] +
+									   	p2->GetColorRgbaA() * (*factor)[2] +
+										p3->GetColorRgbaA() * (*factor)[3];
+					
+					ColorRgba<int> cb = p0->GetColorRgbaB() * (*factor)[0] +
+										p1->GetColorRgbaB() * (*factor)[1] +
+										p2->GetColorRgbaB() * (*factor)[2] +
+										p3->GetColorRgbaB() * (*factor)[3];
+					// Colour channels use weights w[0]/w[1]; the alpha channel uses its own pair w[2]/w[3].
+					const unsigned char* w = weights[mod&3];
+					ColorRgba<unsigned char> c;
+					c.r = (ca.r * w[0] + cb.r * w[1]) >> 7;
+					c.g = (ca.g * w[0] + cb.g * w[1]) >> 7;
+					c.b = (ca.b * w[0] + cb.b * w[1]) >> 7;
+					c.a = (ca.a * w[2] + cb.a * w[3]) >> 7;
+					
+					result[(py+y*4)*size.x + (px+x*4)] = c;
+					mod >>= 2;
+					factor++;
+				}
+			}
+        }
+    }
+}
+
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.h b/3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.h
new file mode 100644
index 0000000..1b6fcf9
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/PvrTcDecoder.h
@@ -0,0 +1,25 @@
+//============================================================================
+
+#pragma once
+#include "Point2.h"
+#include "ColorRgba.h"
+
+//============================================================================
+
+namespace Javelin
+{
+//============================================================================
+
+    class PvrTcDecoder
+    {
+    public: // Both decoders assert a square image and write size.x * size.y pixels to result.
+        static void DecodeRgb4Bpp(ColorRgb<unsigned char>* result, const Point2<int>& size, const void* data);
+        static void DecodeRgba4Bpp(ColorRgba<unsigned char>* result, const Point2<int>& size, const void* data);
+        // Maps block coordinates to the packet's index in the Morton-ordered input data.
+    private:
+		static unsigned GetMortonNumber(int x, int y);
+    };
+    
+//============================================================================
+}
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.cpp b/3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.cpp
new file mode 100644
index 0000000..56cc8e0
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.cpp
@@ -0,0 +1,464 @@
+//============================================================================
+
+#include "PvrTcEncoder.h"
+#include "AlphaBitmap.h"
+#include "PvrTcPacket.h"
+#include "RgbBitmap.h"
+#include "RgbaBitmap.h"
+#include "MortonTable.h"
+#include "BitUtility.h"
+#include "Interval.h"
+#include <assert.h>
+#include <math.h>
+#include <stdint.h>
+
+//============================================================================
+
+using namespace Javelin;
+using Data::MORTON_TABLE;
+
+//============================================================================
+
+static const unsigned char MODULATION_LUT[16] = // maps the top four bits of an alpha value to a 2-bit modulation value
+{
+	0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3
+};
+
+//============================================================================
+
+inline unsigned PvrTcEncoder::GetMortonNumber(int x, int y)
+{	// Interleaves x (odd bits) and y (even bits). Entries are cast to unsigned first: they would otherwise promote to int and "<< 17" could overflow.
+	return (unsigned)MORTON_TABLE[x >> 8] << 17 | (unsigned)MORTON_TABLE[y >> 8] << 16 | (unsigned)MORTON_TABLE[x & 0xFF] << 1 | (unsigned)MORTON_TABLE[y & 0xFF];
+}
+
+//============================================================================
+
+void PvrTcEncoder::EncodeAlpha2Bpp(void* result, const AlphaBitmap& bitmap)
+{
+	// Encodes a square, power-of-two alpha bitmap at 2 bits per pixel: every
+	// 8x4 pixel block becomes one packet whose 32 modulation bits are the top
+	// bits of the block's alpha values, first pixel in the lowest bit.
+	const int dimension = bitmap.GetBitmapWidth();
+	assert(dimension == bitmap.GetBitmapHeight());
+	assert(BitUtility::IsPowerOf2(dimension));
+
+	const int blocksAcross = dimension / 8;	// blocks are 8 pixels wide...
+	const int blocksDown = dimension / 4;	// ...and 4 pixels tall
+	const unsigned char* const pixels = bitmap.GetRawData();
+	PvrTcPacket* const packetBase = static_cast<PvrTcPacket*>(result);
+
+	for(int by = 0; by < blocksDown; ++by)
+	{
+		for(int bx = 0; bx < blocksAcross; ++bx)
+		{
+			PvrTcPacket& out = packetBase[GetMortonNumber(bx, by)];
+			out.usePunchthroughAlpha = 0;
+			out.colorAIsOpaque = 0;
+			out.colorA = 0x7ff;		// White, with 0 alpha
+			out.colorBIsOpaque = 1;
+			out.colorB = 0x7fff;	// White with full alpha
+
+			// Top-left pixel of this block.
+			const unsigned char* blockOrigin = pixels + by*4*dimension + bx*8;
+			uint32_t bits = 0;
+			for(int row = 0; row < 4; ++row)
+			{
+				const unsigned char* rowPixels = blockOrigin + row*dimension;
+				for(int col = 0; col < 8; ++col)
+				{
+					// OR the pixel's top bit into bit 0, then rotate it towards the top.
+					bits = BitUtility::RotateRight(bits | (rowPixels[col] >> 7), 1);
+				}
+			}
+			out.modulationData = bits;
+		}
+	}
+}
+
+void PvrTcEncoder::EncodeAlpha4Bpp(void* result, const AlphaBitmap& bitmap)
+{
+	// Encodes a square, power-of-two alpha bitmap at 4 bits per pixel: every
+	// 4x4 pixel block becomes one packet holding a 2-bit modulation value per
+	// pixel, looked up from the top four bits of the alpha value.
+	const int dimension = bitmap.GetBitmapWidth();
+	assert(dimension == bitmap.GetBitmapHeight());
+	assert(BitUtility::IsPowerOf2(dimension));
+
+	const int blockCount = dimension / 4;	// blocks per row and per column
+	const unsigned char* const pixels = bitmap.GetRawData();
+	PvrTcPacket* const packetBase = static_cast<PvrTcPacket*>(result);
+
+	for(int by = 0; by < blockCount; ++by)
+	{
+		for(int bx = 0; bx < blockCount; ++bx)
+		{
+			PvrTcPacket& out = packetBase[GetMortonNumber(bx, by)];
+			out.usePunchthroughAlpha = 0;
+			out.colorAIsOpaque = 0;
+			out.colorA = 0x7ff;		// White, with 0 alpha
+			out.colorBIsOpaque = 1;
+			out.colorB = 0x7fff;	// White with full alpha
+
+			// Top-left pixel of this block.
+			const unsigned char* blockOrigin = pixels + (by*dimension + bx)*4;
+			uint32_t bits = 0;
+			for(int row = 0; row < 4; ++row)
+			{
+				const unsigned char* rowPixels = blockOrigin + row*dimension;
+				for(int col = 0; col < 4; ++col)
+				{
+					// Insert the 2-bit value at the bottom, then rotate it towards the top.
+					bits = BitUtility::RotateRight(bits | MODULATION_LUT[rowPixels[col] >> 4], 2);
+				}
+			}
+			out.modulationData = bits;
+		}
+	}
+}
+
+//============================================================================
+
+typedef Interval<ColorRgb<unsigned char> > ColorRgbBoundingBox;
+
+// Computes the per-channel minimum and maximum colour of the 4x4 pixel
+// block at block coordinates (blockX, blockY) of an RGB bitmap and stores
+// them in cbb.min / cbb.max.
+static void CalculateBoundingBox(ColorRgbBoundingBox& cbb, const RgbBitmap& bitmap, int blockX, int blockY)
+{
+	const int stride = bitmap.GetBitmapWidth();
+	const ColorRgb<unsigned char>* block = bitmap.GetData() + blockY * 4 * stride + blockX * 4;
+
+	// Seed both bounds with the first pixel, then fold in the other fifteen.
+	cbb.min = block[0];
+	cbb.max = block[0];
+
+	for(int row = 0; row < 4; ++row)
+	{
+		const ColorRgb<unsigned char>* rowPixels = block + row * stride;
+		for(int col = 0; col < 4; ++col)
+		{
+			// Pixel (0,0) is already the seed, so skip it.
+			if(row == 0 && col == 0)
+			{
+				continue;
+			}
+			cbb |= rowPixels[col];
+		}
+	}
+}
+
+void PvrTcEncoder::EncodeRgb4Bpp(void* result, const RgbBitmap& bitmap)
+{
+	assert(bitmap.GetBitmapWidth() == bitmap.GetBitmapHeight());
+	assert(BitUtility::IsPowerOf2(bitmap.GetBitmapWidth()));
+	const int size = bitmap.GetBitmapWidth();
+	const int blocks = size / 4;
+	const int blockMask = blocks-1;
+	// Encodes a square, power-of-two RGB bitmap into PVRTC 4bpp packets stored in result in Morton order.
+	PvrTcPacket* packets = static_cast<PvrTcPacket*>(result);
+	// Pass 1: store each 4x4 block's per-channel minimum as colour A and maximum as colour B.
+	for(int y = 0; y < blocks; ++y)
+	{
+		for(int x = 0; x < blocks; ++x)
+		{
+			ColorRgbBoundingBox cbb;
+			CalculateBoundingBox(cbb, bitmap, x, y);
+			PvrTcPacket* packet = packets + GetMortonNumber(x, y);
+			packet->usePunchthroughAlpha = 0;
+			packet->SetColorA(cbb.min);
+			packet->SetColorB(cbb.max);
+		}
+	}
+	// Pass 2: pick each pixel's 2-bit modulation by projecting it onto the interpolated A->B colour axis.
+	for(int y = 0; y < blocks; ++y)
+	{
+		for(int x = 0; x < blocks; ++x)
+		{
+			const unsigned char (*factor)[4] = PvrTcPacket::BILINEAR_FACTORS;
+			const ColorRgb<unsigned char>* data = bitmap.GetData() + y * 4 * size + x * 4;
+
+			uint32_t modulationData = 0;
+			
+			for(int py = 0; py < 4; ++py)
+			{
+				const int yOffset = (py < 2) ? -1 : 0;
+				const int y0 = (y + yOffset) & blockMask;
+				const int y1 = (y0+1) & blockMask;
+
+				for(int px = 0; px < 4; ++px)
+				{
+					const int xOffset = (px < 2) ? -1 : 0;
+					const int x0 = (x + xOffset) & blockMask;
+					const int x1 = (x0+1) & blockMask;
+					
+					const PvrTcPacket* p0 = packets + GetMortonNumber(x0, y0);
+					const PvrTcPacket* p1 = packets + GetMortonNumber(x1, y0);
+					const PvrTcPacket* p2 = packets + GetMortonNumber(x0, y1);
+					const PvrTcPacket* p3 = packets + GetMortonNumber(x1, y1);
+					// Interpolate colours A and B across the 2x2 packet neighbourhood (wrapping at the edges).
+					ColorRgb<int> ca = p0->GetColorRgbA() * (*factor)[0] +
+									   p1->GetColorRgbA() * (*factor)[1] +
+									   p2->GetColorRgbA() * (*factor)[2] +
+									   p3->GetColorRgbA() * (*factor)[3];
+					
+					ColorRgb<int> cb = p0->GetColorRgbB() * (*factor)[0] +
+									   p1->GetColorRgbB() * (*factor)[1] +
+									   p2->GetColorRgbB() * (*factor)[2] +
+									   p3->GetColorRgbB() * (*factor)[3];
+					// The pixel is scaled by 16 to match ca/cb; presumably each BILINEAR_FACTORS row sums to 16 (TODO confirm).
+					const ColorRgb<unsigned char>& pixel = data[py*size + px];
+					ColorRgb<int> d = cb - ca;
+					ColorRgb<int> p(pixel.r*16, pixel.g*16, pixel.b*16);
+					ColorRgb<int> v = p - ca;
+					
+					// PVRTC uses weightings of 0, 3/8, 5/8 and 1
+					// The boundaries for these are 3/16, 1/2 (=8/16), 13/16
+					int projection = (v % d) * 16;
+					int lengthSquared = d % d;
+					if(projection > 3*lengthSquared) modulationData++;
+					if(projection > 8*lengthSquared) modulationData++;
+					if(projection > 13*lengthSquared) modulationData++;
+					
+					modulationData = BitUtility::RotateRight(modulationData, 2);
+					
+					factor++;
+				}
+			}
+
+			PvrTcPacket* packet = packets + GetMortonNumber(x, y);
+			packet->modulationData = modulationData;
+		}
+	}
+}
+
+//============================================================================
+
+// Computes the per-channel RGB minimum and maximum of the 4x4 pixel block
+// at block coordinates (blockX, blockY) of an RGBA bitmap and stores them
+// in cbb.min / cbb.max; the alpha channel does not take part.
+static void CalculateBoundingBox(ColorRgbBoundingBox& cbb, const RgbaBitmap& bitmap, int blockX, int blockY)
+{
+	const int stride = bitmap.GetBitmapWidth();
+	const ColorRgba<unsigned char>* block = bitmap.GetData() + blockY * 4 * stride + blockX * 4;
+
+	// Seed both bounds with the first pixel, then fold in the other fifteen.
+	cbb.min = block[0];
+	cbb.max = block[0];
+
+	for(int row = 0; row < 4; ++row)
+	{
+		const ColorRgba<unsigned char>* rowPixels = block + row * stride;
+		for(int col = 0; col < 4; ++col)
+		{
+			// Pixel (0,0) is already the seed, so skip it.
+			if(row == 0 && col == 0)
+			{
+				continue;
+			}
+
+			cbb |= rowPixels[col];
+		}
+	}
+}
+
+void PvrTcEncoder::EncodeRgb4Bpp(void* result, const RgbaBitmap& bitmap)
+{
+	assert(bitmap.GetBitmapWidth() == bitmap.GetBitmapHeight());
+	assert(BitUtility::IsPowerOf2(bitmap.GetBitmapWidth()));
+	const int size = bitmap.GetBitmapWidth();
+	const int blocks = size / 4;
+	const int blockMask = blocks-1;
+	// Encodes the RGB channels of a square, power-of-two RGBA bitmap into PVRTC 4bpp packets (Morton order); alpha is not read.
+	PvrTcPacket* packets = static_cast<PvrTcPacket*>(result);
+	// Pass 1: store each 4x4 block's per-channel minimum as colour A and maximum as colour B.
+	for(int y = 0; y < blocks; ++y)
+	{
+		for(int x = 0; x < blocks; ++x)
+		{
+			ColorRgbBoundingBox cbb;
+			CalculateBoundingBox(cbb, bitmap, x, y);
+			PvrTcPacket* packet = packets + GetMortonNumber(x, y);
+			packet->usePunchthroughAlpha = 0;
+			packet->SetColorA(cbb.min);
+			packet->SetColorB(cbb.max);
+		}
+	}
+	// Pass 2: pick each pixel's 2-bit modulation by projecting it onto the interpolated A->B colour axis.
+	for(int y = 0; y < blocks; ++y)
+	{
+		for(int x = 0; x < blocks; ++x)
+		{
+			const unsigned char (*factor)[4] = PvrTcPacket::BILINEAR_FACTORS;
+			const ColorRgba<unsigned char>* data = bitmap.GetData() + y * 4 * size + x * 4;
+			
+			uint32_t modulationData = 0;
+			
+			for(int py = 0; py < 4; ++py)
+			{
+				const int yOffset = (py < 2) ? -1 : 0;
+				const int y0 = (y + yOffset) & blockMask;
+				const int y1 = (y0+1) & blockMask;
+
+				for(int px = 0; px < 4; ++px)
+				{
+					const int xOffset = (px < 2) ? -1 : 0;
+					const int x0 = (x + xOffset) & blockMask;
+					const int x1 = (x0+1) & blockMask;
+					
+					const PvrTcPacket* p0 = packets + GetMortonNumber(x0, y0);
+					const PvrTcPacket* p1 = packets + GetMortonNumber(x1, y0);
+					const PvrTcPacket* p2 = packets + GetMortonNumber(x0, y1);
+					const PvrTcPacket* p3 = packets + GetMortonNumber(x1, y1);
+					// Interpolate colours A and B across the 2x2 packet neighbourhood (wrapping at the edges).
+					ColorRgb<int> ca = p0->GetColorRgbA() * (*factor)[0] +
+									   p1->GetColorRgbA() * (*factor)[1] +
+									   p2->GetColorRgbA() * (*factor)[2] +
+									   p3->GetColorRgbA() * (*factor)[3];
+					
+					ColorRgb<int> cb = p0->GetColorRgbB() * (*factor)[0] +
+									   p1->GetColorRgbB() * (*factor)[1] +
+									   p2->GetColorRgbB() * (*factor)[2] +
+									   p3->GetColorRgbB() * (*factor)[3];
+					// Only the RGB part of the pixel is used; it is scaled by 16 to match ca/cb (presumably the factor rows sum to 16, TODO confirm).
+					const ColorRgb<unsigned char>& pixel = data[py*size + px];
+					ColorRgb<int> d = cb - ca;
+					ColorRgb<int> p(pixel.r*16, pixel.g*16, pixel.b*16);
+					ColorRgb<int> v = p - ca;
+					
+					// PVRTC uses weightings of 0, 3/8, 5/8 and 1
+					// The boundaries for these are 3/16, 1/2 (=8/16), 13/16
+					int projection = (v % d) * 16;
+					int lengthSquared = d % d;
+					if(projection > 3*lengthSquared) modulationData++;
+					if(projection > 8*lengthSquared) modulationData++;
+					if(projection > 13*lengthSquared) modulationData++;
+					
+					modulationData = BitUtility::RotateRight(modulationData, 2);
+					
+					factor++;
+				}
+			}
+
+			PvrTcPacket* packet = packets + GetMortonNumber(x, y);
+			packet->modulationData = modulationData;
+		}
+	}
+}
+
+//============================================================================
+
+typedef Interval<ColorRgba<unsigned char> > ColorRgbaBoundingBox;
+
+// Computes the per-channel RGBA minimum and maximum of the 4x4 pixel block
+// at block coordinates (blockX, blockY) of an RGBA bitmap and stores them
+// in cbb.min / cbb.max.
+static void CalculateBoundingBox(ColorRgbaBoundingBox& cbb, const RgbaBitmap& bitmap, int blockX, int blockY)
+{
+	const int stride = bitmap.GetBitmapWidth();
+	const ColorRgba<unsigned char>* block = bitmap.GetData() + blockY * 4 * stride + blockX * 4;
+
+	// Seed both bounds with the first pixel, then fold in the other fifteen.
+	cbb.min = block[0];
+	cbb.max = block[0];
+
+	for(int row = 0; row < 4; ++row)
+	{
+		const ColorRgba<unsigned char>* rowPixels = block + row * stride;
+		for(int col = 0; col < 4; ++col)
+		{
+			// Pixel (0,0) is already the seed, so skip it.
+			if(row == 0 && col == 0)
+			{
+				continue;
+			}
+
+			cbb |= rowPixels[col];
+		}
+	}
+}
+
+void PvrTcEncoder::EncodeRgba4Bpp(void* result, const RgbaBitmap& bitmap)
+{
+	assert(bitmap.GetBitmapWidth() == bitmap.GetBitmapHeight());
+	assert(BitUtility::IsPowerOf2(bitmap.GetBitmapWidth()));
+	const int size = bitmap.GetBitmapWidth();
+	const int blocks = size / 4;
+	const int blockMask = blocks-1;
+	// Encodes a square, power-of-two RGBA bitmap (alpha included) into PVRTC 4bpp packets stored in result in Morton order.
+	PvrTcPacket* packets = static_cast<PvrTcPacket*>(result);
+	// Pass 1: store each 4x4 block's per-channel minimum as colour A and maximum as colour B.
+	for(int y = 0; y < blocks; ++y)
+	{
+		for(int x = 0; x < blocks; ++x)
+		{
+			ColorRgbaBoundingBox cbb;
+			CalculateBoundingBox(cbb, bitmap, x, y);
+			PvrTcPacket* packet = packets + GetMortonNumber(x, y);
+			packet->usePunchthroughAlpha = 0;
+			packet->SetColorA(cbb.min);
+			packet->SetColorB(cbb.max);
+		}
+	}
+	// Pass 2: pick each pixel's 2-bit modulation by projecting it onto the interpolated A->B colour axis.
+	for(int y = 0; y < blocks; ++y)
+	{
+		for(int x = 0; x < blocks; ++x)
+		{
+			const unsigned char (*factor)[4] = PvrTcPacket::BILINEAR_FACTORS;
+			const ColorRgba<unsigned char>* data = bitmap.GetData() + y * 4 * size + x * 4;
+			
+			uint32_t modulationData = 0;
+			
+			for(int py = 0; py < 4; ++py)
+			{
+				const int yOffset = (py < 2) ? -1 : 0;
+				const int y0 = (y + yOffset) & blockMask;
+				const int y1 = (y0+1) & blockMask;
+				
+				for(int px = 0; px < 4; ++px)
+				{
+					const int xOffset = (px < 2) ? -1 : 0;
+					const int x0 = (x + xOffset) & blockMask;
+					const int x1 = (x0+1) & blockMask;
+					
+					const PvrTcPacket* p0 = packets + GetMortonNumber(x0, y0);
+					const PvrTcPacket* p1 = packets + GetMortonNumber(x1, y0);
+					const PvrTcPacket* p2 = packets + GetMortonNumber(x0, y1);
+					const PvrTcPacket* p3 = packets + GetMortonNumber(x1, y1);
+					// Interpolate colours A and B across the 2x2 packet neighbourhood (wrapping at the edges).
+					ColorRgba<int> ca = p0->GetColorRgbaA() * (*factor)[0] +
+										p1->GetColorRgbaA() * (*factor)[1] +
+										p2->GetColorRgbaA() * (*factor)[2] +
+										p3->GetColorRgbaA() * (*factor)[3];
+					
+					ColorRgba<int> cb = p0->GetColorRgbaB() * (*factor)[0] +
+										p1->GetColorRgbaB() * (*factor)[1] +
+										p2->GetColorRgbaB() * (*factor)[2] +
+										p3->GetColorRgbaB() * (*factor)[3];
+					// The pixel is scaled by 16 to match ca/cb; presumably each BILINEAR_FACTORS row sums to 16 (TODO confirm).
+					const ColorRgba<unsigned char>& pixel = data[py*size + px];
+					ColorRgba<int> d = cb - ca;
+					ColorRgba<int> p(pixel.r*16, pixel.g*16, pixel.b*16, pixel.a*16);
+					ColorRgba<int> v = p - ca;
+					
+					// PVRTC uses weightings of 0, 3/8, 5/8 and 1
+					// The boundaries for these are 3/16, 1/2 (=8/16), 13/16
+					int projection = (v % d) * 16;
+					int lengthSquared = d % d;
+					if(projection > 3*lengthSquared) modulationData++;
+					if(projection > 8*lengthSquared) modulationData++;
+					if(projection > 13*lengthSquared) modulationData++;
+					
+					modulationData = BitUtility::RotateRight(modulationData, 2);
+					
+					factor++;
+				}
+			}
+			
+			PvrTcPacket* packet = packets + GetMortonNumber(x, y);
+			packet->modulationData = modulationData;
+		}
+	}
+}
+
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.h b/3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.h
new file mode 100644
index 0000000..fd24484
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/PvrTcEncoder.h
@@ -0,0 +1,43 @@
+//============================================================================
+
+#pragma once
+#include "ColorRgba.h"
+#include "AlphaBitmap.h"
+#include "RgbBitmap.h"
+#include "RgbaBitmap.h"
+
+//============================================================================
+
+namespace Javelin
+{
+//============================================================================
+
+	class AlphaBitmap;
+	class RgbBitmap;
+	class RgbaBitmap;
+	// Collection of static encoders; each takes a caller-supplied output buffer and a source bitmap.
+	class PvrTcEncoder
+	{
+	public:
+		// 2 bits/pixel output: result must hold at least bitmap.GetArea()/4 bytes
+		static void EncodeAlpha2Bpp(void* result, const AlphaBitmap& bitmap);
+		
+		// 4 bits/pixel output: result must hold at least bitmap.GetArea()/2 bytes
+		static void EncodeAlpha4Bpp(void* result, const AlphaBitmap& bitmap);
+		
+		// 4 bits/pixel output: result must hold at least bitmap.GetArea()/2 bytes
+		static void EncodeRgb4Bpp(void* result, const RgbBitmap& bitmap);
+
+		// Same as above for an RgbaBitmap source: result must hold at least bitmap.GetArea()/2 bytes
+		static void EncodeRgb4Bpp(void* result, const RgbaBitmap& bitmap);
+
+		// 4 bits/pixel output: result must hold at least bitmap.GetArea()/2 bytes
+		static void EncodeRgba4Bpp(void* result, const RgbaBitmap& bitmap);
+
+	private:
+		static unsigned GetMortonNumber(int x, int y); // index of block (x, y) within the packet array
+	};
+	
+//============================================================================
+}
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.cpp b/3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.cpp
new file mode 100644
index 0000000..2e40d37
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.cpp
@@ -0,0 +1,209 @@
+//============================================================================
+
+#include "PvrTcPacket.h"
+#include "BitScale.h"
+
+//============================================================================
+
+using namespace Javelin;
+
+//============================================================================
+// Blend weights in sixteenths (every row sums to 16) for packets { p0, p1, p2, p3 }; presumably one row per pixel of a 4x4 block - confirm against the encoder.
+const unsigned char PvrTcPacket::BILINEAR_FACTORS[16][4] =
+{
+	{ 4, 4, 4, 4 },
+	{ 2, 6, 2, 6 },
+	{ 8, 0, 8, 0 },
+	{ 6, 2, 6, 2 },
+	
+	{ 2, 2, 6, 6 },
+	{ 1, 3, 3, 9 },
+	{ 4, 0, 12, 0 },
+	{ 3, 1, 9, 3 },
+	
+	{ 8, 8, 0, 0 },
+	{ 4, 12, 0, 0 },
+	{ 16, 0, 0, 0 },
+	{ 12, 4, 0, 0 },
+	
+	{ 6, 6, 2, 2 },
+	{ 3, 9, 1, 3 },
+	{ 12, 0, 4, 0 },
+	{ 9, 3, 3, 1 },
+};
+
+// Weights in eighths, one row per 2-bit modulation value, laid out as { colorA, colorB, alphaA, alphaB }
+const unsigned char PvrTcPacket::WEIGHTS[8][4] =
+{
+	// Rows 0-3: Mode=0, modulation values 00, 01, 10, 11
+	{ 8, 0, 8, 0 },
+	{ 5, 3, 5, 3 },
+	{ 3, 5, 3, 5 },
+	{ 0, 8, 0, 8 },
+	
+	// Rows 4-7: Mode=1; the third row has zero alpha weights (the "alpha punchthrough" entry)
+	{ 8, 0, 8, 0 },
+	{ 4, 4, 4, 4 },
+	{ 4, 4, 0, 0 },
+	{ 0, 8, 0, 8 },
+};
+
+//============================================================================
+
+// Unpacks colour A through the BITSCALE_*_TO_8 tables; any stored alpha bits are not read.
+ColorRgb<int> PvrTcPacket::GetColorRgbA() const
+{
+	if(!colorAIsOpaque)
+	{
+		// Translucent layout: 3 bits A | 4 bits R | 4 bits G | 3 bits B
+		const unsigned char red4   = (colorA >> 7) & 0xf;
+		const unsigned char green4 = (colorA >> 3) & 0xf;
+		const unsigned char blue3  = colorA & 0x7;
+		return ColorRgb<int>(Data::BITSCALE_4_TO_8[red4],
+							 Data::BITSCALE_4_TO_8[green4],
+							 Data::BITSCALE_3_TO_8[blue3]);
+	}
+	// Opaque layout: 5 bits R | 5 bits G | 4 bits B
+	const unsigned char red5   = colorA >> 9;
+	const unsigned char green5 = (colorA >> 4) & 0x1f;
+	const unsigned char blue4  = colorA & 0xf;
+	return ColorRgb<int>(Data::BITSCALE_5_TO_8[red5],
+						 Data::BITSCALE_5_TO_8[green5],
+						 Data::BITSCALE_4_TO_8[blue4]);
+}
+
+// Unpacks colour B through the BITSCALE_*_TO_8 tables; any stored alpha bits are not read.
+ColorRgb<int> PvrTcPacket::GetColorRgbB() const
+{
+	if(!colorBIsOpaque)
+	{
+		// Translucent layout: 3 bits A | 4 bits R | 4 bits G | 4 bits B
+		const unsigned char red4   = (colorB >> 8) & 0xf;
+		const unsigned char green4 = (colorB >> 4) & 0xf;
+		const unsigned char blue4  = colorB & 0xf;
+		return ColorRgb<int>(Data::BITSCALE_4_TO_8[red4],
+							 Data::BITSCALE_4_TO_8[green4],
+							 Data::BITSCALE_4_TO_8[blue4]);
+	}
+	// Opaque layout: 5 bits R | 5 bits G | 5 bits B
+	const unsigned char red5   = colorB >> 10;
+	const unsigned char green5 = (colorB >> 5) & 0x1f;
+	const unsigned char blue5  = colorB & 0x1f;
+	return ColorRgb<int>(Data::BITSCALE_5_TO_8[red5],
+						 Data::BITSCALE_5_TO_8[green5],
+						 Data::BITSCALE_5_TO_8[blue5]);
+}
+
+// Unpacks colour A, including alpha, through the BITSCALE_*_TO_8 tables.
+ColorRgba<int> PvrTcPacket::GetColorRgbaA() const
+{
+	if(!colorAIsOpaque)
+	{
+		// Translucent layout: 3 bits A | 4 bits R | 4 bits G | 3 bits B
+		const unsigned char alpha3 = (colorA >> 11) & 0x7;
+		const unsigned char red4   = (colorA >> 7) & 0xf;
+		const unsigned char green4 = (colorA >> 3) & 0xf;
+		const unsigned char blue3  = colorA & 0x7;
+		return ColorRgba<int>(Data::BITSCALE_4_TO_8[red4],
+							  Data::BITSCALE_4_TO_8[green4],
+							  Data::BITSCALE_3_TO_8[blue3],
+							  Data::BITSCALE_3_TO_8[alpha3]);
+	}
+	// Opaque layout: 5 bits R | 5 bits G | 4 bits B; alpha is reported as 255
+	const unsigned char red5   = colorA >> 9;
+	const unsigned char green5 = (colorA >> 4) & 0x1f;
+	const unsigned char blue4  = colorA & 0xf;
+	return ColorRgba<int>(Data::BITSCALE_5_TO_8[red5],
+						  Data::BITSCALE_5_TO_8[green5],
+						  Data::BITSCALE_4_TO_8[blue4],
+						  255);
+}
+
+// Unpacks colour B, including alpha, through the BITSCALE_*_TO_8 tables.
+ColorRgba<int> PvrTcPacket::GetColorRgbaB() const
+{
+	if(!colorBIsOpaque)
+	{
+		// Translucent layout: 3 bits A | 4 bits R | 4 bits G | 4 bits B
+		const unsigned char alpha3 = (colorB >> 12) & 0x7;
+		const unsigned char red4   = (colorB >> 8) & 0xf;
+		const unsigned char green4 = (colorB >> 4) & 0xf;
+		const unsigned char blue4  = colorB & 0xf;
+		return ColorRgba<int>(Data::BITSCALE_4_TO_8[red4],
+							  Data::BITSCALE_4_TO_8[green4],
+							  Data::BITSCALE_4_TO_8[blue4],
+							  Data::BITSCALE_3_TO_8[alpha3]);
+	}
+	// Opaque layout: 5 bits R | 5 bits G | 5 bits B; alpha is reported as 255
+	const unsigned char red5   = colorB >> 10;
+	const unsigned char green5 = (colorB >> 5) & 0x1f;
+	const unsigned char blue5  = colorB & 0x1f;
+	return ColorRgba<int>(Data::BITSCALE_5_TO_8[red5],
+						  Data::BITSCALE_5_TO_8[green5],
+						  Data::BITSCALE_5_TO_8[blue5],
+						  255);
+}
+
+//============================================================================
+
+void PvrTcPacket::SetColorA(const ColorRgb<unsigned char>& c)
+{
+	// An RGB colour is always stored opaque: 5:5:4 bits, quantised with the FLOOR tables.
+	colorAIsOpaque = true;
+	colorA = (Data::BITSCALE_8_TO_5_FLOOR[c.r] << 9)
+	       | (Data::BITSCALE_8_TO_5_FLOOR[c.g] << 4)
+	       |  Data::BITSCALE_8_TO_4_FLOOR[c.b];
+}
+
+void PvrTcPacket::SetColorB(const ColorRgb<unsigned char>& c)
+{
+	// An RGB colour is always stored opaque: 5:5:5 bits, quantised with the CEIL tables.
+	colorBIsOpaque = true;
+	colorB = (Data::BITSCALE_8_TO_5_CEIL[c.r] << 10)
+	       | (Data::BITSCALE_8_TO_5_CEIL[c.g] << 5)
+	       |  Data::BITSCALE_8_TO_5_CEIL[c.b];
+}
+
+void PvrTcPacket::SetColorA(const ColorRgba<unsigned char>& c)
+{
+	// The layout is chosen by whether alpha quantises (FLOOR table) to the 3-bit maximum of 7.
+	const int alpha3 = Data::BITSCALE_8_TO_3_FLOOR[c.a];
+	if(alpha3 != 7)
+	{
+		// Translucent: 3 bits A | 4 bits R | 4 bits G | 3 bits B
+		colorA = (alpha3 << 11)
+		       | (Data::BITSCALE_8_TO_4_FLOOR[c.r] << 7)
+		       | (Data::BITSCALE_8_TO_4_FLOOR[c.g] << 3)
+		       |  Data::BITSCALE_8_TO_3_FLOOR[c.b];
+		colorAIsOpaque = false;
+		return;
+	}
+	// Opaque: 5 bits R | 5 bits G | 4 bits B (no alpha bits are stored)
+	colorA = (Data::BITSCALE_8_TO_5_FLOOR[c.r] << 9)
+	       | (Data::BITSCALE_8_TO_5_FLOOR[c.g] << 4)
+	       |  Data::BITSCALE_8_TO_4_FLOOR[c.b];
+	colorAIsOpaque = true;
+}
+
+void PvrTcPacket::SetColorB(const ColorRgba<unsigned char>& c)
+{
+	// The layout is chosen by whether alpha quantises (CEIL table) to the 3-bit maximum of 7.
+	const int alpha3 = Data::BITSCALE_8_TO_3_CEIL[c.a];
+	if(alpha3 != 7)
+	{
+		// Translucent: 3 bits A | 4 bits R | 4 bits G | 4 bits B
+		colorB = (alpha3 << 12)
+		       | (Data::BITSCALE_8_TO_4_CEIL[c.r] << 8)
+		       | (Data::BITSCALE_8_TO_4_CEIL[c.g] << 4)
+		       |  Data::BITSCALE_8_TO_4_CEIL[c.b];
+		colorBIsOpaque = false;
+		return;
+	}
+	// Opaque: 5 bits R | 5 bits G | 5 bits B (no alpha bits are stored)
+	colorB = (Data::BITSCALE_8_TO_5_CEIL[c.r] << 10)
+	       | (Data::BITSCALE_8_TO_5_CEIL[c.g] << 5)
+	       |  Data::BITSCALE_8_TO_5_CEIL[c.b];
+	colorBIsOpaque = true;
+}
+
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.h b/3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.h
new file mode 100644
index 0000000..ac3b6a4
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/PvrTcPacket.h
@@ -0,0 +1,65 @@
+//============================================================================
+//
+// Modulation data specifies weightings of colorA to colorB for each pixel
+//
+// For mode = 0
+//	00: 0/8
+//  01: 3/8
+//  10: 5/8
+//  11: 8/8
+//
+// For mode = 1
+//  00: 0/8
+//  01: 4/8
+//  10: 4/8 with alpha punchthrough
+//  11: 8/8
+//
+// For colorIsOpaque=0
+//  3 bits A
+//  4 bits R
+//  4 bits G
+//  3/4 bits B
+//
+// For colorIsOpaque=1
+//  5 bits R
+//  5 bits G
+//  4/5 bits B
+//
+//============================================================================
+
+#pragma once
+#include "ColorRgba.h"
+
+//============================================================================
+
+namespace Javelin
+{
+//============================================================================
+	// One encoded packet: modulation data plus bit-fields totalling 32 bits (1 + 14 + 1 + 15 + 1).
+	struct PvrTcPacket
+	{
+		unsigned int    modulationData;
+		unsigned        usePunchthroughAlpha : 1;
+		unsigned        colorA          	 : 14;
+		unsigned        colorAIsOpaque  	 : 1;
+		unsigned        colorB        		 : 15;
+		unsigned        colorBIsOpaque  	 : 1;
+		// Getters unpack colorA/colorB through the BITSCALE_*_TO_8 tables.
+		ColorRgb<int> GetColorRgbA() const;
+		ColorRgb<int> GetColorRgbB() const;
+		ColorRgba<int> GetColorRgbaA() const;
+		ColorRgba<int> GetColorRgbaB() const;
+		// Setters pack an 8-bit colour (A via the FLOOR tables, B via the CEIL tables) and set the matching IsOpaque flag.
+		void SetColorA(const ColorRgb<unsigned char>& c);
+		void SetColorB(const ColorRgb<unsigned char>& c);
+
+		void SetColorA(const ColorRgba<unsigned char>& c);
+		void SetColorB(const ColorRgba<unsigned char>& c);
+		// Lookup tables; the values are defined in PvrTcPacket.cpp.
+		static const unsigned char BILINEAR_FACTORS[16][4];
+		static const unsigned char WEIGHTS[8][4];
+	};
+
+//============================================================================
+} // namespace Javelin
+//============================================================================
diff --git a/3rdparty/bimg/3rdparty/pvrtc/README.md b/3rdparty/bimg/3rdparty/pvrtc/README.md
new file mode 100644
index 0000000..fb31a18
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/README.md
@@ -0,0 +1,17 @@
+PvrTcCompressor
+===============
+
+This was an afternoon project to determine whether crude approximations could
+produce reasonable results.
+
+~~This is *NOT* complete sourcecode. It includes enough code to show the details
+of how the algorithm works.~~
+
+~~If anyone decides to make this compile separately, send a pull request.~~
+
+Thanks to Brendan Duncan for contributing a pull request to fill in all of the
+classes and to build a simple test case. Specifically, he has contributed all of
+the files that do NOT begin with PvrTc.
+
+http://roartindon.blogspot.sg/2014/08/pvr-texture-compression-exploration.html
+
diff --git a/3rdparty/bimg/3rdparty/pvrtc/RgbBitmap.h b/3rdparty/bimg/3rdparty/pvrtc/RgbBitmap.h
new file mode 100644
index 0000000..4f3c57b
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/RgbBitmap.h
@@ -0,0 +1,25 @@
+#pragma once
+
+#include "Bitmap.h"
+#include "ColorRgba.h"
+
+namespace Javelin {
+
+// Bitmap constructed with 3 as the third Bitmap argument; GetData() views the buffer as RGB pixels.
+class RgbBitmap : public Bitmap {
+public:
+    RgbBitmap() {}
+    RgbBitmap(int w, int h) : Bitmap(w, h, 3) {}
+
+    // Read-only view of the pixel buffer.
+    const ColorRgb<unsigned char> *GetData() const {
+        return reinterpret_cast<ColorRgb<unsigned char> *>(data);
+    }
+
+    // Mutable view of the pixel buffer.
+    ColorRgb<unsigned char> *GetData() {
+        return reinterpret_cast<ColorRgb<unsigned char> *>(data);
+    }
+};
+
+}
diff --git a/3rdparty/bimg/3rdparty/pvrtc/RgbaBitmap.h b/3rdparty/bimg/3rdparty/pvrtc/RgbaBitmap.h
new file mode 100644
index 0000000..ae43a77
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/pvrtc/RgbaBitmap.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include "Bitmap.h"   // RgbaBitmap derives from Bitmap, so this header must pull it in itself
+#include "ColorRgba.h"
+
+namespace Javelin {
+
+// Bitmap constructed with 4 as the third Bitmap argument; GetData() views the buffer as RGBA pixels.
+class RgbaBitmap : public Bitmap {
+public:
+    RgbaBitmap() {}
+    RgbaBitmap(int w, int h) : Bitmap(w, h, 4) {}
+
+    // Read-only and mutable views of the pixel buffer.
+    const ColorRgba<unsigned char> *GetData() const {
+        return reinterpret_cast<ColorRgba<unsigned char> *>(data);
+    }
+
+    ColorRgba<unsigned char> *GetData() {
+        return reinterpret_cast<ColorRgba<unsigned char> *>(data);
+    }
+};
+
+}
diff --git a/3rdparty/bimg/3rdparty/stb/stb_image.h b/3rdparty/bimg/3rdparty/stb/stb_image.h
new file mode 100644
index 0000000..3ce6d8f
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/stb/stb_image.h
@@ -0,0 +1,7183 @@
+#if defined(_MSC_VER)
+#	pragma warning(disable:4244) // warning C4244: '=': conversion from 'int' to 'stbi__uint16', possible loss of data
+#	pragma warning(disable:4245) // warning C4245: 'argument': conversion from 'int' to 'char', signed/unsigned mismatch
+#	pragma warning(disable:4456) // warning C4456: declaration of 'k' hides previous local declaration
+#endif
+
+/* stb_image - v2.15 - public domain image loader - http://nothings.org/stb_image.h
+                                     no warranty implied; use at your own risk
+
+   Do this:
+      #define STB_IMAGE_IMPLEMENTATION
+   before you include this file in *one* C or C++ file to create the implementation.
+
+   // i.e. it should look like this:
+   #include ...
+   #include ...
+   #include ...
+   #define STB_IMAGE_IMPLEMENTATION
+   #include "stb_image.h"
+
+   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
+   And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free
+
+
+   QUICK NOTES:
+      Primarily of interest to game developers and other people who can
+          avoid problematic images and only need the trivial interface
+
+      JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib)
+      PNG 1/2/4/8/16-bit-per-channel
+
+      TGA (not sure what subset, if a subset)
+      BMP non-1bpp, non-RLE
+      PSD (composited view only, no extra channels, 8/16 bit-per-channel)
+
+      GIF (*comp always reports as 4-channel)
+      HDR (radiance rgbE format)
+      PIC (Softimage PIC)
+      PNM (PPM and PGM binary only)
+
+      Animated GIF still needs a proper API, but here's one way to do it:
+          http://gist.github.com/urraka/685d9a6340b26b830d49
+
+      - decode from memory or through FILE (define STBI_NO_STDIO to remove code)
+      - decode from arbitrary I/O callbacks
+      - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON)
+
+   Full documentation under "DOCUMENTATION" below.
+
+
+LICENSE
+
+  See end of file for license information.
+
+RECENT REVISION HISTORY:
+
+      2.15  (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-12-04) experimental 16-bit API, only for PNG so far; fixes
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64
+                         RGB-format JPEG; remove white matting in PSD;
+                         allocate large structures on the stack;
+                         correct channel count for PNG & BMP
+      2.10  (2016-01-22) avoid warning introduced in 2.09
+      2.09  (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) partial animated GIF support
+                         limited 16-bit PSD support
+                         minor bugs, code cleanup, and compiler warnings
+
+   See end of file for full revision history.
+
+
+ ============================    Contributors    =========================
+
+ Image formats                          Extensions, features
+    Sean Barrett (jpeg, png, bmp)          Jetro Lauha (stbi_info)
+    Nicolas Schulz (hdr, psd)              Martin "SpartanJ" Golini (stbi_info)
+    Jonathan Dummer (tga)                  James "moose2000" Brown (iPhone PNG)
+    Jean-Marc Lienher (gif)                Ben "Disch" Wenger (io callbacks)
+    Tom Seddon (pic)                       Omar Cornut (1/2/4-bit PNG)
+    Thatcher Ulrich (psd)                  Nicolas Guillemot (vertical flip)
+    Ken Miller (pgm, ppm)                  Richard Mitton (16-bit PSD)
+    github:urraka (animated gif)           Junggon Kim (PNM comments)
+                                           Daniel Gibson (16-bit TGA)
+                                           socks-the-fox (16-bit PNG)
+                                           Jeremy Sawicki (handle all ImageNet JPGs)
+ Optimizations & bugfixes
+    Fabian "ryg" Giesen
+    Arseny Kapoulkine
+
+ Bug & warning fixes
+    Marc LeBlanc            David Woo          Guillaume George   Martins Mozeiko
+    Christpher Lloyd        Jerry Jansson      Joseph Thomson     Phil Jordan
+    Dave Moore              Roy Eltham         Hayaki Saito       Nathan Reed
+    Won Chun                Luke Graham        Johan Duparc       Nick Verigakis
+    the Horde3D community   Thomas Ruf         Ronny Chevalier    Baldur Karlsson
+    Janez Zemva             John Bartholomew   Michal Cichon      github:rlyeh
+    Jonathan Blow           Ken Hamada         Tero Hanninen      github:romigrou
+    Laurent Gomila          Cort Stratton      Sergio Gonzalez    github:svdijk
+    Aruelien Pocheville     Thibault Reuille   Cass Everitt       github:snagar
+    Ryamond Barbiero        Paul Du Bois       Engin Manap        github:Zelex
+    Michaelangel007@github  Philipp Wiesemann  Dale Weiler        github:grim210
+    Oriol Ferrer Mesia      Josh Tobin         Matthew Gregan     github:sammyhw
+    Blazej Dariusz Roszkowski                  Gregory Mullen     github:phprus
+
+*/
+
+#ifndef STBI_INCLUDE_STB_IMAGE_H
+#define STBI_INCLUDE_STB_IMAGE_H
+
+// DOCUMENTATION
+//
+// Limitations:
+//    - 16-bit-per-channel output is experimental and PNG-only (see stbi_load_16)
+//    - no 12-bit-per-channel JPEG
+//    - no JPEGs with arithmetic coding
+//    - no 1-bit BMP
+//    - GIF always returns *comp=4
+//
+// Basic usage (see HDR discussion below for HDR usage):
+//    int x,y,n;
+//    unsigned char *data = stbi_load(filename, &x, &y, &n, 0);
+//    // ... process data if not NULL ...
+//    // ... x = width, y = height, n = # 8-bit components per pixel ...
+//    // ... replace '0' with '1'..'4' to force that many components per pixel
+//    // ... but 'n' will always be the number that it would have been if you said 0
+//    stbi_image_free(data)
+//
+// Standard parameters:
+//    int *x                 -- outputs image width in pixels
+//    int *y                 -- outputs image height in pixels
+//    int *channels_in_file  -- outputs # of image components in image file
+//    int desired_channels   -- if non-zero, # of image components requested in result
+//
+// The return value from an image loader is an 'unsigned char *' which points
+// to the pixel data, or NULL on an allocation failure or if the image is
+// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels,
+// with each pixel consisting of N interleaved 8-bit components; the first
+// pixel pointed to is top-left-most in the image. There is no padding between
+// image scanlines or between pixels, regardless of format. The number of components
+// N is 'desired_channels' if desired_channels is non-zero, or *channels_in_file
+// otherwise. If desired_channels is non-zero, *channels_in_file has the number of
+// components that _would_ have been output otherwise. E.g. if you set desired_channels
+// to 4, you will always get RGBA output, but you can check *channels_in_file to see if
+// it's trivially opaque because e.g. there were only 3 channels in the source image.
+//
+// An output image with N components has the following components interleaved
+// in this order in each pixel:
+//
+//     N=#comp     components
+//       1           grey
+//       2           grey, alpha
+//       3           red, green, blue
+//       4           red, green, blue, alpha
+//
+// If image loading fails for any reason, the return value will be NULL,
+// and *x, *y, *comp will be unchanged. The function stbi_failure_reason()
+// can be queried for an extremely brief, end-user unfriendly explanation
+// of why the load failed. Define STBI_NO_FAILURE_STRINGS to avoid
+// compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly
+// more user-friendly ones.
+//
+// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized.
+//
+// ===========================================================================
+//
+// Philosophy
+//
+// stb libraries are designed with the following priorities:
+//
+//    1. easy to use
+//    2. easy to maintain
+//    3. good performance
+//
+// Sometimes I let "good performance" creep up in priority over "easy to maintain",
+// and for best performance I may provide less-easy-to-use APIs that give higher
+// performance, in addition to the easy to use ones. Nevertheless, it's important
+// to keep in mind that from the standpoint of you, a client of this library,
+// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all.
+//
+// Some secondary priorities arise directly from the first two, some of which
+// make more explicit reasons why performance can't be emphasized.
+//
+//    - Portable ("ease of use")
+//    - Small source code footprint ("easy to maintain")
+//    - No dependencies ("ease of use")
+//
+// ===========================================================================
+//
+// I/O callbacks
+//
+// I/O callbacks allow you to read from arbitrary sources, like packaged
+// files or some other source. Data read from callbacks are processed
+// through a small internal buffer (currently 128 bytes) to try to reduce
+// overhead.
+//
+// The three functions you must define are "read" (reads some bytes of data),
+// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end).
+//
+// ===========================================================================
+//
+// SIMD support
+//
+// The JPEG decoder will try to automatically use SIMD kernels on x86 when
+// supported by the compiler. For ARM Neon support, you must explicitly
+// request it.
+//
+// (The old do-it-yourself SIMD API is no longer supported in the current
+// code.)
+//
+// On x86, SSE2 will automatically be used when available based on a run-time
+// test; if not, the generic C versions are used as a fall-back. On ARM targets,
+// the typical path is to have separate builds for NEON and non-NEON devices
+// (at least this is true for iOS and Android). Therefore, the NEON support is
+// toggled by a build flag: define STBI_NEON to get NEON loops.
+//
+// If for some reason you do not want to use any of SIMD code, or if
+// you have issues compiling it, you can disable it entirely by
+// defining STBI_NO_SIMD.
+//
+// ===========================================================================
+//
+// HDR image support   (disable by defining STBI_NO_HDR)
+//
+// stb_image now supports loading HDR images in general, and currently
+// the Radiance .HDR file format, although the support is provided
+// generically. You can still load any file through the existing interface;
+// if you attempt to load an HDR file, it will be automatically remapped to
+// LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1;
+// both of these constants can be reconfigured through this interface:
+//
+//     stbi_hdr_to_ldr_gamma(2.2f);
+//     stbi_hdr_to_ldr_scale(1.0f);
+//
+// (note, do not use _inverse_ constants; stbi_image will invert them
+// appropriately).
+//
+// Additionally, there is a new, parallel interface for loading files as
+// (linear) floats to preserve the full dynamic range:
+//
+//    float *data = stbi_loadf(filename, &x, &y, &n, 0);
+//
+// If you load LDR images through this interface, those images will
+// be promoted to floating point values, run through the inverse of
+// constants corresponding to the above:
+//
+//     stbi_ldr_to_hdr_scale(1.0f);
+//     stbi_ldr_to_hdr_gamma(2.2f);
+//
+// Finally, given a filename (or an open file or memory block--see header
+// file for details) containing image data, you can query for the "most
+// appropriate" interface to use (that is, whether the image is HDR or
+// not), using:
+//
+//     stbi_is_hdr(char *filename);
+//
+// ===========================================================================
+//
+// iPhone PNG support:
+//
+// By default we convert iphone-formatted PNGs back to RGB, even though
+// they are internally encoded differently. You can disable this conversion
+// by calling stbi_convert_iphone_png_to_rgb(0), in which case
+// you will always just get the native iphone "format" through (which
+// is BGR stored in RGB).
+//
+// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per
+// pixel to remove any premultiplied alpha *only* if the image file explicitly
+// says there's premultiplied data (currently only happens in iPhone images,
+// and only if iPhone convert-to-rgb processing is on).
+//
+// ===========================================================================
+//
+// ADDITIONAL CONFIGURATION
+//
+//  - You can suppress implementation of any of the decoders to reduce
+//    your code footprint by #defining one or more of the following
+//    symbols before creating the implementation.
+//
+//        STBI_NO_JPEG
+//        STBI_NO_PNG
+//        STBI_NO_BMP
+//        STBI_NO_PSD
+//        STBI_NO_TGA
+//        STBI_NO_GIF
+//        STBI_NO_HDR
+//        STBI_NO_PIC
+//        STBI_NO_PNM   (.ppm and .pgm)
+//
+//  - You can request *only* certain decoders and suppress all other ones
+//    (this will be more forward-compatible, as addition of new decoders
+//    doesn't require you to disable them explicitly):
+//
+//        STBI_ONLY_JPEG
+//        STBI_ONLY_PNG
+//        STBI_ONLY_BMP
+//        STBI_ONLY_PSD
+//        STBI_ONLY_TGA
+//        STBI_ONLY_GIF
+//        STBI_ONLY_HDR
+//        STBI_ONLY_PIC
+//        STBI_ONLY_PNM   (.ppm and .pgm)
+//
+//   - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still
+//     want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB
+//
+
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif // STBI_NO_STDIO
+
+#define STBI_VERSION 1
+
+enum
+{
+   STBI_default = 0, // only used for desired_channels (req_comp): keep the file's own component count
+   // the remaining values equal the number of interleaved components per pixel
+   STBI_grey       = 1,
+   STBI_grey_alpha = 2,
+   STBI_rgb        = 3,
+   STBI_rgb_alpha  = 4
+};
+
+typedef unsigned char stbi_uc;
+typedef unsigned short stbi_us;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef STB_IMAGE_STATIC
+#define STBIDEF static
+#else
+#define STBIDEF extern
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PRIMARY API - works on images of any type
+//
+
+//
+// load image by filename, open file, or memory buffer
+//
+// callback table taken by the stbi_*_from_callbacks entry points (see "I/O callbacks" in the documentation above)
+typedef struct
+{
+   int      (*read)  (void *user,char *data,int size);   // fill 'data' with 'size' bytes.  return number of bytes actually read
+   void     (*skip)  (void *user,int n);                 // skip the next 'n' bytes, or 'unget' the last -n bytes if negative
+   int      (*eof)   (void *user);                       // returns nonzero if we are at end of file/data
+} stbi_io_callbacks;
+
+////////////////////////////////////
+//
+// 8-bits-per-channel interface
+//
+
+STBIDEF stbi_uc *stbi_load               (char              const *filename,           int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_memory   (stbi_uc           const *buffer, int len   , int *x, int *y, int *channels_in_file, int desired_channels);
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk  , void *user, int *x, int *y, int *channels_in_file, int desired_channels);
+
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_uc *stbi_load_from_file   (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+// for stbi_load_from_file, file pointer is left pointing immediately after image
+#endif
+
+////////////////////////////////////
+//
+// 16-bits-per-channel interface
+//
+
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *channels_in_file, int desired_channels);
+#ifndef STBI_NO_STDIO
+STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+#endif
+// @TODO the other variants
+
+////////////////////////////////////
+//
+// float-per-channel interface
+//
+#ifndef STBI_NO_LINEAR
+   STBIDEF float *stbi_loadf                 (char const *filename,           int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_memory     (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels);
+   STBIDEF float *stbi_loadf_from_callbacks  (stbi_io_callbacks const *clbk, void *user, int *x, int *y,  int *channels_in_file, int desired_channels);
+
+   #ifndef STBI_NO_STDIO
+   STBIDEF float *stbi_loadf_from_file  (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels);
+   #endif
+#endif
+
+#ifndef STBI_NO_HDR
+   STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma);
+   STBIDEF void   stbi_hdr_to_ldr_scale(float scale);
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_LINEAR
+   STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma);
+   STBIDEF void   stbi_ldr_to_hdr_scale(float scale);
+#endif // STBI_NO_LINEAR
+
+// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR
+STBIDEF int    stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user);
+STBIDEF int    stbi_is_hdr_from_memory(stbi_uc const *buffer, int len);
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_is_hdr          (char const *filename);
+STBIDEF int      stbi_is_hdr_from_file(FILE *f);
+#endif // STBI_NO_STDIO
+
+
+// get a VERY brief reason for failure
+// NOT THREADSAFE
+STBIDEF const char *stbi_failure_reason  (void);
+
+// free the loaded image -- this is just free()
+STBIDEF void     stbi_image_free      (void *retval_from_stbi_load);
+
+// get image dimensions & components without fully decoding
+STBIDEF int      stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp);
+
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_info            (char const *filename,     int *x, int *y, int *comp);
+STBIDEF int      stbi_info_from_file  (FILE *f,                  int *x, int *y, int *comp);
+
+#endif
+
+
+
+// for image formats that explicitly notate that they have premultiplied alpha,
+// we just return the colors as stored in the file. set this flag to force
+// unpremultiplication. results are undefined if the unpremultiply overflows.
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply);
+
+// indicate whether we should process iphone images back to canonical format,
+// or just pass them through "as-is"
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert);
+
+// flip the image vertically, so the first pixel in the output array is the bottom left
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip);
+
+// ZLIB client - used by PNG, available for other purposes
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen);
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header);
+STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen);
+STBIDEF int   stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+//
+//
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBI_INCLUDE_STB_IMAGE_H
+
+#ifdef STB_IMAGE_IMPLEMENTATION
+// any STBI_ONLY_* request defines STBI_NO_<format> for every format not itself requested (STBI_ONLY_ZLIB alone disables all nine)
+#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || defined(STBI_ONLY_BMP) \
+  || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \
+  || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \
+  || defined(STBI_ONLY_ZLIB)
+   #ifndef STBI_ONLY_JPEG
+   #define STBI_NO_JPEG
+   #endif
+   #ifndef STBI_ONLY_PNG
+   #define STBI_NO_PNG
+   #endif
+   #ifndef STBI_ONLY_BMP
+   #define STBI_NO_BMP
+   #endif
+   #ifndef STBI_ONLY_PSD
+   #define STBI_NO_PSD
+   #endif
+   #ifndef STBI_ONLY_TGA
+   #define STBI_NO_TGA
+   #endif
+   #ifndef STBI_ONLY_GIF
+   #define STBI_NO_GIF
+   #endif
+   #ifndef STBI_ONLY_HDR
+   #define STBI_NO_HDR
+   #endif
+   #ifndef STBI_ONLY_PIC
+   #define STBI_NO_PIC
+   #endif
+   #ifndef STBI_ONLY_PNM
+   #define STBI_NO_PNM
+   #endif
+#endif
+
+#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB)
+#define STBI_NO_ZLIB
+#endif
+
+
+#include <stdarg.h>
+#include <stddef.h> // ptrdiff_t on osx
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR)
+#include <math.h>  // ldexp
+#endif
+
+#ifndef STBI_NO_STDIO
+#include <stdio.h>
+#endif
+
+#ifndef STBI_ASSERT // an STBI_ASSERT(x) defined before this point is used as-is
+#include <assert.h>
+#define STBI_ASSERT(x) assert(x) // default: the standard assert
+#endif
+
+
+#ifndef _MSC_VER
+   #ifdef __cplusplus
+   #define stbi_inline inline // C++ build without MSVC: plain inline
+   #else
+   #define stbi_inline // C build without MSVC: expands to nothing
+   #endif
+#else
+   #define stbi_inline __forceinline // MSVC
+#endif
+
+
+#ifdef _MSC_VER // MSVC branch: spell the fixed-width types with built-in types, without <stdint.h>
+typedef unsigned short stbi__uint16;
+typedef   signed short stbi__int16;
+typedef unsigned int   stbi__uint32;
+typedef   signed int   stbi__int32;
+#else
+#include <stdint.h>
+typedef uint16_t stbi__uint16;
+typedef int16_t  stbi__int16;
+typedef uint32_t stbi__uint32;
+typedef int32_t  stbi__int32;
+#endif
+
+// compile-time size check: the array length becomes -1 (a compiler error) unless stbi__uint32 is exactly 4 bytes
+typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBI_NOTUSED(v)  (void)(v) // MSVC: evaluate v and discard the value
+#else
+#define STBI_NOTUSED(v)  (void)sizeof(v) // elsewhere: sizeof mentions v without evaluating it
+#endif
+
+#ifdef _MSC_VER
+#define STBI_HAS_LROTL // MSVC: rotate through _lrotl
+#endif
+
+#ifdef STBI_HAS_LROTL
+   #define stbi_lrot(x,y)  _lrotl(x,y)
+#else
+   #define stbi_lrot(x,y)  (((x) << (y)) | ((x) >> (32 - (y)))) // left-rotate written for a 32-bit x; both arguments are evaluated twice
+#endif
+
+#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED))
+// ok: a complete custom allocator was supplied
+#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED)
+// ok: nothing was supplied, the C library defaults below are used
+#else
+#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)."
+#endif
+
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz) // default forwards to STBI_REALLOC and ignores oldsz
+#endif
+
+// x86/x64 detection: defines at most one of STBI__X64_TARGET / STBI__X86_TARGET from compiler-predefined macros
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
+#define STBI_NO_SIMD // 32-bit x86 GCC-style build without __SSE2__: SIMD off
+#endif
+
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+//
+// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+// simultaneously enabling "-mstackrealign".
+//
+// See https://github.com/nothings/stb/issues/81 for more information.
+//
+// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+#define STBI_NO_SIMD // 32-bit MinGW without the STBI_MINGW_ENABLE_SSE2 opt-in: SIMD off
+#endif
+
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#define STBI_SSE2 // x86/x64 target with SIMD not disabled; the code keyed on STBI_SSE2 lies outside this excerpt
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+static int stbi__cpuid3(void)
+{  // returns element 3 of the four values __cpuid stores for function 1
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
+}
+#else
+static int stbi__cpuid3(void)
+{  // inline-assembly variant used when _MSC_VER < 1400: runs CPUID with EAX=1 and returns EDX
+   int res;
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+static int stbi__sse2_available(void)
+{
+   int info3 = stbi__cpuid3();       // EDX as reported by CPUID function 1
+   return ((info3 >> 26) & 1) != 0;  // 1 when bit 26 (the SSE2 feature flag) is set, else 0
+}
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+static int stbi__sse2_available(void)
+{
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we. Hence no runtime probe: always 1.
+   return 1;
+}
+#endif
+#endif
+
+// ARM NEON
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON // STBI_NO_SIMD wins over a requested STBI_NEON
+#endif
+
+#ifdef STBI_NEON
+#include <arm_neon.h>
+// assume GCC or Clang on ARM targets
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name // no SIMD branch defined it: plain declaration, no alignment request
+#endif
+
+///////////////////////////////////////////////
+//
+//  stbi__context struct and start_xxx functions
+
+// stbi__context structure is our basic context used by all images, so it
+// contains all the IO context, plus some basic image information
+typedef struct
+{
+   stbi__uint32 img_x, img_y; // NOTE(review): presumably image width/height, filled in by the format loaders — confirm there
+   int img_n, img_out_n; // NOTE(review): presumably component counts (in file / in output) — confirm in the loaders
+
+   stbi_io_callbacks io; // copied in by stbi__start_callbacks; io.read == NULL marks a memory-backed context
+   void *io_user_data; // opaque pointer handed to every io callback
+
+   int read_from_callbacks; // nonzero while data is still pulled through io.read; cleared when a refill reads 0 bytes
+   int buflen; // byte count requested from io.read on each refill
+   stbi_uc buffer_start[128]; // staging buffer that callback reads are placed into
+
+   stbi_uc *img_buffer, *img_buffer_end; // read cursor and end of the bytes currently available
+   stbi_uc *img_buffer_original, *img_buffer_original_end; // cursor/end as set up at start; restored by stbi__rewind
+} stbi__context;
+
+
+static void stbi__refill_buffer(stbi__context *s);
+
+// initialize a memory-decode context over the len bytes starting at buffer
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   s->io.read = NULL; // a NULL read callback is what the stream helpers test to recognise a memory context
+   s->read_from_callbacks = 0;
+   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer; // const is cast away here
+   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len; // one past the last byte
+}
+
+// initialize a callback-based context: *c is copied, user is handed back to every callback
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->io = *c;
+   s->io_user_data = user;
+   s->buflen = sizeof(s->buffer_start);
+   s->read_from_callbacks = 1;
+   s->img_buffer_original = s->buffer_start;
+   stbi__refill_buffer(s); // first read; this is what sets img_buffer and img_buffer_end
+   s->img_buffer_original_end = s->img_buffer_end; // remembered so stbi__rewind can return to this first chunk
+}
+
+#ifndef STBI_NO_STDIO
+
+static int stbi__stdio_read(void *user, char *data, int size)
+{  // stdio read callback: user is the FILE*; returns the number of bytes fread delivered into data
+   return (int) fread(data,1,size,(FILE*) user);
+}
+
+static void stbi__stdio_skip(void *user, int n)
+{  // stdio skip callback: moves the file position by n bytes relative to the current one; fseek's result is not checked
+   fseek((FILE*) user, n, SEEK_CUR);
+}
+
+static int stbi__stdio_eof(void *user)
+{  // stdio eof callback: passes through feof's result for the FILE* in user
+   return feof((FILE*) user);
+}
+
+static stbi_io_callbacks stbi__stdio_callbacks =
+{  // positional initializer: relies on the member order of stbi_io_callbacks, declared outside this excerpt
+   stbi__stdio_read,
+   stbi__stdio_skip,
+   stbi__stdio_eof,
+};
+
+static void stbi__start_file(stbi__context *s, FILE *f)
+{  // a FILE* is handled as a callback context: the stdio callbacks above, with f as their user pointer
+   stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f);
+}
+
+//static void stop_file(stbi__context *s) { }
+
+#endif // !STBI_NO_STDIO
+
+static void stbi__rewind(stbi__context *s)
+{
+   // conceptually rewind SHOULD rewind to the beginning of the stream,
+   // but we just rewind to the beginning of the initial buffer, because
+   // we only use it after doing 'test', which only ever looks at at most 92 bytes
+   s->img_buffer = s->img_buffer_original; // cursor back to where the context started
+   s->img_buffer_end = s->img_buffer_original_end; // and the end marker that went with it
+}
+
+enum
+{  // values stored in stbi__result_info.channel_order
+   STBI_ORDER_RGB, // 0; what stbi__load_main sets by default
+   STBI_ORDER_BGR  // 1
+};
+
+typedef struct
+{  // per-load metadata that stbi__load_main initialises and passes to the format loader
+   int bits_per_channel; // stbi__load_main defaults it to 8; the postprocess functions expect 8 or 16 back
+   int num_channels; // set to 0 by stbi__load_main before dispatch
+   int channel_order; // one of the STBI_ORDER_* values
+} stbi__result_info;
+
+#ifndef STBI_NO_JPEG
+static int      stbi__jpeg_test(stbi__context *s);
+static void    *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNG
+static int      stbi__png_test(stbi__context *s);
+static void    *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__png_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_BMP
+static int      stbi__bmp_test(stbi__context *s);
+static void    *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_TGA
+static int      stbi__tga_test(stbi__context *s);
+static void    *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__tga_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PSD
+static int      stbi__psd_test(stbi__context *s);
+static void    *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc);
+static int      stbi__psd_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_HDR
+static int      stbi__hdr_test(stbi__context *s);
+static float   *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PIC
+static int      stbi__pic_test(stbi__context *s);
+static void    *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__pic_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_GIF
+static int      stbi__gif_test(stbi__context *s);
+static void    *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__gif_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+#ifndef STBI_NO_PNM
+static int      stbi__pnm_test(stbi__context *s);
+static void    *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri);
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp);
+#endif
+
+// this is not threadsafe: a single file-scope pointer holds the reason
+static const char *stbi__g_failure_reason;
+
+STBIDEF const char *stbi_failure_reason(void)
+{  // returns the message most recently stored by stbi__err (NULL if nothing has been stored yet)
+   return stbi__g_failure_reason;
+}
+
+static int stbi__err(const char *str)
+{
+   stbi__g_failure_reason = str; // only the pointer is kept (no copy); read back via stbi_failure_reason()
+   return 0; // always 0, so the call can double as a failure return value
+}
+
+static void *stbi__malloc(size_t size)
+{  // thin wrapper over the STBI_MALLOC allocator macro
+    return STBI_MALLOC(size);
+}
+
+// stb_image uses ints pervasively, including for offset calculations.
+// therefore the largest decoded image size we can support with the
+// current code, even on 64-bit targets, is INT_MAX. this is not a
+// significant limitation for the intended use case.
+//
+// we do, however, need to make sure our size calculations don't
+// overflow. hence a few helper functions for size calculations that
+// multiply integers together, making sure that they're non-negative
+// and no overflow occurs.
+
+// Overflow guard for "a + b" on ints: yields 1 when b is non-negative and
+// a + b stays within INT_MAX, and 0 otherwise.
+// Only b is tested for sign here; a negative a with a valid b still yields 1.
+static int stbi__addsizes_valid(int a, int b)
+{
+   // The && short-circuits, so the subtraction only runs once b is known
+   // to lie in [0, INT_MAX]; then INT_MAX - b is in that range as well
+   // and cannot overflow, whereas forming a + b directly might.
+   int b_ok = (b >= 0);
+   return b_ok && a <= INT_MAX - b;
+}
+
+// Overflow guard for "a * b" on ints: returns 1 when both factors are
+// non-negative and the product fits in an int, and 0 otherwise.
+static int stbi__mul2sizes_valid(int a, int b)
+{
+   if (!(a >= 0 && b >= 0)) return 0;
+   // a zero b gives a zero product, which is always fine; testing it first
+   // also keeps the division below from ever dividing by zero
+   return (b == 0) ? 1 : (a <= INT_MAX/b);
+}
+
+// Validates "a*b + add": 1 only when a, b and add are non-negative and neither the product nor the sum overflows.
+static int stbi__mad2sizes_valid(int a, int b, int add)
+{
+   return stbi__mul2sizes_valid(a, b) ? (stbi__addsizes_valid(a*b, add) != 0) : 0; // a*b is formed only after it was checked
+}
+
+// Validates "a*b*c + add": 1 only when every factor and add are non-negative and no intermediate result overflows.
+static int stbi__mad3sizes_valid(int a, int b, int c, int add)
+{
+   if (!stbi__mul2sizes_valid(a, b) || !stbi__mul2sizes_valid(a*b, c)) return 0; // each product is checked before it is used
+   return stbi__addsizes_valid(a*b*c, add) != 0;
+}
+
+// Validates "a*b*c*d + add": 1 only when every factor and add are non-negative and no intermediate result overflows.
+static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add)
+{
+   if (!stbi__mul2sizes_valid(a, b) || !stbi__mul2sizes_valid(a*b, c)) return 0; // a*b first, then a*b*c
+   return stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add);
+}
+
+// mallocs with size overflow checking: each returns NULL, without allocating, when its size expression is invalid
+static void *stbi__malloc_mad2(int a, int b, int add)
+{
+   int ok = stbi__mad2sizes_valid(a, b, add);   // a*b + add must be overflow-free
+   return ok ? stbi__malloc(a*b + add) : NULL;
+}
+
+static void *stbi__malloc_mad3(int a, int b, int c, int add)
+{
+   int ok = stbi__mad3sizes_valid(a, b, c, add);   // NULL unless a*b*c + add is overflow-free
+   return ok ? stbi__malloc(a*b*c + add) : NULL;
+}
+
+static void *stbi__malloc_mad4(int a, int b, int c, int d, int add)
+{
+   int ok = stbi__mad4sizes_valid(a, b, c, d, add);   // NULL unless a*b*c*d + add is overflow-free
+   return ok ? stbi__malloc(a*b*c*d + add) : NULL;
+}
+
+// stbi__err - error
+// stbi__errpf - error returning pointer to float
+// stbi__errpuc - error returning pointer to unsigned char
+
+#ifdef STBI_NO_FAILURE_STRINGS
+   #define stbi__err(x,y)  0 // no strings kept: nothing is recorded
+#elif defined(STBI_FAILURE_USERMSG)
+   #define stbi__err(x,y)  stbi__err(y) // record the second (longer, user-facing) message
+#else
+   #define stbi__err(x,y)  stbi__err(x) // record the first (short) message
+#endif
+
+#define stbi__errpf(x,y)   ((float *)(size_t) (stbi__err(x,y)?NULL:NULL)) // ?NULL:NULL keeps the stbi__err call while always yielding NULL
+#define stbi__errpuc(x,y)  ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL)) // same, typed as unsigned char *
+
+STBIDEF void stbi_image_free(void *retval_from_stbi_load)
+{  // releases a buffer through STBI_FREE, the counterpart of the STBI_MALLOC used for allocation
+   STBI_FREE(retval_from_stbi_load);
+}
+
+#ifndef STBI_NO_LINEAR
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp);
+#endif
+
+#ifndef STBI_NO_HDR
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp);
+#endif
+
+static int stbi__vertically_flip_on_load = 0; // file-scope flag read by the postprocess functions; off by default
+
+STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip)
+{  // stores the flag as given; nonzero makes later loads mirror the image rows
+    stbi__vertically_flip_on_load = flag_true_if_should_flip;
+}
+
+static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)
+{  // Identifies the format of the stream in s and decodes it; returns the loader's buffer, or NULL via stbi__errpuc if no format matches.
+   memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields
+   ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed
+   ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order
+   ri->num_channels = 0;
+
+   #ifndef STBI_NO_JPEG // formats are probed in this fixed order; the first _test that accepts the stream does the decode
+   if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PNG
+   if (stbi__png_test(s))  return stbi__png_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_test(s))  return stbi__bmp_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_test(s))  return stbi__gif_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_test(s))  return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); // the only loader that is passed bpc
+   #endif
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_test(s))  return stbi__pic_load(s,x,y,comp,req_comp, ri);
+   #endif
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_test(s))  return stbi__pnm_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri); // NOTE(review): not checked for NULL here — confirm stbi__hdr_to_ldr tolerates a failed load
+      return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); // float data is converted down before being returned
+   }
+   #endif
+
+   #ifndef STBI_NO_TGA
+   // test tga last because it's a crappy test!
+   if (stbi__tga_test(s))
+      return stbi__tga_load(s,x,y,comp,req_comp, ri);
+   #endif
+
+   return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt");
+}
+
+static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels)
+{
+   // Narrows a buffer of w*h*channels 16-bit samples to 8 bits per sample.
+   // Takes ownership of orig: it is freed on success and on failure alike.
+   int i, img_len = w * h * channels;
+   stbi_uc *reduced = (stbi_uc *) stbi__malloc(img_len);
+   if (reduced == NULL) {
+      STBI_FREE(orig); // callers assign our result over their only pointer to orig, so release it here
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+   for (i = 0; i < img_len; ++i)
+      reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling
+   STBI_FREE(orig);
+   return reduced;
+}
+
+static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels)
+{
+   // Widens a buffer of w*h*channels 8-bit samples to 16 bits per sample.
+   // Takes ownership of orig: it is freed on success and on failure alike.
+   int i, img_len = w * h * channels;
+   stbi__uint16 *enlarged = (stbi__uint16 *) stbi__malloc(img_len*2);
+   if (enlarged == NULL) {
+      STBI_FREE(orig); // callers assign our result over their only pointer to orig, so release it here
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+   for (i = 0; i < img_len; ++i)
+      enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff
+   STBI_FREE(orig);
+   return enlarged;
+}
+
+static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   // Decodes the image in s and returns 8-bit-per-channel pixels (NULL on failure):
+   // 16-bit results are narrowed, and the rows are mirrored if flip-on-load is set.
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8);
+
+   if (result == NULL) return NULL;
+   if (ri.bits_per_channel != 8) {
+      STBI_ASSERT(ri.bits_per_channel == 16);
+      result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 8;
+   }
+
+   // @TODO: move stbi__convert_format to here
+
+   // the narrowing above returns NULL when it runs out of memory, so re-check before touching pixels
+   if (stbi__vertically_flip_on_load && result != NULL) {
+      int w = *x, h = *y, channels = req_comp ? req_comp : *comp;
+      int row,col,z;
+      stbi_uc *image = (stbi_uc *) result;
+
+      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
+      for (row = 0; row < (h>>1); row++) {
+         for (col = 0; col < w; col++) {
+            for (z = 0; z < channels; z++) {
+               stbi_uc temp = image[(row * w + col) * channels + z];
+               image[(row * w + col) * channels + z] = image[((h - row - 1) * w + col) * channels + z];
+               image[((h - row - 1) * w + col) * channels + z] = temp;
+            }
+         }
+      }
+   }
+
+   return (unsigned char *) result;
+}
+
+static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   // Decodes the image in s and returns 16-bit-per-channel pixels (NULL on failure):
+   // 8-bit results are widened, and the rows are mirrored if flip-on-load is set.
+   stbi__result_info ri;
+   void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16);
+
+   if (result == NULL) return NULL;
+   if (ri.bits_per_channel != 16) {
+      STBI_ASSERT(ri.bits_per_channel == 8);
+      result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp);
+      ri.bits_per_channel = 16;
+   }
+
+   // @TODO: move stbi__convert_format16 to here
+   // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision
+
+   // the widening above returns NULL when it runs out of memory, so re-check before touching pixels
+   if (stbi__vertically_flip_on_load && result != NULL) {
+      int w = *x, h = *y, channels = req_comp ? req_comp : *comp;
+      int row,col,z;
+      stbi__uint16 *image = (stbi__uint16 *) result;
+
+      // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
+      for (row = 0; row < (h>>1); row++) {
+         for (col = 0; col < w; col++) {
+            for (z = 0; z < channels; z++) {
+               stbi__uint16 temp = image[(row * w + col) * channels + z];
+               image[(row * w + col) * channels + z] = image[((h - row - 1) * w + col) * channels + z];
+               image[((h - row - 1) * w + col) * channels + z] = temp;
+            }
+         }
+      }
+   }
+
+   return (stbi__uint16 *) result;
+}
+
+#ifndef STBI_NO_HDR
+static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp)
+{
+   // Mirrors a float image top-to-bottom in place when flip-on-load is set; otherwise (or for NULL) a no-op.
+   int w, h, depth, row, col, z;
+   if (!stbi__vertically_flip_on_load || result == NULL) return;
+   w = *x; h = *y;
+   depth = req_comp ? req_comp : *comp;   // floats stored per pixel
+   // @OPTIMIZE: use a bigger temp buffer and memcpy multiple pixels at once
+   for (row = 0; row < (h>>1); row++) {
+      int mirror = h - row - 1;           // the row that trades places with `row`
+      for (col = 0; col < w; col++)
+         for (z = 0; z < depth; z++) {
+            float *upper = &result[(row * w + col) * depth + z];
+            float *lower = &result[(mirror * w + col) * depth + z];
+            float held = *upper;
+            *upper = *lower;
+            *lower = held;
+         }
+   }
+}
+#endif
+
+#ifndef STBI_NO_STDIO
+
+static FILE *stbi__fopen(char const *filename, char const *mode)
+{  // opens filename with the given mode; on the fopen_s path a failure is turned into a null FILE*
+   FILE *f;
+#if defined(_MSC_VER) && _MSC_VER >= 1400
+   if (0 != fopen_s(&f, filename, mode)) // nonzero return from fopen_s means the open failed
+      f=0;
+#else
+   f = fopen(filename, mode);
+#endif
+   return f;
+}
+
+
+STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp)
+{  // opens filename in binary mode, decodes it with stbi_load_from_file, and closes the file again
+   FILE *f = stbi__fopen(filename, "rb");
+   unsigned char *result;
+   if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); // NULL, with the failure reason recorded
+   result = stbi_load_from_file(f,x,y,comp,req_comp);
+   fclose(f); // closed whether or not decoding succeeded
+   return result;
+}
+
+STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{  // decodes an 8-bit image from the current position of f; f stays open and owned by the caller
+   unsigned char *result;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+   if (result) {
+      // need to 'unget' all the characters in the IO buffer
+      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); // back up by the bytes buffered but not consumed
+   }
+   return result;
+}
+
+STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp)
+{  // 16-bit counterpart of stbi_load_from_file; f stays open and owned by the caller
+   stbi__uint16 *result;
+   stbi__context s;
+   stbi__start_file(&s,f);
+   result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp);
+   if (result) {
+      // need to 'unget' all the characters in the IO buffer
+      fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); // back up by the bytes buffered but not consumed
+   }
+   return result;
+}
+
+STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp)
+{  // opens filename in binary mode, decodes it with stbi_load_from_file_16, and closes the file again
+   FILE *f = stbi__fopen(filename, "rb");
+   stbi__uint16 *result;
+   if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); // NULL, with the failure reason recorded
+   result = stbi_load_from_file_16(f,x,y,comp,req_comp);
+   fclose(f); // closed whether or not decoding succeeded
+   return result;
+}
+
+
+#endif //!STBI_NO_STDIO
+
+STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{  // decodes an 8-bit image from the len bytes at buffer
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{  // decodes an 8-bit image from data supplied by the clbk callbacks; user is handed back to each of them
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); // const is cast away; the callee copies *clbk
+   return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_LINEAR
+static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp)
+{
+   // Float decode: HDR data is returned as loaded (flipped if requested); anything else is loaded as 8-bit and run through stbi__ldr_to_hdr.
+   unsigned char *ldr;
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_test(s)) {
+      stbi__result_info info;
+      float *pixels = stbi__hdr_load(s,x,y,comp,req_comp, &info);
+      if (pixels != NULL) stbi__float_postprocess(pixels,x,y,comp,req_comp);
+      return pixels;
+   }
+   #endif
+   ldr = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp);
+   if (ldr == NULL)   // records "unknown image type" over whatever reason the 8-bit path stored
+      return stbi__errpf("unknown image type", "Image not of any known type, or corrupt");
+   return stbi__ldr_to_hdr(ldr, *x, *y, req_comp ? req_comp : *comp);
+}
+
+STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp)
+{  // decodes a float image from the len bytes at buffer
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+
+STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp)
+{  // decodes a float image from data supplied by the clbk callbacks; user is handed back to each of them
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); // const is cast away; the callee copies *clbk
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+
+#ifndef STBI_NO_STDIO
+STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp)
+{  // opens filename in binary mode, decodes it with stbi_loadf_from_file, and closes the file again
+   float *result;
+   FILE *f = stbi__fopen(filename, "rb");
+   if (!f) return stbi__errpf("can't fopen", "Unable to open file"); // NULL, with the failure reason recorded
+   result = stbi_loadf_from_file(f,x,y,comp,req_comp);
+   fclose(f); // closed whether or not decoding succeeded
+   return result;
+}
+
+STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp)
+{  // decodes a float image from f; unlike stbi_load_from_file, no fseek is done afterwards to give back buffered bytes
+   stbi__context s;
+   stbi__start_file(&s,f);
+   return stbi__loadf_main(&s,x,y,comp,req_comp);
+}
+#endif // !STBI_NO_STDIO
+
+#endif // !STBI_NO_LINEAR
+
+// these is-hdr-or-not queries are defined independent of whether STBI_NO_HDR
+// is defined, for API simplicity; if STBI_NO_HDR is defined, they always
+// report false!
+
+STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len)
+{  // returns the result of stbi__hdr_test on the len bytes at buffer; always 0 when built with STBI_NO_HDR
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_mem(&s,buffer,len);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(buffer);
+   STBI_NOTUSED(len);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_STDIO
+STBIDEF int      stbi_is_hdr          (char const *filename)
+{  // opens filename and asks stbi_is_hdr_from_file; a file that cannot be opened yields 0
+   FILE *f = stbi__fopen(filename, "rb");
+   int result=0;
+   if (f) {
+      result = stbi_is_hdr_from_file(f);
+      fclose(f);
+   }
+   return result;
+}
+
+STBIDEF int      stbi_is_hdr_from_file(FILE *f)
+{  // returns the result of stbi__hdr_test on f; always 0 when built with STBI_NO_HDR. The file position is not restored here.
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_file(&s,f);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(f);
+   return 0;
+   #endif
+}
+#endif // !STBI_NO_STDIO
+
+STBIDEF int      stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user)
+{  // returns the result of stbi__hdr_test on callback-supplied data; always 0 when built with STBI_NO_HDR
+   #ifndef STBI_NO_HDR
+   stbi__context s;
+   stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user);
+   return stbi__hdr_test(&s);
+   #else
+   STBI_NOTUSED(clbk);
+   STBI_NOTUSED(user);
+   return 0;
+   #endif
+}
+
+#ifndef STBI_NO_LINEAR
+static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; // defaults; NOTE(review): presumably read by stbi__ldr_to_hdr, which is outside this excerpt
+
+STBIDEF void   stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } // stored as given
+STBIDEF void   stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } // stored as given
+#endif
+
+static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; // kept as reciprocals (the _i suffix): 1/gamma and 1/scale
+
+STBIDEF void   stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } // stores the reciprocal; gamma is not checked for 0
+STBIDEF void   stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } // stores the reciprocal; scale is not checked for 0
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Common code used by all image loaders
+//
+
+enum
+{  // NOTE(review): presumably selects how far a loader proceeds (full load / type check / header only) — confirm at the use sites
+   STBI__SCAN_load=0,
+   STBI__SCAN_type,   // 1
+   STBI__SCAN_header  // 2
+};
+
+static void stbi__refill_buffer(stbi__context *s)
+{
+   // Pulls the next chunk (s->buflen bytes requested) from the read callback into s->buffer_start and resets the cursor to it.
+   int got = (s->io.read)(s->io_user_data, (char *) s->buffer_start, s->buflen);
+   s->img_buffer = s->buffer_start;
+   if (got != 0) {
+      s->img_buffer_end = s->buffer_start + got;
+      return;
+   }
+   // Nothing came back: stop asking the callbacks, and leave one readable
+   // zero byte so the cursor still points at valid memory (e.g. 0-byte file).
+   s->read_from_callbacks = 0;
+   s->buffer_start[0] = 0;
+   s->img_buffer_end = s->buffer_start + 1;
+}
+
+stbi_inline static stbi_uc stbi__get8(stbi__context *s)
+{
+   // Next byte of the stream; 0 once a memory stream (or a drained callback stream) has nothing left.
+   if (s->img_buffer >= s->img_buffer_end) {
+      if (!s->read_from_callbacks)
+         return 0;
+      stbi__refill_buffer(s);   // on a 0-byte read it plants a zero byte, so the read below stays valid
+   }
+   return *s->img_buffer++;
+}
+
+stbi_inline static int stbi__at_eof(stbi__context *s)
+{
+   if (s->io.read != NULL) {
+      int source_done = (s->io.eof)(s->io_user_data);
+      if (!source_done) return 0;   // the source itself has more to give
+      // a refill already came back empty, so only its placeholder zero byte is left
+      if (!s->read_from_callbacks) return 1;
+   }
+   // memory stream, or source at EOF with bytes still buffered: done once the cursor reaches the end
+   return s->img_buffer >= s->img_buffer_end;
+}
+
+static void stbi__skip(stbi__context *s, int n)
+{
+   // Advances the stream by n bytes. A negative n moves the cursor to the
+   // end of the currently available bytes instead.
+   int buffered = (int) (s->img_buffer_end - s->img_buffer);
+   if (n < 0) {
+      s->img_buffer = s->img_buffer_end;
+   } else if (s->io.read != NULL && buffered < n) {
+      // callback stream and the skip reaches past the buffer: use up the
+      // buffer, then have the source skip the remainder
+      s->img_buffer = s->img_buffer_end;
+      (s->io.skip)(s->io_user_data, n - buffered);
+   } else {
+      s->img_buffer += n;
+   }
+}
+
+static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n)
+{
+   // Copies exactly n bytes of the stream into buffer; returns 1 on success and 0 when
+   // fewer than n bytes could be supplied (or n is negative).
+   int blen = (int) (s->img_buffer_end - s->img_buffer); // bytes still buffered
+   if (n < 0) return 0; // a negative count would turn into a huge size_t inside memcpy
+   if (s->io.read && blen < n) {
+      // drain what is buffered, then read the remainder straight into the caller's buffer
+      int res, count;
+      memcpy(buffer, s->img_buffer, blen);
+      count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen);
+      res = (count == (n-blen));
+      s->img_buffer = s->img_buffer_end;
+      return res;
+   }
+
+   // compare byte counts instead of forming s->img_buffer+n: that pointer is
+   // undefined behaviour as soon as it lies past the end of the buffer
+   if (n > blen) return 0;
+   memcpy(buffer, s->img_buffer, n);
+   s->img_buffer += n;
+   return 1;
+}
+
+static int stbi__get16be(stbi__context *s)
+{
+   int value = stbi__get8(s) << 8;   // first byte in the stream is the high byte
+   return value + stbi__get8(s);     // second byte is the low byte
+}
+
+static stbi__uint32 stbi__get32be(stbi__context *s)
+{
+   stbi__uint32 value = (stbi__uint32) stbi__get16be(s) << 16;   // first 16 bits read form the high half
+   return value + stbi__get16be(s);                              // next 16 bits form the low half
+}
+
+#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF)
+// nothing
+#else
+static int stbi__get16le(stbi__context *s)
+{
+   int low = stbi__get8(s);              // first byte in the stream is the low byte
+   return (stbi__get8(s) << 8) + low;    // second byte is the high byte
+}
+#endif
+
+#ifndef STBI_NO_BMP
+static stbi__uint32 stbi__get32le(stbi__context *s)
+{
+   stbi__uint32 z = stbi__get16le(s);                     // low half is read first
+   return z + ((stbi__uint32) stbi__get16le(s) << 16);    // widen before shifting: int << 16 overflows for values >= 0x8000
+}
+#endif
+
+#define STBI__BYTECAST(x)  ((stbi_uc) ((x) & 255))  // truncate int to byte without warnings: mask to the low 8 bits, then cast
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  generic converter from built-in img_n to req_comp
+//    individual types do this automatically as much as possible (e.g. jpeg
+//    does all cases internally since it needs to colorspace convert anyway,
+//    and it never has alpha, so very few cases ). png can automatically
+//    interleave an alpha=255 channel, but falls back to this for other cases
+//
+//  assume data buffer is malloced, so malloc a new one and free that one
+//  only failure mode is malloc failing
+
+static stbi_uc stbi__compute_y(int r, int g, int b)
+{
+   return (stbi_uc) ((77*r + 150*g + 29*b) >> 8); // integer luma: the weights sum to 256, so >> 8 brings the result back to input range
+}
+
+static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{  // Converts an x*y image from img_n to req_comp 8-bit components per pixel; data is freed unless it is returned unchanged.
+   int i,j;
+   unsigned char *good;
+
+   if (req_comp == img_n) return data; // nothing to convert: the input buffer is handed back as-is
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+
+   good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0); // req_comp*x*y bytes; NULL if that size is invalid or malloc fails
+   if (good == NULL) {
+      STBI_FREE(data); // the input is released on the failure path too
+      return stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      unsigned char *src  = data + j * x * img_n   ;
+      unsigned char *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // each STBI__CASE(a,b) is a case label plus a loop over the x pixels of this row, stepping src by a and dest by b;
+      // the switch runs once per scanline (not per pixel), keyed on the (img_n, req_comp) pair packed by STBI__COMBO
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0], dest[1]=255;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=255;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                  } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                  } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1];                  } break;
+         STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=255;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = 255;    } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]), dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2];                    } break;
+         default: STBI_ASSERT(0); // any other (img_n, req_comp) pair only asserts; good is left unwritten for this row
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+
+static stbi__uint16 stbi__compute_y_16(int r, int g, int b)
+{
+   return (stbi__uint16) ((77*r + 150*g + 29*b) >> 8); // same 77/150/29 (sum 256) luma weights as the 8-bit version
+}
+
+static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y)
+{
+   int i,j;
+   stbi__uint16 *good;
+   // converts x*y pixels of img_n 16-bit components into a new buffer of req_comp components; frees 'data' when it does
+   if (req_comp == img_n) return data; // already in the requested layout: hand the input back untouched
+   STBI_ASSERT(req_comp >= 1 && req_comp <= 4);
+   // overflow-checked allocation (2 bytes per component): a wrapped product would give an undersized buffer
+   good = (stbi__uint16 *) stbi__malloc_mad4(req_comp, x, y, 2, 0);
+   if (good == NULL) {
+      STBI_FREE(data);
+      return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory");
+   }
+
+   for (j=0; j < (int) y; ++j) {
+      stbi__uint16 *src  = data + j * x * img_n   ;
+      stbi__uint16 *dest = good + j * x * req_comp;
+
+      #define STBI__COMBO(a,b)  ((a)*8+(b))
+      #define STBI__CASE(a,b)   case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b)
+      // convert source image with img_n components to one with req_comp components;
+      // avoid switch per pixel, so use switch per scanline and massive macros
+      switch (STBI__COMBO(img_n, req_comp)) {
+         STBI__CASE(1,2) { dest[0]=src[0], dest[1]=0xffff;                                     } break;
+         STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=0xffff;                     } break;
+         STBI__CASE(2,1) { dest[0]=src[0];                                                     } break;
+         STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0];                                     } break;
+         STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0], dest[3]=src[1];                     } break;
+         STBI__CASE(3,4) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2],dest[3]=0xffff;        } break;
+         STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]), dest[1] = 0xffff; } break;
+         STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]);                   } break;
+         STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]), dest[1] = src[3]; } break;
+         STBI__CASE(4,3) { dest[0]=src[0],dest[1]=src[1],dest[2]=src[2];                       } break;
+         default: STBI_ASSERT(0); // unsupported img_n/req_comp pair; in non-assert builds this row is left unwritten
+      }
+      #undef STBI__CASE
+   }
+
+   STBI_FREE(data);
+   return good;
+}
+
+#ifndef STBI_NO_LINEAR
+static float   *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp)
+{
+   int px, ch, color_n;
+   float *output;
+   if (data == NULL) return NULL;
+   output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0);
+   if (!output) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); }
+   // with an even component count the last channel is not gamma-expanded, only rescaled to 0..1
+   color_n = (comp & 1) ? comp : comp-1;
+   for (px=0; px < x*y; ++px) {
+      const stbi_uc *src = data + px*comp;
+      float *dst = output + px*comp;
+      for (ch=0; ch < color_n; ++ch) dst[ch] = (float) (pow(src[ch]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale);
+      if (ch < comp) dst[ch] = src[ch]/255.0f;
+   }
+   STBI_FREE(data);
+   return output;
+}
+#endif
+
+#ifndef STBI_NO_HDR
+#define stbi__float2int(x)   ((int) (x))
+static stbi_uc *stbi__hdr_to_ldr(float   *data, int x, int y, int comp)
+{
+   int px, ch, color_n;
+   stbi_uc *output;
+   if (data == NULL) return NULL;
+   output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0);
+   if (!output) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); }
+   // with an even component count the last channel skips the scale/gamma step
+   color_n = (comp & 1) ? comp : comp-1;
+   for (px=0; px < x*y; ++px) {
+      const float *src = data + px*comp;
+      stbi_uc *dst = output + px*comp;
+      for (ch=0; ch < comp; ++ch) {
+         float z;
+         if (ch < color_n) // color channel: inverse scale and gamma, then map to 0..255 with rounding
+            z = (float) pow(src[ch]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f;
+         else              // trailing channel: plain linear map to 0..255 with rounding
+            z = src[ch] * 255 + 0.5f;
+         if (z < 0) z = 0;
+         if (z > 255) z = 255;
+         dst[ch] = (stbi_uc) stbi__float2int(z);
+      }
+   }
+   STBI_FREE(data);
+   return output;
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+//  "baseline" JPEG/JFIF decoder
+//
+//    simple implementation
+//      - doesn't support delayed output of y-dimension
+//      - simple interface (only one output format: 8-bit interleaved RGB)
+//      - doesn't try to recover corrupt jpegs
+//      - doesn't allow partial loading, loading multiple at once
+//      - still fast on x86 (copying globals into locals doesn't help x86)
+//      - allocates lots of intermediate memory (full size of all components)
+//        - non-interleaved case requires this anyway
+//        - allows good upsampling (see next)
+//    high-quality
+//      - upsampled channels are bilinearly interpolated, even across blocks
+//      - quality integer IDCT derived from IJG's 'slow'
+//    performance
+//      - fast huffman; reasonable integer IDCT
+//      - some SIMD kernels for common paths on targets with SSE2/NEON
+//      - uses a lot of intermediate memory, could cache poorly
+
+#ifndef STBI_NO_JPEG
+
+// huffman decoding acceleration
+#define FAST_BITS   9  // larger handles more cases; smaller stomps less cache
+
+typedef struct
+{
+   stbi_uc  fast[1 << FAST_BITS]; // symbol index for each FAST_BITS-bit prefix; 255 flags "not accelerated"
+   // weirdly, repacking this into AoS is a 10% speed loss, instead of a win
+   stbi__uint16 code[256]; // huffman code assigned to each symbol index
+   stbi_uc  values[256]; // decoded value per symbol index (not written by stbi__build_huffman)
+   stbi_uc  size[257]; // code length in bits per symbol index; a 0 entry terminates the list
+   unsigned int maxcode[18]; // per code length: largest code + 1, preshifted to 16 bits; entry 17 is a 0xffffffff sentinel
+   int    delta[17];   // old 'firstsymbol' - old 'firstcode'
+} stbi__huffman;
+
+typedef struct
+{
+   stbi__context *s;
+   stbi__huffman huff_dc[4];
+   stbi__huffman huff_ac[4];
+   stbi__uint16 dequant[4][64];
+   stbi__int16 fast_ac[4][1 << FAST_BITS]; // tables in the format produced by stbi__build_fast_ac
+
+// sizes for components, interleaved MCUs
+   int img_h_max, img_v_max;
+   int img_mcu_x, img_mcu_y;
+   int img_mcu_w, img_mcu_h;
+
+// definition of jpeg image component
+   struct
+   {
+      int id;
+      int h,v;
+      int tq;
+      int hd,ha;
+      int dc_pred; // running DC value: each decoded DC difference is added to it
+
+      int x,y,w2,h2;
+      stbi_uc *data;
+      void *raw_data, *raw_coeff;
+      stbi_uc *linebuf;
+      short   *coeff;   // progressive only
+      int      coeff_w, coeff_h; // number of 8x8 coefficient blocks
+   } img_comp[4];
+
+   stbi__uint32   code_buffer; // jpeg entropy-coded buffer
+   int            code_bits;   // number of valid bits
+   unsigned char  marker;      // marker seen while filling entropy buffer
+   int            nomore;      // flag if we saw a marker so must stop
+
+   int            progressive;
+   int            spec_start; // first coefficient index handled by the progressive AC decoder
+   int            spec_end;   // last coefficient index handled by the progressive AC decoder
+   int            succ_high;  // 0 selects the first-pass path, non-zero the refinement path
+   int            succ_low;   // bit position applied to values decoded in a progressive pass
+   int            eob_run;    // while non-zero, progressive AC blocks only decrement it (first pass) or refine existing values
+   int            jfif;
+   int            app14_color_transform; // Adobe APP14 tag
+   int            rgb;
+
+   int scan_n, order[4];
+   int restart_interval, todo;
+
+// kernels
+   void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]);
+   void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step);
+   stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs);
+} stbi__jpeg;
+
+static int stbi__build_huffman(stbi__huffman *h, int *count)
+{
+   int i,j,k=0,code=0;
+   // build size list for each symbol (from JPEG spec): count[i] codes of length i+1, at most 256 in total
+   for (i=0; i < 16; ++i)
+      for (j=0; j < count[i]; ++j) {
+         if (k >= 256) return stbi__err("bad size list","Corrupt JPEG"); // code[]/values[] hold 256 symbols; size[256] is the terminator
+         h->size[k++] = (stbi_uc) (i+1);
+      }
+   h->size[k] = 0;
+
+   // compute actual symbols (from jpeg spec)
+   for(j=1, k=0; j <= 16; ++j) {
+      // compute delta to add to code to compute symbol id
+      h->delta[j] = k - code;
+      if (h->size[k] == j) {
+         while (h->size[k] == j)
+            h->code[k++] = (stbi__uint16) (code++);
+         if (code-1 >= (1 << j)) return stbi__err("bad code lengths","Corrupt JPEG");
+      }
+      // compute largest code + 1 for this size, preshifted as needed later
+      h->maxcode[j] = code << (16-j);
+      code <<= 1;
+   }
+   h->maxcode[j] = 0xffffffff; // j == 17 here: sentinel that always ends the length search in the decoder
+
+   // build non-spec acceleration table; 255 is flag for not-accelerated
+   memset(h->fast, 255, 1 << FAST_BITS);
+   for (i=0; i < k; ++i) {
+      int s = h->size[i];
+      if (s <= FAST_BITS) {
+         // every FAST_BITS-bit pattern that starts with this code maps to symbol index i
+         int c = h->code[i] << (FAST_BITS-s);
+         int m = 1 << (FAST_BITS-s);
+         for (j=0; j < m; ++j) {
+            h->fast[c+j] = (stbi_uc) i;
+         }
+      }
+   }
+   return 1; // success; the error paths above return the stbi__err() result instead
+}
+
+// build a table that decodes both magnitude and value of small ACs in
+// one go. An entry of 0 means "no fast decode for this bit pattern".
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h)
+{
+   int i;
+   for (i=0; i < (1 << FAST_BITS); ++i) {
+      stbi_uc fast = h->fast[i];
+      fast_ac[i] = 0;
+      if (fast < 255) {
+         int rs = h->values[fast];
+         int run = (rs >> 4) & 15;   // high nibble of the decoded value
+         int magbits = rs & 15;      // low nibble: number of value bits that follow the code
+         int len = h->size[fast];    // length of the huffman code itself
+
+         if (magbits && len + magbits <= FAST_BITS) {
+            // magnitude code followed by receive_extend code
+            int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits);
+            int m = 1 << (magbits - 1);
+            if (k < m) k += (~0U << magbits) + 1; // top bit clear: adds -(1 << magbits) + 1, giving a negative value
+            // if the result is small enough, we can fit it in fast_ac table
+            if (k >= -128 && k <= 127)
+               fast_ac[i] = (stbi__int16) ((k << 8) + (run << 4) + (len + magbits)); // value | run | total bits consumed
+         }
+      }
+   }
+}
+
+static void stbi__grow_buffer_unsafe(stbi__jpeg *j)
+{
+   do { // append one byte per iteration until more than 24 bits are buffered
+      unsigned int b = j->nomore ? 0 : stbi__get8(j->s); // unsigned: a byte shifted into bit 31 must not overflow a signed int; zeros are fed once a marker was seen
+      if (b == 0xff) {
+         int c = stbi__get8(j->s);
+         while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes
+         if (c != 0) { // 0xff + non-zero is a marker; 0xff 0x00 falls through and the 0xff is buffered as data
+            j->marker = (unsigned char) c;
+            j->nomore = 1;
+            return;
+         }
+      }
+      j->code_buffer |= b << (24 - j->code_bits);
+      j->code_bits += 8;
+   } while (j->code_bits <= 24);
+}
+
+// stbi__bmask[n] = (1 << n) - 1, a mask of the low n bits; read-only, hence const
+static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535};
+
+// decode a jpeg huffman value from the bitstream; returns h->values[] of the matched code, or -1 on failure
+stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h)
+{
+   unsigned int temp;
+   int c,k;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   // look at the top FAST_BITS and determine what symbol ID it is,
+   // if the code is <= FAST_BITS
+   c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+   k = h->fast[c];
+   if (k < 255) {
+      int s = h->size[k];
+      if (s > j->code_bits) // the matched code needs more bits than are actually buffered
+         return -1;
+      j->code_buffer <<= s;
+      j->code_bits -= s;
+      return h->values[k];
+   }
+
+   // naive test is to shift the code_buffer down so k bits are
+   // valid, then test against maxcode. To speed this up, we've
+   // preshifted maxcode left so that it has (16-k) 0s at the
+   // end; in other words, regardless of the number of bits, it
+   // wants to be compared against something shifted to have 16;
+   // that way we don't need to shift inside the loop.
+   temp = j->code_buffer >> 16;
+   for (k=FAST_BITS+1 ; ; ++k)
+      if (temp < h->maxcode[k])
+         break;
+   if (k == 17) {
+      // error! code not found
+      j->code_bits -= 16;
+      return -1;
+   }
+
+   if (k > j->code_bits)
+      return -1;
+
+   // convert the huffman code to the symbol id
+   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   if (c < 0 || c >= 256) return -1; // symbol id outside the 256-entry tables: treat as a bad code
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// bias[n] = (-1<<n) + 1; stbi__extend_receive adds it to an n-bit field whose leading bit is 0
+static int const stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (n < 0 || n > 15 || j->code_bits < n) return 0; // bad bit count (stbi__jbias has 16 entries) or stream exhausted: yield 0
+   sgn = (stbi__int32)j->code_buffer >> 31; // sign bit is always in MSB
+   k = stbi_lrot(j->code_buffer, n);
+   STBI_ASSERT(n >= 0 && n < (int) (sizeof(stbi__bmask)/sizeof(*stbi__bmask)));
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k + (stbi__jbias[n] & ~sgn); // the bias is applied only when the leading bit was 0
+}
+
+// get some unsigned bits: returns the next n bits of the stream, or 0 when they cannot be supplied
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // refill stopped at a marker: report 0 rather than driving code_bits negative
+   k = stbi_lrot(j->code_buffer, n); // rotate the next n bits down into the low end
+   j->code_buffer = k & ~stbi__bmask[n];
+   j->code_bits -= n;
+   return k & stbi__bmask[n];
+}
+
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int k;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // refill stopped at a marker: report a 0 bit rather than driving code_bits negative
+   k = j->code_buffer;
+   j->code_buffer <<= 1; --j->code_bits; // consume the bit
+   return k & 0x80000000; // non-zero when the consumed bit was set
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major? (read-only, hence const)
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block: DC difference, then ACs, dequantized into row-major 'data'; returns 1, or the stbi__err() result
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG"); // t becomes a bit count indexing 16-entry tables
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ? stbi__extend_receive(j, t) : 0;
+   dc = j->img_comp[b].dc_pred + diff;
+   j->img_comp[b].dc_pred = dc;
+   data[0] = (short) (dc * dequant[0]);
+
+   // decode AC components, see JPEG spec
+   k = 1;
+   do {
+      unsigned int zig;
+      int c,r,s;
+      if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+      c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+      r = fac[c];
+      if (r) { // fast-AC path
+         k += (r >> 4) & 15; // run
+         s = r & 15; // combined length
+         j->code_buffer <<= s;
+         j->code_bits -= s;
+         // decode into unzigzag'd location
+         zig = stbi__jpeg_dezigzag[k++];
+         data[zig] = (short) ((r >> 8) * dequant[zig]);
+      } else {
+         int rs = stbi__jpeg_huff_decode(j, hac);
+         if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+         s = rs & 15;
+         r = rs >> 4;
+         if (s == 0) {
+            if (rs != 0xf0) break; // end block
+            k += 16;
+         } else {
+            k += r;
+            // decode into unzigzag'd location
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]);
+         }
+      }
+   } while (k < 64);
+   return 1;
+}
+
+static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b)
+{
+   int diff,dc;
+   int t;
+   if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+
+   if (j->succ_high == 0) {
+      // first scan for DC coefficient, must be first
+      memset(data,0,64*sizeof(data[0])); // 0 all the ac values now
+      t = stbi__jpeg_huff_decode(j, hdc);
+      if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG"); // decode failure, or a bit count too large for the 16-entry tables
+      diff = t ? stbi__extend_receive(j, t) : 0;
+      dc = j->img_comp[b].dc_pred + diff;
+      j->img_comp[b].dc_pred = dc;
+      data[0] = (short) (dc * (1 << j->succ_low)); // multiply, not shift: << on a negative dc is undefined
+   } else {
+      // refinement scan for DC coefficient
+      if (stbi__jpeg_get_bit(j))
+         data[0] += (short) (1 << j->succ_low);
+   }
+   return 1;
+}
+
+// @OPTIMIZE: store non-zigzagged during the decode passes,
+// and only de-zigzag when dequantizing
+static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac)
+{
+   int k;
+   if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG");
+
+   if (j->succ_high == 0) {
+      int shift = j->succ_low;
+
+      if (j->eob_run) { // inside an end-of-band run: this block contributes nothing in this pass
+         --j->eob_run;
+         return 1;
+      }
+
+      k = j->spec_start;
+      do {
+         unsigned int zig;
+         int c,r,s;
+         if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+         c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1);
+         r = fac[c];
+         if (r) { // fast-AC path
+            k += (r >> 4) & 15; // run
+            s = r & 15; // combined length
+            j->code_buffer <<= s;
+            j->code_bits -= s;
+            zig = stbi__jpeg_dezigzag[k++];
+            data[zig] = (short) ((r >> 8) * (1 << shift)); // multiply, not shift: << on a negative value is undefined
+         } else {
+            int rs = stbi__jpeg_huff_decode(j, hac);
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r);
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  --j->eob_run;
+                  break;
+               }
+               k += 16;
+            } else {
+               k += r;
+               zig = stbi__jpeg_dezigzag[k++];
+               data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); // multiply for the same reason as above
+            }
+         }
+      } while (k <= j->spec_end);
+   } else {
+      // refinement scan for these AC coefficients
+
+      short bit = (short) (1 << j->succ_low);
+
+      if (j->eob_run) {
+         --j->eob_run;
+         for (k = j->spec_start; k <= j->spec_end; ++k) {
+            short *p = &data[stbi__jpeg_dezigzag[k]];
+            if (*p != 0)
+               if (stbi__jpeg_get_bit(j))
+                  if ((*p & bit)==0) {
+                     if (*p > 0)
+                        *p += bit;
+                     else
+                        *p -= bit;
+                  }
+         }
+      } else {
+         k = j->spec_start;
+         do {
+            int r,s;
+            int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh
+            if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG");
+            s = rs & 15;
+            r = rs >> 4;
+            if (s == 0) {
+               if (r < 15) {
+                  j->eob_run = (1 << r) - 1;
+                  if (r)
+                     j->eob_run += stbi__jpeg_get_bits(j, r);
+                  r = 64; // force end of block
+               } else {
+                  // r=15 s=0 should write 16 0s, so we just do
+                  // a run of 15 0s and then write s (which is 0),
+                  // so we don't have to do anything special here
+               }
+            } else {
+               if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG");
+               // sign bit
+               if (stbi__jpeg_get_bit(j))
+                  s = bit;
+               else
+                  s = -bit;
+            }
+
+            // advance by r
+            while (k <= j->spec_end) {
+               short *p = &data[stbi__jpeg_dezigzag[k++]];
+               if (*p != 0) {
+                  if (stbi__jpeg_get_bit(j))
+                     if ((*p & bit)==0) {
+                        if (*p > 0)
+                           *p += bit;
+                        else
+                           *p -= bit;
+                     }
+               } else {
+                  if (r == 0) {
+                     *p = (short) s;
+                     break;
+                  }
+                  --r;
+               }
+            }
+         } while (k <= j->spec_end);
+      }
+   }
+   return 1;
+}
+
+// saturate an int to the 0..255 range of stbi_uc
+stbi_inline static stbi_uc stbi__clamp(int x)
+{
+   // a single unsigned compare accepts exactly the in-range values
+   if ((unsigned int) x <= 255)
+      return (stbi_uc) x;
+   if (x < 0)
+      return 0;
+   return 255;
+}
+
+#define stbi__f2f(x)  ((int) (((x) * 4096 + 0.5))) // float constant -> fixed point with 12 fractional bits
+#define stbi__fsh(x)  ((x) * 4096) // same scale for integers; a multiply because << on a negative value is undefined
+
+// derived from jidctint -- DCT_ISLOW; declares its temporaries in the enclosing scope and leaves results in x0..x3 (from s0,s2,s4,s6) and t0..t3 (from s1,s3,s5,s7)
+#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \
+   int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \
+   p2 = s2;                                    \
+   p3 = s6;                                    \
+   p1 = (p2+p3) * stbi__f2f(0.5411961f);       \
+   t2 = p1 + p3*stbi__f2f(-1.847759065f);      \
+   t3 = p1 + p2*stbi__f2f( 0.765366865f);      \
+   p2 = s0;                                    \
+   p3 = s4;                                    \
+   t0 = stbi__fsh(p2+p3);                      \
+   t1 = stbi__fsh(p2-p3);                      \
+   x0 = t0+t3;                                 \
+   x3 = t0-t3;                                 \
+   x1 = t1+t2;                                 \
+   x2 = t1-t2;                                 \
+   t0 = s7;                                    \
+   t1 = s5;                                    \
+   t2 = s3;                                    \
+   t3 = s1;                                    \
+   p3 = t0+t2;                                 \
+   p4 = t1+t3;                                 \
+   p1 = t0+t3;                                 \
+   p2 = t1+t2;                                 \
+   p5 = (p3+p4)*stbi__f2f( 1.175875602f);      \
+   t0 = t0*stbi__f2f( 0.298631336f);           \
+   t1 = t1*stbi__f2f( 2.053119869f);           \
+   t2 = t2*stbi__f2f( 3.072711026f);           \
+   t3 = t3*stbi__f2f( 1.501321110f);           \
+   p1 = p5 + p1*stbi__f2f(-0.899976223f);      \
+   p2 = p5 + p2*stbi__f2f(-2.562915447f);      \
+   p3 = p3*stbi__f2f(-1.961570560f);           \
+   p4 = p4*stbi__f2f(-0.390180644f);           \
+   t3 += p1+p4;                                \
+   t2 += p2+p3;                                \
+   t1 += p2+p4;                                \
+   t0 += p1+p3;
+
+static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64])
+{
+   int i,val[64],*v=val;
+   stbi_uc *o;
+   short *d = data;
+
+   // columns: transform the 8 columns of 'data' into 'val'
+   for (i=0; i < 8; ++i,++d, ++v) {
+      // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing
+      if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0
+           && d[40]==0 && d[48]==0 && d[56]==0) {
+         //    no shortcut                 0     seconds
+         //    (1|2|3|4|5|6|7)==0          0     seconds
+         //    all separate               -0.047 seconds
+         //    1 && 2|3 && 4|5 && 6|7:    -0.047 seconds
+         int dcterm = d[0]*4; // multiply, not shift: << on a negative coefficient is undefined
+         v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm;
+      } else {
+         STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56])
+         // constants scaled things up by 1<<12; let's bring them back
+         // down, but keep 2 extra bits of precision
+         x0 += 512; x1 += 512; x2 += 512; x3 += 512;
+         v[ 0] = (x0+t3) >> 10;
+         v[56] = (x0-t3) >> 10;
+         v[ 8] = (x1+t2) >> 10;
+         v[48] = (x1-t2) >> 10;
+         v[16] = (x2+t1) >> 10;
+         v[40] = (x2-t1) >> 10;
+         v[24] = (x3+t0) >> 10;
+         v[32] = (x3-t0) >> 10;
+      }
+   }
+
+   for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { // rows: one 8-byte output row per pass, out_stride bytes apart
+      // no fast case since the first 1D IDCT spread components out
+      STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7])
+      // constants scaled things up by 1<<12, plus we had 1<<2 from first
+      // loop, plus horizontal and vertical each scale by sqrt(8) so together
+      // we've got an extra 1<<3, so 1<<17 total we need to remove.
+      // so we want to round that, which means adding 0.5 * 1<<17,
+      // aka 65536. Also, we'll end up with -128 to 127 that we want
+      // to encode as 0..255 by adding 128, so we'll add that before the shift
+      x0 += 65536 + (128<<17);
+      x1 += 65536 + (128<<17);
+      x2 += 65536 + (128<<17);
+      x3 += 65536 + (128<<17);
+      // tried computing the shifts into temps, or'ing the temps to see
+      // if any were out of range, but that was slower
+      o[0] = stbi__clamp((x0+t3) >> 17);
+      o[7] = stbi__clamp((x0-t3) >> 17);
+      o[1] = stbi__clamp((x1+t2) >> 17);
+      o[6] = stbi__clamp((x1-t2) >> 17);
+      o[2] = stbi__clamp((x2+t1) >> 17);
+      o[5] = stbi__clamp((x2-t1) >> 17);
+      o[3] = stbi__clamp((x3+t0) >> 17);
+      o[4] = stbi__clamp((x3-t0) >> 17);
+   }
+}
+
+#ifdef STBI_SSE2
+// sse2 integer IDCT. not the fastest possible implementation but it produces bit-identical results to
+// the generic C version so it's fully "transparent". 'data' is read with _mm_load_si128, so it must be
+// 16-byte aligned; eight rows of 8 bytes each are written to 'out', out_stride bytes apart.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   // This is constructed to match our regular (generic) integer IDCT exactly.
+   __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+   __m128i tmp;
+
+   // dot product constant: even elems=x, odd elems=y
+   #define dct_const(x,y)  _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y))
+
+   // out(0) = c0[even]*x + c0[odd]*y   (c0, x, y 16-bit, out 32-bit)
+   // out(1) = c1[even]*x + c1[odd]*y
+   #define dct_rot(out0,out1, x,y,c0,c1) \
+      __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \
+      __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \
+      __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \
+      __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \
+      __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \
+      __m128i out1##_h = _mm_madd_epi16(c0##hi, c1)
+
+   // out = in << 12  (in 16-bit, out 32-bit)
+   #define dct_widen(out, in) \
+      __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \
+      __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4)
+
+   // wide add
+   #define dct_wadd(out, a, b) \
+      __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_add_epi32(a##_h, b##_h)
+
+   // wide sub
+   #define dct_wsub(out, a, b) \
+      __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \
+      __m128i out##_h = _mm_sub_epi32(a##_h, b##_h)
+
+   // butterfly a/b, add bias, then shift by "s" and pack
+   #define dct_bfly32o(out0, out1, a,b,bias,s) \
+      { \
+         __m128i abiased_l = _mm_add_epi32(a##_l, bias); \
+         __m128i abiased_h = _mm_add_epi32(a##_h, bias); \
+         dct_wadd(sum, abiased, b); \
+         dct_wsub(dif, abiased, b); \
+         out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \
+         out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \
+      }
+
+   // 8-bit interleave step (for transposes)
+   #define dct_interleave8(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi8(a, b); \
+      b = _mm_unpackhi_epi8(tmp, b)
+
+   // 16-bit interleave step (for transposes)
+   #define dct_interleave16(a, b) \
+      tmp = a; \
+      a = _mm_unpacklo_epi16(a, b); \
+      b = _mm_unpackhi_epi16(tmp, b)
+
+   #define dct_pass(bias,shift) \
+      { \
+         /* even part */ \
+         dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \
+         __m128i sum04 = _mm_add_epi16(row0, row4); \
+         __m128i dif04 = _mm_sub_epi16(row0, row4); \
+         dct_widen(t0e, sum04); \
+         dct_widen(t1e, dif04); \
+         dct_wadd(x0, t0e, t3e); \
+         dct_wsub(x3, t0e, t3e); \
+         dct_wadd(x1, t1e, t2e); \
+         dct_wsub(x2, t1e, t2e); \
+         /* odd part */ \
+         dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \
+         dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \
+         __m128i sum17 = _mm_add_epi16(row1, row7); \
+         __m128i sum35 = _mm_add_epi16(row3, row5); \
+         dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \
+         dct_wadd(x4, y0o, y4o); \
+         dct_wadd(x5, y1o, y5o); \
+         dct_wadd(x6, y2o, y5o); \
+         dct_wadd(x7, y3o, y4o); \
+         dct_bfly32o(row0,row7, x0,x7,bias,shift); \
+         dct_bfly32o(row1,row6, x1,x6,bias,shift); \
+         dct_bfly32o(row2,row5, x2,x5,bias,shift); \
+         dct_bfly32o(row3,row4, x3,x4,bias,shift); \
+      }
+
+   __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f));
+   __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f));
+   __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f));
+   __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f));
+   __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f));
+   __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f));
+   __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f));
+   __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f));
+
+   // rounding biases in column/row passes, see stbi__idct_block for explanation.
+   __m128i bias_0 = _mm_set1_epi32(512);
+   __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17));
+
+   // load
+   row0 = _mm_load_si128((const __m128i *) (data + 0*8));
+   row1 = _mm_load_si128((const __m128i *) (data + 1*8));
+   row2 = _mm_load_si128((const __m128i *) (data + 2*8));
+   row3 = _mm_load_si128((const __m128i *) (data + 3*8));
+   row4 = _mm_load_si128((const __m128i *) (data + 4*8));
+   row5 = _mm_load_si128((const __m128i *) (data + 5*8));
+   row6 = _mm_load_si128((const __m128i *) (data + 6*8));
+   row7 = _mm_load_si128((const __m128i *) (data + 7*8));
+
+   // column pass
+   dct_pass(bias_0, 10);
+
+   {
+      // 16bit 8x8 transpose pass 1
+      dct_interleave16(row0, row4);
+      dct_interleave16(row1, row5);
+      dct_interleave16(row2, row6);
+      dct_interleave16(row3, row7);
+
+      // transpose pass 2
+      dct_interleave16(row0, row2);
+      dct_interleave16(row1, row3);
+      dct_interleave16(row4, row6);
+      dct_interleave16(row5, row7);
+
+      // transpose pass 3
+      dct_interleave16(row0, row1);
+      dct_interleave16(row2, row3);
+      dct_interleave16(row4, row5);
+      dct_interleave16(row6, row7);
+   }
+
+   // row pass
+   dct_pass(bias_1, 17);
+
+   {
+      // pack
+      __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7
+      __m128i p1 = _mm_packus_epi16(row2, row3);
+      __m128i p2 = _mm_packus_epi16(row4, row5);
+      __m128i p3 = _mm_packus_epi16(row6, row7);
+
+      // 8bit 8x8 transpose pass 1
+      dct_interleave8(p0, p2); // a0e0a1e1...
+      dct_interleave8(p1, p3); // c0g0c1g1...
+
+      // transpose pass 2
+      dct_interleave8(p0, p1); // a0c0e0g0...
+      dct_interleave8(p2, p3); // b0d0f0h0...
+
+      // transpose pass 3
+      dct_interleave8(p0, p2); // a0b0c0d0...
+      dct_interleave8(p1, p3); // a4b4c4d4...
+
+      // store
+      _mm_storel_epi64((__m128i *) out, p0); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p2); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p1); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, p3); out += out_stride;
+      _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e));
+   }
+
+#undef dct_const
+#undef dct_rot
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_interleave8
+#undef dct_interleave16
+#undef dct_pass
+}
+
+#endif // STBI_SSE2
+
+#ifdef STBI_NEON
+
+// NEON integer IDCT. reads data[] as eight rows of 8 coefficients and writes an 8x8 block of
+// bytes to out, one row per out_stride. should produce bit-identical results to the generic C version.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   int16x8_t row0, row1, row2, row3, row4, row5, row6, row7;
+
+   int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); // multiplier constants, each broadcast to a 4-lane vector
+   int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f));
+   int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f));
+   int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f));
+   int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f));
+   int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f));
+   int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f));
+   int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f));
+   int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f));
+   int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f));
+   int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f));
+   int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f));
+
+#define dct_long_mul(out, inq, coeff) \
+   int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff)
+
+#define dct_long_mac(out, acc, inq, coeff) \
+   int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \
+   int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff)
+
+#define dct_widen(out, inq) \
+   int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \
+   int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12)
+
+// wide add
+#define dct_wadd(out, a, b) \
+   int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vaddq_s32(a##_h, b##_h)
+
+// wide sub
+#define dct_wsub(out, a, b) \
+   int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \
+   int32x4_t out##_h = vsubq_s32(a##_h, b##_h)
+
+// butterfly a/b, then shift using "shiftop" by "s" and pack
+#define dct_bfly32o(out0,out1, a,b,shiftop,s) \
+   { \
+      dct_wadd(sum, a, b); \
+      dct_wsub(dif, a, b); \
+      out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \
+      out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \
+   }
+
+#define dct_pass(shiftop, shift) \
+   { \
+      /* even part */ \
+      int16x8_t sum26 = vaddq_s16(row2, row6); \
+      dct_long_mul(p1e, sum26, rot0_0); \
+      dct_long_mac(t2e, p1e, row6, rot0_1); \
+      dct_long_mac(t3e, p1e, row2, rot0_2); \
+      int16x8_t sum04 = vaddq_s16(row0, row4); \
+      int16x8_t dif04 = vsubq_s16(row0, row4); \
+      dct_widen(t0e, sum04); \
+      dct_widen(t1e, dif04); \
+      dct_wadd(x0, t0e, t3e); \
+      dct_wsub(x3, t0e, t3e); \
+      dct_wadd(x1, t1e, t2e); \
+      dct_wsub(x2, t1e, t2e); \
+      /* odd part */ \
+      int16x8_t sum15 = vaddq_s16(row1, row5); \
+      int16x8_t sum17 = vaddq_s16(row1, row7); \
+      int16x8_t sum35 = vaddq_s16(row3, row5); \
+      int16x8_t sum37 = vaddq_s16(row3, row7); \
+      int16x8_t sumodd = vaddq_s16(sum17, sum35); \
+      dct_long_mul(p5o, sumodd, rot1_0); \
+      dct_long_mac(p1o, p5o, sum17, rot1_1); \
+      dct_long_mac(p2o, p5o, sum35, rot1_2); \
+      dct_long_mul(p3o, sum37, rot2_0); \
+      dct_long_mul(p4o, sum15, rot2_1); \
+      dct_wadd(sump13o, p1o, p3o); \
+      dct_wadd(sump24o, p2o, p4o); \
+      dct_wadd(sump23o, p2o, p3o); \
+      dct_wadd(sump14o, p1o, p4o); \
+      dct_long_mac(x4, sump13o, row7, rot3_0); \
+      dct_long_mac(x5, sump24o, row5, rot3_1); \
+      dct_long_mac(x6, sump23o, row3, rot3_2); \
+      dct_long_mac(x7, sump14o, row1, rot3_3); \
+      dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \
+      dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \
+      dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \
+      dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \
+   }
+
+   // load the eight rows (8 shorts each) of the coefficient block
+   row0 = vld1q_s16(data + 0*8);
+   row1 = vld1q_s16(data + 1*8);
+   row2 = vld1q_s16(data + 2*8);
+   row3 = vld1q_s16(data + 3*8);
+   row4 = vld1q_s16(data + 4*8);
+   row5 = vld1q_s16(data + 5*8);
+   row6 = vld1q_s16(data + 6*8);
+   row7 = vld1q_s16(data + 7*8);
+
+   // add DC bias (1024, lane 0 of row0 only)
+   row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0));
+
+   // column pass (results narrowed back to 16 bits with a rounding shift by 10)
+   dct_pass(vrshrn_n_s32, 10);
+
+   // 16bit 8x8 transpose
+   {
+// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively.
+// whether compilers actually get this is another story, sadly.
+#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); }
+#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); }
+
+      // pass 1
+      dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6
+      dct_trn16(row2, row3);
+      dct_trn16(row4, row5);
+      dct_trn16(row6, row7);
+
+      // pass 2
+      dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4
+      dct_trn32(row1, row3);
+      dct_trn32(row4, row6);
+      dct_trn32(row5, row7);
+
+      // pass 3
+      dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0
+      dct_trn64(row1, row5);
+      dct_trn64(row2, row6);
+      dct_trn64(row3, row7);
+
+#undef dct_trn16
+#undef dct_trn32
+#undef dct_trn64
+   }
+
+   // row pass
+   // vrshrn_n_s32 only supports shifts up to 16, we need
+   // 17. so do a non-rounding shift of 16 first then follow
+   // up with a rounding shift by 1.
+   dct_pass(vshrn_n_s32, 16);
+
+   {
+      // pack and round (rounding shift by 1, saturating to unsigned 8 bits)
+      uint8x8_t p0 = vqrshrun_n_s16(row0, 1);
+      uint8x8_t p1 = vqrshrun_n_s16(row1, 1);
+      uint8x8_t p2 = vqrshrun_n_s16(row2, 1);
+      uint8x8_t p3 = vqrshrun_n_s16(row3, 1);
+      uint8x8_t p4 = vqrshrun_n_s16(row4, 1);
+      uint8x8_t p5 = vqrshrun_n_s16(row5, 1);
+      uint8x8_t p6 = vqrshrun_n_s16(row6, 1);
+      uint8x8_t p7 = vqrshrun_n_s16(row7, 1);
+
+      // again, these can translate into one instruction, but often don't.
+#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; }
+#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); }
+#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); }
+
+      // sadly can't use interleaved stores here since we only write
+      // 8 bytes to each scan line!
+
+      // 8x8 8-bit transpose pass 1
+      dct_trn8_8(p0, p1);
+      dct_trn8_8(p2, p3);
+      dct_trn8_8(p4, p5);
+      dct_trn8_8(p6, p7);
+
+      // pass 2
+      dct_trn8_16(p0, p2);
+      dct_trn8_16(p1, p3);
+      dct_trn8_16(p4, p6);
+      dct_trn8_16(p5, p7);
+
+      // pass 3
+      dct_trn8_32(p0, p4);
+      dct_trn8_32(p1, p5);
+      dct_trn8_32(p2, p6);
+      dct_trn8_32(p3, p7);
+
+      // store: 8 bytes per scan line, scan lines out_stride apart
+      vst1_u8(out, p0); out += out_stride;
+      vst1_u8(out, p1); out += out_stride;
+      vst1_u8(out, p2); out += out_stride;
+      vst1_u8(out, p3); out += out_stride;
+      vst1_u8(out, p4); out += out_stride;
+      vst1_u8(out, p5); out += out_stride;
+      vst1_u8(out, p6); out += out_stride;
+      vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+   }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
+}
+
+#endif // STBI_NEON
+
+#define STBI__MARKER_none  0xff
+// if there's a pending marker from the entropy stream, return that
+// otherwise, fetch from the stream and get a marker. if there's no
+// marker, return 0xff, which is never a valid marker value
+static stbi_uc stbi__get_marker(stbi__jpeg *j)
+{
+   stbi_uc code;
+   if (j->marker != STBI__MARKER_none) { // a marker is already cached: hand it out exactly once
+      code = j->marker; j->marker = STBI__MARKER_none; return code;
+   }
+   if (stbi__get8(j->s) != 0xff) return STBI__MARKER_none; // markers always begin with 0xff
+   do { code = stbi__get8(j->s); } while (code == 0xff); // skip repeated 0xff fill bytes
+   return code;
+}
+
+// in each scan, we'll have scan_n components, and the order
+// of the components is specified by order[]
+#define STBI__RESTART(x)     ((x) >= 0xd0 && (x) <= 0xd7)
+
+// after a restart interval, stbi__jpeg_reset the entropy decoder and
+// the dc prediction
+static void stbi__jpeg_reset(stbi__jpeg *j)
+{
+   int c;
+   j->code_buffer = 0;
+   j->code_bits = 0;
+   j->nomore = 0;
+   for (c = 0; c < 4; ++c) j->img_comp[c].dc_pred = 0; // DC prediction starts over at 0
+   j->marker = STBI__MARKER_none;
+   j->eob_run = 0;
+   // with no restart_interval, count down from 0x7fffffff MCUs instead; that's plenty
+   // safe, since we don't even allow 1<<30 pixels
+   j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
+}
+
+static int stbi__parse_entropy_coded_data(stbi__jpeg *z) // decodes one scan; returns 0 if a block fails to decode, else 1
+{
+   stbi__jpeg_reset(z); // every scan starts with an empty bit buffer and zeroed DC predictors
+   if (!z->progressive) { // baseline: each block is decoded and run through the IDCT right away
+      if (z->scan_n == 1) {
+         int i,j;
+         STBI_SIMD_ALIGN(short, data[64]);
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               int ha = z->img_comp[n].ha;
+               if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+               z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); // NOTE(review): presumably refills the bit buffer so a pending marker lands in z->marker - confirm
+                  // if it's NOT a restart, then just bail, so we get corrupt data
+                  // rather than no data
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved
+         int i,j,k,x,y;
+         STBI_SIMD_ALIGN(short, data[64]);
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x)*8; // pixel position of this block in the component plane
+                        int y2 = (j*z->img_comp[n].v + y)*8;
+                        int ha = z->img_comp[n].ha;
+                        if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+                        z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data);
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   } else { // progressive: blocks accumulate in img_comp[].coeff and no IDCT is run in this function
+      if (z->scan_n == 1) {
+         int i,j;
+         int n = z->order[0];
+         // non-interleaved data, we just need to process one block at a time,
+         // in trivial scanline order
+         // number of blocks to do just depends on how many actual "pixels" this
+         // component has, independent of interleaved MCU blocking and such
+         int w = (z->img_comp[n].x+7) >> 3;
+         int h = (z->img_comp[n].y+7) >> 3;
+         for (j=0; j < h; ++j) {
+            for (i=0; i < w; ++i) {
+               short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w);
+               if (z->spec_start == 0) { // spectral selection starting at 0: use the DC decoder
+                  if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                     return 0;
+               } else {
+                  int ha = z->img_comp[n].ha;
+                  if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha]))
+                     return 0;
+               }
+               // every data block is an MCU, so countdown the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      } else { // interleaved (only the DC decoder is used on this path)
+         int i,j,k,x,y;
+         for (j=0; j < z->img_mcu_y; ++j) {
+            for (i=0; i < z->img_mcu_x; ++i) {
+               // scan an interleaved mcu... process scan_n components in order
+               for (k=0; k < z->scan_n; ++k) {
+                  int n = z->order[k];
+                  // scan out an mcu's worth of this component; that's just determined
+                  // by the basic H and V specified for the component
+                  for (y=0; y < z->img_comp[n].v; ++y) {
+                     for (x=0; x < z->img_comp[n].h; ++x) {
+                        int x2 = (i*z->img_comp[n].h + x); // block (not pixel) coordinates here
+                        int y2 = (j*z->img_comp[n].v + y);
+                        short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w);
+                        if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n))
+                           return 0;
+                     }
+                  }
+               }
+               // after all interleaved components, that's an interleaved MCU,
+               // so now count down the restart interval
+               if (--z->todo <= 0) {
+                  if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+                  if (!STBI__RESTART(z->marker)) return 1;
+                  stbi__jpeg_reset(z);
+               }
+            }
+         }
+         return 1;
+      }
+   }
+}
+
+static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant)
+{
+   short *end = data + 64;
+   // scale each of the 64 coefficients in place by the matching table entry
+   for (; data != end; ++data, ++dequant) *data *= *dequant;
+}
+
+static void stbi__jpeg_finish(stbi__jpeg *z)
+{
+   int bx,by,c;
+   if (!z->progressive) return; // only progressive images have stored coefficients to process
+   // dequantize every stored coefficient block and run it through the IDCT
+   for (c=0; c < z->s->img_n; ++c) {
+      int blocks_w = (z->img_comp[c].x+7) >> 3;
+      int blocks_h = (z->img_comp[c].y+7) >> 3;
+      for (by=0; by < blocks_h; ++by) {
+         for (bx=0; bx < blocks_w; ++bx) {
+            short *block = z->img_comp[c].coeff + 64 * (bx + by * z->img_comp[c].coeff_w);
+            stbi_uc *dst = z->img_comp[c].data + z->img_comp[c].w2*by*8 + bx*8;
+            stbi__jpeg_dequantize(block, z->dequant[z->img_comp[c].tq]);
+            z->idct_block_kernel(dst, z->img_comp[c].w2, block);
+         }
+      }
+   }
+}
+
+static int stbi__process_marker(stbi__jpeg *z, int m) // parses one DRI/DQT/DHT/APPn/COM segment; returns 1 if ok, 0 if not
+{
+   int L; // bytes of segment payload not yet consumed
+   switch (m) {
+      case STBI__MARKER_none: // no marker found
+         return stbi__err("expected marker","Corrupt JPEG");
+
+      case 0xDD: // DRI - specify restart interval
+         if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG");
+         z->restart_interval = stbi__get16be(z->s);
+         return 1;
+
+      case 0xDB: // DQT - define quantization table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            int q = stbi__get8(z->s);
+            int p = q >> 4, sixteen = (p != 0); // high nibble: 0 = 8-bit entries, 1 = 16-bit entries
+            int t = q & 15,i; // low nibble: destination table index
+            if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG");
+            if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG");
+
+            for (i=0; i < 64; ++i) // entry i of the segment is stored at index stbi__jpeg_dezigzag[i]
+               z->dequant[t][stbi__jpeg_dezigzag[i]] = sixteen ? stbi__get16be(z->s) : stbi__get8(z->s);
+            L -= (sixteen ? 129 : 65);
+         }
+         return L==0; // several tables may share one segment, but the length must come out exact
+
+      case 0xC4: // DHT - define huffman table
+         L = stbi__get16be(z->s)-2;
+         while (L > 0) {
+            stbi_uc *v;
+            int sizes[16],i,n=0;
+            int q = stbi__get8(z->s);
+            int tc = q >> 4, th = q & 15; // tc picks huff_dc (0) or huff_ac (1); th is the table index
+            if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG");
+            for (i=0; i < 16; ++i) {
+               sizes[i] = stbi__get8(z->s);
+               n += sizes[i]; // n = total number of symbol bytes that follow
+            }
+            if (n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // symbols are single bytes: >256 is corrupt and must not reach the table fill
+            L -= 17;
+            if (tc == 0) {
+               if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0;
+               v = z->huff_dc[th].values;
+            } else {
+               if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0;
+               v = z->huff_ac[th].values;
+            }
+            for (i=0; i < n; ++i)
+               v[i] = stbi__get8(z->s);
+            if (tc != 0)
+               stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th);
+            L -= n;
+         }
+         return L==0;
+   }
+
+   // check for comment block or APP blocks
+   if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) {
+      L = stbi__get16be(z->s);
+      if (L < 2) { // the length field counts its own two bytes
+         if (m == 0xFE)
+            return stbi__err("bad COM len","Corrupt JPEG");
+         else
+            return stbi__err("bad APP len","Corrupt JPEG");
+      }
+      L -= 2;
+
+      if (m == 0xE0 && L >= 5) { // JFIF APP0 segment
+         static const unsigned char tag[5] = {'J','F','I','F','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 5; ++i) // always read all 5 bytes so L stays in step with the stream
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 5;
+         if (ok)
+            z->jfif = 1;
+      } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment
+         static const unsigned char tag[6] = {'A','d','o','b','e','\0'};
+         int ok = 1;
+         int i;
+         for (i=0; i < 6; ++i)
+            if (stbi__get8(z->s) != tag[i])
+               ok = 0;
+         L -= 6;
+         if (ok) {
+            stbi__get8(z->s); // version
+            stbi__get16be(z->s); // flags0
+            stbi__get16be(z->s); // flags1
+            z->app14_color_transform = stbi__get8(z->s); // color transform
+            L -= 6;
+         }
+      }
+
+      stbi__skip(z->s, L); // whatever is left of the segment is ignored
+      return 1;
+   }
+
+   return stbi__err("unknown marker","Corrupt JPEG");
+}
+
+// after we see SOS: parse the scan header into scan_n, order[], the huffman table picks and the spectral/successive fields
+static int stbi__process_scan_header(stbi__jpeg *z)
+{
+   int i;
+   int Ls = stbi__get16be(z->s);
+   z->scan_n = stbi__get8(z->s);
+   if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG");
+   if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG");
+   for (i=0; i < z->scan_n; ++i) {
+      int id = stbi__get8(z->s), which;
+      int q = stbi__get8(z->s);
+      for (which = 0; which < z->s->img_n; ++which) // find the frame component carrying this id
+         if (z->img_comp[which].id == id)
+            break;
+      if (which == z->s->img_n) return stbi__err("bad SOS component","Corrupt JPEG"); // no match: fail with a reason like every other path
+      z->img_comp[which].hd = q >> 4;   if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG");
+      z->img_comp[which].ha = q & 15;   if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG");
+      z->order[i] = which;
+   }
+
+   {
+      int aa;
+      z->spec_start = stbi__get8(z->s);
+      z->spec_end   = stbi__get8(z->s); // should be 63, but might be 0
+      aa = stbi__get8(z->s);
+      z->succ_high = (aa >> 4);
+      z->succ_low  = (aa & 15);
+      if (z->progressive) {
+         if (z->spec_start > 63 || z->spec_end > 63  || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13)
+            return stbi__err("bad SOS", "Corrupt JPEG");
+      } else {
+         if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG");
+         z->spec_end = 63; // non-progressive scans always cover all 64 coefficients
+      }
+   }
+
+   return 1;
+}
+
+static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why)
+{
+   int c;
+   // release the buffers of the first ncomp components; 'why' is handed back unchanged
+   for (c=0; c < ncomp; ++c) {
+      // sample plane ('data' is only an aligned pointer into raw_data)
+      if (z->img_comp[c].raw_data != NULL) {
+         STBI_FREE(z->img_comp[c].raw_data);
+         z->img_comp[c].data = NULL;
+         z->img_comp[c].raw_data = NULL;
+      }
+      // coefficient storage ('coeff' is an aligned pointer into raw_coeff)
+      if (z->img_comp[c].raw_coeff != NULL) {
+         STBI_FREE(z->img_comp[c].raw_coeff);
+         z->img_comp[c].coeff = 0;
+         z->img_comp[c].raw_coeff = 0;
+      }
+      if (z->img_comp[c].linebuf != NULL) { STBI_FREE(z->img_comp[c].linebuf); z->img_comp[c].linebuf = NULL; }
+   }
+   return why;
+}
+
+static int stbi__process_frame_header(stbi__jpeg *z, int scan) // parses SOF; for STBI__SCAN_load also allocates component buffers. returns 1/0
+{
+   stbi__context *s = z->s;
+   int Lf,p,i,q, h_max=1,v_max=1,c;
+   Lf = stbi__get16be(s);         if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG
+   p  = stbi__get8(s);            if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline
+   s->img_y = stbi__get16be(s);   if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG
+   s->img_x = stbi__get16be(s);   if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires
+   c = stbi__get8(s);
+   if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG");
+   s->img_n = c;
+   for (i=0; i < c; ++i) {
+      z->img_comp[i].data = NULL;
+      z->img_comp[i].linebuf = NULL;
+   }
+
+   if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG");
+
+   z->rgb = 0;
+   for (i=0; i < s->img_n; ++i) {
+      static const unsigned char rgb[3] = { 'R', 'G', 'B' }; // read-only lookup table
+      z->img_comp[i].id = stbi__get8(s);
+      if (s->img_n == 3 && z->img_comp[i].id == rgb[i])
+         ++z->rgb; // counts components whose id is the matching letter of "RGB"
+      q = stbi__get8(s);
+      z->img_comp[i].h = (q >> 4);  if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG");
+      z->img_comp[i].v = q & 15;    if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG");
+      z->img_comp[i].tq = stbi__get8(s);  if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG");
+   }
+
+   if (scan != STBI__SCAN_load) return 1; // any other scan mode stops here, before anything is allocated
+
+   if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode");
+
+   for (i=0; i < s->img_n; ++i) {
+      if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h;
+      if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v;
+   }
+
+   // compute interleaved mcu info
+   z->img_h_max = h_max;
+   z->img_v_max = v_max;
+   z->img_mcu_w = h_max * 8;
+   z->img_mcu_h = v_max * 8;
+   // these sizes can't be more than 17 bits
+   z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w;
+   z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h;
+
+   for (i=0; i < s->img_n; ++i) {
+      // number of effective pixels (e.g. for non-interleaved MCU)
+      z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max;
+      z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max;
+      // to simplify generation, we'll allocate enough memory to decode
+      // the bogus oversized data from using interleaved MCUs and their
+      // big blocks (e.g. a 16x16 iMCU on an image of width 33); we won't
+      // discard the extra data until colorspace conversion
+      //
+      // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier)
+      // so these muls can't overflow with 32-bit ints (which we require)
+      z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8;
+      z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8;
+      z->img_comp[i].coeff = 0;
+      z->img_comp[i].raw_coeff = 0;
+      z->img_comp[i].linebuf = NULL;
+      z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); // 15 spare bytes leave room to align
+      if (z->img_comp[i].raw_data == NULL)
+         return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
+      // align blocks for idct using mmx/sse
+      z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15);
+      if (z->progressive) {
+         // w2, h2 are multiples of 8 (see above)
+         z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8;
+         z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8;
+         z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15);
+         if (z->img_comp[i].raw_coeff == NULL)
+            return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory"));
+         z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15);
+      }
+   }
+
+   return 1;
+}
+
+// use comparisons since in some cases we handle more than one case (e.g. SOF)
+#define stbi__DNL(x)         ((x) == 0xdc)
+#define stbi__SOI(x)         ((x) == 0xd8)
+#define stbi__EOI(x)         ((x) == 0xd9)
+#define stbi__SOF(x)         ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2)
+#define stbi__SOS(x)         ((x) == 0xda)
+
+#define stbi__SOF_progressive(x)   ((x) == 0xc2)
+
+static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan)
+{
+   int marker;
+   z->marker = STBI__MARKER_none; // initialize cached marker to empty
+   z->app14_color_transform = -1; // valid values are 0,1,2
+   z->jfif = 0;
+   marker = stbi__get_marker(z);
+   if (!stbi__SOI(marker)) return stbi__err("no SOI","Corrupt JPEG");
+   if (scan == STBI__SCAN_type) return 1; // a type probe needs nothing beyond the SOI check
+   // process every segment that comes before the frame header
+   for (marker = stbi__get_marker(z); !stbi__SOF(marker); ) {
+      if (!stbi__process_marker(z,marker)) return 0;
+      // some files have extra padding after their blocks, so keep scanning until a marker or EOF
+      for (;;) {
+         marker = stbi__get_marker(z);
+         if (marker != STBI__MARKER_none) break;
+         if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG");
+      }
+   }
+   // of the accepted SOF markers, 0xc2 is the progressive one
+   z->progressive = stbi__SOF_progressive(marker);
+   return stbi__process_frame_header(z, scan) ? 1 : 0;
+}
+
+// decode image to YCbCr format; returns 1 on success, 0 on failure
+static int stbi__decode_jpeg_image(stbi__jpeg *j)
+{
+   int m;
+   for (m = 0; m < 4; m++) { // start with no component buffers allocated
+      j->img_comp[m].raw_data = NULL;
+      j->img_comp[m].raw_coeff = NULL;
+   }
+   j->restart_interval = 0;
+   if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0;
+   m = stbi__get_marker(j);
+   while (!stbi__EOI(m)) { // handle segments until the end-of-image marker
+      if (stbi__SOS(m)) {
+         if (!stbi__process_scan_header(j)) return 0;
+         if (!stbi__parse_entropy_coded_data(j)) return 0;
+         if (j->marker == STBI__MARKER_none ) {
+            // handle 0s at the end of image data from IP Kamera 9060
+            while (!stbi__at_eof(j->s)) {
+               int x = stbi__get8(j->s);
+               if (x == 255) {
+                  j->marker = stbi__get8(j->s);
+                  break;
+               }
+            }
+            // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0
+         }
+      } else if (stbi__DNL(m)) {
+         int Ld = stbi__get16be(j->s);
+         stbi__uint32 NL = stbi__get16be(j->s);
+         if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); // fail instead of decoding on after the error
+         if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); // DNL must agree with the SOF height
+      } else {
+         if (!stbi__process_marker(j, m)) return 0;
+      }
+      m = stbi__get_marker(j);
+   }
+   if (j->progressive)
+      stbi__jpeg_finish(j); // progressive data is dequantized and IDCT'd only now
+   return 1;
+}
+
+// static jfif-centered resampling (across block boundaries)
+
+typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1,
+                                    int w, int hs);
+
+#define stbi__div4(x) ((stbi_uc) ((x) >> 2))
+
+static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // no resampling: the near row is returned as is and 'out' is never written
+   STBI_NOTUSED(hs);
+   STBI_NOTUSED(w);
+   STBI_NOTUSED(in_far); STBI_NOTUSED(out);
+   return in_near;
+}
+
+static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   int col = 0;
+   STBI_NOTUSED(hs);
+   // vertical 2x: blend 3 parts of the near row with 1 part of the far row, rounded
+   for (; col < w; ++col)
+      out[col] = stbi__div4(in_far[col] + 3*in_near[col] + 2);
+   return out;
+}
+
+static stbi_uc*  stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // horizontal 2x: every input sample produces two outputs, each weighted
+   // 3:1 towards the input sample it sits closer to; only in_near is read
+   stbi_uc *src = in_near;
+   int k;
+   STBI_NOTUSED(in_far);
+   STBI_NOTUSED(hs);
+   // a lone sample has no neighbour to blend with, so it is simply duplicated
+   if (w == 1) {
+      out[0] = out[1] = src[0];
+      return out;
+   }
+   // left edge: the first output is the first input, unfiltered
+   out[0] = src[0];
+   out[1] = stbi__div4(src[0]*3 + src[1] + 2);
+   // interior: blend with the left neighbour, then with the right one
+   for (k=1; k < w-1; ++k) {
+      int base = 3*src[k]+2;
+      out[k*2+0] = stbi__div4(base+src[k-1]);
+      out[k*2+1] = stbi__div4(base+src[k+1]);
+   }
+   // right edge (k == w-1 here when w >= 2): mirror image of the left edge
+   out[k*2+0] = stbi__div4(src[w-2]*3 + src[w-1] + 2);
+   out[k*2+1] = src[w-1];
+   return out;
+}
+
+#define stbi__div16(x) ((stbi_uc) ((x) >> 4))
+
+static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // 2x in both directions: the two rows are mixed 3:1 (near:far) first, then
+   // adjacent columns of that mix are blended 3:1 again and divided by 16
+   int x,prev,cur;
+   STBI_NOTUSED(hs);
+   cur = 3*in_near[0] + in_far[0];
+   if (w == 1) {
+      // single column: only the vertical blend applies
+      out[0] = out[1] = stbi__div4(cur + 2);
+      return out;
+   }
+
+   out[0] = stbi__div4(cur+2); // left edge has no left neighbour
+   for (x=1; x < w; ++x) {
+      prev = cur;
+      cur = 3*in_near[x]+in_far[x];
+      out[x*2-1] = stbi__div16(3*prev + cur + 8);
+      out[x*2  ] = stbi__div16(3*cur + prev + 8);
+   }
+   out[w*2-1] = stbi__div4(cur+2); // right edge has no right neighbour
+   return out;
+}
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // SIMD 2x2 upsampler: writes 2*w pixels to out, blending in_near (weight 3) with in_far (weight 1) vertically, then 3:1 horizontally; hs is unused
+   int i=0,t0,t1;
+
+   if (w == 1) { // single column: no horizontal neighbour, so only the vertical blend applies
+      out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2);
+      return out;
+   }
+
+   t1 = 3*in_near[0] + in_far[0]; // vertically filtered pixel 0, still scaled by 4
+   // process groups of 8 pixels for as long as we can.
+   // note we can't handle the last pixel in a row in this loop
+   // because we need to handle the filter boundary conditions.
+   for (; i < ((w-1) & ~7); i += 8) {
+#if defined(STBI_SSE2)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      __m128i zero  = _mm_setzero_si128();
+      __m128i farb  = _mm_loadl_epi64((__m128i *) (in_far + i));
+      __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i));
+      __m128i farw  = _mm_unpacklo_epi8(farb, zero);
+      __m128i nearw = _mm_unpacklo_epi8(nearb, zero);
+      __m128i diff  = _mm_sub_epi16(farw, nearw);
+      __m128i nears = _mm_slli_epi16(nearw, 2);
+      __m128i curr  = _mm_add_epi16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      __m128i prv0 = _mm_slli_si128(curr, 2);
+      __m128i nxt0 = _mm_srli_si128(curr, 2);
+      __m128i prev = _mm_insert_epi16(prv0, t1, 0);
+      __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      __m128i bias  = _mm_set1_epi16(8);
+      __m128i curs = _mm_slli_epi16(curr, 2);
+      __m128i prvd = _mm_sub_epi16(prev, curr);
+      __m128i nxtd = _mm_sub_epi16(next, curr);
+      __m128i curb = _mm_add_epi16(curs, bias);
+      __m128i even = _mm_add_epi16(prvd, curb);
+      __m128i odd  = _mm_add_epi16(nxtd, curb);
+
+      // interleave even and odd pixels, then undo scaling.
+      __m128i int0 = _mm_unpacklo_epi16(even, odd);
+      __m128i int1 = _mm_unpackhi_epi16(even, odd);
+      __m128i de0  = _mm_srli_epi16(int0, 4);
+      __m128i de1  = _mm_srli_epi16(int1, 4);
+
+      // pack and write output
+      __m128i outv = _mm_packus_epi16(de0, de1);
+      _mm_storeu_si128((__m128i *) (out + i*2), outv);
+#elif defined(STBI_NEON)
+      // load and perform the vertical filtering pass
+      // this uses 3*x + y = 4*x + (y - x)
+      uint8x8_t farb  = vld1_u8(in_far + i);
+      uint8x8_t nearb = vld1_u8(in_near + i);
+      int16x8_t diff  = vreinterpretq_s16_u16(vsubl_u8(farb, nearb));
+      int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2));
+      int16x8_t curr  = vaddq_s16(nears, diff); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of 8 pixels added in.
+      int16x8_t prv0 = vextq_s16(curr, curr, 7);
+      int16x8_t nxt0 = vextq_s16(curr, curr, 1);
+      int16x8_t prev = vsetq_lane_s16(t1, prv0, 0);
+      int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev = cur*4 + (prev - cur)
+      // odd  pixels = 3*cur + next = cur*4 + (next - cur)
+      // note the shared term.
+      int16x8_t curs = vshlq_n_s16(curr, 2);
+      int16x8_t prvd = vsubq_s16(prev, curr);
+      int16x8_t nxtd = vsubq_s16(next, curr);
+      int16x8_t even = vaddq_s16(curs, prvd);
+      int16x8_t odd  = vaddq_s16(curs, nxtd);
+
+      // undo scaling and round, then store with even/odd phases interleaved
+      // (no explicit +8 bias in this branch, unlike the SSE2 one: the narrowing shift does the rounding)
+      uint8x8x2_t o;
+      o.val[0] = vqrshrun_n_s16(even, 4);
+      o.val[1] = vqrshrun_n_s16(odd,  4);
+      vst2_u8(out + i*2, o);
+#endif
+
+      // "previous" value for next iter
+      t1 = 3*in_near[i+7] + in_far[i+7];
+   }
+
+   t0 = t1;
+   t1 = 3*in_near[i] + in_far[i];
+   out[i*2] = stbi__div16(3*t1 + t0 + 8); // even sample of pixel i only; the odd sample to its left was written by the SIMD loop, if it ran
+
+   for (++i; i < w; ++i) { // scalar tail: each step emits the odd sample of pixel i-1 and the even sample of pixel i
+      t0 = t1;
+      t1 = 3*in_near[i]+in_far[i];
+      out[i*2-1] = stbi__div16(3*t0 + t1 + 8);
+      out[i*2  ] = stbi__div16(3*t1 + t0 + 8);
+   }
+   out[w*2-1] = stbi__div4(t1+2); // right edge: last odd sample has no right-hand neighbour
+
+   STBI_NOTUSED(hs);
+
+   return out;
+}
+#endif
+
+static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
+{
+   // nearest-neighbor upsampling: each of the w source pixels is repeated hs times; in_far is ignored
+   int src, rep;
+   stbi_uc *dst = out;
+   STBI_NOTUSED(in_far);
+   for (src = 0; src < w; ++src)
+      for (rep = 0; rep < hs; ++rep) *dst++ = in_near[src];
+   return out;
+}
+
+// this is a reduced-precision calculation of YCbCr-to-RGB introduced
+// to make sure the code produces the same results in both SIMD and scalar
+#define stbi__float2fixed(x)  (((int) ((x) * 4096.0f + 0.5f)) << 8)
+static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step)
+{
+   // scalar YCbCr -> RGB in 20-bit fixed point; stores R,G,B,255 at out[0..3] for each pixel, then advances out by step
+   int px;
+   for (px = 0; px < count; ++px, out += step) {
+      int luma = (y[px] << 20) + (1<<19); // half a unit pre-added so the final >>20 rounds
+      int cr = pcr[px] - 128;
+      int cb = pcb[px] - 128;
+      int red, green, blue;
+      red   = luma +  cr* stbi__float2fixed(1.40200f);
+      green = luma + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      blue  = luma +  cb* stbi__float2fixed(1.77200f);
+      red   >>= 20;
+      green >>= 20;
+      blue  >>= 20;
+      red   = red   < 0 ? 0 : (red   > 255 ? 255 : red);   // clamp to 0..255
+      green = green < 0 ? 0 : (green > 255 ? 255 : green);
+      blue  = blue  < 0 ? 0 : (blue  > 255 ? 255 : blue);
+      out[0] = (stbi_uc)red;
+      out[1] = (stbi_uc)green;
+      out[2] = (stbi_uc)blue;
+      out[3] = 255;
+   }
+}
+
+#if defined(STBI_SSE2) || defined(STBI_NEON)
+static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
+{
+   int i = 0; // pixels converted so far; shared by the SIMD loops and the scalar tail at the bottom
+
+#ifdef STBI_SSE2
+   // step == 3 is pretty ugly on the final interleave, and i'm not convinced
+   // it's useful in practice (you wouldn't use it for textures, for example).
+   // so just accelerate step == 4 case.
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      __m128i signflip  = _mm_set1_epi8(-0x80);
+      __m128i cr_const0 = _mm_set1_epi16(   (short) ( 1.40200f*4096.0f+0.5f));
+      __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f));
+      __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f));
+      __m128i cb_const1 = _mm_set1_epi16(   (short) ( 1.77200f*4096.0f+0.5f));
+      __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128);
+      __m128i xw = _mm_set1_epi16(255); // alpha channel
+
+      for (; i+7 < count; i += 8) { // 8 pixels in, 32 output bytes out per iteration
+         // load
+         __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i));
+         __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i));
+         __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i));
+         __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128
+         __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128
+
+         // unpack to short (and left-shift cr, cb by 8)
+         __m128i yw  = _mm_unpacklo_epi8(y_bias, y_bytes);
+         __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased);
+         __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased);
+
+         // color transform
+         __m128i yws = _mm_srli_epi16(yw, 4);
+         __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw);
+         __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw);
+         __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1);
+         __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1);
+         __m128i rws = _mm_add_epi16(cr0, yws);
+         __m128i gwt = _mm_add_epi16(cb0, yws);
+         __m128i bws = _mm_add_epi16(yws, cb1);
+         __m128i gws = _mm_add_epi16(gwt, cr1);
+
+         // descale
+         __m128i rw = _mm_srai_epi16(rws, 4);
+         __m128i bw = _mm_srai_epi16(bws, 4);
+         __m128i gw = _mm_srai_epi16(gws, 4);
+
+         // back to byte, set up for transpose
+         __m128i brb = _mm_packus_epi16(rw, bw);
+         __m128i gxb = _mm_packus_epi16(gw, xw);
+
+         // transpose to interleave channels
+         __m128i t0 = _mm_unpacklo_epi8(brb, gxb);
+         __m128i t1 = _mm_unpackhi_epi8(brb, gxb);
+         __m128i o0 = _mm_unpacklo_epi16(t0, t1);
+         __m128i o1 = _mm_unpackhi_epi16(t0, t1);
+
+         // store
+         _mm_storeu_si128((__m128i *) (out + 0), o0);
+         _mm_storeu_si128((__m128i *) (out + 16), o1);
+         out += 32;
+      }
+   }
+#endif
+
+#ifdef STBI_NEON
+   // in this version, step=3 support would be easy to add. but is there demand?
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      uint8x8_t signflip = vdup_n_u8(0x80);
+      int16x8_t cr_const0 = vdupq_n_s16(   (short) ( 1.40200f*4096.0f+0.5f));
+      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+      int16x8_t cb_const1 = vdupq_n_s16(   (short) ( 1.77200f*4096.0f+0.5f));
+
+      for (; i+7 < count; i += 8) { // 8 pixels in, 32 output bytes out per iteration
+         // load
+         uint8x8_t y_bytes  = vld1_u8(y + i);
+         uint8x8_t cr_bytes = vld1_u8(pcr + i);
+         uint8x8_t cb_bytes = vld1_u8(pcb + i);
+         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+         // expand to s16
+         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+         int16x8_t crw = vshll_n_s8(cr_biased, 7);
+         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+         // color transform
+         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+         int16x8_t rws = vaddq_s16(yws, cr0);
+         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+         int16x8_t bws = vaddq_s16(yws, cb1);
+
+         // undo scaling, round, convert to byte
+         uint8x8x4_t o;
+         o.val[0] = vqrshrun_n_s16(rws, 4);
+         o.val[1] = vqrshrun_n_s16(gws, 4);
+         o.val[2] = vqrshrun_n_s16(bws, 4);
+         o.val[3] = vdup_n_u8(255);
+
+         // store, interleaving r/g/b/a
+         vst4_u8(out, o);
+         out += 8*4;
+      }
+   }
+#endif
+
+   for (; i < count; ++i) { // scalar tail: leftover pixels after the SIMD loop, or every pixel when step != 4
+      int y_fixed = (y[i] << 20) + (1<<19); // rounding
+      int r,g,b;
+      int cr = pcr[i] - 128;
+      int cb = pcb[i] - 128;
+      r = y_fixed + cr* stbi__float2fixed(1.40200f);
+      g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000);
+      b = y_fixed                                   +   cb* stbi__float2fixed(1.77200f);
+      r >>= 20;
+      g >>= 20;
+      b >>= 20;
+      if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; }
+      if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; }
+      if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; }
+      out[0] = (stbi_uc)r;
+      out[1] = (stbi_uc)g;
+      out[2] = (stbi_uc)b;
+      out[3] = 255; // written even when step is 3; the next pixel's red byte then overwrites it
+      out += step;
+   }
+}
+#endif
+
+// set up the kernels
+static void stbi__setup_jpeg(stbi__jpeg *j)
+{
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; // portable C kernels are the baseline
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row;
+   j->idct_block_kernel = stbi__idct_block;
+#if defined(STBI_SSE2)
+   // SSE2 build: switch to the SIMD kernels only when stbi__sse2_available() reports true
+   if (stbi__sse2_available()) {
+      j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+      j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+      j->idct_block_kernel = stbi__idct_simd;
+   }
+#endif
+#if defined(STBI_NEON)
+   // NEON build: the SIMD kernels are installed unconditionally
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+   j->idct_block_kernel = stbi__idct_simd;
+#endif
+}
+
+// clean up the temporary component buffers
+static void stbi__cleanup_jpeg(stbi__jpeg *j)
+{
+   stbi__free_jpeg_components(j, j->s->img_n, 0); // covers img_n components, so callers zero img_n first when nothing has been decoded yet
+}
+
+typedef struct
+{
+   resample_row_func resample; // row upsampling routine chosen for this component from its hs/vs factors
+   stbi_uc *line0,*line1;      // the two source rows handed to resample as near/far; both start at the component's first row
+   int hs,vs;   // expansion factor in each axis
+   int w_lores; // horizontal pixels pre-expansion
+   int ystep;   // how far through vertical expansion we are
+   int ypos;    // which pre-expansion row we're on
+} stbi__resample;
+
+// fast 0..255 * 0..255 => 0..255 rounded multiplication
+static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y)
+{
+   unsigned int prod = x*y + 128;                 // +128 so the shifts below round to nearest
+   return (stbi_uc) ((prod + (prod >> 8)) >> 8);  // divide by 255 using shifts only
+}
+
+static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp)
+{
+   int n, decode_n, is_rgb; // n: channels per output pixel; decode_n: component planes actually resampled; is_rgb: planes are copied as R,G,B without conversion
+   z->s->img_n = 0; // make stbi__cleanup_jpeg safe
+   // overall flow: decode into per-component planes, then for each output row upsample every needed plane and convert/pack it into the result buffer
+   // validate req_comp
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+
+   // load a jpeg image from whichever source, but leave in YCbCr format
+   if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
+
+   // determine actual number of components to generate
+   n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+   is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
+
+   if (z->s->img_n == 3 && n < 3 && !is_rgb)
+      decode_n = 1; // grey output from YCbCr needs only the first (Y) plane
+   else
+      decode_n = z->s->img_n;
+
+   // resample and color-convert
+   {
+      int k;
+      unsigned int i,j;
+      stbi_uc *output;
+      stbi_uc *coutput[4]; // upsampled row of each decoded component for the current output row
+
+      stbi__resample res_comp[4];
+
+      for (k=0; k < decode_n; ++k) {
+         stbi__resample *r = &res_comp[k];
+
+         // allocate line buffer big enough for upsampling off the edges
+         // with upsample factor of 4
+         z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
+         if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+         r->hs      = z->img_h_max / z->img_comp[k].h;
+         r->vs      = z->img_v_max / z->img_comp[k].v;
+         r->ystep   = r->vs >> 1;
+         r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
+         r->ypos    = 0;
+         r->line0   = r->line1 = z->img_comp[k].data;
+
+         if      (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+         else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
+         else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
+         else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+         else                               r->resample = stbi__resample_row_generic;
+      }
+
+      // can't error after this so, this is safe
+      output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); // NOTE(review): the trailing 1 looks like a pad byte; loops below store out[3] (n==3) or out[1] (n==1) one past the last pixel - confirm against stbi__malloc_mad3
+      if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+      // now go ahead and resample
+      for (j=0; j < z->s->img_y; ++j) {
+         stbi_uc *out = output + n * z->s->img_x * j;
+         for (k=0; k < decode_n; ++k) {
+            stbi__resample *r = &res_comp[k];
+            int y_bot = r->ystep >= (r->vs >> 1); // second half of the vertical expansion: line1 becomes the "near" row
+            coutput[k] = r->resample(z->img_comp[k].linebuf,
+                                     y_bot ? r->line1 : r->line0,
+                                     y_bot ? r->line0 : r->line1,
+                                     r->w_lores, r->hs);
+            if (++r->ystep >= r->vs) {
+               r->ystep = 0;
+               r->line0 = r->line1;
+               if (++r->ypos < z->img_comp[k].y)
+                  r->line1 += z->img_comp[k].w2; // step to the next source row; stays put on the last one
+            }
+         }
+         if (n >= 3) {
+            stbi_uc *y = coutput[0];
+            if (z->s->img_n == 3) {
+               if (is_rgb) {
+                  for (i=0; i < z->s->img_x; ++i) {
+                     out[0] = y[i];
+                     out[1] = coutput[1][i];
+                     out[2] = coutput[2][i];
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else {
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else if (z->s->img_n == 4) {
+               if (z->app14_color_transform == 0) { // CMYK
+                  for (i=0; i < z->s->img_x; ++i) {
+                     stbi_uc k = coutput[3][i]; // fourth channel; note this shadows the outer int k
+                     out[0] = stbi__blinn_8x8(coutput[0][i], k);
+                     out[1] = stbi__blinn_8x8(coutput[1][i], k);
+                     out[2] = stbi__blinn_8x8(coutput[2][i], k);
+                     out[3] = 255;
+                     out += n;
+                  }
+               } else if (z->app14_color_transform == 2) { // YCCK
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+                  for (i=0; i < z->s->img_x; ++i) { // second pass over the row just converted: invert each channel and scale by the fourth
+                     stbi_uc k = coutput[3][i];
+                     out[0] = stbi__blinn_8x8(255 - out[0], k);
+                     out[1] = stbi__blinn_8x8(255 - out[1], k);
+                     out[2] = stbi__blinn_8x8(255 - out[2], k);
+                     out += n;
+                  }
+               } else { // YCbCr + alpha?  Ignore the fourth channel for now
+                  z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n);
+               }
+            } else
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = out[1] = out[2] = y[i];
+                  out[3] = 255; // not used if n==3
+                  out += n;
+               }
+         } else { // n is 1 or 2: grey, optionally followed by an opaque alpha byte
+            if (is_rgb) {
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i)
+                     *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+               else {
+                  for (i=0; i < z->s->img_x; ++i, out += 2) {
+                     out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]);
+                     out[1] = 255;
+                  }
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 0) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  stbi_uc k = coutput[3][i];
+                  stbi_uc r = stbi__blinn_8x8(coutput[0][i], k);
+                  stbi_uc g = stbi__blinn_8x8(coutput[1][i], k);
+                  stbi_uc b = stbi__blinn_8x8(coutput[2][i], k);
+                  out[0] = stbi__compute_y(r, g, b);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else if (z->s->img_n == 4 && z->app14_color_transform == 2) {
+               for (i=0; i < z->s->img_x; ++i) {
+                  out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]);
+                  out[1] = 255;
+                  out += n;
+               }
+            } else {
+               stbi_uc *y = coutput[0];
+               if (n == 1)
+                  for (i=0; i < z->s->img_x; ++i) out[i] = y[i];
+               else
+                  for (i=0; i < z->s->img_x; ++i) *out++ = y[i], *out++ = 255;
+            }
+         }
+      }
+      stbi__cleanup_jpeg(z);
+      *out_x = z->s->img_x;
+      *out_y = z->s->img_y;
+      if (comp) *comp = z->s->img_n >= 3 ? 3 : 1; // report original components, not output
+      return output;
+   }
+}
+
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   unsigned char* result;   // pixel buffer from load_jpeg_image, or NULL on failure
+   stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); // decoder state lives on the heap for the duration of this call
+   STBI_NOTUSED(ri);
+   if (!j) return stbi__errpuc("outofmem", "Out of memory"); // allocation can fail: bail out before j is dereferenced
+   j->s = s;
+   stbi__setup_jpeg(j);     // choose the scalar or SIMD kernels
+   result = load_jpeg_image(j, x,y,comp,req_comp);
+   STBI_FREE(j);            // only the decoder state is released; result is handed to the caller
+   return result;
+}
+static int stbi__jpeg_test(stbi__context *s)
+{
+   int r;
+   stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+   if (!j) return stbi__err("outofmem", "Out of memory"); // allocation can fail: bail out before j is dereferenced
+   j->s = s;
+   stbi__setup_jpeg(j);
+   r = stbi__decode_jpeg_header(j, STBI__SCAN_type); // type probe only: result says whether the header was accepted
+   stbi__rewind(s);         // always rewound, so the caller can try this or another decoder from the start
+   STBI_FREE(j);
+   return r;
+}
+static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
+{
+   if (stbi__decode_jpeg_header(j, STBI__SCAN_header)) {  // header parsed: fill whichever outputs were requested
+      if (x    != NULL) *x    = j->s->img_x;
+      if (y    != NULL) *y    = j->s->img_y;
+      if (comp != NULL) *comp = (j->s->img_n < 3) ? 1 : 3;
+      return 1;
+   }
+   stbi__rewind( j->s );   // header rejected: rewind the stream and report failure
+   return 0;
+}
+
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int result;
+   stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); // temporary decoder state, freed before returning
+   if (!j) return stbi__err("outofmem", "Out of memory"); // allocation can fail: bail out before j is dereferenced
+   j->s = s;
+   result = stbi__jpeg_info_raw(j, x, y, comp); // 1 and outputs filled on success, 0 on failure
+   STBI_FREE(j);
+   return result;
+}
+#endif
+// public domain zlib decode    v0.2  Sean Barrett 2006-11-18
+//    simple implementation
+//      - all input must be provided in an upfront buffer
+//      - all output is written to a single output buffer (can malloc/realloc)
+//    performance
+//      - fast huffman
+
+#ifndef STBI_NO_ZLIB
+
+// fast-way is faster to check than jpeg huffman, but slow way is slower
+#define STBI__ZFAST_BITS  9 // accelerate all cases in default tables
+#define STBI__ZFAST_MASK  ((1 << STBI__ZFAST_BITS) - 1)
+
+// zlib-style huffman encoding
+// (jpegs packs from left, zlib from right, so can't share code)
+typedef struct
+{
+   stbi__uint16 fast[1 << STBI__ZFAST_BITS]; // indexed by the next ZFAST_BITS input bits: (code length << 9) | symbol, or 0 if the code is longer
+   stbi__uint16 firstcode[16];   // first canonical code of each length
+   int maxcode[17];              // per length: one past the last code, preshifted to 16 bits; [16] is a sentinel
+   stbi__uint16 firstsymbol[16]; // index into size[]/value[] of the first symbol of each length
+   stbi_uc  size[288];           // code length of each entry, in canonical-code order
+   stbi__uint16 value[288];      // symbol of each entry, in canonical-code order
+} stbi__zhuffman;
+
+stbi_inline static int stbi__bitreverse16(int n)
+{
+  int v = n;                                          // only the low 16 bits take part
+  v = ((v & 0x5555) << 1) | ((v & 0xAAAA) >>  1);     // swap neighbouring bits
+  v = ((v & 0x3333) << 2) | ((v & 0xCCCC) >>  2);     // swap bit pairs
+  v = ((v & 0x0F0F) << 4) | ((v & 0xF0F0) >>  4);     // swap nibbles
+  return ((v & 0x00FF) << 8) | ((v & 0xFF00) >>  8);  // swap the two bytes
+}
+
+stbi_inline static int stbi__bit_reverse(int v, int bits)
+{
+   int drop = 16 - bits;  // padding bits that land at the bottom after a full 16-bit reverse
+   STBI_ASSERT(bits <= 16);
+   // e.g. to reverse 11 bits: reverse all 16, then shift the 5 padding bits away
+   return stbi__bitreverse16(v) >> drop;
+}
+
+static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num)
+{
+   int i,k=0;
+   int code, next_code[16], sizes[17];
+   // builds the decode tables in z from num code lengths in sizelist (0 = symbol unused); returns 1, or the stbi__err result for an impossible length set
+   // DEFLATE spec for generating codes
+   memset(sizes, 0, sizeof(sizes));
+   memset(z->fast, 0, sizeof(z->fast));
+   for (i=0; i < num; ++i)
+      ++sizes[sizelist[i]]; // histogram: how many symbols use each code length
+   sizes[0] = 0;           // length 0 means "no code", so it must not count
+   for (i=1; i < 16; ++i)
+      if (sizes[i] > (1 << i))
+         return stbi__err("bad sizes", "Corrupt PNG");
+   code = 0;
+   for (i=1; i < 16; ++i) { // assign each length its first canonical code and first symbol slot
+      next_code[i] = code;
+      z->firstcode[i] = (stbi__uint16) code;
+      z->firstsymbol[i] = (stbi__uint16) k;
+      code = (code + sizes[i]);
+      if (sizes[i])
+         if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG");
+      z->maxcode[i] = code << (16-i); // preshift for inner loop
+      code <<= 1;
+      k += sizes[i];
+   }
+   z->maxcode[16] = 0x10000; // sentinel
+   for (i=0; i < num; ++i) { // second pass: place every used symbol into size[]/value[] and, if short enough, the fast table
+      int s = sizelist[i];
+      if (s) {
+         int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; // slot of this symbol in canonical-code order
+         stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i);
+         z->size [c] = (stbi_uc     ) s;
+         z->value[c] = (stbi__uint16) i;
+         if (s <= STBI__ZFAST_BITS) {
+            int j = stbi__bit_reverse(next_code[s],s); // reversed, because the stream delivers the code's bits lowest-first
+            while (j < (1 << STBI__ZFAST_BITS)) { // fill every fast entry whose low s bits equal the reversed code
+               z->fast[j] = fastv;
+               j += (1 << s);
+            }
+         }
+         ++next_code[s];
+      }
+   }
+   return 1;
+}
+
+// zlib-from-memory implementation for PNG reading
+//    because PNG allows splitting the zlib stream arbitrarily,
+//    and it's annoying structurally to have PNG call ZLIB call PNG,
+//    we require PNG read all the IDATs and combine them into a single
+//    memory buffer
+
+typedef struct
+{
+   stbi_uc *zbuffer, *zbuffer_end; // next unread compressed byte, and one past the last one
+   int num_bits;                   // number of valid bits currently held in code_buffer
+   stbi__uint32 code_buffer;       // bit reservoir; the next bit to consume is the lowest one
+
+   char *zout;        // next output position
+   char *zout_start;  // start of the output buffer
+   char *zout_end;    // one past the end of the output buffer
+   int   z_expandable; // nonzero if stbi__zexpand may realloc the output buffer
+
+   stbi__zhuffman z_length, z_distance; // tables for the literal/length and the distance alphabets of the current block
+} stbi__zbuf;
+
+stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z)
+{
+   // next input byte, or 0 once the input is exhausted (the read pointer then stays where it is)
+   return (z->zbuffer < z->zbuffer_end) ? *z->zbuffer++ : 0;
+}
+
+static void stbi__fill_bits(stbi__zbuf *z)
+{
+   for (;;) {  // append one input byte above the bits already buffered, until more than 24 bits are held
+      STBI_ASSERT(z->code_buffer < (1U << z->num_bits));
+      z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits;
+      if ((z->num_bits += 8) > 24) break;
+   }
+}
+
+stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n)
+{
+   unsigned int bits;
+   if (n > z->num_bits) stbi__fill_bits(z);   // refill before consuming
+   bits = z->code_buffer & ((1 << n) - 1);    // the low n bits are the next n bits of the stream
+   z->num_bits    -= n;
+   z->code_buffer >>= n;
+   return bits;
+}
+
+static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int b,s,k;
+   // not resolved by fast table, so compute it the slow way
+   // use jpeg approach, which requires MSbits at top
+   k = stbi__bit_reverse(a->code_buffer, 16);
+   for (s=STBI__ZFAST_BITS+1; ; ++s) // terminates at s == 16 at the latest thanks to the maxcode[16] sentinel
+      if (k < z->maxcode[s])
+         break;
+   if (s == 16) return -1; // invalid code!
+   // code size is s, so:
+   b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s];
+   if (b < 0 || b >= (int) sizeof(z->size) || z->size[b] != s) return -1; // derived from stream data: fail instead of asserting or indexing out of range
+   a->code_buffer >>= s; // consume the code
+   a->num_bits -= s;
+   return z->value[b];   // decoded symbol; callers treat a negative return as a bad code
+}
+
+stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z)
+{
+   int entry, nbits;
+   if (a->num_bits < 16) stbi__fill_bits(a);   // top up the bit buffer before looking at it
+   entry = z->fast[a->code_buffer & STBI__ZFAST_MASK];
+   // a zero entry means the code is too long for the fast table
+   if (entry == 0)
+      return stbi__zhuffman_decode_slowpath(a, z);
+   nbits = entry >> 9;                         // fast entry = (code length << 9) | symbol
+   a->code_buffer >>= nbits;
+   a->num_bits -= nbits;
+   return entry & 511;
+}
+
+static int stbi__zexpand(stbi__zbuf *z, char *zout, int n)  // need to make room for n bytes at zout; returns 1, or 0 (reason set via stbi__err) on failure
+{
+   char *q;
+   int cur, limit, old_limit;
+   z->zout = zout; // sync the caller's local write cursor before anything can fail
+   if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG");
+   cur   = (int) (z->zout     - z->zout_start);
+   limit = old_limit = (int) (z->zout_end - z->zout_start);
+   for (; n > limit - cur; limit *= 2)  // same test as cur + n > limit, written so the sum cannot overflow
+      if (limit <= 0 || limit > 0x3fffffff) return stbi__err("outofmem", "Out of memory"); // doubling 0 never grows; doubling past this overflows a 32-bit int
+   q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit);
+   STBI_NOTUSED(old_limit);
+   if (q == NULL) return stbi__err("outofmem", "Out of memory"); // zout_start is left untouched, so the caller can still free it
+   z->zout_start = q;
+   z->zout       = q + cur;   // the buffer may have moved: rebase the cursor and the end pointer
+   z->zout_end   = q + limit;
+   return 1;
+}
+
+static int stbi__zlength_base[31] = { // base match length, indexed by (length symbol - 257); trailing zeros pad the table
+   3,4,5,6,7,8,9,10,11,13,
+   15,17,19,23,27,31,35,43,51,59,
+   67,83,99,115,131,163,195,227,258,0,0 };
+
+static int stbi__zlength_extra[31]= // number of extra bits read and added to the base length
+{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 };
+
+static int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, // base match distance, indexed by distance symbol
+257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0};
+
+static int stbi__zdist_extra[32] = // number of extra bits read and added to the base distance; the last two entries are implicitly 0
+{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13};
+
+static int stbi__parse_huffman_block(stbi__zbuf *a)
+{
+   char *zout = a->zout; // local write cursor; stored back on end-of-block and whenever stbi__zexpand is called
+   for(;;) {
+      int z = stbi__zhuffman_decode(a, &a->z_length);
+      if (z < 256) { // literal byte (or a decode error)
+         if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes
+         if (zout >= a->zout_end) {
+            if (!stbi__zexpand(a, zout, 1)) return 0;
+            zout = a->zout; // the buffer may have moved
+         }
+         *zout++ = (char) z;
+      } else {
+         stbi_uc *p;
+         int len,dist;
+         if (z == 256) { // end-of-block symbol
+            a->zout = zout;
+            return 1;
+         }
+         z -= 257; // index into the length tables; unused symbols 286/287 land on the zero padding and give len == 0
+         len = stbi__zlength_base[z];
+         if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]);
+         z = stbi__zhuffman_decode(a, &a->z_distance);
+         if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // symbols 30/31 have base 0: dist 0 would copy unwritten output onto itself
+         dist = stbi__zdist_base[z];
+         if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]);
+         if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); // would reach back before the start of the output
+         if (zout + len > a->zout_end) {
+            if (!stbi__zexpand(a, zout, len)) return 0;
+            zout = a->zout; // the buffer may have moved
+         }
+         p = (stbi_uc *) (zout - dist);
+         if (dist == 1) { // run of one byte; common in images.
+            stbi_uc v = *p;
+            if (len) { do *zout++ = v; while (--len); }
+         } else {
+            if (len) { do *zout++ = *p++; while (--len); } // byte by byte on purpose: source and destination may overlap
+         }
+      }
+   }
+}
+
+static int stbi__compute_huffman_codes(stbi__zbuf *a)
+{
+   static stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; // order in which the code-length code lengths are transmitted
+   stbi__zhuffman z_codelength;
+   stbi_uc lencodes[286+32+137];//padding for maximum single op
+   stbi_uc codelength_sizes[19];
+   int i,n;
+   // reads a dynamic block's table description and builds a->z_length and a->z_distance; returns 1 on success, 0 on a malformed description
+   int hlit  = stbi__zreceive(a,5) + 257; // number of literal/length code lengths that follow
+   int hdist = stbi__zreceive(a,5) + 1;   // number of distance code lengths that follow
+   int hclen = stbi__zreceive(a,4) + 4;   // number of code-length code lengths that follow
+   int ntot  = hlit + hdist;
+
+   memset(codelength_sizes, 0, sizeof(codelength_sizes));
+   for (i=0; i < hclen; ++i) {
+      int s = stbi__zreceive(a,3);
+      codelength_sizes[length_dezigzag[i]] = (stbi_uc) s;
+   }
+   if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0;
+
+   n = 0;
+   while (n < ntot) { // decode ntot code lengths, literal/length ones first, then distance ones
+      int c = stbi__zhuffman_decode(a, &z_codelength);
+      if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG");
+      if (c < 16)
+         lencodes[n++] = (stbi_uc) c; // 0..15: a code length given directly
+      else {
+         stbi_uc fill = 0;
+         if (c == 16) { // repeat the previous length 3..6 times
+            c = stbi__zreceive(a,2)+3;
+            if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG");
+            fill = lencodes[n-1];
+         } else if (c == 17)
+            c = stbi__zreceive(a,3)+3; // run of 3..10 zero lengths
+         else {
+            STBI_ASSERT(c == 18);
+            c = stbi__zreceive(a,7)+11; // run of 11..138 zero lengths
+         }
+         if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); // a run may not extend past the announced total
+         memset(lencodes+n, fill, c);
+         n += c;
+      }
+   }
+   if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG");
+   if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0;
+   if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0;
+   return 1;
+}
+
+static int stbi__parse_uncompressed_block(stbi__zbuf *a)
+{
+   stbi_uc hdr[4];
+   int filled = 0, len, nlen;
+   int partial = a->num_bits & 7;
+   if (partial) stbi__zreceive(a, partial); // discard
+   // whole bytes still sitting in the bit buffer are the first header bytes
+   for (; a->num_bits > 0; a->num_bits -= 8) {
+      hdr[filled++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check
+      a->code_buffer >>= 8;
+   }
+   STBI_ASSERT(a->num_bits == 0);
+   // the remaining header bytes come straight from the input
+   for (; filled < 4; ++filled)
+      hdr[filled] = stbi__zget8(a);
+   // header layout: 16-bit length, then its one's complement, both low byte first
+   len  = hdr[0] + hdr[1] * 256;
+   nlen = hdr[2] + hdr[3] * 256;
+   if ((len ^ 0xffff) != nlen) return stbi__err("zlib corrupt","Corrupt PNG");
+   if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG");
+   if (a->zout + len > a->zout_end)
+      if (!stbi__zexpand(a, a->zout, len)) return 0;
+   // payload is copied verbatim from input to output
+   memcpy(a->zout, a->zbuffer, len);
+   a->zout    += len;
+   a->zbuffer += len;
+   return 1;
+}
+
+static int stbi__parse_zlib_header(stbi__zbuf *a)
+{
+   // two header bytes: CMF (method in the low nibble, window info in the high nibble), then FLG
+   int cmf, flg, header16;
+   cmf = stbi__zget8(a);
+   flg = stbi__zget8(a);
+   header16 = (cmf << 8) | flg;
+   if (header16 % 31) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec
+   if ((flg & 0x20) != 0) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png
+   if ((cmf & 0x0f) != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png
+   return 1; // the window size in CMF is ignored, since output is fully buffered
+}
+
+static const stbi_uc stbi__zdefault_length[288] = // literal/length code lengths used for block type 1 (fixed codes)
+{
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+   8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
+   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8
+};
+static const stbi_uc stbi__zdefault_distance[32] = // distance code lengths used for block type 1: every symbol is 5 bits
+{
+   5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+};
+/*
+Init algorithm:
+{
+   int i;   // use <= to match clearly with spec
+   for (i=0; i <= 143; ++i)     stbi__zdefault_length[i]   = 8;
+   for (   ; i <= 255; ++i)     stbi__zdefault_length[i]   = 9;
+   for (   ; i <= 279; ++i)     stbi__zdefault_length[i]   = 7;
+   for (   ; i <= 287; ++i)     stbi__zdefault_length[i]   = 8;
+
+   for (i=0; i <=  31; ++i)     stbi__zdefault_distance[i] = 5;
+}
+*/
+
+static int stbi__parse_zlib(stbi__zbuf *a, int parse_header)
+{
+   int last_block, btype;
+   if (parse_header && !stbi__parse_zlib_header(a)) return 0;
+   a->num_bits = 0;
+   a->code_buffer = 0;
+   do {  // one iteration per block: 1-bit "final" flag, 2-bit block type, then the block body
+      last_block = stbi__zreceive(a,1);
+      btype      = stbi__zreceive(a,2);
+      switch (btype) {
+         case 0: // stored (uncompressed) block
+            if (!stbi__parse_uncompressed_block(a)) return 0;
+            break;
+         case 3: // not a valid block type
+            return 0;
+         case 1: // use fixed code lengths
+            if (!stbi__zbuild_huffman(&a->z_length  , stbi__zdefault_length  , 288)) return 0;
+            if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance,  32)) return 0;
+            if (!stbi__parse_huffman_block(a)) return 0;
+            break;
+         default: // 2: code lengths are transmitted inside the block
+            if (!stbi__compute_huffman_codes(a)) return 0;
+            if (!stbi__parse_huffman_block(a)) return 0;
+      }
+   } while (!last_block);
+   return 1;
+}
+
+static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header)
+{
+   // aim the output cursor at the caller-provided buffer, then run the decoder
+   a->z_expandable = exp;
+   a->zout_start   = obuf;
+   a->zout_end     = obuf + olen;
+   a->zout         = a->zout_start;
+   return stbi__parse_zlib(a, parse_header);
+}
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen)
+{
+   // decode (zlib header parsed) into a heap buffer of initial_size bytes marked expandable; NULL on failure
+   stbi__zbuf z;
+   char *out = (char *) stbi__malloc(initial_size);
+   if (out == NULL) return NULL;
+   z.zbuffer     = (stbi_uc *) buffer;
+   z.zbuffer_end = (stbi_uc *) buffer + len;
+   if (!stbi__do_zlib(&z, out, initial_size, 1, 1)) {
+      STBI_FREE(z.zout_start); // free through zout_start, the same pointer the success path returns
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (z.zout - z.zout_start);
+   return z.zout_start;
+}
+
+STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen)
+{ // thin wrapper: same as stbi_zlib_decode_malloc_guesssize with an initial output size of 16384 bytes
+   return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen);
+}
+
+STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header)
+{
+   // heap-output decode where the caller chooses whether the zlib header is parsed; NULL on failure
+   stbi__zbuf z;
+   char *out = (char *) stbi__malloc(initial_size);
+   if (out == NULL) return NULL;
+   z.zbuffer     = (stbi_uc *) buffer;
+   z.zbuffer_end = (stbi_uc *) buffer + len;
+   if (!stbi__do_zlib(&z, out, initial_size, 1, parse_header)) {
+      STBI_FREE(z.zout_start); // free through zout_start, the same pointer the success path returns
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (z.zout - z.zout_start);
+   return z.zout_start;
+}
+
+STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen)
+{
+   // caller-owned, non-expandable output; zlib header parsed; returns bytes written or -1
+   stbi__zbuf z;
+   z.zbuffer     = (stbi_uc *) ibuffer;
+   z.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   if (!stbi__do_zlib(&z, obuffer, olen, 0, 1))
+      return -1;
+   return (int) (z.zout - z.zout_start);
+}
+
+STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen)
+{
+   // heap-output decode that skips zlib header parsing; starts from a 16384-byte buffer; NULL on failure
+   stbi__zbuf z;
+   char *out = (char *) stbi__malloc(16384);
+   if (out == NULL) return NULL;
+   z.zbuffer     = (stbi_uc *) buffer;
+   z.zbuffer_end = (stbi_uc *) buffer+len;
+   if (!stbi__do_zlib(&z, out, 16384, 1, 0)) {
+      STBI_FREE(z.zout_start); // free through zout_start, the same pointer the success path returns
+      return NULL;
+   }
+   if (outlen) *outlen = (int) (z.zout - z.zout_start);
+   return z.zout_start;
+}
+
+STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen)
+{
+   // caller-owned, non-expandable output; no zlib header parsing; returns bytes written or -1
+   stbi__zbuf z;
+   z.zbuffer     = (stbi_uc *) ibuffer;
+   z.zbuffer_end = (stbi_uc *) ibuffer + ilen;
+   if (!stbi__do_zlib(&z, obuffer, olen, 0, 0))
+      return -1;
+   return (int) (z.zout - z.zout_start);
+}
+#endif
+
+// public domain "baseline" PNG decoder   v0.10  Sean Barrett 2006-11-18
+//    simple implementation
+//      - only 8-bit samples
+//      - no CRC checking
+//      - allocates lots of intermediate memory
+//        - avoids problem of streaming data between subsystems
+//        - avoids explicit window management
+//    performance
+//      - uses stb_zlib, a PD zlib implementation with fast huffman decoding
+
+#ifndef STBI_NO_PNG
+typedef struct
+{
+   stbi__uint32 length; // first 32-bit big-endian word of a chunk header
+   stbi__uint32 type;   // second word; matched against STBI__PNG_TYPE(...) values by the parser
+} stbi__pngchunk;
+
+static stbi__pngchunk stbi__get_chunk_header(stbi__context *s)
+{
+   stbi__pngchunk hdr; // filled by two big-endian 32-bit reads, in stream order
+   hdr.length = stbi__get32be(s);
+   hdr.type   = stbi__get32be(s);
+   return hdr;
+}
+
+static int stbi__check_png_header(stbi__context *s)
+{
+   static stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 };
+   int n = 0;
+   while (n < 8) // compare the next 8 stream bytes with the signature, stopping at the first mismatch
+      if (stbi__get8(s) != png_sig[n++]) return stbi__err("bad png sig","Not a PNG");
+   return 1;
+}
+
+typedef struct
+{
+   stbi__context *s; // input stream; also carries img_x/img_y/img_n/img_out_n
+   stbi_uc *idata, *expanded, *out; // concatenated IDAT bytes, zlib-decoded bytes, decoded pixels
+   int depth; // bit-depth byte from IHDR (validated to 1, 2, 4, 8 or 16)
+} stbi__png;
+
+
+enum {
+   STBI__F_none=0, // 0..4 are the values accepted for the filter byte that starts each raw scanline
+   STBI__F_sub=1,
+   STBI__F_up=2,
+   STBI__F_avg=3,
+   STBI__F_paeth=4,
+   // synthetic filters used for first scanline to avoid needing a dummy row of 0s
+   STBI__F_avg_first, // 5: never read from the stream, only produced through first_row_filter
+   STBI__F_paeth_first // 6: likewise
+};
+
+static stbi_uc first_row_filter[5] = // indexed by the stream's filter byte (0..4); gives the filter actually applied to row 0
+{
+   STBI__F_none,
+   STBI__F_sub,
+   STBI__F_none, // "up" on the first row becomes "none"
+   STBI__F_avg_first,
+   STBI__F_paeth_first
+};
+
+static int stbi__paeth(int a, int b, int c)
+{
+   // return whichever of a, b, c is nearest to a+b-c; ties go to a first, then b
+   int est = a + b - c;
+   int da = abs(est-a);
+   int db = abs(est-b);
+   int dc = abs(est-c);
+   if (da <= db && da <= dc) return a;
+   return (db <= dc) ? b : c;
+}
+
+static stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; // indexed by bit depth (1,2,4,8); multiplier that stretches a depth-bit value to the 0..255 range
+
+// create the png data from post-deflated data
+static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color)
+{ // unfilters 'raw' (x by y pixels) into a freshly allocated a->out with out_n components; returns 1 on success, otherwise the result of stbi__err. a->out stays allocated on failure.
+   int bytes = (depth == 16? 2 : 1); // bytes per sample
+   stbi__context *s = a->s;
+   stbi__uint32 i,j,stride = x*out_n*bytes; // stride = bytes per output row
+   stbi__uint32 img_len, img_width_bytes;
+   int k;
+   int img_n = s->img_n; // copy it into a local for later
+
+   int output_bytes = out_n*bytes; // bytes per output pixel
+   int filter_bytes = img_n*bytes; // bytes per input pixel; also the distance back to the "left" byte when filtering
+   int width = x;
+
+   STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1);
+   a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into
+   if (!a->out) return stbi__err("outofmem", "Out of memory");
+
+   img_width_bytes = (((img_n * x * depth) + 7) >> 3); // packed sample bytes per input row, rounded up
+   img_len = (img_width_bytes + 1) * y; // +1 per row for the leading filter byte
+   if (s->img_x == x && s->img_y == y) {
+      if (raw_len != img_len) return stbi__err("not enough pixels","Corrupt PNG");
+   } else { // interlaced:
+      if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG");
+   }
+
+   for (j=0; j < y; ++j) {
+      stbi_uc *cur = a->out + stride*j;
+      stbi_uc *prior;
+      int filter = *raw++; // every raw row starts with its filter byte
+
+      if (filter > 4)
+         return stbi__err("invalid filter","Corrupt PNG");
+
+      if (depth < 8) {
+         STBI_ASSERT(img_width_bytes <= x);
+         cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place
+         filter_bytes = 1;
+         width = img_width_bytes;
+      }
+      prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above
+
+      // if first row, use special filter that doesn't sample previous row
+      if (j == 0) filter = first_row_filter[filter];
+
+      // handle first byte explicitly
+      for (k=0; k < filter_bytes; ++k) {
+         switch (filter) {
+            case STBI__F_none       : cur[k] = raw[k]; break;
+            case STBI__F_sub        : cur[k] = raw[k]; break;
+            case STBI__F_up         : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break;
+            case STBI__F_avg        : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break;
+            case STBI__F_paeth      : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break;
+            case STBI__F_avg_first  : cur[k] = raw[k]; break;
+            case STBI__F_paeth_first: cur[k] = raw[k]; break;
+         }
+      }
+
+      if (depth == 8) {
+         if (img_n != out_n)
+            cur[img_n] = 255; // first pixel
+         raw += img_n;
+         cur += out_n;
+         prior += out_n;
+      } else if (depth == 16) {
+         if (img_n != out_n) {
+            cur[filter_bytes]   = 255; // first pixel top byte
+            cur[filter_bytes+1] = 255; // first pixel bottom byte
+         }
+         raw += filter_bytes;
+         cur += output_bytes;
+         prior += output_bytes;
+      } else {
+         raw += 1;
+         cur += 1;
+         prior += 1;
+      }
+
+      // this is a little gross, so that we don't switch per-pixel or per-component
+      if (depth < 8 || img_n == out_n) {
+         int nk = (width - 1)*filter_bytes;
+         #define STBI__CASE(f) \
+             case f:     \
+                for (k=0; k < nk; ++k)
+         switch (filter) {
+            // "none" filter turns into a memcpy here; make that explicit.
+            case STBI__F_none:         memcpy(cur, raw, nk); break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
+         }
+         #undef STBI__CASE
+         raw += nk;
+      } else {
+         STBI_ASSERT(img_n+1 == out_n);
+         #define STBI__CASE(f) \
+             case f:     \
+                for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                   for (k=0; k < filter_bytes; ++k)
+         switch (filter) {
+            STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
+            STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
+            STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+            STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
+            STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
+            STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
+            STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
+         }
+         #undef STBI__CASE
+
+         // the loop above sets the high byte of the pixels' alpha, but for
+         // 16 bit png files we also need the low byte set. we'll do that here.
+         if (depth == 16) {
+            cur = a->out + stride*j; // start at the beginning of the row again
+            for (i=0; i < x; ++i,cur+=output_bytes) {
+               cur[filter_bytes+1] = 255;
+            }
+         }
+      }
+   }
+
+   // we make a separate pass to expand bits to pixels; for performance,
+   // this could run two scanlines behind the above code, so it won't
+   // interfere with filtering but will still be in the cache.
+   if (depth < 8) {
+      for (j=0; j < y; ++j) {
+         stbi_uc *cur = a->out + stride*j;
+         stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
+         // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+         // png guarantees byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+         stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+         // note that the final byte might overshoot and write more data than desired.
+         // we can allocate enough data that this never writes out of memory, but it
+         // could also overwrite the next scanline. can it overwrite non-empty data
+         // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+         // so we need to explicitly clamp the final ones
+
+         if (depth == 4) {
+            for (k=x*img_n; k >= 2; k-=2, ++in) {
+               *cur++ = scale * ((*in >> 4)       );
+               *cur++ = scale * ((*in     ) & 0x0f);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 4)       );
+         } else if (depth == 2) {
+            for (k=x*img_n; k >= 4; k-=4, ++in) {
+               *cur++ = scale * ((*in >> 6)       );
+               *cur++ = scale * ((*in >> 4) & 0x03);
+               *cur++ = scale * ((*in >> 2) & 0x03);
+               *cur++ = scale * ((*in     ) & 0x03);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 6)       );
+            if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03);
+            if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03);
+         } else if (depth == 1) {
+            for (k=x*img_n; k >= 8; k-=8, ++in) {
+               *cur++ = scale * ((*in >> 7)       );
+               *cur++ = scale * ((*in >> 6) & 0x01);
+               *cur++ = scale * ((*in >> 5) & 0x01);
+               *cur++ = scale * ((*in >> 4) & 0x01);
+               *cur++ = scale * ((*in >> 3) & 0x01);
+               *cur++ = scale * ((*in >> 2) & 0x01);
+               *cur++ = scale * ((*in >> 1) & 0x01);
+               *cur++ = scale * ((*in     ) & 0x01);
+            }
+            if (k > 0) *cur++ = scale * ((*in >> 7)       );
+            if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01);
+            if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01);
+            if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01);
+            if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01);
+            if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01);
+            if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01);
+         }
+         if (img_n != out_n) {
+            int q;
+            // insert alpha = 255
+            cur = a->out + stride*j;
+            if (img_n == 1) {
+               for (q=x-1; q >= 0; --q) { // walk backwards so unread source bytes are not overwritten
+                  cur[q*2+1] = 255;
+                  cur[q*2+0] = cur[q];
+               }
+            } else {
+               STBI_ASSERT(img_n == 3);
+               for (q=x-1; q >= 0; --q) {
+                  cur[q*4+3] = 255;
+                  cur[q*4+2] = cur[q*3+2];
+                  cur[q*4+1] = cur[q*3+1];
+                  cur[q*4+0] = cur[q*3+0];
+               }
+            }
+         }
+      }
+   } else if (depth == 16) {
+      // force the image data from big-endian to platform-native.
+      // this is done in a separate pass due to the decoding relying
+      // on the data being untouched, but could probably be done
+      // per-line during decode if care is taken.
+      stbi_uc *cur = a->out;
+      stbi__uint16 *cur16 = (stbi__uint16*)cur;
+
+      for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) {
+         *cur16 = (cur[0] << 8) | cur[1];
+      }
+   }
+
+   return 1;
+}
+
+// Decode the inflated scanline data into a->out: directly when not interlaced, otherwise pass by pass. Returns 1 on success; failures return 0 or the result of stbi__err.
+static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced)
+{
+   int out_bytes = out_n * (depth == 16 ? 2 : 1); // bytes per output pixel
+   stbi_uc *final;
+   int p;
+   if (!interlaced)
+      return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color);
+   // de-interlacing: each of the 7 passes is decoded as its own sub-image, then scattered into 'final'
+   final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0);
+   if (!final) return stbi__err("outofmem", "Out of memory"); // without this check the memcpy below would write through NULL
+   for (p=0; p < 7; ++p) {
+      int xorig[] = { 0,4,0,2,0,1,0 };
+      int yorig[] = { 0,0,4,0,2,0,1 };
+      int xspc[]  = { 8,8,4,4,2,2,1 };
+      int yspc[]  = { 8,8,8,4,4,2,2 };
+      int i,j,x,y;
+      // size of this pass: how many positions orig, orig+spc, orig+2*spc, ... fall inside the image (can be 0)
+      x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p];
+      y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p];
+      if (x && y) {
+         stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y;
+         if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) {
+            STBI_FREE(final);
+            return 0;
+         }
+         for (j=0; j < y; ++j) {
+            for (i=0; i < x; ++i) {
+               int out_y = j*yspc[p]+yorig[p];
+               int out_x = i*xspc[p]+xorig[p];
+               memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes,
+                      a->out + (j*x+i)*out_bytes, out_bytes);
+            }
+         }
+         STBI_FREE(a->out);
+         image_data += img_len; // advance past the bytes this pass consumed
+         image_data_len -= img_len;
+      }
+   }
+   a->out = final;
+
+   return 1;
+}
+
+static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n)
+{
+   // Turn the color key 'tc' into alpha. The output is expected to carry
+   // alpha 255 already; pixels equal to the key end up with alpha 0.
+   stbi__uint32 remaining = z->s->img_x * z->s->img_y;
+   stbi_uc *px = z->out;
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      // two components: alpha is rewritten for every pixel
+      for (; remaining > 0; --remaining, px += 2)
+         px[1] = (px[0] == tc[0]) ? 0 : 255;
+   } else {
+      // four components: only pixels matching all three key bytes are touched
+      for (; remaining > 0; --remaining, px += 4) {
+         if (px[0] != tc[0] || px[1] != tc[1] || px[2] != tc[2])
+            continue;
+         px[3] = 0;
+      }
+   }
+
+   return 1;
+}
+
+static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n)
+{
+   // 16-bit variant: turn the color key 'tc' into alpha. The output is expected
+   // to carry alpha 65535 already; pixels equal to the key end up with alpha 0.
+   stbi__uint32 remaining = z->s->img_x * z->s->img_y;
+   stbi__uint16 *px = (stbi__uint16*) z->out;
+
+   STBI_ASSERT(out_n == 2 || out_n == 4);
+
+   if (out_n == 2) {
+      // two components: alpha is rewritten for every pixel
+      for (; remaining > 0; --remaining, px += 2)
+         px[1] = (px[0] == tc[0]) ? 0 : 65535;
+   } else {
+      // four components: only pixels matching all three key values are touched
+      for (; remaining > 0; --remaining, px += 4) {
+         if (px[0] != tc[0] || px[1] != tc[1] || px[2] != tc[2])
+            continue;
+         px[3] = 0;
+      }
+   }
+
+   return 1;
+}
+
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
+{
+   // Swap a->out (one palette index per pixel) for a new buffer holding the looked-up
+   // colors; palette entries sit 4 bytes apart. Returns 1, or the stbi__err result on allocation failure.
+   stbi__uint32 n, pixel_count = a->s->img_x * a->s->img_y;
+   const stbi_uc *index = a->out;
+   stbi_uc *expanded, *dst;
+
+   STBI_NOTUSED(len);
+   expanded = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
+   if (expanded == NULL) return stbi__err("outofmem", "Out of memory");
+
+   // between here and free(out) below, exiting would leak
+   dst = expanded;
+   if (pal_img_n == 3) {
+      for (n=0; n < pixel_count; ++n, dst += 3) {
+         const stbi_uc *entry = palette + index[n]*4;
+         dst[0] = entry[0];
+         dst[1] = entry[1];
+         dst[2] = entry[2];
+      }
+   } else {
+      for (n=0; n < pixel_count; ++n, dst += 4) {
+         const stbi_uc *entry = palette + index[n]*4;
+         dst[0] = entry[0];
+         dst[1] = entry[1];
+         dst[2] = entry[2];
+         dst[3] = entry[3];
+      }
+   }
+
+   STBI_FREE(a->out);
+   a->out = expanded;
+
+   return 1;
+}
+
+static int stbi__unpremultiply_on_load = 0; // read by stbi__de_iphone: when non-zero, 4-component pixels are also divided by their alpha
+static int stbi__de_iphone_flag = 0; // read at IEND: when non-zero, CgBI images with more than 2 output components go through stbi__de_iphone
+
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load = flag_true_if_should_unpremultiply; // file-scope setting, not stored per image
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag = flag_true_if_should_convert; // file-scope setting, not stored per image
+}
+
+static void stbi__de_iphone(stbi__png *z)
+{
+   // swap bytes 0 and 2 of every output pixel (bgr -> rgb); 4-byte pixels may also be unpremultiplied
+   stbi__context *s = z->s;
+   stbi__uint32 left = s->img_x * s->img_y;
+   stbi_uc *px = z->out;
+
+   if (s->img_out_n == 3) {
+      for (; left > 0; --left, px += 3) {
+         stbi_uc b0 = px[0];
+         px[0] = px[2];
+         px[2] = b0;
+      }
+      return;
+   }
+
+   STBI_ASSERT(s->img_out_n == 4);
+   if (!stbi__unpremultiply_on_load) {
+      // convert bgr to rgb
+      for (; left > 0; --left, px += 4) {
+         stbi_uc b0 = px[0];
+         px[0] = px[2];
+         px[2] = b0;
+      }
+      return;
+   }
+
+   // convert bgr to rgb and unpremultiply; alpha 0 falls back to the plain swap
+   for (; left > 0; --left, px += 4) {
+      stbi_uc alpha = px[3];
+      stbi_uc b0 = px[0];
+      if (alpha == 0) {
+         px[0] = px[2];
+         px[2] = b0;
+         continue;
+      }
+      px[0] = px[2] * 255 / alpha;
+      px[1] = px[1] * 255 / alpha;
+      px[2] =  b0   * 255 / alpha;
+   }
+}
+
+#define STBI__PNG_TYPE(a,b,c,d)  (((a) << 24) + ((b) << 16) + ((c) << 8) + (d)) // packs four chunk-name characters into one value, first character in the top byte
+
+static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp)
+{ // walks the chunk list. STBI__SCAN_type stops after the signature, STBI__SCAN_header once the component count is known, STBI__SCAN_load decodes into z->out at IEND. Returns 1 on success.
+   stbi_uc palette[1024], pal_img_n=0; // palette: up to 256 entries of 4 bytes (r,g,b,a); pal_img_n: 0 = not paletted, else 3 or 4
+   stbi_uc has_trans=0, tc[3];
+   stbi__uint16 tc16[3];
+   stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; // ioff: IDAT bytes collected so far; idata_limit: capacity of z->idata
+   int first=1,k,interlace=0, color=0, is_iphone=0;
+   stbi__context *s = z->s;
+
+   z->expanded = NULL;
+   z->idata = NULL;
+   z->out = NULL;
+
+   if (!stbi__check_png_header(s)) return 0;
+
+   if (scan == STBI__SCAN_type) return 1;
+
+   for (;;) {
+      stbi__pngchunk c = stbi__get_chunk_header(s);
+      switch (c.type) {
+         case STBI__PNG_TYPE('C','g','B','I'):
+            is_iphone = 1;
+            stbi__skip(s, c.length);
+            break;
+         case STBI__PNG_TYPE('I','H','D','R'): {
+            int comp,filter;
+            if (!first) return stbi__err("multiple IHDR","Corrupt PNG");
+            first = 0;
+            if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG");
+            s->img_x = stbi__get32be(s); if (s->img_x > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
+            s->img_y = stbi__get32be(s); if (s->img_y > (1 << 24)) return stbi__err("too large","Very large image (corrupt?)");
+            z->depth = stbi__get8(s);  if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16)  return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only");
+            color = stbi__get8(s);  if (color > 6)         return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3 && z->depth == 16)                  return stbi__err("bad ctype","Corrupt PNG");
+            if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG");
+            comp  = stbi__get8(s);  if (comp) return stbi__err("bad comp method","Corrupt PNG");
+            filter= stbi__get8(s);  if (filter) return stbi__err("bad filter method","Corrupt PNG");
+            interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG");
+            if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG");
+            if (!pal_img_n) {
+               s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0);
+               if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode");
+               if (scan == STBI__SCAN_header) return 1;
+            } else {
+               // if paletted, then pal_n is our final components, and
+               // img_n is # components to decompress/filter.
+               s->img_n = 1;
+               if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG");
+               // if SCAN_header, have to scan to see if we have a tRNS
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('P','L','T','E'):  {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG");
+            pal_len = c.length / 3;
+            if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG");
+            for (i=0; i < pal_len; ++i) {
+               palette[i*4+0] = stbi__get8(s);
+               palette[i*4+1] = stbi__get8(s);
+               palette[i*4+2] = stbi__get8(s);
+               palette[i*4+3] = 255; // opaque unless a later tRNS chunk overrides it
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('t','R','N','S'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG");
+            if (pal_img_n) {
+               if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; }
+               if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG");
+               if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG");
+               pal_img_n = 4;
+               for (i=0; i < c.length; ++i)
+                  palette[i*4+3] = stbi__get8(s);
+            } else {
+               if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG");
+               if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG");
+               has_trans = 1;
+               if (z->depth == 16) {
+                  for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is
+               } else {
+                  for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger
+               }
+            }
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','D','A','T'): {
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG");
+            if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; }
+            if ((int)(ioff + c.length) < (int)ioff) return 0; // reject when the running total would not fit in a positive int
+            if (ioff + c.length > idata_limit) {
+               stbi__uint32 idata_limit_old = idata_limit;
+               stbi_uc *p;
+               if (idata_limit == 0) idata_limit = c.length > 4096 ? c.length : 4096;
+               while (ioff + c.length > idata_limit)
+                  idata_limit *= 2;
+               STBI_NOTUSED(idata_limit_old);
+               p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory");
+               z->idata = p;
+            }
+            if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG");
+            ioff += c.length;
+            break;
+         }
+
+         case STBI__PNG_TYPE('I','E','N','D'): {
+            stbi__uint32 raw_len, bpl;
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if (scan != STBI__SCAN_load) return 1;
+            if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG");
+            // initial guess for decoded data size to avoid unnecessary reallocs
+            bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component
+            raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */;
+            z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone);
+            if (z->expanded == NULL) return 0; // zlib should set error
+            STBI_FREE(z->idata); z->idata = NULL;
+            if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans)
+               s->img_out_n = s->img_n+1;
+            else
+               s->img_out_n = s->img_n;
+            if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0;
+            if (has_trans) {
+               if (z->depth == 16) {
+                  if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0;
+               } else {
+                  if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0;
+               }
+            }
+            if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2)
+               stbi__de_iphone(z);
+            if (pal_img_n) {
+               // pal_img_n == 3 or 4
+               s->img_n = pal_img_n; // record the actual colors we had
+               s->img_out_n = pal_img_n;
+               if (req_comp >= 3) s->img_out_n = req_comp;
+               if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n))
+                  return 0;
+            }
+            STBI_FREE(z->expanded); z->expanded = NULL;
+            return 1;
+         }
+
+         default:
+            // if critical, fail
+            if (first) return stbi__err("first not IHDR", "Corrupt PNG");
+            if ((c.type & (1 << 29)) == 0) { // bit 29 is bit 5 of the first type character; when clear the chunk is treated as critical
+               #ifndef STBI_NO_FAILURE_STRINGS
+               // not threadsafe
+               static char invalid_chunk[] = "XXXX PNG chunk not known";
+               invalid_chunk[0] = STBI__BYTECAST(c.type >> 24);
+               invalid_chunk[1] = STBI__BYTECAST(c.type >> 16);
+               invalid_chunk[2] = STBI__BYTECAST(c.type >>  8);
+               invalid_chunk[3] = STBI__BYTECAST(c.type >>  0);
+               #endif
+               return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type");
+            }
+            stbi__skip(s, c.length);
+            break;
+      }
+      // end of PNG chunk, read and skip CRC
+      stbi__get32be(s);
+   }
+}
+
+static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri)
+{
+   // Full decode. On success returns the pixel buffer (taken out of p->out),
+   // converted to req_comp components when that was requested; NULL otherwise.
+   void *pixels = NULL;
+   if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+   if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) {
+      stbi__context *s = p->s;
+      ri->bits_per_channel = (p->depth < 8) ? 8 : p->depth; // depths below 8 are reported as 8
+      pixels = p->out;
+      p->out = NULL;
+      if (req_comp && req_comp != s->img_out_n) {
+         pixels = (ri->bits_per_channel == 8)
+            ? (void *) stbi__convert_format((unsigned char *) pixels, s->img_out_n, req_comp, s->img_x, s->img_y)
+            : (void *) stbi__convert_format16((stbi__uint16 *) pixels, s->img_out_n, req_comp, s->img_x, s->img_y);
+         s->img_out_n = req_comp;
+         if (pixels == NULL) return pixels;
+      }
+      *x = s->img_x;
+      *y = s->img_y;
+      if (n) *n = s->img_n;
+   }
+
+   // release whatever the parser left behind
+   STBI_FREE(p->out);      p->out      = NULL;
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
+   return pixels;
+}
+
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__png p; // only 's' is set here; stbi__parse_png_file resets idata/expanded/out before using them
+   p.s = s;
+   return stbi__do_png(&p, x,y,comp,req_comp, ri); // thin wrapper: all decoding happens in stbi__do_png
+}
+
+static int stbi__png_test(stbi__context *s)
+{
+   // check for the PNG signature, then rewind the stream whatever the outcome
+   int is_png = stbi__check_png_header(s);
+   stbi__rewind(s);
+   return is_png;
+}
+
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
+{
+   if (stbi__parse_png_file(p, STBI__SCAN_header, 0)) { // header-only scan; every out-pointer is optional
+      if (x) *x = p->s->img_x;
+      if (y) *y = p->s->img_y;
+      if (comp) *comp = p->s->img_n;
+      return 1;
+   }
+   stbi__rewind( p->s ); // reached only when the scan failed
+   return 0;
+}
+
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__png p; // only 's' is set here; stbi__parse_png_file resets the buffer pointers itself
+   p.s = s;
+   return stbi__png_info_raw(&p, x, y, comp); // thin wrapper around the header-only scan
+}
+#endif
+
+// Microsoft/Windows BMP image
+
+#ifndef STBI_NO_BMP
+static int stbi__bmp_test_raw(stbi__context *s)
+{
+   // 'B','M' magic, 12 discarded bytes, then a header size that must be one of the accepted values
+   int hsz;
+   if (stbi__get8(s) != 'B') return 0;
+   if (stbi__get8(s) != 'M') return 0;
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   stbi__get32le(s); // discard data offset
+   hsz = stbi__get32le(s);
+   switch (hsz) { case 12: case 40: case 56: case 108: case 124: return 1; }
+   return 0;
+}
+
+static int stbi__bmp_test(stbi__context *s)
+{
+   int looks_like_bmp = stbi__bmp_test_raw(s);
+   stbi__rewind(s); // rewind whatever the outcome
+   return looks_like_bmp;
+}
+
+
+// returns 0..31 for the highest set bit
+static int stbi__high_bit(unsigned int z)
+{
+   int step, n = 0;
+   if (z == 0) return -1;
+   // binary search: for step = 16, 8, 4, 2, 1, whenever anything remains
+   // above bit 'step', count those bits and shift them away
+   for (step = 16; step > 0; step >>= 1) {
+      if (z >> step) { n += step; z >>= step; }
+   }
+   return n;
+}
+
+// Population count: number of 1 bits in the low 32 bits of a.
+static int stbi__bitcount(unsigned int a)
+{
+   int ones = 0;
+   a &= 0xffffffffu;   // only the low 32 bits are counted
+   // each a &= a-1 clears the lowest set bit, so the loop runs once per 1 bit
+   for (; a != 0; a &= a - 1) ++ones;
+   return ones;
+}
+
+// Scale one masked channel value to the 0..255 range.
+//   v     - pixel value already ANDed with the channel mask
+//   shift - (index of the mask's top bit) - 7: positive shifts right, negative shifts left
+//   bits  - number of bits set in the mask; the top `bits` bits are replicated downwards
+// Works in unsigned arithmetic so a mask using bit 31 is not sign-extended; bits <= 0 yields 0.
+static int stbi__shiftsigned(int v, int shift, int bits)
+{
+   unsigned int u = (unsigned int) v;   // unsigned: well-defined shifts, no sign extension
+   unsigned int result;
+   int z;
+   if (bits <= 0) return 0;             // empty mask: the replication loop would never end
+   if (shift < 0) u <<= -shift; else u >>= shift;
+   result = u;
+   for (z = bits; z < 8; z += bits) result += u >> z;
+   return (int) result;
+}
+
+typedef struct   // BMP header fields handed from stbi__bmp_parse_header to stbi__bmp_load
+{
+   int bpp, offset, hsz;   // bits per pixel, data offset read from the file header, info-header size
+   unsigned int mr,mg,mb,ma, all_a;   // R/G/B/A channel bit masks; all_a seeds the loader's alpha OR-accumulator
+} stbi__bmp_data;
+
+static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info)   // reads the BMP headers into *info and s->img_x/img_y; non-NULL on success
+{
+   int hsz;
+   if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP");
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   info->offset = stbi__get32le(s);   // data offset; the loader uses it to size the palette and skip to the pixels
+   info->hsz = hsz = stbi__get32le(s);   // info-header size; selects which layout is parsed below
+   info->mr = info->mg = info->mb = info->ma = 0;   // masks stay 0 unless a branch below sets them
+
+   if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown");
+   if (hsz == 12) {   // the 12-byte header stores width/height as 16-bit values
+      s->img_x = stbi__get16le(s);
+      s->img_y = stbi__get16le(s);
+   } else {   // every other accepted header stores them as 32-bit values
+      s->img_x = stbi__get32le(s);
+      s->img_y = stbi__get32le(s);
+   }
+   if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP");   // presumably the colour-planes field, which must be 1 -- TODO confirm
+   info->bpp = stbi__get16le(s);
+   if (info->bpp == 1) return stbi__errpuc("monochrome", "BMP type not supported: 1-bit");
+   if (hsz != 12) {   // the remaining fields do not exist in the 12-byte header
+      int compress = stbi__get32le(s);   // 0 and 3 are handled below; 1 and 2 are rejected as RLE
+      if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE");
+      stbi__get32le(s); // discard sizeof
+      stbi__get32le(s); // discard hres
+      stbi__get32le(s); // discard vres
+      stbi__get32le(s); // discard colorsused
+      stbi__get32le(s); // discard max important
+      if (hsz == 40 || hsz == 56) {
+         if (hsz == 56) {   // the 56-byte header carries four extra 32-bit values, read and ignored here
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+            stbi__get32le(s);
+         }
+         if (info->bpp == 16 || info->bpp == 32) {   // only these depths get channel masks in this branch
+            if (compress == 0) {   // no masks in the file: install fixed defaults
+               if (info->bpp == 32) {
+                  info->mr = 0xffu << 16;
+                  info->mg = 0xffu <<  8;
+                  info->mb = 0xffu <<  0;
+                  info->ma = 0xffu << 24;
+                  info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0
+               } else {   // 16 bpp default: 5 bits per channel, no alpha mask
+                  info->mr = 31u << 10;
+                  info->mg = 31u <<  5;
+                  info->mb = 31u <<  0;
+               }
+            } else if (compress == 3) {   // masks are stored in the file as three 32-bit values
+               info->mr = stbi__get32le(s);
+               info->mg = stbi__get32le(s);
+               info->mb = stbi__get32le(s);
+               // not documented, but generated by photoshop and handled by mspaint
+               if (info->mr == info->mg && info->mg == info->mb) {
+                  // ?!?!?
+                  return stbi__errpuc("bad BMP", "bad BMP");   // three identical masks are rejected
+               }
+            } else
+               return stbi__errpuc("bad BMP", "bad BMP");   // any other compression value is unsupported here
+         }
+      } else {
+         int i;
+         if (hsz != 108 && hsz != 124)   // cannot fail given the size check above; kept as a safeguard
+            return stbi__errpuc("bad BMP", "bad BMP");
+         info->mr = stbi__get32le(s);   // these headers always carry four masks, including alpha
+         info->mg = stbi__get32le(s);
+         info->mb = stbi__get32le(s);
+         info->ma = stbi__get32le(s);
+         stbi__get32le(s); // discard color space
+         for (i=0; i < 12; ++i)
+            stbi__get32le(s); // discard color space parameters
+         if (hsz == 124) {
+            stbi__get32le(s); // discard rendering intent
+            stbi__get32le(s); // discard offset of profile data
+            stbi__get32le(s); // discard size of profile data
+            stbi__get32le(s); // discard reserved
+         }
+      }
+   }
+   return (void *) 1;   // non-NULL success marker; callers only compare it against NULL
+}
+
+
+static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)   // decodes a BMP into 8-bit pixels; NULL on error
+{
+   stbi_uc *out;
+   unsigned int mr=0,mg=0,mb=0,ma=0, all_a;
+   stbi_uc pal[256][4];   // palette; each entry's first three file bytes land in [2],[1],[0]
+   int psize=0,i,j,width;
+   int flip_vertically, pad, target;
+   stbi__bmp_data info;
+   STBI_NOTUSED(ri);
+
+   info.all_a = 255;   // non-zero default; the header parser resets it to 0 only for 32 bpp with compress == 0
+   if (stbi__bmp_parse_header(s, &info) == NULL)
+      return NULL; // error code already set
+
+   flip_vertically = ((int) s->img_y) > 0;   // a positive height means the rows are swapped end-for-end after decoding
+   s->img_y = abs((int) s->img_y);
+
+   mr = info.mr;
+   mg = info.mg;
+   mb = info.mb;
+   ma = info.ma;
+   all_a = info.all_a;
+
+   if (info.hsz == 12) {   // palette entry count is derived from the gap before the pixel data
+      if (info.bpp < 24)
+         psize = (info.offset - 14 - 24) / 3;   // 3 bytes per entry with the 12-byte header
+   } else {
+      if (info.bpp < 16)
+         psize = (info.offset - 14 - info.hsz) >> 2;   // 4 bytes per entry otherwise
+   }
+
+   s->img_n = ma ? 4 : 3;   // an alpha mask means the file has 4 channels
+   if (req_comp && req_comp >= 3) // we can directly decode 3 or 4
+      target = req_comp;
+   else
+      target = s->img_n; // if they want monochrome, we'll post-convert
+
+   // sanity-check size
+   if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "Corrupt BMP");
+
+   out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0);
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   if (info.bpp < 16) {   // paletted image (only 4 and 8 bpp are accepted below)
+      int z=0;
+      if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); }
+      for (i=0; i < psize; ++i) {
+         pal[i][2] = stbi__get8(s);
+         pal[i][1] = stbi__get8(s);
+         pal[i][0] = stbi__get8(s);
+         if (info.hsz != 12) stbi__get8(s);   // 4th byte of each entry is read and ignored
+         pal[i][3] = 255;
+      }
+      stbi__skip(s, info.offset - 14 - info.hsz - psize * (info.hsz == 12 ? 3 : 4));   // skip whatever lies between the palette and the pixel data
+      if (info.bpp == 4) width = (s->img_x + 1) >> 1;   // bytes per row: two pixels per byte at 4 bpp
+      else if (info.bpp == 8) width = s->img_x;
+      else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); }
+      pad = (-width)&3;   // bytes needed to round each row up to a multiple of 4
+      for (j=0; j < (int) s->img_y; ++j) {
+         for (i=0; i < (int) s->img_x; i += 2) {   // two pixels per iteration
+            int v=stbi__get8(s),v2=0;
+            if (info.bpp == 4) {   // high nibble is the first pixel, low nibble the second
+               v2 = v & 15;
+               v >>= 4;
+            }
+            out[z++] = pal[v][0];
+            out[z++] = pal[v][1];
+            out[z++] = pal[v][2];
+            if (target == 4) out[z++] = 255;
+            if (i+1 == (int) s->img_x) break;   // odd width: the row ends after the first pixel of the pair
+            v = (info.bpp == 8) ? stbi__get8(s) : v2;
+            out[z++] = pal[v][0];
+            out[z++] = pal[v][1];
+            out[z++] = pal[v][2];
+            if (target == 4) out[z++] = 255;
+         }
+         stbi__skip(s, pad);
+      }
+   } else {   // 16 bpp and up: direct colour, decoded bytewise or through the channel masks
+      int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0;
+      int z = 0;
+      int easy=0;   // 0: mask-based decode, 1: 24 bpp bytes, 2: 32 bpp bytes with the default masks
+      stbi__skip(s, info.offset - 14 - info.hsz);
+      if (info.bpp == 24) width = 3 * s->img_x;
+      else if (info.bpp == 16) width = 2*s->img_x;
+      else /* bpp = 32 and pad = 0 */ width=0;
+      pad = (-width) & 3;
+      if (info.bpp == 24) {
+         easy = 1;
+      } else if (info.bpp == 32) {
+         if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000)
+            easy = 2;
+      }
+      if (!easy) {
+         if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); }
+         // right shift amt to put high bit in position #7
+         rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr);
+         gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg);
+         bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb);
+         ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma);
+      }
+      for (j=0; j < (int) s->img_y; ++j) {
+         if (easy) {
+            for (i=0; i < (int) s->img_x; ++i) {
+               unsigned char a;
+               out[z+2] = stbi__get8(s);   // first three file bytes are stored in reverse order
+               out[z+1] = stbi__get8(s);
+               out[z+0] = stbi__get8(s);
+               z += 3;
+               a = (easy == 2 ? stbi__get8(s) : 255);   // the 4th byte is consumed even when target is 3
+               all_a |= a;   // stays 0 only if it started at 0 and every alpha read was 0
+               if (target == 4) out[z++] = a;
+            }
+         } else {
+            int bpp = info.bpp;
+            for (i=0; i < (int) s->img_x; ++i) {
+               stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s));
+               int a;
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount));
+               out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount));
+               a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255);
+               all_a |= a;
+               if (target == 4) out[z++] = STBI__BYTECAST(a);
+            }
+         }
+         stbi__skip(s, pad);
+      }
+   }
+
+   // if alpha channel is all 0s, replace with all 255s
+   if (target == 4 && all_a == 0)
+      for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4)   // visits every 4th byte (the alpha slot), last pixel first
+         out[i] = 255;
+
+   if (flip_vertically) {
+      stbi_uc t;
+      for (j=0; j < (int) s->img_y>>1; ++j) {   // swap row j with row img_y-1-j
+         stbi_uc *p1 = out +      j     *s->img_x*target;
+         stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target;
+         for (i=0; i < (int) s->img_x*target; ++i) {
+            t = p1[i], p1[i] = p2[i], p2[i] = t;
+         }
+      }
+   }
+
+   if (req_comp && req_comp != target) {   // only reached when req_comp is below 3
+      out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp) *comp = s->img_n;   // reports the file's channel count, not req_comp
+   return out;
+}
+#endif
+
+// Targa Truevision - TGA
+// by Jonathan Dummer
+#ifndef STBI_NO_TGA
+// Map a TGA bit depth to a component count (STBI_grey, STBI_grey_alpha, STBI_rgb, 3 or 4); 0 if unsupported.
+// When is_rgb16 is non-NULL it is set to 1 for 15/16-bit colour data and to 0 otherwise.
+static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16)
+{
+   int packed_rgb;
+   // only RGB or RGBA (incl. 16bit) or grey allowed
+   if (is_rgb16) *is_rgb16 = 0;
+   if (bits_per_pixel == 8) return STBI_grey;
+   if (bits_per_pixel == 24 || bits_per_pixel == 32) return bits_per_pixel / 8;
+   // 16 bits is grey+alpha for grey images; otherwise 15 and 16 bits are both packed RGB
+   if (bits_per_pixel == 16 && is_grey) return STBI_grey_alpha;
+   packed_rgb = (bits_per_pixel == 15 || bits_per_pixel == 16);
+   if (!packed_rgb) return 0;
+   if (is_rgb16) *is_rgb16 = 1;
+   return STBI_rgb;
+}
+
+// Header-only TGA query.
+//   s     - stream; header fields are read from its current position
+//   x, y  - receive width and height when non-NULL
+//   comp  - receives the component count from stbi__tga_get_comp when non-NULL
+// Returns 1 when the header describes a supported image. On every rejection the stream
+// is rewound and 0 is returned; on success the stream is not rewound here.
+static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp)
+{
+    int width, height, components, image_type, pixel_bits, palette_entry_bits;
+    int colormap_type;
+
+    stbi__get8(s);                             // discard Offset
+    colormap_type = stbi__get8(s);
+    if (colormap_type > 1) goto unsupported;   // only RGB or indexed allowed
+
+    image_type = stbi__get8(s);
+    if (colormap_type == 1) {
+        // colormapped (paletted) image: type must be 1 or 9
+        if (!(image_type == 1 || image_type == 9)) goto unsupported;
+        stbi__skip(s,4);                       // skip index of first colormap entry and number of entries
+        palette_entry_bits = stbi__get8(s);
+        switch (palette_entry_bits) {          // check bits per palette color entry
+            case 8: case 15: case 16: case 24: case 32: break;
+            default: goto unsupported;
+        }
+        stbi__skip(s,4);                       // skip image x and y origin
+    } else {
+        // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE
+        switch (image_type) {
+            case 2: case 3: case 10: case 11: break;
+            default: goto unsupported;
+        }
+        stbi__skip(s,9);                       // skip colormap specification and image x/y origin
+        palette_entry_bits = 0;
+    }
+
+    // both dimensions must be at least 1
+    width = stbi__get16le(s);
+    if (width < 1) goto unsupported;
+    height = stbi__get16le(s);
+    if (height < 1) goto unsupported;
+
+    pixel_bits = stbi__get8(s);
+    stbi__get8(s);                             // ignore alpha bits
+    if (palette_entry_bits != 0) {
+        // with a colormap, pixel_bits is the size of an index; only 8 or 16 is accepted
+        if (pixel_bits != 8 && pixel_bits != 16) goto unsupported;
+        components = stbi__tga_get_comp(palette_entry_bits, 0, NULL);
+    } else {
+        int grey = (image_type == 3) || (image_type == 11);
+        components = stbi__tga_get_comp(pixel_bits, grey, NULL);
+    }
+    if (components == 0) goto unsupported;
+
+    // seems to have passed everything
+    if (x) *x = width;
+    if (y) *y = height;
+    if (comp) *comp = components;
+    return 1;
+
+unsupported:
+    stbi__rewind(s);
+    return 0;
+}
+
+// Cheap TGA plausibility test on the header fields; the stream is always rewound before
+// returning 1 (looks like a supported TGA) or 0.
+static int stbi__tga_test(stbi__context *s)
+{
+   int ok = 0;
+   int field, colormap_type;
+   stbi__get8(s);                       // discard Offset
+   colormap_type = stbi__get8(s);
+   if (colormap_type > 1) goto done;    // only RGB or indexed allowed
+   field = stbi__get8(s);               // image type
+   if (colormap_type == 1) {            // colormapped (paletted) image
+      if (!(field == 1 || field == 9)) goto done;   // colortype 1 demands image type 1 or 9
+      stbi__skip(s,4);                  // skip index of first colormap entry and number of entries
+      field = stbi__get8(s);            // bits per palette color entry
+      if (!(field == 8 || field == 15 || field == 16 || field == 24 || field == 32)) goto done;
+      stbi__skip(s,4);                  // skip image x and y origin
+   } else {                             // "normal" image w/o colormap
+      if (!(field == 2 || field == 3 || field == 10 || field == 11)) goto done;   // only RGB or grey allowed, +/- RLE
+      stbi__skip(s,9);                  // skip colormap specification and image x/y origin
+   }
+   if (stbi__get16le(s) < 1) goto done; // test width
+   if (stbi__get16le(s) < 1) goto done; // test height
+   field = stbi__get8(s);               // bits per pixel
+   // for colormapped images, bits per pixel is the size of an index
+   if (colormap_type == 1 && !(field == 8 || field == 16)) goto done;
+   ok = (field == 8 || field == 15 || field == 16 || field == 24 || field == 32);
+done:
+   stbi__rewind(s);
+   return ok;
+}
+
+// Read one 16-bit little-endian pixel and expand its three 5-bit channels to 8-bit RGB.
+//   s   - stream to read the 16-bit value from
+//   out - receives 3 bytes: out[0]=R, out[1]=G, out[2]=B
+static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out)
+{
+   const stbi__uint16 px = (stbi__uint16)stbi__get16le(s);
+   int channel;
+   // Red sits in bits 10..14, green in 5..9, blue in 0..4. Writing them out in that order
+   // saves the data in RGB(A) order, so it doesn't need to be swapped later.
+   for (channel = 0; channel < 3; ++channel) {
+      const int five_bits = (px >> (10 - 5*channel)) & 31;
+      out[channel] = (stbi_uc)((five_bits * 255)/31);
+   }
+
+   // some people claim that the most significant bit might be used for alpha
+   // (possibly if an alpha-bit is set in the "image descriptor byte")
+   // but that only made 16bit test images completely translucent..
+   // so let's treat all 15 and 16bit TGAs as RGB with no alpha.
+}
+
+// Decode a TGA image (true-colour, grey or colour-mapped; raw or RLE) into 8-bit pixels.
+//   s        - stream; the 18-byte TGA header is read from its current position
+//   x, y     - receive the image width and height
+//   comp     - if non-NULL, receives the component count found in the file
+//   req_comp - if non-zero and different from the file's count, the pixels are converted
+//   ri       - unused
+// Returns the pixel buffer, or NULL (via stbi__errpuc) on a bad format, an oversized image,
+// allocation failure or a corrupt/empty palette.
+static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   //   read in the TGA header stuff
+   int tga_offset = stbi__get8(s);
+   int tga_indexed = stbi__get8(s);
+   int tga_image_type = stbi__get8(s);
+   int tga_is_RLE = 0;
+   int tga_palette_start = stbi__get16le(s);
+   int tga_palette_len = stbi__get16le(s);
+   int tga_palette_bits = stbi__get8(s);
+   int tga_x_origin = stbi__get16le(s);
+   int tga_y_origin = stbi__get16le(s);
+   int tga_width = stbi__get16le(s);
+   int tga_height = stbi__get16le(s);
+   int tga_bits_per_pixel = stbi__get8(s);
+   int tga_comp, tga_rgb16=0;
+   int tga_inverted = stbi__get8(s);
+   // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?)
+   //   image data
+   unsigned char *tga_data;
+   unsigned char *tga_palette = NULL;
+   int i, j;
+   unsigned char raw_data[4] = {0};
+   int RLE_count = 0;
+   int RLE_repeating = 0;
+   int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
+
+   //   do a tiny bit of preprocessing: image types >= 8 are the RLE variants
+   if ( tga_image_type >= 8 ) {
+      tga_image_type -= 8;
+      tga_is_RLE = 1;
+   }
+   tga_inverted = 1 - ((tga_inverted >> 5) & 1);   // bit 5 clear => rows are flipped below
+
+   //   If I'm paletted, then I'll use the number of bits from the palette
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+
+   //   a colour-mapped image needs at least one palette entry: the pixel loop falls back to
+   //   entry 0 for out-of-range indices, which would read past an empty palette allocation
+   if ( tga_indexed && tga_palette_len == 0 )
+      return stbi__errpuc("bad palette", "Corrupt TGA");
+
+   //   tga info
+   *x = tga_width;
+   *y = tga_height;
+   if (comp) *comp = tga_comp;
+
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
+
+   // skip to the data's starting position (offset usually = 0)
+   stbi__skip(s, tga_offset );
+
+   //   fast path: raw, non-paletted, non-rgb16 rows are read straight into their final place
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
+      for (i=0; i < tga_height; ++i) {
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
+         stbi__getn(s, tga_row, tga_width * tga_comp);
+      }
+   } else  {
+      //   do I need to load a palette?
+      if ( tga_indexed) {
+         //   any data to skip? (offset usually = 0)
+         stbi__skip(s, tga_palette_start );
+         //   load the palette
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+         if (!tga_palette) {
+            STBI_FREE(tga_data);
+            return stbi__errpuc("outofmem", "Out of memory");
+         }
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+               STBI_FREE(tga_data);
+               STBI_FREE(tga_palette);
+               return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+      }
+      //   load the data
+      //   (RLE_count / RLE_repeating carry the current packet's state across iterations)
+      for (i=0; i < tga_width * tga_height; ++i) {
+         //   if I'm in RLE mode, do I need to get a RLE chunk?
+         if ( tga_is_RLE ) {
+            if ( RLE_count == 0 ) {
+               //   yep, get the next byte as a RLE command
+               int RLE_cmd = stbi__get8(s);
+               RLE_count = 1 + (RLE_cmd & 127);
+               RLE_repeating = RLE_cmd >> 7;
+               read_next_pixel = 1;
+            } else if ( !RLE_repeating ) {
+               read_next_pixel = 1;
+            }
+         } else {
+            read_next_pixel = 1;
+         }
+         //   OK, if I need to read a pixel, do it now
+         if ( read_next_pixel ) {
+            //   load however much data we did have
+            if ( tga_indexed ) {
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ? stbi__get8(s) : stbi__get16le(s);
+               if ( pal_idx >= tga_palette_len ) {
+                  // invalid index
+                  pal_idx = 0;
+               }
+               pal_idx *= tga_comp;
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = tga_palette[pal_idx+j];
+               }
+            } else if(tga_rgb16) {
+               STBI_ASSERT(tga_comp == STBI_rgb);
+               stbi__tga_read_rgb16(s, raw_data);
+            } else {
+               //   read in the data raw
+               for (j = 0; j < tga_comp; ++j) {
+                  raw_data[j] = stbi__get8(s);
+               }
+            }
+            //   clear the reading flag for the next pixel
+            read_next_pixel = 0;
+         } // end of reading a pixel
+
+         // copy data
+         for (j = 0; j < tga_comp; ++j)
+           tga_data[i*tga_comp+j] = raw_data[j];
+
+         //   in case we're in RLE mode, keep counting down
+         --RLE_count;
+      }
+      //   do I need to invert the image?
+      if ( tga_inverted ) {
+         for (j = 0; j*2 < tga_height; ++j) {
+            int index1 = j * tga_width * tga_comp;
+            int index2 = (tga_height - 1 - j) * tga_width * tga_comp;
+            for (i = tga_width * tga_comp; i > 0; --i) {
+               unsigned char temp = tga_data[index1];
+               tga_data[index1] = tga_data[index2];
+               tga_data[index2] = temp;
+               ++index1;
+               ++index2;
+            }
+         }
+      }
+      //   clear my palette, if I had one
+      if ( tga_palette != NULL ) {
+         STBI_FREE( tga_palette );
+      }
+   }
+
+   // swap RGB - if the source data was RGB16, it already is in the right order
+   if (tga_comp >= 3 && !tga_rgb16) {
+      unsigned char* tga_pixel = tga_data;
+      for (i=0; i < tga_width * tga_height; ++i) {
+         unsigned char temp = tga_pixel[0];
+         tga_pixel[0] = tga_pixel[2];
+         tga_pixel[2] = temp;
+         tga_pixel += tga_comp;
+      }
+   }
+
+   // convert to target component count
+   if (req_comp && req_comp != tga_comp)
+      tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height);
+
+   //   the things I do to get rid of an error message, and yet keep
+   //   Microsoft's C compilers happy... [8^(
+   tga_palette_start = tga_palette_len = tga_palette_bits =
+         tga_x_origin = tga_y_origin = 0;
+   //   OK, done
+   return tga_data;
+}
+#endif
+
+// *************************************************************************************************
+// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB
+
+#ifndef STBI_NO_PSD
+static int stbi__psd_test(stbi__context *s)   // PSD probe; always rewinds the stream afterwards
+{
+   const int has_signature = (stbi__get32be(s) == 0x38425053);   // "8BPS"
+   stbi__rewind(s);
+   return has_signature;
+}
+
+// Decode the RLE stream of one channel into an interleaved 4-channel buffer.
+//   s          - stream to read compressed bytes from
+//   p          - first destination byte of the channel; successive pixels are 4 bytes apart
+//   pixelCount - number of pixels to produce
+// Returns 1 on success, 0 if a run would produce more than pixelCount pixels.
+static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount)
+{
+   int produced = 0;
+
+   while (produced < pixelCount) {
+      int remaining = pixelCount - produced;
+      int run, k;
+      int code = stbi__get8(s);
+
+      if (code == 128)
+         continue;                        // No-op.
+
+      if (code < 128) {
+         // literal run: copy the next code+1 bytes
+         run = code + 1;
+         if (run > remaining) return 0;   // corrupt data
+         for (k = 0; k < run; ++k, p += 4)
+            *p = stbi__get8(s);
+      } else {
+         // repeat run: code read as a negative 8-bit int n, next byte repeated -n+1 times
+         stbi_uc fill;
+         run = 257 - code;
+         if (run > remaining) return 0;   // corrupt data
+         fill = stbi__get8(s);
+         for (k = 0; k < run; ++k, p += 4)
+            *p = fill;
+      }
+      produced += run;
+   }
+
+   return 1;
+}
+
+static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc)   // decodes an RGB PSD into 4-channel pixels (8- or 16-bit); NULL on error
+{
+   int pixelCount;
+   int channelCount, compression;
+   int channel, i;
+   int bitdepth;
+   int w,h;
+   stbi_uc *out;
+   STBI_NOTUSED(ri);   // note: ri->bits_per_channel is nevertheless read and written below
+
+   // Check identifier
+   if (stbi__get32be(s) != 0x38425053)   // "8BPS"
+      return stbi__errpuc("not PSD", "Corrupt PSD image");
+
+   // Check file type version.
+   if (stbi__get16be(s) != 1)
+      return stbi__errpuc("wrong version", "Unsupported version of PSD image");
+
+   // Skip 6 reserved bytes.
+   stbi__skip(s, 6 );
+
+   // Read the number of channels (R, G, B, A, etc).
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16)
+      return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image");
+
+   // Read the rows and columns of the image.
+   h = stbi__get32be(s);
+   w = stbi__get32be(s);
+
+   // Make sure the depth is 8 or 16 bits.
+   bitdepth = stbi__get16be(s);
+   if (bitdepth != 8 && bitdepth != 16)
+      return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit");
+
+   // Make sure the color mode is RGB.
+   // Valid options are:
+   //   0: Bitmap
+   //   1: Grayscale
+   //   2: Indexed color
+   //   3: RGB color
+   //   4: CMYK color
+   //   7: Multichannel
+   //   8: Duotone
+   //   9: Lab color
+   if (stbi__get16be(s) != 3)
+      return stbi__errpuc("wrong color format", "PSD is not in RGB color format");
+
+   // Skip the Mode Data.  (It's the palette for indexed color; other info for other modes.)
+   stbi__skip(s,stbi__get32be(s) );
+
+   // Skip the image resources.  (resolution, pen tool paths, etc)
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Skip the reserved data.
+   stbi__skip(s, stbi__get32be(s) );
+
+   // Find out if the data is compressed.
+   // Known values:
+   //   0: no compression
+   //   1: RLE compressed
+   compression = stbi__get16be(s);
+   if (compression > 1)
+      return stbi__errpuc("bad compression", "PSD has an unknown compression format");
+
+   // Check size
+   if (!stbi__mad3sizes_valid(4, w, h, 0))
+      return stbi__errpuc("too large", "Corrupt PSD");
+
+   // Create the destination image.
+
+   if (!compression && bitdepth == 16 && bpc == 16) {   // 16-bit output only for raw 16-bit files when the caller passes bpc == 16
+      out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0);   // 4 channels x 2 bytes per pixel
+      ri->bits_per_channel = 16;
+   } else
+      out = (stbi_uc *) stbi__malloc(4 * w*h);   // 4 bytes per pixel; the product passed the size check above
+
+   if (!out) return stbi__errpuc("outofmem", "Out of memory");
+   pixelCount = w*h;
+
+   // Initialize the data to zero.
+   //memset( out, 0, pixelCount * 4 );
+
+   // Finally, the image data.
+   if (compression) {
+      // RLE as used by .PSD and .TIFF
+      // Loop until you get the number of unpacked bytes you are expecting:
+      //     Read the next source byte into n.
+      //     If n is between 0 and 127 inclusive, copy the next n+1 bytes literally.
+      //     Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times.
+      //     Else if n is 128, noop.
+      // Endloop
+
+      // The RLE-compressed data is preceded by a 2-byte data count for each row in the data,
+      // which we're going to just skip.
+      stbi__skip(s, h * channelCount * 2 );
+
+      // Read the RLE data by channel.
+      for (channel = 0; channel < 4; channel++) {   // only the first 4 channels are decoded
+         stbi_uc *p;
+
+         p = out+channel;
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            for (i = 0; i < pixelCount; i++, p += 4)
+               *p = (channel == 3 ? 255 : 0);   // missing alpha defaults to 255, missing colour to 0
+         } else {
+            // Read the RLE data.
+            if (!stbi__psd_decode_rle(s, p, pixelCount)) {
+               STBI_FREE(out);
+               return stbi__errpuc("corrupt", "bad RLE data");
+            }
+         }
+      }
+
+   } else {
+      // We're at the raw image data.  It's each channel in order (Red, Green, Blue, Alpha, ...)
+      // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image.
+
+      // Read the data by channel.
+      for (channel = 0; channel < 4; channel++) {   // only the first 4 channels are read
+         if (channel >= channelCount) {
+            // Fill this channel with default data.
+            if (bitdepth == 16 && bpc == 16) {   // same condition that selected the 16-bit buffer (compression is 0 in this branch)
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               stbi__uint16 val = channel == 3 ? 65535 : 0;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = val;
+            } else {
+               stbi_uc *p = out+channel;
+               stbi_uc val = channel == 3 ? 255 : 0;
+               for (i = 0; i < pixelCount; i++, p += 4)
+                  *p = val;
+            }
+         } else {
+            if (ri->bits_per_channel == 16) {    // output bpc
+               stbi__uint16 *q = ((stbi__uint16 *) out) + channel;
+               for (i = 0; i < pixelCount; i++, q += 4)
+                  *q = (stbi__uint16) stbi__get16be(s);
+            } else {
+               stbi_uc *p = out+channel;
+               if (bitdepth == 16) {  // input bpc
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = (stbi_uc) (stbi__get16be(s) >> 8);   // keep only the high byte of each 16-bit sample
+               } else {
+                  for (i = 0; i < pixelCount; i++, p += 4)
+                     *p = stbi__get8(s);
+               }
+            }
+         }
+      }
+   }
+
+   // remove weird white matte from PSD
+   if (channelCount >= 4) {   // only when the file itself carried an alpha channel
+      if (ri->bits_per_channel == 16) {
+         for (i=0; i < w*h; ++i) {
+            stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 65535) {   // fully transparent and fully opaque pixels are left alone
+               float a = pixel[3] / 65535.0f;
+               float ra = 1.0f / a;
+               float inv_a = 65535.0f * (1 - ra);   // negative offset: full scale times (1 - 1/alpha)
+               pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a);
+               pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a);
+               pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a);
+            }
+         }
+      } else {
+         for (i=0; i < w*h; ++i) {
+            unsigned char *pixel = out + 4*i;
+            if (pixel[3] != 0 && pixel[3] != 255) {   // fully transparent and fully opaque pixels are left alone
+               float a = pixel[3] / 255.0f;
+               float ra = 1.0f / a;
+               float inv_a = 255.0f * (1 - ra);   // negative offset: full scale times (1 - 1/alpha)
+               pixel[0] = (unsigned char) (pixel[0]*ra + inv_a);
+               pixel[1] = (unsigned char) (pixel[1]*ra + inv_a);
+               pixel[2] = (unsigned char) (pixel[2]*ra + inv_a);
+            }
+         }
+      }
+   }
+
+   // convert to desired output format
+   if (req_comp && req_comp != 4) {
+      if (ri->bits_per_channel == 16)
+         out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h);
+      else
+         out = stbi__convert_format(out, 4, req_comp, w, h);
+      if (out == NULL) return out; // stbi__convert_format frees input on failure
+   }
+
+   if (comp) *comp = 4;   // always reports 4, independent of channelCount and req_comp
+   *y = h;
+   *x = w;
+
+   return out;
+}
+#endif
+
+// *************************************************************************************************
+// Softimage PIC loader
+// by Tom Seddon
+//
+// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format
+// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/
+
+#ifndef STBI_NO_PIC
+// Read up to 4 bytes from the stream, comparing each with str; returns 1 on a full match, 0 at the first mismatch.
+static int stbi__pic_is4(stbi__context *s,const char *str)
+{
+   const char *expected = str;
+   const char *end = str + 4;
+   while (expected != end)
+      if (stbi__get8(s) != (stbi_uc) *expected++) return 0;
+   return 1;
+}
+
+// PIC signature check: 4 magic bytes, 84 bytes that are read and ignored, then "PICT".
+// Returns 1 if both tags match, 0 otherwise. Does not rewind the stream.
+static int stbi__pic_test_core(stbi__context *s)
+{
+   int remaining;
+
+   if (stbi__pic_is4(s,"\x53\x80\xF6\x34") == 0)
+      return 0;
+
+   // step over the 84 bytes between the two tags one byte at a time
+   for (remaining = 84; remaining > 0; --remaining)
+      stbi__get8(s);
+
+   return stbi__pic_is4(s,"PICT") ? 1 : 0;
+}
+
+typedef struct   // one packet descriptor, filled from the stream by stbi__pic_load_core
+{
+   stbi_uc size,type,channel;   // size: bits per channel (only 8 is accepted); type: 0 = uncompressed, 1 = pure RLE; channel: bit mask of the channels carried
+} stbi__pic_packet;
+
+// For each of the 4 channels whose bit (0x80, 0x40, 0x20, 0x10) is set in `channel`, read one
+// byte into dest[i]. Returns dest, or the result of stbi__errpuc if the stream ends first.
+static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest)
+{
+   int i;
+   for (i = 0; i < 4; ++i) {
+      const int bit = 0x80 >> i;
+      if ((channel & bit) == 0) continue;
+      if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short");
+      dest[i] = stbi__get8(s);
+   }
+   return dest;
+}
+
+// Copy src[i] to dest[i] for each channel i (0..3) whose bit (0x80 >> i) is set in `channel`.
+static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src)
+{
+   int i;
+   for (i = 0; i < 4; ++i) {
+      if (channel & (0x80 >> i)) dest[i] = src[i];
+   }
+}
+
+static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result)
+{
+   int act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
+   // Read the packet list, then decode every scanline into result (4 bytes per pixel). Returns result, or 0 on error.
+   // this will (should...) cater for even some bizarre stuff like having data
+   // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      chained = stbi__get8(s); // nonzero means another packet header follows
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      act_comp |= packet->channel; // union of the channel masks of all packets
+
+      if (stbi__at_eof(s))          return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4; // every packet starts again at the beginning of row y
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+               {
+                  int left=width, i;
+
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     if (count > left) // clamp a run that would pass the end of the row
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128) // 128 means the run length follows as a 16-bit big-endian value
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count; // the stored value is the pixel count minus one
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   // Decode a Softimage PIC image. Stores the size in *px / *py and, when comp is non-NULL, the source
+   // channel count (3 or 4) in *comp. Returns pixels with req_comp channels (0 = source count), or NULL on error.
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
+
+   for (i=0; i<92; ++i) // skip the part of the header that precedes the dimensions
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA, pre-filled with 0xff so bytes no packet writes keep that value
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+   *px = x;
+   *py = y;
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      return 0; // never hand a NULL buffer (or a possibly unset *comp) to stbi__convert_format
+   }
+   if (req_comp == 0) req_comp = *comp;
+   return stbi__convert_format(result,4,req_comp,x,y);
+}
+
+static int stbi__pic_test(stbi__context *s)
+{  // report whether the stream carries the PIC signature, then call stbi__rewind
+   int is_pic = stbi__pic_test_core(s);
+   stbi__rewind(s);
+   return is_pic;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by Jean-Marc Lienher -- simplified/shrunk by stb
+
+#ifndef STBI_NO_GIF
+typedef struct
+{
+   stbi__int16 prefix; // index of the preceding code in the chain, or -1 when there is none
+   stbi_uc first;      // first byte of the sequence this code expands to
+   stbi_uc suffix;     // last byte of the sequence; used as the color table index when emitted
+} stbi__gif_lzw;
+
+typedef struct
+{
+   int w,h;                            // canvas size read from the header
+   stbi_uc *out, *old_out;             // output buffer (always 4 components)
+   int flags, bgindex, ratio, transparent, eflags, delay; // header fields, then graphic-control-extension fields (transparent is -1 until one is read)
+   stbi_uc  pal[256][4];               // color table read by stbi__gif_header
+   stbi_uc lpal[256][4];               // color table read with an image descriptor
+   stbi__gif_lzw codes[4096];          // LZW code table
+   stbi_uc *color_table;               // points at pal or lpal, whichever the current frame uses
+   int parse, step;                    // remaining interlace passes; byte distance between decoded rows
+   int lflags;                         // flags byte of the current image descriptor
+   int start_x, start_y;               // frame origin as byte offsets (x*4, y*line_size)
+   int max_x, max_y;                   // exclusive end of the frame, same units
+   int cur_x, cur_y;                   // current write position, same units
+   int line_size;                      // bytes per canvas row (w * 4)
+} stbi__gif;
+
+static int stbi__gif_test_raw(stbi__context *s)
+{
+   // Accept exactly the signatures "GIF87a" and "GIF89a"; reading stops at the first mismatch.
+   int k, version;
+   for (k=0; k<4; ++k)
+      if (stbi__get8(s) != "GIF8"[k]) return 0;
+   version = stbi__get8(s);
+   return (version == '9' || version == '7') && stbi__get8(s) == 'a';
+}
+
+static int stbi__gif_test(stbi__context *s)
+{  // report whether the stream starts with a GIF signature, then call stbi__rewind
+   int is_gif = stbi__gif_test_raw(s);
+   stbi__rewind(s);
+   return is_gif;
+}
+
+static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp)
+{
+   // Each entry is three stream bytes stored in reverse order into slots 2,1,0;
+   // slot 3 is 0 for the entry whose index equals transp and 255 otherwise.
+   int idx, ch;
+   for (idx=0; idx < num_entries; ++idx) {
+      for (ch=2; ch >= 0; --ch) pal[idx][ch] = stbi__get8(s);
+      pal[idx][3] = (stbi_uc) ((idx == transp) ? 0 : 255);
+   }
+}
+
+static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info)
+{  // Parse the GIF signature and the header fields into g; returns 1 on success, otherwise the result of stbi__err().
+   stbi_uc version;
+   if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8')
+      return stbi__err("not GIF", "Corrupt GIF");
+   // the signature is "GIF8", then '7' or '9', then 'a'
+   version = stbi__get8(s);
+   if (version != '7' && version != '9')    return stbi__err("not GIF", "Corrupt GIF");
+   if (stbi__get8(s) != 'a')                return stbi__err("not GIF", "Corrupt GIF");
+   // signature accepted: reset the failure reason and read the fixed-size fields
+   stbi__g_failure_reason = "";
+   g->w = stbi__get16le(s);
+   g->h = stbi__get16le(s);
+   g->flags = stbi__get8(s);
+   g->bgindex = stbi__get8(s);
+   g->ratio = stbi__get8(s);
+   g->transparent = -1; // no transparent index until a graphic control extension supplies one
+
+   if (comp != 0) *comp = 4;  // can't actually tell whether it's 3 or 4 until we parse the comments
+
+   if (is_info) return 1; // info-only callers stop before the color table
+
+   if (g->flags & 0x80) // flag bit 0x80: a color table with 2 << (flags & 7) entries follows
+      stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1);
+
+   return 1;
+}
+
+static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp)
+{
+   // Report the GIF size via x/y (either may be NULL); returns 1, or 0 after calling stbi__rewind on a bad header.
+   int ok;
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   if (!g) return stbi__err("outofmem", "Out of memory"); // stbi__gif_header writes through g
+   ok = stbi__gif_header(s, g, comp, 1);
+   if (ok && x) *x = g->w;
+   if (ok && y) *y = g->h;
+   if (!ok) stbi__rewind( s );
+   STBI_FREE(g);
+   return ok ? 1 : 0;
+}
+
+static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code)
+{
+   stbi_uc *p, *c;
+   // Emit the pixels for one LZW code: the prefix chain first, then this code's own suffix byte.
+   // recurse to decode the prefixes, since the linked-list is backwards,
+   // and working backwards through an interleaved image would be nasty
+   if (g->codes[code].prefix >= 0)
+      stbi__out_gif_code(g, g->codes[code].prefix);
+
+   if (g->cur_y >= g->max_y) return; // past the last row of the frame: drop the pixel
+
+   p = &g->out[g->cur_x + g->cur_y]; // cur_x and cur_y are byte offsets, not pixel coordinates
+   c = &g->color_table[g->codes[code].suffix * 4];
+
+   if (c[3] >= 128) { // entries whose fourth byte is below 128 leave the existing pixel untouched
+      p[0] = c[2];
+      p[1] = c[1];
+      p[2] = c[0];
+      p[3] = c[3];
+   }
+   g->cur_x += 4;
+
+   if (g->cur_x >= g->max_x) { // end of row: wrap to the left edge and step down
+      g->cur_x = g->start_x;
+      g->cur_y += g->step;
+
+      while (g->cur_y >= g->max_y && g->parse > 0) { // interlaced image: begin the next pass
+         g->step = (1 << g->parse) * g->line_size;
+         g->cur_y = g->start_y + (g->step >> 1);
+         --g->parse;
+      }
+   }
+}
+
+static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g)
+{
+   stbi_uc lzw_cs;
+   stbi__int32 len, init_code;
+   stbi__uint32 first;
+   stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear;
+   stbi__gif_lzw *p;
+   // Decode the LZW data of one frame into g->out. Returns g->out at end of data, NULL (possibly via stbi__errpuc) on error.
+   lzw_cs = stbi__get8(s);
+   if (lzw_cs > 12) return NULL;
+   clear = 1 << lzw_cs; // the clear code; clear+1 is the end-of-stream code
+   first = 1; // stays set until a clear code has been seen
+   codesize = lzw_cs + 1;
+   codemask = (1 << codesize) - 1;
+   bits = 0;
+   valid_bits = 0;
+   for (init_code = 0; init_code < clear; init_code++) { // single-byte codes have no prefix
+      g->codes[init_code].prefix = -1;
+      g->codes[init_code].first = (stbi_uc) init_code;
+      g->codes[init_code].suffix = (stbi_uc) init_code;
+   }
+
+   // support no starting clear code
+   avail = clear+2;
+   oldcode = -1;
+
+   len = 0; // bytes left in the current data sub-block
+   for(;;) {
+      if (valid_bits < codesize) { // not enough bits buffered for a code: pull in one more byte
+         if (len == 0) {
+            len = stbi__get8(s); // start new block
+            if (len == 0)
+               return g->out;
+         }
+         --len;
+         bits |= (stbi__int32) stbi__get8(s) << valid_bits;
+         valid_bits += 8;
+      } else {
+         stbi__int32 code = bits & codemask;
+         bits >>= codesize;
+         valid_bits -= codesize;
+         // @OPTIMIZE: is there some way we can accelerate the non-clear path?
+         if (code == clear) {  // clear code
+            codesize = lzw_cs + 1;
+            codemask = (1 << codesize) - 1;
+            avail = clear + 2;
+            oldcode = -1;
+            first = 0;
+         } else if (code == clear + 1) { // end of stream code
+            stbi__skip(s, len); // discard the rest of this sub-block and any that follow
+            while ((len = stbi__get8(s)) > 0)
+               stbi__skip(s,len);
+            return g->out;
+         } else if (code <= avail) {
+            if (first) return stbi__errpuc("no clear code", "Corrupt GIF");
+
+            if (oldcode >= 0) { // add a table entry: the previous sequence plus one more byte
+               p = &g->codes[avail++];
+               if (avail > 4096)        return stbi__errpuc("too many codes", "Corrupt GIF");
+               p->prefix = (stbi__int16) oldcode;
+               p->first = g->codes[oldcode].first;
+               p->suffix = (code == avail) ? p->first : g->codes[code].first;
+            } else if (code == avail)
+               return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+
+            stbi__out_gif_code(g, (stbi__uint16) code);
+
+            if ((avail & codemask) == 0 && avail <= 0x0FFF) { // table filled the current code width: widen by one bit
+               codesize++;
+               codemask = (1 << codesize) - 1;
+            }
+
+            oldcode = code;
+         } else {
+            return stbi__errpuc("illegal code in raster", "Corrupt GIF");
+         }
+      }
+   }
+}
+
+static void stbi__fill_gif_background(stbi__gif *g, int x0, int y0, int x1, int y1)
+{
+   // x0/x1 and y0/y1 are byte offsets into g->out: columns advance 4 bytes per
+   // pixel, rows advance 4 * g->w bytes. Every covered pixel receives the
+   // g->pal[g->bgindex] entry with its first three bytes in reverse order and
+   // a zero fourth byte.
+   int row, col;
+   const stbi_uc *bg = g->pal[g->bgindex];
+   for (row = y0; row < y1; row += 4 * g->w)
+      for (col = x0; col < x1; col += 4) {
+         stbi_uc *px = g->out + row + col;
+         px[0] = bg[2]; px[1] = bg[1]; px[2] = bg[0]; px[3] = 0;
+      }
+}
+
+// this function is designed to support animated gifs, although stb_image doesn't support it
+static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp)
+{
+   int i;
+   stbi_uc *prev_out = 0;
+   // Returns the decoded frame buffer, NULL on error, or (stbi_uc *) s when the stream termination code is read.
+   if (g->out == 0 && !stbi__gif_header(s, g, comp,0))
+      return 0; // stbi__g_failure_reason set by stbi__gif_header
+
+   if (!stbi__mad3sizes_valid(g->w, g->h, 4, 0))
+      return stbi__errpuc("too large", "GIF too large");
+
+   prev_out = g->out; // kept so the disposal cases below can copy from it
+   g->out = (stbi_uc *) stbi__malloc_mad3(4, g->w, g->h, 0);
+   if (g->out == 0) return stbi__errpuc("outofmem", "Out of memory");
+
+   switch ((g->eflags & 0x1C) >> 2) { // three-bit field of the last graphic control extension flags
+      case 0: // unspecified (also always used on 1st frame)
+         stbi__fill_gif_background(g, 0, 0, 4 * g->w, 4 * g->w * g->h);
+         break;
+      case 1: // do not dispose
+         if (prev_out) memcpy(g->out, prev_out, 4 * g->w * g->h);
+         g->old_out = prev_out;
+         break;
+      case 2: // dispose to background
+         if (prev_out) memcpy(g->out, prev_out, 4 * g->w * g->h);
+         stbi__fill_gif_background(g, g->start_x, g->start_y, g->max_x, g->max_y);
+         break;
+      case 3: // dispose to previous
+         if (g->old_out) {
+            for (i = g->start_y; i < g->max_y; i += 4 * g->w)
+               memcpy(&g->out[i + g->start_x], &g->old_out[i + g->start_x], g->max_x - g->start_x);
+         }
+         break;
+   }
+
+   for (;;) {
+      switch (stbi__get8(s)) {
+         case 0x2C: /* Image Descriptor */
+         {
+            int prev_trans = -1;
+            stbi__int32 x, y, w, h;
+            stbi_uc *o;
+
+            x = stbi__get16le(s);
+            y = stbi__get16le(s);
+            w = stbi__get16le(s);
+            h = stbi__get16le(s);
+            if (((x + w) > (g->w)) || ((y + h) > (g->h))) // the frame must fit inside the canvas
+               return stbi__errpuc("bad Image Descriptor", "Corrupt GIF");
+
+            g->line_size = g->w * 4;
+            g->start_x = x * 4; // from here on positions are byte offsets into g->out
+            g->start_y = y * g->line_size;
+            g->max_x   = g->start_x + w * 4;
+            g->max_y   = g->start_y + h * g->line_size;
+            g->cur_x   = g->start_x;
+            g->cur_y   = g->start_y;
+
+            g->lflags = stbi__get8(s);
+
+            if (g->lflags & 0x40) {
+               g->step = 8 * g->line_size; // first interlaced spacing
+               g->parse = 3;
+            } else {
+               g->step = g->line_size;
+               g->parse = 0;
+            }
+
+            if (g->lflags & 0x80) { // this image brings its own color table
+               stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1);
+               g->color_table = (stbi_uc *) g->lpal;
+            } else if (g->flags & 0x80) { // otherwise use the table read by stbi__gif_header
+               if (g->transparent >= 0 && (g->eflags & 0x01)) {
+                  prev_trans = g->pal[g->transparent][3]; // remembered so it can be restored after decoding
+                  g->pal[g->transparent][3] = 0;
+               }
+               g->color_table = (stbi_uc *) g->pal;
+            } else
+               return stbi__errpuc("missing color table", "Corrupt GIF");
+
+            o = stbi__process_gif_raster(s, g);
+            if (o == NULL) return NULL;
+
+            if (prev_trans != -1)
+               g->pal[g->transparent][3] = (stbi_uc) prev_trans;
+
+            return o;
+         }
+
+         case 0x21: // Extension block.
+         {
+            int len;
+            if (stbi__get8(s) == 0xF9) { // Graphic Control Extension.
+               len = stbi__get8(s);
+               if (len == 4) {
+                  g->eflags = stbi__get8(s);
+                  g->delay = stbi__get16le(s);
+                  g->transparent = stbi__get8(s);
+               } else {
+                  stbi__skip(s, len);
+                  break;
+               }
+            }
+            while ((len = stbi__get8(s)) != 0) // skip the remaining sub-blocks up to the zero-length one
+               stbi__skip(s, len);
+            break;
+         }
+
+         case 0x3B: // gif stream termination code
+            return (stbi_uc *) s; // using '1' causes warning on some compilers
+
+         default:
+            return stbi__errpuc("unknown code", "Corrupt GIF");
+      }
+   }
+
+   STBI_NOTUSED(req_comp);
+}
+
+static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{  // Decode one GIF frame: stores the canvas size in *x / *y and returns the pixels, or NULL on failure.
+   stbi_uc *u = 0;
+   stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif));
+   STBI_NOTUSED(ri);
+   if (!g) return stbi__errpuc("outofmem", "Out of memory"); // the memset below must not see NULL
+   memset(g, 0, sizeof(*g));
+   u = stbi__gif_load_next(s, g, comp, req_comp);
+   if (u == (stbi_uc *) s) u = 0;  // end of animated gif marker
+   if (u) {
+      *x = g->w;
+      *y = g->h;
+      if (req_comp && req_comp != 4) // the decoder always produces 4 components
+         u = stbi__convert_format(u, 4, req_comp, g->w, g->h);
+   }
+   else if (g->out)
+      STBI_FREE(g->out); // no frame is returned, so release the buffer stbi__gif_load_next allocated
+   STBI_FREE(g);
+   return u;
+}
+
+static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp)
+{  // forwarding wrapper: all the work happens in stbi__gif_info_raw
+   return stbi__gif_info_raw(s,x,y,comp);
+}
+#endif
+
+// *************************************************************************************************
+// Radiance RGBE HDR loader
+// originally by Nicolas Schulz
+#ifndef STBI_NO_HDR
+static int stbi__hdr_test_core(stbi__context *s, const char *signature)
+{
+   // Returns 0 at the first byte that differs from signature; on a full match calls stbi__rewind and returns 1.
+   const char *p;
+   for (p = signature; *p != '\0'; ++p)
+      if (stbi__get8(s) != *p) return 0;
+   stbi__rewind(s);
+   return 1;
+}
+
+static int stbi__hdr_test(stbi__context* s)
+{
+   // Either header line identifies an HDR file; stbi__rewind is called after each probe.
+   int found = stbi__hdr_test_core(s, "#?RADIANCE\n");
+   stbi__rewind(s);
+   if (found) return found;
+   found = stbi__hdr_test_core(s, "#?RGBE\n");
+   stbi__rewind(s);
+   return found;
+}
+
+#define STBI__HDR_BUFLEN  1024
+static char *stbi__hdr_gettoken(stbi__context *z, char *buffer)
+{
+   int len=0;
+   char c = '\0';
+   // Read one line (without its '\n') into buffer, which must hold STBI__HDR_BUFLEN bytes; returns buffer.
+   c = (char) stbi__get8(z);
+
+   while (!stbi__at_eof(z) && c != '\n') {
+      buffer[len++] = c;
+      if (len == STBI__HDR_BUFLEN-1) { // buffer full: keep what fits and discard the rest of the line
+         // flush to end of line
+         while (!stbi__at_eof(z) && stbi__get8(z) != '\n')
+            ;
+         break;
+      }
+      c = (char) stbi__get8(z);
+   }
+
+   buffer[len] = 0; // NUL-terminate; an empty line yields an empty string
+   return buffer;
+}
+
+static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp)
+{
+   // Expand one 4-byte RGBE pixel (input[3] is the shared exponent byte) into
+   // req_comp floats: 1-2 components average the three channels, 3-4 keep them
+   // separate, and the extra component of the 2- and 4-component forms is set to 1.
+   float scale;
+   if (input[3] == 0) {
+      // zero exponent byte: every color component is 0
+      if (req_comp == 4) output[3] = 1;
+      if (req_comp == 3 || req_comp == 4) output[0] = output[1] = output[2] = 0;
+      if (req_comp == 2) output[1] = 1;
+      if (req_comp == 1 || req_comp == 2) output[0] = 0;
+      return;
+   }
+
+   scale = (float) ldexp(1.0f, input[3] - (int)(128 + 8));
+   if (req_comp <= 2) {
+      output[0] = (input[0] + input[1] + input[2]) * scale / 3;
+   } else {
+      output[0] = input[0] * scale;
+      output[1] = input[1] * scale;
+      output[2] = input[2] * scale;
+   }
+   if (req_comp == 2) output[1] = 1;
+   if (req_comp == 4) output[3] = 1;
+}
+
+static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int width, height;
+   stbi_uc *scanline;
+   float *hdr_data;
+   int len;
+   unsigned char count, value;
+   int i, j, k, c1,c2, z;
+   const char *headerToken;
+   STBI_NOTUSED(ri);
+   // Decode a Radiance HDR image into width*height*req_comp floats (req_comp 0 means 3); returns NULL on error.
+   // Check identifier
+   headerToken = stbi__hdr_gettoken(s,buffer);
+   if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0)
+      return stbi__errpf("not HDR", "Corrupt HDR image");
+
+   // Parse header
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break; // an empty line ends the header
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
+
+   if (!valid)    return stbi__errpf("unsupported format", "Unsupported HDR format");
+
+   // Parse width and height
+   // can't use sscanf() if we're not using stdio!
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
+
+   *x = width;
+   *y = height;
+
+   if (comp) *comp = 3;
+   if (req_comp == 0) req_comp = 3;
+
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
+   // Read data
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
+
+   // Load image data
+   // image data is stored as some number of scanlines
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           main_decode_loop:
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
+
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp);
+            i = 1;
+            j = 0;
+            STBI_FREE(scanline);
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         len <<= 8;
+         len |= stbi__get8(s); // 16-bit scanline length, high byte first
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
+
+         for (k = 0; k < 4; ++k) { // the four byte planes of a scanline are stored one after another
+            int nleft;
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump; a zero count makes no progress (stbi__get8 yields 0 at end of data), so reject it instead of looping forever
+                  if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      if (scanline)
+         STBI_FREE(scanline);
+   }
+
+   return hdr_data;
+}
+
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int dummy;
+   // Report the HDR dimensions and component count (always 3). Returns 1, or 0 after calling stbi__rewind on failure.
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (stbi__hdr_test(s) == 0) {
+       stbi__rewind( s );
+       return 0;
+   }
+
+   for(;;) { // scan the header lines up to the first empty one
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
+
+   if (!valid) {
+       stbi__rewind( s );
+       return 0;
+   }
+   token = stbi__hdr_gettoken(s,buffer); // the size line, which must read "-Y <height> +X <width>"
+   if (strncmp(token, "-Y ", 3)) {
+       stbi__rewind( s );
+       return 0;
+   }
+   token += 3;
+   *y = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3)) {
+       stbi__rewind( s );
+       return 0;
+   }
+   token += 3;
+   *x = (int) strtol(token, NULL, 10);
+   *comp = 3;
+   return 1;
+}
+#endif // STBI_NO_HDR
+
+#ifndef STBI_NO_BMP
+static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   // Parse only the BMP header, call stbi__rewind, and report the dimensions
+   // recorded in the context; *comp is 4 when info.ma is nonzero, else 3.
+   stbi__bmp_data info;
+   void *header;
+   info.all_a = 255;
+   header = stbi__bmp_parse_header(s, &info);
+   stbi__rewind( s );
+   if (!header) return 0;
+   if (x != NULL) *x = s->img_x;
+   if (y != NULL) *y = s->img_y;
+   if (comp != NULL) *comp = (info.ma != 0) ? 4 : 3;
+   return 1;
+}
+#endif
+
+#ifndef STBI_NO_PSD
+static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   // Report PSD dimensions from the file header without decoding pixel data.
+   // NULL x/y/comp are allowed. Returns 1 on success; returns 0 after calling
+   // stbi__rewind when any header field is rejected.
+   int channelCount, depth, dummy;
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+   if (stbi__get32be(s) != 0x38425053) {   // the ASCII bytes "8BPS"
+       stbi__rewind( s );
+       return 0;
+   }
+   if (stbi__get16be(s) != 1) {            // only this value of the next 16-bit field is accepted
+       stbi__rewind( s );
+       return 0;
+   }
+   stbi__skip(s, 6);
+   channelCount = stbi__get16be(s);
+   if (channelCount < 0 || channelCount > 16) {
+       stbi__rewind( s );
+       return 0;
+   }
+   *y = stbi__get32be(s);
+   *x = stbi__get32be(s);
+   depth = stbi__get16be(s); // 16 is accepted too, matching the 16-bit path of the decoder
+   if ((depth != 8 && depth != 16) || stbi__get16be(s) != 3) { // the field after the depth must be 3
+       stbi__rewind( s );
+       return 0;
+   }
+   *comp = 4;
+   return 1;
+}
+#endif
+
+#ifndef STBI_NO_PIC
+static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int act_comp=0,num_packets=0,chained,dummy;
+   stbi__pic_packet packets[10];
+   // Report PIC size and channel count (3 or 4) from the header and packet list. Returns 1, or 0 after stbi__rewind.
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) {
+      stbi__rewind(s);
+      return 0;
+   }
+
+   stbi__skip(s, 88); // with the 4 magic bytes this lands on the dimensions at offset 92
+
+   *x = stbi__get16be(s);
+   *y = stbi__get16be(s);
+   if (stbi__at_eof(s)) {
+      stbi__rewind( s);
+      return 0;
+   }
+   if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) { // reject sizes whose pixel count exceeds 1 << 28
+      stbi__rewind( s );
+      return 0;
+   }
+
+   stbi__skip(s, 8);
+
+   do {
+      stbi__pic_packet *packet;
+      if (num_packets==sizeof(packets)/sizeof(packets[0])) {
+         stbi__rewind( s ); // rewind here too, as every other failure path does
+         return 0;
+      }
+      packet = &packets[num_packets++];
+      chained = stbi__get8(s); // nonzero means another packet header follows
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s)) {
+          stbi__rewind( s );
+          return 0;
+      }
+      if (packet->size != 8) {
+          stbi__rewind( s );
+          return 0;
+      }
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // mask bit 0x10 selects the fourth channel
+
+   return 1;
+}
+#endif
+
+// *************************************************************************************************
+// Portable Gray Map and Portable Pixel Map loader
+// by Ken Miller
+//
+// PGM: http://netpbm.sourceforge.net/doc/pgm.html
+// PPM: http://netpbm.sourceforge.net/doc/ppm.html
+//
+// Known limitations:
+//    (comments in the header section are supported as of 2.09)
+//    Does not support ASCII image data (formats P2 and P3)
+//    Does not support 16-bit-per-channel
+
+#ifndef STBI_NO_PNM
+
+static int      stbi__pnm_test(stbi__context *s)
+{
+   // Accept the two-byte identifiers "P5" and "P6". stbi__rewind is called
+   // only when the identifier is rejected.
+   char magic = (char) stbi__get8(s);
+   char kind  = (char) stbi__get8(s);
+   if (magic == 'P' && (kind == '5' || kind == '6'))
+      return 1;
+   stbi__rewind( s );
+   return 0;
+}
+
+static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   // Load a binary PGM/PPM: stbi__pnm_info fills the context with the header
+   // values, then the raw samples are read in one call and, when req_comp
+   // asks for a different channel count, converted. Returns NULL on failure.
+   stbi_uc *pixels;
+   STBI_NOTUSED(ri);
+   if (stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n) == 0)
+      return 0;
+
+   *x = s->img_x;
+   *y = s->img_y;
+   if (comp != NULL) *comp = s->img_n;
+
+   if (!stbi__mad3sizes_valid(s->img_n, s->img_x, s->img_y, 0))
+      return stbi__errpuc("too large", "PNM too large");
+
+   pixels = (stbi_uc *) stbi__malloc_mad3(s->img_n, s->img_x, s->img_y, 0);
+   if (pixels == NULL) return stbi__errpuc("outofmem", "Out of memory");
+   stbi__getn(s, pixels, s->img_n * s->img_x * s->img_y);
+
+   // stbi__convert_format frees its input on failure, so its result is returned as-is
+   if (req_comp == 0 || req_comp == s->img_n) return pixels;
+   return stbi__convert_format(pixels, s->img_n, req_comp, s->img_x, s->img_y);
+}
+
+static int      stbi__pnm_isspace(char c)
+{  // true for space, tab, newline, vertical tab, form feed and carriage return
+   return c != '\0' && strchr(" \t\n\v\f\r", c) != NULL;
+}
+
+static void     stbi__pnm_skip_whitespace(stbi__context *s, char *c)
+{
+   // *c is the look-ahead character. Advance it past whitespace runs and '#' comments
+   // (which extend to the next '\n' or '\r'); stop at any other character or at end of stream.
+   for (;;) {
+      while (!stbi__at_eof(s) && stbi__pnm_isspace(*c))
+         *c = (char) stbi__get8(s);
+      if (stbi__at_eof(s) || *c != '#')
+         return;
+      while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' )
+         *c = (char) stbi__get8(s);
+   }
+}
+
+static int      stbi__pnm_isdigit(char c)
+{  // true for the characters '0' through '9'; one unsigned comparison covers both bounds
+   return (unsigned) (c - '0') < 10u;
+}
+
+static int      stbi__pnm_getinteger(stbi__context *s, char *c)
+{
+   // Accumulate decimal digits starting with the look-ahead *c; on return *c
+   // holds the first character that was not consumed. No digits yields 0.
+   int total = 0;
+   for (;;) {
+      if (stbi__at_eof(s) || !stbi__pnm_isdigit(*c)) return total;
+      total = total*10 + (*c - '0');
+      *c = (char) stbi__get8(s);
+   }
+}
+
+static int      stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   int maxv, dummy;
+   char c, p, t;
+   // Parse a binary PGM/PPM header from the start of the stream. Returns 1 with the stream just past the header, or 0 after stbi__rewind.
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   stbi__rewind(s);
+
+   // Get identifier
+   p = (char) stbi__get8(s);
+   t = (char) stbi__get8(s);
+   if (p != 'P' || (t != '5' && t != '6')) {
+       stbi__rewind(s);
+       return 0;
+   }
+
+   *comp = (t == '6') ? 3 : 1;  // '5' is 1-component .pgm; '6' is 3-component .ppm
+
+   c = (char) stbi__get8(s); // c is the one-character look-ahead shared by the helpers below
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *x = stbi__pnm_getinteger(s, &c); // read width
+   stbi__pnm_skip_whitespace(s, &c);
+
+   *y = stbi__pnm_getinteger(s, &c); // read height
+   stbi__pnm_skip_whitespace(s, &c);
+
+   maxv = stbi__pnm_getinteger(s, &c);  // read max value
+   if (maxv > 255) {
+      stbi__rewind(s); // rewind as the identifier failure path does, so the next format probe starts at the beginning
+      return stbi__err("max value > 255", "PPM image not 8-bit");
+   }
+   return 1;
+}
+#endif
+
+static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp)
+{  // Try each compiled-in format's info routine in turn; the first one that returns nonzero wins.
+   #ifndef STBI_NO_JPEG
+   if (stbi__jpeg_info(s, x, y, comp)) return 1;
+   #endif
+
+   #ifndef STBI_NO_PNG
+   if (stbi__png_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_GIF
+   if (stbi__gif_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_BMP
+   if (stbi__bmp_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PSD
+   if (stbi__psd_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PIC
+   if (stbi__pic_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_PNM
+   if (stbi__pnm_info(s, x, y, comp))  return 1;
+   #endif
+
+   #ifndef STBI_NO_HDR
+   if (stbi__hdr_info(s, x, y, comp))  return 1;
+   #endif
+
+   // test tga last because it's a crappy test!
+   #ifndef STBI_NO_TGA
+   if (stbi__tga_info(s, x, y, comp))
+       return 1;
+   #endif
+   return stbi__err("unknown image type", "Image not of any known type, or corrupt"); // no format recognized the data
+}
+
+#ifndef STBI_NO_STDIO
+STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp)
+{   // open filename for binary reading, query it with stbi_info_from_file, close it again
+    int found;
+    FILE *fp = stbi__fopen(filename, "rb");
+    if (fp == NULL) return stbi__err("can't fopen", "Unable to open file");
+    found = stbi_info_from_file(fp, x, y, comp);
+    fclose(fp);
+    return found;
+}
+
+STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp)
+{  // query the image at f's current offset, then seek back to that offset
+   stbi__context ctx;
+   int found;
+   long start = ftell(f);
+   stbi__start_file(&ctx, f);
+   found = stbi__info_main(&ctx,x,y,comp);
+   fseek(f,start,SEEK_SET);
+   return found;
+}
+#endif // !STBI_NO_STDIO
+
+STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp)
+{  // wrap the len-byte buffer in a temporary context and run the format probes on it
+   stbi__context ctx;
+   stbi__start_mem(&ctx,buffer,len);
+   return stbi__info_main(&ctx,x,y,comp);
+}
+
+STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp)
+{  // wrap the caller's I/O callbacks in a temporary context and run the format probes on it
+   stbi__context ctx;
+   stbi__start_callbacks(&ctx, (stbi_io_callbacks *) c, user);
+   return stbi__info_main(&ctx,x,y,comp);
+}
+
+#endif // STB_IMAGE_IMPLEMENTATION
+
+/*
+   revision history:
+      2.15  (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode;
+                         warning fixes; disable run-time SSE detection on gcc;
+                         uniform handling of optional "return" values;
+                         thread-safe initialization of zlib tables
+      2.14  (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs
+      2.13  (2016-11-29) add 16-bit API, only supported for PNG right now
+      2.12  (2016-04-02) fix typo in 2.11 PSD fix that caused crashes
+      2.11  (2016-04-02) allocate large structures on the stack
+                         remove white matting for transparent PSD
+                         fix reported channel count for PNG & BMP
+                         re-enable SSE2 in non-gcc 64-bit
+                         support RGB-formatted JPEG
+                         read 16-bit PNGs (only as 8-bit)
+      2.10  (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED
+      2.09  (2016-01-16) allow comments in PNM files
+                         16-bit-per-pixel TGA (not bit-per-component)
+                         info() for TGA could break due to .hdr handling
+                         info() for BMP now shares code instead of sloppy parse
+                         can use STBI_REALLOC_SIZED if allocator doesn't support realloc
+                         code cleanup
+      2.08  (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA
+      2.07  (2015-09-13) fix compiler warnings
+                         partial animated GIF support
+                         limited 16-bpc PSD support
+                         #ifdef unused functions
+                         bug with < 92 byte PIC,PNM,HDR,TGA
+      2.06  (2015-04-19) fix bug where PSD returns wrong '*comp' value
+      2.05  (2015-04-19) fix bug in progressive JPEG handling, fix warning
+      2.04  (2015-04-15) try to re-enable SIMD on MinGW 64-bit
+      2.03  (2015-04-12) extra corruption checking (mmozeiko)
+                         stbi_set_flip_vertically_on_load (nguillemot)
+                         fix NEON support; fix mingw support
+      2.02  (2015-01-19) fix incorrect assert, fix warning
+      2.01  (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2
+      2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG
+      2.00  (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg)
+                         progressive JPEG (stb)
+                         PGM/PPM support (Ken Miller)
+                         STBI_MALLOC,STBI_REALLOC,STBI_FREE
+                         GIF bugfix -- seemingly never worked
+                         STBI_NO_*, STBI_ONLY_*
+      1.48  (2014-12-14) fix incorrectly-named assert()
+      1.47  (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb)
+                         optimize PNG (ryg)
+                         fix bug in interlaced PNG with user-specified channel count (stb)
+      1.46  (2014-08-26)
+              fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG
+      1.45  (2014-08-16)
+              fix MSVC-ARM internal compiler error by wrapping malloc
+      1.44  (2014-08-07)
+              various warning fixes from Ronny Chevalier
+      1.43  (2014-07-15)
+              fix MSVC-only compiler problem in code changed in 1.42
+      1.42  (2014-07-09)
+              don't define _CRT_SECURE_NO_WARNINGS (affects user code)
+              fixes to stbi__cleanup_jpeg path
+              added STBI_ASSERT to avoid requiring assert.h
+      1.41  (2014-06-25)
+              fix search&replace from 1.36 that messed up comments/error messages
+      1.40  (2014-06-22)
+              fix gcc struct-initialization warning
+      1.39  (2014-06-15)
+              fix to TGA optimization when req_comp != number of components in TGA;
+              fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+              add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.e. Janez (U+017D)emva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!)
+      1.08    Thatcher Ulrich's PSD code integrated by Nicolas Schulz
+      1.07    attempt to fix C++ warning/errors again
+      1.06    attempt to fix C++ warning/errors again
+      1.05    fix TGA loading to return correct *comp and use good luminance calc
+      1.04    default float alpha is 1, not 255; use 'void *' for stbi_image_free
+      1.03    bugfixes to STBI_NO_STDIO, STBI_NO_HDR
+      1.02    support for (subset of) HDR files, float interface for preferred access to them
+      1.01    fix bug: possible bug in handling right-side up bmps... not sure
+              fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all
+      1.00    interface to zlib that skips zlib header
+      0.99    correct handling of alpha in palette
+      0.98    TGA loader by lonesock; dynamically add loaders (untested)
+      0.97    jpeg errors on too large a file; also catch another malloc failure
+      0.96    fix detection of invalid v value - particleman@mollyrocket forum
+      0.95    during header scan, seek to markers in case of padding
+      0.94    STBI_NO_STDIO to disable stdio usage; rename all #defines the same
+      0.93    handle jpegtran output; verbose errors
+      0.92    read 4,8,16,24,32-bit BMP files of several formats
+      0.91    output 24-bit Windows 3.0 BMP files
+      0.90    fix a few more warnings; bump version number to approach 1.0
+      0.61    bugfixes due to Marc LeBlanc, Christopher Lloyd
+      0.60    fix compiling as c++
+      0.59    fix warnings: merge Dave Moore's -Wall fixes
+      0.58    fix bug: zlib uncompressed mode len/nlen was wrong endian
+      0.57    fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available
+      0.56    fix bug: zlib uncompressed mode len vs. nlen
+      0.55    fix bug: restart_interval not initialized to 0
+      0.54    allow NULL for 'int *comp'
+      0.53    fix bug in png 3->4; speedup png decoding
+      0.52    png handles req_comp=3,4 directly; minor cleanup; jpeg comments
+      0.51    obey req_comp requests, 1-component jpegs return as 1-component,
+              on 'test' only check type, not whether we support this variant
+      0.50  (2006-11-19)
+              first released version
+*/
+
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/3rdparty/bimg/3rdparty/stb/stb_image_resize.h b/3rdparty/bimg/3rdparty/stb/stb_image_resize.h
new file mode 100644
index 0000000..b507e04
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/stb/stb_image_resize.h
@@ -0,0 +1,2624 @@
+/* stb_image_resize - v0.94 - public domain image resizing
+   by Jorge L Rodriguez (@VinoBS) - 2014
+   http://github.com/nothings/stb
+
+   Written with emphasis on usability, portability, and efficiency. (No
+   SIMD or threads, so it can be easily outperformed by libs that use those.)
+   Only scaling and translation is supported, no rotations or shears.
+   Easy API downsamples w/Mitchell filter, upsamples w/cubic interpolation.
+
+   COMPILING & LINKING
+      In one C/C++ file that #includes this file, do this:
+         #define STB_IMAGE_RESIZE_IMPLEMENTATION
+      before the #include. That will create the implementation in that file.
+
+   QUICKSTART
+      stbir_resize_uint8(      input_pixels , in_w , in_h , 0,
+                               output_pixels, out_w, out_h, 0, num_channels)
+      stbir_resize_float(...)
+      stbir_resize_uint8_srgb( input_pixels , in_w , in_h , 0,
+                               output_pixels, out_w, out_h, 0,
+                               num_channels , alpha_chan  , 0)
+      stbir_resize_uint8_srgb_edgemode(
+                               input_pixels , in_w , in_h , 0, 
+                               output_pixels, out_w, out_h, 0, 
+                               num_channels , alpha_chan  , 0, STBIR_EDGE_CLAMP)
+                                                            // WRAP/REFLECT/ZERO
+
+   FULL API
+      See the "header file" section of the source for API documentation.
+
+   ADDITIONAL DOCUMENTATION
+
+      SRGB & FLOATING POINT REPRESENTATION
+         The sRGB functions presume IEEE floating point. If you do not have
+         IEEE floating point, define STBIR_NON_IEEE_FLOAT. This will use
+         a slower implementation.
+
+      MEMORY ALLOCATION
+         The resize functions here perform a single memory allocation using
+         malloc. To control the memory allocation, before the #include that
+         triggers the implementation, do:
+
+            #define STBIR_MALLOC(size,context) ...
+            #define STBIR_FREE(ptr,context)   ...
+
+         Each resize function makes exactly one call to malloc/free, so to use
+         temp memory, store the temp memory in the context and return that.
+
+      ASSERT
+         Define STBIR_ASSERT(boolval) to override assert() and not use assert.h
+
+      OPTIMIZATION
+         Define STBIR_SATURATE_INT to compute clamp values in-range using
+         integer operations instead of float operations. This may be faster
+         on some platforms.
+
+      DEFAULT FILTERS
+         For functions which don't provide explicit control over what filters
+         to use, you can change the compile-time defaults with
+
+            #define STBIR_DEFAULT_FILTER_UPSAMPLE     STBIR_FILTER_something
+            #define STBIR_DEFAULT_FILTER_DOWNSAMPLE   STBIR_FILTER_something
+
+         See stbir_filter in the header-file section for the list of filters.
+
+      NEW FILTERS
+         A number of 1D filter kernels are used. For a list of
+         supported filters see the stbir_filter enum. To add a new filter,
+         write a filter function and add it to stbir__filter_info_table.
+
+      PROGRESS
+         For interactive use with slow resize operations, you can install
+         a progress-report callback:
+
+            #define STBIR_PROGRESS_REPORT(val)   some_func(val)
+
+         The parameter val is a float which goes from 0 to 1 as progress is made.
+
+         For example:
+
+            static void my_progress_report(float progress);
+            #define STBIR_PROGRESS_REPORT(val) my_progress_report(val)
+
+            #define STB_IMAGE_RESIZE_IMPLEMENTATION
+            #include "stb_image_resize.h"
+
+            static void my_progress_report(float progress)
+            {
+               printf("Progress: %f%%\n", progress*100);
+            }
+
+      MAX CHANNELS
+         If your image has more than 64 channels, define STBIR_MAX_CHANNELS
+         to the max you'll have.
+
+      ALPHA CHANNEL
+         Most of the resizing functions provide the ability to control how
+         the alpha channel of an image is processed. The important things
+         to know about this:
+
+         1. The best mathematically-behaved version of alpha to use is
+         called "premultiplied alpha", in which the other color channels
+         have had the alpha value multiplied in. If you use premultiplied
+         alpha, linear filtering (such as image resampling done by this
+         library, or performed in texture units on GPUs) does the "right
+         thing". While premultiplied alpha is standard in the movie CGI
+         industry, it is still uncommon in the videogame/real-time world.
+
+         If you linearly filter non-premultiplied alpha, strange effects
+         occur. (For example, the 50/50 average of 99% transparent bright green
+         and 1% transparent black produces 50% transparent dark green when
+         non-premultiplied, whereas premultiplied it produces 50%
+         transparent near-black. The former introduces green energy
+         that doesn't exist in the source image.)
+
+         2. Artists should not edit premultiplied-alpha images; artists
+         want non-premultiplied alpha images. Thus, art tools generally output
+         non-premultiplied alpha images.
+
+         3. You will get best results in most cases by converting images
+         to premultiplied alpha before processing them mathematically.
+
+         4. If you pass the flag STBIR_FLAG_ALPHA_PREMULTIPLIED, the
+         resizer does not do anything special for the alpha channel;
+         it is resampled identically to other channels. This produces
+         the correct results for premultiplied-alpha images, but produces
+         less-than-ideal results for non-premultiplied-alpha images.
+
+         5. If you do not pass the flag STBIR_FLAG_ALPHA_PREMULTIPLIED,
+         then the resizer weights the contribution of input pixels
+         based on their alpha values, or, equivalently, it multiplies
+         the alpha value into the color channels, resamples, then divides
+         by the resultant alpha value. Input pixels which have alpha=0 do
+         not contribute at all to output pixels unless _all_ of the input
+         pixels affecting that output pixel have alpha=0, in which case
+         the result for that pixel is the same as it would be without
+         STBIR_FLAG_ALPHA_PREMULTIPLIED. However, this is only true for
+         input images in integer formats. For input images in float format,
+         input pixels with alpha=0 have no effect, and output pixels
+         which have alpha=0 will be 0 in all channels. (For float images,
+         you can manually achieve the same result by adding a tiny epsilon
+         value to the alpha channel of every image, and then subtracting
+         or clamping it at the end.)
+
+         6. You can suppress the behavior described in #5 and make
+         all-0-alpha pixels have 0 in all channels by #defining
+         STBIR_NO_ALPHA_EPSILON.
+
+         7. You can separately control whether the alpha channel is
+         interpreted as linear or affected by the colorspace. By default
+         it is linear; you almost never want to apply the colorspace.
+         (For example, graphics hardware does not apply sRGB conversion
+         to the alpha channel.)
+
+   CONTRIBUTORS
+      Jorge L Rodriguez: Implementation
+      Sean Barrett: API design, optimizations
+      Aras Pranckevicius: bugfix
+         
+   REVISIONS
+      0.94 (2017-03-18) fixed warnings
+      0.93 (2017-03-03) fixed bug with certain combinations of heights
+      0.92 (2017-01-02) fix integer overflow on large (>2GB) images
+      0.91 (2016-04-02) fix warnings; fix handling of subpixel regions
+      0.90 (2014-09-17) first released version
+
+   LICENSE
+     See end of file for license information.
+
+   TODO
+      Don't decode all of the image data when only processing a partial tile
+      Don't use full-width decode buffers when only processing a partial tile
+      When processing wide images, break processing into tiles so data fits in L1 cache
+      Installable filters?
+      Resize that respects alpha test coverage
+         (Reference code: FloatImage::alphaTestCoverage and FloatImage::scaleAlphaToCoverage:
+         https://code.google.com/p/nvidia-texture-tools/source/browse/trunk/src/nvimage/FloatImage.cpp )
+*/
+
+#ifndef STBIR_INCLUDE_STB_IMAGE_RESIZE_H
+#define STBIR_INCLUDE_STB_IMAGE_RESIZE_H
+
+#ifdef _MSC_VER   // MSVC branch: spell the fixed-width aliases with built-in integer types, no <stdint.h>
+typedef unsigned char  stbir_uint8;
+typedef unsigned short stbir_uint16;
+typedef unsigned int   stbir_uint32;
+#else             // all other compilers: take the exact-width types from <stdint.h>
+#include <stdint.h>
+typedef uint8_t  stbir_uint8;
+typedef uint16_t stbir_uint16;
+typedef uint32_t stbir_uint32;
+#endif
+
+#ifdef STB_IMAGE_RESIZE_STATIC   // STBIRDEF is the linkage prefix used on the stbir_* prototypes below
+#define STBIRDEF static
+#else                            // not static: external linkage, with C linkage when compiled as C++
+#ifdef __cplusplus
+#define STBIRDEF extern "C"
+#else
+#define STBIRDEF extern
+#endif
+#endif
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Easy-to-use API:
+//
+//     * "input pixels" points to an array of image data with 'num_channels' channels (e.g. RGB=3, RGBA=4)
+//     * input_w is input image width (x-axis), input_h is input image height (y-axis)
+//     * stride is the offset between successive rows of image data in memory, in bytes. you can
+//       specify 0 to mean packed contiguously in memory
+//     * alpha channel is treated identically to other channels.
+//     * colorspace is linear or sRGB as specified by function name
+//     * returned result is 1 for success or 0 in case of an error.
+//       #define STBIR_ASSERT() to trigger an assert on parameter validation errors.
+//     * Memory required grows approximately linearly with input and output size, but with
+//       discontinuities at input_w == output_w and input_h == output_h.
+//     * These functions use a "default" resampling filter defined at compile time. To change the filter,
+//       you can change the compile-time defaults by #defining STBIR_DEFAULT_FILTER_UPSAMPLE
+//       and STBIR_DEFAULT_FILTER_DOWNSAMPLE, or you can use the medium-complexity API.
+
+STBIRDEF int stbir_resize_uint8(     const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels);
+
+STBIRDEF int stbir_resize_float(     const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels);
+
+
+// The following functions interpret image data as gamma-corrected sRGB. 
+// Specify STBIR_ALPHA_CHANNEL_NONE if you have no alpha channel,
+// or otherwise provide the index of the alpha channel. Flags value
+// of 0 will probably do the right thing if you're not sure what
+// the flags mean.
+
+#define STBIR_ALPHA_CHANNEL_NONE       -1
+
+// Set this flag if your texture has premultiplied alpha. Otherwise, stbir will
+// use alpha-weighted resampling (effectively premultiplying, resampling,
+// then unpremultiplying).
+#define STBIR_FLAG_ALPHA_PREMULTIPLIED    (1 << 0)
+// The specified alpha channel should be handled as gamma-corrected value even
+// when doing sRGB operations.
+#define STBIR_FLAG_ALPHA_USES_COLORSPACE  (1 << 1)
+
+STBIRDEF int stbir_resize_uint8_srgb(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels, int alpha_channel, int flags);
+
+
+typedef enum   // edge-handling choices; taken by the edge_wrap_mode / edge_mode_* parameters of the resize APIs
+{
+    STBIR_EDGE_CLAMP   = 1,   // numbering starts at 1; no enumerator has the value 0
+    STBIR_EDGE_REFLECT = 2,
+    STBIR_EDGE_WRAP    = 3,
+    STBIR_EDGE_ZERO    = 4,
+} stbir_edge;
+
+// This function adds the ability to specify how requests to sample off the edge of the image are handled.
+STBIRDEF int stbir_resize_uint8_srgb_edgemode(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                                    unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                              int num_channels, int alpha_channel, int flags,
+                                              stbir_edge edge_wrap_mode);
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Medium-complexity API
+//
+// This extends the easy-to-use API as follows:
+//
+//     * Alpha-channel can be processed separately
+//       * If alpha_channel is not STBIR_ALPHA_CHANNEL_NONE
+//         * Alpha channel will not be gamma corrected (unless flags&STBIR_FLAG_ALPHA_USES_COLORSPACE)
+//         * Filters will be weighted by alpha channel (unless flags&STBIR_FLAG_ALPHA_PREMULTIPLIED)
+//     * Filter can be selected explicitly
+//     * uint16 image type
+//     * sRGB colorspace available for all types
+//     * context parameter for passing to STBIR_MALLOC
+
+typedef enum   // resampling kernels selectable through the stbir_filter parameters of the generic and full APIs
+{
+    STBIR_FILTER_DEFAULT      = 0,  // use same filter type that easy-to-use API chooses
+    STBIR_FILTER_BOX          = 1,  // A trapezoid w/1-pixel wide ramps, same result as box for integer scale ratios
+    STBIR_FILTER_TRIANGLE     = 2,  // On upsampling, produces same results as bilinear texture filtering
+    STBIR_FILTER_CUBICBSPLINE = 3,  // The cubic b-spline (aka Mitchell-Netravali with B=1,C=0), gaussian-esque
+    STBIR_FILTER_CATMULLROM   = 4,  // An interpolating cubic spline (fallback value of STBIR_DEFAULT_FILTER_UPSAMPLE)
+    STBIR_FILTER_MITCHELL     = 5,  // Mitchell-Netravali filter with B=1/3, C=1/3 (fallback value of STBIR_DEFAULT_FILTER_DOWNSAMPLE)
+} stbir_filter;
+
+typedef enum   // colorspace selector passed as the 'space' argument of the generic and full APIs
+{
+    STBIR_COLORSPACE_LINEAR,   // implicit value 0
+    STBIR_COLORSPACE_SRGB,     // implicit value 1
+
+    STBIR_MAX_COLORSPACES,     // evaluates to 2, the number of entries above; presumably a count/bound, not a real colorspace
+} stbir_colorspace;
+
+// The following functions are all identical except for the type of the image data
+
+STBIRDEF int stbir_resize_uint8_generic( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                               unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context);
+
+STBIRDEF int stbir_resize_uint16_generic(const stbir_uint16 *input_pixels  , int input_w , int input_h , int input_stride_in_bytes,
+                                               stbir_uint16 *output_pixels , int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context);
+
+STBIRDEF int stbir_resize_float_generic( const float *input_pixels         , int input_w , int input_h , int input_stride_in_bytes,
+                                               float *output_pixels        , int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context);
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Full-complexity API
+//
+// This extends the medium API as follows:
+//
+//     * uint32 image type
+//     * not typesafe
+//     * separate filter types for each axis
+//     * separate edge modes for each axis
+//     * can specify scale explicitly for subpixel correctness
+//     * can specify image source tile using texture coordinates
+
+typedef enum   // channel storage type for the untyped (void *) full-complexity API
+{
+    STBIR_TYPE_UINT8 ,   // implicit values 0..3; stbir__type_size in the implementation must list sizes in this order
+    STBIR_TYPE_UINT16,
+    STBIR_TYPE_UINT32,
+    STBIR_TYPE_FLOAT ,
+
+    STBIR_MAX_TYPES      // evaluates to 4, the number of types above; presumably a count/bound, not a real type
+} stbir_datatype;
+
+STBIRDEF int stbir_resize(         const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context);
+
+STBIRDEF int stbir_resize_subpixel(const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context,
+                                   float x_scale, float y_scale,
+                                   float x_offset, float y_offset);
+
+STBIRDEF int stbir_resize_region(  const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context,
+                                   float s0, float t0, float s1, float t1);
+// (s0, t0) & (s1, t1) are the top-left and bottom right corner (uv addressing style: [0, 1]x[0, 1]) of a region of the input image to use.
+
+//
+//
+////   end header file   /////////////////////////////////////////////////////
+#endif // STBIR_INCLUDE_STB_IMAGE_RESIZE_H
+
+
+
+
+
+#ifdef STB_IMAGE_RESIZE_IMPLEMENTATION
+
+#ifndef STBIR_ASSERT
+#include <assert.h>
+#define STBIR_ASSERT(x) assert(x)
+#endif // STBIR_ASSERT
+
+// For memset
+#include <string.h>
+// For ceil, floor, fabs and pow
+#include <math.h>
+
+#ifndef STBIR_MALLOC
+#include <stdlib.h>
+#define STBIR_MALLOC(size,c) malloc(size)
+#define STBIR_FREE(ptr,c)    free(ptr)
+#endif // STBIR_MALLOC (the default allocators ignore the context argument c)
+
+#ifndef _MSC_VER
+#ifdef __cplusplus
+#define stbir__inline inline
+#else // plain C on a non-MSVC compiler: stbir__inline expands to nothing
+#define stbir__inline
+#endif
+#else // _MSC_VER
+#define stbir__inline __forceinline
+#endif // !_MSC_VER
+
+
+// Compile-time size check: the array length is -1 (a compile error) unless stbir_uint32 is exactly 4 bytes.
+typedef unsigned char stbir__validate_uint32[sizeof(stbir_uint32) == 4 ? 1 : -1];
+
+#ifdef _MSC_VER
+#define STBIR__NOTUSED(v)  (void)(v)
+#else // sizeof does not evaluate its operand, so v is only named, never executed
+#define STBIR__NOTUSED(v)  (void)sizeof(v)
+#endif
+
+#define STBIR__ARRAY_SIZE(a) (sizeof((a))/sizeof((a)[0]))
+
+#ifndef STBIR_DEFAULT_FILTER_UPSAMPLE
+#define STBIR_DEFAULT_FILTER_UPSAMPLE    STBIR_FILTER_CATMULLROM
+#endif
+
+#ifndef STBIR_DEFAULT_FILTER_DOWNSAMPLE
+#define STBIR_DEFAULT_FILTER_DOWNSAMPLE  STBIR_FILTER_MITCHELL
+#endif
+
+#ifndef STBIR_PROGRESS_REPORT
+#define STBIR_PROGRESS_REPORT(float_0_to_1)
+#endif // by default the progress report expands to nothing
+
+#ifndef STBIR_MAX_CHANNELS
+#define STBIR_MAX_CHANNELS 64
+#endif
+
+#if STBIR_MAX_CHANNELS > 65536
+#error "Too many channels; STBIR_MAX_CHANNELS must be no more than 65536."
+// because we store the indices in 16-bit variables
+#endif
+
+// This value is added to alpha just before premultiplication to avoid
+// zeroing out color values. It is equivalent to 2^-80. If you don't want
+// that behavior (it may interfere if you have floating point images with
+// very small alpha values) then you can define STBIR_NO_ALPHA_EPSILON to
+// disable it.
+#ifndef STBIR_ALPHA_EPSILON
+#define STBIR_ALPHA_EPSILON ((float)1 / (1 << 20) / (1 << 20) / (1 << 20) / (1 << 20))
+#endif
+
+// Same expansion as STBIR__NOTUSED above; this is the spelling the kernel and support functions below use.
+
+#ifdef _MSC_VER
+#define STBIR__UNUSED_PARAM(v)  (void)(v)
+#else
+#define STBIR__UNUSED_PARAM(v)  (void)sizeof(v)
+#endif
+
+// Bytes per channel sample for each data type; entry order must match stbir_datatype (presumably indexed by it).
+static unsigned char stbir__type_size[] = {
+    1, // STBIR_TYPE_UINT8
+    2, // STBIR_TYPE_UINT16
+    4, // STBIR_TYPE_UINT32
+    4, // STBIR_TYPE_FLOAT
+};
+
+// Kernel function centered at 0; the support function gives the filter's radius for a given scale.
+typedef float (stbir__kernel_fn)(float x, float scale);
+typedef float (stbir__support_fn)(float scale);
+
+typedef struct
+{
+    stbir__kernel_fn* kernel;   // weight at distance x from the center; NULL in table entry 0
+    stbir__support_fn* support; // radius beyond which the kernel is treated as zero
+} stbir__filter_info;
+
+// When upsampling, the contributors are which source pixels contribute.
+// When downsampling, the contributors are which destination pixels are contributed to.
+typedef struct
+{
+    int n0; // First contributing pixel
+    int n1; // Last contributing pixel (inclusive); trimming can leave n1 == n0 - 1 when every coefficient is zero
+} stbir__contributors;
+
+typedef struct
+{
+    const void* input_data;
+    int input_w;
+    int input_h;
+    int input_stride_bytes; // multiplied by the (edge-wrapped) row index in stbir__decode_scanline
+
+    void* output_data;
+    int output_w;
+    int output_h;
+    int output_stride_bytes;
+
+    float s0, t0, s1, t1; // presumably the region corners given to stbir_resize_region -- confirm against the setup code
+
+    float horizontal_shift; // Units: output pixels
+    float vertical_shift;   // Units: output pixels
+    float horizontal_scale; // a value > 1 selects upsampling (stbir__use_width_upsampling)
+    float vertical_scale;   // a value > 1 selects upsampling (stbir__use_height_upsampling)
+
+    int channels;
+    int alpha_channel;
+    stbir_uint32 flags; // tested against STBIR_FLAG_ALPHA_USES_COLORSPACE while decoding
+    stbir_datatype type;
+    stbir_filter horizontal_filter;
+    stbir_filter vertical_filter;
+    stbir_edge edge_horizontal;
+    stbir_edge edge_vertical;
+    stbir_colorspace colorspace;
+
+    stbir__contributors* horizontal_contributors;
+    float* horizontal_coefficients;
+
+    stbir__contributors* vertical_contributors;
+    float* vertical_coefficients;
+
+    int decode_buffer_pixels;
+    float* decode_buffer; // base pointer; stbir__get_decode_buffer returns the pointer whose index 0 is past the left margin
+
+    float* horizontal_buffer;
+
+    // cache these because ceil/floor are inexplicably showing up in profile
+    int horizontal_coefficient_width;
+    int vertical_coefficient_width;
+    int horizontal_filter_pixel_width;
+    int vertical_filter_pixel_width;
+    int horizontal_filter_pixel_margin; // pixels decoded on each side beyond [0, input_w) in stbir__decode_scanline
+    int vertical_filter_pixel_margin;
+    int horizontal_num_contributors;
+    int vertical_num_contributors;
+
+    int ring_buffer_length_bytes;   // The length of an individual entry in the ring buffer. The total number of ring buffers is stbir__get_filter_pixel_width(filter)
+    int ring_buffer_num_entries;    // Total number of entries in the ring buffer.
+    int ring_buffer_first_scanline;
+    int ring_buffer_last_scanline;
+    int ring_buffer_begin_index;    // first_scanline is at this index in the ring buffer
+    float* ring_buffer;
+
+    float* encode_buffer; // A temporary buffer to store floats so we don't lose precision while we do multiply-adds.
+
+    int horizontal_contributors_size; // NOTE(review): the *_size fields look like allocation sizes of the buffers above -- confirm units
+    int horizontal_coefficients_size;
+    int vertical_contributors_size;
+    int vertical_coefficients_size;
+    int decode_buffer_size;
+    int horizontal_buffer_size;
+    int ring_buffer_size;
+    int encode_buffer_size;
+} stbir__info;
+
+
+static const float stbir__max_uint8_as_float  = 255.0f;   // largest 8-bit sample value
+static const float stbir__max_uint16_as_float = 65535.0f; // largest 16-bit sample value
+static const double stbir__max_uint32_as_float = 4294967295.0; // double, despite the name: a float cannot hold 2^32-1 exactly
+
+// Returns the smaller of two ints.
+static stbir__inline int stbir__min(int a, int b)
+{
+    return (b <= a) ? b : a;
+}
+
+// Clamps x to [0, 1]; a NaN fails both comparisons and is returned unchanged.
+static stbir__inline float stbir__saturate(float x)
+{
+    float clamped = x;
+    if (x > 1)
+        clamped = 1;
+    else if (x < 0)
+        clamped = 0;
+    return clamped;
+}
+
+#ifdef STBIR_SATURATE_INT
+// Integer clamps, only compiled when STBIR_SATURATE_INT is defined.
+
+// Clamps x to [0, 255].
+static stbir__inline stbir_uint8 stbir__saturate8(int x)
+{
+    if (x < 0)
+        return 0;
+    if (x > 255)
+        return 255;
+    return (stbir_uint8) x;
+}
+
+// Clamps x to [0, 65535].
+static stbir__inline stbir_uint16 stbir__saturate16(int x)
+{
+    if (x < 0)
+        return 0;
+    if (x > 65535)
+        return 65535;
+    return (stbir_uint16) x;
+}
+#endif
+// 256-entry table mapping an sRGB-encoded byte to a linear float; read by the UINT8/sRGB path of stbir__decode_scanline.
+static float stbir__srgb_uchar_to_linear_float[256] = {
+    0.000000f, 0.000304f, 0.000607f, 0.000911f, 0.001214f, 0.001518f, 0.001821f, 0.002125f, 0.002428f, 0.002732f, 0.003035f,
+    0.003347f, 0.003677f, 0.004025f, 0.004391f, 0.004777f, 0.005182f, 0.005605f, 0.006049f, 0.006512f, 0.006995f, 0.007499f,
+    0.008023f, 0.008568f, 0.009134f, 0.009721f, 0.010330f, 0.010960f, 0.011612f, 0.012286f, 0.012983f, 0.013702f, 0.014444f,
+    0.015209f, 0.015996f, 0.016807f, 0.017642f, 0.018500f, 0.019382f, 0.020289f, 0.021219f, 0.022174f, 0.023153f, 0.024158f,
+    0.025187f, 0.026241f, 0.027321f, 0.028426f, 0.029557f, 0.030713f, 0.031896f, 0.033105f, 0.034340f, 0.035601f, 0.036889f,
+    0.038204f, 0.039546f, 0.040915f, 0.042311f, 0.043735f, 0.045186f, 0.046665f, 0.048172f, 0.049707f, 0.051269f, 0.052861f,
+    0.054480f, 0.056128f, 0.057805f, 0.059511f, 0.061246f, 0.063010f, 0.064803f, 0.066626f, 0.068478f, 0.070360f, 0.072272f,
+    0.074214f, 0.076185f, 0.078187f, 0.080220f, 0.082283f, 0.084376f, 0.086500f, 0.088656f, 0.090842f, 0.093059f, 0.095307f,
+    0.097587f, 0.099899f, 0.102242f, 0.104616f, 0.107023f, 0.109462f, 0.111932f, 0.114435f, 0.116971f, 0.119538f, 0.122139f,
+    0.124772f, 0.127438f, 0.130136f, 0.132868f, 0.135633f, 0.138432f, 0.141263f, 0.144128f, 0.147027f, 0.149960f, 0.152926f,
+    0.155926f, 0.158961f, 0.162029f, 0.165132f, 0.168269f, 0.171441f, 0.174647f, 0.177888f, 0.181164f, 0.184475f, 0.187821f,
+    0.191202f, 0.194618f, 0.198069f, 0.201556f, 0.205079f, 0.208637f, 0.212231f, 0.215861f, 0.219526f, 0.223228f, 0.226966f,
+    0.230740f, 0.234551f, 0.238398f, 0.242281f, 0.246201f, 0.250158f, 0.254152f, 0.258183f, 0.262251f, 0.266356f, 0.270498f,
+    0.274677f, 0.278894f, 0.283149f, 0.287441f, 0.291771f, 0.296138f, 0.300544f, 0.304987f, 0.309469f, 0.313989f, 0.318547f,
+    0.323143f, 0.327778f, 0.332452f, 0.337164f, 0.341914f, 0.346704f, 0.351533f, 0.356400f, 0.361307f, 0.366253f, 0.371238f,
+    0.376262f, 0.381326f, 0.386430f, 0.391573f, 0.396755f, 0.401978f, 0.407240f, 0.412543f, 0.417885f, 0.423268f, 0.428691f,
+    0.434154f, 0.439657f, 0.445201f, 0.450786f, 0.456411f, 0.462077f, 0.467784f, 0.473532f, 0.479320f, 0.485150f, 0.491021f,
+    0.496933f, 0.502887f, 0.508881f, 0.514918f, 0.520996f, 0.527115f, 0.533276f, 0.539480f, 0.545725f, 0.552011f, 0.558340f,
+    0.564712f, 0.571125f, 0.577581f, 0.584078f, 0.590619f, 0.597202f, 0.603827f, 0.610496f, 0.617207f, 0.623960f, 0.630757f,
+    0.637597f, 0.644480f, 0.651406f, 0.658375f, 0.665387f, 0.672443f, 0.679543f, 0.686685f, 0.693872f, 0.701102f, 0.708376f,
+    0.715694f, 0.723055f, 0.730461f, 0.737911f, 0.745404f, 0.752942f, 0.760525f, 0.768151f, 0.775822f, 0.783538f, 0.791298f,
+    0.799103f, 0.806952f, 0.814847f, 0.822786f, 0.830770f, 0.838799f, 0.846873f, 0.854993f, 0.863157f, 0.871367f, 0.879622f,
+    0.887923f, 0.896269f, 0.904661f, 0.913099f, 0.921582f, 0.930111f, 0.938686f, 0.947307f, 0.955974f, 0.964686f, 0.973445f,
+    0.982251f, 0.991102f, 1.0f
+};
+
+// Converts one sRGB-encoded float sample to linear: a straight segment near 0, a 2.4 power curve above it.
+static float stbir__srgb_to_linear(float f)
+{
+    return (f <= 0.04045f)
+        ? f / 12.92f
+        : (float)pow((f + 0.055f) / 1.055f, 2.4f);
+}
+
+// Converts one linear float sample to sRGB encoding: a straight segment near 0, a 1/2.4 power curve above it.
+static float stbir__linear_to_srgb(float f)
+{
+    return (f <= 0.0031308f)
+        ? f * 12.92f
+        : 1.055f * (float)pow(f, 1 / 2.4f) - 0.055f;
+}
+
+#ifndef STBIR_NON_IEEE_FLOAT
+// From https://gist.github.com/rygorous/2203834
+
+typedef union
+{
+    stbir_uint32 u; // the raw 32-bit pattern
+    float f;        // the same storage read as a float
+} stbir__FP32;
+// Lookup for stbir__linear_to_srgb_uchar: each entry packs a bias (high 16 bits) and a scale (low 16 bits).
+static const stbir_uint32 fp32_to_srgb8_tab4[104] = {
+    0x0073000d, 0x007a000d, 0x0080000d, 0x0087000d, 0x008d000d, 0x0094000d, 0x009a000d, 0x00a1000d,
+    0x00a7001a, 0x00b4001a, 0x00c1001a, 0x00ce001a, 0x00da001a, 0x00e7001a, 0x00f4001a, 0x0101001a,
+    0x010e0033, 0x01280033, 0x01410033, 0x015b0033, 0x01750033, 0x018f0033, 0x01a80033, 0x01c20033,
+    0x01dc0067, 0x020f0067, 0x02430067, 0x02760067, 0x02aa0067, 0x02dd0067, 0x03110067, 0x03440067,
+    0x037800ce, 0x03df00ce, 0x044600ce, 0x04ad00ce, 0x051400ce, 0x057b00c5, 0x05dd00bc, 0x063b00b5,
+    0x06970158, 0x07420142, 0x07e30130, 0x087b0120, 0x090b0112, 0x09940106, 0x0a1700fc, 0x0a9500f2,
+    0x0b0f01cb, 0x0bf401ae, 0x0ccb0195, 0x0d950180, 0x0e56016e, 0x0f0d015e, 0x0fbc0150, 0x10630143,
+    0x11070264, 0x1238023e, 0x1357021d, 0x14660201, 0x156601e9, 0x165a01d3, 0x174401c0, 0x182401af,
+    0x18fe0331, 0x1a9602fe, 0x1c1502d2, 0x1d7e02ad, 0x1ed4028d, 0x201a0270, 0x21520256, 0x227d0240,
+    0x239f0443, 0x25c003fe, 0x27bf03c4, 0x29a10392, 0x2b6a0367, 0x2d1d0341, 0x2ebe031f, 0x304d0300,
+    0x31d105b0, 0x34a80555, 0x37520507, 0x39d504c5, 0x3c37048b, 0x3e7c0458, 0x40a8042a, 0x42bd0401,
+    0x44c20798, 0x488e071e, 0x4c1c06b6, 0x4f76065d, 0x52a50610, 0x55ac05cc, 0x5892058f, 0x5b590559,
+    0x5e0c0a23, 0x631c0980, 0x67db08f6, 0x6c55087f, 0x70940818, 0x74a007bd, 0x787d076c, 0x7c330723,
+};
+ 
+static stbir_uint8 stbir__linear_to_srgb_uchar(float in)
+{
+    // Clamp limits stored as bit patterns: 2^-13 and the largest float below 1.
+    static const stbir__FP32 lower_limit = { (127-13) << 23 };
+    static const stbir__FP32 upper_limit = { 0x3f7fffff };
+    stbir__FP32 bits;
+    stbir_uint32 bucket, entry, base, slope, mantissa_bits;
+    // Clamp to [2^(-13), 1-eps], which map to 0 and 1. The lower test is written
+    // negatively so that a NaN fails "in > lower_limit" and lands on the lower limit.
+    bits.f = in;
+    if (!(in > lower_limit.f))
+        bits.f = lower_limit.f;
+    if (bits.f > upper_limit.f)
+        bits.f = upper_limit.f;
+
+    // Table lookup on the bit pattern above the lower limit; unpack bias and scale.
+    bucket = (bits.u - lower_limit.u) >> 20;
+    entry = fp32_to_srgb8_tab4[bucket];
+    base = (entry >> 16) << 9;
+    slope = entry & 0xffff;
+
+    // The next-highest mantissa bits drive the linear interpolation.
+    mantissa_bits = (bits.u >> 12) & 0xff;
+    return (unsigned char) ((base + slope*mantissa_bits) >> 16);
+}
+
+#else
+// sRGB transition values, scaled by 1<<28; the fallback stbir__linear_to_srgb_uchar searches for the last entry <= its scaled input
+static int stbir__srgb_offset_to_linear_scaled[256] =
+{
+            0,     40738,    122216,    203693,    285170,    366648,    448125,    529603,
+       611080,    692557,    774035,    855852,    942009,   1033024,   1128971,   1229926,
+      1335959,   1447142,   1563542,   1685229,   1812268,   1944725,   2082664,   2226148,
+      2375238,   2529996,   2690481,   2856753,   3028870,   3206888,   3390865,   3580856,
+      3776916,   3979100,   4187460,   4402049,   4622919,   4850123,   5083710,   5323731,
+      5570236,   5823273,   6082892,   6349140,   6622065,   6901714,   7188133,   7481369,
+      7781466,   8088471,   8402427,   8723380,   9051372,   9386448,   9728650,  10078021,
+     10434603,  10798439,  11169569,  11548036,  11933879,  12327139,  12727857,  13136073,
+     13551826,  13975156,  14406100,  14844697,  15290987,  15745007,  16206795,  16676389,
+     17153826,  17639142,  18132374,  18633560,  19142734,  19659934,  20185196,  20718552,
+     21260042,  21809696,  22367554,  22933648,  23508010,  24090680,  24681686,  25281066,
+     25888850,  26505076,  27129772,  27762974,  28404716,  29055026,  29713942,  30381490,
+     31057708,  31742624,  32436272,  33138682,  33849884,  34569912,  35298800,  36036568,
+     36783260,  37538896,  38303512,  39077136,  39859796,  40651528,  41452360,  42262316,
+     43081432,  43909732,  44747252,  45594016,  46450052,  47315392,  48190064,  49074096,
+     49967516,  50870356,  51782636,  52704392,  53635648,  54576432,  55526772,  56486700,
+     57456236,  58435408,  59424248,  60422780,  61431036,  62449032,  63476804,  64514376,
+     65561776,  66619028,  67686160,  68763192,  69850160,  70947088,  72053992,  73170912,
+     74297864,  75434880,  76581976,  77739184,  78906536,  80084040,  81271736,  82469648,
+     83677792,  84896192,  86124888,  87363888,  88613232,  89872928,  91143016,  92423512,
+     93714432,  95015816,  96327688,  97650056,  98982952, 100326408, 101680440, 103045072,
+    104420320, 105806224, 107202800, 108610064, 110028048, 111456776, 112896264, 114346544,
+    115807632, 117279552, 118762328, 120255976, 121760536, 123276016, 124802440, 126339832,
+    127888216, 129447616, 131018048, 132599544, 134192112, 135795792, 137410592, 139036528,
+    140673648, 142321952, 143981456, 145652208, 147334208, 149027488, 150732064, 152447968,
+    154175200, 155913792, 157663776, 159425168, 161197984, 162982240, 164777968, 166585184,
+    168403904, 170234160, 172075968, 173929344, 175794320, 177670896, 179559120, 181458992,
+    183370528, 185293776, 187228736, 189175424, 191133888, 193104112, 195086128, 197079968,
+    199085648, 201103184, 203132592, 205173888, 207227120, 209292272, 211369392, 213458480,
+    215559568, 217672656, 219797792, 221934976, 224084240, 226245600, 228419056, 230604656,
+    232802400, 235012320, 237234432, 239468736, 241715280, 243974080, 246245120, 248528464,
+    250824112, 253132064, 255452368, 257785040, 260130080, 262487520, 264857376, 267239664,
+};
+
+// Table-search variant, compiled when STBIR_NON_IEEE_FLOAT is defined.
+static stbir_uint8 stbir__linear_to_srgb_uchar(float f)
+{
+    int x = (int) (f * (1 << 28)); // has headroom so you don't need to clamp
+    int v = 0;
+    int step;
+
+    // Binary search over the 256 thresholds: try steps 128, 64, ..., 1 and
+    // keep every step whose threshold is still <= x.
+    for (step = 128; step >= 1; step /= 2)
+    {
+        int i = v + step;
+        if (x >= stbir__srgb_offset_to_linear_scaled[i])
+            v = i;
+    }
+
+    return (stbir_uint8) v;
+}
+#endif
+
+// Trapezoid kernel: 1 on the flat top, 0 at or beyond the support, a linear ramp in between.
+static float stbir__filter_trapezoid(float x, float scale)
+{
+    float halfscale = scale / 2;
+    float outer = 0.5f + halfscale; // distance at which the ramp reaches 0
+    float inner = 0.5f - halfscale; // distance up to which the weight stays 1
+    float dist;
+
+    STBIR_ASSERT(scale <= 1);
+
+    dist = (float)fabs(x);
+
+    if (dist >= outer)
+        return 0;
+    if (dist <= inner)
+        return 1;
+
+    return (outer - dist) / scale;
+}
+
+static float stbir__support_trapezoid(float scale) // radius of the trapezoid kernel
+{
+    float half_scale = scale / 2;
+    STBIR_ASSERT(scale <= 1);
+    return 0.5f + half_scale;
+}
+
+// Triangle (tent) kernel: 1 - |x| for |x| <= 1, zero otherwise. The scale is not used.
+static float stbir__filter_triangle(float x, float s)
+{
+    float dist = (float)fabs(x);
+
+    STBIR__UNUSED_PARAM(s);
+
+    if (dist <= 1.0f)
+        return 1 - dist;
+    return 0;
+}
+
+// Cubic B-spline kernel: one polynomial for |x| < 1, another for |x| < 2, zero beyond.
+static float stbir__filter_cubic(float x, float s)
+{
+    float ax = (float)fabs(x);
+
+    STBIR__UNUSED_PARAM(s);
+
+    if (ax < 1.0f)
+        return (4 + ax*ax*(3*ax - 6))/6;
+    if (ax < 2.0f)
+        return (8 + ax*(-12 + ax*(6 - ax)))/6;
+    return 0.0f;
+}
+
+// Catmull-Rom kernel: one polynomial for |x| < 1, another for |x| < 2, zero beyond.
+static float stbir__filter_catmullrom(float x, float s)
+{
+    float ax = (float)fabs(x);
+
+    STBIR__UNUSED_PARAM(s);
+
+    if (ax < 1.0f)
+        return 1 - ax*ax*(2.5f - 1.5f*ax);
+    if (ax < 2.0f)
+        return 2 - ax*(4 + ax*(0.5f*ax - 2.5f));
+    return 0.0f;
+}
+
+// Mitchell kernel: one polynomial for |x| < 1, another for |x| < 2, zero beyond.
+static float stbir__filter_mitchell(float x, float s)
+{
+    float ax = (float)fabs(x);
+
+    STBIR__UNUSED_PARAM(s);
+
+    if (ax < 1.0f)
+        return (16 + ax*ax*(21 * ax - 36))/18;
+    if (ax < 2.0f)
+        return (32 + ax*(-60 + ax*(36 - 7*ax)))/18;
+    return 0.0f;
+}
+
+static float stbir__support_zero(float s) // support radius 0, whatever the scale
+{
+    STBIR__UNUSED_PARAM(s);
+    return 0.0f;
+}
+
+static float stbir__support_one(float s) // support radius 1, whatever the scale
+{
+    STBIR__UNUSED_PARAM(s);
+    return 1.0f;
+}
+
+static float stbir__support_two(float s) // support radius 2, whatever the scale
+{
+    STBIR__UNUSED_PARAM(s);
+    return 2.0f;
+}
+
+static stbir__filter_info stbir__filter_info_table[] = { // indexed by a stbir_filter value
+        { NULL,                     stbir__support_zero }, // entry 0 has no kernel; stbir__get_filter_pixel_width asserts filter != 0
+        { stbir__filter_trapezoid,  stbir__support_trapezoid }, // support depends on the scale
+        { stbir__filter_triangle,   stbir__support_one },
+        { stbir__filter_cubic,      stbir__support_two },
+        { stbir__filter_catmullrom, stbir__support_two },
+        { stbir__filter_mitchell,   stbir__support_two },
+};
+
+stbir__inline static int stbir__use_upsampling(float ratio)
+{
+    return (ratio > 1) ? 1 : 0; // a ratio of exactly 1 takes the downsampling path
+}
+
+stbir__inline static int stbir__use_width_upsampling(stbir__info* stbir_info)
+{   // the horizontal pass upsamples when its scale exceeds 1
+    return stbir_info->horizontal_scale > 1;
+}
+
+stbir__inline static int stbir__use_height_upsampling(stbir__info* stbir_info)
+{   // the vertical pass upsamples when its scale exceeds 1
+    return stbir_info->vertical_scale > 1;
+}
+
+// Maximum number of input samples that can affect one output sample with the given filter.
+static int stbir__get_filter_pixel_width(stbir_filter filter, float scale)
+{
+    float diameter;
+    STBIR_ASSERT(filter != 0);
+    STBIR_ASSERT(filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
+    if (stbir__use_upsampling(scale))
+        diameter = stbir__filter_info_table[filter].support(1/scale) * 2;
+    else
+        diameter = stbir__filter_info_table[filter].support(scale) * 2 / scale;
+    return (int)ceil(diameter);
+}
+
+// Per-side buffer padding for filters that seek outside the image boundaries: half the pixel width.
+static int stbir__get_filter_pixel_margin(stbir_filter filter, float scale)
+{
+    int pixel_width = stbir__get_filter_pixel_width(filter, scale);
+    return pixel_width / 2;
+}
+
+// Number of coefficient slots stored per contributor: twice the filter
+// support (evaluated at 1/scale when upsampling), rounded up.
+static int stbir__get_coefficient_width(stbir_filter filter, float scale)
+{
+    float support_scale = stbir__use_upsampling(scale) ? 1 / scale : scale;
+    return (int)ceil(stbir__filter_info_table[filter].support(support_scale) * 2);
+}
+
+// Contributor count: output_size when upsampling, otherwise input_size plus the filter margin on both sides.
+static int stbir__get_contributors(float scale, stbir_filter filter, int input_size, int output_size)
+{
+    if (!stbir__use_upsampling(scale))
+        return (input_size + stbir__get_filter_pixel_margin(filter, scale) * 2);
+    return output_size;
+}
+
+static int stbir__get_total_horizontal_coefficients(stbir__info* info)
+{
+    int per_contributor = stbir__get_coefficient_width(info->horizontal_filter, info->horizontal_scale);
+    return info->horizontal_num_contributors * per_contributor; // contributors x slots per contributor
+}
+
+static int stbir__get_total_vertical_coefficients(stbir__info* info)
+{
+    int per_contributor = stbir__get_coefficient_width(info->vertical_filter, info->vertical_scale);
+    return info->vertical_num_contributors * per_contributor; // contributors x slots per contributor
+}
+
+static stbir__contributors* stbir__get_contributor(stbir__contributors* contributors, int n)
+{
+    return contributors + n; // address of the n-th entry
+}
+
+// Address of coefficient c in contributor n's group; each group is one coefficient width long.
+// For perf reasons this indexing is duplicated in stbir__resample_horizontal_upsample/downsample; change both together.
+static float* stbir__get_coefficient(float* coefficients, stbir_filter filter, float scale, int n, int c)
+{
+    int group_width = stbir__get_coefficient_width(filter, scale);
+    return coefficients + (group_width*n + c);
+}
+
+// Maps an out-of-range coordinate n to an index in [0, max) according to the edge mode.
+static int stbir__edge_wrap_slow(stbir_edge edge, int n, int max)
+{
+    switch (edge)
+    {
+    case STBIR_EDGE_ZERO:
+        return 0; // we'll decode the wrong pixel here, and then overwrite with 0s later
+
+    case STBIR_EDGE_CLAMP:
+        if (n < 0)
+            return 0;
+        if (n >= max)
+            return max - 1;
+
+        return n; // only reached for in-range n, which stbir__edge_wrap returns itself
+
+    case STBIR_EDGE_REFLECT:
+    {
+        if (n < 0)
+        {
+            // Mirror around index 0 only while -n is still a valid index. Testing
+            // n < max here is always true for negative n and lets -n run past max - 1.
+            if (n > -max)
+                return -n;
+            else
+                return max - 1;
+        }
+
+        if (n >= max)
+        {
+            int max2 = max * 2;
+            if (n >= max2)
+                return 0;
+            else
+                return max2 - n - 1;
+        }
+
+        return n; // only reached for in-range n
+    }
+
+    case STBIR_EDGE_WRAP:
+        if (n >= 0)
+            return (n % max);
+        else
+        {
+            int m = (-n) % max;
+            if (m != 0)
+                m = max - m;
+
+            return (m);
+        }
+
+    default:
+        STBIR_ASSERT(!"Unimplemented edge type");
+        return 0;
+    }
+}
+
+// Fast path that avoids the per-pixel switch: in-range coordinates are returned as they are.
+stbir__inline static int stbir__edge_wrap(stbir_edge edge, int n, int max)
+{
+    if (n < 0 || n >= max)
+        return stbir__edge_wrap_slow(edge, n, max); // out of range: apply the edge mode
+    return n;
+}
+
+// What input pixels contribute to this output pixel?
+// Outputs the first and last such input pixel and the output pixel's center in input space.
+static void stbir__calculate_sample_range_upsample(int n, float out_filter_radius, float scale_ratio, float out_shift, int* in_first_pixel, int* in_last_pixel, float* in_center_of_out)
+{
+    float center = (float)n + 0.5f;
+    float out_lo = center - out_filter_radius; // influence bounds in output space
+    float out_hi = center + out_filter_radius;
+    float in_lo = (out_lo + out_shift) / scale_ratio; // the same bounds in input space
+    float in_hi = (out_hi + out_shift) / scale_ratio;
+
+    *in_center_of_out = (center + out_shift) / scale_ratio;
+    *in_first_pixel = (int)(floor(in_lo + 0.5));
+    *in_last_pixel = (int)(floor(in_hi - 0.5));
+}
+
+// What output pixels does this input pixel contribute to?
+// Outputs the first and last such output pixel and the input pixel's center in output space.
+static void stbir__calculate_sample_range_downsample(int n, float in_pixels_radius, float scale_ratio, float out_shift, int* out_first_pixel, int* out_last_pixel, float* out_center_of_in)
+{
+    float center = (float)n + 0.5f;
+    float in_lo = center - in_pixels_radius; // influence bounds in input space
+    float in_hi = center + in_pixels_radius;
+    float out_lo = in_lo * scale_ratio - out_shift; // the same bounds in output space
+    float out_hi = in_hi * scale_ratio - out_shift;
+
+    *out_center_of_in = center * scale_ratio - out_shift;
+    *out_first_pixel = (int)(floor(out_lo + 0.5));
+    *out_last_pixel = (int)(floor(out_hi - 0.5));
+}
+// Upsampling: fills coefficient_group with kernel weights for the input pixels in_first_pixel..in_last_pixel, scales them to sum to 1, and stores the trimmed range in contributor.
+static void stbir__calculate_coefficients_upsample(stbir_filter filter, float scale, int in_first_pixel, int in_last_pixel, float in_center_of_out, stbir__contributors* contributor, float* coefficient_group)
+{
+    int i;
+    float total_filter = 0;
+    float filter_scale;
+
+    STBIR_ASSERT(in_last_pixel - in_first_pixel <= (int)ceil(stbir__filter_info_table[filter].support(1/scale) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
+
+    contributor->n0 = in_first_pixel;
+    contributor->n1 = in_last_pixel;
+
+    STBIR_ASSERT(contributor->n1 >= contributor->n0);
+
+    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
+    {
+        float in_pixel_center = (float)(i + in_first_pixel) + 0.5f;
+        coefficient_group[i] = stbir__filter_info_table[filter].kernel(in_center_of_out - in_pixel_center, 1 / scale);
+
+        // If the coefficient is zero, skip it. (Don't do the <0 check here, we want the influence of those outside pixels.)
+        if (i == 0 && !coefficient_group[i])
+        {
+            contributor->n0 = ++in_first_pixel; // drop the leading pixel from the range
+            i--; // the loop increment brings i back to 0, so slot 0 is refilled for the next pixel
+            continue;
+        }
+
+        total_filter += coefficient_group[i];
+    }
+
+    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(in_last_pixel + 1) + 0.5f - in_center_of_out, 1/scale) == 0);
+
+    STBIR_ASSERT(total_filter > 0.9);
+    STBIR_ASSERT(total_filter < 1.1f); // Make sure it's not way off.
+
+    // Make sure the sum of all coefficients is 1.
+    filter_scale = 1 / total_filter;
+
+    for (i = 0; i <= in_last_pixel - in_first_pixel; i++)
+        coefficient_group[i] *= filter_scale;
+
+    for (i = in_last_pixel - in_first_pixel; i >= 0; i--) // walk back from the end, trimming trailing zeros
+    {
+        if (coefficient_group[i])
+            break;
+
+        // This line has no weight. We can skip it.
+        contributor->n1 = contributor->n0 + i - 1;
+    }
+}
+
+// Downsampling: fills coefficient_group with the weight this input pixel gives to each output
+// pixel in out_first_pixel..out_last_pixel and stores the trimmed range in contributor. The
+// weights are rescaled later by stbir__normalize_downsample_coefficients.
+static void stbir__calculate_coefficients_downsample(stbir_filter filter, float scale_ratio, int out_first_pixel, int out_last_pixel, float out_center_of_in, stbir__contributors* contributor, float* coefficient_group)
+{
+    int last = out_last_pixel - out_first_pixel; // index of the last coefficient in the group
+    int i;
+
+    STBIR_ASSERT(last <= (int)ceil(stbir__filter_info_table[filter].support(scale_ratio) * 2)); // Taken directly from stbir__get_coefficient_width() which we can't call because we don't know if we're horizontal or vertical.
+
+    contributor->n0 = out_first_pixel;
+    contributor->n1 = out_last_pixel;
+
+    STBIR_ASSERT(contributor->n1 >= contributor->n0);
+
+    for (i = 0; i <= last; i++)
+    {
+        float out_pixel_center = (float)(i + out_first_pixel) + 0.5f;
+        float offset = out_pixel_center - out_center_of_in;
+        coefficient_group[i] = stbir__filter_info_table[filter].kernel(offset, scale_ratio) * scale_ratio;
+    }
+
+    STBIR_ASSERT(stbir__filter_info_table[filter].kernel((float)(out_last_pixel + 1) + 0.5f - out_center_of_in, scale_ratio) == 0);
+
+    // Trailing zero weights carry nothing, so pull n1 back over them;
+    // the walk stops at the first non-zero coefficient from the end.
+    for (i = last; i >= 0 && !coefficient_group[i]; i--)
+        contributor->n1 = contributor->n0 + i - 1;
+}
+
+// Rescales the downsample coefficients so the weights feeding each output pixel sum to 1,
+// then trims leading zero and out-of-image entries from every contributor's group.
+static void stbir__normalize_downsample_coefficients(stbir__contributors* contributors, float* coefficients, stbir_filter filter, float scale_ratio, int input_size, int output_size)
+{
+    int num_contributors = stbir__get_contributors(scale_ratio, filter, input_size, output_size);
+    int num_coefficients = stbir__get_coefficient_width(filter, scale_ratio);
+    int i, j;
+    int skip;
+
+    for (i = 0; i < output_size; i++)
+    {
+        float scale;
+        float total = 0;
+
+        for (j = 0; j < num_contributors; j++)
+        {
+            if (i >= contributors[j].n0 && i <= contributors[j].n1)
+            {
+                float coefficient = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i - contributors[j].n0);
+                total += coefficient;
+            }
+            else if (i < contributors[j].n0)
+                break;
+        }
+
+        STBIR_ASSERT(total > 0.9f);
+        STBIR_ASSERT(total < 1.1f);
+
+        scale = 1 / total;
+
+        for (j = 0; j < num_contributors; j++)
+        {
+            if (i >= contributors[j].n0 && i <= contributors[j].n1)
+                *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i - contributors[j].n0) *= scale;
+            else if (i < contributors[j].n0)
+                break;
+        }
+    }
+
+    // Optimize: Skip zero coefficients and contributions outside of image bounds.
+    // Do this after normalizing because normalization depends on the n0/n1 values.
+    for (j = 0; j < num_contributors; j++)
+    {
+        int range, max;
+
+        // Bounded by num_coefficients so an all-zero group cannot send the scan past its own slots.
+        skip = 0;
+        while (skip < num_coefficients && *stbir__get_coefficient(coefficients, filter, scale_ratio, j, skip) == 0)
+            skip++;
+
+        contributors[j].n0 += skip;
+
+        while (contributors[j].n0 < 0)
+        {
+            contributors[j].n0++;
+            skip++;
+        }
+
+        range = contributors[j].n1 - contributors[j].n0 + 1;
+        max = stbir__min(num_coefficients, range);
+
+        for (i = 0; i < max; i++)
+        {
+            if (i + skip >= num_coefficients)
+                break;
+
+            *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i) = *stbir__get_coefficient(coefficients, filter, scale_ratio, j, i + skip);
+        }
+    }
+
+    // Using min to avoid writing into invalid pixels.
+    for (i = 0; i < num_contributors; i++)
+        contributors[i].n1 = stbir__min(contributors[i].n1, output_size - 1);
+}
+
+// Each scan line uses the same kernel values, so the contributor ranges and
+// coefficients are computed once here and reused for every scan line.
+static void stbir__calculate_filters(stbir__contributors* contributors, float* coefficients, stbir_filter filter, float scale_ratio, float shift, int input_size, int output_size)
+{
+    int n;
+    int total_contributors = stbir__get_contributors(scale_ratio, filter, input_size, output_size);
+
+    if (!stbir__use_upsampling(scale_ratio))
+    {
+        float in_pixels_radius = stbir__filter_info_table[filter].support(scale_ratio) / scale_ratio;
+        int margin = stbir__get_filter_pixel_margin(filter, scale_ratio);
+
+        // Looping through in pixels; contributor n describes input pixel n - margin
+        for (n = 0; n < total_contributors; n++)
+        {
+            float out_center_of_in; // Center of the current in pixel in the out pixel space
+            int out_first_pixel, out_last_pixel;
+
+            stbir__calculate_sample_range_downsample(n - margin, in_pixels_radius, scale_ratio, shift, &out_first_pixel, &out_last_pixel, &out_center_of_in);
+
+            stbir__calculate_coefficients_downsample(filter, scale_ratio, out_first_pixel, out_last_pixel, out_center_of_in, stbir__get_contributor(contributors, n), stbir__get_coefficient(coefficients, filter, scale_ratio, n, 0));
+        }
+
+        stbir__normalize_downsample_coefficients(contributors, coefficients, filter, scale_ratio, input_size, output_size);
+    }
+    else
+    {
+        float out_pixels_radius = stbir__filter_info_table[filter].support(1 / scale_ratio) * scale_ratio;
+
+        // Looping through out pixels
+        for (n = 0; n < total_contributors; n++)
+        {
+            float in_center_of_out; // Center of the current out pixel in the in pixel space
+            int in_first_pixel, in_last_pixel;
+
+            stbir__calculate_sample_range_upsample(n, out_pixels_radius, scale_ratio, shift, &in_first_pixel, &in_last_pixel, &in_center_of_out);
+
+            stbir__calculate_coefficients_upsample(filter, scale_ratio, in_first_pixel, in_last_pixel, in_center_of_out, stbir__get_contributor(contributors, n), stbir__get_coefficient(coefficients, filter, scale_ratio, n, 0));
+        }
+    }
+}
+
+static float* stbir__get_decode_buffer(stbir__info* stbir_info)
+{
+    // Index 0 of the returned pointer sits just past the left margin, so negative indexes are okay.
+    int margin_floats = stbir_info->horizontal_filter_pixel_margin * stbir_info->channels;
+    return stbir_info->decode_buffer + margin_floats;
+}
+
+#define STBIR__DECODE(type, colorspace) ((type) * (STBIR_MAX_COLORSPACES) + (colorspace))
+// Decodes input row n into the float decode buffer for x in [-margin, input_w + margin): converts samples to float, premultiplies by alpha unless STBIR_FLAG_ALPHA_PREMULTIPLIED is set, and zeroes out-of-image pixels for STBIR_EDGE_ZERO.
+static void stbir__decode_scanline(stbir__info* stbir_info, int n)
+{
+    int c;
+    int channels = stbir_info->channels;
+    int alpha_channel = stbir_info->alpha_channel;
+    int type = stbir_info->type;
+    int colorspace = stbir_info->colorspace;
+    int input_w = stbir_info->input_w;
+    size_t input_stride_bytes = stbir_info->input_stride_bytes;
+    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
+    stbir_edge edge_horizontal = stbir_info->edge_horizontal;
+    stbir_edge edge_vertical = stbir_info->edge_vertical;
+    size_t in_buffer_row_offset = stbir__edge_wrap(edge_vertical, n, stbir_info->input_h) * input_stride_bytes; // byte offset of the edge-wrapped source row
+    const void* input_data = (char *) stbir_info->input_data + in_buffer_row_offset;
+    int max_x = input_w + stbir_info->horizontal_filter_pixel_margin; // one past the last x decoded (right margin included)
+    int decode = STBIR__DECODE(type, colorspace);
+
+    int x = -stbir_info->horizontal_filter_pixel_margin; // start inside the left margin, i.e. at a negative index
+
+    // special handling for STBIR_EDGE_ZERO because it needs to return an item that doesn't appear in the input,
+    // and we want to avoid paying overhead on every pixel if not STBIR_EDGE_ZERO
+    if (edge_vertical == STBIR_EDGE_ZERO && (n < 0 || n >= stbir_info->input_h))
+    {
+        for (; x < max_x; x++)
+            for (c = 0; c < channels; c++)
+                decode_buffer[x*channels + c] = 0;
+        return;
+    }
+
+    switch (decode)
+    {
+    case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_LINEAR):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = ((float)((const unsigned char*)input_data)[input_pixel_index + c]) / stbir__max_uint8_as_float;
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_SRGB):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = stbir__srgb_uchar_to_linear_float[((const unsigned char*)input_data)[input_pixel_index + c]];
+
+            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE)) // the loop above ran alpha through the sRGB table too; overwrite it with the plain scaled value
+                decode_buffer[decode_pixel_index + alpha_channel] = ((float)((const unsigned char*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint8_as_float;
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_LINEAR):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = ((float)((const unsigned short*)input_data)[input_pixel_index + c]) / stbir__max_uint16_as_float;
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_SRGB):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear(((float)((const unsigned short*)input_data)[input_pixel_index + c]) / stbir__max_uint16_as_float);
+
+            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                decode_buffer[decode_pixel_index + alpha_channel] = ((float)((const unsigned short*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint16_as_float;
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_LINEAR):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = (float)(((double)((const unsigned int*)input_data)[input_pixel_index + c]) / stbir__max_uint32_as_float);
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_SRGB):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear((float)(((double)((const unsigned int*)input_data)[input_pixel_index + c]) / stbir__max_uint32_as_float));
+
+            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                decode_buffer[decode_pixel_index + alpha_channel] = (float)(((double)((const unsigned int*)input_data)[input_pixel_index + alpha_channel]) / stbir__max_uint32_as_float);
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_LINEAR):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = ((const float*)input_data)[input_pixel_index + c];
+        }
+        break;
+
+    case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_SRGB):
+        for (; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+            int input_pixel_index = stbir__edge_wrap(edge_horizontal, x, input_w) * channels;
+            for (c = 0; c < channels; c++)
+                decode_buffer[decode_pixel_index + c] = stbir__srgb_to_linear(((const float*)input_data)[input_pixel_index + c]);
+
+            if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                decode_buffer[decode_pixel_index + alpha_channel] = ((const float*)input_data)[input_pixel_index + alpha_channel];
+        }
+
+        break;
+
+    default:
+        STBIR_ASSERT(!"Unknown type/colorspace/channels combination.");
+        break;
+    }
+
+    if (!(stbir_info->flags & STBIR_FLAG_ALPHA_PREMULTIPLIED)) // flag not set: multiply every non-alpha channel by alpha here
+    {
+        for (x = -stbir_info->horizontal_filter_pixel_margin; x < max_x; x++)
+        {
+            int decode_pixel_index = x * channels;
+
+            // If the alpha value is 0 it will clobber the color values. Make sure it's not.
+            float alpha = decode_buffer[decode_pixel_index + alpha_channel];
+#ifndef STBIR_NO_ALPHA_EPSILON
+            if (stbir_info->type != STBIR_TYPE_FLOAT) { // the epsilon is only added for the non-float types
+                alpha += STBIR_ALPHA_EPSILON;
+                decode_buffer[decode_pixel_index + alpha_channel] = alpha;
+            }
+#endif
+            for (c = 0; c < channels; c++)
+            {
+                if (c == alpha_channel)
+                    continue;
+
+                decode_buffer[decode_pixel_index + c] *= alpha;
+            }
+        }
+    }
+
+    if (edge_horizontal == STBIR_EDGE_ZERO) // zero both margins, replacing whatever the decode loops stored there
+    {
+        for (x = -stbir_info->horizontal_filter_pixel_margin; x < 0; x++)
+        {
+            for (c = 0; c < channels; c++)
+                decode_buffer[x*channels + c] = 0;
+        }
+        for (x = input_w; x < max_x; x++)
+        {
+            for (c = 0; c < channels; c++)
+                decode_buffer[x*channels + c] = 0;
+        }
+    }
+}
+// Returns the start of entry `index` in a ring buffer whose consecutive entries lie ring_buffer_length floats apart.
+static float* stbir__get_ring_buffer_entry(float* ring_buffer, int index, int ring_buffer_length)
+{
+    return ring_buffer + index * ring_buffer_length;
+}
+// Records n as the newest buffered scanline, picks its ring buffer slot, zero-fills that slot and returns it.
+static float* stbir__add_empty_ring_buffer_entry(stbir__info* stbir_info, int n)
+{
+    float* entry;
+    int slot = 0;
+    stbir_info->ring_buffer_last_scanline = n;
+
+    if (stbir_info->ring_buffer_begin_index >= 0)
+    {
+        // Slot = begin slot advanced by the distance from the first buffered scanline, wrapped around the ring.
+        slot = (stbir_info->ring_buffer_begin_index + (stbir_info->ring_buffer_last_scanline - stbir_info->ring_buffer_first_scanline)) % stbir_info->ring_buffer_num_entries;
+        STBIR_ASSERT(slot != stbir_info->ring_buffer_begin_index); // must not land on the first scanline's slot
+    }
+    else
+    {
+        // Negative begin index (presumably: nothing buffered yet): n becomes the first scanline, in slot 0.
+        stbir_info->ring_buffer_begin_index = 0;
+        stbir_info->ring_buffer_first_scanline = n;
+    }
+
+    entry = stbir__get_ring_buffer_entry(stbir_info->ring_buffer, slot, stbir_info->ring_buffer_length_bytes / sizeof(float));
+    memset(entry, 0, stbir_info->ring_buffer_length_bytes);
+    return entry;
+}
+
+// Horizontal gather pass: for each output pixel x, adds decode-buffer pixels n0..n1, weighted by that pixel's coefficients, into output_buffer (accumulates with +=; the callers in this file pass a zeroed buffer).
+static void stbir__resample_horizontal_upsample(stbir__info* stbir_info, float* output_buffer)
+{
+    int x, k;
+    int output_w = stbir_info->output_w;
+    int channels = stbir_info->channels;
+    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
+    stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
+    float* horizontal_coefficients = stbir_info->horizontal_coefficients;
+    int coefficient_width = stbir_info->horizontal_coefficient_width;
+
+    for (x = 0; x < output_w; x++)
+    {
+        int n0 = horizontal_contributors[x].n0; // first input pixel contributing to output pixel x
+        int n1 = horizontal_contributors[x].n1; // last input pixel contributing (inclusive)
+
+        int out_pixel_index = x * channels;
+        int coefficient_group = coefficient_width * x; // where pixel x's coefficients start in the flat coefficient array
+        int coefficient_counter = 0;
+
+        STBIR_ASSERT(n1 >= n0);
+        STBIR_ASSERT(n0 >= -stbir_info->horizontal_filter_pixel_margin);
+        STBIR_ASSERT(n1 >= -stbir_info->horizontal_filter_pixel_margin);
+        STBIR_ASSERT(n0 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
+        STBIR_ASSERT(n1 < stbir_info->input_w + stbir_info->horizontal_filter_pixel_margin);
+
+        switch (channels) { // hand-unrolled for 1-4 channels; generic per-channel loop otherwise
+            case 1:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * 1;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                }
+                break;
+            case 2:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * 2;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                }
+                break;
+            case 3:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * 3;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
+                }
+                break;
+            case 4:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * 4;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
+                    output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
+                }
+                break;
+            default:
+                for (k = n0; k <= n1; k++)
+                {
+                    int in_pixel_index = k * channels;
+                    float coefficient = horizontal_coefficients[coefficient_group + coefficient_counter++];
+                    int c;
+                    STBIR_ASSERT(coefficient != 0);
+                    for (c = 0; c < channels; c++)
+                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
+                }
+                break;
+        }
+    }
+}
+// Horizontal scatter pass: each input pixel (both margins included) adds its value, weighted by its coefficients, into output pixels n0..n1 of output_buffer (accumulates with +=).
+static void stbir__resample_horizontal_downsample(stbir__info* stbir_info, float* output_buffer)
+{
+    int x, k;
+    int input_w = stbir_info->input_w;
+    int channels = stbir_info->channels;
+    float* decode_buffer = stbir__get_decode_buffer(stbir_info);
+    stbir__contributors* horizontal_contributors = stbir_info->horizontal_contributors;
+    float* horizontal_coefficients = stbir_info->horizontal_coefficients;
+    int coefficient_width = stbir_info->horizontal_coefficient_width;
+    int filter_pixel_margin = stbir_info->horizontal_filter_pixel_margin;
+    int max_x = input_w + filter_pixel_margin * 2; // one contributor per input pixel, left and right margins included
+
+    STBIR_ASSERT(!stbir__use_width_upsampling(stbir_info));
+
+    switch (channels) { // hand-unrolled for 1-4 channels; generic per-channel loop otherwise
+        case 1:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0; // first output pixel this input pixel adds to
+                int n1 = horizontal_contributors[x].n1; // last output pixel it adds to (inclusive)
+
+                int in_x = x - filter_pixel_margin; // contributor index -> input x (negative inside the left margin)
+                int in_pixel_index = in_x * 1;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int out_pixel_index = k * 1;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                }
+            }
+            break;
+
+        case 2:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * 2;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int out_pixel_index = k * 2;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                }
+            }
+            break;
+
+        case 3:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * 3;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int out_pixel_index = k * 3;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
+                }
+            }
+            break;
+
+        case 4:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * 4;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int out_pixel_index = k * 4;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    output_buffer[out_pixel_index + 0] += decode_buffer[in_pixel_index + 0] * coefficient;
+                    output_buffer[out_pixel_index + 1] += decode_buffer[in_pixel_index + 1] * coefficient;
+                    output_buffer[out_pixel_index + 2] += decode_buffer[in_pixel_index + 2] * coefficient;
+                    output_buffer[out_pixel_index + 3] += decode_buffer[in_pixel_index + 3] * coefficient;
+                }
+            }
+            break;
+
+        default:
+            for (x = 0; x < max_x; x++)
+            {
+                int n0 = horizontal_contributors[x].n0;
+                int n1 = horizontal_contributors[x].n1;
+
+                int in_x = x - filter_pixel_margin;
+                int in_pixel_index = in_x * channels;
+                int max_n = n1;
+                int coefficient_group = coefficient_width * x;
+
+                for (k = n0; k <= max_n; k++)
+                {
+                    int c;
+                    int out_pixel_index = k * channels;
+                    float coefficient = horizontal_coefficients[coefficient_group + k - n0];
+                    STBIR_ASSERT(coefficient != 0);
+                    for (c = 0; c < channels; c++)
+                        output_buffer[out_pixel_index + c] += decode_buffer[in_pixel_index + c] * coefficient;
+                }
+            }
+            break;
+    }
+}
+// Decodes input scanline n and resamples it horizontally into a newly added (zeroed) ring buffer entry.
+static void stbir__decode_and_resample_upsample(stbir__info* stbir_info, int n)
+{
+    // Step 1: fill the decode buffer from scanline n of the source image.
+    stbir__decode_scanline(stbir_info, n);
+
+    // Step 2: run the horizontal pass straight into the ring buffer entry for scanline n.
+    if (!stbir__use_width_upsampling(stbir_info))
+        stbir__resample_horizontal_downsample(stbir_info, stbir__add_empty_ring_buffer_entry(stbir_info, n));
+    else
+        stbir__resample_horizontal_upsample(stbir_info, stbir__add_empty_ring_buffer_entry(stbir_info, n));
+
+    // The row now waits in the ring buffer as a source for the vertical pass.
+}
+// Decodes input scanline n and resamples it horizontally into the freshly cleared horizontal buffer.
+static void stbir__decode_and_resample_downsample(stbir__info* stbir_info, int n)
+{
+    // Step 1: fill the decode buffer from scanline n of the source image.
+    stbir__decode_scanline(stbir_info, n);
+
+    // Step 2: clear one output row of floats; the horizontal passes add into it.
+    memset(stbir_info->horizontal_buffer, 0, stbir_info->output_w * stbir_info->channels * sizeof(float));
+
+    // Step 3: run the horizontal pass into the horizontal buffer.
+    if (!stbir__use_width_upsampling(stbir_info))
+        stbir__resample_horizontal_downsample(stbir_info, stbir_info->horizontal_buffer);
+    else
+        stbir__resample_horizontal_upsample(stbir_info, stbir_info->horizontal_buffer);
+    // The row is now ready to be distributed into the ring buffers.
+}
+
+// Maps scanline number get_scanline to its ring buffer entry: (begin_index + distance from first_scanline) modulo the entry count.
+static float* stbir__get_ring_buffer_scanline(int get_scanline, float* ring_buffer, int begin_index, int first_scanline, int ring_buffer_num_entries, int ring_buffer_length)
+{
+    int scanlines_past_first = get_scanline - first_scanline;
+    return stbir__get_ring_buffer_entry(ring_buffer, (begin_index + scanlines_past_first) % ring_buffer_num_entries, ring_buffer_length);
+}
+
+// Converts num_pixels pixels of float data in encode_buffer to the format selected by `decode` and writes them to output_buffer; unless STBIR_FLAG_ALPHA_PREMULTIPLIED is set, color channels are first divided by alpha in place (encode_buffer is modified).
+static void stbir__encode_scanline(stbir__info* stbir_info, int num_pixels, void *output_buffer, float *encode_buffer, int channels, int alpha_channel, int decode)
+{
+    int x;
+    int n;
+    int num_nonalpha;
+    stbir_uint16 nonalpha[STBIR_MAX_CHANNELS];
+
+    if (!(stbir_info->flags&STBIR_FLAG_ALPHA_PREMULTIPLIED))
+    {
+        for (x=0; x < num_pixels; ++x)
+        {
+            int pixel_index = x*channels;
+
+            float alpha = encode_buffer[pixel_index + alpha_channel];
+            float reciprocal_alpha = alpha ? 1.0f / alpha : 0; // zero alpha: scale colors by 0 instead of dividing by zero
+
+            // unrolling this produced a 1% slowdown upscaling a large RGBA linear-space image on my machine - stb
+            for (n = 0; n < channels; n++)
+                if (n != alpha_channel)
+                    encode_buffer[pixel_index + n] *= reciprocal_alpha;
+
+            // We added in a small epsilon to prevent the color channel from being deleted with zero alpha.
+            // Because we only add it for integer types, it will automatically be discarded on integer
+            // conversion, so we don't need to subtract it back out (which would be problematic for
+            // numeric precision reasons).
+        }
+    }
+
+    // build a table of all channels that need colorspace correction, so
+    // we don't perform colorspace correction on channels that don't need it.
+    for (x = 0, num_nonalpha = 0; x < channels; ++x)
+    {
+        if (x != alpha_channel || (stbir_info->flags & STBIR_FLAG_ALPHA_USES_COLORSPACE))
+        {
+            nonalpha[num_nonalpha++] = (stbir_uint16)x;
+        }
+    }
+
+    #define STBIR__ROUND_INT(f)    ((int)          ((f)+0.5))
+    #define STBIR__ROUND_UINT(f)   ((stbir_uint32) ((f)+0.5))
+
+    #ifdef STBIR__SATURATE_INT
+    #define STBIR__ENCODE_LINEAR8(f)   stbir__saturate8 (STBIR__ROUND_INT((f) * stbir__max_uint8_as_float ))
+    #define STBIR__ENCODE_LINEAR16(f)  stbir__saturate16(STBIR__ROUND_INT((f) * stbir__max_uint16_as_float))
+    #else
+    #define STBIR__ENCODE_LINEAR8(f)   (unsigned char ) STBIR__ROUND_INT(stbir__saturate(f) * stbir__max_uint8_as_float )
+    #define STBIR__ENCODE_LINEAR16(f)  (unsigned short) STBIR__ROUND_INT(stbir__saturate(f) * stbir__max_uint16_as_float)
+    #endif
+
+    switch (decode)
+    {
+        case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_LINEAR):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < channels; n++)
+                {
+                    int index = pixel_index + n;
+                    ((unsigned char*)output_buffer)[index] = STBIR__ENCODE_LINEAR8(encode_buffer[index]);
+                }
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT8, STBIR_COLORSPACE_SRGB):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < num_nonalpha; n++)
+                {
+                    int index = pixel_index + nonalpha[n];
+                    ((unsigned char*)output_buffer)[index] = stbir__linear_to_srgb_uchar(encode_buffer[index]);
+                }
+
+                if (!(stbir_info->flags & STBIR_FLAG_ALPHA_USES_COLORSPACE)) // alpha was left out of nonalpha[]; encode it linearly
+                    ((unsigned char *)output_buffer)[pixel_index + alpha_channel] = STBIR__ENCODE_LINEAR8(encode_buffer[pixel_index+alpha_channel]);
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_LINEAR):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < channels; n++)
+                {
+                    int index = pixel_index + n;
+                    ((unsigned short*)output_buffer)[index] = STBIR__ENCODE_LINEAR16(encode_buffer[index]);
+                }
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT16, STBIR_COLORSPACE_SRGB):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < num_nonalpha; n++)
+                {
+                    int index = pixel_index + nonalpha[n];
+                    ((unsigned short*)output_buffer)[index] = (unsigned short)STBIR__ROUND_INT(stbir__linear_to_srgb(stbir__saturate(encode_buffer[index])) * stbir__max_uint16_as_float);
+                }
+
+                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                    ((unsigned short*)output_buffer)[pixel_index + alpha_channel] = STBIR__ENCODE_LINEAR16(encode_buffer[pixel_index + alpha_channel]);
+            }
+
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_LINEAR):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < channels; n++)
+                {
+                    int index = pixel_index + n;
+                    ((unsigned int*)output_buffer)[index] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__saturate(encode_buffer[index])) * stbir__max_uint32_as_float);
+                }
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_UINT32, STBIR_COLORSPACE_SRGB):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < num_nonalpha; n++)
+                {
+                    int index = pixel_index + nonalpha[n];
+                    ((unsigned int*)output_buffer)[index] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__linear_to_srgb(stbir__saturate(encode_buffer[index]))) * stbir__max_uint32_as_float);
+                }
+
+                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE)) // 32-bit alpha must round through STBIR__ROUND_UINT: a signed int cast is out of range above INT_MAX
+                    ((unsigned int*)output_buffer)[pixel_index + alpha_channel] = (unsigned int)STBIR__ROUND_UINT(((double)stbir__saturate(encode_buffer[pixel_index + alpha_channel])) * stbir__max_uint32_as_float);
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_LINEAR):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < channels; n++)
+                {
+                    int index = pixel_index + n;
+                    ((float*)output_buffer)[index] = encode_buffer[index];
+                }
+            }
+            break;
+
+        case STBIR__DECODE(STBIR_TYPE_FLOAT, STBIR_COLORSPACE_SRGB):
+            for (x=0; x < num_pixels; ++x)
+            {
+                int pixel_index = x*channels;
+
+                for (n = 0; n < num_nonalpha; n++)
+                {
+                    int index = pixel_index + nonalpha[n];
+                    ((float*)output_buffer)[index] = stbir__linear_to_srgb(encode_buffer[index]);
+                }
+
+                if (!(stbir_info->flags&STBIR_FLAG_ALPHA_USES_COLORSPACE))
+                    ((float*)output_buffer)[pixel_index + alpha_channel] = encode_buffer[pixel_index + alpha_channel];
+            }
+            break;
+
+        default:
+            STBIR_ASSERT(!"Unknown type/colorspace/channels combination.");
+            break;
+    }
+}
+// Produces output row n: sums ring buffer scanlines n0..n1, weighted by row n's vertical coefficients, into encode_buffer, then encodes that row into output_data.
+static void stbir__resample_vertical_upsample(stbir__info* stbir_info, int n)
+{
+    int x, k;
+    int output_w = stbir_info->output_w;
+    stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
+    float* vertical_coefficients = stbir_info->vertical_coefficients;
+    int channels = stbir_info->channels;
+    int alpha_channel = stbir_info->alpha_channel;
+    int type = stbir_info->type;
+    int colorspace = stbir_info->colorspace;
+    int ring_buffer_entries = stbir_info->ring_buffer_num_entries;
+    void* output_data = stbir_info->output_data;
+    float* encode_buffer = stbir_info->encode_buffer;
+    int decode = STBIR__DECODE(type, colorspace);
+    int coefficient_width = stbir_info->vertical_coefficient_width;
+    int coefficient_counter;
+    int contributor = n; // contributors are indexed directly by the output row here
+
+    float* ring_buffer = stbir_info->ring_buffer;
+    int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
+    int ring_buffer_first_scanline = stbir_info->ring_buffer_first_scanline;
+    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float); // entry length in floats
+
+    int n0,n1, output_row_start;
+    int coefficient_group = coefficient_width * contributor;
+
+    n0 = vertical_contributors[contributor].n0; // first scanline contributing to this row
+    n1 = vertical_contributors[contributor].n1; // last scanline contributing (inclusive)
+
+    output_row_start = n * stbir_info->output_stride_bytes; // byte offset of output row n
+
+    STBIR_ASSERT(stbir__use_height_upsampling(stbir_info));
+
+    memset(encode_buffer, 0, output_w * sizeof(float) * channels); // the loops below accumulate with +=, so start from zero
+
+    // I tried reblocking this for better cache usage of encode_buffer
+    // (using x_outer, k, x_inner), but it lost speed. -- stb
+
+    coefficient_counter = 0;
+    switch (channels) { // hand-unrolled for 1-4 channels; generic per-channel loop otherwise
+        case 1:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * 1;
+                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
+                }
+            }
+            break;
+        case 2:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * 2;
+                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
+                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
+                }
+            }
+            break;
+        case 3:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * 3;
+                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
+                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
+                    encode_buffer[in_pixel_index + 2] += ring_buffer_entry[in_pixel_index + 2] * coefficient;
+                }
+            }
+            break;
+        case 4:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * 4;
+                    encode_buffer[in_pixel_index + 0] += ring_buffer_entry[in_pixel_index + 0] * coefficient;
+                    encode_buffer[in_pixel_index + 1] += ring_buffer_entry[in_pixel_index + 1] * coefficient;
+                    encode_buffer[in_pixel_index + 2] += ring_buffer_entry[in_pixel_index + 2] * coefficient;
+                    encode_buffer[in_pixel_index + 3] += ring_buffer_entry[in_pixel_index + 3] * coefficient;
+                }
+            }
+            break;
+        default:
+            for (k = n0; k <= n1; k++)
+            {
+                int coefficient_index = coefficient_counter++;
+                float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+                float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+                for (x = 0; x < output_w; ++x)
+                {
+                    int in_pixel_index = x * channels;
+                    int c;
+                    for (c = 0; c < channels; c++)
+                        encode_buffer[in_pixel_index + c] += ring_buffer_entry[in_pixel_index + c] * coefficient;
+                }
+            }
+            break;
+    }
+    stbir__encode_scanline(stbir_info, output_w, (char *) output_data + output_row_start, encode_buffer, channels, alpha_channel, decode);
+}
+// Adds the horizontal buffer, scaled by the matching vertical coefficient, into every ring buffer scanline n0..n1 of contributor n.
+static void stbir__resample_vertical_downsample(stbir__info* stbir_info, int n)
+{
+    int x, k;
+    int output_w = stbir_info->output_w;
+    stbir__contributors* vertical_contributors = stbir_info->vertical_contributors;
+    float* vertical_coefficients = stbir_info->vertical_coefficients;
+    int channels = stbir_info->channels;
+    int ring_buffer_entries = stbir_info->ring_buffer_num_entries;
+    float* horizontal_buffer = stbir_info->horizontal_buffer;
+    int coefficient_width = stbir_info->vertical_coefficient_width;
+    int contributor = n + stbir_info->vertical_filter_pixel_margin; // table index is n shifted by the vertical pixel margin
+
+    float* ring_buffer = stbir_info->ring_buffer;
+    int ring_buffer_begin_index = stbir_info->ring_buffer_begin_index;
+    int ring_buffer_first_scanline = stbir_info->ring_buffer_first_scanline;
+    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float); // length of one entry in floats, not bytes
+    int n0,n1;
+
+    n0 = vertical_contributors[contributor].n0;
+    n1 = vertical_contributors[contributor].n1;
+
+    STBIR_ASSERT(!stbir__use_height_upsampling(stbir_info));
+
+    for (k = n0; k <= n1; k++)
+    {
+        int coefficient_index = k - n0; // position of scanline k within this contributor's coefficients
+        int coefficient_group = coefficient_width * contributor; // offset of this contributor's first coefficient
+        float coefficient = vertical_coefficients[coefficient_group + coefficient_index];
+
+        float* ring_buffer_entry = stbir__get_ring_buffer_scanline(k, ring_buffer, ring_buffer_begin_index, ring_buffer_first_scanline, ring_buffer_entries, ring_buffer_length);
+
+        switch (channels) { // hand-unrolled for 1-4 channels; default loops over any channel count
+            case 1:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * 1;
+                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
+                }
+                break;
+            case 2:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * 2;
+                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
+                }
+                break;
+            case 3:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * 3;
+                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 2] += horizontal_buffer[in_pixel_index + 2] * coefficient;
+                }
+                break;
+            case 4:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * 4;
+                    ring_buffer_entry[in_pixel_index + 0] += horizontal_buffer[in_pixel_index + 0] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 1] += horizontal_buffer[in_pixel_index + 1] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 2] += horizontal_buffer[in_pixel_index + 2] * coefficient;
+                    ring_buffer_entry[in_pixel_index + 3] += horizontal_buffer[in_pixel_index + 3] * coefficient;
+                }
+                break;
+            default:
+                for (x = 0; x < output_w; x++)
+                {
+                    int in_pixel_index = x * channels;
+
+                    int c;
+                    for (c = 0; c < channels; c++)
+                        ring_buffer_entry[in_pixel_index + c] += horizontal_buffer[in_pixel_index + c] * coefficient;
+                }
+                break;
+        }
+    }
+}
+// Upsampling driver: for each output row, brings the required input scanlines into the ring buffer, then resamples them vertically.
+static void stbir__buffer_loop_upsample(stbir__info* stbir_info)
+{
+    int y;
+    float scale_ratio = stbir_info->vertical_scale;
+    float out_scanlines_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(1/scale_ratio) * scale_ratio;
+
+    STBIR_ASSERT(stbir__use_height_upsampling(stbir_info));
+
+    for (y = 0; y < stbir_info->output_h; y++)
+    {
+        float in_center_of_out = 0; // Center of the current out scanline in the in scanline space
+        int in_first_scanline = 0, in_last_scanline = 0;
+
+        stbir__calculate_sample_range_upsample(y, out_scanlines_radius, scale_ratio, stbir_info->vertical_shift, &in_first_scanline, &in_last_scanline, &in_center_of_out);
+
+        STBIR_ASSERT(in_last_scanline - in_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
+
+        if (stbir_info->ring_buffer_begin_index >= 0) // a negative begin index marks an empty ring buffer
+        {
+            // Get rid of whatever we don't need anymore.
+            while (in_first_scanline > stbir_info->ring_buffer_first_scanline)
+            {
+                if (stbir_info->ring_buffer_first_scanline == stbir_info->ring_buffer_last_scanline)
+                {
+                    // We just popped the last scanline off the ring buffer.
+                    // Reset it to the empty state.
+                    stbir_info->ring_buffer_begin_index = -1;
+                    stbir_info->ring_buffer_first_scanline = 0;
+                    stbir_info->ring_buffer_last_scanline = 0;
+                    break;
+                }
+                else
+                {
+                    stbir_info->ring_buffer_first_scanline++;
+                    stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->ring_buffer_num_entries;
+                }
+            }
+        }
+
+        // Load in new ones.
+        if (stbir_info->ring_buffer_begin_index < 0) // buffer is empty: start with the first scanline this row needs
+            stbir__decode_and_resample_upsample(stbir_info, in_first_scanline);
+
+        while (in_last_scanline > stbir_info->ring_buffer_last_scanline) // then extend one scanline at a time
+            stbir__decode_and_resample_upsample(stbir_info, stbir_info->ring_buffer_last_scanline + 1);
+
+        // Now all buffers should be ready to write a row of vertical sampling.
+        stbir__resample_vertical_upsample(stbir_info, y);
+
+        STBIR_PROGRESS_REPORT((float)y / stbir_info->output_h);
+    }
+}
+// Pops every ring buffer scanline before first_necessary_scanline, encoding each popped scanline that lies inside the output image.
+static void stbir__empty_ring_buffer(stbir__info* stbir_info, int first_necessary_scanline)
+{
+    int output_stride_bytes = stbir_info->output_stride_bytes;
+    int channels = stbir_info->channels;
+    int alpha_channel = stbir_info->alpha_channel;
+    int type = stbir_info->type;
+    int colorspace = stbir_info->colorspace;
+    int output_w = stbir_info->output_w;
+    void* output_data = stbir_info->output_data;
+    int decode = STBIR__DECODE(type, colorspace);
+
+    float* ring_buffer = stbir_info->ring_buffer;
+    int ring_buffer_length = stbir_info->ring_buffer_length_bytes/sizeof(float); // length of one entry in floats, not bytes
+
+    if (stbir_info->ring_buffer_begin_index >= 0) // a negative begin index marks an empty ring buffer
+    {
+        // Get rid of whatever we don't need anymore.
+        while (first_necessary_scanline > stbir_info->ring_buffer_first_scanline)
+        {
+            if (stbir_info->ring_buffer_first_scanline >= 0 && stbir_info->ring_buffer_first_scanline < stbir_info->output_h) // skip rows outside the output image
+            {
+                int output_row_start = stbir_info->ring_buffer_first_scanline * output_stride_bytes; // byte offset of the row in output_data
+                float* ring_buffer_entry = stbir__get_ring_buffer_entry(ring_buffer, stbir_info->ring_buffer_begin_index, ring_buffer_length);
+                stbir__encode_scanline(stbir_info, output_w, (char *) output_data + output_row_start, ring_buffer_entry, channels, alpha_channel, decode);
+                STBIR_PROGRESS_REPORT((float)stbir_info->ring_buffer_first_scanline / stbir_info->output_h);
+            }
+
+            if (stbir_info->ring_buffer_first_scanline == stbir_info->ring_buffer_last_scanline)
+            {
+                // We just popped the last scanline off the ring buffer.
+                // Reset it to the empty state.
+                stbir_info->ring_buffer_begin_index = -1;
+                stbir_info->ring_buffer_first_scanline = 0;
+                stbir_info->ring_buffer_last_scanline = 0;
+                break;
+            }
+            else
+            {
+                stbir_info->ring_buffer_first_scanline++;
+                stbir_info->ring_buffer_begin_index = (stbir_info->ring_buffer_begin_index + 1) % stbir_info->ring_buffer_num_entries;
+            }
+        }
+    }
+}
+// Downsampling driver: for each input scanline, writes out finished output rows, then accumulates the scanline into the ring buffer.
+static void stbir__buffer_loop_downsample(stbir__info* stbir_info)
+{
+    int y;
+    float scale_ratio = stbir_info->vertical_scale;
+    int output_h = stbir_info->output_h;
+    float in_pixels_radius = stbir__filter_info_table[stbir_info->vertical_filter].support(scale_ratio) / scale_ratio;
+    int pixel_margin = stbir_info->vertical_filter_pixel_margin;
+    int max_y = stbir_info->input_h + pixel_margin;
+
+    STBIR_ASSERT(!stbir__use_height_upsampling(stbir_info));
+
+    for (y = -pixel_margin; y < max_y; y++) // y walks the input scanlines, extended by the filter margin on both sides
+    {
+        float out_center_of_in; // Center of the current in scanline in the out scanline space
+        int out_first_scanline, out_last_scanline;
+
+        stbir__calculate_sample_range_downsample(y, in_pixels_radius, scale_ratio, stbir_info->vertical_shift, &out_first_scanline, &out_last_scanline, &out_center_of_in);
+
+        STBIR_ASSERT(out_last_scanline - out_first_scanline + 1 <= stbir_info->ring_buffer_num_entries);
+
+        if (out_last_scanline < 0 || out_first_scanline >= output_h) // this input scanline touches no output row
+            continue;
+
+        stbir__empty_ring_buffer(stbir_info, out_first_scanline); // write out and drop rows before out_first_scanline
+
+        stbir__decode_and_resample_downsample(stbir_info, y);
+
+        // Load in new ones.
+        if (stbir_info->ring_buffer_begin_index < 0)
+            stbir__add_empty_ring_buffer_entry(stbir_info, out_first_scanline);
+
+        while (out_last_scanline > stbir_info->ring_buffer_last_scanline)
+            stbir__add_empty_ring_buffer_entry(stbir_info, stbir_info->ring_buffer_last_scanline + 1);
+
+        // Now the horizontal buffer is ready to write to all ring buffer rows.
+        stbir__resample_vertical_downsample(stbir_info, y);
+    }
+
+    stbir__empty_ring_buffer(stbir_info, stbir_info->output_h); // write out the rows still held in the ring buffer
+}
+// Stores the input and output dimensions and the channel count in info.
+static void stbir__setup(stbir__info *info, int input_w, int input_h, int output_w, int output_h, int channels)
+{
+    info->input_w = input_w;
+    info->input_h = input_h;
+    info->output_w = output_w;
+    info->output_h = output_h;
+    info->channels = channels;
+}
+// Stores the s/t region in info and sets scale and shift, from transform[0..3] when given, otherwise derived from the region.
+static void stbir__calculate_transform(stbir__info *info, float s0, float t0, float s1, float t1, float *transform)
+{
+    info->s0 = s0;
+    info->t0 = t0;
+    info->s1 = s1;
+    info->t1 = t1;
+
+    if (transform) // order: horizontal scale, vertical scale, horizontal shift, vertical shift
+    {
+        info->horizontal_scale = transform[0];
+        info->vertical_scale   = transform[1];
+        info->horizontal_shift = transform[2];
+        info->vertical_shift   = transform[3];
+    }
+    else
+    {
+        info->horizontal_scale = ((float)info->output_w / info->input_w) / (s1 - s0);
+        info->vertical_scale = ((float)info->output_h / info->input_h) / (t1 - t0);
+        // NOTE(review): s1 == s0 or t1 == t0 divides by zero in this branch; nothing here rejects such a region.
+        info->horizontal_shift = s0 * info->output_w / (s1 - s0);
+        info->vertical_shift = t0 * info->output_h / (t1 - t0);
+    }
+}
+// Stores the filters in info; a filter value of 0 is replaced by the default upsample or downsample filter for that axis.
+static void stbir__choose_filter(stbir__info *info, stbir_filter h_filter, stbir_filter v_filter)
+{
+    if (h_filter == 0)
+        h_filter = stbir__use_upsampling(info->horizontal_scale) ? STBIR_DEFAULT_FILTER_UPSAMPLE : STBIR_DEFAULT_FILTER_DOWNSAMPLE;
+    if (v_filter == 0)
+        v_filter = stbir__use_upsampling(info->vertical_scale)   ? STBIR_DEFAULT_FILTER_UPSAMPLE : STBIR_DEFAULT_FILTER_DOWNSAMPLE;
+    info->horizontal_filter = h_filter;
+    info->vertical_filter = v_filter;
+}
+// Fills in the byte size of every scratch buffer in info and returns the total number of bytes required.
+static stbir_uint32 stbir__calculate_memory(stbir__info *info)
+{
+    int pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
+    int filter_height = stbir__get_filter_pixel_width(info->vertical_filter, info->vertical_scale);
+
+    info->horizontal_num_contributors = stbir__get_contributors(info->horizontal_scale, info->horizontal_filter, info->input_w, info->output_w);
+    info->vertical_num_contributors   = stbir__get_contributors(info->vertical_scale  , info->vertical_filter  , info->input_h, info->output_h);
+
+    // One extra entry because floating point precision problems sometimes cause an extra to be necessary.
+    info->ring_buffer_num_entries = filter_height + 1;
+
+    info->horizontal_contributors_size = info->horizontal_num_contributors * sizeof(stbir__contributors);
+    info->horizontal_coefficients_size = stbir__get_total_horizontal_coefficients(info) * sizeof(float);
+    info->vertical_contributors_size = info->vertical_num_contributors * sizeof(stbir__contributors);
+    info->vertical_coefficients_size = stbir__get_total_vertical_coefficients(info) * sizeof(float);
+    info->decode_buffer_size = (info->input_w + pixel_margin * 2) * info->channels * sizeof(float);
+    info->horizontal_buffer_size = info->output_w * info->channels * sizeof(float);
+    info->ring_buffer_size = info->output_w * info->channels * info->ring_buffer_num_entries * sizeof(float);
+    info->encode_buffer_size = info->output_w * info->channels * sizeof(float);
+    // NOTE(review): none of the size products above is checked for overflow; confirm callers bound the dimensions.
+    STBIR_ASSERT(info->horizontal_filter != 0);
+    STBIR_ASSERT(info->horizontal_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); // this now happens too late: the filter was already passed to the helpers above
+    STBIR_ASSERT(info->vertical_filter != 0);
+    STBIR_ASSERT(info->vertical_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table)); // this now happens too late: the filter was already passed to the helpers above
+
+    if (stbir__use_height_upsampling(info))
+        // The horizontal buffer is for when we're downsampling the height and we
+        // can't output the result of sampling the decode buffer directly into the
+        // ring buffers.
+        info->horizontal_buffer_size = 0;
+    else
+        // The encode buffer is to retain precision in the height upsampling method
+        // and isn't used when height downsampling.
+        info->encode_buffer_size = 0;
+
+    return info->horizontal_contributors_size + info->horizontal_coefficients_size
+        + info->vertical_contributors_size + info->vertical_coefficients_size
+        + info->decode_buffer_size + info->horizontal_buffer_size
+        + info->ring_buffer_size + info->encode_buffer_size;
+}
+// Validates the arguments, lays the scratch buffers out inside tempmem, computes the filters and runs the resize. Returns 1 on success, 0 on a failed check.
+static int stbir__resize_allocated(stbir__info *info,
+    const void* input_data, int input_stride_in_bytes,
+    void* output_data, int output_stride_in_bytes,
+    int alpha_channel, stbir_uint32 flags, stbir_datatype type,
+    stbir_edge edge_horizontal, stbir_edge edge_vertical, stbir_colorspace colorspace,
+    void* tempmem, size_t tempmem_size_in_bytes)
+{
+    size_t memory_required = stbir__calculate_memory(info); // also fills in the info->*_size fields used for the layout below
+    // A stride of 0 selects tightly packed rows (channels * width * size of one component).
+    int width_stride_input = input_stride_in_bytes ? input_stride_in_bytes : info->channels * info->input_w * stbir__type_size[type];
+    int width_stride_output = output_stride_in_bytes ? output_stride_in_bytes : info->channels * info->output_w * stbir__type_size[type];
+
+#ifdef STBIR_DEBUG_OVERWRITE_TEST
+#define OVERWRITE_ARRAY_SIZE 8
+    unsigned char overwrite_output_before_pre[OVERWRITE_ARRAY_SIZE];
+    unsigned char overwrite_tempmem_before_pre[OVERWRITE_ARRAY_SIZE];
+    unsigned char overwrite_output_after_pre[OVERWRITE_ARRAY_SIZE];
+    unsigned char overwrite_tempmem_after_pre[OVERWRITE_ARRAY_SIZE];
+
+    size_t begin_forbidden = width_stride_output * (info->output_h - 1) + info->output_w * info->channels * stbir__type_size[type];
+    memcpy(overwrite_output_before_pre, &((unsigned char*)output_data)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE);
+    memcpy(overwrite_output_after_pre, &((unsigned char*)output_data)[begin_forbidden], OVERWRITE_ARRAY_SIZE);
+    memcpy(overwrite_tempmem_before_pre, &((unsigned char*)tempmem)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE);
+    memcpy(overwrite_tempmem_after_pre, &((unsigned char*)tempmem)[tempmem_size_in_bytes], OVERWRITE_ARRAY_SIZE);
+#endif
+    // Each precondition below is asserted and then checked again with an early return of 0.
+    STBIR_ASSERT(info->channels >= 0);
+    STBIR_ASSERT(info->channels <= STBIR_MAX_CHANNELS);
+
+    if (info->channels < 0 || info->channels > STBIR_MAX_CHANNELS)
+        return 0;
+
+    STBIR_ASSERT(info->horizontal_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
+    STBIR_ASSERT(info->vertical_filter < STBIR__ARRAY_SIZE(stbir__filter_info_table));
+
+    if (info->horizontal_filter >= STBIR__ARRAY_SIZE(stbir__filter_info_table))
+        return 0;
+    if (info->vertical_filter >= STBIR__ARRAY_SIZE(stbir__filter_info_table))
+        return 0;
+
+    if (alpha_channel < 0) // negative alpha_channel: force both flags so the index assert below is skipped
+        flags |= STBIR_FLAG_ALPHA_USES_COLORSPACE | STBIR_FLAG_ALPHA_PREMULTIPLIED;
+
+    if (!(flags&STBIR_FLAG_ALPHA_USES_COLORSPACE) || !(flags&STBIR_FLAG_ALPHA_PREMULTIPLIED))
+        STBIR_ASSERT(alpha_channel >= 0 && alpha_channel < info->channels);
+
+    if (alpha_channel >= info->channels)
+        return 0;
+
+    STBIR_ASSERT(tempmem);
+
+    if (!tempmem)
+        return 0;
+
+    STBIR_ASSERT(tempmem_size_in_bytes >= memory_required);
+
+    if (tempmem_size_in_bytes < memory_required)
+        return 0;
+
+    memset(tempmem, 0, tempmem_size_in_bytes);
+
+    info->input_data = input_data;
+    info->input_stride_bytes = width_stride_input;
+
+    info->output_data = output_data;
+    info->output_stride_bytes = width_stride_output;
+
+    info->alpha_channel = alpha_channel;
+    info->flags = flags;
+    info->type = type;
+    info->edge_horizontal = edge_horizontal;
+    info->edge_vertical = edge_vertical;
+    info->colorspace = colorspace;
+
+    info->horizontal_coefficient_width   = stbir__get_coefficient_width  (info->horizontal_filter, info->horizontal_scale);
+    info->vertical_coefficient_width     = stbir__get_coefficient_width  (info->vertical_filter  , info->vertical_scale  );
+    info->horizontal_filter_pixel_width  = stbir__get_filter_pixel_width (info->horizontal_filter, info->horizontal_scale);
+    info->vertical_filter_pixel_width    = stbir__get_filter_pixel_width (info->vertical_filter  , info->vertical_scale  );
+    info->horizontal_filter_pixel_margin = stbir__get_filter_pixel_margin(info->horizontal_filter, info->horizontal_scale);
+    info->vertical_filter_pixel_margin   = stbir__get_filter_pixel_margin(info->vertical_filter  , info->vertical_scale  );
+
+    info->ring_buffer_length_bytes = info->output_w * info->channels * sizeof(float);
+    info->decode_buffer_pixels = info->input_w + info->horizontal_filter_pixel_margin * 2;
+    // Yields the address directly after 'current', whose byte size is read from the matching current##_size field.
+#define STBIR__NEXT_MEMPTR(current, newtype) (newtype*)(((unsigned char*)current) + current##_size)
+    // Layout in tempmem: h contributors, h coefficients, v contributors, v coefficients, decode buffer, then the per-direction buffers.
+    info->horizontal_contributors = (stbir__contributors *) tempmem;
+    info->horizontal_coefficients = STBIR__NEXT_MEMPTR(info->horizontal_contributors, float);
+    info->vertical_contributors = STBIR__NEXT_MEMPTR(info->horizontal_coefficients, stbir__contributors);
+    info->vertical_coefficients = STBIR__NEXT_MEMPTR(info->vertical_contributors, float);
+    info->decode_buffer = STBIR__NEXT_MEMPTR(info->vertical_coefficients, float);
+
+    if (stbir__use_height_upsampling(info))
+    {
+        info->horizontal_buffer = NULL;
+        info->ring_buffer = STBIR__NEXT_MEMPTR(info->decode_buffer, float);
+        info->encode_buffer = STBIR__NEXT_MEMPTR(info->ring_buffer, float);
+        // NOTE(review): this requires tempmem_size_in_bytes == memory_required, stricter than the >= check above.
+        STBIR_ASSERT((size_t)STBIR__NEXT_MEMPTR(info->encode_buffer, unsigned char) == (size_t)tempmem + tempmem_size_in_bytes);
+    }
+    else
+    {
+        info->horizontal_buffer = STBIR__NEXT_MEMPTR(info->decode_buffer, float);
+        info->ring_buffer = STBIR__NEXT_MEMPTR(info->horizontal_buffer, float);
+        info->encode_buffer = NULL;
+        // NOTE(review): this requires tempmem_size_in_bytes == memory_required, stricter than the >= check above.
+        STBIR_ASSERT((size_t)STBIR__NEXT_MEMPTR(info->ring_buffer, unsigned char) == (size_t)tempmem + tempmem_size_in_bytes);
+    }
+
+#undef STBIR__NEXT_MEMPTR
+
+    // This signals that the ring buffer is empty
+    info->ring_buffer_begin_index = -1;
+
+    stbir__calculate_filters(info->horizontal_contributors, info->horizontal_coefficients, info->horizontal_filter, info->horizontal_scale, info->horizontal_shift, info->input_w, info->output_w);
+    stbir__calculate_filters(info->vertical_contributors, info->vertical_coefficients, info->vertical_filter, info->vertical_scale, info->vertical_shift, info->input_h, info->output_h);
+
+    STBIR_PROGRESS_REPORT(0);
+
+    if (stbir__use_height_upsampling(info))
+        stbir__buffer_loop_upsample(info);
+    else
+        stbir__buffer_loop_downsample(info);
+
+    STBIR_PROGRESS_REPORT(1);
+
+#ifdef STBIR_DEBUG_OVERWRITE_TEST
+    STBIR_ASSERT(memcmp(overwrite_output_before_pre, &((unsigned char*)output_data)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE) == 0);
+    STBIR_ASSERT(memcmp(overwrite_output_after_pre, &((unsigned char*)output_data)[begin_forbidden], OVERWRITE_ARRAY_SIZE) == 0);
+    STBIR_ASSERT(memcmp(overwrite_tempmem_before_pre, &((unsigned char*)tempmem)[-OVERWRITE_ARRAY_SIZE], OVERWRITE_ARRAY_SIZE) == 0);
+    STBIR_ASSERT(memcmp(overwrite_tempmem_after_pre, &((unsigned char*)tempmem)[tempmem_size_in_bytes], OVERWRITE_ARRAY_SIZE) == 0);
+#endif
+
+    return 1;
+}
+
+// Shared implementation behind the public resize functions: fills in info, allocates the scratch memory, runs the resize and frees the memory.
+static int stbir__resize_arbitrary(
+    void *alloc_context,
+    const void* input_data, int input_w, int input_h, int input_stride_in_bytes,
+    void* output_data, int output_w, int output_h, int output_stride_in_bytes,
+    float s0, float t0, float s1, float t1, float *transform,
+    int channels, int alpha_channel, stbir_uint32 flags, stbir_datatype type,
+    stbir_filter h_filter, stbir_filter v_filter,
+    stbir_edge edge_horizontal, stbir_edge edge_vertical, stbir_colorspace colorspace)
+{
+    stbir__info info;
+    int result;
+    size_t memory_required;
+    void* extra_memory;
+
+    stbir__setup(&info, input_w, input_h, output_w, output_h, channels);
+    stbir__calculate_transform(&info, s0,t0,s1,t1,transform);
+    stbir__choose_filter(&info, h_filter, v_filter);
+    memory_required = stbir__calculate_memory(&info);
+    extra_memory = STBIR_MALLOC(memory_required, alloc_context);
+
+    if (!extra_memory) // allocation failed: report failure without touching the output
+        return 0;
+
+    result = stbir__resize_allocated(&info, input_data, input_stride_in_bytes,
+                                            output_data, output_stride_in_bytes, 
+                                            alpha_channel, flags, type,
+                                            edge_horizontal, edge_vertical,
+                                            colorspace, extra_memory, memory_required);
+
+    STBIR_FREE(extra_memory, alloc_context);
+
+    return result;
+}
+// Resizes 8-bit pixels with default filters, clamped edges, linear colorspace, no alpha channel and a NULL allocation context.
+STBIRDEF int stbir_resize_uint8(     const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels)
+{
+    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,-1,0, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
+        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_LINEAR);
+}
+// Resizes float pixels with default filters, clamped edges, linear colorspace, no alpha channel and a NULL allocation context.
+STBIRDEF int stbir_resize_float(     const float *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           float *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels)
+{
+    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,-1,0, STBIR_TYPE_FLOAT, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
+        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_LINEAR);
+}
+// Resizes 8-bit pixels in the sRGB colorspace with default filters and clamped edges; alpha channel and flags come from the caller.
+STBIRDEF int stbir_resize_uint8_srgb(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                           unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                     int num_channels, int alpha_channel, int flags)
+{
+    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
+        STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, STBIR_COLORSPACE_SRGB);
+}
+// Same as stbir_resize_uint8_srgb, except that edge_wrap_mode is used for both the horizontal and the vertical edge.
+STBIRDEF int stbir_resize_uint8_srgb_edgemode(const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                                    unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                              int num_channels, int alpha_channel, int flags,
+                                              stbir_edge edge_wrap_mode)
+{
+    return stbir__resize_arbitrary(NULL, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, STBIR_FILTER_DEFAULT, STBIR_FILTER_DEFAULT,
+        edge_wrap_mode, edge_wrap_mode, STBIR_COLORSPACE_SRGB);
+}
+// Resizes 8-bit pixels; one edge mode and one filter are applied to both axes, and colorspace and allocation context come from the caller.
+STBIRDEF int stbir_resize_uint8_generic( const unsigned char *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                               unsigned char *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT8, filter, filter,
+        edge_wrap_mode, edge_wrap_mode, space);
+}
+// Resizes 16-bit pixels; one edge mode and one filter are applied to both axes, and colorspace and allocation context come from the caller.
+STBIRDEF int stbir_resize_uint16_generic(const stbir_uint16 *input_pixels  , int input_w , int input_h , int input_stride_in_bytes,
+                                               stbir_uint16 *output_pixels , int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_UINT16, filter, filter,
+        edge_wrap_mode, edge_wrap_mode, space);
+}
+
+// Resizes float pixels; one edge mode and one filter are applied to both axes, and colorspace and allocation context come from the caller.
+STBIRDEF int stbir_resize_float_generic( const float *input_pixels         , int input_w , int input_h , int input_stride_in_bytes,
+                                               float *output_pixels        , int output_w, int output_h, int output_stride_in_bytes,
+                                         int num_channels, int alpha_channel, int flags,
+                                         stbir_edge edge_wrap_mode, stbir_filter filter, stbir_colorspace space, 
+                                         void *alloc_context)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, STBIR_TYPE_FLOAT, filter, filter,
+        edge_wrap_mode, edge_wrap_mode, space);
+}
+
+// Resizes the whole image (region 0,0 to 1,1, no explicit transform) with caller-chosen datatype and per-axis edge modes and filters.
+STBIRDEF int stbir_resize(         const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,NULL,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
+        edge_mode_horizontal, edge_mode_vertical, space);
+}
+
+// Like stbir_resize, but passes an explicit transform built from x_scale, y_scale, x_offset and y_offset (in that order).
+STBIRDEF int stbir_resize_subpixel(const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context,
+                                   float x_scale, float y_scale,
+                                   float x_offset, float y_offset)
+{
+    float transform[4];
+    transform[0] = x_scale;
+    transform[1] = y_scale;
+    transform[2] = x_offset;
+    transform[3] = y_offset;
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        0,0,1,1,transform,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
+        edge_mode_horizontal, edge_mode_vertical, space);
+}
+// Like stbir_resize, but passes the caller's s0,t0,s1,t1 region instead of the full 0,0 to 1,1 region.
+STBIRDEF int stbir_resize_region(  const void *input_pixels , int input_w , int input_h , int input_stride_in_bytes,
+                                         void *output_pixels, int output_w, int output_h, int output_stride_in_bytes,
+                                   stbir_datatype datatype,
+                                   int num_channels, int alpha_channel, int flags,
+                                   stbir_edge edge_mode_horizontal, stbir_edge edge_mode_vertical, 
+                                   stbir_filter filter_horizontal,  stbir_filter filter_vertical,
+                                   stbir_colorspace space, void *alloc_context,
+                                   float s0, float t0, float s1, float t1)
+{
+    return stbir__resize_arbitrary(alloc_context, input_pixels, input_w, input_h, input_stride_in_bytes,
+        output_pixels, output_w, output_h, output_stride_in_bytes,
+        s0,t0,s1,t1,NULL,num_channels,alpha_channel,flags, datatype, filter_horizontal, filter_vertical,
+        edge_mode_horizontal, edge_mode_vertical, space);
+}
+
+#endif // STB_IMAGE_RESIZE_IMPLEMENTATION
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of 
+this software and associated documentation files (the "Software"), to deal in 
+the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
+of the Software, and to permit persons to whom the Software is furnished to do 
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
+software, either in source code form or as a compiled binary, for any purpose, 
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this 
+software dedicate any and all copyright interest in the software to the public 
+domain. We make this dedication for the benefit of the public at large and to 
+the detriment of our heirs and successors. We intend this dedication to be an 
+overt act of relinquishment in perpetuity of all present and future rights to 
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/3rdparty/bimg/3rdparty/stb/stb_image_write.h b/3rdparty/bimg/3rdparty/stb/stb_image_write.h
new file mode 100644
index 0000000..df62339
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/stb/stb_image_write.h
@@ -0,0 +1,1092 @@
+/* stb_image_write - v1.05 - public domain - http://nothings.org/stb/stb_image_write.h
+   writes out PNG/BMP/TGA images to C stdio - Sean Barrett 2010-2015
+                                     no warranty implied; use at your own risk
+
+   Before #including,
+
+       #define STB_IMAGE_WRITE_IMPLEMENTATION
+
+   in the file that you want to have the implementation.
+
+   Will probably not work correctly with strict-aliasing optimizations.
+
+ABOUT:
+
+   This header file is a library for writing images to C stdio. It could be
+   adapted to write to memory or a general streaming interface; let me know.
+
+   The PNG output is not optimal; it is 20-50% larger than the file
+   written by a decent optimizing implementation. This library is designed
+   for source code compactness and simplicity, not optimal image file size
+   or run-time performance.
+
+BUILDING:
+
+   You can #define STBIW_ASSERT(x) before the #include to avoid using assert.h.
+   You can #define STBIW_MALLOC(), STBIW_REALLOC(), and STBIW_FREE() to replace
+   malloc,realloc,free.
+   You can define STBIW_MEMMOVE() to replace memmove()
+
+USAGE:
+
+   There are four functions, one for each image file format:
+
+     int stbi_write_png(char const *filename, int w, int h, int comp, const void *data, int stride_in_bytes);
+     int stbi_write_bmp(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_tga(char const *filename, int w, int h, int comp, const void *data);
+     int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+
+   There are also four equivalent functions that use an arbitrary write function. You are
+   expected to open/close your file-equivalent before and after calling these:
+
+     int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+     int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+     int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+
+   where the callback is:
+      void stbi_write_func(void *context, void *data, int size);
+
+   You can define STBI_WRITE_NO_STDIO to disable the file variant of these
+   functions, so the library will not use stdio.h at all. However, this will
+   also disable HDR writing, because it requires stdio for formatted output.
+
+   Each function returns 0 on failure and non-0 on success.
+
+   The functions create an image file defined by the parameters. The image
+   is a rectangle of pixels stored from left-to-right, top-to-bottom.
+   Each pixel contains 'comp' channels of data stored interleaved with 8-bits
+   per channel, in the following order: 1=Y, 2=YA, 3=RGB, 4=RGBA. (Y is
+   monochrome color.) The rectangle is 'w' pixels wide and 'h' pixels tall.
+   The *data pointer points to the first byte of the top-left-most pixel.
+   For PNG, "stride_in_bytes" is the distance in bytes from the first byte of
+   a row of pixels to the first byte of the next row of pixels.
+
+   PNG creates output files with the same number of components as the input.
+   The BMP format expands Y to RGB in the file format and does not
+   output alpha.
+
+   PNG supports writing rectangles of data even when the bytes storing rows of
+   data are not consecutive in memory (e.g. sub-rectangles of a larger image),
+   by supplying the stride between the beginning of adjacent rows. The other
+   formats do not. (Thus you cannot write a native-format BMP through the BMP
+   writer, both because it is in BGR order and because it may have padding
+   at the end of the line.)
+
+   HDR expects linear float data. Since the format is always 32-bit rgb(e)
+   data, alpha (if provided) is discarded, and for monochrome data it is
+   replicated across all three channels.
+
+   TGA output is RLE-compressed by default. To write uncompressed (non-RLE)
+   data, set the global variable 'stbi_write_tga_with_rle' to 0.
+
+CREDITS:
+
+   PNG/BMP/TGA
+      Sean Barrett
+   HDR
+      Baldur Karlsson
+   TGA monochrome:
+      Jean-Sebastien Guay
+   misc enhancements:
+      Tim Kelsey
+   TGA RLE
+      Alan Hickman
+   initial file IO callback implementation
+      Emmanuel Julien
+   bugfixes:
+      github:Chribba
+      Guillaume Chereau
+      github:jry2
+      github:romigrou
+      Sergio Gonzalez
+      Jonas Karlsson
+      Filip Wasil
+      Thatcher Ulrich
+      github:poppolopoppo
+      Patrick Boettcher
+      
+LICENSE
+
+  See end of file for license information.
+
+*/
+
+#ifndef INCLUDE_STB_IMAGE_WRITE_H
+#define INCLUDE_STB_IMAGE_WRITE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef STB_IMAGE_WRITE_STATIC
+#define STBIWDEF static
+#else
+#define STBIWDEF extern
+extern int stbi_write_tga_with_rle;
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga(char const *filename, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr(char const *filename, int w, int h, int comp, const float *data);
+#endif
+
+typedef void stbi_write_func(void *context, void *data, int size);
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data, int stride_in_bytes);
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const void  *data);
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int w, int h, int comp, const float *data);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif//INCLUDE_STB_IMAGE_WRITE_H
+
+#ifdef STB_IMAGE_WRITE_IMPLEMENTATION
+
+#ifdef _WIN32
+   #ifndef _CRT_SECURE_NO_WARNINGS
+   #define _CRT_SECURE_NO_WARNINGS
+   #endif
+   #ifndef _CRT_NONSTDC_NO_DEPRECATE
+   #define _CRT_NONSTDC_NO_DEPRECATE
+   #endif
+#endif
+
+#ifndef STBI_WRITE_NO_STDIO
+#include <stdio.h>
+#endif // STBI_WRITE_NO_STDIO
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#if defined(STBIW_MALLOC) && defined(STBIW_FREE) && (defined(STBIW_REALLOC) || defined(STBIW_REALLOC_SIZED))
+// ok
+#elif !defined(STBIW_MALLOC) && !defined(STBIW_FREE) && !defined(STBIW_REALLOC) && !defined(STBIW_REALLOC_SIZED)
+// ok
+#else
+#error "Must define all or none of STBIW_MALLOC, STBIW_FREE, and STBIW_REALLOC (or STBIW_REALLOC_SIZED)."
+#endif
+
+#ifndef STBIW_MALLOC
+#define STBIW_MALLOC(sz)        malloc(sz)
+#define STBIW_REALLOC(p,newsz)  realloc(p,newsz)
+#define STBIW_FREE(p)           free(p)
+#endif
+
+#ifndef STBIW_REALLOC_SIZED
+#define STBIW_REALLOC_SIZED(p,oldsz,newsz) STBIW_REALLOC(p,newsz)
+#endif
+
+
+#ifndef STBIW_MEMMOVE
+#define STBIW_MEMMOVE(a,b,sz) memmove(a,b,sz)
+#endif
+
+
+#ifndef STBIW_ASSERT
+#include <assert.h>
+#define STBIW_ASSERT(x) assert(x)
+#endif
+
+#define STBIW_UCHAR(x) (unsigned char) ((x) & 0xff)
+
+typedef struct
+{
+   stbi_write_func *func; // output callback; every chunk of encoded bytes is handed to it
+   void *context; // opaque pointer passed back as the first argument of func
+} stbi__write_context;
+
+// bind a write context to a user callback: all later output is sent as c(context, data, size)
+static void stbi__start_write_callbacks(stbi__write_context *s, stbi_write_func *c, void *context)
+{
+   s->context = context;
+   s->func    = c;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+
+static void stbi__stdio_write(void *context, void *data, int size) // stbi_write_func adapter: context is the FILE* to write to
+{
+   fwrite(data,1,size,(FILE*) context); // the return value is not checked, so short writes go unreported
+}
+
+static int stbi__start_write_file(stbi__write_context *s, const char *filename) // returns non-zero iff the file could be opened
+{
+   FILE *f = fopen(filename, "wb");
+   stbi__start_write_callbacks(s, stbi__stdio_write, (void *) f); // the context is set even when f is NULL; callers must check the return value
+   return f != NULL;
+}
+
+static void stbi__end_write_file(stbi__write_context *s) // closes the FILE* stored in s->context
+{
+   fclose((FILE *)s->context); // the return value is not checked
+}
+
+#endif // !STBI_WRITE_NO_STDIO
+
+typedef unsigned int stbiw_uint32;
+typedef int stb_image_write_test[sizeof(stbiw_uint32)==4 ? 1 : -1]; // compile-time check: the array size is negative unless unsigned int is 4 bytes
+
+#ifdef STB_IMAGE_WRITE_STATIC
+static int stbi_write_tga_with_rle = 1;
+#else
+int stbi_write_tga_with_rle = 1;
+#endif
+
+static void stbiw__writefv(stbi__write_context *s, const char *fmt, va_list v) // emit one integer argument per format character, least-significant byte first
+{
+   while (*fmt) {
+      switch (*fmt++) {
+         case ' ': break; // spaces are only visual separators in the format string
+         case '1': { unsigned char x = STBIW_UCHAR(va_arg(v, int)); // one byte
+                     s->func(s->context,&x,1);
+                     break; }
+         case '2': { int x = va_arg(v,int); // two bytes, low byte first
+                     unsigned char b[2];
+                     b[0] = STBIW_UCHAR(x);
+                     b[1] = STBIW_UCHAR(x>>8);
+                     s->func(s->context,b,2);
+                     break; }
+         case '4': { stbiw_uint32 x = va_arg(v,int); // four bytes, low byte first
+                     unsigned char b[4];
+                     b[0]=STBIW_UCHAR(x);
+                     b[1]=STBIW_UCHAR(x>>8);
+                     b[2]=STBIW_UCHAR(x>>16);
+                     b[3]=STBIW_UCHAR(x>>24);
+                     s->func(s->context,b,4);
+                     break; }
+         default: // any other format character is a programming error
+            STBIW_ASSERT(0);
+            return;
+      }
+   }
+}
+
+static void stbiw__writef(stbi__write_context *s, const char *fmt, ...) // variadic front end for stbiw__writefv
+{
+   va_list v;
+   va_start(v, fmt);
+   stbiw__writefv(s, fmt, v);
+   va_end(v);
+}
+
+static void stbiw__write3(stbi__write_context *s, unsigned char a, unsigned char b, unsigned char c)
+{
+   unsigned char triple[3]; // a, b, c are emitted in that order with a single callback invocation
+   triple[0] = a; triple[1] = b; triple[2] = c;
+   s->func(s->context, triple, 3);
+}
+
+static void stbiw__write_pixel(stbi__write_context *s, int rgb_dir, int comp, int write_alpha, int expand_mono, unsigned char *d)
+{
+   unsigned char bg[3] = { 255, 0, 255}, px[3];
+   int k;
+
+   if (write_alpha < 0) // negative: the last channel is written before the color data
+      s->func(s->context, &d[comp - 1], 1);
+
+   switch (comp) {
+      case 2: // 2 channels = mono + alpha, alpha is written separately, so same as 1-channel case
+      case 1:
+         if (expand_mono)
+            stbiw__write3(s, d[0], d[0], d[0]); // monochrome bmp
+         else
+            s->func(s->context, d, 1);  // monochrome TGA
+         break;
+      case 4:
+         if (!write_alpha) {
+            // composite against pink background
+            for (k = 0; k < 3; ++k)
+               px[k] = bg[k] + ((d[k] - bg[k]) * d[3]) / 255;
+            stbiw__write3(s, px[1 - rgb_dir], px[1], px[1 + rgb_dir]);
+            break;
+         }
+         /* FALLTHROUGH */
+      case 3:
+         stbiw__write3(s, d[1 - rgb_dir], d[1], d[1 + rgb_dir]); // rgb_dir = -1 emits d[2],d[1],d[0]; +1 emits d[0],d[1],d[2]
+         break;
+   }
+   if (write_alpha > 0) // positive: the last channel is written after the color data
+      s->func(s->context, &d[comp - 1], 1);
+}
+
+static void stbiw__write_pixels(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, void *data, int write_alpha, int scanline_pad, int expand_mono)
+{
+   stbiw_uint32 zero = 0;
+   int i,j, j_end;
+
+   if (y <= 0)
+      return;
+
+   if (vdir < 0) // negative vdir: start at the last row and walk towards row 0
+      j_end = -1, j = y-1;
+   else
+      j_end =  y, j = 0;
+
+   for (; j != j_end; j += vdir) {
+      for (i=0; i < x; ++i) {
+         unsigned char *d = (unsigned char *) data + (j*x+i)*comp; // rows are addressed as tightly packed: x*comp bytes each
+         stbiw__write_pixel(s, rgb_dir, comp, write_alpha, expand_mono, d);
+      }
+      s->func(s->context, &zero, scanline_pad); // padding bytes are read from the 4-byte 'zero', so scanline_pad must not exceed 4
+   }
+}
+
+static int stbiw__outfile(stbi__write_context *s, int rgb_dir, int vdir, int x, int y, int comp, int expand_mono, void *data, int alpha, int pad, const char *fmt, ...)
+{
+   va_list args;
+
+   if (y < 0 || x < 0) // negative dimensions: report failure before emitting any byte
+      return 0;
+
+   va_start(args, fmt); // the header fields described by fmt are written first ...
+   stbiw__writefv(s, fmt, args);
+   va_end(args);
+   stbiw__write_pixels(s,rgb_dir,vdir,x,y,comp,data,alpha,pad, expand_mono); // ... followed by the pixel rows
+   return 1;
+}
+
+static int stbi_write_bmp_core(stbi__write_context *s, int x, int y, int comp, const void *data) // writes a 24-bit BMP; rows go out last-to-first (vdir = -1)
+{
+   int pad = (-x*3) & 3; // bytes needed to round each 3-byte-per-pixel row up to a multiple of 4
+   return stbiw__outfile(s,-1,-1,x,y,comp,1,(void *) data,0,pad,
+           "11 4 22 4" "4 44 22 444444",
+           'B', 'M', 14+40+(x*3+pad)*y, 0,0, 14+40,  // file header
+            40, x,y, 1,24, 0,0,0,0,0,0);             // bitmap header
+}
+
+STBIWDEF int stbi_write_bmp_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) // BMP encoder writing through a caller-supplied callback
+{
+   stbi__write_context s;
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_bmp_core(&s, x, y, comp, data); // 0 on failure, non-zero on success
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_bmp(char const *filename, int x, int y, int comp, const void *data)
+{
+   stbi__write_context s;
+   int result; // 0 if the file cannot be opened, otherwise whatever the BMP encoder reports
+   if (!stbi__start_write_file(&s,filename))
+      return 0;
+   result = stbi_write_bmp_core(&s, x, y, comp, data);
+   stbi__end_write_file(&s);
+   return result;
+}
+#endif //!STBI_WRITE_NO_STDIO
+
+static int stbi_write_tga_core(stbi__write_context *s, int x, int y, int comp, void *data) // writes a TGA, RLE-packed unless stbi_write_tga_with_rle is 0
+{
+   int has_alpha = (comp == 2 || comp == 4);
+   int colorbytes = has_alpha ? comp-1 : comp;
+   int format = colorbytes < 2 ? 3 : 2; // 3 color channels (RGB/RGBA) = 2, 1 color channel (Y/YA) = 3
+
+   if (y < 0 || x < 0)
+      return 0;
+
+   if (!stbi_write_tga_with_rle) {
+      return stbiw__outfile(s, -1, -1, x, y, comp, 0, (void *) data, has_alpha, 0,
+         "111 221 2222 11", 0, 0, format, 0, 0, 0, 0, 0, x, y, (colorbytes + has_alpha) * 8, has_alpha * 8);
+   } else {
+      int i,j,k;
+
+      stbiw__writef(s, "111 221 2222 11", 0,0,format+8, 0,0,0, 0,0,x,y, (colorbytes + has_alpha) * 8, has_alpha * 8); // same header as above, but the type field is format+8 in this RLE branch
+
+      for (j = y - 1; j >= 0; --j) { // rows are emitted last-to-first
+          unsigned char *row = (unsigned char *) data + j * x * comp;
+         int len;
+
+         for (i = 0; i < x; i += len) {
+            unsigned char *begin = row + i * comp;
+            int diff = 1; // non-zero: packet of differing pixels; zero: run of one repeated pixel
+            len = 1;
+
+            if (i < x - 1) {
+               ++len;
+               diff = memcmp(begin, row + (i + 1) * comp, comp);
+               if (diff) {
+                  const unsigned char *prev = begin;
+                  for (k = i + 2; k < x && len < 128; ++k) { // extend while each pixel differs from the one two positions before the candidate
+                     if (memcmp(prev, row + k * comp, comp)) {
+                        prev += comp;
+                        ++len;
+                     } else {
+                        --len;
+                        break;
+                     }
+                  }
+               } else {
+                  for (k = i + 2; k < x && len < 128; ++k) { // extend the run while pixels equal the first one, up to 128
+                     if (!memcmp(begin, row + k * comp, comp)) {
+                        ++len;
+                     } else {
+                        break;
+                     }
+                  }
+               }
+            }
+
+            if (diff) {
+               unsigned char header = STBIW_UCHAR(len - 1); // raw packet: count-1, followed by len pixels
+               s->func(s->context, &header, 1);
+               for (k = 0; k < len; ++k) {
+                  stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin + k * comp);
+               }
+            } else {
+               unsigned char header = STBIW_UCHAR(len - 129); // run packet: as a byte this is 128 + (len-1), followed by one pixel
+               s->func(s->context, &header, 1);
+               stbiw__write_pixel(s, -1, comp, has_alpha, 0, begin);
+            }
+         }
+      }
+   }
+   return 1;
+}
+
+STBIWDEF int stbi_write_tga_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data) // STBIWDEF: same storage class as the prototype and as stbi_write_bmp_to_func
+{
+   stbi__write_context s; // all output is routed through the caller's callback
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_tga_core(&s, x, y, comp, (void *) data); // 0 on failure, non-zero on success
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_tga(char const *filename, int x, int y, int comp, const void *data) // STBIWDEF: same storage class as the prototype and as stbi_write_bmp
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_tga_core(&s, x, y, comp, (void *) data);
+      stbi__end_write_file(&s); // the file is closed whether or not encoding succeeded
+      return r;
+   } else
+      return 0; // the file could not be opened
+}
+#endif
+
+// *************************************************************************************************
+// Radiance RGBE HDR writer
+// by Baldur Karlsson
+
+#define stbiw__max(a, b)  ((a) > (b) ? (a) : (b))
+
+static void stbiw__linear_to_rgbe(unsigned char *rgbe, float *linear) // file-local helper: pack linear[0..2] into 3 mantissa bytes plus a shared exponent byte
+{
+   int exponent;
+   float maxcomp = stbiw__max(linear[0], stbiw__max(linear[1], linear[2]));
+
+   if (maxcomp < 1e-32f) { // too dark to represent: emit all zeros
+      rgbe[0] = rgbe[1] = rgbe[2] = rgbe[3] = 0;
+   } else {
+      float normalize = (float) frexp(maxcomp, &exponent) * 256.0f/maxcomp; // scale that maps the largest component into [128, 256)
+
+      rgbe[0] = (unsigned char)(linear[0] * normalize);
+      rgbe[1] = (unsigned char)(linear[1] * normalize);
+      rgbe[2] = (unsigned char)(linear[2] * normalize);
+      rgbe[3] = (unsigned char)(exponent + 128); // exponent is stored with a bias of 128
+   }
+}
+
+static void stbiw__write_run_data(stbi__write_context *s, int length, unsigned char databyte) // file-local helper: emit a run packet (count byte = length+128, then the repeated byte)
+{
+   unsigned char lengthbyte = STBIW_UCHAR(length+128);
+   STBIW_ASSERT(length+128 <= 255); // the count must fit in one byte, so length may not exceed 127
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, &databyte, 1);
+}
+
+static void stbiw__write_dump_data(stbi__write_context *s, int length, unsigned char *data) // file-local helper: emit a literal packet (count byte, then 'length' raw bytes)
+{
+   unsigned char lengthbyte = STBIW_UCHAR(length);
+   STBIW_ASSERT(length <= 128); // inconsistent with spec but consistent with official code
+   s->func(s->context, &lengthbyte, 1);
+   s->func(s->context, data, length);
+}
+
+static void stbiw__write_hdr_scanline(stbi__write_context *s, int width, int ncomp, unsigned char *scratch, float *scanline) // file-local helper: encode one row; scratch must hold width*4 bytes
+{
+   unsigned char scanlineheader[4] = { 2, 2, 0, 0 };
+   unsigned char rgbe[4];
+   float linear[3];
+   int x;
+
+   scanlineheader[2] = (width&0xff00)>>8; // width, high byte first
+   scanlineheader[3] = (width&0x00ff);
+
+   /* skip RLE for images too small or large */
+   if (width < 8 || width >= 32768) {
+      for (x=0; x < width; x++) {
+         switch (ncomp) {
+            case 4: /* fallthrough */
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
+                    break;
+            default: // 1 or 2 channels: replicate the first channel into R, G and B
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
+                    break;
+         }
+         stbiw__linear_to_rgbe(rgbe, linear);
+         s->func(s->context, rgbe, 4);
+      }
+   } else {
+      int c,r;
+      /* encode into scratch buffer */
+      for (x=0; x < width; x++) {
+         switch(ncomp) {
+            case 4: /* fallthrough */
+            case 3: linear[2] = scanline[x*ncomp + 2];
+                    linear[1] = scanline[x*ncomp + 1];
+                    linear[0] = scanline[x*ncomp + 0];
+                    break;
+            default: // 1 or 2 channels: replicate the first channel into R, G and B
+                    linear[0] = linear[1] = linear[2] = scanline[x*ncomp + 0];
+                    break;
+         }
+         stbiw__linear_to_rgbe(rgbe, linear);
+         scratch[x + width*0] = rgbe[0]; // planar layout: all R bytes, then all G, all B, all E
+         scratch[x + width*1] = rgbe[1];
+         scratch[x + width*2] = rgbe[2];
+         scratch[x + width*3] = rgbe[3];
+      }
+
+      s->func(s->context, scanlineheader, 4);
+
+      /* RLE each component separately */
+      for (c=0; c < 4; c++) {
+         unsigned char *comp = &scratch[width*c];
+
+         x = 0;
+         while (x < width) {
+            // find first run
+            r = x;
+            while (r+2 < width) {
+               if (comp[r] == comp[r+1] && comp[r] == comp[r+2])
+                  break;
+               ++r;
+            }
+            if (r+2 >= width)
+               r = width;
+            // dump up to first run
+            while (x < r) {
+               int len = r-x;
+               if (len > 128) len = 128;
+               stbiw__write_dump_data(s, len, &comp[x]);
+               x += len;
+            }
+            // if there's a run, output it
+            if (r+2 < width) { // same test as what we break out of in search loop, so only true if we break'd
+               // find next byte after run
+               while (r < width && comp[r] == comp[x])
+                  ++r;
+               // output run up to r
+               while (x < r) {
+                  int len = r-x;
+                  if (len > 127) len = 127;
+                  stbiw__write_run_data(s, len, comp[x]);
+                  x += len;
+               }
+            }
+         }
+      }
+   }
+}
+
+static int stbi_write_hdr_core(stbi__write_context *s, int x, int y, int comp, float *data) // returns 0 on bad arguments or allocation failure, 1 on success
+{
+   if (y <= 0 || x <= 0 || data == NULL)
+      return 0;
+   else {
+      // Each component is stored separately. Allocate scratch space for full output scanline.
+      unsigned char *scratch = (unsigned char *) STBIW_MALLOC(x*4);
+      int i, len;
+      char buffer[128];
+      char header[] = "#?RADIANCE\n# Written by stb_image_write.h\nFORMAT=32-bit_rle_rgbe\n";
+      if (scratch == NULL) return 0; // fail before emitting anything instead of dereferencing NULL in the scanline encoder
+      s->func(s->context, header, sizeof(header)-1);
+      len = sprintf(buffer, "EXPOSURE=          1.0000000000000\n\n-Y %d +X %d\n", y, x);
+      s->func(s->context, buffer, len);
+
+      for(i=0; i < y; i++) // rows are written first-to-last, matching the "-Y" in the header line
+         stbiw__write_hdr_scanline(s, x, comp, scratch, data + comp*i*x);
+      STBIW_FREE(scratch);
+      return 1;
+   }
+}
+
+STBIWDEF int stbi_write_hdr_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const float *data) // STBIWDEF: same storage class as the prototype and as stbi_write_bmp_to_func
+{
+   stbi__write_context s; // all output is routed through the caller's callback
+   stbi__start_write_callbacks(&s, func, context);
+   return stbi_write_hdr_core(&s, x, y, comp, (float *) data); // 0 on failure, non-zero on success
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_hdr(char const *filename, int x, int y, int comp, const float *data) // STBIWDEF: same storage class as the prototype and as stbi_write_bmp
+{
+   stbi__write_context s;
+   if (stbi__start_write_file(&s,filename)) {
+      int r = stbi_write_hdr_core(&s, x, y, comp, (float *) data);
+      stbi__end_write_file(&s); // the file is closed whether or not encoding succeeded
+      return r;
+   } else
+      return 0; // the file could not be opened
+}
+#endif // STBI_WRITE_NO_STDIO
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// PNG writer
+//
+
+// stretchy buffer; stbiw__sbpush() == vector<>::push_back() -- stbiw__sbcount() == vector<>::size()
+#define stbiw__sbraw(a) ((int *) (a) - 2) // start of the allocation: two ints of bookkeeping precede the elements
+#define stbiw__sbm(a)   stbiw__sbraw(a)[0] // capacity, in elements
+#define stbiw__sbn(a)   stbiw__sbraw(a)[1] // number of elements in use
+
+#define stbiw__sbneedgrow(a,n)  ((a)==0 || stbiw__sbn(a)+(n) >= stbiw__sbm(a))
+#define stbiw__sbmaybegrow(a,n) (stbiw__sbneedgrow(a,(n)) ? stbiw__sbgrow(a,n) : 0)
+#define stbiw__sbgrow(a,n)  stbiw__sbgrowf((void **) &(a), (n), sizeof(*(a)))
+
+#define stbiw__sbpush(a, v)      (stbiw__sbmaybegrow(a,1), (a)[stbiw__sbn(a)++] = (v))
+#define stbiw__sbcount(a)        ((a) ? stbiw__sbn(a) : 0)
+#define stbiw__sbfree(a)         ((a) ? STBIW_FREE(stbiw__sbraw(a)),0 : 0)
+
+static void *stbiw__sbgrowf(void **arr, int increment, int itemsize) // grow (or create) a stretchy buffer; *arr is updated in place
+{
+   int m = *arr ? 2*stbiw__sbm(*arr)+increment : increment+1; // new capacity: twice the old one plus the increment, or increment+1 for a new buffer
+   void *p = STBIW_REALLOC_SIZED(*arr ? stbiw__sbraw(*arr) : 0, *arr ? (stbiw__sbm(*arr)*itemsize + sizeof(int)*2) : 0, itemsize * m + sizeof(int)*2);
+   STBIW_ASSERT(p);
+   if (p) {
+      if (!*arr) ((int *) p)[1] = 0; // new buffer: the element count starts at zero
+      *arr = (void *) ((int *) p + 2); // the user pointer sits just past the two-int header
+      stbiw__sbm(*arr) = m;
+   }
+   return *arr; // if the allocation failed (and the assert is compiled out) *arr is returned unchanged
+}
+
+static unsigned char *stbiw__zlib_flushf(unsigned char *data, unsigned int *bitbuffer, int *bitcount) // move whole bytes from the bit accumulator into the output buffer
+{
+   while (*bitcount >= 8) {
+      stbiw__sbpush(data, STBIW_UCHAR(*bitbuffer)); // the low 8 bits go out first
+      *bitbuffer >>= 8;
+      *bitcount -= 8;
+   }
+   return data; // the push may have reallocated the buffer, so callers must keep the returned pointer
+}
+
+static int stbiw__zlib_bitrev(int code, int codebits)
+{
+   int reversed = 0; // the low 'codebits' bits of code, in mirrored order
+   for (; codebits != 0; --codebits) {
+      reversed = (reversed << 1) | (code & 1);
+      code >>= 1;
+   }
+   return reversed;
+}
+
+static unsigned int stbiw__zlib_countm(unsigned char *a, unsigned char *b, int limit)
+{
+   int n = 0; // length of the common prefix of a and b, capped at min(limit, 258)
+   while (n < limit && n < 258 && a[n] == b[n])
+      ++n;
+   return n;
+}
+
+static unsigned int stbiw__zhash(unsigned char *data) // hash of the three bytes data[0..2]
+{
+   stbiw_uint32 hash = data[0] + (data[1] << 8) + (data[2] << 16); // pack the three bytes into one word
+   hash ^= hash << 3; // the remaining steps mix the bits of that word
+   hash += hash >> 5;
+   hash ^= hash << 4;
+   hash += hash >> 17;
+   hash ^= hash << 25;
+   hash += hash >> 6;
+   return hash;
+}
+
+#define stbiw__zlib_flush() (out = stbiw__zlib_flushf(out, &bitbuf, &bitcount)) // relies on locals named out, bitbuf and bitcount at the expansion site
+#define stbiw__zlib_add(code,codebits) \
+      (bitbuf |= (code) << bitcount, bitcount += (codebits), stbiw__zlib_flush())
+#define stbiw__zlib_huffa(b,c)  stbiw__zlib_add(stbiw__zlib_bitrev(b,c),c) // codes are bit-reversed before being appended
+// default huffman tables
+#define stbiw__zlib_huff1(n)  stbiw__zlib_huffa(0x30 + (n), 8) // symbols 0..143: 8-bit codes starting at 0x30
+#define stbiw__zlib_huff2(n)  stbiw__zlib_huffa(0x190 + (n)-144, 9) // symbols 144..255: 9-bit codes starting at 0x190
+#define stbiw__zlib_huff3(n)  stbiw__zlib_huffa(0 + (n)-256,7) // symbols 256..279: 7-bit codes starting at 0
+#define stbiw__zlib_huff4(n)  stbiw__zlib_huffa(0xc0 + (n)-280,8) // symbols 280 and up: 8-bit codes starting at 0xc0
+#define stbiw__zlib_huff(n)  ((n) <= 143 ? stbiw__zlib_huff1(n) : (n) <= 255 ? stbiw__zlib_huff2(n) : (n) <= 279 ? stbiw__zlib_huff3(n) : stbiw__zlib_huff4(n))
+#define stbiw__zlib_huffb(n) ((n) <= 143 ? stbiw__zlib_huff1(n) : stbiw__zlib_huff2(n))
+
+#define stbiw__ZHASH   16384 // hash bucket count; used as a mask (stbiw__ZHASH-1), so it must stay a power of two
+
+unsigned char * stbi_zlib_compress(unsigned char *data, int data_len, int *out_len, int quality) // returns a heap buffer holding the zlib stream (length in *out_len), or NULL if the hash table cannot be allocated
+{
+   static unsigned short lengthc[] = { 3,4,5,6,7,8,9,10,11,13,15,17,19,23,27,31,35,43,51,59,67,83,99,115,131,163,195,227,258, 259 };
+   static unsigned char  lengtheb[]= { 0,0,0,0,0,0,0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,  4,  5,  5,  5,  5,  0 };
+   static unsigned short distc[]   = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577, 32768 };
+   static unsigned char  disteb[]  = { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 };
+   unsigned int bitbuf=0;
+   int i,j, bitcount=0;
+   unsigned char *out = NULL;
+   unsigned char ***hash_table = (unsigned char***) STBIW_MALLOC(stbiw__ZHASH * sizeof(char**));
+   if (hash_table == NULL) return NULL; // nothing has been allocated for 'out' yet, so there is nothing to release
+   if (quality < 5) quality = 5;
+   stbiw__sbpush(out, 0x78);   // DEFLATE 32K window
+   stbiw__sbpush(out, 0x5e);   // FLEVEL = 1
+   stbiw__zlib_add(1,1);  // BFINAL = 1
+   stbiw__zlib_add(1,2);  // BTYPE = 1 -- fixed huffman
+
+   for (i=0; i < stbiw__ZHASH; ++i)
+      hash_table[i] = NULL;
+
+   i=0;
+   while (i < data_len-3) {
+      // hash next 3 bytes of data to be compressed
+      int h = stbiw__zhash(data+i)&(stbiw__ZHASH-1), best=3;
+      unsigned char *bestloc = 0;
+      unsigned char **hlist = hash_table[h];
+      int n = stbiw__sbcount(hlist);
+      for (j=0; j < n; ++j) {
+         if (hlist[j]-data > i-32768) { // if entry lies within window
+            int d = stbiw__zlib_countm(hlist[j], data+i, data_len-i);
+            if (d >= best) best=d,bestloc=hlist[j];
+         }
+      }
+      // when hash table entry is too long, delete half the entries
+      if (hash_table[h] && stbiw__sbn(hash_table[h]) == 2*quality) {
+         STBIW_MEMMOVE(hash_table[h], hash_table[h]+quality, sizeof(hash_table[h][0])*quality);
+         stbiw__sbn(hash_table[h]) = quality;
+      }
+      stbiw__sbpush(hash_table[h],data+i);
+
+      if (bestloc) {
+         // "lazy matching" - check match at *next* byte, and if it's better, do cur byte as literal
+         h = stbiw__zhash(data+i+1)&(stbiw__ZHASH-1);
+         hlist = hash_table[h];
+         n = stbiw__sbcount(hlist);
+         for (j=0; j < n; ++j) {
+            if (hlist[j]-data > i-32767) {
+               int e = stbiw__zlib_countm(hlist[j], data+i+1, data_len-i-1);
+               if (e > best) { // if next match is better, bail on current match
+                  bestloc = NULL;
+                  break;
+               }
+            }
+         }
+      }
+
+      if (bestloc) {
+         int d = (int) (data+i - bestloc); // distance back
+         STBIW_ASSERT(d <= 32767 && best <= 258);
+         for (j=0; best > lengthc[j+1]-1; ++j);
+         stbiw__zlib_huff(j+257);
+         if (lengtheb[j]) stbiw__zlib_add(best - lengthc[j], lengtheb[j]);
+         for (j=0; d > distc[j+1]-1; ++j);
+         stbiw__zlib_add(stbiw__zlib_bitrev(j,5),5);
+         if (disteb[j]) stbiw__zlib_add(d - distc[j], disteb[j]);
+         i += best;
+      } else {
+         stbiw__zlib_huffb(data[i]);
+         ++i;
+      }
+   }
+   // write out final bytes
+   for (;i < data_len; ++i)
+      stbiw__zlib_huffb(data[i]);
+   stbiw__zlib_huff(256); // end of block
+   // pad with 0 bits to byte boundary
+   while (bitcount)
+      stbiw__zlib_add(0,1);
+
+   for (i=0; i < stbiw__ZHASH; ++i)
+      (void) stbiw__sbfree(hash_table[i]);
+   STBIW_FREE(hash_table);
+
+   {
+      // compute adler32 on input
+      unsigned int s1=1, s2=0;
+      int blocklen = (int) (data_len % 5552);
+      j=0;
+      while (j < data_len) {
+         for (i=0; i < blocklen; ++i) s1 += data[j+i], s2 += s1;
+         s1 %= 65521, s2 %= 65521;
+         j += blocklen;
+         blocklen = 5552;
+      }
+      stbiw__sbpush(out, STBIW_UCHAR(s2 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s2));
+      stbiw__sbpush(out, STBIW_UCHAR(s1 >> 8));
+      stbiw__sbpush(out, STBIW_UCHAR(s1));
+   }
+   *out_len = stbiw__sbn(out);
+   // make returned pointer freeable
+   STBIW_MEMMOVE(stbiw__sbraw(out), out, *out_len);
+   return (unsigned char *) stbiw__sbraw(out);
+}
+
+static unsigned int stbiw__crc32(unsigned char *buffer, int len) // table-driven CRC-32 of buffer[0..len-1]; returns 0 for len == 0
+{
+   static const unsigned int crc_table[256] = // read-only lookup table, one entry per byte value
+   {
+      0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA, 0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+      0x0eDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988, 0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+      0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE, 0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+      0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC, 0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+      0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172, 0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+      0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940, 0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+      0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116, 0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+      0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924, 0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+      0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A, 0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+      0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818, 0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+      0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E, 0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+      0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C, 0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+      0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2, 0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+      0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0, 0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+      0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086, 0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+      0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4, 0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+      0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A, 0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+      0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8, 0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+      0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE, 0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+      0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC, 0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+      0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252, 0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+      0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60, 0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+      0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236, 0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+      0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04, 0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+      0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A, 0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+      0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38, 0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+      0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E, 0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+      0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C, 0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+      0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2, 0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+      0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0, 0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+      0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6, 0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+      0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94, 0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+   };
+
+   unsigned int crc = ~0u; // start from all ones; the final value is inverted again below
+   int i;
+   for (i=0; i < len; ++i)
+      crc = (crc >> 8) ^ crc_table[buffer[i] ^ (crc & 0xff)]; // consume one input byte per step
+   return ~crc;
+}
+
+#define stbiw__wpng4(o,a,b,c,d) ((o)[0]=STBIW_UCHAR(a),(o)[1]=STBIW_UCHAR(b),(o)[2]=STBIW_UCHAR(c),(o)[3]=STBIW_UCHAR(d),(o)+=4) // store 4 bytes at o, then advance o past them
+#define stbiw__wp32(data,v) stbiw__wpng4(data, (v)>>24,(v)>>16,(v)>>8,(v)) // big-endian 32-bit store; evaluates v four times, so pass a plain variable
+#define stbiw__wptag(data,s) stbiw__wpng4(data, s[0],s[1],s[2],s[3]) // store the first four characters of s
+
+static void stbiw__wpcrc(unsigned char **data, int len)
+{  // checksum the 4 tag bytes plus the len payload bytes that end at *data, then append the CRC and advance *data
+   unsigned int checksum = stbiw__crc32(*data - (len + 4), len + 4);
+   stbiw__wp32(*data, checksum);
+}
+
+static unsigned char stbiw__paeth(int a, int b, int c)
+{  // Paeth predictor: returns whichever of a, b, c is nearest to a+b-c; ties prefer a, then b
+   int guess = a + b - c;
+   int da = abs(guess-a), db = abs(guess-b), dc = abs(guess-c);
+   if (da <= db && da <= dc) return STBIW_UCHAR(a);
+   return (db <= dc) ? STBIW_UCHAR(b) : STBIW_UCHAR(c);
+}
+
+// @OPTIMIZE: provide an option that always forces left-predict or paeth predict
+unsigned char *stbi_write_png_to_mem(unsigned char *pixels, int stride_bytes, int x, int y, int n, int *out_len)
+{  // returns a STBIW_MALLOC'd PNG image (byte count stored in *out_len) that the caller frees, or 0 on failure
+   int ctype[5] = { -1, 0, 4, 2, 6 }; // value written to the IHDR color-type byte, indexed by channel count n (1..4)
+   unsigned char sig[8] = { 137,80,78,71,13,10,26,10 };
+   unsigned char *out,*o, *filt, *zlib;
+   signed char *line_buffer;
+   int i,j,k,p,zlen;
+
+   if (n < 1 || n > 4) return 0; // ctype[] has no usable entry for any other channel count
+   if (stride_bytes == 0) stride_bytes = x * n; // 0 means rows are tightly packed
+
+   filt = (unsigned char *) STBIW_MALLOC((x*n+1) * y); if (!filt) return 0;
+   line_buffer = (signed char *) STBIW_MALLOC(x * n); if (!line_buffer) { STBIW_FREE(filt); return 0; }
+   for (j=0; j < y; ++j) {
+      static const int mapping[] = { 0,1,2,3,4 };
+      static const int firstmap[] = { 0,1,0,5,6 }; // row 0 has no row above it: use cases that never read z[i-stride_bytes]
+      const int *mymap = (j != 0) ? mapping : firstmap;
+      int best = 0, bestval = 0x7fffffff;
+      for (p=0; p < 2; ++p) { // pass 0 scores all five filters; pass 1 re-runs only the winner so line_buffer holds its output
+         for (k= p?best:0; k < 5; ++k) { // @TODO: clarity: rewrite this to go 0..5, and 'continue' the unwanted ones during 2nd pass
+            int type = mymap[k],est=0;
+            unsigned char *z = pixels + stride_bytes*j;
+            for (i=0; i < n; ++i)
+               switch (type) {
+                  case 0: line_buffer[i] = z[i]; break;
+                  case 1: line_buffer[i] = z[i]; break;
+                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
+                  case 3: line_buffer[i] = z[i] - (z[i-stride_bytes]>>1); break;
+                  case 4: line_buffer[i] = (signed char) (z[i] - stbiw__paeth(0,z[i-stride_bytes],0)); break;
+                  case 5: line_buffer[i] = z[i]; break;
+                  case 6: line_buffer[i] = z[i]; break;
+               }
+            for (i=n; i < x*n; ++i) {
+               switch (type) {
+                  case 0: line_buffer[i] = z[i]; break;
+                  case 1: line_buffer[i] = z[i] - z[i-n]; break;
+                  case 2: line_buffer[i] = z[i] - z[i-stride_bytes]; break;
+                  case 3: line_buffer[i] = z[i] - ((z[i-n] + z[i-stride_bytes])>>1); break;
+                  case 4: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], z[i-stride_bytes], z[i-stride_bytes-n]); break;
+                  case 5: line_buffer[i] = z[i] - (z[i-n]>>1); break;
+                  case 6: line_buffer[i] = z[i] - stbiw__paeth(z[i-n], 0,0); break;
+               }
+            }
+            if (p) break;
+            for (i=0; i < x*n; ++i)
+               est += abs((signed char) line_buffer[i]);
+            if (est < bestval) { bestval = est; best = k; } // smallest sum of absolute filtered bytes wins
+         }
+      }
+      // when we get here, best contains the filter type, and line_buffer contains the data
+      filt[j*(x*n+1)] = (unsigned char) best;
+      STBIW_MEMMOVE(filt+j*(x*n+1)+1, line_buffer, x*n);
+   }
+   STBIW_FREE(line_buffer);
+   zlib = stbi_zlib_compress(filt, y*( x*n+1), &zlen, 8); // increase 8 to get smaller but use more memory
+   STBIW_FREE(filt);
+   if (!zlib) return 0;
+
+   // each tag requires 12 bytes of overhead
+   out = (unsigned char *) STBIW_MALLOC(8 + 12+13 + 12+zlen + 12);
+   if (!out) { STBIW_FREE(zlib); return 0; } // release the compressed stream instead of leaking it
+   *out_len = 8 + 12+13 + 12+zlen + 12;
+
+   o=out;
+   STBIW_MEMMOVE(o,sig,8); o+= 8;
+   stbiw__wp32(o, 13); // header length
+   stbiw__wptag(o, "IHDR");
+   stbiw__wp32(o, x);
+   stbiw__wp32(o, y);
+   *o++ = 8;
+   *o++ = STBIW_UCHAR(ctype[n]);
+   *o++ = 0;
+   *o++ = 0;
+   *o++ = 0;
+   stbiw__wpcrc(&o,13);
+
+   stbiw__wp32(o, zlen);
+   stbiw__wptag(o, "IDAT");
+   STBIW_MEMMOVE(o, zlib, zlen);
+   o += zlen;
+   STBIW_FREE(zlib);
+   stbiw__wpcrc(&o, zlen);
+
+   stbiw__wp32(o,0);
+   stbiw__wptag(o, "IEND");
+   stbiw__wpcrc(&o,0);
+
+   STBIW_ASSERT(o == out + *out_len);
+
+   return out;
+}
+
+#ifndef STBI_WRITE_NO_STDIO
+STBIWDEF int stbi_write_png(char const *filename, int x, int y, int comp, const void *data, int stride_bytes)
+{  // encodes the image in memory, then writes it to `filename`; returns 1 on success, 0 on any failure
+   FILE *f;
+   int len, ok;
+   unsigned char *png = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &len);
+   if (png == NULL) return 0;
+   f = fopen(filename, "wb");
+   if (!f) { STBIW_FREE(png); return 0; }
+   ok = fwrite(png, 1, (size_t) len, f) == (size_t) len; // a short write leaves a truncated PNG on disk
+   if (fclose(f) != 0) ok = 0; // fclose flushes buffered bytes, so its failure also means lost data
+   STBIW_FREE(png);
+   return ok;
+}
+#endif
+
+STBIWDEF int stbi_write_png_to_func(stbi_write_func *func, void *context, int x, int y, int comp, const void *data, int stride_bytes)
+{  // encodes the image in memory and hands the whole PNG to `func` in one call; returns 0 if encoding failed, else 1
+   int png_size;
+   unsigned char *encoded = stbi_write_png_to_mem((unsigned char *) data, stride_bytes, x, y, comp, &png_size);
+   if (!encoded) return 0;
+   func(context, encoded, png_size);
+   STBIW_FREE(encoded);
+   return 1;
+}
+
+#endif // STB_IMAGE_WRITE_IMPLEMENTATION
+
+/* Revision history
+      1.04 (2017-03-03)
+             monochrome BMP expansion
+      1.03   ???
+      1.02 (2016-04-02)
+             avoid allocating large structures on the stack
+      1.01 (2016-01-16)
+             STBIW_REALLOC_SIZED: support allocators with no realloc support
+             avoid race-condition in crc initialization
+             minor compile issues
+      1.00 (2015-09-14)
+             installable file IO function
+      0.99 (2015-09-13)
+             warning fixes; TGA rle support
+      0.98 (2015-04-08)
+             added STBIW_MALLOC, STBIW_ASSERT etc
+      0.97 (2015-01-18)
+             fixed HDR asserts, rewrote HDR rle logic
+      0.96 (2015-01-17)
+             add HDR output
+             fix monochrome BMP
+      0.95 (2014-08-17)
+		       add monochrome TGA output
+      0.94 (2014-05-31)
+             rename private functions to avoid conflicts with stb_image.h
+      0.93 (2014-05-27)
+             warning fixes
+      0.92 (2010-08-01)
+             casts to unsigned char to fix warnings
+      0.91 (2010-07-17)
+             first public release
+      0.90   first internal release
+*/
+
+/*
+------------------------------------------------------------------------------
+This software is available under 2 licenses -- choose whichever you prefer.
+------------------------------------------------------------------------------
+ALTERNATIVE A - MIT License
+Copyright (c) 2017 Sean Barrett
+Permission is hereby granted, free of charge, to any person obtaining a copy of 
+this software and associated documentation files (the "Software"), to deal in 
+the Software without restriction, including without limitation the rights to 
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 
+of the Software, and to permit persons to whom the Software is furnished to do 
+so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all 
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 
+SOFTWARE.
+------------------------------------------------------------------------------
+ALTERNATIVE B - Public Domain (www.unlicense.org)
+This is free and unencumbered software released into the public domain.
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 
+software, either in source code form or as a compiled binary, for any purpose, 
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this 
+software dedicate any and all copyright interest in the software to the public 
+domain. We make this dedication for the benefit of the public at large and to 
+the detriment of our heirs and successors. We intend this dedication to be an 
+overt act of relinquishment in perpetuity of all present and future rights to 
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/3rdparty/bimg/3rdparty/tinyexr/README.md b/3rdparty/bimg/3rdparty/tinyexr/README.md
new file mode 100644
index 0000000..aea03fd
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/tinyexr/README.md
@@ -0,0 +1,274 @@
+# Tiny OpenEXR image library.
+
+![Example](https://github.com/syoyo/tinyexr/blob/master/asakusa.png?raw=true)
+
+[![AppVeyor build status](https://ci.appveyor.com/api/projects/status/k07ftfe4ph057qau/branch/master?svg=true)](https://ci.appveyor.com/project/syoyo/tinyexr/branch/master)
+
+[![Travis build Status](https://travis-ci.org/syoyo/tinyexr.svg)](https://travis-ci.org/syoyo/tinyexr)
+
+[![Coverity Scan Build Status](https://scan.coverity.com/projects/5827/badge.svg)](https://scan.coverity.com/projects/5827)
+
+`tinyexr` is a small, single header-only library to load and save OpenEXR(.exr) images.
+`tinyexr` is written in portable C++(no library dependency except for STL), thus `tinyexr` is good to embed into your application.
+To use `tinyexr`, simply copy `tinyexr.h` into your project.
+
+`tinyexr` currently supports:
+
+* OpenEXR version 1.x.
+* Normal image
+  * Scanline format.
+  * Uncompressed("compress" = 0), ZIPS("compress" = 2), ZIP compression("compress" = 3) and PIZ compression("compress" = 4).
+  * Half/Uint/Float pixel type.
+  * Custom attributes(up to 128)
+* Deep image
+  * Scanline format.
+  * ZIPS compression("compress" = 2).
+  * Half, float pixel type.
+* Little endian machine.
+* Limited support for big endian machine.
+  * read/write normal image.
+* C interface.
+  * You can easily write language bindings(e.g. golang)
+* EXR saving
+  * with ZIP compression.
+* JavaScript library
+  * Through emscripten.
+
+# Use case 
+
+* mallie https://github.com/lighttransport/mallie
+* PBRT v3 https://github.com/mmp/pbrt-v3
+* Cinder 0.9.0 https://libcinder.org/notes/v0.9.0
+* Piccante(develop branch) http://piccantelib.net/
+* Your project here!
+
+## Examples
+
+* [examples/deepview/](examples/deepview) Deep image view
+* [examples/rgbe2exr/](examples/rgbe2exr) .hdr to EXR converter
+* [examples/exr2rgbe/](examples/exr2rgbe) EXR to .hdr converter
+
+## Usage
+
+NOTE: **API is still subject to change**. See the source code for details.
+
+Include `tinyexr.h` with `TINYEXR_IMPLEMENTATION` flag(do this only for **one** .cc file).
+
+```
+#define TINYEXR_IMPLEMENTATION
+#include "tinyexr.h"
+```
+
+Quickly reading RGB(A) EXR file.
+
+```
+  const char* input = "asakusa.exr";
+  float* out; // width * height * RGBA
+  int width;
+  int height;
+  const char* err;
+
+  int ret = LoadEXR(&out, &width, &height, input, &err);
+```
+
+Loading EXR from a file.
+
+```
+  const char* input = "asakusa.exr";
+  const char* err;
+
+  EXRImage exrImage;
+  InitEXRImage(&exrImage);
+
+  int ret = ParseMultiChannelEXRHeaderFromFile(&exrImage, input, &err);
+  if (ret != 0) {
+    fprintf(stderr, "Parse EXR err: %s\n", err);
+    return;
+  }
+
+  //// Uncomment if you want to read HALF images as FLOAT.
+  //for (int i = 0; i < exrImage.num_channels; i++) {
+  //  if (exrImage.pixel_types[i] == TINYEXR_PIXELTYPE_HALF) {
+  //    exrImage.requested_pixel_types[i] = TINYEXR_PIXELTYPE_FLOAT;
+  //  }
+  //}
+
+  ret = LoadMultiChannelEXRFromFile(&exrImage, input, &err);
+  if (ret != 0) {
+    fprintf(stderr, "Load EXR err: %s\n", err);
+    return;
+  }
+```
+
+Saving EXR file.
+
+```
+  int SaveEXR(const float* rgb, int width, int height, const char* outfilename) {
+
+    float* channels[3];
+
+    EXRImage image;
+    InitEXRImage(&image);
+
+    image.num_channels = 3;
+
+    // Must be BGR(A) order, since most of EXR viewers expect this channel order.
+    const char* channel_names[] = {"B", "G", "R"}; // "B", "G", "R", "A" for RGBA image
+
+    std::vector<float> images[3];
+    images[0].resize(width * height);
+    images[1].resize(width * height);
+    images[2].resize(width * height);
+
+    for (int i = 0; i < width * height; i++) {
+      images[0][i] = rgb[3*i+0];
+      images[1][i] = rgb[3*i+1];
+      images[2][i] = rgb[3*i+2];
+    }
+
+    float* image_ptr[3];
+    image_ptr[0] = &(images[2].at(0)); // B
+    image_ptr[1] = &(images[1].at(0)); // G
+    image_ptr[2] = &(images[0].at(0)); // R
+
+    image.channel_names = channel_names;
+    image.images = (unsigned char**)image_ptr;
+    image.width = width;
+    image.height = height;
+    image.compression = TINYEXR_COMPRESSIONTYPE_ZIP;
+
+    image.pixel_types = (int *)malloc(sizeof(int) * image.num_channels);
+    image.requested_pixel_types = (int *)malloc(sizeof(int) * image.num_channels);
+    for (int i = 0; i < image.num_channels; i++) {
+      image.pixel_types[i] = TINYEXR_PIXELTYPE_FLOAT; // pixel type of input image
+      image.requested_pixel_types[i] = TINYEXR_PIXELTYPE_HALF; // pixel type of output image to be stored in .EXR
+    }
+
+    const char* err;
+    int ret = SaveMultiChannelEXRToFile(&image, outfilename, &err);
+    if (ret != 0) {
+      fprintf(stderr, "Save EXR err: %s\n", err);
+      return ret;
+    }
+    printf("Saved exr file. [ %s ] \n", outfilename);
+
+    free(image.pixel_types);
+    free(image.requested_pixel_types);
+
+    return ret;
+
+  }
+```
+
+
+Reading deep image EXR file.
+See `examples/deepview` for actual usage.
+
+```
+  const char* input = "deepimage.exr";
+  const char* err;
+  DeepImage deepImage;
+
+  int ret = LoadDeepEXR(&deepImage, input, &err);
+
+  // access each sample in the deep pixel.
+  for (int y = 0; y < deepImage.height; y++) {
+    int sampleNum = deepImage.offset_table[y][deepImage.width-1];
+    for (int x = 0; x < deepImage.width-1; x++) {
+      int s_start = deepImage.offset_table[y][x];
+      int s_end   = deepImage.offset_table[y][x+1];
+      if (s_start >= sampleNum) {
+        continue;
+      }
+      s_end = (s_end < sampleNum) ? s_end : sampleNum;
+      for (int s = s_start; s < s_end; s++) {
+        float val = deepImage.image[depthChan][y][s];
+        ...
+      }
+    }
+  }
+
+```
+
+### deepview
+
+`examples/deepview` is a simple deep image viewer in OpenGL.
+
+![DeepViewExample](https://github.com/syoyo/tinyexr/blob/master/examples/deepview/deepview_screencast.gif?raw=true)
+
+## TODO
+
+Contribution is welcome!
+
+- [ ] Compression
+  - [ ] NONE("compress" = 0, load)
+  - [ ] RLE("compress" = 1, load)
+  - [x] ZIPS("compress" = 2, load)
+  - [x] ZIP("compress" = 3, load)
+  - [x] PIZ("compress" = 4, load)
+  - [x] NONE("compress" = 0, save)
+  - [ ] RLE("compress" = 1, save)
+  - [x] ZIPS("compress" = 2, save)
+  - [x] ZIP("compress" = 3, save)
+  - [ ] PIZ("compress" = 4, save)
+- [ ] Custom attributes
+  - [x] Normal image(EXR 1.x)
+  - [ ] Deep image(EXR 2.x)
+- [ ] JavaScript library
+  - [x] LoadEXRFromMemory
+  - [ ] SaveMultiChannelEXR
+  - [ ] Deep image save/load
+- [ ] Write from/to memory buffer.
+  - [x] SaveMultiChannelEXR
+  - [x] LoadMultiChannelEXR
+  - [ ] Deep image save/load
+- [ ] Tile format.
+- [ ] Support for various compression type.
+  - [x] zstd compression(Not in OpenEXR spec, though)
+- [x] Multi-channel.
+- [ ] Multi-part(EXR2.0)
+- [ ] Line order.
+  - [x] Increasing, decreasing(load)
+  - [ ] Random?
+  - [ ] Increasing, decreasing(save)
+- [ ] Pixel format(UINT, FLOAT).
+  - [x] UINT, FLOAT(load)
+  - [x] UINT, FLOAT(deep load)
+  - [x] UINT, FLOAT(save)
+  - [ ] UINT, FLOAT(deep save)
+- [ ] Full support for big endian machine.
+  - [x] Loading multi channel EXR
+  - [x] Saving multi channel EXR
+  - [ ] Loading deep image
+  - [ ] Saving deep image
+- [ ] Optimization
+  - [ ] ISPC?
+  - [x] OpenMP multi-threading in EXR loading.
+  - [x] OpenMP multi-threading in EXR saving.
+  - [ ] OpenMP multi-threading in deep image loading.
+  - [ ] OpenMP multi-threading in deep image saving.
+
+## Similar or related projects
+
+* miniexr: https://github.com/aras-p/miniexr (Write OpenEXR)
+* stb_image_resize.h: https://github.com/nothings/stb (Good for HDR image resizing)
+
+## License
+
+3-clause BSD
+
+`tinyexr` uses miniz, which is developed by Rich Geldreich <richgel99@gmail.com>, and licensed under public domain.
+
+`tinyexr` tools use stb, which is licensed under public domain: https://github.com/nothings/stb
+`tinyexr` uses some code from OpenEXR, which is licensed under 3-clause BSD license.
+
+## Author(s)
+
+Syoyo Fujita(syoyo@lighttransport.com)
+
+## Contributor(s)
+
+* Matt Ebb (http://mattebb.com) : deep image example. Thanks!
+* Matt Pharr (http://pharr.org/matt/) : Testing tinyexr with OpenEXR(IlmImf). Thanks! 
+* Andrew Bell (https://github.com/andrewfb) & Richard Eakin (https://github.com/richardeakin) : Improving TinyEXR API. Thanks!
+* Mike Wong (https://github.com/mwkm) : ZIPS compression support in loading. Thanks!
diff --git a/3rdparty/bimg/3rdparty/tinyexr/tinyexr.h b/3rdparty/bimg/3rdparty/tinyexr/tinyexr.h
new file mode 100644
index 0000000..b0998b6
--- /dev/null
+++ b/3rdparty/bimg/3rdparty/tinyexr/tinyexr.h
@@ -0,0 +1,12356 @@
+/*
+Copyright (c) 2014 - 2016, Syoyo Fujita
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the <organization> nor the
+      names of its contributors may be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+// TinyEXR contains some OpenEXR code, which is licensed under ------------
+
+///////////////////////////////////////////////////////////////////////////
+//
+// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
+// Digital Ltd. LLC
+//
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+// *       Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// *       Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// *       Neither the name of Industrial Light & Magic nor the names of
+// its contributors may be used to endorse or promote products derived
+// from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+///////////////////////////////////////////////////////////////////////////
+
+// End of OpenEXR license -------------------------------------------------
+
+#ifndef TINYEXR_H_
+#define TINYEXR_H_
+
+//
+//
+//   Do this:
+//    #define TINYEXR_IMPLEMENTATION
+//   before you include this file in *one* C or C++ file to create the
+//   implementation.
+//
+//   // i.e. it should look like this:
+//   #include ...
+//   #include ...
+//   #include ...
+//   #define TINYEXR_IMPLEMENTATION
+//   #include "tinyexr.h"
+//
+//
+
+#include <stddef.h>  // for size_t
+#include <stdint.h>  // guess stdint.h is available(C99)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Use embedded miniz or not to decode ZIP format pixel. Linking with zlib is
+// required if this flag is 0.
+#ifndef TINYEXR_USE_MINIZ
+#define TINYEXR_USE_MINIZ (1)
+#endif
+
+// Disable PIZ compression when applying cpplint.
+#ifndef TINYEXR_USE_PIZ
+#define TINYEXR_USE_PIZ (1)
+#endif
+
+#ifndef TINYEXR_USE_ZFP
+#define TINYEXR_USE_ZFP (0)  // TinyEXR extension.
+// http://computation.llnl.gov/projects/floating-point-compression
+#endif
+
+#define TINYEXR_SUCCESS (0)
+#define TINYEXR_ERROR_INVALID_MAGIC_NUMBER (-1)
+#define TINYEXR_ERROR_INVALID_EXR_VERSION (-2)
+#define TINYEXR_ERROR_INVALID_ARGUMENT (-3)
+#define TINYEXR_ERROR_INVALID_DATA (-4)
+#define TINYEXR_ERROR_INVALID_FILE (-5)
+#define TINYEXR_ERROR_INVALID_PARAMETER (-5)  // NOTE(review): same value as TINYEXR_ERROR_INVALID_FILE, so callers cannot tell the two apart
+#define TINYEXR_ERROR_CANT_OPEN_FILE (-6)
+#define TINYEXR_ERROR_UNSUPPORTED_FORMAT (-7)
+#define TINYEXR_ERROR_INVALID_HEADER (-8)
+
+// @note { OpenEXR file format: http://www.openexr.com/openexrfilelayout.pdf }
+
+// pixel type: possible values are: UINT = 0 HALF = 1 FLOAT = 2
+#define TINYEXR_PIXELTYPE_UINT (0)
+#define TINYEXR_PIXELTYPE_HALF (1)
+#define TINYEXR_PIXELTYPE_FLOAT (2)
+
+#define TINYEXR_MAX_ATTRIBUTES (128)
+
+#define TINYEXR_COMPRESSIONTYPE_NONE (0)
+#define TINYEXR_COMPRESSIONTYPE_RLE (1)
+#define TINYEXR_COMPRESSIONTYPE_ZIPS (2)
+#define TINYEXR_COMPRESSIONTYPE_ZIP (3)
+#define TINYEXR_COMPRESSIONTYPE_PIZ (4)
+#define TINYEXR_COMPRESSIONTYPE_ZFP (128)  // TinyEXR extension
+
+#define TINYEXR_ZFP_COMPRESSIONTYPE_RATE (0)
+#define TINYEXR_ZFP_COMPRESSIONTYPE_PRECISION (1)
+#define TINYEXR_ZFP_COMPRESSIONTYPE_ACCURACY (2)
+
+#define TINYEXR_TILE_ONE_LEVEL (0)
+#define TINYEXR_TILE_MIPMAP_LEVELS (1)
+#define TINYEXR_TILE_RIPMAP_LEVELS (2)
+
+#define TINYEXR_TILE_ROUND_DOWN (0)
+#define TINYEXR_TILE_ROUND_UP (1)
+
+typedef struct _EXRVersion {  // output of ParseEXRVersionFromFile / ParseEXRVersionFromMemory
+  int version;    // this must be 2
+  int tiled;      // tile format image
+  int long_name;  // long name attribute
+  int non_image;  // deep image(EXR 2.0)
+  int multipart;  // multi-part(EXR 2.0)
+} EXRVersion;
+
+typedef struct _EXRAttribute {  // element type of EXRHeader.custom_attributes
+  char name[256];  // name and type are up to 255 chars long.
+  char type[256];
+  unsigned char *value;  // uint8_t*
+  int size;  // presumably the byte length of `value` -- TODO confirm against the parser
+  int pad0;
+} EXRAttribute;
+
+typedef struct _EXRChannelInfo {  // element type of EXRHeader.channels
+  char name[256];  // less than 255 bytes long
+  int pixel_type;  // presumably one of TINYEXR_PIXELTYPE_* -- TODO confirm
+  int x_sampling;
+  int y_sampling;
+  unsigned char p_linear;
+  unsigned char pad[3];
+} EXRChannelInfo;
+
+typedef struct _EXRTile {  // element type of EXRImage.tiles
+  int offset_x;
+  int offset_y;
+  int level_x;
+  int level_y;
+
+  int width;   // actual width in a tile.
+  int height;  // actual height in a tile.
+
+  unsigned char **images;  // image[channels][pixels]
+} EXRTile;
+
+typedef struct _EXRHeader {  // initialized by InitEXRHeader, filled by ParseEXRHeaderFrom(Memory|File)
+  float pixel_aspect_ratio;
+  int line_order;
+  int data_window[4];
+  int display_window[4];
+  float screen_window_center[2];
+  float screen_window_width;
+
+  int chunk_count;
+
+  // Properties for tiled format(`tiledesc`).
+  int tiled;
+  int tile_size_x;
+  int tile_size_y;
+  int tile_level_mode;     // presumably one of TINYEXR_TILE_*_LEVEL(S) -- TODO confirm
+  int tile_rounding_mode;  // presumably one of TINYEXR_TILE_ROUND_* -- TODO confirm
+
+  int long_name;
+  int non_image;
+  int multipart;
+  unsigned int header_len;
+
+  // Custom attributes(excludes required attributes, e.g. `channels`,
+  // `compression`, etc)
+  int num_custom_attributes;
+  EXRAttribute custom_attributes[TINYEXR_MAX_ATTRIBUTES];
+
+  EXRChannelInfo *channels;  // [num_channels]
+
+  int *pixel_types;  // Loaded pixel type(TINYEXR_PIXELTYPE_*) of `images` for
+  // each channel. This is overwritten with `requested_pixel_types` when
+  // loading.
+  int num_channels;
+
+  int compression_type;        // compression type(TINYEXR_COMPRESSIONTYPE_*)
+  int *requested_pixel_types;  // Filled initially by
+                               // ParseEXRHeaderFrom(Memory|File), then users
+                               // can edit it(only valid for HALF pixel type
+                               // channel)
+
+} EXRHeader;
+
+typedef struct _EXRMultiPartHeader {
+  int num_headers;
+  EXRHeader *headers;  // presumably [num_headers] -- TODO confirm
+
+} EXRMultiPartHeader;
+
+typedef struct _EXRImage {  // initialized by InitEXRImage; internal data released with FreeEXRImage
+  EXRTile *tiles;  // Tiled pixel data. The application must reconstruct image
+                   // from tiles manually. NULL if scanline format.
+  unsigned char **images;  // image[channels][pixels]. NULL if tiled format.
+
+  int width;
+  int height;
+  int num_channels;
+
+  // Properties for tile format.
+  int num_tiles;  // presumably the number of entries in `tiles` -- TODO confirm
+
+} EXRImage;
+
+typedef struct _EXRMultiPartImage {
+  int num_images;
+  EXRImage *images;  // presumably [num_images] -- TODO confirm
+
+} EXRMultiPartImage;
+
+typedef struct _DeepImage {  // filled by LoadDeepEXR; the application frees `image` and `offset_table`
+  const char **channel_names;
+  float ***image;      // image[channels][scanlines][samples]
+  int **offset_table;  // offset_table[scanline][offsets]
+  int num_channels;
+  int width;
+  int height;
+  int pad0;
+} DeepImage;
+
+// @deprecated { to be removed. }
+// Loads single-frame OpenEXR image. Assume EXR image contains RGB(A) channels.
+// Application must free image data as returned by `out_rgba`
+// Result image format is: float x RGBA x width x height
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int LoadEXR(float **out_rgba, int *width, int *height,
+                   const char *filename, const char **err);
+
+// @deprecated { to be removed. }
+// Saves single-frame OpenEXR image. Assume EXR image contains RGB(A) channels.
+// components must be 3(RGB) or 4(RGBA).
+// Result image format is: float x RGB(A) x width x height
+extern int SaveEXR(const float *data, int width, int height, int components,
+                   const char *filename);
+
+// Initialize EXRHeader struct
+extern void InitEXRHeader(EXRHeader *exr_header);
+
+// Initialize EXRImage struct
+extern void InitEXRImage(EXRImage *exr_image);
+
+// Frees internal data of EXRHeader struct
+extern int FreeEXRHeader(EXRHeader *exr_header);
+
+// Frees internal data of EXRImage struct
+extern int FreeEXRImage(EXRImage *exr_image);
+
+// Parse EXR version header of a file.
+extern int ParseEXRVersionFromFile(EXRVersion *version, const char *filename);
+
+// Parse EXR version header from memory-mapped EXR data.
+extern int ParseEXRVersionFromMemory(EXRVersion *version,
+                                     const unsigned char *memory, size_t size);
+
+// Parse single-part OpenEXR header from a file and initialize `EXRHeader`.
+extern int ParseEXRHeaderFromFile(EXRHeader *header, const EXRVersion *version,
+                                  const char *filename, const char **err);
+
+// Parse single-part OpenEXR header from a memory and initialize `EXRHeader`.
+extern int ParseEXRHeaderFromMemory(EXRHeader *header,
+                                    const EXRVersion *version,
+                                    const unsigned char *memory, size_t size,
+                                    const char **err);
+
+// Parse multi-part OpenEXR headers from a file and initialize `EXRHeader*`
+// array.
+extern int ParseEXRMultipartHeaderFromFile(EXRHeader ***headers,
+                                           int *num_headers,
+                                           const EXRVersion *version,
+                                           const char *filename,
+                                           const char **err);
+
+// Parse multi-part OpenEXR headers from a memory and initialize `EXRHeader*`
+// array
+extern int ParseEXRMultipartHeaderFromMemory(EXRHeader ***headers,
+                                             int *num_headers,
+                                             const EXRVersion *version,
+                                             const unsigned char *memory,
+                                             size_t size, const char **err);
+
+// Loads single-part OpenEXR image from a file.
+// Application must call `ParseEXRHeaderFromFile` before calling this function.
+// Application can free EXRImage using `FreeEXRImage`
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int LoadEXRImageFromFile(EXRImage *image, const EXRHeader *header,
+                                const char *filename, const char **err);
+
+// Loads single-part OpenEXR image from a memory.
+// Application must setup `EXRHeader` with
+// `ParseEXRHeaderFromMemory` before calling this function.
+// Application can free EXRImage using `FreeEXRImage`
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int LoadEXRImageFromMemory(EXRImage *image, const EXRHeader *header,
+                                  const unsigned char *memory,
+                                  const size_t size,
+                                  const char **err);
+
+// Loads multi-part OpenEXR image from a file.
+// Application must call `ParseEXRMultipartHeaderFromFile` before calling this
+// function.
+// Application can free EXRImage using `FreeEXRImage`
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int LoadEXRMultipartImageFromFile(EXRImage *images,
+                                         const EXRHeader **headers,
+                                         unsigned int num_parts,
+                                         const char *filename,
+                                         const char **err);
+
+// Loads multi-part OpenEXR image from memory.
+// Application must setup `EXRHeader*` array with
+// `ParseEXRMultipartHeaderFromMemory` before calling this function.
+// Application can free EXRImage using `FreeEXRImage`
+// Returns negative value and may set error string in `err` when there's an
+// error. NOTE(review): unlike LoadEXRImageFromMemory, no `size` is passed.
+extern int LoadEXRMultipartImageFromMemory(EXRImage *images,
+                                           const EXRHeader **headers,
+                                           unsigned int num_parts,
+                                           const unsigned char *memory,
+                                           const char **err);
+
+// Saves multi-channel, single-frame OpenEXR image to a file.
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int SaveEXRImageToFile(const EXRImage *image,
+                              const EXRHeader *exr_header, const char *filename,
+                              const char **err);
+
+// Saves multi-channel, single-frame OpenEXR image to memory.
+// Image is compressed using EXRImage.compression value.
+// Returns the number of bytes if successful. May set error string in `err`
+// when there's an error. NOTE(review): the return type is size_t (unsigned),
+// so an error cannot be reported as a negative value; confirm the error value.
+extern size_t SaveEXRImageToMemory(const EXRImage *image,
+                                   const EXRHeader *exr_header,
+                                   unsigned char **memory, const char **err);
+
+// Loads single-frame OpenEXR deep image.
+// Application must free memory of variables in DeepImage(image, offset_table)
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int LoadDeepEXR(DeepImage *out_image, const char *filename,
+                       const char **err);
+
+// NOT YET IMPLEMENTED:
+// Saves single-frame OpenEXR deep image.
+// Returns negative value and may set error string in `err` when there's an
+// error
+// extern int SaveDeepEXR(const DeepImage *in_image, const char *filename,
+//                       const char **err);
+
+// NOT YET IMPLEMENTED:
+// Loads multi-part OpenEXR deep image.
+// Application must free memory of variables in DeepImage(image, offset_table)
+// extern int LoadMultiPartDeepEXR(DeepImage **out_image, int num_parts, const
+// char *filename,
+//                       const char **err);
+
+// For emscripten.
+// Loads single-frame OpenEXR image from memory. Assume EXR image contains
+// RGB(A) channels.
+// `out_rgba` must have enough memory (at least sizeof(float) x 4(RGBA) x width
+// x height)
+// Returns negative value and may set error string in `err` when there's an
+// error
+extern int LoadEXRFromMemory(float *out_rgba, const unsigned char *memory,
+                             size_t size, const char **err);
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef TINYEXR_IMPLEMENTATION
+#include <algorithm>
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <sstream>
+
+#include <string>
+#include <vector>
+
+// @todo { remove including tinyexr.h }
+#include "tinyexr.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#if TINYEXR_USE_MINIZ
+#else
+#include "zlib.h"
+#endif
+
+#if TINYEXR_USE_ZFP
+#include "zfp.h"
+#endif
+
+#if __cplusplus > 199711L
+// C++11
+#include <cstdint>
+#endif // __cplusplus > 199711L
+
+namespace tinyexr {
+
+#if __cplusplus > 199711L
+// C++11
+typedef uint64_t tinyexr_uint64;
+typedef int64_t tinyexr_int64;
+#else
+// Although `long long` is not a standard type pre C++11, assume it is defined
+// as a compiler's extension.
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-long-long"
+#endif
+typedef unsigned long long tinyexr_uint64;
+typedef long long tinyexr_int64;
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+#endif
+
+#if TINYEXR_USE_MINIZ
+
+namespace miniz {
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-long-long"
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wpadded"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wc++11-extensions"
+#pragma clang diagnostic ignored "-Wconversion"
+#endif
+
+/* miniz.c v1.15 - public domain deflate/inflate, zlib-subset, ZIP
+   reading/writing/appending, PNG writing
+   See "unlicense" statement at the end of this file.
+   Rich Geldreich <richgel99@gmail.com>, last updated Oct. 13, 2013
+   Implements RFC 1950: http://www.ietf.org/rfc/rfc1950.txt and RFC 1951:
+   http://www.ietf.org/rfc/rfc1951.txt
+
+   Most API's defined in miniz.c are optional. For example, to disable the
+   archive related functions just define
+   MINIZ_NO_ARCHIVE_APIS, or to get rid of all stdio usage define MINIZ_NO_STDIO
+   (see the list below for more macros).
+
+   * Change History
+     10/13/13 v1.15 r4 - Interim bugfix release while I work on the next major
+   release with Zip64 support (almost there!):
+       - Critical fix for the MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY bug
+   (thanks kahmyong.moon@hp.com) which could cause locate files to not find
+   files. This bug
+        would only have occurred in earlier versions if you explicitly used this
+   flag, OR if you used mz_zip_extract_archive_file_to_heap() or
+   mz_zip_add_mem_to_archive_file_in_place()
+        (which used this flag). If you can't switch to v1.15 but want to fix
+   this bug, just remove the uses of this flag from both helper funcs (and of
+   course don't use the flag).
+       - Bugfix in mz_zip_reader_extract_to_mem_no_alloc() from kymoon when
+   pUser_read_buf is not NULL and compressed size is > uncompressed size
+       - Fixing mz_zip_reader_extract_*() funcs so they don't try to extract
+   compressed data from directory entries, to account for weird zipfiles which
+   contain zero-size compressed data on dir entries.
+         Hopefully this fix won't cause any issues on weird zip archives,
+   because it assumes the low 16-bits of zip external attributes are DOS
+   attributes (which I believe they always are in practice).
+       - Fixing mz_zip_reader_is_file_a_directory() so it doesn't check the
+   internal attributes, just the filename and external attributes
+       - mz_zip_reader_init_file() - missing MZ_FCLOSE() call if the seek failed
+       - Added cmake support for Linux builds which builds all the examples,
+   tested with clang v3.3 and gcc v4.6.
+       - Clang fix for tdefl_write_image_to_png_file_in_memory() from toffaletti
+       - Merged MZ_FORCEINLINE fix from hdeanclark
+       - Fix <time.h> include before config #ifdef, thanks emil.brink
+       - Added tdefl_write_image_to_png_file_in_memory_ex(): supports Y flipping
+   (super useful for OpenGL apps), and explicit control over the compression
+   level (so you can
+        set it to 1 for real-time compression).
+       - Merged in some compiler fixes from paulharris's github repro.
+       - Retested this build under Windows (VS 2010, including static analysis),
+   tcc  0.9.26, gcc v4.6 and clang v3.3.
+       - Added example6.c, which dumps an image of the mandelbrot set to a PNG
+   file.
+       - Modified example2 to help test the
+   MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY flag more.
+       - In r3: Bugfix to mz_zip_writer_add_file() found during merge: Fix
+   possible src file fclose() leak if alignment bytes+local header file write
+   failed
+                 - In r4: Minor bugfix to mz_zip_writer_add_from_zip_reader():
+   Was pushing the wrong central dir header offset, appears harmless in this
+   release, but it became a problem in the zip64 branch
+     5/20/12 v1.14 - MinGW32/64 GCC 4.6.1 compiler fixes: added MZ_FORCEINLINE,
+   #include <time.h> (thanks fermtect).
+     5/19/12 v1.13 - From jason@cornsyrup.org and kelwert@mtu.edu - Fix
+   mz_crc32() so it doesn't compute the wrong CRC-32's when mz_ulong is 64-bit.
+       - Temporarily/locally slammed in "typedef unsigned long mz_ulong" and
+   re-ran a randomized regression test on ~500k files.
+       - Eliminated a bunch of warnings when compiling with GCC 32-bit/64.
+       - Ran all examples, miniz.c, and tinfl.c through MSVC 2008's /analyze
+   (static analysis) option and fixed all warnings (except for the silly
+        "Use of the comma-operator in a tested expression.." analysis warning,
+   which I purposely use to work around a MSVC compiler warning).
+       - Created 32-bit and 64-bit Codeblocks projects/workspace. Built and
+   tested Linux executables. The codeblocks workspace is compatible with
+   Linux+Win32/x64.
+       - Added miniz_tester solution/project, which is a useful little app
+   derived from LZHAM's tester app that I use as part of the regression test.
+       - Ran miniz.c and tinfl.c through another series of regression testing on
+   ~500,000 files and archives.
+       - Modified example5.c so it purposely disables a bunch of high-level
+   functionality (MINIZ_NO_STDIO, etc.). (Thanks to corysama for the
+   MINIZ_NO_STDIO bug report.)
+       - Fix ftell() usage in examples so they exit with an error on files which
+   are too large (a limitation of the examples, not miniz itself).
+     4/12/12 v1.12 - More comments, added low-level example5.c, fixed a couple
+   minor level_and_flags issues in the archive API's.
+      level_and_flags can now be set to MZ_DEFAULT_COMPRESSION. Thanks to Bruce
+   Dawson <bruced@valvesoftware.com> for the feedback/bug report.
+     5/28/11 v1.11 - Added statement from unlicense.org
+     5/27/11 v1.10 - Substantial compressor optimizations:
+      - Level 1 is now ~4x faster than before. The L1 compressor's throughput
+   now varies between 70-110MB/sec. on a
+      - Core i7 (actual throughput varies depending on the type of data, and x64
+   vs. x86).
+      - Improved baseline L2-L9 compression perf. Also, greatly improved
+   compression perf. issues on some file types.
+      - Refactored the compression code for better readability and
+   maintainability.
+      - Added level 10 compression level (L10 has slightly better ratio than
+   level 9, but could have a potentially large
+       drop in throughput on some files).
+     5/15/11 v1.09 - Initial stable release.
+
+   * Low-level Deflate/Inflate implementation notes:
+
+     Compression: Use the "tdefl" API's. The compressor supports raw, static,
+   and dynamic blocks, lazy or
+     greedy parsing, match length filtering, RLE-only, and Huffman-only streams.
+   It performs and compresses
+     approximately as well as zlib.
+
+     Decompression: Use the "tinfl" API's. The entire decompressor is
+   implemented as a single function
+     coroutine: see tinfl_decompress(). It supports decompression into a 32KB
+   (or larger power of 2) wrapping buffer, or into a memory
+     block large enough to hold the entire file.
+
+     The low-level tdefl/tinfl API's do not make any use of dynamic memory
+   allocation.
+
+   * zlib-style API notes:
+
+     miniz.c implements a fairly large subset of zlib. There's enough
+   functionality present for it to be a drop-in
+     zlib replacement in many apps:
+        The z_stream struct, optional memory allocation callbacks
+        deflateInit/deflateInit2/deflate/deflateReset/deflateEnd/deflateBound
+        inflateInit/inflateInit2/inflate/inflateEnd
+        compress, compress2, compressBound, uncompress
+        CRC-32, Adler-32 - Using modern, minimal code size, CPU cache friendly
+   routines.
+        Supports raw deflate streams or standard zlib streams with adler-32
+   checking.
+
+     Limitations:
+      The callback API's are not implemented yet. No support for gzip headers or
+   zlib static dictionaries.
+      I've tried to closely emulate zlib's various flavors of stream flushing
+   and return status codes, but
+      there are no guarantees that miniz.c pulls this off perfectly.
+
+   * PNG writing: See the tdefl_write_image_to_png_file_in_memory() function,
+   originally written by
+     Alex Evans. Supports 1-4 bytes/pixel images.
+
+   * ZIP archive API notes:
+
+     The ZIP archive API's were designed with simplicity and efficiency in
+   mind, with just enough abstraction to
+     get the job done with minimal fuss. There are simple API's to retrieve file
+   information, read files from
+     existing archives, create new archives, append new files to existing
+   archives, or clone archive data from
+     one archive to another. It supports archives located in memory or the heap,
+   on disk (using stdio.h),
+     or you can specify custom file read/write callbacks.
+
+     - Archive reading: Just call this function to read a single file from a
+   disk archive:
+
+      void *mz_zip_extract_archive_file_to_heap(const char *pZip_filename, const
+   char *pArchive_name,
+        size_t *pSize, mz_uint zip_flags);
+
+     For more complex cases, use the "mz_zip_reader" functions. Upon opening an
+   archive, the entire central
+     directory is located and read as-is into memory, and subsequent file access
+   only occurs when reading individual files.
+
+     - Archives file scanning: The simple way is to use this function to scan a
+   loaded archive for a specific file:
+
+     int mz_zip_reader_locate_file(mz_zip_archive *pZip, const char *pName,
+   const char *pComment, mz_uint flags);
+
+     The locate operation can optionally check file comments too, which (as one
+   example) can be used to identify
+     multiple versions of the same file in an archive. This function uses a
+   simple linear search through the central
+     directory, so it's not very fast.
+
+     Alternately, you can iterate through all the files in an archive (using
+   mz_zip_reader_get_num_files()) and
+     retrieve detailed info on each file by calling mz_zip_reader_file_stat().
+
+     - Archive creation: Use the "mz_zip_writer" functions. The ZIP writer
+   immediately writes compressed file data
+     to disk and builds an exact image of the central directory in memory. The
+   central directory image is written
+     all at once at the end of the archive file when the archive is finalized.
+
+     The archive writer can optionally align each file's local header and file
+   data to any power of 2 alignment,
+     which can be useful when the archive will be read from optical media. Also,
+   the writer supports placing
+     arbitrary data blobs at the very beginning of ZIP archives. Archives
+   written using either feature are still
+     readable by any ZIP tool.
+
+     - Archive appending: The simple way to add a single file to an archive is
+   to call this function:
+
+      mz_bool mz_zip_add_mem_to_archive_file_in_place(const char *pZip_filename,
+   const char *pArchive_name,
+        const void *pBuf, size_t buf_size, const void *pComment, mz_uint16
+   comment_size, mz_uint level_and_flags);
+
+     The archive will be created if it doesn't already exist, otherwise it'll be
+   appended to.
+     Note the appending is done in-place and is not an atomic operation, so if
+   something goes wrong
+     during the operation it's possible the archive could be left without a
+   central directory (although the local
+     file headers and file data will be fine, so the archive will be
+   recoverable).
+
+     For more complex archive modification scenarios:
+     1. The safest way is to use a mz_zip_reader to read the existing archive,
+   cloning only those bits you want to
+     preserve into a new archive using the
+   mz_zip_writer_add_from_zip_reader() function (which compiles the
+     compressed file data as-is). When you're done, delete the old archive and
+   rename the newly written archive, and
+     you're done. This is safe but requires a bunch of temporary disk space or
+   heap memory.
+
+     2. Or, you can convert an mz_zip_reader in-place to an mz_zip_writer using
+   mz_zip_writer_init_from_reader(),
+     append new files as needed, then finalize the archive which will write an
+   updated central directory to the
+     original archive. (This is basically what
+   mz_zip_add_mem_to_archive_file_in_place() does.) There's a
+     possibility that the archive's central directory could be lost with this
+   method if anything goes wrong, though.
+
+     - ZIP archive support limitations:
+     No zip64 or spanning support. Extraction functions can only handle
+   unencrypted, stored or deflated files.
+     Requires streams capable of seeking.
+
+   * This is a header file library, like stb_image.c. To get only a header file,
+   either cut and paste the
+     below header, or create miniz.h, #define MINIZ_HEADER_FILE_ONLY, and then
+   include miniz.c from it.
+
+   * Important: For best perf. be sure to customize the below macros for your
+   target platform:
+     #define MINIZ_USE_UNALIGNED_LOADS_AND_STORES 1
+     #define MINIZ_LITTLE_ENDIAN 1
+     #define MINIZ_HAS_64BIT_REGISTERS 1
+
+   * On platforms using glibc, Be sure to "#define _LARGEFILE64_SOURCE 1" before
+   including miniz.c to ensure miniz
+     uses the 64-bit variants: fopen64(), stat64(), etc. Otherwise you won't be
+   able to process large files
+     (i.e. 32-bit stat() fails for me on files > 0x7FFFFFFF bytes).
+*/
+
+#ifndef MINIZ_HEADER_INCLUDED
+#define MINIZ_HEADER_INCLUDED
+
+#include <stdlib.h>
+
+// Defines to completely disable specific portions of miniz.c:
+// If all macros here are defined the only functionality remaining will be
+// CRC-32, adler-32, tinfl, and tdefl.
+
+// Define MINIZ_NO_STDIO to disable all usage and any functions which rely on
+// stdio for file I/O.
+//#define MINIZ_NO_STDIO
+
+// If MINIZ_NO_TIME is specified then the ZIP archive functions will not be able
+// to get the current time, or
+// get/set file times, and the C run-time funcs that get/set times won't be
+// called.
+// The current downside is the times written to your archives will be from 1979.
+#define MINIZ_NO_TIME
+
+// Define MINIZ_NO_ARCHIVE_APIS to disable all ZIP archive API's.
+//#define MINIZ_NO_ARCHIVE_APIS
+
+// Define MINIZ_NO_ARCHIVE_WRITING_APIS to disable all writing related ZIP
+// archive API's.
+//#define MINIZ_NO_ARCHIVE_WRITING_APIS
+
+// Define MINIZ_NO_ZLIB_APIS to remove all ZLIB-style compression/decompression
+// API's.
+//#define MINIZ_NO_ZLIB_APIS
+
+// Define MINIZ_NO_ZLIB_COMPATIBLE_NAMES to disable zlib names, to prevent
+// conflicts against stock zlib.
+//#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+
+// Define MINIZ_NO_MALLOC to disable all calls to malloc, free, and realloc.
+// Note if MINIZ_NO_MALLOC is defined then the user must always provide custom
+// user alloc/free/realloc
+// callbacks to the zlib and archive API's, and a few stand-alone helper API's
+// which don't provide custom user
+// functions (such as tdefl_compress_mem_to_heap() and
+// tinfl_decompress_mem_to_heap()) won't work.
+//#define MINIZ_NO_MALLOC
+
+#if defined(__TINYC__) && (defined(__linux) || defined(__linux__))
+// TODO: Work around "error: include file 'sys\utime.h' when compiling with tcc
+// on Linux
+#define MINIZ_NO_TIME
+#endif
+
+#if !defined(MINIZ_NO_TIME) && !defined(MINIZ_NO_ARCHIVE_APIS)
+#include <time.h>
+#endif
+
+#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
+    defined(__i386) || defined(__i486__) || defined(__i486) ||  \
+    defined(i386) || defined(__ia64__) || defined(__x86_64__)
+// MINIZ_X86_OR_X64_CPU is only used to help set the below macros.
+#define MINIZ_X86_OR_X64_CPU 1
+#else
+#define MINIZ_X86_OR_X64_CPU 0
+#endif
+
+#if defined(__sparcv9)
+// Big endian
+#else
+#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || MINIZ_X86_OR_X64_CPU
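+// Set MINIZ_LITTLE_ENDIAN to 1 if the processor is little endian. NOTE(review): if neither __BYTE_ORDER__ macro is defined, both sides evaluate to 0 and this test is true.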
+#define MINIZ_LITTLE_ENDIAN 1
+#endif
+#endif
+
+#if 1 // MINIZ_X86_OR_X64_CPU
+// Set MINIZ_USE_UNALIGNED_LOADS_AND_STORES to 1 on CPU's that permit efficient
+// integer loads and stores from unaligned addresses.
+//#define MINIZ_USE_UNALIGNED_LOADS_AND_STORES 1
+#define MINIZ_USE_UNALIGNED_LOADS_AND_STORES \
+  0  // disable to suppress compiler warnings
+#endif
+
+#if defined(_M_X64) || defined(_WIN64) || defined(__MINGW64__) || \
+    defined(_LP64) || defined(__LP64__) || defined(__ia64__) ||   \
+    defined(__x86_64__)
+// Set MINIZ_HAS_64BIT_REGISTERS to 1 if operations on 64-bit integers are
+// reasonably fast (and don't involve compiler generated calls to helper
+// functions).
+#define MINIZ_HAS_64BIT_REGISTERS 1
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------- zlib-style API Definitions.
+
+// For more compatibility with zlib, miniz.c uses unsigned long for some
+// parameters/struct members. Beware: mz_ulong can be either 32 or 64-bits!
+typedef unsigned long mz_ulong;
+
+// mz_free() internally uses the MZ_FREE() macro (which by default calls free()
+// unless you've modified the MZ_MALLOC macro) to release a block allocated from
+// the heap.
+void mz_free(void *p);
+
+#define MZ_ADLER32_INIT (1)
+// mz_adler32() returns the initial adler-32 value to use when called with
+// ptr==NULL.
+mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len);
+
+#define MZ_CRC32_INIT (0)
+// mz_crc32() returns the initial CRC-32 value to use when called with
+// ptr==NULL.
+mz_ulong mz_crc32(mz_ulong crc, const unsigned char *ptr, size_t buf_len);
+
+// Compression strategies.
+enum {
+  MZ_DEFAULT_STRATEGY = 0,
+  MZ_FILTERED = 1,
+  MZ_HUFFMAN_ONLY = 2,
+  MZ_RLE = 3,
+  MZ_FIXED = 4
+};
+
+// Method
+#define MZ_DEFLATED 8
+
+#ifndef MINIZ_NO_ZLIB_APIS
+
+// Heap allocation callbacks.
+// Note that mz_alloc_func parameter types purposely differ from zlib's:
+// items/size is size_t, not unsigned long.
+typedef void *(*mz_alloc_func)(void *opaque, size_t items, size_t size);
+typedef void (*mz_free_func)(void *opaque, void *address);
+typedef void *(*mz_realloc_func)(void *opaque, void *address, size_t items,
+                                 size_t size);
+
+#define MZ_VERSION "9.1.15"
+#define MZ_VERNUM 0x91F0
+#define MZ_VER_MAJOR 9
+#define MZ_VER_MINOR 1
+#define MZ_VER_REVISION 15
+#define MZ_VER_SUBREVISION 0
+
+// Flush values. For typical usage you only need MZ_NO_FLUSH and MZ_FINISH. The
+// other values are for advanced use (refer to the zlib docs).
+enum {
+  MZ_NO_FLUSH = 0,
+  MZ_PARTIAL_FLUSH = 1,
+  MZ_SYNC_FLUSH = 2,
+  MZ_FULL_FLUSH = 3,
+  MZ_FINISH = 4,
+  MZ_BLOCK = 5
+};
+
+// Return status codes. MZ_PARAM_ERROR is non-standard.
+enum {
+  MZ_OK = 0,
+  MZ_STREAM_END = 1,
+  MZ_NEED_DICT = 2,
+  MZ_ERRNO = -1,
+  MZ_STREAM_ERROR = -2,
+  MZ_DATA_ERROR = -3,
+  MZ_MEM_ERROR = -4,
+  MZ_BUF_ERROR = -5,
+  MZ_VERSION_ERROR = -6,
+  MZ_PARAM_ERROR = -10000
+};
+
+// Compression levels: 0-9 are the standard zlib-style levels, 10 is best
+// possible compression (not zlib compatible, and may be very slow),
+// MZ_DEFAULT_COMPRESSION=MZ_DEFAULT_LEVEL.
+enum {
+  MZ_NO_COMPRESSION = 0,
+  MZ_BEST_SPEED = 1,
+  MZ_BEST_COMPRESSION = 9,
+  MZ_UBER_COMPRESSION = 10,
+  MZ_DEFAULT_LEVEL = 6,
+  MZ_DEFAULT_COMPRESSION = -1
+};
+
+// Window bits
+#define MZ_DEFAULT_WINDOW_BITS 15
+
+struct mz_internal_state;
+
+// Compression/decompression stream struct.
+typedef struct mz_stream_s {
+  const unsigned char *next_in;  // pointer to next byte to read
+  unsigned int avail_in;         // number of bytes available at next_in
+  mz_ulong total_in;             // total number of bytes consumed so far
+
+  unsigned char *next_out;  // pointer to next byte to write
+  unsigned int avail_out;   // number of bytes that can be written to next_out
+  mz_ulong total_out;       // total number of bytes produced so far
+
+  char *msg;                        // error msg (unused)
+  struct mz_internal_state *state;  // internal state, allocated by zalloc/zfree
+
+  mz_alloc_func
+      zalloc;          // optional heap allocation function (defaults to malloc)
+  mz_free_func zfree;  // optional heap free function (defaults to free)
+  void *opaque;        // heap alloc function user pointer
+
+  int data_type;      // data_type (unused)
+  mz_ulong adler;     // adler32 of the source or uncompressed data
+  mz_ulong reserved;  // not used
+} mz_stream;
+
+typedef mz_stream *mz_streamp;
+
+// Returns the version string of miniz.c.
+const char *mz_version(void);
+
+// mz_deflateInit() initializes a compressor with default options:
+// Parameters:
+//  pStream must point to an initialized mz_stream struct.
+//  level must be between [MZ_NO_COMPRESSION, MZ_BEST_COMPRESSION].
+//  level 1 enables a specially optimized compression function that's been
+//  optimized purely for performance, not ratio.
+//  (This special func. is currently only enabled when
+//  MINIZ_USE_UNALIGNED_LOADS_AND_STORES and MINIZ_LITTLE_ENDIAN are defined.)
+// Return values:
+//  MZ_OK on success.
+//  MZ_STREAM_ERROR if the stream is bogus.
+//  MZ_PARAM_ERROR if the input parameters are bogus.
+//  MZ_MEM_ERROR on out of memory.
+int mz_deflateInit(mz_streamp pStream, int level);
+
+// mz_deflateInit2() is like mz_deflateInit(), except with more control:
+// Additional parameters:
+//   method must be MZ_DEFLATED
+//   window_bits must be MZ_DEFAULT_WINDOW_BITS (to wrap the deflate stream with
+//   zlib header/adler-32 footer) or -MZ_DEFAULT_WINDOW_BITS (raw deflate/no
+//   header or footer)
+//   mem_level must be between [1, 9] (it's checked but ignored by miniz.c)
+int mz_deflateInit2(mz_streamp pStream, int level, int method, int window_bits,
+                    int mem_level, int strategy);
+
+// Quickly resets a compressor without having to reallocate anything. Same as
+// calling mz_deflateEnd() followed by mz_deflateInit()/mz_deflateInit2().
+int mz_deflateReset(mz_streamp pStream);
+
+// mz_deflate() compresses the input to output, consuming as much of the input
+// and producing as much output as possible.
+// Parameters:
+//   pStream is the stream to read from and write to. You must initialize/update
+//   the next_in, avail_in, next_out, and avail_out members.
+//   flush may be MZ_NO_FLUSH, MZ_PARTIAL_FLUSH/MZ_SYNC_FLUSH, MZ_FULL_FLUSH, or
+//   MZ_FINISH.
+// Return values:
+//   MZ_OK on success (when flushing, or if more input is needed but not
+//   available, and/or there's more output to be written but the output buffer
+//   is full).
+//   MZ_STREAM_END if all input has been consumed and all output bytes have been
+//   written. Don't call mz_deflate() on the stream anymore.
+//   MZ_STREAM_ERROR if the stream is bogus.
+//   MZ_PARAM_ERROR if one of the parameters is invalid.
+//   MZ_BUF_ERROR if no forward progress is possible because the input and/or
+//   output buffers are empty. (Fill up the input buffer or free up some output
+//   space and try again.)
+int mz_deflate(mz_streamp pStream, int flush);
+
+// mz_deflateEnd() deinitializes a compressor:
+// Return values:
+//  MZ_OK on success.
+//  MZ_STREAM_ERROR if the stream is bogus.
+int mz_deflateEnd(mz_streamp pStream);
+
+// mz_deflateBound() returns a (very) conservative upper bound on the amount of
+// data that could be generated by deflate(), assuming flush is set to only
+// MZ_NO_FLUSH or MZ_FINISH.
+mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len);
+
+// Single-call compression functions mz_compress() and mz_compress2():
+// Returns MZ_OK on success, or one of the error codes from mz_deflate() on
+// failure.
+int mz_compress(unsigned char *pDest, mz_ulong *pDest_len,
+                const unsigned char *pSource, mz_ulong source_len);
+int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len,
+                 const unsigned char *pSource, mz_ulong source_len, int level);
+
+// mz_compressBound() returns a (very) conservative upper bound on the amount of
+// data that could be generated by calling mz_compress().
+mz_ulong mz_compressBound(mz_ulong source_len);
+
+// Initializes a decompressor.
+int mz_inflateInit(mz_streamp pStream);
+
+// mz_inflateInit2() is like mz_inflateInit() with an additional option that
+// controls the window size and whether or not the stream has been wrapped with
+// a zlib header/footer:
+// window_bits must be MZ_DEFAULT_WINDOW_BITS (to parse zlib header/footer) or
+// -MZ_DEFAULT_WINDOW_BITS (raw deflate).
+int mz_inflateInit2(mz_streamp pStream, int window_bits);
+
+// Decompresses the input stream to the output, consuming only as much of the
+// input as needed, and writing as much to the output as possible.
+// Parameters:
+//   pStream is the stream to read from and write to. You must initialize/update
+//   the next_in, avail_in, next_out, and avail_out members.
+//   flush may be MZ_NO_FLUSH, MZ_SYNC_FLUSH, or MZ_FINISH.
+//   On the first call, if flush is MZ_FINISH it's assumed the input and output
+//   buffers are both sized large enough to decompress the entire stream in a
+//   single call (this is slightly faster).
+//   MZ_FINISH implies that there are no more source bytes available beside
+//   what's already in the input buffer, and that the output buffer is large
+//   enough to hold the rest of the decompressed data.
+// Return values:
+//   MZ_OK on success. Either more input is needed but not available, and/or
+//   there's more output to be written but the output buffer is full.
+//   MZ_STREAM_END if all needed input has been consumed and all output bytes
+//   have been written. For zlib streams, the adler-32 of the decompressed data
+//   has also been verified.
+//   MZ_STREAM_ERROR if the stream is bogus.
+//   MZ_DATA_ERROR if the deflate stream is invalid.
+//   MZ_PARAM_ERROR if one of the parameters is invalid.
+//   MZ_BUF_ERROR if no forward progress is possible because the input buffer is
+//   empty but the inflater needs more input to continue, or if the output
+//   buffer is not large enough. Call mz_inflate() again
+//   with more input data, or with more room in the output buffer (except when
+//   using single call decompression, described above).
+int mz_inflate(mz_streamp pStream, int flush);
+
+// Deinitializes a decompressor.
+int mz_inflateEnd(mz_streamp pStream);
+
+// Single-call decompression.
+// Returns MZ_OK on success, or one of the error codes from mz_inflate() on
+// failure.
+int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len,
+                  const unsigned char *pSource, mz_ulong source_len);
+
+// Returns a string description of the specified error code, or NULL if the
+// error code is invalid.
+const char *mz_error(int err);
+
+// Redefine zlib-compatible names to miniz equivalents, so miniz.c can be used
+// as a drop-in replacement for the subset of zlib that miniz.c supports.
+// Define MINIZ_NO_ZLIB_COMPATIBLE_NAMES to disable zlib-compatibility if you
+// use zlib in the same project.
+#ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+typedef unsigned char Byte;
+typedef unsigned int uInt;
+typedef mz_ulong uLong;
+typedef Byte Bytef;
+typedef uInt uIntf;
+typedef char charf;
+typedef int intf;
+typedef void *voidpf;
+typedef uLong uLongf;
+typedef void *voidp;
+typedef void *const voidpc;
+#define Z_NULL 0
+#define Z_NO_FLUSH MZ_NO_FLUSH
+#define Z_PARTIAL_FLUSH MZ_PARTIAL_FLUSH
+#define Z_SYNC_FLUSH MZ_SYNC_FLUSH
+#define Z_FULL_FLUSH MZ_FULL_FLUSH
+#define Z_FINISH MZ_FINISH
+#define Z_BLOCK MZ_BLOCK
+#define Z_OK MZ_OK
+#define Z_STREAM_END MZ_STREAM_END
+#define Z_NEED_DICT MZ_NEED_DICT
+#define Z_ERRNO MZ_ERRNO
+#define Z_STREAM_ERROR MZ_STREAM_ERROR
+#define Z_DATA_ERROR MZ_DATA_ERROR
+#define Z_MEM_ERROR MZ_MEM_ERROR
+#define Z_BUF_ERROR MZ_BUF_ERROR
+#define Z_VERSION_ERROR MZ_VERSION_ERROR
+#define Z_PARAM_ERROR MZ_PARAM_ERROR
+#define Z_NO_COMPRESSION MZ_NO_COMPRESSION
+#define Z_BEST_SPEED MZ_BEST_SPEED
+#define Z_BEST_COMPRESSION MZ_BEST_COMPRESSION
+#define Z_DEFAULT_COMPRESSION MZ_DEFAULT_COMPRESSION
+#define Z_DEFAULT_STRATEGY MZ_DEFAULT_STRATEGY
+#define Z_FILTERED MZ_FILTERED
+#define Z_HUFFMAN_ONLY MZ_HUFFMAN_ONLY
+#define Z_RLE MZ_RLE
+#define Z_FIXED MZ_FIXED
+#define Z_DEFLATED MZ_DEFLATED
+#define Z_DEFAULT_WINDOW_BITS MZ_DEFAULT_WINDOW_BITS
+#define alloc_func mz_alloc_func
+#define free_func mz_free_func
+#define internal_state mz_internal_state
+#define z_stream mz_stream
+#define deflateInit mz_deflateInit
+#define deflateInit2 mz_deflateInit2
+#define deflateReset mz_deflateReset
+#define deflate mz_deflate
+#define deflateEnd mz_deflateEnd
+#define deflateBound mz_deflateBound
+#define compress mz_compress
+#define compress2 mz_compress2
+#define compressBound mz_compressBound
+#define inflateInit mz_inflateInit
+#define inflateInit2 mz_inflateInit2
+#define inflate mz_inflate
+#define inflateEnd mz_inflateEnd
+#define uncompress mz_uncompress
+#define crc32 mz_crc32
+#define adler32 mz_adler32
+#define MAX_WBITS 15
+#define MAX_MEM_LEVEL 9
+#define zError mz_error
+#define ZLIB_VERSION MZ_VERSION
+#define ZLIB_VERNUM MZ_VERNUM
+#define ZLIB_VER_MAJOR MZ_VER_MAJOR
+#define ZLIB_VER_MINOR MZ_VER_MINOR
+#define ZLIB_VER_REVISION MZ_VER_REVISION
+#define ZLIB_VER_SUBREVISION MZ_VER_SUBREVISION
+#define zlibVersion mz_version
+#define zlib_version mz_version()
+#endif  // #ifndef MINIZ_NO_ZLIB_COMPATIBLE_NAMES
+
+#endif  // MINIZ_NO_ZLIB_APIS
+
+// ------------------- Types and macros
+
+typedef unsigned char mz_uint8;
+typedef signed short mz_int16;
+typedef unsigned short mz_uint16;
+typedef unsigned int mz_uint32;
+typedef unsigned int mz_uint;
+typedef long long mz_int64;
+typedef unsigned long long mz_uint64;
+typedef int mz_bool;
+
+#define MZ_FALSE (0)
+#define MZ_TRUE (1)
+
+// An attempt to work around MSVC's spammy "warning C4127: conditional
+// expression is constant" message.
+#ifdef _MSC_VER
+#define MZ_MACRO_END while (0, 0)
+#else
+#define MZ_MACRO_END while (0)
+#endif
+
+// ------------------- ZIP archive reading/writing
+
+#ifndef MINIZ_NO_ARCHIVE_APIS
+
+enum {
+  MZ_ZIP_MAX_IO_BUF_SIZE = 64 * 1024,
+  MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE = 260,
+  MZ_ZIP_MAX_ARCHIVE_FILE_COMMENT_SIZE = 256
+};
+
+typedef struct {
+  mz_uint32 m_file_index;
+  mz_uint32 m_central_dir_ofs;
+  mz_uint16 m_version_made_by;
+  mz_uint16 m_version_needed;
+  mz_uint16 m_bit_flag;
+  mz_uint16 m_method;
+#ifndef MINIZ_NO_TIME
+  time_t m_time;
+#endif
+  mz_uint32 m_crc32;
+  mz_uint64 m_comp_size;
+  mz_uint64 m_uncomp_size;
+  mz_uint16 m_internal_attr;
+  mz_uint32 m_external_attr;
+  mz_uint64 m_local_header_ofs;
+  mz_uint32 m_comment_size;
+  char m_filename[MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE];
+  char m_comment[MZ_ZIP_MAX_ARCHIVE_FILE_COMMENT_SIZE];
+} mz_zip_archive_file_stat;
+
+typedef size_t (*mz_file_read_func)(void *pOpaque, mz_uint64 file_ofs,
+                                    void *pBuf, size_t n);
+typedef size_t (*mz_file_write_func)(void *pOpaque, mz_uint64 file_ofs,
+                                     const void *pBuf, size_t n);
+
+struct mz_zip_internal_state_tag;
+typedef struct mz_zip_internal_state_tag mz_zip_internal_state;
+
+typedef enum {
+  MZ_ZIP_MODE_INVALID = 0,
+  MZ_ZIP_MODE_READING = 1,
+  MZ_ZIP_MODE_WRITING = 2,
+  MZ_ZIP_MODE_WRITING_HAS_BEEN_FINALIZED = 3
+} mz_zip_mode;
+
+typedef struct mz_zip_archive_tag {
+  mz_uint64 m_archive_size;
+  mz_uint64 m_central_directory_file_ofs;
+  mz_uint m_total_files;
+  mz_zip_mode m_zip_mode;
+
+  mz_uint m_file_offset_alignment;
+
+  mz_alloc_func m_pAlloc;
+  mz_free_func m_pFree;
+  mz_realloc_func m_pRealloc;
+  void *m_pAlloc_opaque;
+
+  mz_file_read_func m_pRead;
+  mz_file_write_func m_pWrite;
+  void *m_pIO_opaque;
+
+  mz_zip_internal_state *m_pState;
+
+} mz_zip_archive;
+
+typedef enum {
+  MZ_ZIP_FLAG_CASE_SENSITIVE = 0x0100,
+  MZ_ZIP_FLAG_IGNORE_PATH = 0x0200,
+  MZ_ZIP_FLAG_COMPRESSED_DATA = 0x0400,
+  MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY = 0x0800
+} mz_zip_flags;
+
+// ZIP archive reading
+
+// Inits a ZIP archive reader.
+// These functions read and validate the archive's central directory.
+mz_bool mz_zip_reader_init(mz_zip_archive *pZip, mz_uint64 size,
+                           mz_uint32 flags);
+mz_bool mz_zip_reader_init_mem(mz_zip_archive *pZip, const void *pMem,
+                               size_t size, mz_uint32 flags);
+
+#ifndef MINIZ_NO_STDIO
+mz_bool mz_zip_reader_init_file(mz_zip_archive *pZip, const char *pFilename,
+                                mz_uint32 flags);
+#endif
+
+// Returns the total number of files in the archive.
+mz_uint mz_zip_reader_get_num_files(mz_zip_archive *pZip);
+
+// Returns detailed information about an archive file entry.
+mz_bool mz_zip_reader_file_stat(mz_zip_archive *pZip, mz_uint file_index,
+                                mz_zip_archive_file_stat *pStat);
+
+// Determines if an archive file entry is a directory entry.
+mz_bool mz_zip_reader_is_file_a_directory(mz_zip_archive *pZip,
+                                          mz_uint file_index);
+mz_bool mz_zip_reader_is_file_encrypted(mz_zip_archive *pZip,
+                                        mz_uint file_index);
+
+// Retrieves the filename of an archive file entry.
+// Returns the number of bytes written to pFilename, or if filename_buf_size is
+// 0 this function returns the number of bytes needed to fully store the
+// filename.
+mz_uint mz_zip_reader_get_filename(mz_zip_archive *pZip, mz_uint file_index,
+                                   char *pFilename, mz_uint filename_buf_size);
+
+// Attempts to locate a file in the archive's central directory.
+// Valid flags: MZ_ZIP_FLAG_CASE_SENSITIVE, MZ_ZIP_FLAG_IGNORE_PATH
+// Returns -1 if the file cannot be found.
+int mz_zip_reader_locate_file(mz_zip_archive *pZip, const char *pName,
+                              const char *pComment, mz_uint flags);
+
+// Extracts an archive file to a memory buffer using no memory allocation.
+mz_bool mz_zip_reader_extract_to_mem_no_alloc(mz_zip_archive *pZip,
+                                              mz_uint file_index, void *pBuf,
+                                              size_t buf_size, mz_uint flags,
+                                              void *pUser_read_buf,
+                                              size_t user_read_buf_size);
+mz_bool mz_zip_reader_extract_file_to_mem_no_alloc(
+    mz_zip_archive *pZip, const char *pFilename, void *pBuf, size_t buf_size,
+    mz_uint flags, void *pUser_read_buf, size_t user_read_buf_size);
+
+// Extracts an archive file to a memory buffer.
+mz_bool mz_zip_reader_extract_to_mem(mz_zip_archive *pZip, mz_uint file_index,
+                                     void *pBuf, size_t buf_size,
+                                     mz_uint flags);
+mz_bool mz_zip_reader_extract_file_to_mem(mz_zip_archive *pZip,
+                                          const char *pFilename, void *pBuf,
+                                          size_t buf_size, mz_uint flags);
+
+// Extracts an archive file to a dynamically allocated heap buffer.
+void *mz_zip_reader_extract_to_heap(mz_zip_archive *pZip, mz_uint file_index,
+                                    size_t *pSize, mz_uint flags);
+void *mz_zip_reader_extract_file_to_heap(mz_zip_archive *pZip,
+                                         const char *pFilename, size_t *pSize,
+                                         mz_uint flags);
+
+// Extracts an archive file using a callback function to output the file's data.
+mz_bool mz_zip_reader_extract_to_callback(mz_zip_archive *pZip,
+                                          mz_uint file_index,
+                                          mz_file_write_func pCallback,
+                                          void *pOpaque, mz_uint flags);
+mz_bool mz_zip_reader_extract_file_to_callback(mz_zip_archive *pZip,
+                                               const char *pFilename,
+                                               mz_file_write_func pCallback,
+                                               void *pOpaque, mz_uint flags);
+
+#ifndef MINIZ_NO_STDIO
+// Extracts an archive file to a disk file and sets its last accessed and
+// modified times.
+// This function only extracts files, not archive directory records.
+mz_bool mz_zip_reader_extract_to_file(mz_zip_archive *pZip, mz_uint file_index,
+                                      const char *pDst_filename, mz_uint flags);
+mz_bool mz_zip_reader_extract_file_to_file(mz_zip_archive *pZip,
+                                           const char *pArchive_filename,
+                                           const char *pDst_filename,
+                                           mz_uint flags);
+#endif
+
+// Ends archive reading, freeing all allocations, and closing the input archive
+// file if mz_zip_reader_init_file() was used.
+mz_bool mz_zip_reader_end(mz_zip_archive *pZip);
+
+// ZIP archive writing
+
+#ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+
+// Inits a ZIP archive writer.
+mz_bool mz_zip_writer_init(mz_zip_archive *pZip, mz_uint64 existing_size);
+mz_bool mz_zip_writer_init_heap(mz_zip_archive *pZip,
+                                size_t size_to_reserve_at_beginning,
+                                size_t initial_allocation_size);
+
+#ifndef MINIZ_NO_STDIO
+mz_bool mz_zip_writer_init_file(mz_zip_archive *pZip, const char *pFilename,
+                                mz_uint64 size_to_reserve_at_beginning);
+#endif
+
+// Converts a ZIP archive reader object into a writer object, to allow efficient
+// in-place file appends to occur on an existing archive.
+// For archives opened using mz_zip_reader_init_file, pFilename must be the
+// archive's filename so it can be reopened for writing. If the file can't be
+// reopened, mz_zip_reader_end() will be called.
+// For archives opened using mz_zip_reader_init_mem, the memory block must be
+// growable using the realloc callback (which defaults to realloc unless you've
+// overridden it).
+// Finally, for archives opened using mz_zip_reader_init, the mz_zip_archive's
+// user provided m_pWrite function cannot be NULL.
+// Note: In-place archive modification is not recommended unless you know what
+// you're doing, because if execution stops or something goes wrong before
+// the archive is finalized the file's central directory will be hosed.
+mz_bool mz_zip_writer_init_from_reader(mz_zip_archive *pZip,
+                                       const char *pFilename);
+
+// Adds the contents of a memory buffer to an archive. These functions record
+// the current local time into the archive.
+// To add a directory entry, call this method with an archive name ending in a
+// forward slash and an empty buffer.
+// level_and_flags - compression level (0-10, see MZ_BEST_SPEED,
+// MZ_BEST_COMPRESSION, etc.) logically OR'd with zero or more mz_zip_flags, or
+// just set to MZ_DEFAULT_COMPRESSION.
+mz_bool mz_zip_writer_add_mem(mz_zip_archive *pZip, const char *pArchive_name,
+                              const void *pBuf, size_t buf_size,
+                              mz_uint level_and_flags);
+mz_bool mz_zip_writer_add_mem_ex(mz_zip_archive *pZip,
+                                 const char *pArchive_name, const void *pBuf,
+                                 size_t buf_size, const void *pComment,
+                                 mz_uint16 comment_size,
+                                 mz_uint level_and_flags, mz_uint64 uncomp_size,
+                                 mz_uint32 uncomp_crc32);
+
+#ifndef MINIZ_NO_STDIO
+// Adds the contents of a disk file to an archive. This function also records
+// the disk file's modified time into the archive.
+// level_and_flags - compression level (0-10, see MZ_BEST_SPEED,
+// MZ_BEST_COMPRESSION, etc.) logically OR'd with zero or more mz_zip_flags, or
+// just set to MZ_DEFAULT_COMPRESSION.
+mz_bool mz_zip_writer_add_file(mz_zip_archive *pZip, const char *pArchive_name,
+                               const char *pSrc_filename, const void *pComment,
+                               mz_uint16 comment_size, mz_uint level_and_flags);
+#endif
+
+// Adds a file to an archive by fully cloning the data from another archive.
+// This function fully clones the source file's compressed data (no
+// recompression), along with its full filename, extra data, and comment fields.
+mz_bool mz_zip_writer_add_from_zip_reader(mz_zip_archive *pZip,
+                                          mz_zip_archive *pSource_zip,
+                                          mz_uint file_index);
+
+// Finalizes the archive by writing the central directory records followed by
+// the end of central directory record.
+// After an archive is finalized, the only valid call on the mz_zip_archive
+// struct is mz_zip_writer_end().
+// An archive must be manually finalized by calling this function for it to be
+// valid.
+mz_bool mz_zip_writer_finalize_archive(mz_zip_archive *pZip);
+mz_bool mz_zip_writer_finalize_heap_archive(mz_zip_archive *pZip, void **pBuf,
+                                            size_t *pSize);
+
+// Ends archive writing, freeing all allocations, and closing the output file if
+// mz_zip_writer_init_file() was used.
+// Note for the archive to be valid, it must have been finalized before ending.
+mz_bool mz_zip_writer_end(mz_zip_archive *pZip);
+
+// Misc. high-level helper functions:
+
+// mz_zip_add_mem_to_archive_file_in_place() efficiently (but not atomically)
+// appends a memory blob to a ZIP archive.
+// level_and_flags - compression level (0-10, see MZ_BEST_SPEED,
+// MZ_BEST_COMPRESSION, etc.) logically OR'd with zero or more mz_zip_flags, or
+// just set to MZ_DEFAULT_COMPRESSION.
+mz_bool mz_zip_add_mem_to_archive_file_in_place(
+    const char *pZip_filename, const char *pArchive_name, const void *pBuf,
+    size_t buf_size, const void *pComment, mz_uint16 comment_size,
+    mz_uint level_and_flags);
+
+// Reads a single file from an archive into a heap block.
+// Returns NULL on failure.
+void *mz_zip_extract_archive_file_to_heap(const char *pZip_filename,
+                                          const char *pArchive_name,
+                                          size_t *pSize, mz_uint zip_flags);
+
+#endif  // #ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+
+#endif  // #ifndef MINIZ_NO_ARCHIVE_APIS
+
+// ------------------- Low-level Decompression API Definitions
+
+// Decompression flags used by tinfl_decompress().
+// TINFL_FLAG_PARSE_ZLIB_HEADER: If set, the input has a valid zlib header and
+// ends with an adler32 checksum (it's a valid zlib stream). Otherwise, the
+// input is a raw deflate stream.
+// TINFL_FLAG_HAS_MORE_INPUT: If set, there are more input bytes available
+// beyond the end of the supplied input buffer. If clear, the input buffer
+// contains all remaining input.
+// TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF: If set, the output buffer is large
+// enough to hold the entire decompressed stream. If clear, the output buffer is
+// at least the size of the dictionary (typically 32KB).
+// TINFL_FLAG_COMPUTE_ADLER32: Force adler-32 checksum computation of the
+// decompressed bytes.
+enum {
+  TINFL_FLAG_PARSE_ZLIB_HEADER = 1,
+  TINFL_FLAG_HAS_MORE_INPUT = 2,
+  TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF = 4,
+  TINFL_FLAG_COMPUTE_ADLER32 = 8
+};
+
+// High level decompression functions:
+// tinfl_decompress_mem_to_heap() decompresses a block in memory to a heap block
+// allocated via malloc().
+// On entry:
+//  pSrc_buf, src_buf_len: Pointer and size of the Deflate or zlib source data
+//  to decompress.
+// On return:
+//  Function returns a pointer to the decompressed data, or NULL on failure.
+//  *pOut_len will be set to the decompressed data's size, which could be larger
+//  than src_buf_len on uncompressible data.
+//  The caller must call mz_free() on the returned block when it's no longer
+//  needed.
+void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len,
+                                   size_t *pOut_len, int flags);
+
+// tinfl_decompress_mem_to_mem() decompresses a block in memory to another block
+// in memory.
+// Returns TINFL_DECOMPRESS_MEM_TO_MEM_FAILED on failure, or the number of bytes
+// written on success.
+#define TINFL_DECOMPRESS_MEM_TO_MEM_FAILED ((size_t)(-1))
+size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len,
+                                   const void *pSrc_buf, size_t src_buf_len,
+                                   int flags);
+
+// tinfl_decompress_mem_to_callback() decompresses a block in memory to an
+// internal 32KB buffer, and a user provided callback function will be called to
+// flush the buffer.
+// Returns 1 on success or 0 on failure.
+typedef int (*tinfl_put_buf_func_ptr)(const void *pBuf, int len, void *pUser);
+int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size,
+                                     tinfl_put_buf_func_ptr pPut_buf_func,
+                                     void *pPut_buf_user, int flags);
+
+struct tinfl_decompressor_tag;
+typedef struct tinfl_decompressor_tag tinfl_decompressor;
+
+// Max size of LZ dictionary.
+#define TINFL_LZ_DICT_SIZE 32768
+
+// Return status.
+typedef enum {
+  TINFL_STATUS_BAD_PARAM = -3,
+  TINFL_STATUS_ADLER32_MISMATCH = -2,
+  TINFL_STATUS_FAILED = -1,
+  TINFL_STATUS_DONE = 0,
+  TINFL_STATUS_NEEDS_MORE_INPUT = 1,
+  TINFL_STATUS_HAS_MORE_OUTPUT = 2
+} tinfl_status;
+
+// Initializes the decompressor to its initial state.
+#define tinfl_init(r) \
+  do {                \
+    (r)->m_state = 0; \
+  }                   \
+  MZ_MACRO_END
+#define tinfl_get_adler32(r) (r)->m_check_adler32
+
+// Main low-level decompressor coroutine function. This is the only function
+// actually needed for decompression. All the other functions are just
+// high-level helpers for improved usability.
+// This is a universal API, i.e. it can be used as a building block to build any
+// desired higher level decompression API. In the limit case, it can be called
+// once per every byte input or output.
+tinfl_status tinfl_decompress(tinfl_decompressor *r,
+                              const mz_uint8 *pIn_buf_next,
+                              size_t *pIn_buf_size, mz_uint8 *pOut_buf_start,
+                              mz_uint8 *pOut_buf_next, size_t *pOut_buf_size,
+                              const mz_uint32 decomp_flags);
+
+// Internal/private bits follow.
+enum {
+  TINFL_MAX_HUFF_TABLES = 3,
+  TINFL_MAX_HUFF_SYMBOLS_0 = 288,
+  TINFL_MAX_HUFF_SYMBOLS_1 = 32,
+  TINFL_MAX_HUFF_SYMBOLS_2 = 19,
+  TINFL_FAST_LOOKUP_BITS = 10,
+  TINFL_FAST_LOOKUP_SIZE = 1 << TINFL_FAST_LOOKUP_BITS
+};
+
+typedef struct {
+  mz_uint8 m_code_size[TINFL_MAX_HUFF_SYMBOLS_0];
+  mz_int16 m_look_up[TINFL_FAST_LOOKUP_SIZE],
+      m_tree[TINFL_MAX_HUFF_SYMBOLS_0 * 2];
+} tinfl_huff_table;
+
+
+#ifndef MINIZ_HAS_64BIT_REGISTERS
+#	define MINIZ_HAS_64BIT_REGISTERS 0
+#endif
+
+#ifndef TINFL_USE_64BIT_BITBUF
+#	if MINIZ_HAS_64BIT_REGISTERS
+#		define TINFL_USE_64BIT_BITBUF 1
+#	else
+#		define TINFL_USE_64BIT_BITBUF 0
+#	endif
+#endif
+
+#if TINFL_USE_64BIT_BITBUF
+typedef mz_uint64 tinfl_bit_buf_t;
+#define TINFL_BITBUF_SIZE (64)
+#else
+typedef mz_uint32 tinfl_bit_buf_t;
+#define TINFL_BITBUF_SIZE (32)
+#endif
+
+struct tinfl_decompressor_tag {
+  mz_uint32 m_state, m_num_bits, m_zhdr0, m_zhdr1, m_z_adler32, m_final, m_type,
+      m_check_adler32, m_dist, m_counter, m_num_extra,
+      m_table_sizes[TINFL_MAX_HUFF_TABLES];
+  tinfl_bit_buf_t m_bit_buf;
+  size_t m_dist_from_out_buf_start;
+  tinfl_huff_table m_tables[TINFL_MAX_HUFF_TABLES];
+  mz_uint8 m_raw_header[4],
+      m_len_codes[TINFL_MAX_HUFF_SYMBOLS_0 + TINFL_MAX_HUFF_SYMBOLS_1 + 137];
+};
+
+// ------------------- Low-level Compression API Definitions
+
+// Set TDEFL_LESS_MEMORY to 1 to use less memory (compression will be slightly
+// slower, and raw/dynamic blocks will be output more frequently).
+#define TDEFL_LESS_MEMORY 0
+
+// tdefl_init() compression flags logically OR'd together (low 12 bits contain
+// the max. number of probes per dictionary search):
+// TDEFL_DEFAULT_MAX_PROBES: The compressor defaults to 128 dictionary probes
+// per dictionary search. 0=Huffman only, 1=Huffman+LZ (fastest/crap
+// compression), 4095=Huffman+LZ (slowest/best compression).
+enum {
+  TDEFL_HUFFMAN_ONLY = 0,
+  TDEFL_DEFAULT_MAX_PROBES = 128,
+  TDEFL_MAX_PROBES_MASK = 0xFFF
+};
+
+// TDEFL_WRITE_ZLIB_HEADER: If set, the compressor outputs a zlib header before
+// the deflate data, and the Adler-32 of the source data at the end. Otherwise,
+// you'll get raw deflate data.
+// TDEFL_COMPUTE_ADLER32: Always compute the adler-32 of the input data (even
+// when not writing zlib headers).
+// TDEFL_GREEDY_PARSING_FLAG: Set to use faster greedy parsing, instead of more
+// efficient lazy parsing.
+// TDEFL_NONDETERMINISTIC_PARSING_FLAG: Enable to decrease the compressor's
+// initialization time to the minimum, but the output may vary from run to run
+// given the same input (depending on the contents of memory).
+// TDEFL_RLE_MATCHES: Only look for RLE matches (matches with a distance of 1)
+// TDEFL_FILTER_MATCHES: Discards matches <= 5 chars if enabled.
+// TDEFL_FORCE_ALL_STATIC_BLOCKS: Disable usage of optimized Huffman tables.
+// TDEFL_FORCE_ALL_RAW_BLOCKS: Only use raw (uncompressed) deflate blocks.
+// The low 12 bits are reserved to control the max # of hash probes per
+// dictionary lookup (see TDEFL_MAX_PROBES_MASK).
+enum {
+  TDEFL_WRITE_ZLIB_HEADER = 0x01000,
+  TDEFL_COMPUTE_ADLER32 = 0x02000,
+  TDEFL_GREEDY_PARSING_FLAG = 0x04000,
+  TDEFL_NONDETERMINISTIC_PARSING_FLAG = 0x08000,
+  TDEFL_RLE_MATCHES = 0x10000,
+  TDEFL_FILTER_MATCHES = 0x20000,
+  TDEFL_FORCE_ALL_STATIC_BLOCKS = 0x40000,
+  TDEFL_FORCE_ALL_RAW_BLOCKS = 0x80000
+};
+
+// High level compression functions:
+// tdefl_compress_mem_to_heap() compresses a block in memory to a heap block
+// allocated via malloc().
+// On entry:
+//  pSrc_buf, src_buf_len: Pointer and size of source block to compress.
+//  flags: The max match finder probes (default is 128) logically OR'd against
+//  the above flags. Higher probes are slower but improve compression.
+// On return:
+//  Function returns a pointer to the compressed data, or NULL on failure.
+//  *pOut_len will be set to the compressed data's size, which could be larger
+//  than src_buf_len on uncompressible data.
+//  The caller must free() the returned block when it's no longer needed.
+void *tdefl_compress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len,
+                                 size_t *pOut_len, int flags);
+
+// tdefl_compress_mem_to_mem() compresses a block in memory to another block in
+// memory.
+// Returns 0 on failure.
+size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len,
+                                 const void *pSrc_buf, size_t src_buf_len,
+                                 int flags);
+
+// Compresses an image to a compressed PNG file in memory.
+// On entry:
+//  pImage, w, h, and num_chans describe the image to compress. num_chans may be
+//  1, 2, 3, or 4.
+//  The image pitch in bytes per scanline will be w*num_chans. The leftmost
+//  pixel on the top scanline is stored first in memory.
+//  level may range from [0,10], use MZ_NO_COMPRESSION, MZ_BEST_SPEED,
+//  MZ_BEST_COMPRESSION, etc. or a decent default is MZ_DEFAULT_LEVEL
+//  If flip is true, the image will be flipped on the Y axis (useful for OpenGL
+//  apps).
+// On return:
+//  Function returns a pointer to the compressed data, or NULL on failure.
+//  *pLen_out will be set to the size of the PNG image file.
+//  The caller must mz_free() the returned heap block (which will typically be
+//  larger than *pLen_out) when it's no longer needed.
+void *tdefl_write_image_to_png_file_in_memory_ex(const void *pImage, int w,
+                                                 int h, int num_chans,
+                                                 size_t *pLen_out,
+                                                 mz_uint level, mz_bool flip);
+void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h,
+                                              int num_chans, size_t *pLen_out);
+
+// Output stream interface. The compressor uses this interface to write
+// compressed data. It'll typically be called TDEFL_OUT_BUF_SIZE at a time.
+typedef mz_bool (*tdefl_put_buf_func_ptr)(const void *pBuf, int len,
+                                          void *pUser);
+
+// tdefl_compress_mem_to_output() compresses a block to an output stream. The
+// above helpers use this function internally.
+mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len,
+                                     tdefl_put_buf_func_ptr pPut_buf_func,
+                                     void *pPut_buf_user, int flags);
+
+enum {
+  TDEFL_MAX_HUFF_TABLES = 3,
+  TDEFL_MAX_HUFF_SYMBOLS_0 = 288,
+  TDEFL_MAX_HUFF_SYMBOLS_1 = 32,
+  TDEFL_MAX_HUFF_SYMBOLS_2 = 19,
+  TDEFL_LZ_DICT_SIZE = 32768,
+  TDEFL_LZ_DICT_SIZE_MASK = TDEFL_LZ_DICT_SIZE - 1,
+  TDEFL_MIN_MATCH_LEN = 3,
+  TDEFL_MAX_MATCH_LEN = 258
+};
+
+// TDEFL_OUT_BUF_SIZE MUST be large enough to hold a single entire compressed
+// output block (using static/fixed Huffman codes).
+#if TDEFL_LESS_MEMORY
+enum {
+  TDEFL_LZ_CODE_BUF_SIZE = 24 * 1024,
+  TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13) / 10,
+  TDEFL_MAX_HUFF_SYMBOLS = 288,
+  TDEFL_LZ_HASH_BITS = 12,
+  TDEFL_LEVEL1_HASH_SIZE_MASK = 4095,
+  TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3,
+  TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS
+};
+#else
+enum {
+  TDEFL_LZ_CODE_BUF_SIZE = 64 * 1024,
+  TDEFL_OUT_BUF_SIZE = (TDEFL_LZ_CODE_BUF_SIZE * 13) / 10,
+  TDEFL_MAX_HUFF_SYMBOLS = 288,
+  TDEFL_LZ_HASH_BITS = 15,
+  TDEFL_LEVEL1_HASH_SIZE_MASK = 4095,
+  TDEFL_LZ_HASH_SHIFT = (TDEFL_LZ_HASH_BITS + 2) / 3,
+  TDEFL_LZ_HASH_SIZE = 1 << TDEFL_LZ_HASH_BITS
+};
+#endif
+
+// The low-level tdefl functions below may be used directly if the above helper
+// functions aren't flexible enough. The low-level functions don't make any heap
+// allocations, unlike the above helper functions.
+typedef enum {
+  TDEFL_STATUS_BAD_PARAM = -2,
+  TDEFL_STATUS_PUT_BUF_FAILED = -1,
+  TDEFL_STATUS_OKAY = 0,
+  TDEFL_STATUS_DONE = 1
+} tdefl_status;
+
+// Must map to MZ_NO_FLUSH, MZ_SYNC_FLUSH, etc. enums
+typedef enum {
+  TDEFL_NO_FLUSH = 0,
+  TDEFL_SYNC_FLUSH = 2,
+  TDEFL_FULL_FLUSH = 3,
+  TDEFL_FINISH = 4
+} tdefl_flush;
+
+// tdefl's compression state structure.
+typedef struct {
+  tdefl_put_buf_func_ptr m_pPut_buf_func;
+  void *m_pPut_buf_user;
+  mz_uint m_flags, m_max_probes[2];
+  int m_greedy_parsing;
+  mz_uint m_adler32, m_lookahead_pos, m_lookahead_size, m_dict_size;
+  mz_uint8 *m_pLZ_code_buf, *m_pLZ_flags, *m_pOutput_buf, *m_pOutput_buf_end;
+  mz_uint m_num_flags_left, m_total_lz_bytes, m_lz_code_buf_dict_pos, m_bits_in,
+      m_bit_buffer;
+  mz_uint m_saved_match_dist, m_saved_match_len, m_saved_lit,
+      m_output_flush_ofs, m_output_flush_remaining, m_finished, m_block_index,
+      m_wants_to_finish;
+  tdefl_status m_prev_return_status;
+  const void *m_pIn_buf;
+  void *m_pOut_buf;
+  size_t *m_pIn_buf_size, *m_pOut_buf_size;
+  tdefl_flush m_flush;
+  const mz_uint8 *m_pSrc;
+  size_t m_src_buf_left, m_out_buf_ofs;
+  mz_uint8 m_dict[TDEFL_LZ_DICT_SIZE + TDEFL_MAX_MATCH_LEN - 1];
+  mz_uint16 m_huff_count[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+  mz_uint16 m_huff_codes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+  mz_uint8 m_huff_code_sizes[TDEFL_MAX_HUFF_TABLES][TDEFL_MAX_HUFF_SYMBOLS];
+  mz_uint8 m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE];
+  mz_uint16 m_next[TDEFL_LZ_DICT_SIZE];
+  mz_uint16 m_hash[TDEFL_LZ_HASH_SIZE];
+  mz_uint8 m_output_buf[TDEFL_OUT_BUF_SIZE];
+} tdefl_compressor;
+
+// Initializes the compressor.
+// There is no corresponding deinit() function because the tdefl API's do not
+// dynamically allocate memory.
+// pPut_buf_func: If non-NULL, output data will be supplied to the specified
+// callback. In this case, the user should call the tdefl_compress_buffer() API
+// for compression.
+// If pPut_buf_func is NULL the user should always call the tdefl_compress()
+// API.
+// flags: See the above enums (TDEFL_HUFFMAN_ONLY, TDEFL_WRITE_ZLIB_HEADER,
+// etc.)
+tdefl_status tdefl_init(tdefl_compressor *d,
+                        tdefl_put_buf_func_ptr pPut_buf_func,
+                        void *pPut_buf_user, int flags);
+
+// Compresses a block of data, consuming as much of the specified input buffer
+// as possible, and writing as much compressed data to the specified output
+// buffer as possible.
+tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf,
+                            size_t *pIn_buf_size, void *pOut_buf,
+                            size_t *pOut_buf_size, tdefl_flush flush);
+
+// tdefl_compress_buffer() is only usable when the tdefl_init() is called with a
+// non-NULL tdefl_put_buf_func_ptr.
+// tdefl_compress_buffer() always consumes the entire input buffer.
+tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf,
+                                   size_t in_buf_size, tdefl_flush flush);
+
+tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d);
+mz_uint32 tdefl_get_adler32(tdefl_compressor *d);
+
+// Can't use tdefl_create_comp_flags_from_zip_params if MINIZ_NO_ZLIB_APIS is
+// defined, because it uses some of its macros.
+#ifndef MINIZ_NO_ZLIB_APIS
+// Create tdefl_compress() flags given zlib-style compression parameters.
+// level may range from [0,10] (where 10 is absolute max compression, but may be
+// much slower on some files)
+// window_bits may be -15 (raw deflate) or 15 (zlib)
+// strategy may be either MZ_DEFAULT_STRATEGY, MZ_FILTERED, MZ_HUFFMAN_ONLY,
+// MZ_RLE, or MZ_FIXED
+mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits,
+                                                int strategy);
+#endif  // #ifndef MINIZ_NO_ZLIB_APIS
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINIZ_HEADER_INCLUDED
+
+// ------------------- End of Header: Implementation follows. (If you only want
+// the header, define MINIZ_HEADER_FILE_ONLY.)
+
+#ifndef MINIZ_HEADER_FILE_ONLY
+
+typedef unsigned char mz_validate_uint16[sizeof(mz_uint16) == 2 ? 1 : -1];
+typedef unsigned char mz_validate_uint32[sizeof(mz_uint32) == 4 ? 1 : -1];
+typedef unsigned char mz_validate_uint64[sizeof(mz_uint64) == 8 ? 1 : -1];
+
+#include <assert.h>
+#include <string.h>
+
+#define MZ_ASSERT(x) assert(x)
+
+#ifdef MINIZ_NO_MALLOC
+#define MZ_MALLOC(x) NULL
+#define MZ_FREE(x) (void)x, ((void)0)
+#define MZ_REALLOC(p, x) NULL
+#else
+#define MZ_MALLOC(x) malloc(x)
+#define MZ_FREE(x) free(x)
+#define MZ_REALLOC(p, x) realloc(p, x)
+#endif
+
+#define MZ_MAX(a, b) (((a) > (b)) ? (a) : (b))
+#define MZ_MIN(a, b) (((a) < (b)) ? (a) : (b))
+#define MZ_CLEAR_OBJ(obj) memset(&(obj), 0, sizeof(obj))
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+#define MZ_READ_LE16(p) *((const mz_uint16 *)(p))
+#define MZ_READ_LE32(p) *((const mz_uint32 *)(p))
+#else
+#define MZ_READ_LE16(p)                      \
+  ((mz_uint32)(((const mz_uint8 *)(p))[0]) | \
+   ((mz_uint32)(((const mz_uint8 *)(p))[1]) << 8U))
+#define MZ_READ_LE32(p)                               \
+  ((mz_uint32)(((const mz_uint8 *)(p))[0]) |          \
+   ((mz_uint32)(((const mz_uint8 *)(p))[1]) << 8U) |  \
+   ((mz_uint32)(((const mz_uint8 *)(p))[2]) << 16U) | \
+   ((mz_uint32)(((const mz_uint8 *)(p))[3]) << 24U))
+#endif
+
+#ifdef _MSC_VER
+#define MZ_FORCEINLINE __forceinline
+#elif defined(__GNUC__)
+#define MZ_FORCEINLINE inline __attribute__((__always_inline__))
+#else
+#define MZ_FORCEINLINE inline
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------- zlib-style API's
+
+mz_ulong mz_adler32(mz_ulong adler, const unsigned char *ptr, size_t buf_len) {
+  mz_uint32 i, s1 = (mz_uint32)(adler & 0xffff), s2 = (mz_uint32)(adler >> 16);
+  size_t block_len = buf_len % 5552;  // first pass takes the remainder (may be 0)
+  if (!ptr) return MZ_ADLER32_INIT;   // NULL buffer: nothing is read
+  while (buf_len) {
+    for (i = 0; i + 7 < block_len; i += 8, ptr += 8) {  // 8 bytes per iteration
+      s1 += ptr[0], s2 += s1;
+      s1 += ptr[1], s2 += s1;
+      s1 += ptr[2], s2 += s1;
+      s1 += ptr[3], s2 += s1;
+      s1 += ptr[4], s2 += s1;
+      s1 += ptr[5], s2 += s1;
+      s1 += ptr[6], s2 += s1;
+      s1 += ptr[7], s2 += s1;
+    }
+    for (; i < block_len; ++i) s1 += *ptr++, s2 += s1;  // bytes left over
+    s1 %= 65521U, s2 %= 65521U;  // both sums are reduced once per block
+    buf_len -= block_len;
+    block_len = 5552;  // every later pass consumes a full 5552-byte block
+  }
+  return (s2 << 16) + s1;  // result packs s2 above bit 16 and s1 below it
+}
+
+// Karl Malbrain's compact CRC-32. See "A compact CCITT crc16 and crc32 C
+// implementation that balances processor cache usage against speed":
+// http://www.geocities.com/malbrain/
+mz_ulong mz_crc32(mz_ulong crc, const mz_uint8 *ptr, size_t buf_len) {
+  // Nibble-at-a-time CRC-32: each input byte costs two 16-entry table lookups.
+  static const mz_uint32 s_crc32[16] = {
+      0,          0x1db71064, 0x3b6e20c8, 0x26d930ac, 0x76dc4190, 0x6b6b51f4,
+      0x4db26158, 0x5005713c, 0xedb88320, 0xf00f9344, 0xd6d6a3e8, 0xcb61b38c,
+      0x9b64c2b0, 0x86d3d2d4, 0xa00ae278, 0xbdbdf21c};
+  mz_uint32 acc;
+  if (!ptr) return MZ_CRC32_INIT;  // NULL input: hand back the initial value.
+  acc = ~(mz_uint32)crc;
+  for (; buf_len; --buf_len, ++ptr) {
+    acc = (acc >> 4) ^ s_crc32[(acc ^ *ptr) & 0xF];         // low nibble
+    acc = (acc >> 4) ^ s_crc32[(acc & 0xF) ^ (*ptr >> 4)];  // high nibble
+  }
+  return ~acc;
+}
+
+void mz_free(void *p) { MZ_FREE(p); }  // Public wrapper: releases p through the MZ_FREE macro.
+
+#ifndef MINIZ_NO_ZLIB_APIS
+
+static void *def_alloc_func(void *opaque, size_t items, size_t size) {  // Allocation callback: requests items * size bytes via MZ_MALLOC.
+  (void)opaque, (void)items, (void)size;  // Keeps the parameters "used" even when MZ_MALLOC ignores its argument.
+  return MZ_MALLOC(items * size);  // NOTE(review): items * size is not checked for overflow.
+}
+static void def_free_func(void *opaque, void *address) {  // Free callback: releases address via MZ_FREE.
+  (void)opaque, (void)address;  // opaque is never consulted by this callback.
+  MZ_FREE(address);
+}
+static void *def_realloc_func(void *opaque, void *address, size_t items,
+                              size_t size) {  // Realloc callback: resizes address to items * size bytes via MZ_REALLOC.
+  (void)opaque, (void)address, (void)items, (void)size;  // Keeps the parameters "used" even when MZ_REALLOC ignores its arguments.
+  return MZ_REALLOC(address, items * size);  // NOTE(review): items * size is not checked for overflow.
+}
+
+const char *mz_version(void) { return MZ_VERSION; }  // Returns the MZ_VERSION constant.
+
+int mz_deflateInit(mz_streamp pStream, int level) {  // Convenience form of mz_deflateInit2() with default method, window, and strategy.
+  return mz_deflateInit2(pStream, level, MZ_DEFLATED, MZ_DEFAULT_WINDOW_BITS, 9,
+                         MZ_DEFAULT_STRATEGY);  // The literal 9 is the mem_level argument.
+}
+
+int mz_deflateInit2(mz_streamp pStream, int level, int method, int window_bits,
+                    int mem_level, int strategy) {  // Prepares pStream for compression; returns MZ_OK or an MZ_* error code.
+  tdefl_compressor *pComp;
+  mz_uint comp_flags =
+      TDEFL_COMPUTE_ADLER32 |
+      tdefl_create_comp_flags_from_zip_params(level, window_bits, strategy);
+
+  if (!pStream) return MZ_STREAM_ERROR;
+  if ((method != MZ_DEFLATED) || ((mem_level < 1) || (mem_level > 9)) ||
+      ((window_bits != MZ_DEFAULT_WINDOW_BITS) &&
+       (-window_bits != MZ_DEFAULT_WINDOW_BITS)))
+    return MZ_PARAM_ERROR;  // Accepts only MZ_DEFLATED, mem_level 1..9 and +/-MZ_DEFAULT_WINDOW_BITS.
+
+  pStream->data_type = 0;
+  pStream->adler = MZ_ADLER32_INIT;
+  pStream->msg = NULL;
+  pStream->reserved = 0;
+  pStream->total_in = 0;
+  pStream->total_out = 0;
+  if (!pStream->zalloc) pStream->zalloc = def_alloc_func;  // Caller-supplied allocators are kept; defaults fill the gaps.
+  if (!pStream->zfree) pStream->zfree = def_free_func;
+
+  pComp = (tdefl_compressor *)pStream->zalloc(pStream->opaque, 1,
+                                              sizeof(tdefl_compressor));
+  if (!pComp) return MZ_MEM_ERROR;
+
+  pStream->state = (struct mz_internal_state *)pComp;  // The compressor object itself is stored as the stream state.
+
+  if (tdefl_init(pComp, NULL, NULL, comp_flags) != TDEFL_STATUS_OKAY) {
+    mz_deflateEnd(pStream);  // Frees pComp again and clears pStream->state.
+    return MZ_PARAM_ERROR;
+  }
+
+  return MZ_OK;
+}
+
+int mz_deflateReset(mz_streamp pStream) {  // Restarts compression on an initialised stream, reusing its state and flags.
+  if ((!pStream) || (!pStream->state) || (!pStream->zalloc) ||
+      (!pStream->zfree))
+    return MZ_STREAM_ERROR;
+  pStream->total_in = pStream->total_out = 0;
+  tdefl_init((tdefl_compressor *)pStream->state, NULL, NULL,
+             ((tdefl_compressor *)pStream->state)->m_flags);  // Re-initialises with the existing m_flags; the return value is ignored.
+  return MZ_OK;
+}
+
+int mz_deflate(mz_streamp pStream, int flush) {  // Compresses from next_in/avail_in into next_out/avail_out; returns an MZ_* status.
+  size_t in_bytes, out_bytes;
+  mz_ulong orig_total_in, orig_total_out;
+  int mz_status = MZ_OK;
+
+  if ((!pStream) || (!pStream->state) || (flush < 0) || (flush > MZ_FINISH) ||
+      (!pStream->next_out))
+    return MZ_STREAM_ERROR;
+  if (!pStream->avail_out) return MZ_BUF_ERROR;  // No room to write any output.
+
+  if (flush == MZ_PARTIAL_FLUSH) flush = MZ_SYNC_FLUSH;  // A partial flush is handled as a sync flush.
+
+  if (((tdefl_compressor *)pStream->state)->m_prev_return_status ==
+      TDEFL_STATUS_DONE)
+    return (flush == MZ_FINISH) ? MZ_STREAM_END : MZ_BUF_ERROR;  // The compressor already reported DONE on an earlier call.
+
+  orig_total_in = pStream->total_in;  // Snapshots used below to tell whether this call made any progress.
+  orig_total_out = pStream->total_out;
+  for (;;) {
+    tdefl_status defl_status;
+    in_bytes = pStream->avail_in;    // Passed by address; afterwards used as the count of bytes consumed.
+    out_bytes = pStream->avail_out;  // Passed by address; afterwards used as the count of bytes produced.
+
+    defl_status = tdefl_compress((tdefl_compressor *)pStream->state,
+                                 pStream->next_in, &in_bytes, pStream->next_out,
+                                 &out_bytes, (tdefl_flush)flush);
+    pStream->next_in += (mz_uint)in_bytes;
+    pStream->avail_in -= (mz_uint)in_bytes;
+    pStream->total_in += (mz_uint)in_bytes;
+    pStream->adler = tdefl_get_adler32((tdefl_compressor *)pStream->state);
+
+    pStream->next_out += (mz_uint)out_bytes;
+    pStream->avail_out -= (mz_uint)out_bytes;
+    pStream->total_out += (mz_uint)out_bytes;
+
+    if (defl_status < 0) {  // Negative tdefl statuses are treated as failures.
+      mz_status = MZ_STREAM_ERROR;
+      break;
+    } else if (defl_status == TDEFL_STATUS_DONE) {
+      mz_status = MZ_STREAM_END;
+      break;
+    } else if (!pStream->avail_out)  // Output buffer is full: return MZ_OK so the caller can drain it.
+      break;
+    else if ((!pStream->avail_in) && (flush != MZ_FINISH)) {
+      if ((flush) || (pStream->total_in != orig_total_in) ||
+          (pStream->total_out != orig_total_out))
+        break;  // A flush was requested or some bytes moved: report MZ_OK.
+      return MZ_BUF_ERROR;  // Can't make forward progress without some input.
+    }
+  }
+  return mz_status;
+}
+
+int mz_deflateEnd(mz_streamp pStream) {
+  // Releases the compressor state, if any, through the stream's zfree callback.
+  if (!pStream) return MZ_STREAM_ERROR;
+  if (!pStream->state) return MZ_OK;  // Nothing attached to the stream.
+  pStream->zfree(pStream->opaque, pStream->state);
+  pStream->state = NULL;  // A repeated call now takes the early return above.
+  return MZ_OK;
+}
+
+mz_ulong mz_deflateBound(mz_streamp pStream, mz_ulong source_len) {
+  (void)pStream;  // Unused: the estimate depends only on source_len.
+  // Deliberately over-conservative bound on the compressed size: the larger of
+  // +10% and +5 bytes per 31 KB, plus 128. source_len + source_len / 10 equals
+  // (source_len * 110) / 100 exactly, without wrapping a 32-bit mz_ulong.
+  return MZ_MAX(128 + source_len + source_len / 10,
+                128 + source_len + ((source_len / (31 * 1024)) + 1) * 5);
+}
+
+int mz_compress2(unsigned char *pDest, mz_ulong *pDest_len,
+                 const unsigned char *pSource, mz_ulong source_len, int level) {
+  // Single-call compression of pSource into pDest. *pDest_len supplies the
+  // output capacity on entry and receives the compressed size on success.
+  mz_stream strm;
+  int rc;
+
+  // Both sizes are narrowed to mz_uint32 below, so reject anything wider.
+  if ((source_len | *pDest_len) > 0xFFFFFFFFU) return MZ_PARAM_ERROR;
+  memset(&strm, 0, sizeof(strm));
+  strm.next_in = pSource;
+  strm.avail_in = (mz_uint32)source_len;
+  strm.next_out = pDest;
+  strm.avail_out = (mz_uint32)*pDest_len;
+
+  rc = mz_deflateInit(&strm, level);
+  if (rc != MZ_OK) return rc;
+
+  rc = mz_deflate(&strm, MZ_FINISH);
+  if (rc == MZ_STREAM_END) {
+    *pDest_len = strm.total_out;
+    return mz_deflateEnd(&strm);
+  }
+  mz_deflateEnd(&strm);  // Failure path: release the state, then report why.
+  return (rc == MZ_OK) ? MZ_BUF_ERROR : rc;
+}
+
+int mz_compress(unsigned char *pDest, mz_ulong *pDest_len,
+                const unsigned char *pSource, mz_ulong source_len) {  // mz_compress2() at the default compression level.
+  return mz_compress2(pDest, pDest_len, pSource, source_len,
+                      MZ_DEFAULT_COMPRESSION);
+}
+
+mz_ulong mz_compressBound(mz_ulong source_len) {  // Size estimate for a source_len-byte input; needs no stream.
+  return mz_deflateBound(NULL, source_len);  // mz_deflateBound() ignores its stream argument, so NULL is fine.
+}
+
+typedef struct {  // Per-stream decompression state; allocated by mz_inflateInit2() and stored in mz_stream::state.
+  tinfl_decompressor m_decomp;  // Low-level decompressor handed to tinfl_decompress().
+  mz_uint m_dict_ofs, m_dict_avail, m_first_call, m_has_flushed;  // Read offset into m_dict; bytes in m_dict not yet copied out; 1 until the first mz_inflate(); set once MZ_FINISH was requested.
+  int m_window_bits;  // As given to mz_inflateInit2(); a positive value makes mz_inflate() parse a zlib header.
+  mz_uint8 m_dict[TINFL_LZ_DICT_SIZE];  // Buffer tinfl_decompress() writes into before data is copied to the caller.
+  tinfl_status m_last_status;  // Result of the most recent tinfl_decompress() call.
+} inflate_state;
+
+int mz_inflateInit2(mz_streamp pStream, int window_bits) {  // Allocates and initialises decompression state for pStream.
+  inflate_state *pDecomp;
+  if (!pStream) return MZ_STREAM_ERROR;
+  if ((window_bits != MZ_DEFAULT_WINDOW_BITS) &&
+      (-window_bits != MZ_DEFAULT_WINDOW_BITS))
+    return MZ_PARAM_ERROR;  // Only +/-MZ_DEFAULT_WINDOW_BITS is accepted.
+
+  pStream->data_type = 0;
+  pStream->adler = 0;
+  pStream->msg = NULL;
+  pStream->total_in = 0;
+  pStream->total_out = 0;
+  pStream->reserved = 0;
+  if (!pStream->zalloc) pStream->zalloc = def_alloc_func;  // Caller-supplied allocators are kept; defaults fill the gaps.
+  if (!pStream->zfree) pStream->zfree = def_free_func;
+
+  pDecomp = (inflate_state *)pStream->zalloc(pStream->opaque, 1,
+                                             sizeof(inflate_state));
+  if (!pDecomp) return MZ_MEM_ERROR;
+
+  pStream->state = (struct mz_internal_state *)pDecomp;
+
+  tinfl_init(&pDecomp->m_decomp);
+  pDecomp->m_dict_ofs = 0;
+  pDecomp->m_dict_avail = 0;
+  pDecomp->m_last_status = TINFL_STATUS_NEEDS_MORE_INPUT;
+  pDecomp->m_first_call = 1;  // Lets mz_inflate() recognise its first invocation.
+  pDecomp->m_has_flushed = 0;
+  pDecomp->m_window_bits = window_bits;  // The sign is consulted later by mz_inflate().
+
+  return MZ_OK;
+}
+
+int mz_inflateInit(mz_streamp pStream) {  // mz_inflateInit2() with the default (positive) window bits.
+  return mz_inflateInit2(pStream, MZ_DEFAULT_WINDOW_BITS);
+}
+
+int mz_inflate(mz_streamp pStream, int flush) {  // Decompresses from next_in/avail_in into next_out/avail_out; returns an MZ_* status.
+  inflate_state *pState;
+  mz_uint n, first_call, decomp_flags = TINFL_FLAG_COMPUTE_ADLER32;
+  size_t in_bytes, out_bytes, orig_avail_in;
+  tinfl_status status;
+
+  if ((!pStream) || (!pStream->state)) return MZ_STREAM_ERROR;
+  if (flush == MZ_PARTIAL_FLUSH) flush = MZ_SYNC_FLUSH;  // A partial flush is handled as a sync flush.
+  if ((flush) && (flush != MZ_SYNC_FLUSH) && (flush != MZ_FINISH))
+    return MZ_STREAM_ERROR;  // Only 0, MZ_SYNC_FLUSH and MZ_FINISH are accepted here.
+
+  pState = (inflate_state *)pStream->state;
+  if (pState->m_window_bits > 0) decomp_flags |= TINFL_FLAG_PARSE_ZLIB_HEADER;  // Non-positive window bits skip zlib header parsing.
+  orig_avail_in = pStream->avail_in;  // Remembered to detect a call that supplied no input at all.
+
+  first_call = pState->m_first_call;
+  pState->m_first_call = 0;
+  if (pState->m_last_status < 0) return MZ_DATA_ERROR;  // A previous call already failed; keep reporting it.
+
+  if (pState->m_has_flushed && (flush != MZ_FINISH)) return MZ_STREAM_ERROR;  // Once MZ_FINISH was used, every later call must use it too.
+  pState->m_has_flushed |= (flush == MZ_FINISH);
+
+  if ((flush == MZ_FINISH) && (first_call)) {
+    // MZ_FINISH on the first call implies that the input and output buffers are
+    // large enough to hold the entire compressed/decompressed file.
+    decomp_flags |= TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF;
+    in_bytes = pStream->avail_in;
+    out_bytes = pStream->avail_out;
+    status = tinfl_decompress(&pState->m_decomp, pStream->next_in, &in_bytes,
+                              pStream->next_out, pStream->next_out, &out_bytes,
+                              decomp_flags);  // Decodes straight into the caller's buffer, bypassing m_dict.
+    pState->m_last_status = status;
+    pStream->next_in += (mz_uint)in_bytes;
+    pStream->avail_in -= (mz_uint)in_bytes;
+    pStream->total_in += (mz_uint)in_bytes;
+    pStream->adler = tinfl_get_adler32(&pState->m_decomp);
+    pStream->next_out += (mz_uint)out_bytes;
+    pStream->avail_out -= (mz_uint)out_bytes;
+    pStream->total_out += (mz_uint)out_bytes;
+
+    if (status < 0)
+      return MZ_DATA_ERROR;
+    else if (status != TINFL_STATUS_DONE) {
+      pState->m_last_status = TINFL_STATUS_FAILED;  // Marks the stream as failed so later calls return MZ_DATA_ERROR.
+      return MZ_BUF_ERROR;
+    }
+    return MZ_STREAM_END;
+  }
+  // flush != MZ_FINISH then we must assume there's more input.
+  if (flush != MZ_FINISH) decomp_flags |= TINFL_FLAG_HAS_MORE_INPUT;
+
+  if (pState->m_dict_avail) {  // Data decoded by an earlier call is still waiting: hand it over before decoding more.
+    n = MZ_MIN(pState->m_dict_avail, pStream->avail_out);
+    memcpy(pStream->next_out, pState->m_dict + pState->m_dict_ofs, n);
+    pStream->next_out += n;
+    pStream->avail_out -= n;
+    pStream->total_out += n;
+    pState->m_dict_avail -= n;
+    pState->m_dict_ofs = (pState->m_dict_ofs + n) & (TINFL_LZ_DICT_SIZE - 1);  // Wraps the offset within m_dict.
+    return ((pState->m_last_status == TINFL_STATUS_DONE) &&
+            (!pState->m_dict_avail))
+               ? MZ_STREAM_END
+               : MZ_OK;
+  }
+
+  for (;;) {
+    in_bytes = pStream->avail_in;
+    out_bytes = TINFL_LZ_DICT_SIZE - pState->m_dict_ofs;  // Space from the current offset to the end of m_dict.
+
+    status = tinfl_decompress(
+        &pState->m_decomp, pStream->next_in, &in_bytes, pState->m_dict,
+        pState->m_dict + pState->m_dict_ofs, &out_bytes, decomp_flags);
+    pState->m_last_status = status;
+
+    pStream->next_in += (mz_uint)in_bytes;
+    pStream->avail_in -= (mz_uint)in_bytes;
+    pStream->total_in += (mz_uint)in_bytes;
+    pStream->adler = tinfl_get_adler32(&pState->m_decomp);
+
+    pState->m_dict_avail = (mz_uint)out_bytes;  // Bytes just decoded into m_dict.
+
+    n = MZ_MIN(pState->m_dict_avail, pStream->avail_out);  // Copy out as much as the caller's buffer can take.
+    memcpy(pStream->next_out, pState->m_dict + pState->m_dict_ofs, n);
+    pStream->next_out += n;
+    pStream->avail_out -= n;
+    pStream->total_out += n;
+    pState->m_dict_avail -= n;
+    pState->m_dict_ofs = (pState->m_dict_ofs + n) & (TINFL_LZ_DICT_SIZE - 1);
+
+    if (status < 0)
+      return MZ_DATA_ERROR;  // Stream is corrupted (there could be some
+    // uncompressed data left in the output dictionary -
+    // oh well).
+    else if ((status == TINFL_STATUS_NEEDS_MORE_INPUT) && (!orig_avail_in))
+      return MZ_BUF_ERROR;  // Signal caller that we can't make forward progress
+                            // without supplying more input or by setting flush
+                            // to MZ_FINISH.
+    else if (flush == MZ_FINISH) {
+      // The output buffer MUST be large enough to hold the remaining
+      // uncompressed data when flush==MZ_FINISH.
+      if (status == TINFL_STATUS_DONE)
+        return pState->m_dict_avail ? MZ_BUF_ERROR : MZ_STREAM_END;
+      // status here must be TINFL_STATUS_HAS_MORE_OUTPUT, which means there's
+      // at least 1 more byte on the way. If there's no more room left in the
+      // output buffer then something is wrong.
+      else if (!pStream->avail_out)
+        return MZ_BUF_ERROR;
+    } else if ((status == TINFL_STATUS_DONE) || (!pStream->avail_in) ||
+               (!pStream->avail_out) || (pState->m_dict_avail))
+      break;  // Done, out of input, out of output space, or data still pending in m_dict.
+  }
+
+  return ((status == TINFL_STATUS_DONE) && (!pState->m_dict_avail))
+             ? MZ_STREAM_END
+             : MZ_OK;
+}
+
+int mz_inflateEnd(mz_streamp pStream) {
+  // Releases the inflate state, if any, through the stream's zfree callback.
+  if (!pStream) return MZ_STREAM_ERROR;
+  if (!pStream->state) return MZ_OK;  // Nothing attached to the stream.
+  pStream->zfree(pStream->opaque, pStream->state);
+  pStream->state = NULL;  // A repeated call now takes the early return above.
+  return MZ_OK;
+}
+
+int mz_uncompress(unsigned char *pDest, mz_ulong *pDest_len,
+                  const unsigned char *pSource, mz_ulong source_len) {
+  // Single-call decompression of pSource into pDest. *pDest_len supplies the
+  // output capacity on entry and receives the decompressed size on success.
+  mz_stream strm;
+  int rc;
+
+  // Both sizes are narrowed to mz_uint32 below, so reject anything wider.
+  if ((source_len | *pDest_len) > 0xFFFFFFFFU) return MZ_PARAM_ERROR;
+  memset(&strm, 0, sizeof(strm));
+  strm.next_in = pSource;
+  strm.avail_in = (mz_uint32)source_len;
+  strm.next_out = pDest;
+  strm.avail_out = (mz_uint32)*pDest_len;
+
+  rc = mz_inflateInit(&strm);
+  if (rc != MZ_OK) return rc;
+  rc = mz_inflate(&strm, MZ_FINISH);
+  if (rc == MZ_STREAM_END) {
+    *pDest_len = strm.total_out;
+    return mz_inflateEnd(&strm);
+  }
+  mz_inflateEnd(&strm);
+  // A buffer error with all input consumed is reported as MZ_DATA_ERROR.
+  if ((rc == MZ_BUF_ERROR) && (!strm.avail_in)) rc = MZ_DATA_ERROR;
+  return rc;
+}
+
+const char *mz_error(int err) {
+  static struct {
+    int m_code;
+    const char *m_text;
+  } s_messages[] = {{MZ_OK, ""},
+                    {MZ_STREAM_END, "stream end"},
+                    {MZ_NEED_DICT, "need dictionary"},
+                    {MZ_ERRNO, "file error"},
+                    {MZ_STREAM_ERROR, "stream error"},
+                    {MZ_DATA_ERROR, "data error"},
+                    {MZ_MEM_ERROR, "out of memory"},
+                    {MZ_BUF_ERROR, "buf error"},
+                    {MZ_VERSION_ERROR, "version error"},
+                    {MZ_PARAM_ERROR, "parameter error"}};
+  mz_uint slot = 0;  // Linear scan; the first matching entry wins.
+  for (; slot < sizeof(s_messages) / sizeof(s_messages[0]); ++slot)
+    if (err == s_messages[slot].m_code) return s_messages[slot].m_text;
+  return NULL;  // Code not present in the table.
+}
+
+#endif  // MINIZ_NO_ZLIB_APIS
+
+// ------------------- Low-level Decompression (completely independent from all
+// compression API's)
+
+#define TINFL_MEMCPY(d, s, l) memcpy(d, s, l)  // Inflater's copy primitive; currently plain memcpy().
+#define TINFL_MEMSET(p, c, l) memset(p, c, l)  // Inflater's fill primitive; currently plain memset().
+
+#define TINFL_CR_BEGIN  \
+  switch (r->m_state) { \
+    case 0:  // Coroutine entry: dispatches on the saved r->m_state; state 0 starts from the top.
+#define TINFL_CR_RETURN(state_index, result) \
+  do {                                       \
+    status = result;                         \
+    r->m_state = state_index;                \
+    goto common_exit;                        \
+    case state_index:;                       \
+  }                                          \
+  MZ_MACRO_END  // Saves result and state_index, jumps to common_exit; the switch in TINFL_CR_BEGIN later re-enters at `case state_index`.
+#define TINFL_CR_RETURN_FOREVER(state_index, result) \
+  do {                                               \
+    for (;;) {                                       \
+      TINFL_CR_RETURN(state_index, result);          \
+    }                                                \
+  }                                                  \
+  MZ_MACRO_END  // Like TINFL_CR_RETURN, but every re-entry immediately returns result again.
+#define TINFL_CR_FINISH }  // Closes the switch opened by TINFL_CR_BEGIN.
+
+// TODO: If the caller has indicated that there's no more input, and we attempt
+// to read beyond the input buf, then something is wrong with the input because
+// the inflator never reads ahead more than it needs to. Currently
+// TINFL_GET_BYTE() pads the end of the stream with 0's in this scenario
+// (the `c = 0` branch below, taken when TINFL_FLAG_HAS_MORE_INPUT is clear).
+#define TINFL_GET_BYTE(state_index, c)                                 \
+  do {                                                                 \
+    if (pIn_buf_cur >= pIn_buf_end) {                                  \
+      for (;;) {                                                       \
+        if (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT) {                \
+          TINFL_CR_RETURN(state_index, TINFL_STATUS_NEEDS_MORE_INPUT); \
+          if (pIn_buf_cur < pIn_buf_end) {                             \
+            c = *pIn_buf_cur++;                                        \
+            break;                                                     \
+          }                                                            \
+        } else {                                                       \
+          c = 0;                                                       \
+          break;                                                       \
+        }                                                              \
+      }                                                                \
+    } else                                                             \
+      c = *pIn_buf_cur++;                                              \
+  }                                                                    \
+  MZ_MACRO_END  // Reads one input byte into c, suspending with NEEDS_MORE_INPUT while the buffer is empty and more input is promised.
+
+#define TINFL_NEED_BITS(state_index, n)            \
+  do {                                             \
+    mz_uint c;                                     \
+    TINFL_GET_BYTE(state_index, c);                \
+    bit_buf |= (((tinfl_bit_buf_t)c) << num_bits); \
+    num_bits += 8;                                 \
+  } while (num_bits < (mz_uint)(n))  // Appends input bytes above the bits already held until bit_buf has at least n; always reads at least one byte.
+#define TINFL_SKIP_BITS(state_index, n) \
+  do {                                  \
+    if (num_bits < (mz_uint)(n)) {      \
+      TINFL_NEED_BITS(state_index, n);  \
+    }                                   \
+    bit_buf >>= (n);                    \
+    num_bits -= (n);                    \
+  }                                     \
+  MZ_MACRO_END  // Drops the low n bits of bit_buf, refilling first if fewer than n are buffered.
+#define TINFL_GET_BITS(state_index, b, n) \
+  do {                                    \
+    if (num_bits < (mz_uint)(n)) {        \
+      TINFL_NEED_BITS(state_index, n);    \
+    }                                     \
+    b = bit_buf & ((1 << (n)) - 1);       \
+    bit_buf >>= (n);                      \
+    num_bits -= (n);                      \
+  }                                       \
+  MZ_MACRO_END  // Stores the low n bits of bit_buf in b, then drops them; refills first if needed.
+
+// TINFL_HUFF_BITBUF_FILL() is only used rarely, when the number of bytes
+// remaining in the input buffer falls below 2. It reads just enough bytes
+// from the input stream to decode the next Huffman code (and absolutely no
+// more). It works by trying to fully decode a Huffman code using whatever
+// bits are currently present in the bit buffer. If this fails, it reads
+// another byte and tries again, until it succeeds or until the bit buffer
+// contains >=15 bits (deflate's max. Huffman code size).
+// It relies on `temp`, `code_len` and `c` being declared by the enclosing
+// TINFL_HUFF_DECODE() expansion.
+#define TINFL_HUFF_BITBUF_FILL(state_index, pHuff)                     \
+  do {                                                                 \
+    temp = (pHuff)->m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]; \
+    if (temp >= 0) {                                                   \
+      code_len = temp >> 9;                                            \
+      if ((code_len) && (num_bits >= code_len)) break;                 \
+    } else if (num_bits > TINFL_FAST_LOOKUP_BITS) {                    \
+      code_len = TINFL_FAST_LOOKUP_BITS;                               \
+      do {                                                             \
+        temp = (pHuff)->m_tree[~temp + ((bit_buf >> code_len++) & 1)]; \
+      } while ((temp < 0) && (num_bits >= (code_len + 1)));            \
+      if (temp >= 0) break;                                            \
+    }                                                                  \
+    TINFL_GET_BYTE(state_index, c);                                    \
+    bit_buf |= (((tinfl_bit_buf_t)c) << num_bits);                     \
+    num_bits += 8;                                                     \
+  } while (num_bits < 15);
+
+// TINFL_HUFF_DECODE() decodes the next Huffman coded symbol into sym. It's more
+// complex than you would initially expect because the zlib API expects the
+// decompressor to never read beyond the final byte of the deflate stream. (In
+// other words, when this macro wants to read another byte from the input, it
+// REALLY needs another byte in order to fully decode the next Huffman code.)
+// Handling this properly is particularly important on raw deflate (non-zlib)
+// streams, which aren't followed by a byte aligned adler-32.
+// The slow path (TINFL_HUFF_BITBUF_FILL) is only executed at the very end of
+// the input buffer; otherwise two input bytes are loaded at once whenever
+// fewer than 15 bits are buffered.
+#define TINFL_HUFF_DECODE(state_index, sym, pHuff)                             \
+  do {                                                                         \
+    int temp;                                                                  \
+    mz_uint code_len, c;                                                       \
+    if (num_bits < 15) {                                                       \
+      if ((pIn_buf_end - pIn_buf_cur) < 2) {                                   \
+        TINFL_HUFF_BITBUF_FILL(state_index, pHuff);                            \
+      } else {                                                                 \
+        bit_buf |= (((tinfl_bit_buf_t)pIn_buf_cur[0]) << num_bits) |           \
+                   (((tinfl_bit_buf_t)pIn_buf_cur[1]) << (num_bits + 8));      \
+        pIn_buf_cur += 2;                                                      \
+        num_bits += 16;                                                        \
+      }                                                                        \
+    }                                                                          \
+    if ((temp = (pHuff)->m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >= \
+        0)                                                                     \
+      code_len = temp >> 9, temp &= 511;                                       \
+    else {                                                                     \
+      code_len = TINFL_FAST_LOOKUP_BITS;                                       \
+      do {                                                                     \
+        temp = (pHuff)->m_tree[~temp + ((bit_buf >> code_len++) & 1)];         \
+      } while (temp < 0);                                                      \
+    }                                                                          \
+    sym = temp;                                                                \
+    bit_buf >>= code_len;                                                      \
+    num_bits -= code_len;                                                      \
+  }                                                                            \
+  MZ_MACRO_END  // A non-negative m_look_up entry packs (code length << 9) | symbol; a negative one leads into m_tree.
+
+tinfl_status tinfl_decompress(tinfl_decompressor *r,
+                              const mz_uint8 *pIn_buf_next,
+                              size_t *pIn_buf_size, mz_uint8 *pOut_buf_start,
+                              mz_uint8 *pOut_buf_next, size_t *pOut_buf_size,
+                              const mz_uint32 decomp_flags) {
+  static const int s_length_base[31] = {
+      3,  4,  5,  6,  7,  8,  9,  10,  11,  13,  15,  17,  19,  23, 27, 31,
+      35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0,  0};
+  static const int s_length_extra[31] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
+                                         1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4,
+                                         4, 4, 5, 5, 5, 5, 0, 0, 0};
+  static const int s_dist_base[32] = {
+      1,    2,    3,    4,    5,    7,     9,     13,    17,  25,   33,
+      49,   65,   97,   129,  193,  257,   385,   513,   769, 1025, 1537,
+      2049, 3073, 4097, 6145, 8193, 12289, 16385, 24577, 0,   0};
+  static const int s_dist_extra[32] = {0, 0, 0,  0,  1,  1,  2,  2,  3,  3,
+                                       4, 4, 5,  5,  6,  6,  7,  7,  8,  8,
+                                       9, 9, 10, 10, 11, 11, 12, 12, 13, 13};
+  static const mz_uint8 s_length_dezigzag[19] = {
+      16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+  static const int s_min_table_sizes[3] = {257, 1, 4};
+
+  tinfl_status status = TINFL_STATUS_FAILED;
+  mz_uint32 num_bits, dist, counter, num_extra;
+  tinfl_bit_buf_t bit_buf;
+  const mz_uint8 *pIn_buf_cur = pIn_buf_next,
+                 *const pIn_buf_end = pIn_buf_next + *pIn_buf_size;
+  mz_uint8 *pOut_buf_cur = pOut_buf_next,
+           *const pOut_buf_end = pOut_buf_next + *pOut_buf_size;
+  size_t out_buf_size_mask =
+             (decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)
+                 ? (size_t)-1
+                 : ((pOut_buf_next - pOut_buf_start) + *pOut_buf_size) - 1,
+         dist_from_out_buf_start;
+
+  // Ensure the output buffer's size is a power of 2, unless the output buffer
+  // is large enough to hold the entire output file (in which case it doesn't
+  // matter).
+  if (((out_buf_size_mask + 1) & out_buf_size_mask) ||
+      (pOut_buf_next < pOut_buf_start)) {
+    *pIn_buf_size = *pOut_buf_size = 0;
+    return TINFL_STATUS_BAD_PARAM;
+  }
+
+  num_bits = r->m_num_bits;
+  bit_buf = r->m_bit_buf;
+  dist = r->m_dist;
+  counter = r->m_counter;
+  num_extra = r->m_num_extra;
+  dist_from_out_buf_start = r->m_dist_from_out_buf_start;
+  TINFL_CR_BEGIN
+
+  bit_buf = num_bits = dist = counter = num_extra = r->m_zhdr0 = r->m_zhdr1 = 0;
+  r->m_z_adler32 = r->m_check_adler32 = 1;
+  if (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER) {
+    TINFL_GET_BYTE(1, r->m_zhdr0);
+    TINFL_GET_BYTE(2, r->m_zhdr1);
+    counter = (((r->m_zhdr0 * 256 + r->m_zhdr1) % 31 != 0) ||
+               (r->m_zhdr1 & 32) || ((r->m_zhdr0 & 15) != 8));
+    if (!(decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF))
+      counter |= (((1U << (8U + (r->m_zhdr0 >> 4))) > 32768U) ||
+                  ((out_buf_size_mask + 1) <
+                   (size_t)(1ULL << (8U + (r->m_zhdr0 >> 4)))));
+    if (counter) {
+      TINFL_CR_RETURN_FOREVER(36, TINFL_STATUS_FAILED);
+    }
+  }
+
+  do {
+    TINFL_GET_BITS(3, r->m_final, 3);
+    r->m_type = r->m_final >> 1;
+    if (r->m_type == 0) {
+      TINFL_SKIP_BITS(5, num_bits & 7);
+      for (counter = 0; counter < 4; ++counter) {
+        if (num_bits)
+          TINFL_GET_BITS(6, r->m_raw_header[counter], 8);
+        else
+          TINFL_GET_BYTE(7, r->m_raw_header[counter]);
+      }
+      if ((counter = (r->m_raw_header[0] | (r->m_raw_header[1] << 8))) !=
+          (mz_uint)(0xFFFF ^
+                    (r->m_raw_header[2] | (r->m_raw_header[3] << 8)))) {
+        TINFL_CR_RETURN_FOREVER(39, TINFL_STATUS_FAILED);
+      }
+      while ((counter) && (num_bits)) {
+        TINFL_GET_BITS(51, dist, 8);
+        while (pOut_buf_cur >= pOut_buf_end) {
+          TINFL_CR_RETURN(52, TINFL_STATUS_HAS_MORE_OUTPUT);
+        }
+        *pOut_buf_cur++ = (mz_uint8)dist;
+        counter--;
+      }
+      while (counter) {
+        size_t n;
+        while (pOut_buf_cur >= pOut_buf_end) {
+          TINFL_CR_RETURN(9, TINFL_STATUS_HAS_MORE_OUTPUT);
+        }
+        while (pIn_buf_cur >= pIn_buf_end) {
+          if (decomp_flags & TINFL_FLAG_HAS_MORE_INPUT) {
+            TINFL_CR_RETURN(38, TINFL_STATUS_NEEDS_MORE_INPUT);
+          } else {
+            TINFL_CR_RETURN_FOREVER(40, TINFL_STATUS_FAILED);
+          }
+        }
+        n = MZ_MIN(MZ_MIN((size_t)(pOut_buf_end - pOut_buf_cur),
+                          (size_t)(pIn_buf_end - pIn_buf_cur)),
+                   counter);
+        TINFL_MEMCPY(pOut_buf_cur, pIn_buf_cur, n);
+        pIn_buf_cur += n;
+        pOut_buf_cur += n;
+        counter -= (mz_uint)n;
+      }
+    } else if (r->m_type == 3) {
+      TINFL_CR_RETURN_FOREVER(10, TINFL_STATUS_FAILED);
+    } else {
+      if (r->m_type == 1) {
+        mz_uint8 *p = r->m_tables[0].m_code_size;
+        mz_uint i;
+        r->m_table_sizes[0] = 288;
+        r->m_table_sizes[1] = 32;
+        TINFL_MEMSET(r->m_tables[1].m_code_size, 5, 32);
+        for (i = 0; i <= 143; ++i) *p++ = 8;
+        for (; i <= 255; ++i) *p++ = 9;
+        for (; i <= 279; ++i) *p++ = 7;
+        for (; i <= 287; ++i) *p++ = 8;
+      } else {
+        for (counter = 0; counter < 3; counter++) {
+          TINFL_GET_BITS(11, r->m_table_sizes[counter], "\05\05\04"[counter]);
+          r->m_table_sizes[counter] += s_min_table_sizes[counter];
+        }
+        MZ_CLEAR_OBJ(r->m_tables[2].m_code_size);
+        for (counter = 0; counter < r->m_table_sizes[2]; counter++) {
+          mz_uint s;
+          TINFL_GET_BITS(14, s, 3);
+          r->m_tables[2].m_code_size[s_length_dezigzag[counter]] = (mz_uint8)s;
+        }
+        r->m_table_sizes[2] = 19;
+      }
+      for (; (int)r->m_type >= 0; r->m_type--) {
+        int tree_next, tree_cur;
+        tinfl_huff_table *pTable;
+        mz_uint i, j, used_syms, total, sym_index, next_code[17],
+            total_syms[16];
+        pTable = &r->m_tables[r->m_type];
+        MZ_CLEAR_OBJ(total_syms);
+        MZ_CLEAR_OBJ(pTable->m_look_up);
+        MZ_CLEAR_OBJ(pTable->m_tree);
+        for (i = 0; i < r->m_table_sizes[r->m_type]; ++i)
+          total_syms[pTable->m_code_size[i]]++;
+        used_syms = 0, total = 0;
+        next_code[0] = next_code[1] = 0;
+        for (i = 1; i <= 15; ++i) {
+          used_syms += total_syms[i];
+          next_code[i + 1] = (total = ((total + total_syms[i]) << 1));
+        }
+        if ((65536 != total) && (used_syms > 1)) {
+          TINFL_CR_RETURN_FOREVER(35, TINFL_STATUS_FAILED);
+        }
+        for (tree_next = -1, sym_index = 0;
+             sym_index < r->m_table_sizes[r->m_type]; ++sym_index) {
+          mz_uint rev_code = 0, l, cur_code,
+                  code_size = pTable->m_code_size[sym_index];
+          if (!code_size) continue;
+          cur_code = next_code[code_size]++;
+          for (l = code_size; l > 0; l--, cur_code >>= 1)
+            rev_code = (rev_code << 1) | (cur_code & 1);
+          if (code_size <= TINFL_FAST_LOOKUP_BITS) {
+            mz_int16 k = (mz_int16)((code_size << 9) | sym_index);
+            while (rev_code < TINFL_FAST_LOOKUP_SIZE) {
+              pTable->m_look_up[rev_code] = k;
+              rev_code += (1 << code_size);
+            }
+            continue;
+          }
+          if (0 ==
+              (tree_cur = pTable->m_look_up[rev_code &
+                                            (TINFL_FAST_LOOKUP_SIZE - 1)])) {
+            pTable->m_look_up[rev_code & (TINFL_FAST_LOOKUP_SIZE - 1)] =
+                (mz_int16)tree_next;
+            tree_cur = tree_next;
+            tree_next -= 2;
+          }
+          rev_code >>= (TINFL_FAST_LOOKUP_BITS - 1);
+          for (j = code_size; j > (TINFL_FAST_LOOKUP_BITS + 1); j--) {
+            tree_cur -= ((rev_code >>= 1) & 1);
+            if (!pTable->m_tree[-tree_cur - 1]) {
+              pTable->m_tree[-tree_cur - 1] = (mz_int16)tree_next;
+              tree_cur = tree_next;
+              tree_next -= 2;
+            } else
+              tree_cur = pTable->m_tree[-tree_cur - 1];
+          }
+          tree_cur -= ((rev_code >>= 1) & 1);
+          pTable->m_tree[-tree_cur - 1] = (mz_int16)sym_index;
+        }
+        if (r->m_type == 2) {
+          for (counter = 0;
+               counter < (r->m_table_sizes[0] + r->m_table_sizes[1]);) {
+            mz_uint s;
+            TINFL_HUFF_DECODE(16, dist, &r->m_tables[2]);
+            if (dist < 16) {
+              r->m_len_codes[counter++] = (mz_uint8)dist;
+              continue;
+            }
+            if ((dist == 16) && (!counter)) {
+              TINFL_CR_RETURN_FOREVER(17, TINFL_STATUS_FAILED);
+            }
+            num_extra = "\02\03\07"[dist - 16];
+            TINFL_GET_BITS(18, s, num_extra);
+            s += "\03\03\013"[dist - 16];
+            TINFL_MEMSET(r->m_len_codes + counter,
+                         (dist == 16) ? r->m_len_codes[counter - 1] : 0, s);
+            counter += s;
+          }
+          if ((r->m_table_sizes[0] + r->m_table_sizes[1]) != counter) {
+            TINFL_CR_RETURN_FOREVER(21, TINFL_STATUS_FAILED);
+          }
+          TINFL_MEMCPY(r->m_tables[0].m_code_size, r->m_len_codes,
+                       r->m_table_sizes[0]);
+          TINFL_MEMCPY(r->m_tables[1].m_code_size,
+                       r->m_len_codes + r->m_table_sizes[0],
+                       r->m_table_sizes[1]);
+        }
+      }
+      for (;;) {
+        mz_uint8 *pSrc;
+        for (;;) {
+          if (((pIn_buf_end - pIn_buf_cur) < 4) ||
+              ((pOut_buf_end - pOut_buf_cur) < 2)) {
+            TINFL_HUFF_DECODE(23, counter, &r->m_tables[0]);
+            if (counter >= 256) break;
+            while (pOut_buf_cur >= pOut_buf_end) {
+              TINFL_CR_RETURN(24, TINFL_STATUS_HAS_MORE_OUTPUT);
+            }
+            *pOut_buf_cur++ = (mz_uint8)counter;
+          } else {
+            int sym2;
+            mz_uint code_len;
+#if TINFL_USE_64BIT_BITBUF
+            if (num_bits < 30) {
+              bit_buf |=
+                  (((tinfl_bit_buf_t)MZ_READ_LE32(pIn_buf_cur)) << num_bits);
+              pIn_buf_cur += 4;
+              num_bits += 32;
+            }
+#else
+            if (num_bits < 15) {
+              bit_buf |=
+                  (((tinfl_bit_buf_t)MZ_READ_LE16(pIn_buf_cur)) << num_bits);
+              pIn_buf_cur += 2;
+              num_bits += 16;
+            }
+#endif
+            if ((sym2 =
+                     r->m_tables[0]
+                         .m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >=
+                0)
+              code_len = sym2 >> 9;
+            else {
+              code_len = TINFL_FAST_LOOKUP_BITS;
+              do {
+                sym2 = r->m_tables[0]
+                           .m_tree[~sym2 + ((bit_buf >> code_len++) & 1)];
+              } while (sym2 < 0);
+            }
+            counter = sym2;
+            bit_buf >>= code_len;
+            num_bits -= code_len;
+            if (counter & 256) break;
+
+#if !TINFL_USE_64BIT_BITBUF
+            if (num_bits < 15) {
+              bit_buf |=
+                  (((tinfl_bit_buf_t)MZ_READ_LE16(pIn_buf_cur)) << num_bits);
+              pIn_buf_cur += 2;
+              num_bits += 16;
+            }
+#endif
+            if ((sym2 =
+                     r->m_tables[0]
+                         .m_look_up[bit_buf & (TINFL_FAST_LOOKUP_SIZE - 1)]) >=
+                0)
+              code_len = sym2 >> 9;
+            else {
+              code_len = TINFL_FAST_LOOKUP_BITS;
+              do {
+                sym2 = r->m_tables[0]
+                           .m_tree[~sym2 + ((bit_buf >> code_len++) & 1)];
+              } while (sym2 < 0);
+            }
+            bit_buf >>= code_len;
+            num_bits -= code_len;
+
+            pOut_buf_cur[0] = (mz_uint8)counter;
+            if (sym2 & 256) {
+              pOut_buf_cur++;
+              counter = sym2;
+              break;
+            }
+            pOut_buf_cur[1] = (mz_uint8)sym2;
+            pOut_buf_cur += 2;
+          }
+        }
+        if ((counter &= 511) == 256) break;
+
+        num_extra = s_length_extra[counter - 257];
+        counter = s_length_base[counter - 257];
+        if (num_extra) {
+          mz_uint extra_bits;
+          TINFL_GET_BITS(25, extra_bits, num_extra);
+          counter += extra_bits;
+        }
+
+        TINFL_HUFF_DECODE(26, dist, &r->m_tables[1]);
+        num_extra = s_dist_extra[dist];
+        dist = s_dist_base[dist];
+        if (num_extra) {
+          mz_uint extra_bits;
+          TINFL_GET_BITS(27, extra_bits, num_extra);
+          dist += extra_bits;
+        }
+
+        dist_from_out_buf_start = pOut_buf_cur - pOut_buf_start;
+        if ((dist > dist_from_out_buf_start) &&
+            (decomp_flags & TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF)) {
+          TINFL_CR_RETURN_FOREVER(37, TINFL_STATUS_FAILED);
+        }
+
+        pSrc = pOut_buf_start +
+               ((dist_from_out_buf_start - dist) & out_buf_size_mask);
+
+        if ((MZ_MAX(pOut_buf_cur, pSrc) + counter) > pOut_buf_end) {
+          while (counter--) {
+            while (pOut_buf_cur >= pOut_buf_end) {
+              TINFL_CR_RETURN(53, TINFL_STATUS_HAS_MORE_OUTPUT);
+            }
+            *pOut_buf_cur++ =
+                pOut_buf_start[(dist_from_out_buf_start++ - dist) &
+                               out_buf_size_mask];
+          }
+          continue;
+        }
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES
+        else if ((counter >= 9) && (counter <= dist)) {
+          const mz_uint8 *pSrc_end = pSrc + (counter & ~7);
+          do {
+            ((mz_uint32 *)pOut_buf_cur)[0] = ((const mz_uint32 *)pSrc)[0];
+            ((mz_uint32 *)pOut_buf_cur)[1] = ((const mz_uint32 *)pSrc)[1];
+            pOut_buf_cur += 8;
+          } while ((pSrc += 8) < pSrc_end);
+          if ((counter &= 7) < 3) {
+            if (counter) {
+              pOut_buf_cur[0] = pSrc[0];
+              if (counter > 1) pOut_buf_cur[1] = pSrc[1];
+              pOut_buf_cur += counter;
+            }
+            continue;
+          }
+        }
+#endif
+        do {
+          pOut_buf_cur[0] = pSrc[0];
+          pOut_buf_cur[1] = pSrc[1];
+          pOut_buf_cur[2] = pSrc[2];
+          pOut_buf_cur += 3;
+          pSrc += 3;
+        } while ((int)(counter -= 3) > 2);
+        if ((int)counter > 0) {
+          pOut_buf_cur[0] = pSrc[0];
+          if ((int)counter > 1) pOut_buf_cur[1] = pSrc[1];
+          pOut_buf_cur += counter;
+        }
+      }
+    }
+  } while (!(r->m_final & 1));
+  if (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER) {
+    TINFL_SKIP_BITS(32, num_bits & 7);
+    for (counter = 0; counter < 4; ++counter) {
+      mz_uint s;
+      if (num_bits)
+        TINFL_GET_BITS(41, s, 8);
+      else
+        TINFL_GET_BYTE(42, s);
+      r->m_z_adler32 = (r->m_z_adler32 << 8) | s;
+    }
+  }
+  TINFL_CR_RETURN_FOREVER(34, TINFL_STATUS_DONE);
+  TINFL_CR_FINISH
+
+common_exit:
+  r->m_num_bits = num_bits;
+  r->m_bit_buf = bit_buf;
+  r->m_dist = dist;
+  r->m_counter = counter;
+  r->m_num_extra = num_extra;
+  r->m_dist_from_out_buf_start = dist_from_out_buf_start;
+  *pIn_buf_size = pIn_buf_cur - pIn_buf_next;
+  *pOut_buf_size = pOut_buf_cur - pOut_buf_next;
+  if ((decomp_flags &
+       (TINFL_FLAG_PARSE_ZLIB_HEADER | TINFL_FLAG_COMPUTE_ADLER32)) &&
+      (status >= 0)) {
+    const mz_uint8 *ptr = pOut_buf_next;
+    size_t buf_len = *pOut_buf_size;
+    mz_uint32 i, s1 = r->m_check_adler32 & 0xffff,
+                 s2 = r->m_check_adler32 >> 16;
+    size_t block_len = buf_len % 5552;
+    while (buf_len) {
+      for (i = 0; i + 7 < block_len; i += 8, ptr += 8) {
+        s1 += ptr[0], s2 += s1;
+        s1 += ptr[1], s2 += s1;
+        s1 += ptr[2], s2 += s1;
+        s1 += ptr[3], s2 += s1;
+        s1 += ptr[4], s2 += s1;
+        s1 += ptr[5], s2 += s1;
+        s1 += ptr[6], s2 += s1;
+        s1 += ptr[7], s2 += s1;
+      }
+      for (; i < block_len; ++i) s1 += *ptr++, s2 += s1;
+      s1 %= 65521U, s2 %= 65521U;
+      buf_len -= block_len;
+      block_len = 5552;
+    }
+    r->m_check_adler32 = (s2 << 16) + s1;
+    if ((status == TINFL_STATUS_DONE) &&
+        (decomp_flags & TINFL_FLAG_PARSE_ZLIB_HEADER) &&
+        (r->m_check_adler32 != r->m_z_adler32))
+      status = TINFL_STATUS_ADLER32_MISMATCH;
+  }
+  return status;
+}
+
+// Higher level helper functions.
+void *tinfl_decompress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len,
+                                   size_t *pOut_len, int flags) {
+  // Inflates all of pSrc_buf into a heap block that starts at 128 bytes and
+  // doubles whenever a tinfl_decompress() call ends before TINFL_STATUS_DONE.
+  // Success: returns the block and sets *pOut_len to the decompressed size.
+  // Failure: frees the block, zeroes *pOut_len and returns NULL.
+  const int run_flags = (flags & ~TINFL_FLAG_HAS_MORE_INPUT) |
+                        TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF;
+  tinfl_decompressor inflator;
+  mz_uint8 *pOut = NULL;
+  size_t consumed = 0, capacity = 0;
+  *pOut_len = 0;
+  tinfl_init(&inflator);
+  for (;;) {
+    size_t in_avail = src_buf_len - consumed, out_avail = capacity - *pOut_len;
+    void *pGrown;
+    tinfl_status status = tinfl_decompress(
+        &inflator, (const mz_uint8 *)pSrc_buf + consumed, &in_avail, pOut,
+        pOut ? pOut + *pOut_len : NULL, &out_avail, run_flags);
+    // TINFL_FLAG_HAS_MORE_INPUT is cleared above, so a request for more
+    // input is handled the same way as a negative (error) status.
+    if ((status < 0) || (status == TINFL_STATUS_NEEDS_MORE_INPUT)) break;
+    consumed += in_avail;
+    *pOut_len += out_avail;
+    if (status == TINFL_STATUS_DONE) return pOut;
+    // Not finished yet: grow the block and call again from where we stopped.
+    capacity = (capacity * 2 < 128) ? 128 : capacity * 2;
+    pGrown = MZ_REALLOC(pOut, capacity);
+    if (!pGrown) break;
+    pOut = (mz_uint8 *)pGrown;
+  }
+  // Shared failure exit for decode errors and allocation failure.
+  MZ_FREE(pOut);
+  *pOut_len = 0;
+  return NULL;
+}
+
+size_t tinfl_decompress_mem_to_mem(void *pOut_buf, size_t out_buf_len,
+                                   const void *pSrc_buf, size_t src_buf_len,
+                                   int flags) {
+  // Single-call inflate into the caller's buffer. Returns the byte count
+  // written, or TINFL_DECOMPRESS_MEM_TO_MEM_FAILED unless status is DONE.
+  const int run_flags = (flags & ~TINFL_FLAG_HAS_MORE_INPUT) |
+                        TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF;
+  mz_uint8 *pDst = (mz_uint8 *)pOut_buf;
+  tinfl_decompressor inflator;
+  tinfl_init(&inflator);
+  return (tinfl_decompress(&inflator, (const mz_uint8 *)pSrc_buf, &src_buf_len,
+                           pDst, pDst, &out_buf_len, run_flags) ==
+          TINFL_STATUS_DONE) ? out_buf_len : TINFL_DECOMPRESS_MEM_TO_MEM_FAILED;
+}
+
+int tinfl_decompress_mem_to_callback(const void *pIn_buf, size_t *pIn_buf_size,
+                                     tinfl_put_buf_func_ptr pPut_buf_func,
+                                     void *pPut_buf_user, int flags) {
+  // Inflates pIn_buf via a TINFL_LZ_DICT_SIZE heap window, passing each run of
+  // new output to pPut_buf_func. Returns 1 on TINFL_STATUS_DONE, otherwise 0.
+  int result = 0;
+  tinfl_decompressor decomp;
+  mz_uint8 *pDict = (mz_uint8 *)MZ_MALLOC(TINFL_LZ_DICT_SIZE);
+  size_t in_buf_ofs = 0, dict_ofs = 0;
+  if (!pDict) return 0;  // fix: was TINFL_STATUS_FAILED, not a 0/1 result
+  tinfl_init(&decomp);
+  for (;;) {
+    size_t in_buf_size = *pIn_buf_size - in_buf_ofs,
+           dst_buf_size = TINFL_LZ_DICT_SIZE - dict_ofs;
+    tinfl_status status = tinfl_decompress(
+        &decomp, (const mz_uint8 *)pIn_buf + in_buf_ofs, &in_buf_size, pDict,
+        pDict + dict_ofs, &dst_buf_size, flags & ~(TINFL_FLAG_HAS_MORE_INPUT |
+            TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF));
+    in_buf_ofs += in_buf_size;
+    if ((dst_buf_size) &&
+        (!(*pPut_buf_func)(pDict + dict_ofs, (int)dst_buf_size, pPut_buf_user)))
+      break;  // callback returned 0: stop early, result stays 0
+    if (status != TINFL_STATUS_HAS_MORE_OUTPUT) {
+      result = (status == TINFL_STATUS_DONE);
+      break;
+    }
+    dict_ofs = (dict_ofs + dst_buf_size) & (TINFL_LZ_DICT_SIZE - 1);
+  }
+  MZ_FREE(pDict);
+  *pIn_buf_size = in_buf_ofs;  // input bytes actually consumed
+  return result;
+}
+
+// ------------------- Low-level Compression (independent from all decompression
+// APIs)
+
+// Purposely making these tables static for faster init and thread safety.
+static const mz_uint16 s_tdefl_len_sym[256] = {  // LZ length byte -> symbol
+    257, 258, 259, 260, 261, 262, 263, 264, 265, 265, 266, 266, 267, 267, 268,
+    268, 269, 269, 269, 269, 270, 270, 270, 270, 271, 271, 271, 271, 272, 272,
+    272, 272, 273, 273, 273, 273, 273, 273, 273, 273, 274, 274, 274, 274, 274,
+    274, 274, 274, 275, 275, 275, 275, 275, 275, 275, 275, 276, 276, 276, 276,
+    276, 276, 276, 276, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277, 277,
+    277, 277, 277, 277, 277, 278, 278, 278, 278, 278, 278, 278, 278, 278, 278,
+    278, 278, 278, 278, 278, 278, 279, 279, 279, 279, 279, 279, 279, 279, 279,
+    279, 279, 279, 279, 279, 279, 279, 280, 280, 280, 280, 280, 280, 280, 280,
+    280, 280, 280, 280, 280, 280, 280, 280, 281, 281, 281, 281, 281, 281, 281,
+    281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 281,
+    281, 281, 281, 281, 281, 281, 281, 281, 281, 281, 282, 282, 282, 282, 282,
+    282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282,
+    282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 282, 283, 283, 283,
+    283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283,
+    283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 283, 284,
+    284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284,
+    284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284, 284,
+    285};
+
+static const mz_uint8 s_tdefl_len_extra[256] = {  // LZ length byte -> # extra bits
+    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 0};
+
+static const mz_uint8 s_tdefl_small_dist_sym[512] = {  // [match_dist & 511]
+    0,  1,  2,  3,  4,  4,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,
+    8,  8,  8,  8,  8,  9,  9,  9,  9,  9,  9,  9,  9,  10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
+    14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
+    15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
+    16, 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
+    17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17};
+
+static const mz_uint8 s_tdefl_small_dist_extra[512] = {  // presumably extra-bit counts; verify
+    0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+    5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+    6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7};
+
+static const mz_uint8 s_tdefl_large_dist_sym[128] = {  // lookup not in view; verify index
+    0,  0,  18, 19, 20, 20, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24,
+    24, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26,
+    26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27,
+    27, 27, 27, 27, 27, 27, 27, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+    28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
+    28, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
+    29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29};
+
+static const mz_uint8 s_tdefl_large_dist_extra[128] = {  // lookup not in view; verify index
+    0,  0,  8,  8,  9,  9,  9,  9,  10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
+    12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
+    13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13};
+
+// LSD radix sort of tdefl_sym_freq[] by the 16-bit m_key, ascending. Entries
+// ping-pong between pSyms0 and pSyms1; returns the buffer holding the result.
+typedef struct { mz_uint16 m_key, m_sym_index; } tdefl_sym_freq;
+static tdefl_sym_freq *tdefl_radix_sort_syms(mz_uint num_syms,
+                                             tdefl_sym_freq *pSyms0,
+                                             tdefl_sym_freq *pSyms1) {
+  mz_uint32 hist[256 * 2], num_passes, pass, shift, n, b;
+  tdefl_sym_freq *pSrc = pSyms0, *pDst = pSyms1, *pSwap;
+  MZ_CLEAR_OBJ(hist);
+  // Histogram both key bytes at once: low byte in [0,256), high in [256,512).
+  for (n = 0; n < num_syms; n++) {
+    const mz_uint key = pSyms0[n].m_key;
+    hist[key & 0xFF]++;
+    hist[256 + ((key >> 8) & 0xFF)]++;
+  }
+  // If every high byte is zero, the low-byte pass alone sorts the keys.
+  num_passes = (num_syms == hist[256]) ? 1 : 2;
+  for (pass = 0, shift = 0; pass < num_passes; pass++, shift += 8) {
+    const mz_uint32 *pHist = hist + pass * 256;
+    mz_uint offsets[256], running = 0;
+    for (b = 0; b < 256; b++) {  // exclusive prefix sum of bucket sizes
+      offsets[b] = running;
+      running += pHist[b];
+    }
+    for (n = 0; n < num_syms; n++) {
+      const mz_uint bucket = (pSrc[n].m_key >> shift) & 0xFF;
+      pDst[offsets[bucket]++] = pSrc[n];
+    }
+    pSwap = pSrc;
+    pSrc = pDst;
+    pDst = pSwap;
+  }
+  return pSrc;
+}
+
+// tdefl_calculate_minimum_redundancy() originally written by: Alistair Moffat,
+// alistair@cs.mu.oz.au, Jyrki Katajainen, jyrki@diku.dk, November 1996.
+static void tdefl_calculate_minimum_redundancy(tdefl_sym_freq *A, int n) {
+  int root, leaf, next, avbl, used, dpth;  // in place; A sorted ascending by key
+  if (n == 0)
+    return;
+  else if (n == 1) {
+    A[0].m_key = 1;  // a single symbol still gets a 1-bit code
+    return;
+  }
+  A[0].m_key += A[1].m_key;  // pass 1: first internal node = two lowest keys
+  root = 0;  // oldest internal node that has no parent yet
+  leaf = 2;  // next leaf that has not been merged yet
+  for (next = 1; next < n - 1; next++) {
+    if (leaf >= n || A[root].m_key < A[leaf].m_key) {  // first child: node
+      A[next].m_key = A[root].m_key;
+      A[root++].m_key = (mz_uint16)next;  // consumed node now stores its parent
+    } else
+      A[next].m_key = A[leaf++].m_key;  // first child: leaf
+    if (leaf >= n || (root < next && A[root].m_key < A[leaf].m_key)) {
+      A[next].m_key = (mz_uint16)(A[next].m_key + A[root].m_key);
+      A[root++].m_key = (mz_uint16)next;
+    } else
+      A[next].m_key = (mz_uint16)(A[next].m_key + A[leaf++].m_key);
+  }
+  A[n - 2].m_key = 0;  // pass 2: the last node built is the root, depth 0
+  for (next = n - 3; next >= 0; next--)
+    A[next].m_key = A[A[next].m_key].m_key + 1;  // depth = parent's depth + 1
+  avbl = 1;  // pass 3: slots available at the current depth
+  used = dpth = 0;
+  root = n - 2;
+  next = n - 1;
+  while (avbl > 0) {
+    while (root >= 0 && (int)A[root].m_key == dpth) {  // internal nodes here
+      used++;
+      root--;
+    }
+    while (avbl > used) {  // every remaining slot at this depth is a leaf
+      A[next--].m_key = (mz_uint16)(dpth);  // leaf's code length
+      avbl--;
+    }
+    avbl = 2 * used;  // each internal node opens two slots one level down
+    dpth++;
+    used = 0;
+  }
+}
+
+// Caps a code-length histogram at max_code_size and restores a full Kraft sum.
+enum { TDEFL_MAX_SUPPORTED_HUFF_CODESIZE = 32 };
+static void tdefl_huffman_enforce_max_code_size(int *pNum_codes,
+                                                int code_list_len,
+                                                int max_code_size) {
+  int len;
+  mz_uint32 kraft = 0;
+  if (code_list_len <= 1) return;
+  // Count every over-long code as having the longest permitted length.
+  for (len = TDEFL_MAX_SUPPORTED_HUFF_CODESIZE; len > max_code_size; len--)
+    pNum_codes[max_code_size] += pNum_codes[len];
+  // Kraft sum scaled by 2^max_code_size; it must equal 2^max_code_size.
+  for (len = 1; len <= max_code_size; len++)
+    kraft += ((mz_uint32)pNum_codes[len]) << (max_code_size - len);
+  for (; kraft != (1UL << max_code_size); kraft--) {  // each pass: sum - 1
+    pNum_codes[max_code_size]--;
+    for (len = max_code_size - 1; (len > 0) && !pNum_codes[len]; len--) {}
+    if (len > 0) {
+      pNum_codes[len]--;
+      pNum_codes[len + 1] += 2;
+    }
+  }
+}
+
+static void tdefl_optimize_huffman_table(tdefl_compressor *d, int table_num,
+                                         int table_len, int code_size_limit,
+                                         int static_table) {
+  // Fills d->m_huff_codes[table_num] with canonical Huffman codes. When
+  // static_table is set the sizes already in d->m_huff_code_sizes are kept;
+  // otherwise sizes are first derived from d->m_huff_count[table_num].
+  int sym, len, k, num_codes[1 + TDEFL_MAX_SUPPORTED_HUFF_CODESIZE];
+  mz_uint next_code[TDEFL_MAX_SUPPORTED_HUFF_CODESIZE + 1];
+  mz_uint8 *pSizes = &d->m_huff_code_sizes[table_num][0];
+  MZ_CLEAR_OBJ(num_codes);
+  if (static_table) {
+    for (sym = 0; sym < table_len; sym++) num_codes[pSizes[sym]]++;
+  } else {
+    tdefl_sym_freq syms0[TDEFL_MAX_HUFF_SYMBOLS], syms1[TDEFL_MAX_HUFF_SYMBOLS],
+        *pSorted;
+    const mz_uint16 *pCounts = &d->m_huff_count[table_num][0];
+    int num_used = 0;
+    for (sym = 0; sym < table_len; sym++) {
+      if (!pCounts[sym]) continue;
+      syms0[num_used].m_key = (mz_uint16)pCounts[sym];
+      syms0[num_used++].m_sym_index = (mz_uint16)sym;
+    }
+    // Sort by count, turn counts into code lengths, then tally the lengths.
+    pSorted = tdefl_radix_sort_syms(num_used, syms0, syms1);
+    tdefl_calculate_minimum_redundancy(pSorted, num_used);
+    for (k = 0; k < num_used; k++) num_codes[pSorted[k].m_key]++;
+    tdefl_huffman_enforce_max_code_size(num_codes, num_used, code_size_limit);
+    MZ_CLEAR_OBJ(d->m_huff_code_sizes[table_num]);
+    MZ_CLEAR_OBJ(d->m_huff_codes[table_num]);
+    // Walk the sorted list from its high-count end, shortest lengths first.
+    for (len = 1, k = num_used; len <= code_size_limit; len++) {
+      int remaining = num_codes[len];
+      while (remaining-- > 0) pSizes[pSorted[--k].m_sym_index] = (mz_uint8)len;
+    }
+  }
+  next_code[1] = 0;
+  for (k = 0, len = 2; len <= code_size_limit; len++)
+    next_code[len] = k = ((k + num_codes[len - 1]) << 1);
+  for (sym = 0; sym < table_len; sym++) {
+    mz_uint rev_code = 0, code, code_size = pSizes[sym];
+    if (!code_size) continue;
+    code = next_code[code_size]++;
+    // Stored bit-reversed, since TDEFL_PUT_BITS emits low-order bits first.
+    for (len = (int)code_size; len > 0; len--, code >>= 1)
+      rev_code = (rev_code << 1) | (code & 1);
+    d->m_huff_codes[table_num][sym] = (mz_uint16)rev_code;
+  }
+}
+
+#define TDEFL_PUT_BITS(b, l) /* append l-bit value b */    \
+  do { /* expects tdefl_compressor *d in scope */          \
+    mz_uint bits = b;                                      \
+    mz_uint len = l;                                       \
+    MZ_ASSERT(bits <= ((1U << len) - 1U));                 \
+    d->m_bit_buffer |= (bits << d->m_bits_in);             \
+    d->m_bits_in += len;                                   \
+    while (d->m_bits_in >= 8) { /* flush whole bytes */    \
+      if (d->m_pOutput_buf < d->m_pOutput_buf_end)         \
+        *d->m_pOutput_buf++ = (mz_uint8)(d->m_bit_buffer); \
+      d->m_bit_buffer >>= 8; /* dropped if buffer full */  \
+      d->m_bits_in -= 8;                                   \
+    }                                                      \
+  }                                                        \
+  MZ_MACRO_END
+
+// Flushes the pending run of rle_repeat_count repeats of prev_code_size into
+// packed_code_sizes: 1-2 as literals, 3 or more as code 16 plus (count - 3).
+#define TDEFL_RLE_PREV_CODE_SIZE()                                        \
+  do {                                                                    \
+    if (rle_repeat_count >= 3) {                                          \
+      d->m_huff_count[2][16] = (mz_uint16)(d->m_huff_count[2][16] + 1);   \
+      packed_code_sizes[num_packed_code_sizes++] = 16;                    \
+      packed_code_sizes[num_packed_code_sizes++] =                        \
+          (mz_uint8)(rle_repeat_count - 3);                               \
+    } else if (rle_repeat_count) {                                        \
+      d->m_huff_count[2][prev_code_size] = (mz_uint16)(                   \
+          d->m_huff_count[2][prev_code_size] + rle_repeat_count);         \
+      while (rle_repeat_count--)                                          \
+        packed_code_sizes[num_packed_code_sizes++] = prev_code_size;      \
+    }                                                                     \
+    rle_repeat_count = 0;                                                 \
+  } MZ_MACRO_END
+
+// Flushes the pending run of rle_z_count zero code sizes: 1-2 as literal
+// zeros, 3-10 as code 17 + (count - 3), longer runs as code 18 + (count - 11).
+#define TDEFL_RLE_ZERO_CODE_SIZE()                                            \
+  do {                                                                        \
+    if (rle_z_count > 10) {                                                   \
+      d->m_huff_count[2][18] = (mz_uint16)(d->m_huff_count[2][18] + 1);       \
+      packed_code_sizes[num_packed_code_sizes++] = 18;                        \
+      packed_code_sizes[num_packed_code_sizes++] =                            \
+          (mz_uint8)(rle_z_count - 11);                                       \
+    } else if (rle_z_count >= 3) {                                            \
+      d->m_huff_count[2][17] = (mz_uint16)(d->m_huff_count[2][17] + 1);       \
+      packed_code_sizes[num_packed_code_sizes++] = 17;                        \
+      packed_code_sizes[num_packed_code_sizes++] =                            \
+          (mz_uint8)(rle_z_count - 3);                                        \
+    } else if (rle_z_count) {                                                 \
+      d->m_huff_count[2][0] =                                                 \
+          (mz_uint16)(d->m_huff_count[2][0] + rle_z_count);                   \
+      while (rle_z_count--) packed_code_sizes[num_packed_code_sizes++] = 0;   \
+    }                                                                         \
+    rle_z_count = 0;                                                          \
+  } MZ_MACRO_END
+
+static const mz_uint8 s_tdefl_packed_code_size_syms_swizzle[] = {  // RFC 1951 order
+    16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+static void tdefl_start_dynamic_block(tdefl_compressor *d) {
+  int num_lit_codes, num_dist_codes, num_bit_lengths;
+  mz_uint i, total_code_sizes_to_pack, num_packed_code_sizes, rle_z_count,
+      rle_repeat_count, packed_code_sizes_index;
+  mz_uint8
+      code_sizes_to_pack[TDEFL_MAX_HUFF_SYMBOLS_0 + TDEFL_MAX_HUFF_SYMBOLS_1],
+      packed_code_sizes[TDEFL_MAX_HUFF_SYMBOLS_0 + TDEFL_MAX_HUFF_SYMBOLS_1],
+      prev_code_size = 0xFF;
+  // Builds dynamic Huffman tables for this block and writes the block header.
+  d->m_huff_count[0][256] = 1;  // force a code for symbol 256 (end of block)
+  // Tables 0 (literal/length) and 1 (distance): code sizes capped at 15 bits.
+  tdefl_optimize_huffman_table(d, 0, TDEFL_MAX_HUFF_SYMBOLS_0, 15, MZ_FALSE);
+  tdefl_optimize_huffman_table(d, 1, TDEFL_MAX_HUFF_SYMBOLS_1, 15, MZ_FALSE);
+  // Drop trailing unused symbols, keeping at least 257 and 1 codes.
+  for (num_lit_codes = 286; num_lit_codes > 257; num_lit_codes--)
+    if (d->m_huff_code_sizes[0][num_lit_codes - 1]) break;
+  for (num_dist_codes = 30; num_dist_codes > 1; num_dist_codes--)
+    if (d->m_huff_code_sizes[1][num_dist_codes - 1]) break;
+  // Concatenate both size lists so runs can span the table boundary.
+  memcpy(code_sizes_to_pack, &d->m_huff_code_sizes[0][0], num_lit_codes);
+  memcpy(code_sizes_to_pack + num_lit_codes, &d->m_huff_code_sizes[1][0],
+         num_dist_codes);
+  total_code_sizes_to_pack = num_lit_codes + num_dist_codes;
+  num_packed_code_sizes = 0;
+  rle_z_count = 0;
+  rle_repeat_count = 0;
+  // Run-length pack the sizes with codes 16/17/18, counting symbol use.
+  memset(&d->m_huff_count[2][0], 0,
+         sizeof(d->m_huff_count[2][0]) * TDEFL_MAX_HUFF_SYMBOLS_2);
+  for (i = 0; i < total_code_sizes_to_pack; i++) {
+    mz_uint8 code_size = code_sizes_to_pack[i];
+    if (!code_size) {
+      TDEFL_RLE_PREV_CODE_SIZE();
+      if (++rle_z_count == 138) {  // longest run code 18 can express
+        TDEFL_RLE_ZERO_CODE_SIZE();
+      }
+    } else {
+      TDEFL_RLE_ZERO_CODE_SIZE();
+      if (code_size != prev_code_size) {
+        TDEFL_RLE_PREV_CODE_SIZE();
+        d->m_huff_count[2][code_size] =
+            (mz_uint16)(d->m_huff_count[2][code_size] + 1);
+        packed_code_sizes[num_packed_code_sizes++] = code_size;
+      } else if (++rle_repeat_count == 6) {  // longest run code 16 can express
+        TDEFL_RLE_PREV_CODE_SIZE();
+      }
+    }
+    prev_code_size = code_size;
+  }
+  if (rle_repeat_count) {  // flush whichever run is still pending
+    TDEFL_RLE_PREV_CODE_SIZE();
+  } else {
+    TDEFL_RLE_ZERO_CODE_SIZE();
+  }
+  // Table 2 codes the packed sizes themselves; sizes capped at 7 bits.
+  tdefl_optimize_huffman_table(d, 2, TDEFL_MAX_HUFF_SYMBOLS_2, 7, MZ_FALSE);
+
+  TDEFL_PUT_BITS(2, 2);  // 2-bit field, value 2 (dynamic block type in DEFLATE)
+
+  TDEFL_PUT_BITS(num_lit_codes - 257, 5);
+  TDEFL_PUT_BITS(num_dist_codes - 1, 5);
+  // Trim trailing zero sizes in swizzle order, but always send at least 4.
+  for (num_bit_lengths = 18; num_bit_lengths >= 0; num_bit_lengths--)
+    if (d->m_huff_code_sizes
+            [2][s_tdefl_packed_code_size_syms_swizzle[num_bit_lengths]])
+      break;
+  num_bit_lengths = MZ_MAX(4, (num_bit_lengths + 1));
+  TDEFL_PUT_BITS(num_bit_lengths - 4, 4);
+  for (i = 0; (int)i < num_bit_lengths; i++)
+    TDEFL_PUT_BITS(
+        d->m_huff_code_sizes[2][s_tdefl_packed_code_size_syms_swizzle[i]], 3);
+  // Emit the packed sizes; codes 16/17/18 carry 2/3/7 extra bits.
+  for (packed_code_sizes_index = 0;
+       packed_code_sizes_index < num_packed_code_sizes;) {
+    mz_uint code = packed_code_sizes[packed_code_sizes_index++];
+    MZ_ASSERT(code < TDEFL_MAX_HUFF_SYMBOLS_2);
+    TDEFL_PUT_BITS(d->m_huff_codes[2][code], d->m_huff_code_sizes[2][code]);
+    if (code >= 16)
+      TDEFL_PUT_BITS(packed_code_sizes[packed_code_sizes_index++],
+                     "\02\03\07"[code - 16]);
+  }
+}
+
+static void tdefl_start_static_block(tdefl_compressor *d) {
+  // Loads the fixed Huffman code lengths of RFC 1951 section 3.2.6, derives
+  // the codes from them, then emits the 2-bit block type (1).
+  mz_uint8 *pLitlen_sizes = &d->m_huff_code_sizes[0][0];
+
+  memset(pLitlen_sizes, 8, 144);        // symbols 0..143
+  memset(pLitlen_sizes + 144, 9, 112);  // symbols 144..255
+  memset(pLitlen_sizes + 256, 7, 24);   // symbols 256..279
+  memset(pLitlen_sizes + 280, 8, 8);    // symbols 280..287
+  memset(d->m_huff_code_sizes[1], 5, 32);
+
+  tdefl_optimize_huffman_table(d, 0, 288, 15, MZ_TRUE);
+  tdefl_optimize_huffman_table(d, 1, 32, 15, MZ_TRUE);
+
+  TDEFL_PUT_BITS(1, 2);
+}
+
+static const mz_uint mz_bitmasks[17] = {  // mz_bitmasks[n] has the low n bits set
+    0x0,   0x1,   0x3,   0x7,   0xF,    0x1F,   0x3F,   0x7F,   0xFF,
+    0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF};
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN && \
+    MINIZ_HAS_64BIT_REGISTERS
+static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d) {  // 64-bit path
+  mz_uint flags;
+  mz_uint8 *pLZ_codes;
+  mz_uint8 *pOutput_buf = d->m_pOutput_buf;
+  mz_uint8 *pLZ_code_buf_end = d->m_pLZ_code_buf;
+  mz_uint64 bit_buffer = d->m_bit_buffer;
+  mz_uint bits_in = d->m_bits_in;
+  // Encodes all buffered LZ codes with tables 0/1; MZ_FALSE if output is full.
+#define TDEFL_PUT_BITS_FAST(b, l)                \
+  {                                              \
+    bit_buffer |= (((mz_uint64)(b)) << bits_in); \
+    bits_in += (l);                              \
+  }
+  // flags = current flag byte | 0x100 sentinel; a value of 1 means "reload".
+  flags = 1;
+  for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < pLZ_code_buf_end;
+       flags >>= 1) {
+    if (flags == 1) flags = *pLZ_codes++ | 0x100;
+    // Flag bit 1: match (length byte + 16-bit distance); bit 0: literal byte.
+    if (flags & 1) {
+      mz_uint s0, s1, n0, n1, sym, num_extra_bits;
+      mz_uint match_len = pLZ_codes[0],
+              match_dist = *(const mz_uint16 *)(pLZ_codes + 1);
+      pLZ_codes += 3;
+      // Length: table-0 code of s_tdefl_len_sym[match_len], then extra bits.
+      MZ_ASSERT(d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+      TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][s_tdefl_len_sym[match_len]],
+                          d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+      TDEFL_PUT_BITS_FAST(match_len & mz_bitmasks[s_tdefl_len_extra[match_len]],
+                          s_tdefl_len_extra[match_len]);
+
+      // This sequence coaxes MSVC into using cmov's vs. jmp's.
+      s0 = s_tdefl_small_dist_sym[match_dist & 511];
+      n0 = s_tdefl_small_dist_extra[match_dist & 511];
+      s1 = s_tdefl_large_dist_sym[match_dist >> 8];
+      n1 = s_tdefl_large_dist_extra[match_dist >> 8];
+      sym = (match_dist < 512) ? s0 : s1;
+      num_extra_bits = (match_dist < 512) ? n0 : n1;
+      // Distance: table-1 code of sym, then num_extra_bits low distance bits.
+      MZ_ASSERT(d->m_huff_code_sizes[1][sym]);
+      TDEFL_PUT_BITS_FAST(d->m_huff_codes[1][sym],
+                          d->m_huff_code_sizes[1][sym]);
+      TDEFL_PUT_BITS_FAST(match_dist & mz_bitmasks[num_extra_bits],
+                          num_extra_bits);
+    } else {
+      mz_uint lit = *pLZ_codes++;
+      MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+      TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit],
+                          d->m_huff_code_sizes[0][lit]);
+      // Batch up to two more literals while the following flag bits are 0.
+      if (((flags & 2) == 0) && (pLZ_codes < pLZ_code_buf_end)) {
+        flags >>= 1;
+        lit = *pLZ_codes++;
+        MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+        TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit],
+                            d->m_huff_code_sizes[0][lit]);
+
+        if (((flags & 2) == 0) && (pLZ_codes < pLZ_code_buf_end)) {
+          flags >>= 1;
+          lit = *pLZ_codes++;
+          MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+          TDEFL_PUT_BITS_FAST(d->m_huff_codes[0][lit],
+                              d->m_huff_code_sizes[0][lit]);
+        }
+      }
+    }
+    // Give up once the write position reaches the recorded end of output.
+    if (pOutput_buf >= d->m_pOutput_buf_end) return MZ_FALSE;
+    // Store all 8 bytes, advance past the whole bytes, keep the bits left.
+    *(mz_uint64 *)pOutput_buf = bit_buffer;
+    pOutput_buf += (bits_in >> 3);
+    bit_buffer >>= (bits_in & ~7);
+    bits_in &= 7;
+  }
+
+#undef TDEFL_PUT_BITS_FAST
+
+  d->m_pOutput_buf = pOutput_buf;
+  d->m_bits_in = 0;
+  d->m_bit_buffer = 0;
+  // Hand the bits still held locally to TDEFL_PUT_BITS, at most 16 at a time.
+  while (bits_in) {
+    mz_uint32 n = MZ_MIN(bits_in, 16);
+    TDEFL_PUT_BITS((mz_uint)bit_buffer & mz_bitmasks[n], n);
+    bit_buffer >>= n;
+    bits_in -= n;
+  }
+  // Terminate the block with symbol 256 of table 0.
+  TDEFL_PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]);
+
+  return (d->m_pOutput_buf < d->m_pOutput_buf_end);
+}
+#else
+static mz_bool tdefl_compress_lz_codes(tdefl_compressor *d) {  // portable path
+  mz_uint flags;
+  mz_uint8 *pLZ_codes;
+  // Encodes all buffered LZ codes with tables 0/1 through TDEFL_PUT_BITS.
+  flags = 1;  // 1 == only the 0x100 sentinel is left: load a new flag byte
+  for (pLZ_codes = d->m_lz_code_buf; pLZ_codes < d->m_pLZ_code_buf;
+       flags >>= 1) {
+    if (flags == 1) flags = *pLZ_codes++ | 0x100;
+    if (flags & 1) {  // match: length byte + distance stored low byte first
+      mz_uint sym, num_extra_bits;
+      mz_uint match_len = pLZ_codes[0],
+              match_dist = (pLZ_codes[1] | (pLZ_codes[2] << 8));
+      pLZ_codes += 3;
+      // Length: table-0 code of s_tdefl_len_sym[match_len], then extra bits.
+      MZ_ASSERT(d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+      TDEFL_PUT_BITS(d->m_huff_codes[0][s_tdefl_len_sym[match_len]],
+                     d->m_huff_code_sizes[0][s_tdefl_len_sym[match_len]]);
+      TDEFL_PUT_BITS(match_len & mz_bitmasks[s_tdefl_len_extra[match_len]],
+                     s_tdefl_len_extra[match_len]);
+      // Distances below 512 index the small tables; others use dist >> 8.
+      if (match_dist < 512) {
+        sym = s_tdefl_small_dist_sym[match_dist];
+        num_extra_bits = s_tdefl_small_dist_extra[match_dist];
+      } else {
+        sym = s_tdefl_large_dist_sym[match_dist >> 8];
+        num_extra_bits = s_tdefl_large_dist_extra[match_dist >> 8];
+      }
+      MZ_ASSERT(d->m_huff_code_sizes[1][sym]);
+      TDEFL_PUT_BITS(d->m_huff_codes[1][sym], d->m_huff_code_sizes[1][sym]);
+      TDEFL_PUT_BITS(match_dist & mz_bitmasks[num_extra_bits], num_extra_bits);
+    } else {  // literal: a single table-0 code
+      mz_uint lit = *pLZ_codes++;
+      MZ_ASSERT(d->m_huff_code_sizes[0][lit]);
+      TDEFL_PUT_BITS(d->m_huff_codes[0][lit], d->m_huff_code_sizes[0][lit]);
+    }
+  }
+  // Terminate the block with symbol 256 of table 0.
+  TDEFL_PUT_BITS(d->m_huff_codes[0][256], d->m_huff_code_sizes[0][256]);
+  // MZ_TRUE only while the write position is still below the recorded end.
+  return (d->m_pOutput_buf < d->m_pOutput_buf_end);
+}
+#endif  // MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN &&
+        // MINIZ_HAS_64BIT_REGISTERS
+
+// Writes one compressed block: the static or dynamic block start (chosen by
+// static_block), then the LZ codes. Returns tdefl_compress_lz_codes()'s result.
+static mz_bool tdefl_compress_block(tdefl_compressor *d, mz_bool static_block) {
+  if (!static_block) tdefl_start_dynamic_block(d);
+  else tdefl_start_static_block(d);
+  return tdefl_compress_lz_codes(d);
+}
+
+// Ends the current block: Huffman-codes the buffered LZ codes, or stores the
+// bytes raw when raw blocks are forced or coding expanded the data; appends
+// any flush/finish trailer, resets per-block state and delivers the bytes.
+// Returns m_output_flush_remaining, or TDEFL_STATUS_PUT_BUF_FAILED (callers
+// test the result with < 0) when the put-buf callback reports failure.
+static int tdefl_flush_block(tdefl_compressor *d, int flush) {
+  mz_uint saved_bit_buf, saved_bits_in;
+  mz_uint8 *pSaved_output_buf;
+  mz_bool comp_block_succeeded = MZ_FALSE;
+  int n, use_raw_block =
+             ((d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS) != 0) &&
+             (d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size;
+  mz_uint8 *pOutput_buf_start =
+      ((d->m_pPut_buf_func == NULL) &&
+       ((*d->m_pOut_buf_size - d->m_out_buf_ofs) >= TDEFL_OUT_BUF_SIZE))
+          ? ((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs)
+          : d->m_output_buf;
+
+  d->m_pOutput_buf = pOutput_buf_start;
+  d->m_pOutput_buf_end = d->m_pOutput_buf + TDEFL_OUT_BUF_SIZE - 16;
+  MZ_ASSERT(!d->m_output_flush_remaining);
+  d->m_output_flush_ofs = 0;
+  d->m_output_flush_remaining = 0;
+  // Right-align the last flag byte; drop it if it holds no flags at all.
+  *d->m_pLZ_flags = (mz_uint8)(*d->m_pLZ_flags >> d->m_num_flags_left);
+  d->m_pLZ_code_buf -= (d->m_num_flags_left == 8);
+  // The first block of a zlib stream is preceded by the bytes 0x78 0x01.
+  if ((d->m_flags & TDEFL_WRITE_ZLIB_HEADER) && (!d->m_block_index)) {
+    TDEFL_PUT_BITS(0x78, 8);
+    TDEFL_PUT_BITS(0x01, 8);
+  }
+  // Leading block bit: 1 only when this call finishes the stream.
+  TDEFL_PUT_BITS(flush == TDEFL_FINISH, 1);
+  // Snapshot the bit writer so the block can be redone as raw or static.
+  pSaved_output_buf = d->m_pOutput_buf;
+  saved_bit_buf = d->m_bit_buffer;
+  saved_bits_in = d->m_bits_in;
+  if (!use_raw_block)
+    comp_block_succeeded =
+        tdefl_compress_block(d, (d->m_flags & TDEFL_FORCE_ALL_STATIC_BLOCKS) ||
+                                    (d->m_total_lz_bytes < 48));
+  // If the block gets expanded, forget the current contents of the output
+  // buffer and send a raw block instead.
+  if (((use_raw_block) ||
+       ((d->m_total_lz_bytes) && ((d->m_pOutput_buf - pSaved_output_buf + 1U) >=
+                                  d->m_total_lz_bytes))) &&
+      ((d->m_lookahead_pos - d->m_lz_code_buf_dict_pos) <= d->m_dict_size)) {
+    mz_uint i;
+    d->m_pOutput_buf = pSaved_output_buf;
+    d->m_bit_buffer = saved_bit_buf, d->m_bits_in = saved_bits_in;
+    TDEFL_PUT_BITS(0, 2);
+    if (d->m_bits_in) {
+      TDEFL_PUT_BITS(0, 8 - d->m_bits_in);
+    }
+    for (i = 2; i; --i, d->m_total_lz_bytes ^= 0xFFFF) {
+      TDEFL_PUT_BITS(d->m_total_lz_bytes & 0xFFFF, 16);
+    }
+    for (i = 0; i < d->m_total_lz_bytes; ++i) {
+      TDEFL_PUT_BITS(
+          d->m_dict[(d->m_lz_code_buf_dict_pos + i) & TDEFL_LZ_DICT_SIZE_MASK],
+          8);
+    }
+  }
+  // Check for the extremely unlikely (if not impossible) case of the compressed
+  // block not fitting into the output buffer when using dynamic codes.
+  else if (!comp_block_succeeded) {
+    d->m_pOutput_buf = pSaved_output_buf;
+    d->m_bit_buffer = saved_bit_buf, d->m_bits_in = saved_bits_in;
+    tdefl_compress_block(d, MZ_TRUE);
+  }
+  // Trailer: FINISH pads and adds zlib's Adler-32; else an empty raw block.
+  if (flush) {
+    if (flush == TDEFL_FINISH) {
+      if (d->m_bits_in) {
+        TDEFL_PUT_BITS(0, 8 - d->m_bits_in);
+      }
+      if (d->m_flags & TDEFL_WRITE_ZLIB_HEADER) {
+        mz_uint i, a = d->m_adler32;
+        for (i = 0; i < 4; i++) {
+          TDEFL_PUT_BITS((a >> 24) & 0xFF, 8);
+          a <<= 8;
+        }
+      }
+    } else {
+      mz_uint i, z = 0;
+      TDEFL_PUT_BITS(0, 3);
+      if (d->m_bits_in) {
+        TDEFL_PUT_BITS(0, 8 - d->m_bits_in);
+      }
+      for (i = 2; i; --i, z ^= 0xFFFF) {
+        TDEFL_PUT_BITS(z & 0xFFFF, 16);
+      }
+    }
+  }
+  MZ_ASSERT(d->m_pOutput_buf < d->m_pOutput_buf_end);
+  memset(&d->m_huff_count[0][0], 0,
+         sizeof(d->m_huff_count[0][0]) * TDEFL_MAX_HUFF_SYMBOLS_0);
+  memset(&d->m_huff_count[1][0], 0,
+         sizeof(d->m_huff_count[1][0]) * TDEFL_MAX_HUFF_SYMBOLS_1);
+  // Reset the LZ code buffer and per-block counters for the next block.
+  d->m_pLZ_code_buf = d->m_lz_code_buf + 1;
+  d->m_pLZ_flags = d->m_lz_code_buf;
+  d->m_num_flags_left = 8;
+  d->m_lz_code_buf_dict_pos += d->m_total_lz_bytes;
+  d->m_total_lz_bytes = 0;
+  d->m_block_index++;
+  // Deliver the bytes: callback, copy from m_output_buf, or already in place.
+  if ((n = (int)(d->m_pOutput_buf - pOutput_buf_start)) != 0) {
+    if (d->m_pPut_buf_func) {
+      if (d->m_pIn_buf_size)  // tdefl_compress() accepts a NULL pIn_buf_size
+        *d->m_pIn_buf_size = d->m_pSrc - (const mz_uint8 *)d->m_pIn_buf;
+      if (!(*d->m_pPut_buf_func)(d->m_output_buf, n, d->m_pPut_buf_user))
+        return (d->m_prev_return_status = TDEFL_STATUS_PUT_BUF_FAILED);
+    } else if (pOutput_buf_start == d->m_output_buf) {
+      int bytes_to_copy = (int)MZ_MIN(
+          (size_t)n, (size_t)(*d->m_pOut_buf_size - d->m_out_buf_ofs));
+      memcpy((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs, d->m_output_buf,
+             bytes_to_copy);
+      d->m_out_buf_ofs += bytes_to_copy;
+      if ((n -= bytes_to_copy) != 0) {
+        d->m_output_flush_ofs = bytes_to_copy;
+        d->m_output_flush_remaining = n;
+      }
+    } else {
+      d->m_out_buf_ofs += n;
+    }
+  }
+  return d->m_output_flush_remaining;
+}
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES
+#define TDEFL_READ_UNALIGNED_WORD(p) *(const mz_uint16 *)(p)  // direct 16-bit load from p
+static MZ_FORCEINLINE void tdefl_find_match(  // hash-chain search, 16-bit loads
+    tdefl_compressor *d, mz_uint lookahead_pos, mz_uint max_dist,
+    mz_uint max_match_len, mz_uint *pMatch_dist, mz_uint *pMatch_len) {
+  mz_uint dist, pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK,
+                match_len = *pMatch_len, probe_pos = pos, next_probe_pos,
+                probe_len;
+  mz_uint num_probes_left = d->m_max_probes[match_len >= 32];  // budget picked by incoming length
+  const mz_uint16 *s = (const mz_uint16 *)(d->m_dict + pos), *p, *q;
+  mz_uint16 c01 = TDEFL_READ_UNALIGNED_WORD(&d->m_dict[pos + match_len - 1]),
+            s01 = TDEFL_READ_UNALIGNED_WORD(s);  // c01: reject key, s01: first word
+  MZ_ASSERT(max_match_len <= TDEFL_MAX_MATCH_LEN);
+  if (max_match_len <= match_len) return;  // *pMatch_len cannot be improved
+  for (;;) {
+    for (;;) {  // follow d->m_next until a candidate matches c01
+      if (--num_probes_left == 0) return;
+#define TDEFL_PROBE                                                            \
+  next_probe_pos = d->m_next[probe_pos];                                       \
+  if ((!next_probe_pos) ||                                                     \
+      ((dist = (mz_uint16)(lookahead_pos - next_probe_pos)) > max_dist))       \
+    return;                                                                    \
+  probe_pos = next_probe_pos & TDEFL_LZ_DICT_SIZE_MASK;                        \
+  if (TDEFL_READ_UNALIGNED_WORD(&d->m_dict[probe_pos + match_len - 1]) == c01) \
+    break;
+      TDEFL_PROBE;
+      TDEFL_PROBE;
+      TDEFL_PROBE;
+    }
+    if (!dist) break;  // a distance of 0 ends the search
+    q = (const mz_uint16 *)(d->m_dict + probe_pos);
+    if (TDEFL_READ_UNALIGNED_WORD(q) != s01) continue;  // first word differs
+    p = s;
+    probe_len = 32;  // at most 32 rounds of four word compares
+    do {
+    } while (
+        (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) &&
+        (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) &&
+        (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) &&
+        (TDEFL_READ_UNALIGNED_WORD(++p) == TDEFL_READ_UNALIGNED_WORD(++q)) &&
+        (--probe_len > 0));
+    if (!probe_len) {  // every round matched: report the capped maximum
+      *pMatch_dist = dist;
+      *pMatch_len = MZ_MIN(max_match_len, TDEFL_MAX_MATCH_LEN);
+      break;
+    } else if ((probe_len = ((mz_uint)(p - s) * 2) +
+                            (mz_uint)(*(const mz_uint8 *)p ==
+                                      *(const mz_uint8 *)q)) > match_len) {
+      *pMatch_dist = dist;  // whole words * 2, plus one if the next byte matches
+      if ((*pMatch_len = match_len = MZ_MIN(max_match_len, probe_len)) ==
+          max_match_len)
+        break;
+      c01 = TDEFL_READ_UNALIGNED_WORD(&d->m_dict[pos + match_len - 1]);  // new reject key
+    }
+  }
+}
+#else
+static MZ_FORCEINLINE void tdefl_find_match(  // hash-chain search, byte compares
+    tdefl_compressor *d, mz_uint lookahead_pos, mz_uint max_dist,
+    mz_uint max_match_len, mz_uint *pMatch_dist, mz_uint *pMatch_len) {
+  mz_uint dist, pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK,
+                match_len = *pMatch_len, probe_pos = pos, next_probe_pos,
+                probe_len;
+  mz_uint num_probes_left = d->m_max_probes[match_len >= 32];  // budget picked by incoming length
+  const mz_uint8 *s = d->m_dict + pos, *p, *q;
+  mz_uint8 c0 = d->m_dict[pos + match_len], c1 = d->m_dict[pos + match_len - 1];
+  MZ_ASSERT(max_match_len <= TDEFL_MAX_MATCH_LEN);
+  if (max_match_len <= match_len) return;  // *pMatch_len cannot be improved
+  for (;;) {
+    for (;;) {  // follow d->m_next until a candidate matches both c0 and c1
+      if (--num_probes_left == 0) return;
+#define TDEFL_PROBE                                                      \
+  next_probe_pos = d->m_next[probe_pos];                                 \
+  if ((!next_probe_pos) ||                                               \
+      ((dist = (mz_uint16)(lookahead_pos - next_probe_pos)) > max_dist)) \
+    return;                                                              \
+  probe_pos = next_probe_pos & TDEFL_LZ_DICT_SIZE_MASK;                  \
+  if ((d->m_dict[probe_pos + match_len] == c0) &&                        \
+      (d->m_dict[probe_pos + match_len - 1] == c1))                      \
+    break;
+      TDEFL_PROBE;
+      TDEFL_PROBE;
+      TDEFL_PROBE;
+    }
+    if (!dist) break;  // a distance of 0 ends the search
+    p = s;
+    q = d->m_dict + probe_pos;
+    for (probe_len = 0; probe_len < max_match_len; probe_len++)  // count equal bytes
+      if (*p++ != *q++) break;
+    if (probe_len > match_len) {  // longer than the best so far: record it
+      *pMatch_dist = dist;
+      if ((*pMatch_len = match_len = probe_len) == max_match_len) return;
+      c0 = d->m_dict[pos + match_len];  // refresh the two reject bytes
+      c1 = d->m_dict[pos + match_len - 1];
+    }
+  }
+}
+#endif  // #if MINIZ_USE_UNALIGNED_LOADS_AND_STORES
+
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+static mz_bool tdefl_compress_fast(tdefl_compressor *d) {
+  // Faster, minimally featured LZRW1-style match+parse loop with better
+  // register utilization. Intended for applications where raw throughput is
+  // valued more highly than ratio.
+  mz_uint lookahead_pos = d->m_lookahead_pos,
+          lookahead_size = d->m_lookahead_size, dict_size = d->m_dict_size,
+          total_lz_bytes = d->m_total_lz_bytes,
+          num_flags_left = d->m_num_flags_left;
+  mz_uint8 *pLZ_code_buf = d->m_pLZ_code_buf, *pLZ_flags = d->m_pLZ_flags;
+  mz_uint cur_pos = lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK;
+  // The locals above cache fields of d; they are stored back before each
+  while ((d->m_src_buf_left) || ((d->m_flush) && (lookahead_size))) {  // flush and on exit
+    const mz_uint TDEFL_COMP_FAST_LOOKAHEAD_SIZE = 4096;
+    mz_uint dst_pos =
+        (lookahead_pos + lookahead_size) & TDEFL_LZ_DICT_SIZE_MASK;
+    mz_uint num_bytes_to_process = (mz_uint)MZ_MIN(
+        d->m_src_buf_left, TDEFL_COMP_FAST_LOOKAHEAD_SIZE - lookahead_size);
+    d->m_src_buf_left -= num_bytes_to_process;
+    lookahead_size += num_bytes_to_process;
+    // Copy the input into d->m_dict, splitting the copy where dst_pos wraps.
+    while (num_bytes_to_process) {
+      mz_uint32 n = MZ_MIN(TDEFL_LZ_DICT_SIZE - dst_pos, num_bytes_to_process);
+      memcpy(d->m_dict + dst_pos, d->m_pSrc, n);
+      if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1))  // also mirror the head past TDEFL_LZ_DICT_SIZE
+        memcpy(d->m_dict + TDEFL_LZ_DICT_SIZE + dst_pos, d->m_pSrc,
+               MZ_MIN(n, (TDEFL_MAX_MATCH_LEN - 1) - dst_pos));
+      d->m_pSrc += n;
+      dst_pos = (dst_pos + n) & TDEFL_LZ_DICT_SIZE_MASK;
+      num_bytes_to_process -= n;
+    }
+
+    dict_size = MZ_MIN(TDEFL_LZ_DICT_SIZE - lookahead_size, dict_size);
+    if ((!d->m_flush) && (lookahead_size < TDEFL_COMP_FAST_LOOKAHEAD_SIZE))
+      break;  // not flushing and the lookahead is not full yet: wait for input
+    // Main pass: one hash probe per position while at least 4 bytes remain.
+    while (lookahead_size >= 4) {
+      mz_uint cur_match_dist, cur_match_len = 1;
+      mz_uint8 *pCur_dict = d->m_dict + cur_pos;
+      mz_uint first_trigram = (*(const mz_uint32 *)pCur_dict) & 0xFFFFFF;
+      mz_uint hash =
+          (first_trigram ^ (first_trigram >> (24 - (TDEFL_LZ_HASH_BITS - 8)))) &
+          TDEFL_LEVEL1_HASH_SIZE_MASK;
+      mz_uint probe_pos = d->m_hash[hash];
+      d->m_hash[hash] = (mz_uint16)lookahead_pos;
+      // The candidate must lie within dict_size and start with the same 3 bytes.
+      if (((cur_match_dist = (mz_uint16)(lookahead_pos - probe_pos)) <=
+           dict_size) &&
+          ((*(const mz_uint32 *)(d->m_dict +
+                                 (probe_pos &= TDEFL_LZ_DICT_SIZE_MASK)) &
+            0xFFFFFF) == first_trigram)) {
+        const mz_uint16 *p = (const mz_uint16 *)pCur_dict;
+        const mz_uint16 *q = (const mz_uint16 *)(d->m_dict + probe_pos);
+        mz_uint32 probe_len = 32;  // at most 32 rounds of four word compares
+        do {
+        } while ((TDEFL_READ_UNALIGNED_WORD(++p) ==
+                  TDEFL_READ_UNALIGNED_WORD(++q)) &&
+                 (TDEFL_READ_UNALIGNED_WORD(++p) ==
+                  TDEFL_READ_UNALIGNED_WORD(++q)) &&
+                 (TDEFL_READ_UNALIGNED_WORD(++p) ==
+                  TDEFL_READ_UNALIGNED_WORD(++q)) &&
+                 (TDEFL_READ_UNALIGNED_WORD(++p) ==
+                  TDEFL_READ_UNALIGNED_WORD(++q)) &&
+                 (--probe_len > 0));
+        cur_match_len = ((mz_uint)(p - (const mz_uint16 *)pCur_dict) * 2) +
+                        (mz_uint)(*(const mz_uint8 *)p == *(const mz_uint8 *)q);
+        if (!probe_len)  // every round matched
+          cur_match_len = cur_match_dist ? TDEFL_MAX_MATCH_LEN : 0;
+        // Too short, or minimum length at a distance >= 8K: emit a literal.
+        if ((cur_match_len < TDEFL_MIN_MATCH_LEN) ||
+            ((cur_match_len == TDEFL_MIN_MATCH_LEN) &&
+             (cur_match_dist >= 8U * 1024U))) {
+          cur_match_len = 1;
+          *pLZ_code_buf++ = (mz_uint8)first_trigram;
+          *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+          d->m_huff_count[0][(mz_uint8)first_trigram]++;
+        } else {
+          mz_uint32 s0, s1;
+          cur_match_len = MZ_MIN(cur_match_len, lookahead_size);
+
+          MZ_ASSERT((cur_match_len >= TDEFL_MIN_MATCH_LEN) &&
+                    (cur_match_dist >= 1) &&
+                    (cur_match_dist <= TDEFL_LZ_DICT_SIZE));
+
+          cur_match_dist--;  // the record stores distance - 1
+          // Match record: biased length byte, 16-bit distance, flag bit 1.
+          pLZ_code_buf[0] = (mz_uint8)(cur_match_len - TDEFL_MIN_MATCH_LEN);
+          *(mz_uint16 *)(&pLZ_code_buf[1]) = (mz_uint16)cur_match_dist;
+          pLZ_code_buf += 3;
+          *pLZ_flags = (mz_uint8)((*pLZ_flags >> 1) | 0x80);
+          // Tally the distance symbol (table 1) and length symbol (table 0).
+          s0 = s_tdefl_small_dist_sym[cur_match_dist & 511];
+          s1 = s_tdefl_large_dist_sym[cur_match_dist >> 8];
+          d->m_huff_count[1][(cur_match_dist < 512) ? s0 : s1]++;
+
+          d->m_huff_count[0][s_tdefl_len_sym[cur_match_len -
+                                             TDEFL_MIN_MATCH_LEN]]++;
+        }
+      } else {  // no usable candidate: emit the first byte as a literal
+        *pLZ_code_buf++ = (mz_uint8)first_trigram;
+        *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+        d->m_huff_count[0][(mz_uint8)first_trigram]++;
+      }
+      // After 8 flags, reserve the next byte of the code buffer for flags.
+      if (--num_flags_left == 0) {
+        num_flags_left = 8;
+        pLZ_flags = pLZ_code_buf++;
+      }
+      // Advance every position/size counter by the bytes just coded.
+      total_lz_bytes += cur_match_len;
+      lookahead_pos += cur_match_len;
+      dict_size = MZ_MIN(dict_size + cur_match_len, TDEFL_LZ_DICT_SIZE);
+      cur_pos = (cur_pos + cur_match_len) & TDEFL_LZ_DICT_SIZE_MASK;
+      MZ_ASSERT(lookahead_size >= cur_match_len);
+      lookahead_size -= cur_match_len;
+      // LZ code buffer nearly full: store the state back and flush a block.
+      if (pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) {
+        int n;
+        d->m_lookahead_pos = lookahead_pos;
+        d->m_lookahead_size = lookahead_size;
+        d->m_dict_size = dict_size;
+        d->m_total_lz_bytes = total_lz_bytes;
+        d->m_pLZ_code_buf = pLZ_code_buf;
+        d->m_pLZ_flags = pLZ_flags;
+        d->m_num_flags_left = num_flags_left;
+        if ((n = tdefl_flush_block(d, 0)) != 0)  // nonzero: stop now; negative: failure
+          return (n < 0) ? MZ_FALSE : MZ_TRUE;
+        total_lz_bytes = d->m_total_lz_bytes;
+        pLZ_code_buf = d->m_pLZ_code_buf;
+        pLZ_flags = d->m_pLZ_flags;
+        num_flags_left = d->m_num_flags_left;
+      }
+    }
+    // Fewer than 4 bytes are left in the lookahead: emit them as literals.
+    while (lookahead_size) {
+      mz_uint8 lit = d->m_dict[cur_pos];
+
+      total_lz_bytes++;
+      *pLZ_code_buf++ = lit;
+      *pLZ_flags = (mz_uint8)(*pLZ_flags >> 1);
+      if (--num_flags_left == 0) {
+        num_flags_left = 8;
+        pLZ_flags = pLZ_code_buf++;
+      }
+
+      d->m_huff_count[0][lit]++;
+
+      lookahead_pos++;
+      dict_size = MZ_MIN(dict_size + 1, TDEFL_LZ_DICT_SIZE);
+      cur_pos = (cur_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK;
+      lookahead_size--;
+      // Same flush check as in the main pass.
+      if (pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) {
+        int n;
+        d->m_lookahead_pos = lookahead_pos;
+        d->m_lookahead_size = lookahead_size;
+        d->m_dict_size = dict_size;
+        d->m_total_lz_bytes = total_lz_bytes;
+        d->m_pLZ_code_buf = pLZ_code_buf;
+        d->m_pLZ_flags = pLZ_flags;
+        d->m_num_flags_left = num_flags_left;
+        if ((n = tdefl_flush_block(d, 0)) != 0)
+          return (n < 0) ? MZ_FALSE : MZ_TRUE;
+        total_lz_bytes = d->m_total_lz_bytes;
+        pLZ_code_buf = d->m_pLZ_code_buf;
+        pLZ_flags = d->m_pLZ_flags;
+        num_flags_left = d->m_num_flags_left;
+      }
+    }
+  }
+  // Store the cached state back into d.
+  d->m_lookahead_pos = lookahead_pos;
+  d->m_lookahead_size = lookahead_size;
+  d->m_dict_size = dict_size;
+  d->m_total_lz_bytes = total_lz_bytes;
+  d->m_pLZ_code_buf = pLZ_code_buf;
+  d->m_pLZ_flags = pLZ_flags;
+  d->m_num_flags_left = num_flags_left;
+  return MZ_TRUE;
+}
+#endif  // MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+
+// Records one literal in the LZ code buffer (flag bit 0) and tallies it.
+static MZ_FORCEINLINE void tdefl_record_literal(tdefl_compressor *d,
+                                                mz_uint8 lit) {
+  d->m_huff_count[0][lit]++;
+  d->m_total_lz_bytes += 1;
+  *d->m_pLZ_code_buf++ = lit;
+  *d->m_pLZ_flags = (mz_uint8)(*d->m_pLZ_flags >> 1);
+  if (--d->m_num_flags_left != 0) return;
+  d->m_num_flags_left = 8;  // 8 flags written: reserve the next flag byte
+  d->m_pLZ_flags = d->m_pLZ_code_buf++;
+}
+
+static MZ_FORCEINLINE void tdefl_record_match(tdefl_compressor *d,
+                                              mz_uint match_len,
+                                              mz_uint match_dist) {
+  mz_uint32 s0, s1;
+  // Appends a 3-byte match record (flag bit 1) and tallies its two symbols.
+  MZ_ASSERT((match_len >= TDEFL_MIN_MATCH_LEN) && (match_dist >= 1) &&
+            (match_dist <= TDEFL_LZ_DICT_SIZE));
+
+  d->m_total_lz_bytes += match_len;
+  // Byte 0: the length, biased by TDEFL_MIN_MATCH_LEN.
+  d->m_pLZ_code_buf[0] = (mz_uint8)(match_len - TDEFL_MIN_MATCH_LEN);
+  // Bytes 1-2: (distance - 1), low byte first.
+  match_dist -= 1;
+  d->m_pLZ_code_buf[1] = (mz_uint8)(match_dist & 0xFF);
+  d->m_pLZ_code_buf[2] = (mz_uint8)(match_dist >> 8);
+  d->m_pLZ_code_buf += 3;
+  // Shift a 1 into the flag byte; after 8 flags reserve the next flag byte.
+  *d->m_pLZ_flags = (mz_uint8)((*d->m_pLZ_flags >> 1) | 0x80);
+  if (--d->m_num_flags_left == 0) {
+    d->m_num_flags_left = 8;
+    d->m_pLZ_flags = d->m_pLZ_code_buf++;
+  }
+  // Distance symbol: small table below 512, otherwise large table by dist >> 8.
+  s0 = s_tdefl_small_dist_sym[match_dist & 511];
+  s1 = s_tdefl_large_dist_sym[(match_dist >> 8) & 127];
+  d->m_huff_count[1][(match_dist < 512) ? s0 : s1]++;
+  // Length symbol goes to table 0.
+  if (match_len >= TDEFL_MIN_MATCH_LEN)
+    d->m_huff_count[0][s_tdefl_len_sym[match_len - TDEFL_MIN_MATCH_LEN]]++;
+}
+
+static mz_bool tdefl_compress_normal(tdefl_compressor *d) {  // general parser
+  const mz_uint8 *pSrc = d->m_pSrc;
+  size_t src_buf_left = d->m_src_buf_left;
+  tdefl_flush flush = d->m_flush;
+  // Runs while input remains, or while flushing with bytes still in lookahead.
+  while ((src_buf_left) || ((flush) && (d->m_lookahead_size))) {
+    mz_uint len_to_move, cur_match_dist, cur_match_len, cur_pos;
+    // Update dictionary and hash chains. Keeps the lookahead size equal to
+    // TDEFL_MAX_MATCH_LEN.
+    if ((d->m_lookahead_size + d->m_dict_size) >= (TDEFL_MIN_MATCH_LEN - 1)) {
+      mz_uint dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) &
+                        TDEFL_LZ_DICT_SIZE_MASK,
+              ins_pos = d->m_lookahead_pos + d->m_lookahead_size - 2;
+      mz_uint hash = (d->m_dict[ins_pos & TDEFL_LZ_DICT_SIZE_MASK]
+                      << TDEFL_LZ_HASH_SHIFT) ^
+                     d->m_dict[(ins_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK];
+      mz_uint num_bytes_to_process = (mz_uint)MZ_MIN(
+          src_buf_left, TDEFL_MAX_MATCH_LEN - d->m_lookahead_size);
+      const mz_uint8 *pSrc_end = pSrc + num_bytes_to_process;
+      src_buf_left -= num_bytes_to_process;
+      d->m_lookahead_size += num_bytes_to_process;
+      while (pSrc != pSrc_end) {  // store each byte and link ins_pos into its hash chain
+        mz_uint8 c = *pSrc++;
+        d->m_dict[dst_pos] = c;
+        if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1))  // mirror the head past TDEFL_LZ_DICT_SIZE
+          d->m_dict[TDEFL_LZ_DICT_SIZE + dst_pos] = c;
+        hash = ((hash << TDEFL_LZ_HASH_SHIFT) ^ c) & (TDEFL_LZ_HASH_SIZE - 1);
+        d->m_next[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] = d->m_hash[hash];
+        d->m_hash[hash] = (mz_uint16)(ins_pos);
+        dst_pos = (dst_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK;
+        ins_pos++;
+      }
+    } else {  // too few bytes buffered so far: add input one byte at a time
+      while ((src_buf_left) && (d->m_lookahead_size < TDEFL_MAX_MATCH_LEN)) {
+        mz_uint8 c = *pSrc++;
+        mz_uint dst_pos = (d->m_lookahead_pos + d->m_lookahead_size) &
+                          TDEFL_LZ_DICT_SIZE_MASK;
+        src_buf_left--;
+        d->m_dict[dst_pos] = c;
+        if (dst_pos < (TDEFL_MAX_MATCH_LEN - 1))
+          d->m_dict[TDEFL_LZ_DICT_SIZE + dst_pos] = c;
+        if ((++d->m_lookahead_size + d->m_dict_size) >= TDEFL_MIN_MATCH_LEN) {  // hash the last 3 bytes
+          mz_uint ins_pos = d->m_lookahead_pos + (d->m_lookahead_size - 1) - 2;
+          mz_uint hash = ((d->m_dict[ins_pos & TDEFL_LZ_DICT_SIZE_MASK]
+                           << (TDEFL_LZ_HASH_SHIFT * 2)) ^
+                          (d->m_dict[(ins_pos + 1) & TDEFL_LZ_DICT_SIZE_MASK]
+                           << TDEFL_LZ_HASH_SHIFT) ^
+                          c) &
+                         (TDEFL_LZ_HASH_SIZE - 1);
+          d->m_next[ins_pos & TDEFL_LZ_DICT_SIZE_MASK] = d->m_hash[hash];
+          d->m_hash[hash] = (mz_uint16)(ins_pos);
+        }
+      }
+    }
+    d->m_dict_size =
+        MZ_MIN(TDEFL_LZ_DICT_SIZE - d->m_lookahead_size, d->m_dict_size);
+    if ((!flush) && (d->m_lookahead_size < TDEFL_MAX_MATCH_LEN)) break;  // wait for more input
+
+    // Simple lazy/greedy parsing state machine.
+    len_to_move = 1;
+    cur_match_dist = 0;
+    cur_match_len =
+        d->m_saved_match_len ? d->m_saved_match_len : (TDEFL_MIN_MATCH_LEN - 1);
+    cur_pos = d->m_lookahead_pos & TDEFL_LZ_DICT_SIZE_MASK;
+    if (d->m_flags & (TDEFL_RLE_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS)) {
+      if ((d->m_dict_size) && (!(d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS))) {
+        mz_uint8 c = d->m_dict[(cur_pos - 1) & TDEFL_LZ_DICT_SIZE_MASK];
+        cur_match_len = 0;  // RLE mode: count repeats of the previous byte c
+        while (cur_match_len < d->m_lookahead_size) {
+          if (d->m_dict[cur_pos + cur_match_len] != c) break;
+          cur_match_len++;
+        }
+        if (cur_match_len < TDEFL_MIN_MATCH_LEN)
+          cur_match_len = 0;
+        else
+          cur_match_dist = 1;
+      }
+    } else {
+      tdefl_find_match(d, d->m_lookahead_pos, d->m_dict_size,
+                       d->m_lookahead_size, &cur_match_dist, &cur_match_len);
+    }
+    if (((cur_match_len == TDEFL_MIN_MATCH_LEN) &&
+         (cur_match_dist >= 8U * 1024U)) ||
+        (cur_pos == cur_match_dist) ||
+        ((d->m_flags & TDEFL_FILTER_MATCHES) && (cur_match_len <= 5))) {
+      cur_match_dist = cur_match_len = 0;  // discard the match
+    }
+    if (d->m_saved_match_len) {  // a match was deferred at the previous position
+      if (cur_match_len > d->m_saved_match_len) {  // the new match is longer
+        tdefl_record_literal(d, (mz_uint8)d->m_saved_lit);
+        if (cur_match_len >= 128) {
+          tdefl_record_match(d, cur_match_len, cur_match_dist);
+          d->m_saved_match_len = 0;
+          len_to_move = cur_match_len;
+        } else {
+          d->m_saved_lit = d->m_dict[cur_pos];
+          d->m_saved_match_dist = cur_match_dist;
+          d->m_saved_match_len = cur_match_len;
+        }
+      } else {  // the deferred match is kept and emitted
+        tdefl_record_match(d, d->m_saved_match_len, d->m_saved_match_dist);
+        len_to_move = d->m_saved_match_len - 1;
+        d->m_saved_match_len = 0;
+      }
+    } else if (!cur_match_dist)  // no match here: emit a literal
+      tdefl_record_literal(d,
+                           d->m_dict[MZ_MIN(cur_pos, sizeof(d->m_dict) - 1)]);
+    else if ((d->m_greedy_parsing) || (d->m_flags & TDEFL_RLE_MATCHES) ||
+             (cur_match_len >= 128)) {
+      tdefl_record_match(d, cur_match_len, cur_match_dist);
+      len_to_move = cur_match_len;
+    } else {  // defer the match; the next position may yield a longer one
+      d->m_saved_lit = d->m_dict[MZ_MIN(cur_pos, sizeof(d->m_dict) - 1)];
+      d->m_saved_match_dist = cur_match_dist;
+      d->m_saved_match_len = cur_match_len;
+    }
+    // Move the lookahead forward by len_to_move bytes.
+    d->m_lookahead_pos += len_to_move;
+    MZ_ASSERT(d->m_lookahead_size >= len_to_move);
+    d->m_lookahead_size -= len_to_move;
+    d->m_dict_size =
+        MZ_MIN(d->m_dict_size + len_to_move, (mz_uint)TDEFL_LZ_DICT_SIZE);
+    // Check if it's time to flush the current LZ codes to the internal output
+    // buffer.
+    if ((d->m_pLZ_code_buf > &d->m_lz_code_buf[TDEFL_LZ_CODE_BUF_SIZE - 8]) ||
+        ((d->m_total_lz_bytes > 31 * 1024) &&
+         (((((mz_uint)(d->m_pLZ_code_buf - d->m_lz_code_buf) * 115) >> 7) >=
+           d->m_total_lz_bytes) ||
+          (d->m_flags & TDEFL_FORCE_ALL_RAW_BLOCKS)))) {
+      int n;
+      d->m_pSrc = pSrc;
+      d->m_src_buf_left = src_buf_left;
+      if ((n = tdefl_flush_block(d, 0)) != 0)  // nonzero: stop now; negative: failure
+        return (n < 0) ? MZ_FALSE : MZ_TRUE;
+    }
+  }
+
+  d->m_pSrc = pSrc;
+  d->m_src_buf_left = src_buf_left;
+  return MZ_TRUE;
+}
+
+// Reports consumed input through *m_pIn_buf_size and, when an output size
+// pointer was given, copies as much pending m_output_buf data as fits.
+// Returns TDEFL_STATUS_DONE once finished with nothing pending, else OKAY.
+static tdefl_status tdefl_flush_output_buffer(tdefl_compressor *d) {
+  if (d->m_pIn_buf_size)
+    *d->m_pIn_buf_size = d->m_pSrc - (const mz_uint8 *)d->m_pIn_buf;
+  if (d->m_pOut_buf_size) {
+    size_t room = *d->m_pOut_buf_size - d->m_out_buf_ofs;
+    size_t n = d->m_output_flush_remaining;
+    if (room < n) n = room;
+    memcpy((mz_uint8 *)d->m_pOut_buf + d->m_out_buf_ofs,
+           d->m_output_buf + d->m_output_flush_ofs, n);
+    d->m_out_buf_ofs += n;
+    d->m_output_flush_ofs += (mz_uint)n;
+    d->m_output_flush_remaining -= (mz_uint)n;
+    *d->m_pOut_buf_size = d->m_out_buf_ofs;
+  }
+  if (d->m_finished && !d->m_output_flush_remaining) return TDEFL_STATUS_DONE;
+  return TDEFL_STATUS_OKAY;
+}
+
+tdefl_status tdefl_compress(tdefl_compressor *d, const void *pIn_buf,
+                            size_t *pIn_buf_size, void *pOut_buf,
+                            size_t *pOut_buf_size, tdefl_flush flush) {
+  if (!d) {  // no compressor: report nothing consumed and nothing produced
+    if (pIn_buf_size) *pIn_buf_size = 0;
+    if (pOut_buf_size) *pOut_buf_size = 0;
+    return TDEFL_STATUS_BAD_PARAM;
+  }
+  // Latch this call's buffers in d; a NULL pIn_buf_size means no input bytes.
+  d->m_pIn_buf = pIn_buf;
+  d->m_pIn_buf_size = pIn_buf_size;
+  d->m_pOut_buf = pOut_buf;
+  d->m_pOut_buf_size = pOut_buf_size;
+  d->m_pSrc = (const mz_uint8 *)(pIn_buf);
+  d->m_src_buf_left = pIn_buf_size ? *pIn_buf_size : 0;
+  d->m_out_buf_ofs = 0;
+  d->m_flush = flush;
+  // Reject bad callback/buffer combinations and use after a non-OKAY status.
+  if (((d->m_pPut_buf_func != NULL) ==
+       ((pOut_buf != NULL) || (pOut_buf_size != NULL))) ||
+      (d->m_prev_return_status != TDEFL_STATUS_OKAY) ||
+      (d->m_wants_to_finish && (flush != TDEFL_FINISH)) ||
+      (pIn_buf_size && *pIn_buf_size && !pIn_buf) ||
+      (pOut_buf_size && *pOut_buf_size && !pOut_buf)) {
+    if (pIn_buf_size) *pIn_buf_size = 0;
+    if (pOut_buf_size) *pOut_buf_size = 0;
+    return (d->m_prev_return_status = TDEFL_STATUS_BAD_PARAM);
+  }
+  d->m_wants_to_finish |= (flush == TDEFL_FINISH);
+  // Pending output (or a finished stream) is drained before any new input.
+  if ((d->m_output_flush_remaining) || (d->m_finished))
+    return (d->m_prev_return_status = tdefl_flush_output_buffer(d));
+  // Fast path: exactly one probe, greedy parsing, no filter/raw/RLE flags.
+#if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+  if (((d->m_flags & TDEFL_MAX_PROBES_MASK) == 1) &&
+      ((d->m_flags & TDEFL_GREEDY_PARSING_FLAG) != 0) &&
+      ((d->m_flags & (TDEFL_FILTER_MATCHES | TDEFL_FORCE_ALL_RAW_BLOCKS |
+                      TDEFL_RLE_MATCHES)) == 0)) {
+    if (!tdefl_compress_fast(d)) return d->m_prev_return_status;
+  } else
+#endif  // #if MINIZ_USE_UNALIGNED_LOADS_AND_STORES && MINIZ_LITTLE_ENDIAN
+  {
+    if (!tdefl_compress_normal(d)) return d->m_prev_return_status;
+  }
+  // Fold the input consumed by this call into the running Adler-32.
+  if ((d->m_flags & (TDEFL_WRITE_ZLIB_HEADER | TDEFL_COMPUTE_ADLER32)) &&
+      (pIn_buf))
+    d->m_adler32 =
+        (mz_uint32)mz_adler32(d->m_adler32, (const mz_uint8 *)pIn_buf,
+                              d->m_pSrc - (const mz_uint8 *)pIn_buf);
+  // With all input coded and no output pending, carry out the flush request.
+  if ((flush) && (!d->m_lookahead_size) && (!d->m_src_buf_left) &&
+      (!d->m_output_flush_remaining)) {
+    if (tdefl_flush_block(d, flush) < 0) return d->m_prev_return_status;
+    d->m_finished = (flush == TDEFL_FINISH);
+    if (flush == TDEFL_FULL_FLUSH) {  // forget all match history
+      MZ_CLEAR_OBJ(d->m_hash);
+      MZ_CLEAR_OBJ(d->m_next);
+      d->m_dict_size = 0;
+    }
+  }
+
+  return (d->m_prev_return_status = tdefl_flush_output_buffer(d));
+}
+
+tdefl_status tdefl_compress_buffer(tdefl_compressor *d, const void *pIn_buf,
+                                   size_t in_buf_size, tdefl_flush flush) {
+  MZ_ASSERT(d->m_pPut_buf_func);  // callback mode only: no output buffer is passed
+  return tdefl_compress(d, pIn_buf, &in_buf_size, NULL, NULL, flush);
+}
+
+// Prepares d for a new stream: stores the output callback and flags, derives
+// both probe counts from the low 12 flag bits. Returns TDEFL_STATUS_OKAY.
+tdefl_status tdefl_init(tdefl_compressor *d,
+                        tdefl_put_buf_func_ptr pPut_buf_func,
+                        void *pPut_buf_user, int flags) {
+  const int probe_bits = flags & 0xFFF;
+  d->m_pPut_buf_func = pPut_buf_func;
+  d->m_pPut_buf_user = pPut_buf_user;
+  d->m_flags = (mz_uint)(flags);
+  d->m_greedy_parsing = (flags & TDEFL_GREEDY_PARSING_FLAG) != 0;
+  d->m_max_probes[0] = 1 + (probe_bits + 2) / 3;
+  d->m_max_probes[1] = 1 + ((probe_bits >> 2) + 2) / 3;
+  if (!(flags & TDEFL_NONDETERMINISTIC_PARSING_FLAG)) MZ_CLEAR_OBJ(d->m_hash);
+  // Per-call buffer bookkeeping starts out empty; the rest is zeroed state.
+  d->m_pIn_buf = d->m_pOut_buf = NULL;
+  d->m_pIn_buf_size = d->m_pOut_buf_size = NULL;
+  d->m_pSrc = NULL;
+  d->m_src_buf_left = d->m_out_buf_ofs = 0;
+  d->m_flush = TDEFL_NO_FLUSH;
+  d->m_prev_return_status = TDEFL_STATUS_OKAY;
+  d->m_lookahead_pos = d->m_lookahead_size = d->m_dict_size = 0;
+  d->m_total_lz_bytes = d->m_lz_code_buf_dict_pos = d->m_bits_in = 0;
+  d->m_output_flush_ofs = d->m_output_flush_remaining = d->m_finished = 0;
+  d->m_block_index = d->m_bit_buffer = d->m_wants_to_finish = 0;
+  d->m_saved_match_dist = d->m_saved_match_len = d->m_saved_lit = 0;
+  d->m_adler32 = 1;
+  d->m_num_flags_left = 8;
+  d->m_pLZ_flags = d->m_lz_code_buf;
+  d->m_pLZ_code_buf = d->m_lz_code_buf + 1;
+  d->m_pOutput_buf = d->m_pOutput_buf_end = d->m_output_buf;
+  memset(&d->m_huff_count[0][0], 0,
+         sizeof(d->m_huff_count[0][0]) * TDEFL_MAX_HUFF_SYMBOLS_0);
+  memset(&d->m_huff_count[1][0], 0,
+         sizeof(d->m_huff_count[1][0]) * TDEFL_MAX_HUFF_SYMBOLS_1);
+  return TDEFL_STATUS_OKAY;
+}
+
+tdefl_status tdefl_get_prev_return_status(tdefl_compressor *d) {
+  return d->m_prev_return_status;  // as stored by tdefl_init()/tdefl_compress()
+}
+
+mz_uint32 tdefl_get_adler32(tdefl_compressor *d) { return d->m_adler32; }  // running checksum; tdefl_init() seeds it with 1
+
+// One-shot: compress pBuf[0..buf_len) through pPut_buf_func with TDEFL_FINISH.
+mz_bool tdefl_compress_mem_to_output(const void *pBuf, size_t buf_len,
+                                     tdefl_put_buf_func_ptr pPut_buf_func,
+                                     void *pPut_buf_user, int flags) {
+  tdefl_compressor *pState;
+  mz_bool ok = MZ_FALSE;
+  if ((!pPut_buf_func) || ((buf_len) && (!pBuf))) return MZ_FALSE;
+  pState = (tdefl_compressor *)MZ_MALLOC(sizeof(tdefl_compressor));
+  if (!pState) return MZ_FALSE;
+  if (tdefl_init(pState, pPut_buf_func, pPut_buf_user, flags) ==
+      TDEFL_STATUS_OKAY)
+    ok = (tdefl_compress_buffer(pState, pBuf, buf_len, TDEFL_FINISH) ==
+          TDEFL_STATUS_DONE);
+  MZ_FREE(pState);
+  return ok;  // MZ_TRUE only if init was OKAY and compression reported DONE
+}
+
+typedef struct {
+  size_t m_size, m_capacity;  // bytes written so far / bytes available in m_pBuf
+  mz_uint8 *m_pBuf;           // destination storage
+  mz_bool m_expandable;       // when set, the putter may MZ_REALLOC m_pBuf to grow
+} tdefl_output_buffer;
+
+static mz_bool tdefl_output_buffer_putter(const void *pBuf, int len,
+                                          void *pUser) {  // pUser: tdefl_output_buffer *
+  tdefl_output_buffer *p = (tdefl_output_buffer *)pUser;
+  size_t new_size = p->m_size + len;
+  if (new_size > p->m_capacity) {
+    size_t new_capacity = p->m_capacity;
+    mz_uint8 *pNew_buf;
+    if (!p->m_expandable) return MZ_FALSE;  // fixed-size buffer cannot hold the data
+    do {  // double (with a 128-byte minimum) until the appended data fits
+      new_capacity = MZ_MAX(128U, new_capacity << 1U);
+    } while (new_size > new_capacity);
+    pNew_buf = (mz_uint8 *)MZ_REALLOC(p->m_pBuf, new_capacity);
+    if (!pNew_buf) return MZ_FALSE;  // m_pBuf and m_capacity are left unchanged
+    p->m_pBuf = pNew_buf;
+    p->m_capacity = new_capacity;
+  }
+  memcpy((mz_uint8 *)p->m_pBuf + p->m_size, pBuf, len);  // append after existing data
+  p->m_size = new_size;
+  return MZ_TRUE;
+}
+
+void *tdefl_compress_mem_to_heap(const void *pSrc_buf, size_t src_buf_len,
+                                 size_t *pOut_len, int flags) {
+  tdefl_output_buffer out_buf;
+  if (!pOut_len) return NULL;  // no place to report the compressed length
+  *pOut_len = 0;               // stays 0 on every failure path below
+  MZ_CLEAR_OBJ(out_buf);
+  out_buf.m_expandable = MZ_TRUE;  // let the putter grow the buffer on demand
+  if (!tdefl_compress_mem_to_output(
+          pSrc_buf, src_buf_len, tdefl_output_buffer_putter, &out_buf, flags)) {
+    MZ_FREE(out_buf.m_pBuf);  // release partial output instead of leaking it
+    return NULL;
+  }
+  *pOut_len = out_buf.m_size;
+  return out_buf.m_pBuf;  // heap buffer; the caller releases it with MZ_FREE
+}
+
+// Compresses into caller storage; returns bytes written, or 0 on any failure.
+size_t tdefl_compress_mem_to_mem(void *pOut_buf, size_t out_buf_len,
+                                 const void *pSrc_buf, size_t src_buf_len,
+                                 int flags) {
+  tdefl_output_buffer fixed_buf;
+  if (!pOut_buf) return 0;
+  MZ_CLEAR_OBJ(fixed_buf);  // m_expandable stays 0, so the putter never grows it
+  fixed_buf.m_pBuf = (mz_uint8 *)pOut_buf;
+  fixed_buf.m_capacity = out_buf_len;
+  return tdefl_compress_mem_to_output(pSrc_buf, src_buf_len,
+                                      tdefl_output_buffer_putter, &fixed_buf,
+                                      flags) ? fixed_buf.m_size : 0;
+}
+
+#ifndef MINIZ_NO_ZLIB_APIS
+static const mz_uint s_tdefl_num_probes[11] = {0,   1,   6,   32,  16,  32,
+                                               128, 256, 512, 768, 1500};
+
+// Translates zlib-style parameters into tdefl compression flags.
+// level may actually range over [0,10] (10 is a "hidden" max level, where we
+// want a bit more compression and it's fine if throughput falls off a cliff
+// on some files); negative levels use the MZ_DEFAULT_LEVEL probe count.
+// A positive window_bits requests a zlib header.
+mz_uint tdefl_create_comp_flags_from_zip_params(int level, int window_bits,
+                                                int strategy) {
+  const int probe_level = (level >= 0) ? MZ_MIN(10, level) : MZ_DEFAULT_LEVEL;
+  mz_uint comp_flags = s_tdefl_num_probes[probe_level];
+  // Note: negative (default) levels also satisfy level <= 3 and get greedy.
+  if (level <= 3) comp_flags |= TDEFL_GREEDY_PARSING_FLAG;
+  if (window_bits > 0) comp_flags |= TDEFL_WRITE_ZLIB_HEADER;
+  // Level 0 forces raw blocks and takes precedence over any strategy.
+  if (!level) return comp_flags | TDEFL_FORCE_ALL_RAW_BLOCKS;
+  switch (strategy) {
+    case MZ_FILTERED: comp_flags |= TDEFL_FILTER_MATCHES; break;
+    case MZ_HUFFMAN_ONLY: comp_flags &= ~TDEFL_MAX_PROBES_MASK; break;
+    case MZ_FIXED: comp_flags |= TDEFL_FORCE_ALL_STATIC_BLOCKS; break;
+    case MZ_RLE: comp_flags |= TDEFL_RLE_MATCHES; break;
+    default: break;
+  }
+  return comp_flags;
+}
+#endif  // MINIZ_NO_ZLIB_APIS
+
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4204)  // nonstandard extension used : non-constant
+                                 // aggregate initializer (also supported by GNU
+                                 // C and C99, so no big deal)
+#pragma warning(disable : 4244)  // 'initializing': conversion from '__int64' to
+                                 // 'int', possible loss of data
+#pragma warning( \
+    disable : 4267)  // 'argument': conversion from '__int64' to 'int',
+                     // possible loss of data
+#pragma warning(disable : 4996)  // 'strdup': The POSIX name for this item is
+                                 // deprecated. Instead, use the ISO C and C++
+                                 // conformant name: _strdup.
+#endif
+
+// Simple PNG writer function by Alex Evans, 2011. Released into the public
+// domain: https://gist.github.com/908299, more context at
+// http://altdevblogaday.org/2011/04/06/a-smaller-jpg-encoder/.
+// This is actually a modification of Alex's original code so PNG files
+// generated by this function pass pngcheck.
+//
+// Encodes an image of h rows, each w * num_chans bytes read from pImage
+// (8 bits per channel), as a complete PNG file held in a heap buffer.
+//   flip      - when set, rows are read starting from the last one.
+//   level     - clamped to 10; selects the compressor's probe count.
+//   num_chans - must be within 0..4 (the size of the colour type table).
+//   pLen_out  - receives the file size; must not be NULL.
+// Returns the buffer (release it with MZ_FREE), or NULL with *pLen_out == 0
+// on failure.
+void *tdefl_write_image_to_png_file_in_memory_ex(const void *pImage, int w,
+                                                 int h, int num_chans,
+                                                 size_t *pLen_out,
+                                                 mz_uint level, mz_bool flip) {
+  // Using a local copy of this array here in case MINIZ_NO_ZLIB_APIS was
+  // defined.
+  static const mz_uint s_tdefl_png_num_probes[11] = {
+      0, 1, 6, 32, 16, 32, 128, 256, 512, 768, 1500};
+  // IHDR colour type byte, indexed by channel count.
+  static const mz_uint8 chans[] = {0x00, 0x00, 0x04, 0x02, 0x06};
+  tdefl_compressor *pComp;
+  tdefl_output_buffer out_buf;
+  int i, bpl = w * num_chans, y, z;
+  mz_uint32 c;
+  *pLen_out = 0;
+  // chans[] has exactly five entries; reject counts that would index past it.
+  if ((num_chans < 0) || (num_chans > 4)) return NULL;
+  pComp = (tdefl_compressor *)MZ_MALLOC(sizeof(tdefl_compressor));
+  if (!pComp) return NULL;
+  MZ_CLEAR_OBJ(out_buf);
+  out_buf.m_expandable = MZ_TRUE;
+  // 57 = 41-byte header + 16-byte footer written around the compressed data.
+  out_buf.m_capacity = 57 + MZ_MAX(64, (1 + bpl) * h);
+  if (NULL == (out_buf.m_pBuf = (mz_uint8 *)MZ_MALLOC(out_buf.m_capacity))) {
+    MZ_FREE(pComp);
+    return NULL;
+  }
+
+  // write dummy header
+  for (z = 41; z; --z) tdefl_output_buffer_putter(&z, 1, &out_buf);
+
+  // compress image data
+  tdefl_init(
+      pComp, tdefl_output_buffer_putter, &out_buf,
+      s_tdefl_png_num_probes[MZ_MIN(10, level)] | TDEFL_WRITE_ZLIB_HEADER);
+  for (y = 0; y < h; ++y) {
+    // z is 0 after the loop above, so each row is preceded by a zero byte.
+    // Per-row results are not checked here: after a failure tdefl_compress()
+    // keeps returning an error, so the TDEFL_FINISH call below catches it.
+    tdefl_compress_buffer(pComp, &z, 1, TDEFL_NO_FLUSH);
+    tdefl_compress_buffer(
+        pComp, (const mz_uint8 *)pImage + (flip ? (h - 1 - y) : y) * bpl, bpl,
+        TDEFL_NO_FLUSH);
+  }
+  if (tdefl_compress_buffer(pComp, NULL, 0, TDEFL_FINISH) !=
+      TDEFL_STATUS_DONE) {
+    MZ_FREE(pComp);
+    MZ_FREE(out_buf.m_pBuf);
+    return NULL;
+  }
+
+  // write real header
+  *pLen_out = out_buf.m_size - 41;
+  {
+    mz_uint8 pnghdr[41] = {
+        // PNG signature
+        0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a,
+        // IHDR chunk: 13-byte data length, then the chunk type
+        0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52,
+        // width, big-endian (all four bytes, so widths above 65535 survive)
+        (mz_uint8)(w >> 24), (mz_uint8)(w >> 16),
+        (mz_uint8)(w >> 8), (mz_uint8)w,
+        // height, big-endian
+        (mz_uint8)(h >> 24), (mz_uint8)(h >> 16),
+        (mz_uint8)(h >> 8), (mz_uint8)h,
+        // bit depth, colour type, then three zero bytes
+        8, chans[num_chans], 0, 0, 0,
+        // IHDR CRC-32 placeholder, patched in below
+        0, 0, 0, 0,
+        // IDAT chunk: compressed data length, then the chunk type
+        (mz_uint8)(*pLen_out >> 24), (mz_uint8)(*pLen_out >> 16),
+        (mz_uint8)(*pLen_out >> 8), (mz_uint8)*pLen_out,
+        0x49, 0x44, 0x41, 0x54};
+    // The CRC covers the 4-byte chunk type plus the 13 data bytes.
+    c = (mz_uint32)mz_crc32(MZ_CRC32_INIT, pnghdr + 12, 17);
+    for (i = 0; i < 4; ++i, c <<= 8)
+      ((mz_uint8 *)(pnghdr + 29))[i] = (mz_uint8)(c >> 24);
+    memcpy(out_buf.m_pBuf, pnghdr, 41);
+  }
+
+  // write footer (IDAT CRC-32, followed by IEND chunk)
+  if (!tdefl_output_buffer_putter(
+          "\0\0\0\0\0\0\0\0\x49\x45\x4e\x44\xae\x42\x60\x82", 16, &out_buf)) {
+    *pLen_out = 0;
+    MZ_FREE(pComp);
+    MZ_FREE(out_buf.m_pBuf);
+    return NULL;
+  }
+  c = (mz_uint32)mz_crc32(MZ_CRC32_INIT, out_buf.m_pBuf + 41 - 4,
+                          *pLen_out + 4);
+  for (i = 0; i < 4; ++i, c <<= 8)
+    (out_buf.m_pBuf + out_buf.m_size - 16)[i] = (mz_uint8)(c >> 24);
+
+  // compute final size of file, grab compressed data buffer and return
+  *pLen_out += 57;
+  MZ_FREE(pComp);
+  return out_buf.m_pBuf;
+}
+void *tdefl_write_image_to_png_file_in_memory(const void *pImage, int w, int h,
+                                              int num_chans, size_t *pLen_out) {
+  // Level 6 corresponds to TDEFL_DEFAULT_MAX_PROBES or MZ_DEFAULT_LEVEL (but we
+  // can't depend on MZ_DEFAULT_LEVEL being available in case the zlib APIs
+  // were #defined out). Rows are written in order (flip is MZ_FALSE).
+  return tdefl_write_image_to_png_file_in_memory_ex(pImage, w, h, num_chans,
+                                                    pLen_out, 6, MZ_FALSE);
+}
+
+// ------------------- .ZIP archive reading
+
+#ifndef MINIZ_NO_ARCHIVE_APIS
+
+#ifdef MINIZ_NO_STDIO
+#define MZ_FILE void *
+#else
+#include <stdio.h>
+#include <sys/stat.h>
+
+#if defined(_MSC_VER) || defined(__MINGW64__)
+static FILE *mz_fopen(const char *pFilename, const char *pMode) {
+  FILE *pFile = NULL;  // returned unchanged if fopen_s() does not store a stream
+  fopen_s(&pFile, pFilename, pMode);  // error code ignored; callers test for NULL
+  return pFile;
+}
+static FILE *mz_freopen(const char *pPath, const char *pMode, FILE *pStream) {
+  FILE *pFile = NULL;
+  if (freopen_s(&pFile, pPath, pMode, pStream)) return NULL;  // nonzero = failure
+  return pFile;
+}
+#ifndef MINIZ_NO_TIME
+#include <sys/utime.h>
+#endif
+#define MZ_FILE FILE
+#define MZ_FOPEN mz_fopen
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 _ftelli64
+#define MZ_FSEEK64 _fseeki64
+#define MZ_FILE_STAT_STRUCT _stat
+#define MZ_FILE_STAT _stat
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN mz_freopen
+#define MZ_DELETE_FILE remove
+#elif defined(__MINGW32__)
+#ifndef MINIZ_NO_TIME
+#include <sys/utime.h>
+#endif
+#define MZ_FILE FILE
+#define MZ_FOPEN(f, m) fopen(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 ftello64
+#define MZ_FSEEK64 fseeko64
+#define MZ_FILE_STAT_STRUCT _stat
+#define MZ_FILE_STAT _stat
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(f, m, s) freopen(f, m, s)
+#define MZ_DELETE_FILE remove
+#elif defined(__TINYC__)
+#ifndef MINIZ_NO_TIME
+#include <sys/utime.h>
+#endif
+#define MZ_FILE FILE
+#define MZ_FOPEN(f, m) fopen(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 ftell
+#define MZ_FSEEK64 fseek
+#define MZ_FILE_STAT_STRUCT stat
+#define MZ_FILE_STAT stat
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(f, m, s) freopen(f, m, s)
+#define MZ_DELETE_FILE remove
+#elif defined(__GNUC__) && defined(_LARGEFILE64_SOURCE) && _LARGEFILE64_SOURCE
+#ifndef MINIZ_NO_TIME
+#include <utime.h>
+#endif
+#define MZ_FILE FILE
+#define MZ_FOPEN(f, m) fopen64(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 ftello64
+#define MZ_FSEEK64 fseeko64
+#define MZ_FILE_STAT_STRUCT stat64
+#define MZ_FILE_STAT stat64
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(p, m, s) freopen64(p, m, s)
+#define MZ_DELETE_FILE remove
+#else
+#ifndef MINIZ_NO_TIME
+#include <utime.h>
+#endif
+#define MZ_FILE FILE
+#define MZ_FOPEN(f, m) fopen(f, m)
+#define MZ_FCLOSE fclose
+#define MZ_FREAD fread
+#define MZ_FWRITE fwrite
+#define MZ_FTELL64 ftello
+#define MZ_FSEEK64 fseeko
+#define MZ_FILE_STAT_STRUCT stat
+#define MZ_FILE_STAT stat
+#define MZ_FFLUSH fflush
+#define MZ_FREOPEN(f, m, s) freopen(f, m, s)
+#define MZ_DELETE_FILE remove
+#endif  // #ifdef _MSC_VER
+#endif  // #ifdef MINIZ_NO_STDIO
+
+#define MZ_TOLOWER(c) ((((c) >= 'A') && ((c) <= 'Z')) ? ((c) - 'A' + 'a') : (c))  // 'A'..'Z' only; evaluates c more than once
+
+// Various ZIP archive enums. To completely avoid cross platform compiler
+// alignment and platform endian issues, miniz.c doesn't use structs for any of
+// this stuff.
+enum {
+  // ZIP archive identifiers and record sizes
+  MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIG = 0x06054b50,
+  MZ_ZIP_CENTRAL_DIR_HEADER_SIG = 0x02014b50,
+  MZ_ZIP_LOCAL_DIR_HEADER_SIG = 0x04034b50,
+  MZ_ZIP_LOCAL_DIR_HEADER_SIZE = 30,
+  MZ_ZIP_CENTRAL_DIR_HEADER_SIZE = 46,  // fixed part; name/extra/comment follow
+  MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE = 22,
+  // Central directory header record offsets (bytes from the record start)
+  MZ_ZIP_CDH_SIG_OFS = 0,
+  MZ_ZIP_CDH_VERSION_MADE_BY_OFS = 4,
+  MZ_ZIP_CDH_VERSION_NEEDED_OFS = 6,
+  MZ_ZIP_CDH_BIT_FLAG_OFS = 8,
+  MZ_ZIP_CDH_METHOD_OFS = 10,
+  MZ_ZIP_CDH_FILE_TIME_OFS = 12,
+  MZ_ZIP_CDH_FILE_DATE_OFS = 14,
+  MZ_ZIP_CDH_CRC32_OFS = 16,
+  MZ_ZIP_CDH_COMPRESSED_SIZE_OFS = 20,
+  MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS = 24,
+  MZ_ZIP_CDH_FILENAME_LEN_OFS = 28,
+  MZ_ZIP_CDH_EXTRA_LEN_OFS = 30,
+  MZ_ZIP_CDH_COMMENT_LEN_OFS = 32,
+  MZ_ZIP_CDH_DISK_START_OFS = 34,
+  MZ_ZIP_CDH_INTERNAL_ATTR_OFS = 36,
+  MZ_ZIP_CDH_EXTERNAL_ATTR_OFS = 38,
+  MZ_ZIP_CDH_LOCAL_HEADER_OFS = 42,
+  // Local directory header offsets (bytes from the record start)
+  MZ_ZIP_LDH_SIG_OFS = 0,
+  MZ_ZIP_LDH_VERSION_NEEDED_OFS = 4,
+  MZ_ZIP_LDH_BIT_FLAG_OFS = 6,
+  MZ_ZIP_LDH_METHOD_OFS = 8,
+  MZ_ZIP_LDH_FILE_TIME_OFS = 10,
+  MZ_ZIP_LDH_FILE_DATE_OFS = 12,
+  MZ_ZIP_LDH_CRC32_OFS = 14,
+  MZ_ZIP_LDH_COMPRESSED_SIZE_OFS = 18,
+  MZ_ZIP_LDH_DECOMPRESSED_SIZE_OFS = 22,
+  MZ_ZIP_LDH_FILENAME_LEN_OFS = 26,
+  MZ_ZIP_LDH_EXTRA_LEN_OFS = 28,
+  // End of central directory offsets (bytes from the record start)
+  MZ_ZIP_ECDH_SIG_OFS = 0,
+  MZ_ZIP_ECDH_NUM_THIS_DISK_OFS = 4,
+  MZ_ZIP_ECDH_NUM_DISK_CDIR_OFS = 6,
+  MZ_ZIP_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS = 8,
+  MZ_ZIP_ECDH_CDIR_TOTAL_ENTRIES_OFS = 10,
+  MZ_ZIP_ECDH_CDIR_SIZE_OFS = 12,
+  MZ_ZIP_ECDH_CDIR_OFS_OFS = 16,
+  MZ_ZIP_ECDH_COMMENT_SIZE_OFS = 20,
+};
+
+typedef struct {
+  void *m_p;                  // element storage, (re)allocated via pZip->m_pRealloc
+  size_t m_size, m_capacity;  // both counted in elements, not bytes
+  mz_uint m_element_size;     // bytes per element; must be set before growing
+} mz_zip_array;
+
+struct mz_zip_internal_state_tag {
+  mz_zip_array m_central_dir;  // raw central directory bytes (mz_uint8 elements)
+  mz_zip_array m_central_dir_offsets;  // per-file byte offset into m_central_dir
+  mz_zip_array m_sorted_central_dir_offsets;  // file indices ordered by filename
+  MZ_FILE *m_pFile;
+  void *m_pMem;
+  size_t m_mem_size;
+  size_t m_mem_capacity;
+};
+
+#define MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(array_ptr, element_size) \
+  (array_ptr)->m_element_size = element_size  // bytes per element of the array
+#define MZ_ZIP_ARRAY_ELEMENT(array_ptr, element_type, index) \
+  ((element_type *)((array_ptr)->m_p))[index]  // lvalue; no bounds checking
+
+static MZ_FORCEINLINE void mz_zip_array_clear(mz_zip_archive *pZip,
+                                              mz_zip_array *pArray) {
+  pZip->m_pFree(pZip->m_pAlloc_opaque, pArray->m_p);  // release the storage
+  memset(pArray, 0, sizeof(mz_zip_array));  // also zeroes m_element_size
+}
+
+// Ensures capacity >= min_new_capacity elements; doubles when growing is set.
+static mz_bool mz_zip_array_ensure_capacity(mz_zip_archive *pZip,
+                                            mz_zip_array *pArray,
+                                            size_t min_new_capacity,
+                                            mz_uint growing) {
+  size_t target = min_new_capacity;
+  void *pGrown;
+  MZ_ASSERT(pArray->m_element_size);
+  if (min_new_capacity <= pArray->m_capacity) return MZ_TRUE;
+  if (growing)
+    for (target = MZ_MAX(1, pArray->m_capacity); target < min_new_capacity;)
+      target *= 2;
+  pGrown = pZip->m_pRealloc(pZip->m_pAlloc_opaque, pArray->m_p,
+                            pArray->m_element_size, target);
+  if (!pGrown) return MZ_FALSE;
+  pArray->m_p = pGrown;
+  pArray->m_capacity = target;
+  return MZ_TRUE;
+}
+
+// Makes room for at least new_capacity elements without changing m_size.
+// Returns MZ_TRUE when the capacity already suffices or was grown, and
+// MZ_FALSE when the reallocation failed.
+static MZ_FORCEINLINE mz_bool mz_zip_array_reserve(mz_zip_archive *pZip,
+                                                   mz_zip_array *pArray,
+                                                   size_t new_capacity,
+                                                   mz_uint growing) {
+  if (new_capacity <= pArray->m_capacity) return MZ_TRUE;
+  return mz_zip_array_ensure_capacity(pZip, pArray, new_capacity, growing);
+}
+
+// Sets m_size to new_size, growing the storage first when it is too small.
+static MZ_FORCEINLINE mz_bool mz_zip_array_resize(mz_zip_archive *pZip,
+                                                  mz_zip_array *pArray,
+                                                  size_t new_size,
+                                                  mz_uint growing) {
+  if ((new_size > pArray->m_capacity) &&
+      !mz_zip_array_ensure_capacity(pZip, pArray, new_size, growing))
+    return MZ_FALSE;
+  pArray->m_size = new_size;
+  return MZ_TRUE;
+}
+
+static MZ_FORCEINLINE mz_bool mz_zip_array_ensure_room(mz_zip_archive *pZip,
+                                                       mz_zip_array *pArray,
+                                                       size_t n) {
+  return mz_zip_array_reserve(pZip, pArray, pArray->m_size + n, MZ_TRUE);  // room for n more elements; m_size is unchanged
+}
+
+// Appends n elements copied from pElements, growing the array as needed.
+// Returns MZ_FALSE (with the array unchanged) if the storage cannot grow.
+static MZ_FORCEINLINE mz_bool mz_zip_array_push_back(mz_zip_archive *pZip,
+                                                     mz_zip_array *pArray,
+                                                     const void *pElements,
+                                                     size_t n) {
+  const size_t elem = pArray->m_element_size, tail = pArray->m_size;
+  if (!mz_zip_array_resize(pZip, pArray, tail + n, MZ_TRUE)) return MZ_FALSE;
+  memcpy((mz_uint8 *)pArray->m_p + tail * elem, pElements, n * elem);
+  return MZ_TRUE;
+}
+
+#ifndef MINIZ_NO_TIME
+static time_t mz_zip_dos_to_time_t(int dos_time, int dos_date) {
+  struct tm parts;
+  memset(&parts, 0, sizeof(parts));
+  parts.tm_sec = (dos_time << 1) & 62;    // low 5 bits, in 2-second units
+  parts.tm_min = (dos_time >> 5) & 63;    // next 6 bits
+  parts.tm_hour = (dos_time >> 11) & 31;  // next 5 bits
+  parts.tm_mday = dos_date & 31;          // low 5 bits
+  parts.tm_mon = ((dos_date >> 5) & 15) - 1;  // stored 1-based, tm_mon is 0-based
+  parts.tm_year = ((dos_date >> 9) & 127) + 1980 - 1900;  // stored relative to 1980
+  parts.tm_isdst = -1;  // leave the daylight-saving decision to mktime()
+  return mktime(&parts);
+}
+
+// Packs local time into DOS time/date words; both are 0 if conversion fails.
+static void mz_zip_time_to_dos_time(time_t time, mz_uint16 *pDOS_time,
+                                    mz_uint16 *pDOS_date) {
+#ifdef _MSC_VER
+  struct tm tm_struct;
+  struct tm *tm = localtime_s(&tm_struct, &time) ? NULL : &tm_struct;
+#else
+  struct tm *tm = localtime(&time);
+#endif
+  if (!tm) {  // localtime() may return NULL; treat it like a localtime_s() error
+    *pDOS_date = 0;
+    *pDOS_time = 0;
+    return;
+  }
+  *pDOS_time = (mz_uint16)(((tm->tm_hour) << 11) + ((tm->tm_min) << 5) +
+                           ((tm->tm_sec) >> 1));
+  *pDOS_date = (mz_uint16)(((tm->tm_year + 1900 - 1980) << 9) +
+                           ((tm->tm_mon + 1) << 5) + tm->tm_mday);
+}
+#endif
+
+#ifndef MINIZ_NO_STDIO
+static mz_bool mz_zip_get_file_modified_time(const char *pFilename,
+                                             mz_uint16 *pDOS_time,
+                                             mz_uint16 *pDOS_date) {
+#ifdef MINIZ_NO_TIME
+  (void)pFilename;
+  *pDOS_date = *pDOS_time = 0;  // time support compiled out: report zeros
+#else
+  struct MZ_FILE_STAT_STRUCT file_stat;
+  // On Linux with x86 glibc, this call will fail on large files (>= 0x80000000
+  // bytes) unless you compiled with _LARGEFILE64_SOURCE. Argh.
+  if (MZ_FILE_STAT(pFilename, &file_stat) != 0) return MZ_FALSE;  // outputs untouched
+  mz_zip_time_to_dos_time(file_stat.st_mtime, pDOS_time, pDOS_date);
+#endif  // #ifdef MINIZ_NO_TIME
+  return MZ_TRUE;
+}
+
+#ifndef MINIZ_NO_TIME
+static mz_bool mz_zip_set_file_times(const char *pFilename, time_t access_time,
+                                     time_t modified_time) {
+  struct utimbuf t;
+  t.actime = access_time;
+  t.modtime = modified_time;
+  return !utime(pFilename, &t);  // utime() returns 0 on success, so invert it
+}
+#endif  // #ifndef MINIZ_NO_TIME
+#endif  // #ifndef MINIZ_NO_STDIO
+
+// Shared reader setup: requires an unused pZip, installs default allocator
+// callbacks where none were given, and allocates a zeroed internal state.
+static mz_bool mz_zip_reader_init_internal(mz_zip_archive *pZip,
+                                           mz_uint32 flags) {
+  mz_zip_internal_state *pState;
+  (void)flags;
+  if ((!pZip) || (pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_INVALID))
+    return MZ_FALSE;
+  if (!pZip->m_pAlloc) pZip->m_pAlloc = def_alloc_func;
+  if (!pZip->m_pFree) pZip->m_pFree = def_free_func;
+  if (!pZip->m_pRealloc) pZip->m_pRealloc = def_realloc_func;
+  pZip->m_zip_mode = MZ_ZIP_MODE_READING;
+  pZip->m_archive_size = 0;
+  pZip->m_central_directory_file_ofs = 0;
+  pZip->m_total_files = 0;
+  pState = (mz_zip_internal_state *)pZip->m_pAlloc(
+      pZip->m_pAlloc_opaque, 1, sizeof(mz_zip_internal_state));
+  pZip->m_pState = pState;
+  if (!pState) return MZ_FALSE;  // note: m_zip_mode is already READING here
+  memset(pState, 0, sizeof(mz_zip_internal_state));
+  MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pState->m_central_dir, sizeof(mz_uint8));
+  MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pState->m_central_dir_offsets,
+                                sizeof(mz_uint32));
+  MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pState->m_sorted_central_dir_offsets,
+                                sizeof(mz_uint32));
+  return MZ_TRUE;
+}
+
+// Returns nonzero when the filename of central directory record l_index sorts
+// before that of r_index. Bytes are compared after MZ_TOLOWER; when one name
+// is a prefix of the other, the shorter name sorts first.
+static MZ_FORCEINLINE mz_bool
+mz_zip_reader_filename_less(const mz_zip_array *pCentral_dir_array,
+                            const mz_zip_array *pCentral_dir_offsets,
+                            mz_uint l_index, mz_uint r_index) {
+  const mz_uint8 *pDir = (const mz_uint8 *)pCentral_dir_array->m_p;
+  const mz_uint32 *pOfs = (const mz_uint32 *)pCentral_dir_offsets->m_p;
+  const mz_uint8 *pL = pDir + pOfs[l_index];
+  const mz_uint8 *pR = pDir + pOfs[r_index];
+  const mz_uint l_len = MZ_READ_LE16(pL + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+  const mz_uint r_len = MZ_READ_LE16(pR + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+  const mz_uint8 *pE;
+  mz_uint8 l = 0, r = 0;
+  // The filename starts right after the fixed-size part of the record.
+  pL += MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+  pR += MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+  for (pE = pL + MZ_MIN(l_len, r_len); pL < pE; ++pL, ++pR) {
+    l = MZ_TOLOWER(*pL);
+    r = MZ_TOLOWER(*pR);
+    if (l != r) break;
+  }
+  return (pL == pE) ? (l_len < r_len) : (l < r);
+}
+
+#define MZ_SWAP_UINT32(a, b) \
+  do {                       \
+    mz_uint32 t = a;         \
+    a = b;                   \
+    b = t;                   \
+  }                          \
+  MZ_MACRO_END  // swaps two mz_uint32 lvalues; a and b are each evaluated twice
+
+// Sifts pIndices[root] down until the max-heap property (ordered by
+// lowercased filename) holds again for the heap stored in pIndices[0..limit).
+// pIndices holds file indices that are resolved through pCentral_dir_offsets.
+static void mz_zip_reader_sift_down_by_filename(
+    const mz_zip_array *pCentral_dir, const mz_zip_array *pCentral_dir_offsets,
+    mz_uint32 *pIndices, int root, int limit) {
+  for (;;) {
+    int child = (root << 1) + 1;
+    if (child >= limit) break;
+    // Pick the right child when it exists and the left one sorts before it.
+    if (((child + 1) < limit) &&
+        mz_zip_reader_filename_less(pCentral_dir, pCentral_dir_offsets,
+                                    pIndices[child], pIndices[child + 1]))
+      ++child;
+    if (!mz_zip_reader_filename_less(pCentral_dir, pCentral_dir_offsets,
+                                     pIndices[root], pIndices[child]))
+      break;
+    MZ_SWAP_UINT32(pIndices[root], pIndices[child]);
+    root = child;
+  }
+}
+
+// Heap sort of lowercased filenames, used to help accelerate plain central
+// directory searches by mz_zip_reader_locate_file(). (Could also use qsort(),
+// but it could allocate memory.) Only the index array
+// m_sorted_central_dir_offsets is permuted; the directory bytes and the
+// per-file offsets are read but never modified.
+static void mz_zip_reader_sort_central_dir_offsets_by_filename(
+    mz_zip_archive *pZip) {
+  mz_zip_internal_state *pState = pZip->m_pState;
+  const mz_zip_array *pDir = &pState->m_central_dir;
+  const mz_zip_array *pOfs = &pState->m_central_dir_offsets;
+  mz_uint32 *pIndices = &MZ_ZIP_ARRAY_ELEMENT(
+      &pState->m_sorted_central_dir_offsets, mz_uint32, 0);
+  const int size = pZip->m_total_files;
+  int start, end;
+
+  // Build the heap bottom-up, starting from the last parent node.
+  for (start = (size - 2) >> 1; start >= 0; --start)
+    mz_zip_reader_sift_down_by_filename(pDir, pOfs, pIndices, start, size);
+
+  // Repeatedly swap the largest remaining entry to the end of the shrinking
+  // heap and repair the heap in front of it.
+  for (end = size - 1; end > 0; --end) {
+    MZ_SWAP_UINT32(pIndices[end], pIndices[0]);
+    mz_zip_reader_sift_down_by_filename(pDir, pOfs, pIndices, 0, end);
+  }
+}
+
+// Locates the end-of-central-directory record by scanning backwards from the
+// end of the archive, validates it, then loads and indexes the central
+// directory (optionally preparing the filename-sorted index).
+static mz_bool mz_zip_reader_read_central_dir(mz_zip_archive *pZip,
+                                              mz_uint32 flags) {
+  mz_uint cdir_size, num_this_disk, cdir_disk_index;
+  mz_uint64 cdir_ofs;
+  mz_int64 cur_file_ofs;
+  const mz_uint8 *p;
+  mz_uint32 buf_u32[4096 / sizeof(mz_uint32)];
+  mz_uint8 *pBuf = (mz_uint8 *)buf_u32;
+  mz_bool sort_central_dir =
+      ((flags & MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY) == 0);
+  // Basic sanity check - reject files which are too small to hold an end of
+  // central directory record.
+  if (pZip->m_archive_size < MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)
+    return MZ_FALSE;
+  // Find the end of central directory record by scanning the file from the end
+  // towards the beginning.
+  cur_file_ofs =
+      MZ_MAX((mz_int64)pZip->m_archive_size - (mz_int64)sizeof(buf_u32), 0);
+  for (;;) {
+    int i,
+        n = (int)MZ_MIN(sizeof(buf_u32), pZip->m_archive_size - cur_file_ofs);
+    if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pBuf, n) != (mz_uint)n)
+      return MZ_FALSE;
+    for (i = n - 4; i >= 0; --i)
+      if (MZ_READ_LE32(pBuf + i) == MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIG) break;
+    if (i >= 0) {
+      cur_file_ofs += i;
+      break;
+    }
+    if ((!cur_file_ofs) || ((pZip->m_archive_size - cur_file_ofs) >=
+                            (0xFFFF + MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)))
+      return MZ_FALSE;
+    // Step back in signed arithmetic: sizeof() is unsigned, and an unsigned
+    // subtraction would wrap instead of letting MZ_MAX clamp at offset 0.
+    cur_file_ofs = MZ_MAX(cur_file_ofs - (mz_int64)(sizeof(buf_u32) - 3), 0);
+  }
+  // Read and verify the end of central directory record.
+  if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pBuf,
+                    MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE) !=
+      MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE)
+    return MZ_FALSE;
+  if ((MZ_READ_LE32(pBuf + MZ_ZIP_ECDH_SIG_OFS) !=
+       MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIG) ||
+      ((pZip->m_total_files =
+            MZ_READ_LE16(pBuf + MZ_ZIP_ECDH_CDIR_TOTAL_ENTRIES_OFS)) !=
+       MZ_READ_LE16(pBuf + MZ_ZIP_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS)))
+    return MZ_FALSE;
+  // Only single-disk archives pass: both disk numbers must be 0, or both 1.
+  num_this_disk = MZ_READ_LE16(pBuf + MZ_ZIP_ECDH_NUM_THIS_DISK_OFS);
+  cdir_disk_index = MZ_READ_LE16(pBuf + MZ_ZIP_ECDH_NUM_DISK_CDIR_OFS);
+  if (((num_this_disk | cdir_disk_index) != 0) &&
+      ((num_this_disk != 1) || (cdir_disk_index != 1)))
+    return MZ_FALSE;
+  // The directory must at least hold one fixed-size header per file...
+  if ((cdir_size = MZ_READ_LE32(pBuf + MZ_ZIP_ECDH_CDIR_SIZE_OFS)) <
+      pZip->m_total_files * MZ_ZIP_CENTRAL_DIR_HEADER_SIZE)
+    return MZ_FALSE;
+  // ...and must lie entirely inside the archive.
+  cdir_ofs = MZ_READ_LE32(pBuf + MZ_ZIP_ECDH_CDIR_OFS_OFS);
+  if ((cdir_ofs + (mz_uint64)cdir_size) > pZip->m_archive_size) return MZ_FALSE;
+  pZip->m_central_directory_file_ofs = cdir_ofs;
+  if (pZip->m_total_files) {
+    mz_uint i, n;
+    // Read the entire central directory into a heap block, and allocate another
+    // heap block to hold the unsorted central dir file record offsets, and
+    // another to hold the sorted indices.
+    if ((!mz_zip_array_resize(pZip, &pZip->m_pState->m_central_dir, cdir_size,
+                              MZ_FALSE)) ||
+        (!mz_zip_array_resize(pZip, &pZip->m_pState->m_central_dir_offsets,
+                              pZip->m_total_files, MZ_FALSE)))
+      return MZ_FALSE;
+
+    if (sort_central_dir) {
+      if (!mz_zip_array_resize(pZip,
+                               &pZip->m_pState->m_sorted_central_dir_offsets,
+                               pZip->m_total_files, MZ_FALSE))
+        return MZ_FALSE;
+    }
+
+    if (pZip->m_pRead(pZip->m_pIO_opaque, cdir_ofs,
+                      pZip->m_pState->m_central_dir.m_p,
+                      cdir_size) != cdir_size)
+      return MZ_FALSE;
+
+    // Now create an index into the central directory file records, do some
+    // basic sanity checking on each record, and check for zip64 entries (which
+    // are not yet supported).
+    p = (const mz_uint8 *)pZip->m_pState->m_central_dir.m_p;
+    for (n = cdir_size, i = 0; i < pZip->m_total_files; ++i) {
+      mz_uint total_header_size, comp_size, decomp_size, disk_index;
+      if ((n < MZ_ZIP_CENTRAL_DIR_HEADER_SIZE) ||
+          (MZ_READ_LE32(p) != MZ_ZIP_CENTRAL_DIR_HEADER_SIG))
+        return MZ_FALSE;
+      MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir_offsets, mz_uint32,
+                           i) =
+          (mz_uint32)(p - (const mz_uint8 *)pZip->m_pState->m_central_dir.m_p);
+      if (sort_central_dir)
+        MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_sorted_central_dir_offsets,
+                             mz_uint32, i) = i;
+      comp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS);
+      decomp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS);
+      if (((!MZ_READ_LE32(p + MZ_ZIP_CDH_METHOD_OFS)) &&
+           (decomp_size != comp_size)) ||
+          (decomp_size && !comp_size) || (decomp_size == 0xFFFFFFFF) ||
+          (comp_size == 0xFFFFFFFF))
+        return MZ_FALSE;
+      disk_index = MZ_READ_LE16(p + MZ_ZIP_CDH_DISK_START_OFS);
+      if ((disk_index != num_this_disk) && (disk_index != 1)) return MZ_FALSE;
+      if (((mz_uint64)MZ_READ_LE32(p + MZ_ZIP_CDH_LOCAL_HEADER_OFS) +
+           MZ_ZIP_LOCAL_DIR_HEADER_SIZE + comp_size) > pZip->m_archive_size)
+        return MZ_FALSE;
+      if ((total_header_size = MZ_ZIP_CENTRAL_DIR_HEADER_SIZE +
+                               MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS) +
+                               MZ_READ_LE16(p + MZ_ZIP_CDH_EXTRA_LEN_OFS) +
+                               MZ_READ_LE16(p + MZ_ZIP_CDH_COMMENT_LEN_OFS)) >
+          n)
+        return MZ_FALSE;
+      n -= total_header_size;
+      p += total_header_size;
+    }
+  }
+  if (sort_central_dir)
+    mz_zip_reader_sort_central_dir_offsets_by_filename(pZip);
+  return MZ_TRUE;
+}
+
+// Opens an archive of `size` bytes that is read through the caller's m_pRead.
+mz_bool mz_zip_reader_init(mz_zip_archive *pZip, mz_uint64 size,
+                           mz_uint32 flags) {
+  if (!pZip || !pZip->m_pRead || !mz_zip_reader_init_internal(pZip, flags))
+    return MZ_FALSE;
+  pZip->m_archive_size = size;
+  if (mz_zip_reader_read_central_dir(pZip, flags)) return MZ_TRUE;
+  // The central directory was rejected: tear the reader back down.
+  mz_zip_reader_end(pZip);
+  return MZ_FALSE;
+}
+
+// m_pRead hook for in-memory archives; the copy is clamped at end-of-archive.
+static size_t mz_zip_mem_read_func(void *pOpaque, mz_uint64 file_ofs,
+                                   void *pBuf, size_t n) {
+  mz_zip_archive *pArc = (mz_zip_archive *)pOpaque;
+  const mz_uint64 end = pArc->m_archive_size;
+  const size_t count = (file_ofs < end) ? (size_t)MZ_MIN(end - file_ofs, n) : 0;
+  memcpy(pBuf, (const mz_uint8 *)pArc->m_pState->m_pMem + file_ofs, count);
+  return count;
+}
+
+// Opens a ZIP image held in caller memory. pMem is referenced, not copied, so
+// it has to stay valid for as long as the reader reads from it.
+mz_bool mz_zip_reader_init_mem(mz_zip_archive *pZip, const void *pMem,
+                               size_t size, mz_uint32 flags) {
+  if (!mz_zip_reader_init_internal(pZip, flags)) return MZ_FALSE;
+  pZip->m_pRead = mz_zip_mem_read_func;
+  pZip->m_pIO_opaque = pZip;
+  pZip->m_archive_size = size;
+#ifdef __cplusplus
+  pZip->m_pState->m_pMem = const_cast<void *>(pMem);
+#else
+  pZip->m_pState->m_pMem = (void *)pMem;
+#endif
+  pZip->m_pState->m_mem_size = size;
+  if (mz_zip_reader_read_central_dir(pZip, flags)) return MZ_TRUE;
+  mz_zip_reader_end(pZip);
+  return MZ_FALSE;
+}
+
+#ifndef MINIZ_NO_STDIO
+// m_pRead hook for stdio archives: seeks only when not already at file_ofs.
+static size_t mz_zip_file_read_func(void *pOpaque, mz_uint64 file_ofs,
+                                    void *pBuf, size_t n) {
+  MZ_FILE *pFile = ((mz_zip_archive *)pOpaque)->m_pState->m_pFile;
+  const mz_int64 want = (mz_int64)file_ofs;
+  const mz_int64 have = MZ_FTELL64(pFile);
+  if (want < 0) return 0;  // offset is not representable as a signed seek
+  if ((have != want) && MZ_FSEEK64(pFile, want, SEEK_SET)) return 0;
+  return MZ_FREAD(pBuf, 1, n, pFile);
+}
+
+// Opens pFilename ("rb") and parses it as a ZIP archive, using the end-of-file
+// position as the archive size. On failure the FILE does not stay open.
+mz_bool mz_zip_reader_init_file(mz_zip_archive *pZip, const char *pFilename,
+                                mz_uint32 flags) {
+  MZ_FILE *pFile = MZ_FOPEN(pFilename, "rb");
+  mz_uint64 file_size;
+  if (!pFile) return MZ_FALSE;
+  if (MZ_FSEEK64(pFile, 0, SEEK_END)) {
+    MZ_FCLOSE(pFile);
+    return MZ_FALSE;
+  }
+  file_size = MZ_FTELL64(pFile);
+  if (!mz_zip_reader_init_internal(pZip, flags)) {
+    MZ_FCLOSE(pFile);
+    return MZ_FALSE;
+  }
+  pZip->m_pState->m_pFile = pFile;
+  pZip->m_pIO_opaque = pZip;
+  pZip->m_pRead = mz_zip_file_read_func;
+  pZip->m_archive_size = file_size;
+  if (mz_zip_reader_read_central_dir(pZip, flags)) return MZ_TRUE;
+  mz_zip_reader_end(pZip);  // closes pFile through m_pState->m_pFile
+  return MZ_FALSE;
+}
+#endif  // #ifndef MINIZ_NO_STDIO
+
+mz_uint mz_zip_reader_get_num_files(mz_zip_archive *pZip) {
+  return (pZip == NULL) ? 0 : pZip->m_total_files;  // NULL archive: 0 entries
+}
+
+static MZ_FORCEINLINE const mz_uint8 *mz_zip_reader_get_cdh(
+    mz_zip_archive *pZip, mz_uint file_index) {
+  mz_uint32 ofs;  // byte offset of the entry's record within m_central_dir
+  if (!pZip || !pZip->m_pState || (file_index >= pZip->m_total_files) ||
+      (pZip->m_zip_mode != MZ_ZIP_MODE_READING))
+    return NULL;  // no open reader, or index out of range
+  ofs = MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir_offsets,
+                             mz_uint32, file_index);
+  return &MZ_ZIP_ARRAY_ELEMENT(&pZip->m_pState->m_central_dir, mz_uint8, ofs);
+}
+
+// Reports bit 0 of the entry's general-purpose bit flag (the encryption bit);
+// an entry that cannot be looked up yields MZ_FALSE.
+mz_bool mz_zip_reader_is_file_encrypted(mz_zip_archive *pZip,
+                                        mz_uint file_index) {
+  const mz_uint8 *pHeader = mz_zip_reader_get_cdh(pZip, file_index);
+  if (pHeader == NULL) return MZ_FALSE;
+  return (mz_bool)(MZ_READ_LE16(pHeader + MZ_ZIP_CDH_BIT_FLAG_OFS) & 1);
+}
+
+// An entry is a directory if its name ends in '/' or if the DOS directory bit
+// (0x10) is set in its external attributes. Unknown entries yield MZ_FALSE.
+mz_bool mz_zip_reader_is_file_a_directory(mz_zip_archive *pZip,
+                                          mz_uint file_index) {
+  const mz_uint8 *pHeader = mz_zip_reader_get_cdh(pZip, file_index);
+  mz_uint name_len;
+  if (!pHeader) return MZ_FALSE;
+
+  // Test 1: trailing '/' on the filename that follows the fixed-size header.
+  name_len = MZ_READ_LE16(pHeader + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+  if (name_len &&
+      (pHeader[MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + name_len - 1] == '/'))
+    return MZ_TRUE;
+
+  // Test 2: the DOS directory attribute. Bugfix: this code was also checking
+  // if the internal attribute was non-zero, which wasn't correct. Most/all zip
+  // writers (hopefully) set DOS file/directory attributes in the low 16 bits,
+  // so the source OS ID in the created-by field is ignored.
+  // FIXME: Remove this check? Is it necessary - we already check the filename.
+  if (MZ_READ_LE32(pHeader + MZ_ZIP_CDH_EXTERNAL_ATTR_OFS) & 0x10)
+    return MZ_TRUE;
+
+  return MZ_FALSE;
+}
+
+// Decodes entry file_index's central directory record into *pStat. The name
+// and comment are truncated to pStat's fixed buffers and NUL-terminated.
+mz_bool mz_zip_reader_file_stat(mz_zip_archive *pZip, mz_uint file_index,
+                                mz_zip_archive_file_stat *pStat) {
+  const mz_uint8 *p = mz_zip_reader_get_cdh(pZip, file_index);
+  const mz_uint8 *pTail;  // variable-length data after the fixed-size header
+  mz_uint name_len, extra_len, copy_len;
+  if (!p || !pStat) return MZ_FALSE;
+  // Fixed-size fields.
+  pStat->m_file_index = file_index;
+  pStat->m_central_dir_ofs = MZ_ZIP_ARRAY_ELEMENT(
+      &pZip->m_pState->m_central_dir_offsets, mz_uint32, file_index);
+  pStat->m_version_made_by = MZ_READ_LE16(p + MZ_ZIP_CDH_VERSION_MADE_BY_OFS);
+  pStat->m_version_needed = MZ_READ_LE16(p + MZ_ZIP_CDH_VERSION_NEEDED_OFS);
+  pStat->m_bit_flag = MZ_READ_LE16(p + MZ_ZIP_CDH_BIT_FLAG_OFS);
+  pStat->m_method = MZ_READ_LE16(p + MZ_ZIP_CDH_METHOD_OFS);
+#ifndef MINIZ_NO_TIME
+  pStat->m_time =
+      mz_zip_dos_to_time_t(MZ_READ_LE16(p + MZ_ZIP_CDH_FILE_TIME_OFS),
+                           MZ_READ_LE16(p + MZ_ZIP_CDH_FILE_DATE_OFS));
+#endif
+  pStat->m_crc32 = MZ_READ_LE32(p + MZ_ZIP_CDH_CRC32_OFS);
+  pStat->m_comp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS);
+  pStat->m_uncomp_size = MZ_READ_LE32(p + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS);
+  pStat->m_internal_attr = MZ_READ_LE16(p + MZ_ZIP_CDH_INTERNAL_ATTR_OFS);
+  pStat->m_external_attr = MZ_READ_LE32(p + MZ_ZIP_CDH_EXTERNAL_ATTR_OFS);
+  pStat->m_local_header_ofs = MZ_READ_LE32(p + MZ_ZIP_CDH_LOCAL_HEADER_OFS);
+
+  // Filename: the first item after the fixed-size header.
+  name_len = MZ_READ_LE16(p + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+  extra_len = MZ_READ_LE16(p + MZ_ZIP_CDH_EXTRA_LEN_OFS);
+  pTail = p + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+  copy_len = MZ_MIN(name_len, MZ_ZIP_MAX_ARCHIVE_FILENAME_SIZE - 1);
+  memcpy(pStat->m_filename, pTail, copy_len);
+  pStat->m_filename[copy_len] = '\0';
+  // Comment: stored after the (untruncated) filename and the extra field.
+  copy_len = MZ_READ_LE16(p + MZ_ZIP_CDH_COMMENT_LEN_OFS);
+  copy_len = MZ_MIN(copy_len, MZ_ZIP_MAX_ARCHIVE_FILE_COMMENT_SIZE - 1);
+  pStat->m_comment_size = copy_len;
+  memcpy(pStat->m_comment, pTail + name_len + extra_len, copy_len);
+  pStat->m_comment[copy_len] = '\0';
+  return MZ_TRUE;
+}
+
+// Returns name length + 1 (after truncation to the buffer, if one is given).
+mz_uint mz_zip_reader_get_filename(mz_zip_archive *pZip, mz_uint file_index,
+                                   char *pFilename, mz_uint filename_buf_size) {
+  const mz_uint8 *pHdr = mz_zip_reader_get_cdh(pZip, file_index);
+  mz_uint len;
+  if (!pHdr) {
+    if (filename_buf_size) pFilename[0] = '\0';
+    return 0;
+  }
+  len = MZ_READ_LE16(pHdr + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+  if (!filename_buf_size) return len + 1;  // size query: nothing is written
+  if (len > filename_buf_size - 1) len = filename_buf_size - 1;
+  memcpy(pFilename, pHdr + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE, len);
+  pFilename[len] = '\0';
+  return len + 1;
+}
+
+// Compares len bytes: exact with MZ_ZIP_FLAG_CASE_SENSITIVE, else MZ_TOLOWER'd.
+static MZ_FORCEINLINE mz_bool mz_zip_reader_string_equal(const char *pA,
+                                                         const char *pB,
+                                                         mz_uint len,
+                                                         mz_uint flags) {
+  if (flags & MZ_ZIP_FLAG_CASE_SENSITIVE) return memcmp(pA, pB, len) == 0;
+  for (; len; --len, ++pA, ++pB)
+    if (MZ_TOLOWER(*pA) != MZ_TOLOWER(*pB)) return MZ_FALSE;
+  return MZ_TRUE;
+}
+
+// Case-insensitive ordering of central-directory entry l_index's name against
+// pR[0..r_len): <0, 0 or >0. If one name is a prefix, the lengths decide.
+static MZ_FORCEINLINE int mz_zip_reader_filename_compare(
+    const mz_zip_array *pCentral_dir_array,
+    const mz_zip_array *pCentral_dir_offsets, mz_uint l_index, const char *pR,
+    mz_uint r_len) {
+  const mz_uint32 rec_ofs =
+      MZ_ZIP_ARRAY_ELEMENT(pCentral_dir_offsets, mz_uint32, l_index);
+  const mz_uint8 *pL =
+      &MZ_ZIP_ARRAY_ELEMENT(pCentral_dir_array, mz_uint8, rec_ofs);
+  const mz_uint l_len = MZ_READ_LE16(pL + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+  mz_uint i, common = MZ_MIN(l_len, r_len);
+  pL += MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+  // Walk the shared prefix; the first differing folded byte decides.
+  for (i = 0; i < common; ++i) {
+    const mz_uint8 l = MZ_TOLOWER(pL[i]), r = MZ_TOLOWER(pR[i]);
+    if (l != r) return l - r;
+  }
+  return (int)(l_len - r_len);
+}
+
+// Binary search over m_sorted_central_dir_offsets using the ordering defined
+// by mz_zip_reader_filename_compare. Returns the entry index, or -1 if absent.
+static int mz_zip_reader_locate_file_binary_search(mz_zip_archive *pZip,
+                                                   const char *pFilename) {
+  mz_zip_internal_state *pState = pZip->m_pState;
+  const mz_zip_array *pOffsets = &pState->m_central_dir_offsets;
+  const mz_zip_array *pDir = &pState->m_central_dir;
+  const mz_uint32 *pSorted = &MZ_ZIP_ARRAY_ELEMENT(
+      &pState->m_sorted_central_dir_offsets, mz_uint32, 0);
+  const mz_uint name_len = (mz_uint)strlen(pFilename);
+  int lo = 0, hi = (int)pZip->m_total_files - 1;
+  while (lo <= hi) {
+    const int mid = (lo + hi) >> 1;
+    const int candidate = pSorted[mid];
+    const int order = mz_zip_reader_filename_compare(pDir, pOffsets, candidate,
+                                                     pFilename, name_len);
+    if (order == 0) return candidate;
+    if (order < 0)
+      lo = mid + 1;
+    else
+      hi = mid - 1;
+  }
+  return -1;
+}
+
+// Returns the index of the entry named pName (whose comment must also equal
+// pComment when that is non-empty), or -1. May use the sorted index if built.
+int mz_zip_reader_locate_file(mz_zip_archive *pZip, const char *pName,
+                              const char *pComment, mz_uint flags) {
+  mz_zip_internal_state *pState;
+  size_t name_len, comment_len;
+  mz_uint idx;
+  if (!pZip || !pZip->m_pState || !pName ||
+      (pZip->m_zip_mode != MZ_ZIP_MODE_READING))
+    return -1;
+  pState = pZip->m_pState;
+  if (!pComment && pState->m_sorted_central_dir_offsets.m_size &&
+      !(flags & (MZ_ZIP_FLAG_IGNORE_PATH | MZ_ZIP_FLAG_CASE_SENSITIVE)))
+    return mz_zip_reader_locate_file_binary_search(pZip, pName);
+  name_len = strlen(pName);
+  if (name_len > 0xFFFF) return -1;
+  comment_len = pComment ? strlen(pComment) : 0;
+  if (comment_len > 0xFFFF) return -1;
+  for (idx = 0; idx < pZip->m_total_files; idx++) {
+    const mz_uint8 *pHdr = &MZ_ZIP_ARRAY_ELEMENT(
+        &pState->m_central_dir, mz_uint8,
+        MZ_ZIP_ARRAY_ELEMENT(&pState->m_central_dir_offsets, mz_uint32, idx));
+    const char *pCand = (const char *)pHdr + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE;
+    mz_uint cand_len = MZ_READ_LE16(pHdr + MZ_ZIP_CDH_FILENAME_LEN_OFS);
+    if (cand_len < name_len) continue;
+    if (comment_len) {
+      mz_uint extra_len = MZ_READ_LE16(pHdr + MZ_ZIP_CDH_EXTRA_LEN_OFS);
+      mz_uint cmt_len = MZ_READ_LE16(pHdr + MZ_ZIP_CDH_COMMENT_LEN_OFS);
+      if ((cmt_len != comment_len) ||
+          !mz_zip_reader_string_equal(pComment, pCand + cand_len + extra_len,
+                                      cmt_len, flags))
+        continue;
+    }
+    if ((flags & MZ_ZIP_FLAG_IGNORE_PATH) && cand_len) {
+      // Keep only what follows the last '/', '\\' or ':' separator.
+      int cut = (int)cand_len;
+      while (cut > 0) {
+        const char c = pCand[cut - 1];
+        if ((c == '/') || (c == '\\') || (c == ':')) break;
+        --cut;
+      }
+      pCand += cut;
+      cand_len -= cut;
+    }
+    if ((cand_len == name_len) &&
+        mz_zip_reader_string_equal(pName, pCand, cand_len, flags))
+      return idx;
+  }
+  return -1;
+}
+// Extract entry file_index into caller buffer pBuf (stored or deflate only).
+mz_bool mz_zip_reader_extract_to_mem_no_alloc(mz_zip_archive *pZip,
+                                              mz_uint file_index, void *pBuf,
+                                              size_t buf_size, mz_uint flags,
+                                              void *pUser_read_buf,
+                                              size_t user_read_buf_size) {
+  int status = TINFL_STATUS_DONE;
+  mz_uint64 needed_size, cur_file_ofs, comp_remaining,
+      out_buf_ofs = 0, read_buf_size, read_buf_ofs = 0, read_buf_avail;
+  mz_zip_archive_file_stat file_stat;
+  void *pRead_buf;
+  mz_uint32
+      local_header_u32[(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) /
+                       sizeof(mz_uint32)];
+  mz_uint8 *pLocal_header = (mz_uint8 *)local_header_u32;
+  tinfl_decompressor inflator;
+
+  if ((buf_size) && (!pBuf)) return MZ_FALSE;
+
+  if (!mz_zip_reader_file_stat(pZip, file_index, &file_stat)) return MZ_FALSE;
+
+  // Nothing to extract: an empty file, or a directory (though odd zips exist
+  // whose directories carry compressed data that inflates to 0 bytes).
+  if (!file_stat.m_comp_size) return MZ_TRUE;
+
+  // Entry is a subdirectory: report success without writing anything. (Old
+  // zips exist whose directory entries hold deflate data inflating to 0 bytes
+  // while their headers claim to uncompress to 512 bytes.)
+  // Open question from the original author: should this case fail instead?
+  if (mz_zip_reader_is_file_a_directory(pZip, file_index)) return MZ_TRUE;
+
+  // Flag bit 0 (encryption) and bit 5 (patch data) are not supported.
+  if (file_stat.m_bit_flag & (1 | 32)) return MZ_FALSE;
+
+  // Only stored and deflate can be decoded; a raw-data request skips this.
+  if ((!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) && (file_stat.m_method != 0) &&
+      (file_stat.m_method != MZ_DEFLATED))
+    return MZ_FALSE;
+
+  // Ensure the supplied output buffer can hold everything that is written.
+  needed_size = (flags & MZ_ZIP_FLAG_COMPRESSED_DATA) ? file_stat.m_comp_size
+                                                      : file_stat.m_uncomp_size;
+  if (buf_size < needed_size) return MZ_FALSE;
+
+  // Read the fixed-size local header and validate its signature.
+  cur_file_ofs = file_stat.m_local_header_ofs;
+  if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pLocal_header,
+                    MZ_ZIP_LOCAL_DIR_HEADER_SIZE) !=
+      MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+    return MZ_FALSE;
+  if (MZ_READ_LE32(pLocal_header) != MZ_ZIP_LOCAL_DIR_HEADER_SIG)
+    return MZ_FALSE;
+
+  cur_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE +
+                  MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_FILENAME_LEN_OFS) +
+                  MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_EXTRA_LEN_OFS);
+  if ((cur_file_ofs + file_stat.m_comp_size) > pZip->m_archive_size)
+    return MZ_FALSE;
+
+  if ((flags & MZ_ZIP_FLAG_COMPRESSED_DATA) || (!file_stat.m_method)) {
+    // Stored entry or raw request: read verbatim (CRC checked unless raw).
+    if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pBuf,
+                      (size_t)needed_size) != needed_size)
+      return MZ_FALSE;
+    return ((flags & MZ_ZIP_FLAG_COMPRESSED_DATA) != 0) ||
+           (mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf,
+                     (size_t)file_stat.m_uncomp_size) == file_stat.m_crc32);
+  }
+
+  // Inflate, taking input straight from the in-memory archive, from the
+  // caller's read buffer, or from a temporary buffer allocated below.
+  tinfl_init(&inflator);
+
+  if (pZip->m_pState->m_pMem) {
+    // Read directly from the archive in memory.
+    pRead_buf = (mz_uint8 *)pZip->m_pState->m_pMem + cur_file_ofs;
+    read_buf_size = read_buf_avail = file_stat.m_comp_size;
+    comp_remaining = 0;  // nothing left to fetch through m_pRead
+  } else if (pUser_read_buf) {
+    // Use a user provided read buffer.
+    if (!user_read_buf_size) return MZ_FALSE;
+    pRead_buf = (mz_uint8 *)pUser_read_buf;
+    read_buf_size = user_read_buf_size;
+    read_buf_avail = 0;
+    comp_remaining = file_stat.m_comp_size;
+  } else {
+    // Temporarily allocate a read buffer (at most MZ_ZIP_MAX_IO_BUF_SIZE).
+    read_buf_size =
+        MZ_MIN(file_stat.m_comp_size, (mz_uint)MZ_ZIP_MAX_IO_BUF_SIZE);
+#ifdef _MSC_VER
+    if (((0, sizeof(size_t) == sizeof(mz_uint32))) &&
+        (read_buf_size > 0x7FFFFFFF))
+#else
+    if (((sizeof(size_t) == sizeof(mz_uint32))) && (read_buf_size > 0x7FFFFFFF))
+#endif
+      return MZ_FALSE;
+    if (NULL == (pRead_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1,
+                                            (size_t)read_buf_size)))
+      return MZ_FALSE;
+    read_buf_avail = 0;
+    comp_remaining = file_stat.m_comp_size;
+  }
+
+  do {
+    size_t in_buf_size,
+        out_buf_size = (size_t)(file_stat.m_uncomp_size - out_buf_ofs);
+    if ((!read_buf_avail) && (!pZip->m_pState->m_pMem)) {
+      read_buf_avail = MZ_MIN(read_buf_size, comp_remaining);
+      if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pRead_buf,
+                        (size_t)read_buf_avail) != read_buf_avail) {
+        status = TINFL_STATUS_FAILED;
+        break;
+      }
+      cur_file_ofs += read_buf_avail;
+      comp_remaining -= read_buf_avail;
+      read_buf_ofs = 0;
+    }
+    in_buf_size = (size_t)read_buf_avail;
+    status = tinfl_decompress(
+        &inflator, (mz_uint8 *)pRead_buf + read_buf_ofs, &in_buf_size,
+        (mz_uint8 *)pBuf, (mz_uint8 *)pBuf + out_buf_ofs, &out_buf_size,
+        TINFL_FLAG_USING_NON_WRAPPING_OUTPUT_BUF |
+            (comp_remaining ? TINFL_FLAG_HAS_MORE_INPUT : 0));
+    read_buf_avail -= in_buf_size;
+    read_buf_ofs += in_buf_size;
+    out_buf_ofs += out_buf_size;
+  } while (status == TINFL_STATUS_NEEDS_MORE_INPUT);
+
+  if (status == TINFL_STATUS_DONE) {
+    // Make sure the entire file was decompressed, and check its CRC.
+    if ((out_buf_ofs != file_stat.m_uncomp_size) ||
+        (mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf,
+                  (size_t)file_stat.m_uncomp_size) != file_stat.m_crc32))
+      status = TINFL_STATUS_FAILED;
+  }
+
+  if ((!pZip->m_pState->m_pMem) && (!pUser_read_buf))  // temp buffer only
+    pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+
+  return status == TINFL_STATUS_DONE;
+}
+
+// By-name variant: resolves pFilename to an index, then extracts into pBuf.
+mz_bool mz_zip_reader_extract_file_to_mem_no_alloc(
+    mz_zip_archive *pZip, const char *pFilename, void *pBuf, size_t buf_size,
+    mz_uint flags, void *pUser_read_buf, size_t user_read_buf_size) {
+  const int idx = mz_zip_reader_locate_file(pZip, pFilename, NULL, flags);
+  if (idx < 0) return MZ_FALSE;  // no such entry
+  return mz_zip_reader_extract_to_mem_no_alloc(
+      pZip, idx, pBuf, buf_size, flags, pUser_read_buf, user_read_buf_size);
+}
+
+mz_bool mz_zip_reader_extract_to_mem(mz_zip_archive *pZip, mz_uint file_index,
+                                     void *pBuf, size_t buf_size,
+                                     mz_uint flags) {
+  return mz_zip_reader_extract_to_mem_no_alloc(
+      pZip, file_index, pBuf, buf_size, flags, NULL, 0);  // no user read buf
+}
+
+mz_bool mz_zip_reader_extract_file_to_mem(mz_zip_archive *pZip,
+                                          const char *pFilename, void *pBuf,
+                                          size_t buf_size, mz_uint flags) {
+  return mz_zip_reader_extract_file_to_mem_no_alloc(
+      pZip, pFilename, pBuf, buf_size, flags, NULL, 0);  // no user read buffer
+}
+
+// Extracts entry file_index into a buffer from pZip->m_pAlloc and returns it,
+// storing its size in *pSize if given; on failure returns NULL with size 0.
+void *mz_zip_reader_extract_to_heap(mz_zip_archive *pZip, mz_uint file_index,
+                                    size_t *pSize, mz_uint flags) {
+  const mz_uint8 *pHdr = mz_zip_reader_get_cdh(pZip, file_index);
+  mz_uint64 alloc_size;
+  void *pOut;
+  if (pSize) *pSize = 0;
+  if (!pHdr) return NULL;
+
+  // Size the buffer for raw compressed bytes or for the inflated data.
+  if (flags & MZ_ZIP_FLAG_COMPRESSED_DATA)
+    alloc_size = MZ_READ_LE32(pHdr + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS);
+  else
+    alloc_size = MZ_READ_LE32(pHdr + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS);
+#ifdef _MSC_VER
+  if (((0, sizeof(size_t) == sizeof(mz_uint32))) && (alloc_size > 0x7FFFFFFF))
+#else
+  if (((sizeof(size_t) == sizeof(mz_uint32))) && (alloc_size > 0x7FFFFFFF))
+#endif
+    return NULL;
+  pOut = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, (size_t)alloc_size);
+  if (!pOut) return NULL;
+
+  if (mz_zip_reader_extract_to_mem(pZip, file_index, pOut, (size_t)alloc_size,
+                                   flags)) {
+    if (pSize) *pSize = (size_t)alloc_size;
+    return pOut;
+  }
+  pZip->m_pFree(pZip->m_pAlloc_opaque, pOut);
+  return NULL;
+}
+
+// By-name heap extraction; NULL (and *pSize == 0) if pFilename is not found.
+void *mz_zip_reader_extract_file_to_heap(mz_zip_archive *pZip,
+                                         const char *pFilename, size_t *pSize,
+                                         mz_uint flags) {
+  int file_index = mz_zip_reader_locate_file(pZip, pFilename, NULL, flags);
+  if (file_index >= 0)
+    return mz_zip_reader_extract_to_heap(pZip, file_index, pSize, flags);
+  if (pSize) *pSize = 0;
+  return NULL;  // pointer-typed failure value (was the boolean MZ_FALSE)
+}
+// Stream entry file_index to pCallback chunk by chunk (stored or deflate only).
+mz_bool mz_zip_reader_extract_to_callback(mz_zip_archive *pZip,
+                                          mz_uint file_index,
+                                          mz_file_write_func pCallback,
+                                          void *pOpaque, mz_uint flags) {
+  int status = TINFL_STATUS_DONE;
+  mz_uint file_crc32 = MZ_CRC32_INIT;
+  mz_uint64 read_buf_size, read_buf_ofs = 0, read_buf_avail, comp_remaining,
+                           out_buf_ofs = 0, cur_file_ofs;
+  mz_zip_archive_file_stat file_stat;
+  void *pRead_buf = NULL;
+  void *pWrite_buf = NULL;
+  mz_uint32
+      local_header_u32[(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) /
+                       sizeof(mz_uint32)];
+  mz_uint8 *pLocal_header = (mz_uint8 *)local_header_u32;
+
+  if (!mz_zip_reader_file_stat(pZip, file_index, &file_stat)) return MZ_FALSE;
+
+  // Nothing to extract: an empty file, or a directory (though odd zips exist
+  // whose directories carry compressed data that inflates to 0 bytes).
+  if (!file_stat.m_comp_size) return MZ_TRUE;
+
+  // Entry is a subdirectory: report success without calling pCallback. (Old
+  // zips exist whose directory entries hold deflate data inflating to 0 bytes
+  // while their headers claim to uncompress to 512 bytes.)
+  // Open question from the original author: should this case fail instead?
+  if (mz_zip_reader_is_file_a_directory(pZip, file_index)) return MZ_TRUE;
+
+  // Flag bit 0 (encryption) and bit 5 (patch data) are not supported.
+  if (file_stat.m_bit_flag & (1 | 32)) return MZ_FALSE;
+
+  // Only stored and deflate can be decoded; a raw-data request skips this.
+  if ((!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) && (file_stat.m_method != 0) &&
+      (file_stat.m_method != MZ_DEFLATED))
+    return MZ_FALSE;
+
+  // Read the fixed-size local header and validate its signature.
+  cur_file_ofs = file_stat.m_local_header_ofs;
+  if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pLocal_header,
+                    MZ_ZIP_LOCAL_DIR_HEADER_SIZE) !=
+      MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+    return MZ_FALSE;
+  if (MZ_READ_LE32(pLocal_header) != MZ_ZIP_LOCAL_DIR_HEADER_SIG)
+    return MZ_FALSE;
+
+  cur_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE +
+                  MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_FILENAME_LEN_OFS) +
+                  MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_EXTRA_LEN_OFS);
+  if ((cur_file_ofs + file_stat.m_comp_size) > pZip->m_archive_size)
+    return MZ_FALSE;
+
+  // Choose the input source: the in-memory archive itself, or a temporary
+  // buffer (at most MZ_ZIP_MAX_IO_BUF_SIZE bytes) refilled through m_pRead.
+  if (pZip->m_pState->m_pMem) {
+    pRead_buf = (mz_uint8 *)pZip->m_pState->m_pMem + cur_file_ofs;
+    read_buf_size = read_buf_avail = file_stat.m_comp_size;
+    comp_remaining = 0;  // nothing left to fetch through m_pRead
+  } else {
+    read_buf_size =
+        MZ_MIN(file_stat.m_comp_size, (mz_uint)MZ_ZIP_MAX_IO_BUF_SIZE);
+    if (NULL == (pRead_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1,
+                                            (size_t)read_buf_size)))
+      return MZ_FALSE;
+    read_buf_avail = 0;
+    comp_remaining = file_stat.m_comp_size;
+  }
+
+  if ((flags & MZ_ZIP_FLAG_COMPRESSED_DATA) || (!file_stat.m_method)) {
+    // Stored entry or raw request: pass bytes through (CRC kept unless raw).
+    if (pZip->m_pState->m_pMem) {
+#ifdef _MSC_VER
+      if (((0, sizeof(size_t) == sizeof(mz_uint32))) &&
+          (file_stat.m_comp_size > 0xFFFFFFFF))
+#else
+      if (((sizeof(size_t) == sizeof(mz_uint32))) &&
+          (file_stat.m_comp_size > 0xFFFFFFFF))
+#endif
+        return MZ_FALSE;
+      if (pCallback(pOpaque, out_buf_ofs, pRead_buf,
+                    (size_t)file_stat.m_comp_size) != file_stat.m_comp_size)
+        status = TINFL_STATUS_FAILED;
+      else if (!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA))
+        file_crc32 =
+            (mz_uint32)mz_crc32(file_crc32, (const mz_uint8 *)pRead_buf,
+                                (size_t)file_stat.m_comp_size);
+      cur_file_ofs += file_stat.m_comp_size;
+      out_buf_ofs += file_stat.m_comp_size;
+      comp_remaining = 0;
+    } else {
+      while (comp_remaining) {
+        read_buf_avail = MZ_MIN(read_buf_size, comp_remaining);
+        if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pRead_buf,
+                          (size_t)read_buf_avail) != read_buf_avail) {
+          status = TINFL_STATUS_FAILED;
+          break;
+        }
+
+        if (!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA))
+          file_crc32 = (mz_uint32)mz_crc32(
+              file_crc32, (const mz_uint8 *)pRead_buf, (size_t)read_buf_avail);
+
+        if (pCallback(pOpaque, out_buf_ofs, pRead_buf,
+                      (size_t)read_buf_avail) != read_buf_avail) {
+          status = TINFL_STATUS_FAILED;
+          break;
+        }
+        cur_file_ofs += read_buf_avail;
+        out_buf_ofs += read_buf_avail;
+        comp_remaining -= read_buf_avail;
+      }
+    }
+  } else {
+    tinfl_decompressor inflator;
+    tinfl_init(&inflator);
+
+    if (NULL == (pWrite_buf = pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1,
+                                             TINFL_LZ_DICT_SIZE)))
+      status = TINFL_STATUS_FAILED;
+    else {
+      do {
+        mz_uint8 *pWrite_buf_cur =
+            (mz_uint8 *)pWrite_buf + (out_buf_ofs & (TINFL_LZ_DICT_SIZE - 1));
+        size_t in_buf_size,
+            out_buf_size =
+                TINFL_LZ_DICT_SIZE - (out_buf_ofs & (TINFL_LZ_DICT_SIZE - 1));
+        if ((!read_buf_avail) && (!pZip->m_pState->m_pMem)) {
+          read_buf_avail = MZ_MIN(read_buf_size, comp_remaining);
+          if (pZip->m_pRead(pZip->m_pIO_opaque, cur_file_ofs, pRead_buf,
+                            (size_t)read_buf_avail) != read_buf_avail) {
+            status = TINFL_STATUS_FAILED;
+            break;
+          }
+          cur_file_ofs += read_buf_avail;
+          comp_remaining -= read_buf_avail;
+          read_buf_ofs = 0;
+        }
+
+        in_buf_size = (size_t)read_buf_avail;
+        status = tinfl_decompress(
+            &inflator, (const mz_uint8 *)pRead_buf + read_buf_ofs, &in_buf_size,
+            (mz_uint8 *)pWrite_buf, pWrite_buf_cur, &out_buf_size,
+            comp_remaining ? TINFL_FLAG_HAS_MORE_INPUT : 0);
+        read_buf_avail -= in_buf_size;
+        read_buf_ofs += in_buf_size;
+
+        if (out_buf_size) {
+          if (pCallback(pOpaque, out_buf_ofs, pWrite_buf_cur, out_buf_size) !=
+              out_buf_size) {
+            status = TINFL_STATUS_FAILED;
+            break;
+          }
+          file_crc32 =
+              (mz_uint32)mz_crc32(file_crc32, pWrite_buf_cur, out_buf_size);
+          if ((out_buf_ofs += out_buf_size) > file_stat.m_uncomp_size) {
+            status = TINFL_STATUS_FAILED;  // more output than the header says
+            break;
+          }
+        }
+      } while ((status == TINFL_STATUS_NEEDS_MORE_INPUT) ||
+               (status == TINFL_STATUS_HAS_MORE_OUTPUT));
+    }
+  }
+
+  if ((status == TINFL_STATUS_DONE) &&
+      (!(flags & MZ_ZIP_FLAG_COMPRESSED_DATA))) {
+    // Make sure the entire file was decompressed, and check its CRC.
+    if ((out_buf_ofs != file_stat.m_uncomp_size) ||
+        (file_crc32 != file_stat.m_crc32))
+      status = TINFL_STATUS_FAILED;
+  }
+
+  if (!pZip->m_pState->m_pMem) pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+  if (pWrite_buf) pZip->m_pFree(pZip->m_pAlloc_opaque, pWrite_buf);
+
+  return status == TINFL_STATUS_DONE;
+}
+
+// By-name front end: locate pFilename, then stream it through pCallback.
+mz_bool mz_zip_reader_extract_file_to_callback(mz_zip_archive *pZip,
+                                               const char *pFilename,
+                                               mz_file_write_func pCallback,
+                                               void *pOpaque, mz_uint flags) {
+  const int ix = mz_zip_reader_locate_file(pZip, pFilename, NULL, flags);
+  if (ix < 0) return MZ_FALSE;  // no such entry
+  return mz_zip_reader_extract_to_callback(pZip, ix, pCallback, pOpaque, flags);
+}
+
+#ifndef MINIZ_NO_STDIO
+static size_t mz_zip_file_write_callback(void *pOpaque, mz_uint64 ofs,
+                                         const void *pBuf, size_t n) {
+  (void)ofs;  // ignored: data goes to the stream's current position
+  return MZ_FWRITE(pBuf, 1, n, (MZ_FILE *)pOpaque);  // pOpaque is the MZ_FILE
+}
+
+// Extracts entry file_index to pDst_filename (opened "wb"). A failed close is
+// reported as failure; with time support, success also sets the file times.
+mz_bool mz_zip_reader_extract_to_file(mz_zip_archive *pZip, mz_uint file_index,
+                                      const char *pDst_filename,
+                                      mz_uint flags) {
+  mz_zip_archive_file_stat entry;
+  MZ_FILE *pOut;
+  mz_bool ok;
+  if (!mz_zip_reader_file_stat(pZip, file_index, &entry)) return MZ_FALSE;
+  if (NULL == (pOut = MZ_FOPEN(pDst_filename, "wb"))) return MZ_FALSE;
+  ok = mz_zip_reader_extract_to_callback(
+      pZip, file_index, mz_zip_file_write_callback, pOut, flags);
+  if (MZ_FCLOSE(pOut) == EOF) return MZ_FALSE;
+#ifndef MINIZ_NO_TIME
+  if (ok) mz_zip_set_file_times(pDst_filename, entry.m_time, entry.m_time);
+#endif
+  return ok;
+}
+#endif  // #ifndef MINIZ_NO_STDIO
+
+// Releases a reader's central-directory arrays, stdio FILE (if any) and state
+// block, then marks pZip invalid. MZ_FALSE if the guard below rejects pZip.
+mz_bool mz_zip_reader_end(mz_zip_archive *pZip) {
+  mz_zip_internal_state *pState;
+  if (!pZip || !pZip->m_pState || !pZip->m_pAlloc || !pZip->m_pFree ||
+      (pZip->m_zip_mode != MZ_ZIP_MODE_READING))
+    return MZ_FALSE;
+
+  // Detach the state from pZip before tearing it down.
+  pState = pZip->m_pState;
+  pZip->m_pState = NULL;
+  mz_zip_array_clear(pZip, &pState->m_central_dir);
+  mz_zip_array_clear(pZip, &pState->m_central_dir_offsets);
+  mz_zip_array_clear(pZip, &pState->m_sorted_central_dir_offsets);
+
+#ifndef MINIZ_NO_STDIO
+  if (pState->m_pFile) {
+    MZ_FCLOSE(pState->m_pFile);
+    pState->m_pFile = NULL;
+  }
+#endif  // #ifndef MINIZ_NO_STDIO
+  pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+  pZip->m_zip_mode = MZ_ZIP_MODE_INVALID;
+  return MZ_TRUE;
+}
+
+#ifndef MINIZ_NO_STDIO
+mz_bool mz_zip_reader_extract_file_to_file(mz_zip_archive *pZip,
+                                           const char *pArchive_filename,
+                                           const char *pDst_filename,
+                                           mz_uint flags) {
+  // Resolve the archive name to an index; a negative result is a failure.
+  int idx = mz_zip_reader_locate_file(pZip, pArchive_filename, NULL, flags);
+  return (idx < 0) ? MZ_FALSE : mz_zip_reader_extract_to_file(
+                                    pZip, idx, pDst_filename, flags);
+}
+#endif
+
+// ------------------- .ZIP archive writing
+
+#ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+
+static void mz_write_le16(mz_uint8 *p, mz_uint16 v) {
+  p[0] = (mz_uint8)(v & 0xFF);  // least-significant byte is stored first
+  p[1] = (mz_uint8)((v >> 8) & 0xFF);
+}
+static void mz_write_le32(mz_uint8 *p, mz_uint32 v) {
+  mz_uint i;  // byte index, least-significant byte first
+  for (i = 0; i < 4; i++) {
+    p[i] = (mz_uint8)(v >> (8 * i));
+  }
+}
+#define MZ_WRITE_LE16(p, v) mz_write_le16((mz_uint8 *)(p), (mz_uint16)(v))
+#define MZ_WRITE_LE32(p, v) mz_write_le32((mz_uint8 *)(p), (mz_uint32)(v))
+
+mz_bool mz_zip_writer_init(mz_zip_archive *pZip, mz_uint64 existing_size) {
+  if ((!pZip) || (pZip->m_pState) || (!pZip->m_pWrite) ||
+      (pZip->m_zip_mode != MZ_ZIP_MODE_INVALID))
+    return MZ_FALSE;
+  // Starts a write session whose m_archive_size begins at existing_size.
+  if (pZip->m_file_offset_alignment) {
+    // Ensure user specified file offset alignment is a power of 2.
+    if (pZip->m_file_offset_alignment & (pZip->m_file_offset_alignment - 1))
+      return MZ_FALSE;
+  }
+
+  if (!pZip->m_pAlloc) pZip->m_pAlloc = def_alloc_func;
+  if (!pZip->m_pFree) pZip->m_pFree = def_free_func;
+  if (!pZip->m_pRealloc) pZip->m_pRealloc = def_realloc_func;
+
+  pZip->m_archive_size = existing_size;
+  pZip->m_central_directory_file_ofs = 0;
+  pZip->m_total_files = 0;
+  // m_zip_mode stays INVALID until the allocation below has succeeded.
+  if (NULL == (pZip->m_pState = (mz_zip_internal_state *)pZip->m_pAlloc(
+                   pZip->m_pAlloc_opaque, 1, sizeof(mz_zip_internal_state))))
+    return MZ_FALSE;
+  memset(pZip->m_pState, 0, sizeof(mz_zip_internal_state));
+  MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_central_dir,
+                                sizeof(mz_uint8));
+  MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_central_dir_offsets,
+                                sizeof(mz_uint32));
+  MZ_ZIP_ARRAY_SET_ELEMENT_SIZE(&pZip->m_pState->m_sorted_central_dir_offsets,
+                                sizeof(mz_uint32));
+  pZip->m_zip_mode = MZ_ZIP_MODE_WRITING;
+  return MZ_TRUE;
+}
+
+static size_t mz_zip_heap_write_func(void *pOpaque, mz_uint64 file_ofs,
+                                     const void *pBuf, size_t n) {
+  mz_zip_archive *pZip = (mz_zip_archive *)pOpaque;  // set by the heap writer
+  mz_zip_internal_state *pState = pZip->m_pState;
+  mz_uint64 new_size = MZ_MAX(file_ofs + n, pState->m_mem_size);
+#ifdef _MSC_VER  // NOTE(review): "0," presumably silences an MSVC warning
+  if ((!n) ||
+      ((0, sizeof(size_t) == sizeof(mz_uint32)) && (new_size > 0x7FFFFFFF)))
+#else
+  if ((!n) ||
+      ((sizeof(size_t) == sizeof(mz_uint32)) && (new_size > 0x7FFFFFFF)))
+#endif
+    return 0;  // empty write, or block would pass 2 GiB with 32-bit size_t
+  if (new_size > pState->m_mem_capacity) {
+    void *pNew_block;
+    size_t new_capacity = MZ_MAX(64, pState->m_mem_capacity);  // at least 64
+    while (new_capacity < new_size) new_capacity *= 2;  // grow by doubling
+    if (NULL == (pNew_block = pZip->m_pRealloc(
+                     pZip->m_pAlloc_opaque, pState->m_pMem, 1, new_capacity)))
+      return 0;  // realloc callback failed
+    pState->m_pMem = pNew_block;
+    pState->m_mem_capacity = new_capacity;
+  }
+  memcpy((mz_uint8 *)pState->m_pMem + file_ofs, pBuf, n);
+  pState->m_mem_size = (size_t)new_size;
+  return n;  // all n bytes were stored
+}
+
+mz_bool mz_zip_writer_init_heap(mz_zip_archive *pZip,
+                                size_t size_to_reserve_at_beginning,
+                                size_t initial_allocation_size) {
+  pZip->m_pWrite = mz_zip_heap_write_func;
+  pZip->m_pIO_opaque = pZip;
+  if (!mz_zip_writer_init(pZip, size_to_reserve_at_beginning)) return MZ_FALSE;
+  initial_allocation_size =
+      MZ_MAX(initial_allocation_size, size_to_reserve_at_beginning);
+  if (!initial_allocation_size) return MZ_TRUE;  // nothing to pre-allocate
+  if (NULL == (pZip->m_pState->m_pMem = pZip->m_pAlloc(
+                   pZip->m_pAlloc_opaque, 1, initial_allocation_size))) {
+    mz_zip_writer_end(pZip);
+    return MZ_FALSE;
+  }
+  pZip->m_pState->m_mem_capacity = initial_allocation_size;
+  return MZ_TRUE;
+}
+
+#ifndef MINIZ_NO_STDIO
+static size_t mz_zip_file_write_func(void *pOpaque, mz_uint64 file_ofs,
+                                     const void *pBuf, size_t n) {
+  MZ_FILE *pOut = ((mz_zip_archive *)pOpaque)->m_pState->m_pFile;
+  const mz_int64 want_ofs = (mz_int64)file_ofs;
+  const mz_int64 have_ofs = MZ_FTELL64(pOut);
+  if (want_ofs < 0) return 0;  // offset does not fit a signed 64-bit seek
+  // Seek only when the stream is not already at the requested offset.
+  if ((have_ofs != want_ofs) && MZ_FSEEK64(pOut, want_ofs, SEEK_SET)) return 0;
+  return MZ_FWRITE(pBuf, 1, n, pOut);
+}
+
+mz_bool mz_zip_writer_init_file(mz_zip_archive *pZip, const char *pFilename,
+                                mz_uint64 size_to_reserve_at_beginning) {
+  MZ_FILE *pFile;
+  pZip->m_pWrite = mz_zip_file_write_func;
+  pZip->m_pIO_opaque = pZip;
+  if (!mz_zip_writer_init(pZip, size_to_reserve_at_beginning)) return MZ_FALSE;
+  if (NULL == (pFile = MZ_FOPEN(pFilename, "wb"))) {
+    mz_zip_writer_end(pZip);
+    return MZ_FALSE;
+  }
+  pZip->m_pState->m_pFile = pFile;
+  if (size_to_reserve_at_beginning) {
+    mz_uint64 write_ofs = 0;  // next offset to fill in the reserved region
+    char zeros[4096];         // source of the zero padding
+    MZ_CLEAR_OBJ(zeros);
+    while (size_to_reserve_at_beginning) {
+      size_t n = (size_t)MZ_MIN(sizeof(zeros), size_to_reserve_at_beginning);
+      if (pZip->m_pWrite(pZip->m_pIO_opaque, write_ofs, zeros, n) != n) {
+        mz_zip_writer_end(pZip);
+        return MZ_FALSE;
+      }
+      write_ofs += n;
+      size_to_reserve_at_beginning -= n;
+    }
+  }
+  return MZ_TRUE;
+}
+#endif  // #ifndef MINIZ_NO_STDIO
+
+mz_bool mz_zip_writer_init_from_reader(mz_zip_archive *pZip,
+                                       const char *pFilename) {
+  mz_zip_internal_state *pState;  // Switches a reader to writing mode.
+  if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_READING))
+    return MZ_FALSE;
+  // No sense in trying to write to an archive that's already at the supported
+  // max size (0xFFFF entries, or no room for one more pair of headers).
+  if ((pZip->m_total_files == 0xFFFF) ||
+      ((pZip->m_archive_size + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE +
+        MZ_ZIP_LOCAL_DIR_HEADER_SIZE) > 0xFFFFFFFF))
+    return MZ_FALSE;
+
+  pState = pZip->m_pState;
+
+  if (pState->m_pFile) {
+#ifdef MINIZ_NO_STDIO
+    pFilename;  // NOTE(review): presumably only marks the parameter as used
+    return MZ_FALSE;
+#else
+    // Archive is being read from stdio - try to reopen as writable.
+    if (pZip->m_pIO_opaque != pZip) return MZ_FALSE;
+    if (!pFilename) return MZ_FALSE;
+    pZip->m_pWrite = mz_zip_file_write_func;
+    if (NULL ==
+        (pState->m_pFile = MZ_FREOPEN(pFilename, "r+b", pState->m_pFile))) {
+      // The mz_zip_archive is now in a bogus state because pState->m_pFile is
+      // NULL, so just close it.
+      mz_zip_reader_end(pZip);
+      return MZ_FALSE;
+    }
+#endif  // #ifdef MINIZ_NO_STDIO
+  } else if (pState->m_pMem) {
+    // Archive lives in a memory block. Assume it's from the heap that we can
+    // resize using the realloc callback.
+    if (pZip->m_pIO_opaque != pZip) return MZ_FALSE;
+    pState->m_mem_capacity = pState->m_mem_size;
+    pZip->m_pWrite = mz_zip_heap_write_func;
+  }
+  // Archive is being read via a user provided read function - make sure the
+  // user has specified a write function too.
+  else if (!pZip->m_pWrite)
+    return MZ_FALSE;
+
+  // Start writing new files at the archive's current central directory
+  // location.
+  pZip->m_archive_size = pZip->m_central_directory_file_ofs;
+  pZip->m_zip_mode = MZ_ZIP_MODE_WRITING;
+  pZip->m_central_directory_file_ofs = 0;
+
+  return MZ_TRUE;
+}
+
+mz_bool mz_zip_writer_add_mem(mz_zip_archive *pZip, const char *pArchive_name,
+                              const void *pBuf, size_t buf_size,
+                              mz_uint level_and_flags) {  // Thin wrapper.
+  return mz_zip_writer_add_mem_ex(pZip, pArchive_name, pBuf, buf_size, NULL, 0,
+                                  level_and_flags, 0, 0);  // size, crc32 = 0
+}
+
+typedef struct {
+  mz_zip_archive *m_pZip;            // archive whose m_pWrite receives output
+  mz_uint64 m_cur_archive_file_ofs;  // archive offset of the next write
+  mz_uint64 m_comp_size;             // running total of bytes written
+} mz_zip_writer_add_state;
+
+static mz_bool mz_zip_writer_add_put_buf_callback(const void *pBuf, int len,
+                                                  void *pUser) {
+  mz_zip_writer_add_state *pAdd = (mz_zip_writer_add_state *)pUser;
+  // Append the compressor's output at the running archive offset.
+  const int written = (int)pAdd->m_pZip->m_pWrite(
+      pAdd->m_pZip->m_pIO_opaque, pAdd->m_cur_archive_file_ofs, pBuf, len);
+  if (written != len) return MZ_FALSE;
+  pAdd->m_cur_archive_file_ofs += len;
+  pAdd->m_comp_size += len;
+  return MZ_TRUE;
+}
+
+static mz_bool mz_zip_writer_create_local_dir_header(
+    mz_zip_archive *pZip, mz_uint8 *pDst, mz_uint16 filename_size,
+    mz_uint16 extra_size, mz_uint64 uncomp_size, mz_uint64 comp_size,
+    mz_uint32 uncomp_crc32, mz_uint16 method, mz_uint16 bit_flags,
+    mz_uint16 dos_time, mz_uint16 dos_date) {
+  (void)pZip;  // Unused. Fills pDst with a little-endian local header.
+  memset(pDst, 0, MZ_ZIP_LOCAL_DIR_HEADER_SIZE);  // zero all fields first
+  MZ_WRITE_LE32(pDst + MZ_ZIP_LDH_SIG_OFS, MZ_ZIP_LOCAL_DIR_HEADER_SIG);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_VERSION_NEEDED_OFS, method ? 20 : 0);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_BIT_FLAG_OFS, bit_flags);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_METHOD_OFS, method);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_FILE_TIME_OFS, dos_time);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_FILE_DATE_OFS, dos_date);
+  MZ_WRITE_LE32(pDst + MZ_ZIP_LDH_CRC32_OFS, uncomp_crc32);
+  MZ_WRITE_LE32(pDst + MZ_ZIP_LDH_COMPRESSED_SIZE_OFS, comp_size);
+  MZ_WRITE_LE32(pDst + MZ_ZIP_LDH_DECOMPRESSED_SIZE_OFS, uncomp_size);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_FILENAME_LEN_OFS, filename_size);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_LDH_EXTRA_LEN_OFS, extra_size);
+  return MZ_TRUE;  // Always succeeds; sizes are stored as 32-bit values.
+}
+
+static mz_bool mz_zip_writer_create_central_dir_header(
+    mz_zip_archive *pZip, mz_uint8 *pDst, mz_uint16 filename_size,
+    mz_uint16 extra_size, mz_uint16 comment_size, mz_uint64 uncomp_size,
+    mz_uint64 comp_size, mz_uint32 uncomp_crc32, mz_uint16 method,
+    mz_uint16 bit_flags, mz_uint16 dos_time, mz_uint16 dos_date,
+    mz_uint64 local_header_ofs, mz_uint32 ext_attributes) {
+  (void)pZip;  // Unused. Fills pDst with a little-endian central dir header.
+  memset(pDst, 0, MZ_ZIP_CENTRAL_DIR_HEADER_SIZE);  // zero all fields first
+  MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_SIG_OFS, MZ_ZIP_CENTRAL_DIR_HEADER_SIG);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_VERSION_NEEDED_OFS, method ? 20 : 0);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_BIT_FLAG_OFS, bit_flags);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_METHOD_OFS, method);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_FILE_TIME_OFS, dos_time);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_FILE_DATE_OFS, dos_date);
+  MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_CRC32_OFS, uncomp_crc32);
+  MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS, comp_size);
+  MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_DECOMPRESSED_SIZE_OFS, uncomp_size);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_FILENAME_LEN_OFS, filename_size);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_EXTRA_LEN_OFS, extra_size);
+  MZ_WRITE_LE16(pDst + MZ_ZIP_CDH_COMMENT_LEN_OFS, comment_size);
+  MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_EXTERNAL_ATTR_OFS, ext_attributes);
+  MZ_WRITE_LE32(pDst + MZ_ZIP_CDH_LOCAL_HEADER_OFS, local_header_ofs);
+  return MZ_TRUE;  // Always succeeds; sizes/offset are stored as 32 bits.
+}
+
+static mz_bool mz_zip_writer_add_to_central_dir(
+    mz_zip_archive *pZip, const char *pFilename, mz_uint16 filename_size,
+    const void *pExtra, mz_uint16 extra_size, const void *pComment,
+    mz_uint16 comment_size, mz_uint64 uncomp_size, mz_uint64 comp_size,
+    mz_uint32 uncomp_crc32, mz_uint16 method, mz_uint16 bit_flags,
+    mz_uint16 dos_time, mz_uint16 dos_date, mz_uint64 local_header_ofs,
+    mz_uint32 ext_attributes) {  // Appends one record to the central dir.
+  mz_zip_internal_state *pState = pZip->m_pState;
+  mz_uint32 central_dir_ofs = (mz_uint32)pState->m_central_dir.m_size;
+  size_t orig_central_dir_size = pState->m_central_dir.m_size;  // for rollback
+  mz_uint8 central_dir_header[MZ_ZIP_CENTRAL_DIR_HEADER_SIZE];
+
+  // No zip64 support yet: offset and directory size must fit in 32 bits.
+  if ((local_header_ofs > 0xFFFFFFFF) ||
+      (((mz_uint64)pState->m_central_dir.m_size +
+        MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + filename_size + extra_size +
+        comment_size) > 0xFFFFFFFF))
+    return MZ_FALSE;
+
+  if (!mz_zip_writer_create_central_dir_header(
+          pZip, central_dir_header, filename_size, extra_size, comment_size,
+          uncomp_size, comp_size, uncomp_crc32, method, bit_flags, dos_time,
+          dos_date, local_header_ofs, ext_attributes))
+    return MZ_FALSE;
+
+  if ((!mz_zip_array_push_back(pZip, &pState->m_central_dir, central_dir_header,
+                               MZ_ZIP_CENTRAL_DIR_HEADER_SIZE)) ||
+      (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pFilename,
+                               filename_size)) ||
+      (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pExtra,
+                               extra_size)) ||
+      (!mz_zip_array_push_back(pZip, &pState->m_central_dir, pComment,
+                               comment_size)) ||
+      (!mz_zip_array_push_back(pZip, &pState->m_central_dir_offsets,
+                               &central_dir_ofs, 1))) {
+    // Try to push the central directory array back into its original state.
+    mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size,
+                        MZ_FALSE);
+    return MZ_FALSE;
+  }
+
+  return MZ_TRUE;
+}
+
+static mz_bool mz_zip_writer_validate_archive_name(const char *pArchive_name) {
+  // Basic ZIP archive filename validity checks: a valid name does not begin
+  // with a forward slash and contains neither a backslash (DOS-style
+  // separator) nor a colon (drive letter).
+  const char *pCh;
+  if (pArchive_name[0] == '/') return MZ_FALSE;
+  for (pCh = pArchive_name; *pCh; ++pCh) {
+    if ((*pCh == ':') || (*pCh == '\\')) return MZ_FALSE;
+  }
+  return MZ_TRUE;
+}
+
+static mz_uint mz_zip_writer_compute_padding_needed_for_file_alignment(
+    mz_zip_archive *pZip) {
+  mz_uint32 n;  // m_archive_size modulo the (power of 2) alignment
+  if (!pZip->m_file_offset_alignment) return 0;  // alignment disabled
+  n = (mz_uint32)(pZip->m_archive_size & (pZip->m_file_offset_alignment - 1));
+  return (pZip->m_file_offset_alignment - n) &
+         (pZip->m_file_offset_alignment - 1);  // 0 when already aligned
+}
+
+static mz_bool mz_zip_writer_write_zeros(mz_zip_archive *pZip,
+                                         mz_uint64 cur_file_ofs, mz_uint32 n) {
+  char zeros[4096];
+  mz_uint32 len;
+  // Only the part of the buffer that will actually be written is cleared.
+  memset(zeros, 0, MZ_MIN(sizeof(zeros), n));
+  for (; n; n -= len, cur_file_ofs += len) {
+    len = MZ_MIN(sizeof(zeros), n);
+    if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_file_ofs, zeros, len) != len)
+      return MZ_FALSE;
+  }
+  return MZ_TRUE;
+}
+
+mz_bool mz_zip_writer_add_mem_ex(mz_zip_archive *pZip,
+                                 const char *pArchive_name, const void *pBuf,
+                                 size_t buf_size, const void *pComment,
+                                 mz_uint16 comment_size,
+                                 mz_uint level_and_flags, mz_uint64 uncomp_size,
+                                 mz_uint32 uncomp_crc32) {
+  mz_uint16 method = 0, dos_time = 0, dos_date = 0;
+  mz_uint level, ext_attributes = 0, num_alignment_padding_bytes;
+  mz_uint64 local_dir_header_ofs = 0, cur_archive_file_ofs = 0,
+            comp_size = 0;  // the offsets are set once pZip is validated
+  size_t archive_name_size;
+  mz_uint8 local_dir_header[MZ_ZIP_LOCAL_DIR_HEADER_SIZE];
+  tdefl_compressor *pComp = NULL;
+  mz_bool store_data_uncompressed;
+  mz_zip_internal_state *pState;
+  // Appends one entry (local header, name, data) plus its central dir record.
+  if ((int)level_and_flags < 0) level_and_flags = MZ_DEFAULT_LEVEL;
+  level = level_and_flags & 0xF;
+  store_data_uncompressed =
+      ((!level) || (level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA));
+
+  if ((!pZip) || (!pZip->m_pState) ||
+      (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || ((buf_size) && (!pBuf)) ||
+      (!pArchive_name) || ((comment_size) && (!pComment)) ||
+      (pZip->m_total_files == 0xFFFF) || (level > MZ_UBER_COMPRESSION))
+    return MZ_FALSE;
+  local_dir_header_ofs = cur_archive_file_ofs = pZip->m_archive_size;
+  pState = pZip->m_pState;
+
+  if ((!(level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) && (uncomp_size))
+    return MZ_FALSE;
+  // No zip64 support yet
+  if ((buf_size > 0xFFFFFFFF) || (uncomp_size > 0xFFFFFFFF)) return MZ_FALSE;
+  if (!mz_zip_writer_validate_archive_name(pArchive_name)) return MZ_FALSE;
+
+#ifndef MINIZ_NO_TIME
+  {
+    time_t cur_time;
+    time(&cur_time);
+    mz_zip_time_to_dos_time(cur_time, &dos_time, &dos_date);
+  }
+#endif  // #ifndef MINIZ_NO_TIME
+
+  archive_name_size = strlen(pArchive_name);
+  if (archive_name_size > 0xFFFF) return MZ_FALSE;
+
+  num_alignment_padding_bytes =
+      mz_zip_writer_compute_padding_needed_for_file_alignment(pZip);
+
+  // no zip64 support yet
+  if ((pZip->m_total_files == 0xFFFF) ||
+      ((pZip->m_archive_size + num_alignment_padding_bytes +
+        MZ_ZIP_LOCAL_DIR_HEADER_SIZE + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE +
+        comment_size + archive_name_size) > 0xFFFFFFFF))
+    return MZ_FALSE;
+
+  if ((archive_name_size) && (pArchive_name[archive_name_size - 1] == '/')) {
+    // Set DOS Subdirectory attribute bit.
+    ext_attributes |= 0x10;
+    // Subdirectories cannot contain data.
+    if ((buf_size) || (uncomp_size)) return MZ_FALSE;
+  }
+
+  // Try to do any allocations before writing to the archive, so if an
+  // allocation fails the file remains unmodified. (A good idea if we're doing
+  // an in-place modification.)
+  if ((!mz_zip_array_ensure_room(
+          pZip, &pState->m_central_dir,
+          MZ_ZIP_CENTRAL_DIR_HEADER_SIZE + archive_name_size + comment_size)) ||
+      (!mz_zip_array_ensure_room(pZip, &pState->m_central_dir_offsets, 1)))
+    return MZ_FALSE;
+
+  if ((!store_data_uncompressed) && (buf_size)) {
+    if (NULL == (pComp = (tdefl_compressor *)pZip->m_pAlloc(
+                     pZip->m_pAlloc_opaque, 1, sizeof(tdefl_compressor))))
+      return MZ_FALSE;
+  }
+
+  if (!mz_zip_writer_write_zeros(
+          pZip, cur_archive_file_ofs,
+          num_alignment_padding_bytes + sizeof(local_dir_header))) {
+    pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+    return MZ_FALSE;
+  }
+  local_dir_header_ofs += num_alignment_padding_bytes;
+  if (pZip->m_file_offset_alignment) {
+    MZ_ASSERT((local_dir_header_ofs & (pZip->m_file_offset_alignment - 1)) ==
+              0);
+  }
+  cur_archive_file_ofs +=
+      num_alignment_padding_bytes + sizeof(local_dir_header);
+
+  MZ_CLEAR_OBJ(local_dir_header);
+  if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pArchive_name,
+                     archive_name_size) != archive_name_size) {
+    pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+    return MZ_FALSE;
+  }
+  cur_archive_file_ofs += archive_name_size;
+
+  if (!(level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA)) {
+    uncomp_crc32 =
+        (mz_uint32)mz_crc32(MZ_CRC32_INIT, (const mz_uint8 *)pBuf, buf_size);
+    uncomp_size = buf_size;
+    if (uncomp_size <= 3) {
+      level = 0;
+      store_data_uncompressed = MZ_TRUE;
+    }
+  }
+
+  if (store_data_uncompressed) {
+    if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pBuf,
+                       buf_size) != buf_size) {
+      pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+      return MZ_FALSE;
+    }
+
+    cur_archive_file_ofs += buf_size;
+    comp_size = buf_size;
+
+    if (level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA) method = MZ_DEFLATED;
+  } else if (buf_size) {
+    mz_zip_writer_add_state state;
+
+    state.m_pZip = pZip;
+    state.m_cur_archive_file_ofs = cur_archive_file_ofs;
+    state.m_comp_size = 0;
+
+    if ((tdefl_init(pComp, mz_zip_writer_add_put_buf_callback, &state,
+                    tdefl_create_comp_flags_from_zip_params(
+                        level, -15, MZ_DEFAULT_STRATEGY)) !=
+         TDEFL_STATUS_OKAY) ||
+        (tdefl_compress_buffer(pComp, pBuf, buf_size, TDEFL_FINISH) !=
+         TDEFL_STATUS_DONE)) {
+      pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+      return MZ_FALSE;
+    }
+
+    comp_size = state.m_comp_size;
+    cur_archive_file_ofs = state.m_cur_archive_file_ofs;
+
+    method = MZ_DEFLATED;
+  }
+
+  pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+  pComp = NULL;
+
+  // no zip64 support yet
+  if ((comp_size > 0xFFFFFFFF) || (cur_archive_file_ofs > 0xFFFFFFFF))
+    return MZ_FALSE;
+
+  if (!mz_zip_writer_create_local_dir_header(
+          pZip, local_dir_header, (mz_uint16)archive_name_size, 0, uncomp_size,
+          comp_size, uncomp_crc32, method, 0, dos_time, dos_date))
+    return MZ_FALSE;
+
+  if (pZip->m_pWrite(pZip->m_pIO_opaque, local_dir_header_ofs, local_dir_header,
+                     sizeof(local_dir_header)) != sizeof(local_dir_header))
+    return MZ_FALSE;
+
+  if (!mz_zip_writer_add_to_central_dir(
+          pZip, pArchive_name, (mz_uint16)archive_name_size, NULL, 0, pComment,
+          comment_size, uncomp_size, comp_size, uncomp_crc32, method, 0,
+          dos_time, dos_date, local_dir_header_ofs, ext_attributes))
+    return MZ_FALSE;
+
+  pZip->m_total_files++;
+  pZip->m_archive_size = cur_archive_file_ofs;
+
+  return MZ_TRUE;
+}
+
+#ifndef MINIZ_NO_STDIO
+mz_bool mz_zip_writer_add_file(mz_zip_archive *pZip, const char *pArchive_name,
+                               const char *pSrc_filename, const void *pComment,
+                               mz_uint16 comment_size,
+                               mz_uint level_and_flags) {
+  mz_uint uncomp_crc32 = MZ_CRC32_INIT, level, num_alignment_padding_bytes;
+  mz_uint16 method = 0, dos_time = 0, dos_date = 0, ext_attributes = 0;
+  mz_uint64 local_dir_header_ofs = 0,  // both offsets are set once pZip
+            cur_archive_file_ofs = 0,  // has been validated below
+            uncomp_size = 0, comp_size = 0;
+  size_t archive_name_size;
+  mz_uint8 local_dir_header[MZ_ZIP_LOCAL_DIR_HEADER_SIZE];
+  MZ_FILE *pSrc_file = NULL;
+  // Streams the file pSrc_filename into the archive as entry pArchive_name.
+  if ((int)level_and_flags < 0) level_and_flags = MZ_DEFAULT_LEVEL;
+  level = level_and_flags & 0xF;
+
+  if ((!pZip) || (!pZip->m_pState) ||
+      (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) || (!pArchive_name) ||
+      ((comment_size) && (!pComment)) || (level > MZ_UBER_COMPRESSION))
+    return MZ_FALSE;
+  if (level_and_flags & MZ_ZIP_FLAG_COMPRESSED_DATA) return MZ_FALSE;
+  if (!mz_zip_writer_validate_archive_name(pArchive_name)) return MZ_FALSE;
+  local_dir_header_ofs = cur_archive_file_ofs = pZip->m_archive_size;
+  archive_name_size = strlen(pArchive_name);
+  if (archive_name_size > 0xFFFF) return MZ_FALSE;
+
+  num_alignment_padding_bytes =
+      mz_zip_writer_compute_padding_needed_for_file_alignment(pZip);
+
+  // no zip64 support yet
+  if ((pZip->m_total_files == 0xFFFF) ||
+      ((pZip->m_archive_size + num_alignment_padding_bytes +
+        MZ_ZIP_LOCAL_DIR_HEADER_SIZE + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE +
+        comment_size + archive_name_size) > 0xFFFFFFFF))
+    return MZ_FALSE;
+
+  if (!mz_zip_get_file_modified_time(pSrc_filename, &dos_time, &dos_date))
+    return MZ_FALSE;
+
+  pSrc_file = MZ_FOPEN(pSrc_filename, "rb");
+  if (!pSrc_file) return MZ_FALSE;
+  MZ_FSEEK64(pSrc_file, 0, SEEK_END);
+  uncomp_size = MZ_FTELL64(pSrc_file);
+  MZ_FSEEK64(pSrc_file, 0, SEEK_SET);
+
+  if (uncomp_size > 0xFFFFFFFF) {
+    // No zip64 support yet
+    MZ_FCLOSE(pSrc_file);
+    return MZ_FALSE;
+  }
+  if (uncomp_size <= 3) level = 0;
+
+  if (!mz_zip_writer_write_zeros(
+          pZip, cur_archive_file_ofs,
+          num_alignment_padding_bytes + sizeof(local_dir_header))) {
+    MZ_FCLOSE(pSrc_file);
+    return MZ_FALSE;
+  }
+  local_dir_header_ofs += num_alignment_padding_bytes;
+  if (pZip->m_file_offset_alignment) {
+    MZ_ASSERT((local_dir_header_ofs & (pZip->m_file_offset_alignment - 1)) ==
+              0);
+  }
+  cur_archive_file_ofs +=
+      num_alignment_padding_bytes + sizeof(local_dir_header);
+
+  MZ_CLEAR_OBJ(local_dir_header);
+  if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pArchive_name,
+                     archive_name_size) != archive_name_size) {
+    MZ_FCLOSE(pSrc_file);
+    return MZ_FALSE;
+  }
+  cur_archive_file_ofs += archive_name_size;
+
+  if (uncomp_size) {
+    mz_uint64 uncomp_remaining = uncomp_size;
+    void *pRead_buf =
+        pZip->m_pAlloc(pZip->m_pAlloc_opaque, 1, MZ_ZIP_MAX_IO_BUF_SIZE);
+    if (!pRead_buf) {
+      MZ_FCLOSE(pSrc_file);
+      return MZ_FALSE;
+    }
+
+    if (!level) {
+      while (uncomp_remaining) {
+        mz_uint n =
+            (mz_uint)MZ_MIN((mz_uint)MZ_ZIP_MAX_IO_BUF_SIZE, uncomp_remaining);
+        if ((MZ_FREAD(pRead_buf, 1, n, pSrc_file) != n) ||
+            (pZip->m_pWrite(pZip->m_pIO_opaque, cur_archive_file_ofs, pRead_buf,
+                            n) != n)) {
+          pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+          MZ_FCLOSE(pSrc_file);
+          return MZ_FALSE;
+        }
+        uncomp_crc32 =
+            (mz_uint32)mz_crc32(uncomp_crc32, (const mz_uint8 *)pRead_buf, n);
+        uncomp_remaining -= n;
+        cur_archive_file_ofs += n;
+      }
+      comp_size = uncomp_size;
+    } else {
+      mz_bool result = MZ_FALSE;
+      mz_zip_writer_add_state state;
+      tdefl_compressor *pComp = (tdefl_compressor *)pZip->m_pAlloc(
+          pZip->m_pAlloc_opaque, 1, sizeof(tdefl_compressor));
+      if (!pComp) {
+        pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+        MZ_FCLOSE(pSrc_file);
+        return MZ_FALSE;
+      }
+
+      state.m_pZip = pZip;
+      state.m_cur_archive_file_ofs = cur_archive_file_ofs;
+      state.m_comp_size = 0;
+
+      if (tdefl_init(pComp, mz_zip_writer_add_put_buf_callback, &state,
+                     tdefl_create_comp_flags_from_zip_params(
+                         level, -15, MZ_DEFAULT_STRATEGY)) !=
+          TDEFL_STATUS_OKAY) {
+        pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+        pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+        MZ_FCLOSE(pSrc_file);
+        return MZ_FALSE;
+      }
+
+      for (;;) {
+        size_t in_buf_size = (mz_uint32)MZ_MIN(uncomp_remaining,
+                                               (mz_uint)MZ_ZIP_MAX_IO_BUF_SIZE);
+        tdefl_status status;
+
+        if (MZ_FREAD(pRead_buf, 1, in_buf_size, pSrc_file) != in_buf_size)
+          break;
+
+        uncomp_crc32 = (mz_uint32)mz_crc32(
+            uncomp_crc32, (const mz_uint8 *)pRead_buf, in_buf_size);
+        uncomp_remaining -= in_buf_size;
+
+        status = tdefl_compress_buffer(
+            pComp, pRead_buf, in_buf_size,
+            uncomp_remaining ? TDEFL_NO_FLUSH : TDEFL_FINISH);
+        if (status == TDEFL_STATUS_DONE) {
+          result = MZ_TRUE;
+          break;
+        } else if (status != TDEFL_STATUS_OKAY)
+          break;
+      }
+
+      pZip->m_pFree(pZip->m_pAlloc_opaque, pComp);
+
+      if (!result) {
+        pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+        MZ_FCLOSE(pSrc_file);
+        return MZ_FALSE;
+      }
+
+      comp_size = state.m_comp_size;
+      cur_archive_file_ofs = state.m_cur_archive_file_ofs;
+
+      method = MZ_DEFLATED;
+    }
+
+    pZip->m_pFree(pZip->m_pAlloc_opaque, pRead_buf);
+  }
+
+  MZ_FCLOSE(pSrc_file);
+  pSrc_file = NULL;
+
+  // no zip64 support yet
+  if ((comp_size > 0xFFFFFFFF) || (cur_archive_file_ofs > 0xFFFFFFFF))
+    return MZ_FALSE;
+
+  if (!mz_zip_writer_create_local_dir_header(
+          pZip, local_dir_header, (mz_uint16)archive_name_size, 0, uncomp_size,
+          comp_size, uncomp_crc32, method, 0, dos_time, dos_date))
+    return MZ_FALSE;
+
+  if (pZip->m_pWrite(pZip->m_pIO_opaque, local_dir_header_ofs, local_dir_header,
+                     sizeof(local_dir_header)) != sizeof(local_dir_header))
+    return MZ_FALSE;
+
+  if (!mz_zip_writer_add_to_central_dir(
+          pZip, pArchive_name, (mz_uint16)archive_name_size, NULL, 0, pComment,
+          comment_size, uncomp_size, comp_size, uncomp_crc32, method, 0,
+          dos_time, dos_date, local_dir_header_ofs, ext_attributes))
+    return MZ_FALSE;
+
+  pZip->m_total_files++;
+  pZip->m_archive_size = cur_archive_file_ofs;
+
+  return MZ_TRUE;
+}
+#endif  // #ifndef MINIZ_NO_STDIO
+
+mz_bool mz_zip_writer_add_from_zip_reader(mz_zip_archive *pZip,
+                                          mz_zip_archive *pSource_zip,
+                                          mz_uint file_index) {
+  mz_uint n, bit_flags, num_alignment_padding_bytes;
+  mz_uint64 comp_bytes_remaining, local_dir_header_ofs;
+  mz_uint64 cur_src_file_ofs, cur_dst_file_ofs;
+  mz_uint32
+      local_header_u32[(MZ_ZIP_LOCAL_DIR_HEADER_SIZE + sizeof(mz_uint32) - 1) /
+                       sizeof(mz_uint32)];
+  mz_uint8 *pLocal_header = (mz_uint8 *)local_header_u32;
+  mz_uint8 central_header[MZ_ZIP_CENTRAL_DIR_HEADER_SIZE];
+  size_t orig_central_dir_size;
+  mz_zip_internal_state *pState;
+  void *pBuf;
+  const mz_uint8 *pSrc_central_header;
+  // Copies entry file_index from pSource_zip verbatim (no recompression).
+  if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING))
+    return MZ_FALSE;
+  if (NULL ==
+      (pSrc_central_header = mz_zip_reader_get_cdh(pSource_zip, file_index)))
+    return MZ_FALSE;
+  pState = pZip->m_pState;
+
+  num_alignment_padding_bytes =
+      mz_zip_writer_compute_padding_needed_for_file_alignment(pZip);
+
+  // no zip64 support yet
+  if ((pZip->m_total_files == 0xFFFF) ||
+      ((pZip->m_archive_size + num_alignment_padding_bytes +
+        MZ_ZIP_LOCAL_DIR_HEADER_SIZE + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE) >
+       0xFFFFFFFF))
+    return MZ_FALSE;
+
+  cur_src_file_ofs =
+      MZ_READ_LE32(pSrc_central_header + MZ_ZIP_CDH_LOCAL_HEADER_OFS);
+  cur_dst_file_ofs = pZip->m_archive_size;
+
+  if (pSource_zip->m_pRead(pSource_zip->m_pIO_opaque, cur_src_file_ofs,
+                           pLocal_header, MZ_ZIP_LOCAL_DIR_HEADER_SIZE) !=
+      MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+    return MZ_FALSE;
+  if (MZ_READ_LE32(pLocal_header) != MZ_ZIP_LOCAL_DIR_HEADER_SIG)
+    return MZ_FALSE;
+  cur_src_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE;
+
+  if (!mz_zip_writer_write_zeros(pZip, cur_dst_file_ofs,
+                                 num_alignment_padding_bytes))
+    return MZ_FALSE;
+  cur_dst_file_ofs += num_alignment_padding_bytes;
+  local_dir_header_ofs = cur_dst_file_ofs;
+  if (pZip->m_file_offset_alignment) {
+    MZ_ASSERT((local_dir_header_ofs & (pZip->m_file_offset_alignment - 1)) ==
+              0);
+  }
+
+  if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_dst_file_ofs, pLocal_header,
+                     MZ_ZIP_LOCAL_DIR_HEADER_SIZE) !=
+      MZ_ZIP_LOCAL_DIR_HEADER_SIZE)
+    return MZ_FALSE;
+  cur_dst_file_ofs += MZ_ZIP_LOCAL_DIR_HEADER_SIZE;
+
+  n = MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_FILENAME_LEN_OFS) +
+      MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_EXTRA_LEN_OFS);
+  comp_bytes_remaining =
+      n + MZ_READ_LE32(pSrc_central_header + MZ_ZIP_CDH_COMPRESSED_SIZE_OFS);
+
+  if (NULL == (pBuf = pZip->m_pAlloc(
+                   pZip->m_pAlloc_opaque, 1,
+                   (size_t)MZ_MAX(sizeof(mz_uint32) * 4,
+                                  MZ_MIN((mz_uint)MZ_ZIP_MAX_IO_BUF_SIZE,
+                                         comp_bytes_remaining)))))
+    return MZ_FALSE;
+
+  while (comp_bytes_remaining) {
+    n = (mz_uint)MZ_MIN((mz_uint)MZ_ZIP_MAX_IO_BUF_SIZE, comp_bytes_remaining);
+    if (pSource_zip->m_pRead(pSource_zip->m_pIO_opaque, cur_src_file_ofs, pBuf,
+                             n) != n) {
+      pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+      return MZ_FALSE;
+    }
+    cur_src_file_ofs += n;
+
+    if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_dst_file_ofs, pBuf, n) != n) {
+      pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+      return MZ_FALSE;
+    }
+    cur_dst_file_ofs += n;
+
+    comp_bytes_remaining -= n;
+  }
+
+  bit_flags = MZ_READ_LE16(pLocal_header + MZ_ZIP_LDH_BIT_FLAG_OFS);
+  if (bit_flags & 8) {
+    // Copy data descriptor
+    if (pSource_zip->m_pRead(pSource_zip->m_pIO_opaque, cur_src_file_ofs, pBuf,
+                             sizeof(mz_uint32) * 4) != sizeof(mz_uint32) * 4) {
+      pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+      return MZ_FALSE;
+    }
+
+    n = sizeof(mz_uint32) * ((MZ_READ_LE32(pBuf) == 0x08074b50) ? 4 : 3);
+    if (pZip->m_pWrite(pZip->m_pIO_opaque, cur_dst_file_ofs, pBuf, n) != n) {
+      pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+      return MZ_FALSE;
+    }
+
+    cur_src_file_ofs += n;
+    cur_dst_file_ofs += n;
+  }
+  pZip->m_pFree(pZip->m_pAlloc_opaque, pBuf);
+
+  // no zip64 support yet
+  if (cur_dst_file_ofs > 0xFFFFFFFF) return MZ_FALSE;
+
+  orig_central_dir_size = pState->m_central_dir.m_size;
+
+  memcpy(central_header, pSrc_central_header, MZ_ZIP_CENTRAL_DIR_HEADER_SIZE);
+  MZ_WRITE_LE32(central_header + MZ_ZIP_CDH_LOCAL_HEADER_OFS,
+                local_dir_header_ofs);
+  if (!mz_zip_array_push_back(pZip, &pState->m_central_dir, central_header,
+                              MZ_ZIP_CENTRAL_DIR_HEADER_SIZE))
+    return MZ_FALSE;
+
+  n = MZ_READ_LE16(pSrc_central_header + MZ_ZIP_CDH_FILENAME_LEN_OFS) +
+      MZ_READ_LE16(pSrc_central_header + MZ_ZIP_CDH_EXTRA_LEN_OFS) +
+      MZ_READ_LE16(pSrc_central_header + MZ_ZIP_CDH_COMMENT_LEN_OFS);
+  if (!mz_zip_array_push_back(
+          pZip, &pState->m_central_dir,
+          pSrc_central_header + MZ_ZIP_CENTRAL_DIR_HEADER_SIZE, n)) {
+    mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size,
+                        MZ_FALSE);
+    return MZ_FALSE;
+  }
+  // An oversized directory is rolled back like any other failure here.
+  n = (mz_uint32)orig_central_dir_size;
+  if ((pState->m_central_dir.m_size > 0xFFFFFFFF) ||
+      (!mz_zip_array_push_back(pZip, &pState->m_central_dir_offsets, &n, 1))) {
+    mz_zip_array_resize(pZip, &pState->m_central_dir, orig_central_dir_size,
+                        MZ_FALSE);
+    return MZ_FALSE;
+  }
+
+  pZip->m_total_files++;
+  pZip->m_archive_size = cur_dst_file_ofs;
+
+  return MZ_TRUE;
+}
+
+// Completes an archive that is open for writing: appends the central
+// directory (when at least one file was added) and the end-of-central-directory
+// record, flushes the underlying FILE if there is one, and switches the
+// archive to MZ_ZIP_MODE_WRITING_HAS_BEEN_FINALIZED.
+// Returns MZ_FALSE for a NULL/uninitialized archive, a wrong mode, a result
+// that would exceed the non-zip64 limits, or a failed write or flush.
+mz_bool mz_zip_writer_finalize_archive(mz_zip_archive *pZip) {
+  mz_zip_internal_state *pState;
+  mz_uint64 central_dir_ofs, central_dir_size;
+  mz_uint8 hdr[MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE];
+
+  if ((!pZip) || (!pZip->m_pState) || (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING))
+    return MZ_FALSE;
+
+  pState = pZip->m_pState;
+
+  // no zip64 support yet
+  // (at most 0xFFFF entries; file data + central directory + end record must
+  // fit in 32 bits)
+  if ((pZip->m_total_files > 0xFFFF) ||
+      ((pZip->m_archive_size + pState->m_central_dir.m_size +
+        MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIZE) > 0xFFFFFFFF))
+    return MZ_FALSE;
+
+  // Offset and size stay 0 when the archive has no entries.
+  central_dir_ofs = 0;
+  central_dir_size = 0;
+  if (pZip->m_total_files) {
+    // Write central directory
+    // It is placed at the current end of the archive.
+    central_dir_ofs = pZip->m_archive_size;
+    central_dir_size = pState->m_central_dir.m_size;
+    pZip->m_central_directory_file_ofs = central_dir_ofs;
+    if (pZip->m_pWrite(pZip->m_pIO_opaque, central_dir_ofs,
+                       pState->m_central_dir.m_p,
+                       (size_t)central_dir_size) != central_dir_size)
+      return MZ_FALSE;
+    pZip->m_archive_size += central_dir_size;
+  }
+
+  // Write end of central directory record
+  // Fields that are not set explicitly below remain zero.
+  MZ_CLEAR_OBJ(hdr);
+  MZ_WRITE_LE32(hdr + MZ_ZIP_ECDH_SIG_OFS,
+                MZ_ZIP_END_OF_CENTRAL_DIR_HEADER_SIG);
+  MZ_WRITE_LE16(hdr + MZ_ZIP_ECDH_CDIR_NUM_ENTRIES_ON_DISK_OFS,
+                pZip->m_total_files);
+  MZ_WRITE_LE16(hdr + MZ_ZIP_ECDH_CDIR_TOTAL_ENTRIES_OFS, pZip->m_total_files);
+  MZ_WRITE_LE32(hdr + MZ_ZIP_ECDH_CDIR_SIZE_OFS, central_dir_size);
+  MZ_WRITE_LE32(hdr + MZ_ZIP_ECDH_CDIR_OFS_OFS, central_dir_ofs);
+
+  if (pZip->m_pWrite(pZip->m_pIO_opaque, pZip->m_archive_size, hdr,
+                     sizeof(hdr)) != sizeof(hdr))
+    return MZ_FALSE;
+#ifndef MINIZ_NO_STDIO
+  if ((pState->m_pFile) && (MZ_FFLUSH(pState->m_pFile) == EOF)) return MZ_FALSE;
+#endif  // #ifndef MINIZ_NO_STDIO
+
+  // The size is only bumped after the record was written (and flushed).
+  pZip->m_archive_size += sizeof(hdr);
+
+  pZip->m_zip_mode = MZ_ZIP_MODE_WRITING_HAS_BEEN_FINALIZED;
+  return MZ_TRUE;
+}
+
+// Finalizes an archive that writes through mz_zip_heap_write_func and hands
+// its memory block to the caller via *pBuf / *pSize. Afterwards the internal
+// state no longer references the block (pointer cleared, size and capacity
+// zeroed). Returns MZ_FALSE on bad arguments, for an archive that is not
+// heap-backed, or when finalization fails.
+mz_bool mz_zip_writer_finalize_heap_archive(mz_zip_archive *pZip, void **pBuf,
+                                            size_t *pSize) {
+  mz_zip_internal_state *pHeap_state;
+
+  // Evaluated left to right: arguments, heap-backed writer, finalization.
+  if ((!pZip) || (!pZip->m_pState) || (!pBuf) || (!pSize) ||
+      (pZip->m_pWrite != mz_zip_heap_write_func) ||
+      (!mz_zip_writer_finalize_archive(pZip)))
+    return MZ_FALSE;
+
+  pHeap_state = pZip->m_pState;
+  *pBuf = pHeap_state->m_pMem;
+  *pSize = pHeap_state->m_mem_size;
+
+  pHeap_state->m_pMem = NULL;
+  pHeap_state->m_mem_capacity = 0;
+  pHeap_state->m_mem_size = 0;
+  return MZ_TRUE;
+}
+
+// Releases everything owned by an archive that was opened for writing
+// (finalized or not) and resets it to MZ_ZIP_MODE_INVALID.
+// Returns MZ_FALSE when the arguments or the archive mode are invalid, or
+// when closing the output FILE fails. In the latter case all resources have
+// still been released.
+mz_bool mz_zip_writer_end(mz_zip_archive *pZip) {
+  mz_zip_internal_state *pState;
+  mz_bool status = MZ_TRUE;
+  if ((!pZip) || (!pZip->m_pState) || (!pZip->m_pAlloc) || (!pZip->m_pFree) ||
+      ((pZip->m_zip_mode != MZ_ZIP_MODE_WRITING) &&
+       (pZip->m_zip_mode != MZ_ZIP_MODE_WRITING_HAS_BEEN_FINALIZED)))
+    return MZ_FALSE;
+
+  // Detach the state first so the archive never points at freed memory.
+  pState = pZip->m_pState;
+  pZip->m_pState = NULL;
+  mz_zip_array_clear(pZip, &pState->m_central_dir);
+  mz_zip_array_clear(pZip, &pState->m_central_dir_offsets);
+  mz_zip_array_clear(pZip, &pState->m_sorted_central_dir_offsets);
+
+#ifndef MINIZ_NO_STDIO
+  if (pState->m_pFile) {
+    // Closing flushes buffered output, so a failure here means the archive on
+    // disk may be incomplete. Report it instead of silently succeeding.
+    if (MZ_FCLOSE(pState->m_pFile) == EOF) status = MZ_FALSE;
+    pState->m_pFile = NULL;
+  }
+#endif  // #ifndef MINIZ_NO_STDIO
+
+  // The heap buffer is only still owned here if it was not handed out.
+  if ((pZip->m_pWrite == mz_zip_heap_write_func) && (pState->m_pMem)) {
+    pZip->m_pFree(pZip->m_pAlloc_opaque, pState->m_pMem);
+    pState->m_pMem = NULL;
+  }
+
+  pZip->m_pFree(pZip->m_pAlloc_opaque, pState);
+  pZip->m_zip_mode = MZ_ZIP_MODE_INVALID;
+  return status;
+}
+
+#ifndef MINIZ_NO_STDIO
+// Adds one in-memory buffer as a file named pArchive_name to the zip archive
+// at pZip_filename, creating the archive when MZ_FILE_STAT reports that it
+// cannot be stat'ed and appending to it otherwise.
+// A negative level_and_flags selects MZ_DEFAULT_LEVEL. Returns MZ_FALSE on
+// invalid arguments or when any step (open, add, finalize, end) fails; an
+// archive that was newly created by this call is deleted again on failure.
+mz_bool mz_zip_add_mem_to_archive_file_in_place(
+    const char *pZip_filename, const char *pArchive_name, const void *pBuf,
+    size_t buf_size, const void *pComment, mz_uint16 comment_size,
+    mz_uint level_and_flags) {
+  mz_bool status, created_new_archive = MZ_FALSE;
+  mz_zip_archive zip_archive;
+  struct MZ_FILE_STAT_STRUCT file_stat;
+  MZ_CLEAR_OBJ(zip_archive);
+  if ((int)level_and_flags < 0) level_and_flags = MZ_DEFAULT_LEVEL;
+  // A NULL buffer/comment is only accepted together with a zero size, and the
+  // low four bits (compression level) must not exceed MZ_UBER_COMPRESSION.
+  if ((!pZip_filename) || (!pArchive_name) || ((buf_size) && (!pBuf)) ||
+      ((comment_size) && (!pComment)) ||
+      ((level_and_flags & 0xF) > MZ_UBER_COMPRESSION))
+    return MZ_FALSE;
+  if (!mz_zip_writer_validate_archive_name(pArchive_name)) return MZ_FALSE;
+  if (MZ_FILE_STAT(pZip_filename, &file_stat) != 0) {
+    // Create a new archive.
+    if (!mz_zip_writer_init_file(&zip_archive, pZip_filename, 0))
+      return MZ_FALSE;
+    created_new_archive = MZ_TRUE;
+  } else {
+    // Append to an existing archive.
+    // It is opened as a reader first and then converted into a writer.
+    if (!mz_zip_reader_init_file(
+            &zip_archive, pZip_filename,
+            level_and_flags | MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY))
+      return MZ_FALSE;
+    if (!mz_zip_writer_init_from_reader(&zip_archive, pZip_filename)) {
+      mz_zip_reader_end(&zip_archive);
+      return MZ_FALSE;
+    }
+  }
+  status =
+      mz_zip_writer_add_mem_ex(&zip_archive, pArchive_name, pBuf, buf_size,
+                               pComment, comment_size, level_and_flags, 0, 0);
+  // Always finalize, even if adding failed for some reason, so we have a valid
+  // central directory. (This may not always succeed, but we can try.)
+  if (!mz_zip_writer_finalize_archive(&zip_archive)) status = MZ_FALSE;
+  if (!mz_zip_writer_end(&zip_archive)) status = MZ_FALSE;
+  if ((!status) && (created_new_archive)) {
+    // It's a new archive and something went wrong, so just delete it.
+    int ignoredStatus = MZ_DELETE_FILE(pZip_filename);
+    (void)ignoredStatus;
+  }
+  return status;
+}
+
+// Opens the zip file pZip_filename, looks up the entry pArchive_name and
+// returns whatever mz_zip_reader_extract_to_heap() yields for it.
+// *pSize (when pSize is not NULL) is reset to 0 before anything else happens.
+// Returns NULL when a name argument is NULL, when the archive cannot be
+// opened, or when the entry is not found.
+void *mz_zip_extract_archive_file_to_heap(const char *pZip_filename,
+                                          const char *pArchive_name,
+                                          size_t *pSize, mz_uint flags) {
+  mz_zip_archive archive;
+  void *pData = NULL;
+  int entry_index;
+
+  if (pSize) *pSize = 0;
+  if ((!pZip_filename) || (!pArchive_name)) return NULL;
+
+  MZ_CLEAR_OBJ(archive);
+  if (!mz_zip_reader_init_file(
+          &archive, pZip_filename,
+          flags | MZ_ZIP_FLAG_DO_NOT_SORT_CENTRAL_DIRECTORY))
+    return NULL;
+
+  entry_index = mz_zip_reader_locate_file(&archive, pArchive_name, NULL, flags);
+  if (entry_index >= 0) {
+    pData = mz_zip_reader_extract_to_heap(&archive, entry_index, pSize, flags);
+  }
+
+  // The reader is closed whether or not the entry was found.
+  mz_zip_reader_end(&archive);
+  return pData;
+}
+
+#endif  // #ifndef MINIZ_NO_STDIO
+
+#endif  // #ifndef MINIZ_NO_ARCHIVE_WRITING_APIS
+
+#endif  // #ifndef MINIZ_NO_ARCHIVE_APIS
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // MINIZ_HEADER_FILE_ONLY
+
+/*
+  This is free and unencumbered software released into the public domain.
+
+  Anyone is free to copy, modify, publish, use, compile, sell, or
+  distribute this software, either in source code form or as a compiled
+  binary, for any purpose, commercial or non-commercial, and by any
+  means.
+
+  In jurisdictions that recognize copyright laws, the author or authors
+  of this software dedicate any and all copyright interest in the
+  software to the public domain. We make this dedication for the benefit
+  of the public at large and to the detriment of our heirs and
+  successors. We intend this dedication to be an overt act of
+  relinquishment in perpetuity of all present and future rights to this
+  software under copyright law.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+  IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+  OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+  ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+  OTHER DEALINGS IN THE SOFTWARE.
+
+  For more information, please refer to <http://unlicense.org/>
+*/
+
+// ---------------------- end of miniz ----------------------------------------
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+}
+#else
+
+// Reuse MINIZ_LITTLE_ENDIAN macro
+
+#if defined(__sparcv9)
+// Big endian
+#else
+#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || MINIZ_X86_OR_X64_CPU
+// Set MINIZ_LITTLE_ENDIAN to 1 if the processor is little endian.
+#define MINIZ_LITTLE_ENDIAN 1
+#endif
+#endif
+
+#endif  // TINYEXR_USE_MINIZ
+
+// static bool IsBigEndian(void) {
+//  union {
+//    unsigned int i;
+//    char c[4];
+//  } bint = {0x01020304};
+//
+//  return bint.c[0] == 1;
+//}
+
+// NOTE(review): presumably the byte count of the magic number plus version
+// field at the start of an EXR file -- confirm against the header parser.
+static const int kEXRVersionSize = 8;
+
+// Reverses the two bytes of *val in place when MINIZ_LITTLE_ENDIAN is not
+// defined; with the macro defined the value is left untouched.
+static void swap2(unsigned short *val) {
+#ifndef MINIZ_LITTLE_ENDIAN
+  unsigned char *bytes = reinterpret_cast<unsigned char *>(val);
+  const unsigned char first = bytes[0];
+
+  bytes[0] = bytes[1];
+  bytes[1] = first;
+#else
+  (void)val;
+#endif
+}
+
+// Reverses the four bytes of *val in place when MINIZ_LITTLE_ENDIAN is not
+// defined; with the macro defined the value is left untouched.
+static void swap4(unsigned int *val) {
+#ifndef MINIZ_LITTLE_ENDIAN
+  unsigned char *bytes = reinterpret_cast<unsigned char *>(val);
+
+  // Exchange the outer pair, then the inner pair.
+  for (int lo = 0, hi = 3; lo < hi; ++lo, --hi) {
+    const unsigned char held = bytes[lo];
+    bytes[lo] = bytes[hi];
+    bytes[hi] = held;
+  }
+#else
+  (void)val;
+#endif
+}
+
+// Reverses the eight bytes of *val in place when MINIZ_LITTLE_ENDIAN is not
+// defined; with the macro defined the value is left untouched.
+static void swap8(tinyexr::tinyexr_uint64 *val) {
+#ifndef MINIZ_LITTLE_ENDIAN
+  unsigned char *bytes = reinterpret_cast<unsigned char *>(val);
+
+  // Walk inwards from both ends, exchanging one pair per step.
+  for (int lo = 0, hi = 7; lo < hi; ++lo, --hi) {
+    const unsigned char held = bytes[lo];
+    bytes[lo] = bytes[hi];
+    bytes[hi] = held;
+  }
+#else
+  (void)val;
+#endif
+}
+
+// https://gist.github.com/rygorous/2156668
+// Reuse MINIZ_LITTLE_ENDIAN flag from miniz.
+// Three views of the same 32 bits: the raw pattern (u), the float value (f)
+// and the individual fields (s: 23-bit mantissa, 8-bit exponent, 1-bit sign).
+// The declaration order of the bit-fields is selected by MINIZ_LITTLE_ENDIAN.
+union FP32 {
+  unsigned int u;
+  float f;
+  struct {
+#if MINIZ_LITTLE_ENDIAN
+    unsigned int Mantissa : 23;
+    unsigned int Exponent : 8;
+    unsigned int Sign : 1;
+#else
+    // Same fields, declared in the opposite order.
+    unsigned int Sign : 1;
+    unsigned int Exponent : 8;
+    unsigned int Mantissa : 23;
+#endif
+  } s;
+};
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wpadded"
+#endif
+
+// Two views of a 16-bit half value: the raw pattern (u) and the individual
+// fields (s: 10-bit mantissa, 5-bit exponent, 1-bit sign). The declaration
+// order of the bit-fields is selected by MINIZ_LITTLE_ENDIAN.
+union FP16 {
+  unsigned short u;
+  struct {
+#if MINIZ_LITTLE_ENDIAN
+    unsigned int Mantissa : 10;
+    unsigned int Exponent : 5;
+    unsigned int Sign : 1;
+#else
+    // Same fields, declared in the opposite order.
+    unsigned int Sign : 1;
+    unsigned int Exponent : 5;
+    unsigned int Mantissa : 10;
+#endif
+  } s;
+};
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
+// Expands the half-precision bit pattern in h.u into a single-precision
+// value, returned as an FP32 union. The exponent/mantissa bits are moved into
+// place first, Inf/NaN and zero/denormal exponents then get their own
+// adjustment, and the sign bit is transferred last.
+static FP32 half_to_float(FP16 h) {
+  // Subtracted (as a float) to renormalize zero/denormal inputs below.
+  static const FP32 magic = {113 << 23};
+  static const unsigned int shifted_exp = 0x7c00
+                                          << 13;  // exponent mask after shift
+  FP32 o;
+
+  o.u = (h.u & 0x7fffU) << 13U;           // exponent/mantissa bits
+  unsigned int exp_ = shifted_exp & o.u;  // just the exponent
+  o.u += (127 - 15) << 23;                // exponent adjust
+
+  // handle exponent special cases
+  if (exp_ == shifted_exp)    // Inf/NaN?
+    o.u += (128 - 16) << 23;  // extra exp adjust
+  else if (exp_ == 0)         // Zero/Denormal?
+  {
+    o.u += 1 << 23;  // extra exp adjust
+    o.f -= magic.f;  // renormalize
+  }
+
+  o.u |= (h.u & 0x8000U) << 16U;  // sign bit
+  return o;
+}
+
+// Converts the single-precision value in f to a half-precision bit pattern.
+// Zero/denormal inputs become a signed zero, Inf/NaN are preserved (NaN with
+// mantissa 0x200), exponents too large for a half become infinity and
+// exponents too small produce a half denormal or zero. Rounding is done by
+// incrementing the raw pattern when the highest discarded mantissa bit is set.
+static FP16 float_to_half_full(FP32 f) {
+  FP16 o = {0};
+
+  // Based on ISPC reference code (with minor modifications)
+  if (f.s.Exponent == 0)  // Signed zero/denormal (which will underflow)
+    o.s.Exponent = 0;
+  else if (f.s.Exponent == 255)  // Inf or NaN (all exponent bits set)
+  {
+    o.s.Exponent = 31;
+    o.s.Mantissa = f.s.Mantissa ? 0x200 : 0;  // NaN->qNaN and Inf->Inf
+  } else                                      // Normalized number
+  {
+    // Exponent unbias the single, then bias the halfp
+    int newexp = f.s.Exponent - 127 + 15;
+    if (newexp >= 31)  // Overflow, return signed infinity
+      o.s.Exponent = 31;
+    else if (newexp <= 0)  // Underflow
+    {
+      // Shifts larger than 24 would drop every mantissa bit, so the result
+      // is left at zero in that case.
+      if ((14 - newexp) <= 24)  // Mantissa might be non-zero
+      {
+        unsigned int mant = f.s.Mantissa | 0x800000;  // Hidden 1 bit
+        o.s.Mantissa = mant >> (14 - newexp);
+        if ((mant >> (13 - newexp)) & 1)  // Check for rounding
+          o.u++;  // Round, might overflow into exp bit, but this is OK
+      }
+    } else {
+      o.s.Exponent = static_cast<unsigned int>(newexp);
+      o.s.Mantissa = f.s.Mantissa >> 13;
+      if (f.s.Mantissa & 0x1000)  // Check for rounding
+        o.u++;                    // Round, might overflow to inf, this is OK
+    }
+  }
+
+  // The sign is written last, after any rounding increment of o.u.
+  o.s.Sign = f.s.Sign;
+  return o;
+}
+
+// NOTE: From OpenEXR code
+// #define IMF_INCREASING_Y  0
+// #define IMF_DECREASING_Y  1
+// #define IMF_RANDOM_Y    2
+//
+// #define IMF_NO_COMPRESSION  0
+// #define IMF_RLE_COMPRESSION 1
+// #define IMF_ZIPS_COMPRESSION  2
+// #define IMF_ZIP_COMPRESSION 3
+// #define IMF_PIZ_COMPRESSION 4
+// #define IMF_PXR24_COMPRESSION 5
+// #define IMF_B44_COMPRESSION 6
+// #define IMF_B44A_COMPRESSION  7
+
+// Copies the NUL-terminated string that starts at ptr into *s and returns the
+// address just past its terminator. The scan has no length limit, so ptr must
+// point at data that really contains a '\0'.
+static const char *ReadString(std::string *s, const char *ptr) {
+  // Read until NUL (\0).
+  const char *terminator = ptr;
+  for (; *terminator != '\0'; ++terminator) {
+  }
+
+  s->assign(ptr, terminator);
+  return terminator + 1;  // skip '\0'
+}
+
+// Parses one header attribute stored as
+//   <name> '\0' <type> '\0' <payload length: uint32> <payload bytes>
+// from the `size` bytes starting at `marker`.
+//
+// On success returns true, fills *name, *type and *data, and stores the number
+// of input bytes the attribute occupies in *marker_size. Returns false when a
+// string is not terminated inside the buffer, when the buffer is too short
+// for the length field or the payload, or when an attribute whose type is not
+// "string" has a zero-length payload; *marker_size is not written then.
+//
+// A zero-length "string" attribute is reported as a single '\0' byte, so *data
+// is never empty when true is returned.
+static bool ReadAttribute(std::string *name, std::string *type,
+                          std::vector<unsigned char> *data, size_t *marker_size,
+                          const char *marker, size_t size) {
+  size_t name_len = strnlen(marker, size);
+  if (name_len == size) {
+    // String does not have a terminating character.
+    return false;
+  }
+  *name = std::string(marker, name_len);
+
+  marker += name_len + 1;
+  size -= name_len + 1;
+
+  size_t type_len = strnlen(marker, size);
+  if (type_len == size) {
+    return false;
+  }
+  *type = std::string(marker, type_len);
+
+  marker += type_len + 1;
+  size -= type_len + 1;
+
+  if (size < sizeof(uint32_t)) {
+    return false;
+  }
+
+  uint32_t data_len;
+  memcpy(&data_len, marker, sizeof(uint32_t));
+  tinyexr::swap4(reinterpret_cast<unsigned int *>(&data_len));
+
+  marker += sizeof(uint32_t);
+  size -= sizeof(uint32_t);
+
+  if (size < data_len) {
+    return false;
+  }
+
+  if (data_len == 0) {
+    // Nothing to copy: at(0) on an empty vector would throw
+    // std::out_of_range, so the empty payload is handled separately.
+    if (type->compare("string") != 0) {
+      return false;
+    }
+    data->resize(1);
+    (*data)[0] = '\0';
+  } else {
+    data->resize(static_cast<size_t>(data_len));
+    memcpy(&data->at(0), marker, static_cast<size_t>(data_len));
+  }
+
+  *marker_size = name_len + 1 + type_len + 1 + sizeof(uint32_t) + data_len;
+  return true;
+}
+
+// Appends one attribute to *out: the name and the type (each including its
+// terminating '\0'), the payload length as an int passed through swap4(), and
+// finally the `len` payload bytes at `data`.
+static void WriteAttributeToMemory(std::vector<unsigned char> *out,
+                                   const char *name, const char *type,
+                                   const unsigned char *data, int len) {
+  const char *name_end = name + strlen(name) + 1;  // one past the '\0'
+  out->insert(out->end(), name, name_end);
+
+  const char *type_end = type + strlen(type) + 1;  // one past the '\0'
+  out->insert(out->end(), type, type_end);
+
+  int stored_len = len;
+  tinyexr::swap4(reinterpret_cast<unsigned int *>(&stored_len));
+  unsigned char *len_bytes = reinterpret_cast<unsigned char *>(&stored_len);
+  out->insert(out->end(), len_bytes, len_bytes + sizeof(int));
+
+  out->insert(out->end(), data, data + len);
+}
+
+// One entry of a channel list, as decoded by ReadChannelInfo() and encoded by
+// WriteChannelInfo().
+typedef struct {
+  std::string name;  // less than 255 bytes long
+  int pixel_type;  // stored as an int right after the name
+  int x_sampling;  // stored as an int after the reserved bytes
+  int y_sampling;  // stored as an int after x_sampling
+  unsigned char p_linear;  // single byte that follows pixel_type
+  unsigned char pad[3];  // not filled in by ReadChannelInfo()
+} ChannelInfo;
+
+// Values collected from one EXR header. clear() returns every member to its
+// empty/zero state.
+typedef struct {
+  std::vector<tinyexr::ChannelInfo> channels;
+  std::vector<EXRAttribute> attributes;
+
+  int data_window[4];
+  int line_order;
+  int display_window[4];
+  float screen_window_center[2];
+  float screen_window_width;
+  float pixel_aspect_ratio;
+
+  int chunk_count;
+
+  // Tiled format
+  int tile_size_x;
+  int tile_size_y;
+  int tile_level_mode;
+  int tile_rounding_mode;
+
+  unsigned int header_len;
+
+  int compression_type;
+
+  // Empties both vectors and zeroes every scalar and array member.
+  void clear() {
+    channels.clear();
+    attributes.clear();
+
+    for (int i = 0; i < 4; i++) {
+      data_window[i] = 0;
+      display_window[i] = 0;
+    }
+    line_order = 0;
+
+    for (int i = 0; i < 2; i++) {
+      screen_window_center[i] = 0.0f;
+    }
+    screen_window_width = 0.0f;
+    pixel_aspect_ratio = 0.0f;
+
+    chunk_count = 0;
+
+    // Tiled format
+    tile_size_x = tile_size_y = 0;
+    tile_level_mode = tile_rounding_mode = 0;
+
+    header_len = 0;
+    compression_type = 0;
+  }
+} HeaderInfo;
+
+// Decodes the payload of a channel-list attribute and appends one ChannelInfo
+// per entry to `channels`. Each entry is a NUL-terminated name followed by 16
+// bytes: pixel type (int), pLinear (uchar), 3 reserved bytes, x sampling
+// (int) and y sampling (int). An empty name ends the list.
+//
+// Parsing never leaves `data`: it stops at the first entry whose name is not
+// terminated inside the buffer or whose fixed-size fields are cut off, so a
+// damaged list yields only the entries that precede the damage. An empty
+// payload yields no channels.
+static void ReadChannelInfo(std::vector<ChannelInfo> &channels,
+                            const std::vector<unsigned char> &data) {
+  if (data.empty()) {
+    return;  // nothing to parse; data.at(0) would throw
+  }
+
+  const char *p = reinterpret_cast<const char *>(&data.at(0));
+  const char *const end = p + data.size();
+  const size_t kFixedFieldsSize = 16;  // int + uchar[4] + int + int
+
+  while (p < end && (*p) != 0) {
+    // Locate the name terminator without leaving the buffer.
+    const void *nul = memchr(p, 0, static_cast<size_t>(end - p));
+    if (nul == NULL) {
+      break;  // name runs off the end of the payload
+    }
+    const char *name_end = static_cast<const char *>(nul);
+    if (static_cast<size_t>(end - name_end) - 1 < kFixedFieldsSize) {
+      break;  // fixed-size fields are truncated
+    }
+
+    ChannelInfo info;
+    info.name = std::string(p, name_end);
+    p = name_end + 1;  // skip '\0'
+
+    memcpy(&info.pixel_type, p, sizeof(int));
+    p += 4;
+    info.p_linear = static_cast<unsigned char>(p[0]);  // uchar
+    p += 1 + 3;                                        // reserved: uchar[3]
+    memcpy(&info.x_sampling, p, sizeof(int));          // int
+    p += 4;
+    memcpy(&info.y_sampling, p, sizeof(int));  // int
+    p += 4;
+
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&info.pixel_type));
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&info.x_sampling));
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&info.y_sampling));
+
+    channels.push_back(info);
+  }
+}
+
+// Serializes `channels` into `data`. Every channel contributes its name plus
+// '\0' followed by 16 bytes: pixel type (int), p_linear in the first of four
+// bytes, x sampling (int) and y sampling (int), each int passed through
+// swap4(). A single '\0' byte closes the list. `data` is resized to exactly
+// the encoded length.
+static void WriteChannelInfo(std::vector<unsigned char> &data,
+                             const std::vector<ChannelInfo> &channels) {
+  // One byte for the list terminator, then name + '\0' + 16 per channel.
+  size_t total = 1;
+  for (size_t i = 0; i < channels.size(); i++) {
+    total += strlen(channels[i].name.c_str()) + 1 + 16;
+  }
+  data.resize(total);
+
+  unsigned char *cursor = &data.at(0);
+
+  for (size_t i = 0; i < channels.size(); i++) {
+    const ChannelInfo &channel = channels[i];
+    const char *name = channel.name.c_str();
+    const size_t name_len = strlen(name);
+
+    memcpy(cursor, name, name_len);
+    cursor[name_len] = '\0';
+    cursor += name_len + 1;
+
+    // Order: pixel type, x sampling, y sampling.
+    int fields[3] = {channel.pixel_type, channel.x_sampling,
+                     channel.y_sampling};
+    for (int f = 0; f < 3; f++) {
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&fields[f]));
+    }
+
+    memcpy(cursor, &fields[0], sizeof(int));
+    cursor += sizeof(int);
+
+    // p_linear occupies the first of four bytes; the other three are skipped.
+    *cursor = channel.p_linear;
+    cursor += 4;
+
+    memcpy(cursor, &fields[1], sizeof(int));
+    cursor += sizeof(int);
+
+    memcpy(cursor, &fields[2], sizeof(int));
+    cursor += sizeof(int);
+  }
+
+  *cursor = '\0';
+}
+
+// Compresses src_size bytes at src into dst and stores the number of bytes
+// produced in compressedSize.
+//
+// The input is first split into two halves (even-indexed bytes, then
+// odd-indexed bytes), then each byte is replaced by its difference to the
+// preceding one, and the result is handed to mz_compress()/compress().
+// The compressor is told that dst can hold compressBound(src_size) bytes, so
+// the caller must supply at least that much room. A compressor failure is
+// only caught by the assert.
+static void CompressZip(unsigned char *dst,
+                        tinyexr::tinyexr_uint64 &compressedSize,
+                        const unsigned char *src, unsigned long src_size) {
+  std::vector<unsigned char> tmpBuf(src_size);
+
+  //
+  // Apply EXR-specific? postprocess. Grabbed from OpenEXR's
+  // ImfZipCompressor.cpp
+  //
+
+  //
+  // Reorder the pixel data.
+  //
+
+  const char *srcPtr = reinterpret_cast<const char *>(src);
+
+  {
+    char *t1 = reinterpret_cast<char *>(&tmpBuf.at(0));
+    char *t2 = reinterpret_cast<char *>(&tmpBuf.at(0)) + (src_size + 1) / 2;
+    const char *stop = srcPtr + src_size;
+
+    for (;;) {
+      if (srcPtr < stop)
+        *(t1++) = *(srcPtr++);
+      else
+        break;
+
+      if (srcPtr < stop)
+        *(t2++) = *(srcPtr++);
+      else
+        break;
+    }
+  }
+
+  //
+  // Predictor.
+  //
+
+  {
+    unsigned char *t = &tmpBuf.at(0) + 1;
+    unsigned char *stop = &tmpBuf.at(0) + src_size;
+    int p = t[-1];
+
+    while (t < stop) {
+      int d = int(t[0]) - p + (128 + 256);
+      p = t[0];
+      t[0] = static_cast<unsigned char>(d);
+      ++t;
+    }
+  }
+
+#if TINYEXR_USE_MINIZ
+  //
+  // Compress the data using miniz
+  //
+
+  miniz::mz_ulong outSize = miniz::mz_compressBound(src_size);
+  int ret = miniz::mz_compress(
+      dst, &outSize, static_cast<const unsigned char *>(&tmpBuf.at(0)),
+      src_size);
+  assert(ret == miniz::MZ_OK);
+  (void)ret;
+
+  compressedSize = outSize;
+#else
+  uLong outSize = compressBound(static_cast<uLong>(src_size));
+  int ret = compress(dst, &outSize, static_cast<const Bytef *>(&tmpBuf.at(0)),
+                     src_size);
+  assert(ret == Z_OK);
+  (void)ret;  // keeps NDEBUG builds free of an unused-variable warning
+
+  compressedSize = outSize;
+#endif
+}
+
+// Inflates src_size bytes at src and writes the restored bytes to dst.
+// On entry *uncompressed_size is the size of the scratch buffer handed to the
+// decompressor; on return it holds the size the decompressor reported.
+// After inflating, the byte-difference predictor is undone and the two halves
+// of the buffer are interleaved again (first half -> even positions, second
+// half -> odd positions). A decompressor failure is only caught by the assert.
+static void DecompressZip(unsigned char *dst,
+                          unsigned long *uncompressed_size /* inout */,
+                          const unsigned char *src, unsigned long src_size) {
+  std::vector<unsigned char> inflated(*uncompressed_size);
+
+#if TINYEXR_USE_MINIZ
+  int ret =
+      miniz::mz_uncompress(&inflated.at(0), uncompressed_size, src, src_size);
+  assert(ret == miniz::MZ_OK);
+  (void)ret;
+#else
+  int ret = uncompress(&inflated.at(0), uncompressed_size, src, src_size);
+  assert(ret == Z_OK);
+  (void)ret;
+#endif
+
+  //
+  // Apply EXR-specific? postprocess. Grabbed from OpenEXR's
+  // ImfZipCompressor.cpp
+  //
+
+  const unsigned long total = *uncompressed_size;
+  unsigned char *buf = &inflated[0];
+
+  // Predictor: every byte is stored relative to its (restored) predecessor.
+  for (unsigned long i = 1; i < total; i++) {
+    const int d = int(buf[i - 1]) + int(buf[i]) - 128;
+    buf[i] = static_cast<unsigned char>(d);
+  }
+
+  // Reorder the pixel data.
+  const unsigned long half = (total + 1) / 2;
+  for (unsigned long i = 0; i < total; i++) {
+    dst[i] = (i & 1) ? buf[half + i / 2] : buf[i / 2];
+  }
+}
+
+// RLE code from OpenEXR --------------------------------------
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#endif
+
+const int MIN_RUN_LENGTH = 3;
+const int MAX_RUN_LENGTH = 127;
+
+//
+// Run-length encodes inLength bytes from in[] into out[] and returns the
+// number of bytes written.
+//
+// Output format: a non-negative byte c followed by one value byte stands for
+// c + 1 copies of that value; a negative byte -c is followed by c bytes that
+// are copied verbatim. Runs of fewer than MIN_RUN_LENGTH equal bytes are
+// emitted as part of a verbatim block.
+//
+
+static int rleCompress(int inLength, const char in[], signed char out[]) {
+  const signed char *bytes = reinterpret_cast<const signed char *>(in);
+  int written = 0;
+  int start = 0;  // first byte of the block being built
+  int end = 1;    // one past its last byte
+
+  while (start < inLength) {
+    // Grow over identical bytes; a repeat block holds up to
+    // MAX_RUN_LENGTH + 1 of them.
+    while (end < inLength && in[start] == in[end] &&
+           end - start - 1 < MAX_RUN_LENGTH) {
+      ++end;
+    }
+
+    if (end - start >= MIN_RUN_LENGTH) {
+      //
+      // Compressible run
+      //
+
+      out[written++] = static_cast<signed char>(end - start - 1);
+      out[written++] = bytes[start];
+      start = end;
+    } else {
+      //
+      // Incompressible run: extend it until three identical bytes start at
+      // `end`, the input ends, or the block is full.
+      //
+
+      while (end < inLength && end - start < MAX_RUN_LENGTH) {
+        const bool repeat_ahead = end + 2 < inLength &&
+                                  in[end] == in[end + 1] &&
+                                  in[end + 1] == in[end + 2];
+        if (repeat_ahead) {
+          break;
+        }
+        ++end;
+      }
+
+      out[written++] = static_cast<signed char>(start - end);
+
+      while (start < end) {
+        out[written++] = bytes[start++];
+      }
+    }
+
+    ++end;
+  }
+
+  return written;
+}
+
+//
+// Uncompress an array of bytes compressed with rleCompress().
+// Returns the length of the uncompressed data, or 0 if the
+// length of the uncompressed data would be more than maxLength
+// or if the compressed input ends in the middle of a run.
+//
+
+static int rleUncompress(int inLength, int maxLength, const signed char in[],
+                         char out[]) {
+  char *outStart = out;
+
+  while (inLength > 0) {
+    if (*in < 0) {
+      // Verbatim block: -count, followed by count bytes to copy.
+      int count = -(static_cast<int>(*in++));
+      inLength -= count + 1;
+
+      if (0 > (maxLength -= count)) return 0;
+
+      // The header promised `count` more input bytes; stop instead of
+      // reading past the end of the input when they are not all there.
+      if (inLength < 0) return 0;
+
+      memcpy(out, in, count);
+      out += count;
+      in += count;
+    } else {
+      // Repeat block: count, followed by the byte to write count + 1 times.
+      int count = *in++;
+      inLength -= 2;
+
+      if (0 > (maxLength -= count + 1)) return 0;
+
+      // The byte to repeat must still be inside the input.
+      if (inLength < 0) return 0;
+
+      memset(out, *reinterpret_cast<const char *>(in), count + 1);
+      out += count + 1;
+
+      in++;
+    }
+  }
+
+  return static_cast<int>(out - outStart);
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+// End of RLE code from OpenEXR -----------------------------------
+
+// Run-length compresses src_size bytes at src into dst and stores the number
+// of bytes produced in compressedSize.
+//
+// Before encoding, the input is split into two halves (even-indexed bytes,
+// then odd-indexed bytes) and each byte is replaced by its difference to the
+// preceding one, following OpenEXR's ImfRleCompressor.cpp.
+static void CompressRle(unsigned char *dst,
+                        tinyexr::tinyexr_uint64 &compressedSize,
+                        const unsigned char *src, unsigned long src_size) {
+  std::vector<unsigned char> staged(src_size);
+  unsigned char *buf = &staged.at(0);
+
+  //
+  // Reorder the pixel data: even positions fill the first half, odd
+  // positions the second half.
+  //
+
+  const unsigned long half = (src_size + 1) / 2;
+  for (unsigned long i = 0; i < src_size; i++) {
+    const unsigned long slot = (i & 1) ? (half + i / 2) : (i / 2);
+    buf[slot] = src[i];
+  }
+
+  //
+  // Predictor: store each byte relative to the original value before it.
+  //
+
+  int prev = buf[0];
+  for (unsigned long i = 1; i < src_size; i++) {
+    const int cur = buf[i];
+    buf[i] = static_cast<unsigned char>(cur - prev + (128 + 256));
+    prev = cur;
+  }
+
+  // outSize will be (srcSiz * 3) / 2 at max.
+  int outSize = rleCompress(static_cast<int>(src_size),
+                            reinterpret_cast<const char *>(buf),
+                            reinterpret_cast<signed char *>(dst));
+  assert(outSize > 0);
+
+  compressedSize = static_cast<tinyexr::tinyexr_uint64>(outSize);
+}
+
+// Expands src_size run-length encoded bytes at src into exactly
+// uncompressed_size bytes at dst.
+//
+// After decoding, the byte-difference predictor is undone and the two halves
+// of the buffer are interleaved again (first half -> even positions, second
+// half -> odd positions), following OpenEXR's ImfRleCompressor.cpp. A size
+// mismatch reported by the decoder is only caught by the assert.
+static void DecompressRle(unsigned char *dst,
+                          const unsigned long uncompressed_size,
+                          const unsigned char *src, unsigned long src_size) {
+  std::vector<unsigned char> expanded(uncompressed_size);
+  unsigned char *buf = &expanded.at(0);
+
+  int ret = rleUncompress(static_cast<int>(src_size),
+                          static_cast<int>(uncompressed_size),
+                          reinterpret_cast<const signed char *>(src),
+                          reinterpret_cast<char *>(buf));
+  assert(ret == static_cast<int>(uncompressed_size));
+  (void)ret;
+
+  // Predictor: every byte is stored relative to its (restored) predecessor.
+  for (unsigned long i = 1; i < uncompressed_size; i++) {
+    const int d = int(buf[i - 1]) + int(buf[i]) - 128;
+    buf[i] = static_cast<unsigned char>(d);
+  }
+
+  // Reorder the pixel data.
+  const unsigned long half = (uncompressed_size + 1) / 2;
+  for (unsigned long i = 0; i < uncompressed_size; i++) {
+    dst[i] = (i & 1) ? buf[half + i / 2] : buf[i / 2];
+  }
+}
+
+#if TINYEXR_USE_PIZ
+
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++11-long-long"
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wpadded"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wc++11-extensions"
+#pragma clang diagnostic ignored "-Wconversion"
+#endif
+
+//
+// PIZ compress/uncompress, based on OpenEXR's ImfPizCompressor.cpp
+//
+// -----------------------------------------------------------------
+// Copyright (c) 2004, Industrial Light & Magic, a division of Lucas
+// Digital Ltd. LLC)
+// (3 clause BSD license)
+//
+
+// Per-channel record declared for the PIZ code path. Nothing in this part of
+// the file reads or writes the members, so the notes below are assumptions to
+// be confirmed against the PIZ compress/decompress routines.
+struct PIZChannelData {
+  unsigned short *start;  // presumably first sample of the channel -- confirm
+  unsigned short *end;  // presumably one past the last sample -- confirm
+  int nx;  // presumably samples per row -- confirm
+  int ny;  // presumably number of rows -- confirm
+  int ys;  // presumably the channel's y sampling -- confirm
+  int size;  // presumably 16-bit words per sample -- confirm
+};
+
+//-----------------------------------------------------------------------------
+//
+//  16-bit Haar Wavelet encoding and decoding
+//
+//  The source code in this file is derived from the encoding
+//  and decoding routines written by Christian Rouet for his
+//  PIZ image file format.
+//
+//-----------------------------------------------------------------------------
+
+//
+// Wavelet basis functions without modulo arithmetic; they produce
+// the best compression ratios when the wavelet-transformed data are
+// Huffman-encoded, but the wavelet transform works only for 14-bit
+// data (untransformed data values must be less than (1 << 14)).
+//
+
+// Forward step of the 14-bit wavelet basis: treats a and b as signed 16-bit
+// values and stores their floor-average in l and their difference (a - b)
+// in h.
+inline void wenc14(unsigned short a, unsigned short b, unsigned short &l,
+                   unsigned short &h) {
+  const int first = static_cast<short>(a);
+  const int second = static_cast<short>(b);
+
+  const short mean = static_cast<short>((first + second) >> 1);
+  const short diff = static_cast<short>(first - second);
+
+  l = static_cast<unsigned short>(mean);
+  h = static_cast<unsigned short>(diff);
+}
+
+// Inverse of wenc14(): rebuilds the two values a and b from the average l and
+// the difference h, both interpreted as signed 16-bit values.
+inline void wdec14(unsigned short l, unsigned short h, unsigned short &a,
+                   unsigned short &b) {
+  const int mean = static_cast<short>(l);
+  const int diff = static_cast<short>(h);
+
+  // (diff & 1) restores the bit dropped by the >> 1 of the encoder.
+  const int first = mean + (diff & 1) + (diff >> 1);
+  const int second = first - diff;
+
+  a = static_cast<unsigned short>(static_cast<short>(first));
+  b = static_cast<unsigned short>(static_cast<short>(second));
+}
+
+//
+// Wavelet basis functions with modulo arithmetic; they work with full
+// 16-bit data, but Huffman-encoding the wavelet-transformed data doesn't
+// compress the data quite as well.
+//
+
+const int NBITS = 16;
+const int A_OFFSET = 1 << (NBITS - 1);
+const int M_OFFSET = 1 << (NBITS - 1);
+const int MOD_MASK = (1 << NBITS) - 1;
+
+// Forward step of the 16-bit wavelet basis with modulo arithmetic: a is
+// shifted by A_OFFSET (mod 2^NBITS), then the average of the shifted value
+// and b goes to l and their difference (mod 2^NBITS) to h. When the
+// difference is negative the average is additionally shifted by M_OFFSET.
+inline void wenc16(unsigned short a, unsigned short b, unsigned short &l,
+                   unsigned short &h) {
+  const int shifted = (a + A_OFFSET) & MOD_MASK;
+  const int diff = shifted - b;
+  int mean = (shifted + b) >> 1;
+
+  if (diff < 0) {
+    mean = (mean + M_OFFSET) & MOD_MASK;
+  }
+
+  l = static_cast<unsigned short>(mean);
+  h = static_cast<unsigned short>(diff & MOD_MASK);
+}
+
+inline void wdec16(unsigned short l, unsigned short h, unsigned short &a,
+                   unsigned short &b) {
+  // Inverse of wenc16(); every result is reduced modulo 2^16 via MOD_MASK.
+  const int mean = l;
+  const int diff = h;
+  const int second = (mean - (diff >> 1)) & MOD_MASK;
+  b = static_cast<unsigned short>(second);
+  a = static_cast<unsigned short>((diff + second - A_OFFSET) & MOD_MASK);
+}
+
+//
+// 2D Wavelet encoding:
+//
+
+static void wav2Encode(
+    unsigned short *in,  // io: values are transformed in place
+    int nx,              // i : x size
+    int ox,              // i : x offset (element stride between x samples)
+    int ny,              // i : y size
+    int oy,              // i : y offset (element stride between y samples)
+    unsigned short mx)   // i : maximum in[x][y] value
+{
+  bool w14 = (mx < (1 << 14));  // small values allow the 14-bit basis
+  int n = (nx > ny) ? ny : nx;  // smaller of the two dimensions
+  int p = 1;   // == 1 <<  level
+  int p2 = 2;  // == 1 << (level+1)
+
+  // Hierarchical loop on the smaller dimension n.  Each level combines
+  // samples lying p apart in 2x2 groups and writes the results back in
+  // place; p and p2 are then doubled for the next level.
+
+  while (p2 <= n) {
+    unsigned short *py = in;
+    unsigned short *ey = in + oy * (ny - p2);  // last py with a full row pair
+    int oy1 = oy * p;   // stride to the partner row at this level
+    int oy2 = oy * p2;  // stride to the next row pair
+    int ox1 = ox * p;   // stride to the partner column at this level
+    int ox2 = ox * p2;  // stride to the next column pair
+    unsigned short i00, i01, i10, i11;  // intermediate row-wise results
+
+    //
+    // Y loop
+    //
+
+    for (; py <= ey; py += oy2) {
+      unsigned short *px = py;
+      unsigned short *ex = py + ox * (nx - p2);
+
+      //
+      // X loop
+      //
+
+      for (; px <= ex; px += ox2) {
+        unsigned short *p01 = px + ox1;
+        unsigned short *p10 = px + oy1;
+        unsigned short *p11 = p10 + ox1;
+
+        // 2D wavelet encoding: transform both rows of the 2x2 group along
+        // x into temporaries, then transform the two resulting columns
+        // along y and store them back over the four samples.
+
+        if (w14) {
+          wenc14(*px, *p01, i00, i01);
+          wenc14(*p10, *p11, i10, i11);
+          wenc14(i00, i10, *px, *p10);
+          wenc14(i01, i11, *p01, *p11);
+        } else {
+          wenc16(*px, *p01, i00, i01);
+          wenc16(*p10, *p11, i10, i11);
+          wenc16(i00, i10, *px, *p10);
+          wenc16(i01, i11, *p01, *p11);
+        }
+      }
+
+      // Encode (1D) odd column (still in Y loop): taken when bit p of nx
+      // is set.  The sample px is left on by the X loop is combined with
+      // the one oy1 below it, i.e. along y only.
+
+      if (nx & p) {
+        unsigned short *p10 = px + oy1;
+
+        if (w14)
+          wenc14(*px, *p10, i00, *p10);
+        else
+          wenc16(*px, *p10, i00, *p10);
+
+        *px = i00;
+      }
+    }
+
+    // Encode (1D) odd line (must loop in X): taken when bit p of ny is
+    // set.  The row py is left on by the Y loop is transformed along x
+    // only, pairing samples ox1 apart.
+
+    if (ny & p) {
+      unsigned short *px = py;
+      unsigned short *ex = py + ox * (nx - p2);
+
+      for (; px <= ex; px += ox2) {
+        unsigned short *p01 = px + ox1;
+
+        if (w14)
+          wenc14(*px, *p01, i00, *p01);
+        else
+          wenc16(*px, *p01, i00, *p01);
+
+        *px = i00;
+      }
+    }
+
+    //
+    // Next level
+    //
+
+    p = p2;
+    p2 <<= 1;
+  }
+}
+
+//
+// 2D Wavelet decoding:
+//
+
+static void wav2Decode(
+    unsigned short *in,  // io: values are transformed in place
+    int nx,              // i : x size
+    int ox,              // i : x offset (element stride between x samples)
+    int ny,              // i : y size
+    int oy,              // i : y offset (element stride between y samples)
+    unsigned short mx)   // i : maximum in[x][y] value
+{
+  bool w14 = (mx < (1 << 14));  // selects wdec14() over wdec16()
+  int n = (nx > ny) ? ny : nx;  // smaller of the two dimensions
+  int p = 1;
+  int p2;
+
+  // Search max level: for n >= 1 the statements below leave p2 at the
+  // largest power of two not exceeding n and p at half of that, which are
+  // the values wav2Encode() used on its last pass.
+
+  while (p <= n) p <<= 1;
+
+  p >>= 1;
+  p2 = p;
+  p >>= 1;
+
+  // Hierarchical loop on smaller dimension n.  Levels are undone from the
+  // coarsest to the finest (p is halved on every pass), the reverse of
+  // the order in which wav2Encode() applied them.
+
+  while (p >= 1) {
+    unsigned short *py = in;
+    unsigned short *ey = in + oy * (ny - p2);  // last py with a full row pair
+    int oy1 = oy * p;   // stride to the partner row at this level
+    int oy2 = oy * p2;  // stride to the next row pair
+    int ox1 = ox * p;   // stride to the partner column at this level
+    int ox2 = ox * p2;  // stride to the next column pair
+    unsigned short i00, i01, i10, i11;  // intermediate column-wise results
+
+    //
+    // Y loop
+    //
+
+    for (; py <= ey; py += oy2) {
+      unsigned short *px = py;
+      unsigned short *ex = py + ox * (nx - p2);
+
+      //
+      // X loop
+      //
+
+      for (; px <= ex; px += ox2) {
+        unsigned short *p01 = px + ox1;
+        unsigned short *p10 = px + oy1;
+        unsigned short *p11 = p10 + ox1;
+
+        // 2D wavelet decoding: undo the y transform of both columns into
+        // temporaries first, then undo the x transform of the two rows,
+        // mirroring the order used by wav2Encode().
+
+        if (w14) {
+          wdec14(*px, *p10, i00, i10);
+          wdec14(*p01, *p11, i01, i11);
+          wdec14(i00, i01, *px, *p01);
+          wdec14(i10, i11, *p10, *p11);
+        } else {
+          wdec16(*px, *p10, i00, i10);
+          wdec16(*p01, *p11, i01, i11);
+          wdec16(i00, i01, *px, *p01);
+          wdec16(i10, i11, *p10, *p11);
+        }
+      }
+
+      // Decode (1D) odd column (still in Y loop): taken when bit p of nx
+      // is set.  The sample px is left on by the X loop is decoded
+      // together with the one oy1 below it, i.e. along y only.
+
+      if (nx & p) {
+        unsigned short *p10 = px + oy1;
+
+        if (w14)
+          wdec14(*px, *p10, i00, *p10);
+        else
+          wdec16(*px, *p10, i00, *p10);
+
+        *px = i00;
+      }
+    }
+
+    // Decode (1D) odd line (must loop in X): taken when bit p of ny is
+    // set.  The row py is left on by the Y loop is decoded along x only,
+    // pairing samples ox1 apart.
+
+    if (ny & p) {
+      unsigned short *px = py;
+      unsigned short *ex = py + ox * (nx - p2);
+
+      for (; px <= ex; px += ox2) {
+        unsigned short *p01 = px + ox1;
+
+        if (w14)
+          wdec14(*px, *p01, i00, *p01);
+        else
+          wdec16(*px, *p01, i00, *p01);
+
+        *px = i00;
+      }
+    }
+
+    //
+    // Next level
+    //
+
+    p2 = p;
+    p >>= 1;
+  }
+}
+
+//-----------------------------------------------------------------------------
+//
+//  16-bit Huffman compression and decompression.
+//
+//  The source code in this file is derived from the 8-bit
+//  Huffman compression and decompression routines written
+//  by Christian Rouet for his PIZ image file format.
+//
+//-----------------------------------------------------------------------------
+
+// Adds some modification for tinyexr.
+
+const int HUF_ENCBITS = 16;  // literal (value) bit length
+const int HUF_DECBITS = 14;  // decoding bit size (>= 8)
+// The encoding table has one slot more than there are 16-bit values.
+const int HUF_ENCSIZE = (1 << HUF_ENCBITS) + 1;  // encoding table size
+const int HUF_DECSIZE = 1 << HUF_DECBITS;        // decoding table size
+const int HUF_DECMASK = HUF_DECSIZE - 1;         // index mask for that table
+
+struct HufDec {  // decoding-table entry; meaning depends on the code kind
+  // field          short code (len != 0)    long code (len == 0)
+  int len : 8;   // code length in bits      0
+  int lit : 24;  // decoded symbol           number of elements in p
+  int *p;        // NULL                     symbols sharing this table slot
+};
+
+inline long long hufLength(long long code) { return code & 0x3f; }  // low 6 bits
+// The bits above the 6-bit length field hold the code itself.
+inline long long hufCode(long long code) { return code >> 6; }
+
+inline void outputBits(int nBits, long long bits, long long &c, int &lc,
+                       char *&out) {
+  // Shift the new bits into the accumulator, then flush whole bytes.
+  c = (c << nBits) | bits;
+  for (lc += nBits; lc >= 8; ++out) {
+    lc -= 8;
+    *out = static_cast<char>(c >> lc);
+  }
+}
+
+inline long long getBits(int nBits, long long &c, int &lc, const char *&in) {
+  // Refill the accumulator one input byte at a time until nBits are held.
+  for (; lc < nBits; lc += 8, ++in) {
+    c = (c << 8) | static_cast<unsigned char>(*in);
+  }
+  const int mask = (1 << nBits) - 1;
+  lc -= nBits;
+  return (c >> lc) & mask;
+}
+
+//
+// ENCODING TABLE BUILDING & (UN)PACKING
+//
+
+//
+// Build a "canonical" Huffman code table:
+//  - for each (uncompressed) symbol, hcode contains the length
+//    of the corresponding code (in the compressed data)
+//  - canonical codes are computed and stored in hcode
+//  - the rules for constructing canonical codes are as follows:
+//    * shorter codes (if filled with zeroes to the right)
+//      have a numerically higher value than longer codes
+//    * for codes with the same length, numerical values
+//      increase with numerical symbol values
+//  - because the canonical code table can be constructed from
+//    symbol lengths alone, the code table can be transmitted
+//    without sending the actual code values
+//  - see http://www.compressconsult.com/huffman/
+//
+
+static void hufCanonicalCodeTable(long long hcode[HUF_ENCSIZE]) {
+  // On entry hcode[s] holds only the code length of symbol s (0..58, where
+  // 0 means "unused").  On exit every used entry holds
+  // length | (canonical code << 6).
+  const int kMaxLength = 58;
+  long long firstCode[kMaxLength + 1];
+  //
+  // Pass 1: histogram of code lengths; firstCode[len] temporarily holds
+  // the number of symbols whose code is len bits long.
+  //
+
+  for (int len = 0; len <= kMaxLength; ++len) {
+    firstCode[len] = 0;
+  }
+  for (int sym = 0; sym < HUF_ENCSIZE; ++sym) {
+    firstCode[hcode[sym]]++;
+  }
+
+  //
+  // Pass 2: walk from the longest length to the shortest, turning each
+  // count into the numerically lowest code of that length.
+  //
+
+  long long lowest = 0;
+  for (int len = kMaxLength; len >= 1; --len) {
+    const long long count = firstCode[len];
+    firstCode[len] = lowest;
+    lowest = (lowest + count) >> 1;
+  }
+
+  //
+  // Pass 3: hand out consecutive codes to the symbols of each length, in
+  // increasing symbol order, and pack code and length together.
+  //
+
+  for (int sym = 0; sym < HUF_ENCSIZE; ++sym) {
+    const int len = static_cast<int>(hcode[sym]);
+    if (len > 0) hcode[sym] = len | (firstCode[len]++ << 6);
+  }
+}
+
+//
+// Compute Huffman codes (based on frq input) and store them in frq:
+//  - code structure is : [63:lsb - 6:msb] | [5-0: bit length];
+//  - max code length is 58 bits;
+//  - codes outside the range [im-iM] have a null length (unused values);
+//  - original frequencies are destroyed;
+//  - encoding tables are used by hufEncode() and hufBuildDecTable();
+//
+
+struct FHeapCompare {  // "greater than" on the pointed-to frequencies
+  bool operator()(long long *a, long long *b) { return *b < *a; }
+};
+
+static void hufBuildEncTable(
+    long long *frq,  // io: input frequencies [HUF_ENCSIZE], output table
+    int *im,         //  o: min frq index
+    int *iM)         //  o: max frq index
+{
+  //
+  // This function assumes that when it is called, array frq
+  // indicates the frequency of all possible symbols in the data
+  // that are to be Huffman-encoded.  (frq[i] contains the number
+  // of occurrences of symbol i in the data.)
+  //
+  // The loop below does three things:
+  //
+  // 1) Finds the minimum and maximum indices that point
+  //    to non-zero entries in frq:
+  //
+  //     frq[im] != 0, and frq[i] == 0 for all i < im
+  //     frq[iM] != 0, and frq[i] == 0 for all i > iM
+  //
+  // 2) Fills array fHeap with pointers to all non-zero
+  //    entries in frq.
+  //
+  // 3) Initializes array hlink such that hlink[i] == i
+  //    for all array entries.
+  //
+  // Heap-allocated: as plain arrays these tables need over 1 MB of stack.
+  std::vector<int> hlink(HUF_ENCSIZE);
+  std::vector<long long *> fHeap(HUF_ENCSIZE);
+
+  *im = 0;
+
+  while (!frq[*im]) (*im)++;  // requires at least one non-zero frequency
+
+  int nf = 0;  // number of live entries at the front of fHeap
+
+  for (int i = *im; i < HUF_ENCSIZE; i++) {
+    hlink[i] = i;
+
+    if (frq[i]) {
+      fHeap[nf] = &frq[i];
+      nf++;
+      *iM = i;
+    }
+  }
+
+  //
+  // Add a pseudo-symbol, with a frequency count of 1, to frq;
+  // adjust the fHeap and hlink array accordingly.  Function
+  // hufEncode() uses the pseudo-symbol for run-length encoding.
+  //
+
+  (*iM)++;
+  frq[*iM] = 1;
+  fHeap[nf] = &frq[*iM];
+  nf++;
+
+  //
+  // Build an array, scode, such that scode[i] contains the number
+  // of bits assigned to symbol i.  Conceptually this is done by
+  // constructing a tree whose leaves are the symbols with non-zero
+  // frequency:
+  //
+  //     Make a heap that contains all symbols with a non-zero frequency,
+  //     with the least frequent symbol on top.
+  //
+  //     Repeat until only one symbol is left on the heap:
+  //
+  //         Take the two least frequent symbols off the top of the heap.
+  //         Create a new node that has first two nodes as children, and
+  //         whose frequency is the sum of the frequencies of the first
+  //         two nodes.  Put the new node back into the heap.
+  //
+  // The last node left on the heap is the root of the tree.  For each
+  // leaf node, the distance between the root and the leaf is the length
+  // of the code for the corresponding symbol.
+  //
+  // The loop below doesn't actually build the tree; instead we compute
+  // the distances of the leaves from the root on the fly.  When a new
+  // node is added to the heap, then that node's descendants are linked
+  // into a single linear list that starts at the new node, and the code
+  // lengths of the descendants (that is, their distance from the root
+  // of the tree) are incremented by one.
+  //
+
+  std::make_heap(fHeap.begin(), fHeap.begin() + nf, FHeapCompare());
+
+  std::vector<long long> scode(HUF_ENCSIZE);
+  // (std::vector value-initializes its elements, so scode starts at zero.)
+
+  while (nf > 1) {
+    //
+    // Find the indices, mm and m, of the two smallest non-zero frq
+    // values in fHeap, add the smallest frq to the second-smallest
+    // frq, and remove the smallest frq value from fHeap.
+    //
+
+    int mm = fHeap[0] - frq;
+    std::pop_heap(fHeap.begin(), fHeap.begin() + nf, FHeapCompare());
+    --nf;
+
+    int m = fHeap[0] - frq;
+    std::pop_heap(fHeap.begin(), fHeap.begin() + nf, FHeapCompare());
+
+    frq[m] += frq[mm];
+    std::push_heap(fHeap.begin(), fHeap.begin() + nf, FHeapCompare());
+
+    //
+    // The entries in scode are linked into lists with the
+    // entries in hlink serving as "next" pointers and with
+    // the end of a list marked by hlink[j] == j.
+    //
+    // Traverse the lists that start at scode[m] and scode[mm].
+    // For each element visited, increment the length of the
+    // corresponding code by one bit. (If we visit scode[j]
+    // during the traversal, then the code for symbol j becomes
+    // one bit longer.)
+    //
+    // Merge the lists that start at scode[m] and scode[mm]
+    // into a single list that starts at scode[m].
+    //
+
+    //
+    // Add a bit to all codes in the first list.
+    //
+
+    for (int j = m;; j = hlink[j]) {
+      scode[j]++;
+
+      assert(scode[j] <= 58);
+
+      if (hlink[j] == j) {
+        //
+        // Merge the two lists.
+        //
+
+        hlink[j] = mm;
+        break;
+      }
+    }
+
+    //
+    // Add a bit to all codes in the second list
+    //
+
+    for (int j = mm;; j = hlink[j]) {
+      scode[j]++;
+
+      assert(scode[j] <= 58);
+
+      if (hlink[j] == j) break;
+    }
+  }
+
+  //
+  // Build a canonical Huffman code table, replacing the code
+  // lengths in scode with (code, code length) pairs.  Copy the
+  // code table from scode into frq.
+  //
+
+  hufCanonicalCodeTable(&scode[0]);
+  memcpy(frq, &scode[0], sizeof(long long) * HUF_ENCSIZE);
+}
+
+//
+// Pack an encoding table:
+//  - only code lengths, not actual codes, are stored
+//  - runs of zeroes are compressed as follows:
+//
+//    unpacked    packed
+//    --------------------------------
+//    1 zero    0  (6 bits)
+//    2 zeroes    59
+//    3 zeroes    60
+//    4 zeroes    61
+//    5 zeroes    62
+//    n zeroes (6 or more)  63 n-6  (6 + 8 bits)
+//
+
+const int SHORT_ZEROCODE_RUN = 59;  // packed value for a run of 2 zeroes
+const int LONG_ZEROCODE_RUN = 63;   // packed value that introduces a long run
+const int SHORTEST_LONG_RUN = 2 + LONG_ZEROCODE_RUN - SHORT_ZEROCODE_RUN;  // 6
+const int LONGEST_LONG_RUN = 255 + SHORTEST_LONG_RUN;  // 261: 8-bit count + 6
+
+static void hufPackEncTable(
+    const long long *hcode,  // i : encoding table [HUF_ENCSIZE]
+    int im,                  // i : min hcode index
+    int iM,                  // i : max hcode index
+    char **pcode)            //  o: ptr to packed table (updated)
+{
+  char *dst = *pcode;
+  long long acc = 0;  // bit accumulator shared by all outputBits() calls
+  int accBits = 0;    // number of pending bits in acc
+
+  while (im <= iM) {
+    const int len = hufLength(hcode[im]);
+    int zeroes = 0;
+
+    if (len == 0) {
+      // Extend the run over the following zero-length entries, up to the
+      // longest run that a single packed token can describe.
+      zeroes = 1;
+      while (im < iM && zeroes < LONGEST_LONG_RUN &&
+             hufLength(hcode[im + 1]) == 0) {
+        ++im;
+        ++zeroes;
+      }
+    }
+
+    if (zeroes >= SHORTEST_LONG_RUN) {
+      outputBits(6, LONG_ZEROCODE_RUN, acc, accBits, dst);
+      outputBits(8, zeroes - SHORTEST_LONG_RUN, acc, accBits, dst);
+    } else if (zeroes >= 2) {
+      outputBits(6, SHORT_ZEROCODE_RUN + zeroes - 2, acc, accBits, dst);
+    } else {
+      outputBits(6, len, acc, accBits, dst);  // plain length (may be 0)
+    }
+    ++im;
+  }
+
+  // Left-align and flush whatever bits remain in the accumulator.
+  if (accBits > 0) *dst++ = (unsigned char)(acc << (8 - accBits));
+  *pcode = dst;
+}
+
+//
+// Unpack an encoding table packed by hufPackEncTable():
+//
+
+static bool hufUnpackEncTable(
+    const char **pcode,  // io: ptr to packed table (updated)
+    int ni,              // i : input size (in bytes)
+    int im,              // i : min hcode index
+    int iM,              // i : max hcode index
+    long long *hcode)    //  o: encoding table [HUF_ENCSIZE]
+{
+  memset(hcode, 0, sizeof(long long) * HUF_ENCSIZE);
+  // Returns false if the table is truncated or a zero run overruns iM.
+  const char *p = *pcode;
+  long long c = 0;  // bit accumulator filled by getBits()
+  int lc = 0;       // number of unread bits held in c
+
+  for (; im <= iM; im++) {
+    if (lc < 6 && p - *pcode >= ni) {
+      return false;  // getBits(6) would read past the ni input bytes
+    }
+
+    long long l = hcode[im] = getBits(6, c, lc, p);  // code length
+
+    if (l == (long long)LONG_ZEROCODE_RUN) {
+      if (lc < 8 && p - *pcode >= ni) {
+        return false;  // the 8-bit run count would be read out of bounds
+      }
+
+      int zerun = getBits(8, c, lc, p) + SHORTEST_LONG_RUN;
+
+      if (im + zerun > iM + 1) {
+        return false;  // run extends beyond the declared index range
+      }
+
+      while (zerun--) hcode[im++] = 0;
+
+      im--;  // the enclosing for loop increments im again
+    } else if (l >= (long long)SHORT_ZEROCODE_RUN) {
+      int zerun = l - SHORT_ZEROCODE_RUN + 2;  // short runs: 2..5 zeroes
+
+      if (im + zerun > iM + 1) {
+        return false;  // run extends beyond the declared index range
+      }
+
+      while (zerun--) hcode[im++] = 0;
+
+      im--;  // the enclosing for loop increments im again
+    }
+  }
+
+  *pcode = const_cast<char *>(p);
+  // hcode holds code lengths only; derive the canonical codes from them.
+  hufCanonicalCodeTable(hcode);
+
+  return true;
+}
+
+//
+// DECODING TABLE BUILDING
+//
+
+//
+// Clear a newly allocated decoding table so that it contains only zeroes.
+//
+
+static void hufClearDecTable(HufDec *hdecod)  // io: (allocated by caller)
+//     decoding table [HUF_DECSIZE]
+{
+  // Reset all HUF_DECSIZE entries: no code length, no literal, no list.
+  for (HufDec *entry = hdecod; entry != hdecod + HUF_DECSIZE; ++entry) {
+    entry->p = NULL;
+    entry->lit = 0;
+    entry->len = 0;
+  }
+}
+
+//
+// Build a decoding hash table based on the encoding table hcode:
+//  - short codes (<= HUF_DECBITS) are resolved with a single table access;
+//  - long code entry allocations are not optimized, because long codes are
+//    infrequent;
+//  - decoding tables are used by hufDecode();
+//
+
+static bool hufBuildDecTable(const long long *hcode,  // i : encoding table
+                             int im,                  // i : min index in hcode
+                             int iM,                  // i : max index in hcode
+                             HufDec *hdecod)  //  o: (allocated by caller)
+//     decoding table [HUF_DECSIZE]
+{
+  // Init hashtable & loop on all codes.
+  // Assumes that hufClearDecTable(hdecod) has already been called.
+  // Returns false on an inconsistent table; long-code lists allocated up
+  // to that point stay in hdecod for the caller to free.
+
+  for (; im <= iM; im++) {
+    long long c = hufCode(hcode[im]);  // code bits of symbol im
+    int l = hufLength(hcode[im]);      // code length in bits (0 = unused)
+
+    if (c >> l) {
+      //
+      // Error: c is supposed to be an l-bit code,
+      // but c contains a value that is greater
+      // than the largest l-bit number.
+      //
+
+      // invalidTableEntry();
+      return false;
+    }
+
+    if (l > HUF_DECBITS) {
+      // Long code: add a secondary entry.  The table is indexed by the
+      // first HUF_DECBITS bits of the code, and the entry collects the
+      // symbols of all long codes that start with those bits.
+
+      HufDec *pl = hdecod + (c >> (l - HUF_DECBITS));
+
+      if (pl->len) {
+        //
+        // Error: a short code has already
+        // been stored in table entry *pl.
+        //
+
+        // invalidTableEntry();
+        return false;
+      }
+
+      pl->lit++;  // for long-code entries lit counts the symbols in p
+
+      if (pl->p) {
+        int *p = pl->p;
+        pl->p = new int[pl->lit];  // grow the list by one element
+
+        for (int i = 0; i < pl->lit - 1; ++i) pl->p[i] = p[i];
+
+        delete[] p;
+      } else {
+        pl->p = new int[1];
+      }
+
+      pl->p[pl->lit - 1] = im;
+    } else if (l) {
+      // Short code: init all primary entries.  Every HUF_DECBITS-bit index
+      // that begins with this code (2^(HUF_DECBITS - l) of them) is given
+      // the same length and symbol.
+
+      HufDec *pl = hdecod + (c << (HUF_DECBITS - l));
+
+      for (long long i = 1ULL << (HUF_DECBITS - l); i > 0; i--, pl++) {
+        if (pl->len || pl->p) {
+          //
+          // Error: a short code or a long code has
+          // already been stored in table entry *pl.
+          //
+
+          // invalidTableEntry();
+          return false;
+        }
+
+        pl->len = l;
+        pl->lit = im;
+      }
+    }
+  }
+
+  return true;
+}
+
+//
+// Free the long code entries of a decoding table built by hufBuildDecTable()
+//
+
+static void hufFreeDecTable(HufDec *hdecod)  // io: Decoding table
+{
+  // Release each entry's long-code array, if any, and clear the pointer.
+  for (HufDec *entry = hdecod; entry != hdecod + HUF_DECSIZE; ++entry) {
+    if (entry->p == NULL) continue;
+    delete[] entry->p;
+    entry->p = NULL;
+  }
+}
+
+//
+// ENCODING
+//
+
+inline void outputCode(long long code, long long &c, int &lc, char *&out) {
+  outputBits(static_cast<int>(code & 63), code >> 6, c, lc, out);  // len, bits
+}
+
+inline void sendCode(long long sCode, int runCount, long long runCode,
+                     long long &c, int &lc, char *&out) {
+  // Emit the symbol sCode followed by runCount repetitions of it.  When
+  // the run-length form (sCode, runCode, 8-bit runCount) takes fewer bits
+  // than hufLength(sCode) * runCount, use it; otherwise write sCode
+  // runCount + 1 times.
+  const long long symBits = hufLength(sCode);
+  const long long rleBits = symBits + hufLength(runCode) + 8;
+
+  if (rleBits >= symBits * runCount) {
+    for (int k = 0; k <= runCount; ++k) outputCode(sCode, c, lc, out);
+    return;
+  }
+  outputCode(sCode, c, lc, out);
+  outputCode(runCode, c, lc, out);
+  outputBits(8, runCount, c, lc, out);
+}
+
+//
+// Encode (compress) ni values based on the Huffman encoding table hcode:
+//
+
+static int hufEncode            // return: output size (in bits)
+    (const long long *hcode,    // i : encoding table
+     const unsigned short *in,  // i : uncompressed input buffer
+     const int ni,              // i : number of values in the input buffer
+     int rlc,                   // i : rl code
+     char *out)                 //  o: compressed output buffer
+{
+  char *const first = out;
+  const long long runCode = hcode[rlc];
+  long long pending = 0;  // bits not yet written to out
+  int pendingBits = 0;    // number of valid bits in pending (LSB)
+  int current = in[0];    // value of the run being collected
+  int repeats = 0;        // repetitions of current seen after its first copy
+
+  //
+  // Walk the input, growing the current run while the value repeats (at
+  // most 255 repetitions) and flushing it as soon as it ends.
+  //
+
+  for (int i = 1; i < ni; ++i) {
+    const int next = in[i];
+
+    if (next == current && repeats < 255) {
+      ++repeats;
+      continue;
+    }
+
+    sendCode(hcode[current], repeats, runCode, pending, pendingBits, out);
+    current = next;
+    repeats = 0;
+  }
+
+  //
+  // Flush the final run, then the left-aligned leftover bits (if any).
+  //
+
+  sendCode(hcode[current], repeats, runCode, pending, pendingBits, out);
+
+  if (pendingBits) *out = (pending << (8 - pendingBits)) & 0xff;
+
+  return (out - first) * 8 + pendingBits;
+}
+
+//
+// DECODING
+//
+
+//
+// In order to force the compiler to inline them,
+// getChar() and getCode() are implemented as macros
+// instead of "inline" functions.
+//
+
+#define getChar(c, lc, in)                   \
+  { /* shift the next input byte into c */   \
+    c = (c << 8) | *(unsigned char *)(in++); \
+    lc += 8; /* c now holds 8 more bits */   \
+  }
+
+#define getCode(po, rlc, c, lc, in, out, oe) \
+  { /* emit symbol po, or expand a run */    \
+    if (po == rlc) {                         \
+      if (lc < 8) getChar(c, lc, in);        \
+      /* next 8 bits are the run count */    \
+      lc -= 8;                               \
+                                             \
+      unsigned char cs = (c >> lc);          \
+      /* the run must end at or before oe */ \
+      if (out + cs > oe) return false;       \
+      /* NOTE(review): out[-1] is read without a check that out was advanced */ \
+      unsigned short s = out[-1];            \
+                                             \
+      while (cs-- > 0) *out++ = s;           \
+    } else if (out < oe) {                   \
+      *out++ = po;                           \
+    } else {                                 \
+      return false;                          \
+    }                                        \
+  }
+
+//
+// Decode (uncompress) ni bits based on encoding & decoding tables:
+//
+
+static bool hufDecode(const long long *hcode,  // i : encoding table
+                      const HufDec *hdecod,    // i : decoding table
+                      const char *in,          // i : compressed input buffer
+                      int ni,                  // i : input size (in bits)
+                      int rlc,                 // i : run-length code
+                      int no,  // i : expected output size (in 16-bit values)
+                      unsigned short *out)  //  o: uncompressed output buffer
+{
+  long long c = 0;  // bit accumulator
+  int lc = 0;       // number of not-yet-decoded bits in c
+  unsigned short *outb = out;
+  unsigned short *oe = out + no;
+  const char *ie = in + (ni + 7) / 8;  // input byte size
+
+  //
+  // Loop on input bytes
+  //
+
+  while (in < ie) {
+    getChar(c, lc, in);
+
+    // Access decoding table: as long as at least HUF_DECBITS bits are
+    // buffered, the leading HUF_DECBITS bits index the table directly.
+    //
+
+    while (lc >= HUF_DECBITS) {
+      const HufDec pl = hdecod[(c >> (lc - HUF_DECBITS)) & HUF_DECMASK];
+
+      if (pl.len) {
+        //
+        // Get short code
+        //
+
+        lc -= pl.len;
+        getCode(pl.lit, rlc, c, lc, in, out, oe);
+      } else {
+        if (!pl.p) {
+          return false;
+        }
+        // invalidCode(); // wrong code
+
+        // Search long code: try each candidate symbol listed in this
+        // entry and compare its full code with the buffered bits.
+        //
+
+        int j;
+
+        for (j = 0; j < pl.lit; j++) {
+          int l = hufLength(hcode[pl.p[j]]);
+
+          while (lc < l && in < ie)  // get more bits
+            getChar(c, lc, in);
+
+          if (lc >= l) {
+            if (hufCode(hcode[pl.p[j]]) ==
+                ((c >> (lc - l)) & (((long long)(1) << l) - 1))) {
+              //
+              // Found : get long code
+              //
+
+              lc -= l;
+              getCode(pl.p[j], rlc, c, lc, in, out, oe);
+              break;
+            }
+          }
+        }
+
+        if (j == pl.lit) {
+          return false;
+          // invalidCode(); // Not found
+        }
+      }
+    }
+  }
+
+  // Get remaining (short) codes.  First drop the (8 - ni) & 7 low bits of
+  // the last input byte that lie beyond the ni-bit stream.
+  //
+
+  int i = (8 - ni) & 7;
+  c >>= i;
+  lc -= i;
+
+  while (lc > 0) {
+    const HufDec pl = hdecod[(c << (HUF_DECBITS - lc)) & HUF_DECMASK];
+
+    if (pl.len) {
+      lc -= pl.len;
+      getCode(pl.lit, rlc, c, lc, in, out, oe);
+    } else {
+      return false;
+      // invalidCode(); // wrong (long) code
+    }
+  }
+
+  if (out - outb != no) {  // fewer values decoded than expected
+    return false;
+  }
+  // notEnoughData ();
+
+  return true;
+}
+
+static void countFrequencies(long long freq[HUF_ENCSIZE],
+                             const unsigned short data[/*n*/], int n) {
+  // Zero the histogram, then tally each of the n input values.
+  memset(freq, 0, sizeof(long long) * HUF_ENCSIZE);
+  for (int k = 0; k < n; ++k) freq[data[k]] += 1;
+}
+
+static void writeUInt(char buf[4], unsigned int i) {
+  // Store i as four bytes, least significant byte first.
+  unsigned char *dst = reinterpret_cast<unsigned char *>(buf);
+  for (int k = 0; k < 4; ++k) {
+    dst[k] = static_cast<unsigned char>(i & 0xffu);
+    i >>= 8;
+  }
+}
+
+static unsigned int readUInt(const char buf[4]) {
+  // Little-endian load; widen first so b[3] << 24 cannot overflow int.
+  const unsigned char *b = (const unsigned char *)buf;
+  return (unsigned int)b[0] | ((unsigned int)b[1] << 8) |
+         ((unsigned int)b[2] << 16) | ((unsigned int)b[3] << 24);
+}
+
+//
+// EXTERNAL INTERFACE
+//
+
+static int hufCompress(const unsigned short raw[], int nRaw,
+                       char compressed[]) {
+  // Huffman-encodes nRaw values; returns the bytes written to compressed.
+  if (nRaw <= 0) return 0;  // nothing to encode: emit an empty stream
+  // On the heap, as in hufUncompress(): ~512 KB is too much for the stack.
+  std::vector<long long> freq(HUF_ENCSIZE);
+  countFrequencies(&freq[0], raw, nRaw);
+
+  int im = 0;
+  int iM = 0;
+  hufBuildEncTable(&freq[0], &im, &iM);  // freq now holds the code table
+
+  char *tableStart = compressed + 20;  // the table follows the 20-byte header
+  char *tableEnd = tableStart;
+  hufPackEncTable(&freq[0], im, iM, &tableEnd);
+  int tableLength = tableEnd - tableStart;
+
+  char *dataStart = tableEnd;
+  int nBits = hufEncode(&freq[0], raw, nRaw, iM, dataStart);
+  int data_length = (nBits + 7) / 8;
+
+  writeUInt(compressed, im);
+  writeUInt(compressed + 4, iM);
+  writeUInt(compressed + 8, tableLength);
+  writeUInt(compressed + 12, nBits);
+  writeUInt(compressed + 16, 0);  // room for future extensions
+
+  return dataStart + data_length - compressed;
+}
+
+static bool hufUncompress(const char compressed[], int nCompressed,
+                          unsigned short raw[], int nRaw) {
+  //
+  // Decode a stream written by hufCompress() into nRaw 16-bit values.
+  // Returns true on success, false if the stream is truncated or malformed.
+  //
+
+  // hufCompress() writes nothing at all for an empty input.
+  if (nCompressed == 0) return nRaw == 0;
+
+  // The fixed header is five 32-bit fields (20 bytes); the table follows.
+  const int headerSize = 20;
+  if (nCompressed < headerSize) return false;
+
+  int im = readUInt(compressed);
+  int iM = readUInt(compressed + 4);
+  // int tableLength = readUInt (compressed + 8);
+  int nBits = readUInt(compressed + 12);
+
+  if (im < 0 || im >= HUF_ENCSIZE || iM < 0 || iM >= HUF_ENCSIZE) return false;
+  if (nBits < 0) return false;
+
+  const char *ptr = compressed + headerSize;
+
+  //
+  // Fast decoder needs at least 2x64-bits of compressed data, and
+  // needs to be run-able on this platform. Otherwise, fall back
+  // to the original decoder
+  // (No fast decoder is called here; the table-driven decoder below is
+  // always used.)
+  //
+
+  std::vector<long long> freq(HUF_ENCSIZE);
+  std::vector<HufDec> hdec(HUF_DECSIZE);
+
+  hufClearDecTable(&hdec.at(0));
+
+  if (!hufUnpackEncTable(&ptr, nCompressed - (ptr - compressed), im, iM,
+                         &freq.at(0))) {
+    return false;  // nothing has been allocated in hdec yet
+  }
+
+  if (nBits > 8 * (nCompressed - (ptr - compressed))) {
+    return false;  // header claims more data bits than the buffer holds
+  }
+
+  //
+  // hufBuildDecTable() may have allocated long-code lists even when it
+  // fails, so the table is always freed before reporting the result.
+  //
+
+  bool ok = hufBuildDecTable(&freq.at(0), im, iM, &hdec.at(0)) &&
+            hufDecode(&freq.at(0), &hdec.at(0), ptr, nBits, iM, nRaw, raw);
+  hufFreeDecTable(&hdec.at(0));
+
+  return ok;
+}
+
+//
+// Functions to compress the range of values in the pixel data
+//
+
+const int USHORT_RANGE = (1 << 16);           // number of distinct 16-bit values
+const int BITMAP_SIZE = (USHORT_RANGE >> 3);  // bytes for one bit per value
+
+static void bitmapFromData(const unsigned short data[/*nData*/], int nData,
+                           unsigned char bitmap[BITMAP_SIZE],
+                           unsigned short &minNonZero,
+                           unsigned short &maxNonZero) {
+  // Start from an empty bitmap and set one bit per value found in data.
+  memset(bitmap, 0, BITMAP_SIZE);
+  for (int k = 0; k < nData; ++k) {
+    const unsigned short v = data[k];
+    bitmap[v >> 3] |= (1 << (v & 7));
+  }
+
+  bitmap[0] &= ~1;  // zero is implicit: data are assumed to contain zeroes
+  // First/last non-empty byte; an empty bitmap leaves min > max.
+  minNonZero = BITMAP_SIZE - 1;
+  maxNonZero = 0;
+  for (int idx = 0; idx < BITMAP_SIZE; ++idx) {
+    if (!bitmap[idx]) continue;
+    if (idx < minNonZero) minNonZero = idx;
+    if (idx > maxNonZero) maxNonZero = idx;
+  }
+}
+
+static unsigned short forwardLutFromBitmap(
+    const unsigned char bitmap[BITMAP_SIZE], unsigned short lut[USHORT_RANGE]) {
+  // Map every value present in the bitmap (and always value 0) to its rank
+  // among the present values; absent values map to 0.
+  int next = 0;
+  for (int v = 0; v < USHORT_RANGE; ++v) {
+    const bool present = (v == 0) || (bitmap[v >> 3] & (1 << (v & 7)));
+    lut[v] = present ? next++ : 0;
+  }
+
+  // Largest value stored in lut[]: the number of mapped values minus one.
+  return next - 1;
+}
+
+static unsigned short reverseLutFromBitmap(
+    const unsigned char bitmap[BITMAP_SIZE], unsigned short lut[USHORT_RANGE]) {
+  // Pack the values present in the bitmap (plus value 0) into the front of
+  // lut in increasing order.
+  int used = 0;
+  for (int v = 0; v < USHORT_RANGE; ++v) {
+    const bool present = (v == 0) || (bitmap[v >> 3] & (1 << (v & 7)));
+    if (present) lut[used++] = v;
+  }
+
+  // Clear the unused tail, then report the index of the last packed entry.
+  for (int k = used; k < USHORT_RANGE; ++k) lut[k] = 0;
+  return used - 1;
+}
+
+static void applyLut(const unsigned short lut[USHORT_RANGE],
+                     unsigned short data[/*nData*/], int nData) {
+  for (int left = nData; left > 0; --left, ++data) *data = lut[*data];
+}
+
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif  // __clang__
+
+static bool CompressPiz(unsigned char *outPtr, unsigned int &outSize,
+                        const unsigned char *inPtr, size_t inSize,
+                        const std::vector<ChannelInfo> &channelInfo,
+                        int data_width, int num_lines) {
+  unsigned char bitmap[BITMAP_SIZE];
+  unsigned short minNonZero;
+  unsigned short maxNonZero;
+  // PIZ-compresses inSize bytes at inPtr into outPtr; outSize gets the size.
+#if !MINIZ_LITTLE_ENDIAN
+  // @todo { PIZ compression on BigEndian architecture. }
+  assert(0);
+  return false;
+#endif
+
+  // Assume `inSize` is multiple of 2 or 4.
+  std::vector<unsigned short> tmpBuffer(inSize / sizeof(unsigned short));
+  // NOTE(review): tmpBuffer.at(0) below throws if inSize < 2 - confirm callers.
+  std::vector<PIZChannelData> channelData(channelInfo.size());
+  unsigned short *tmpBufferEnd = &tmpBuffer.at(0);
+  // Carve tmpBuffer into one contiguous region per channel.
+  for (size_t c = 0; c < channelData.size(); c++) {
+    PIZChannelData &cd = channelData[c];
+
+    cd.start = tmpBufferEnd;
+    cd.end = cd.start;
+
+    cd.nx = data_width;
+    cd.ny = num_lines;
+    // cd.ys = c.channel().ySampling;
+
+    size_t pixelSize = sizeof(int);  // UINT and FLOAT
+    if (channelInfo[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      pixelSize = sizeof(short);
+    }
+    // cd.size is the number of 16-bit words per pixel of this channel.
+    cd.size = static_cast<int>(pixelSize / sizeof(short));
+
+    tmpBufferEnd += cd.nx * cd.ny * cd.size;
+  }
+  // Copy the input, consumed line by line and channel by channel, into the
+  // per-channel regions.
+  const unsigned char *ptr = inPtr;
+  for (int y = 0; y < num_lines; ++y) {
+    for (size_t i = 0; i < channelData.size(); ++i) {
+      PIZChannelData &cd = channelData[i];
+
+      // if (modp (y, cd.ys) != 0)
+      //    continue;
+
+      size_t n = static_cast<size_t>(cd.nx * cd.size);  // 16-bit words per line
+      memcpy(cd.end, ptr, n * sizeof(unsigned short));
+      ptr += n * sizeof(unsigned short);
+      cd.end += n;
+    }
+  }
+  // Range compression: replace each value by its index in the forward LUT.
+  bitmapFromData(&tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()), bitmap,
+                 minNonZero, maxNonZero);
+
+  unsigned short lut[USHORT_RANGE];
+  unsigned short maxValue = forwardLutFromBitmap(bitmap, lut);
+  applyLut(lut, &tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()));
+
+  //
+  // Store range compression info in _outBuffer
+  //
+
+  char *buf = reinterpret_cast<char *>(outPtr);
+
+  memcpy(buf, &minNonZero, sizeof(unsigned short));
+  buf += sizeof(unsigned short);
+  memcpy(buf, &maxNonZero, sizeof(unsigned short));
+  buf += sizeof(unsigned short);
+  // Only bitmap bytes minNonZero..maxNonZero are stored (none if min > max).
+  if (minNonZero <= maxNonZero) {
+    memcpy(buf, reinterpret_cast<char *>(&bitmap[0] + minNonZero),
+           maxNonZero - minNonZero + 1);
+    buf += maxNonZero - minNonZero + 1;
+  }
+
+  //
+  // Apply wavelet encoding
+  //
+
+  for (size_t i = 0; i < channelData.size(); ++i) {
+    PIZChannelData &cd = channelData[i];
+    // Each 16-bit word position j of a pixel is transformed as its own plane.
+    for (int j = 0; j < cd.size; ++j) {
+      wav2Encode(cd.start + j, cd.nx, cd.size, cd.ny, cd.nx * cd.size,
+                 maxValue);
+    }
+  }
+
+  //
+  // Apply Huffman encoding; append the result to _outBuffer
+  //
+
+  // length header(4byte), then huff data. Initialize length header with zero,
+  // then later fill it by `length`.
+  char *lengthPtr = buf;
+  int zero = 0;
+  memcpy(buf, &zero, sizeof(int));
+  buf += sizeof(int);
+
+  int length =
+      hufCompress(&tmpBuffer.at(0), static_cast<int>(tmpBuffer.size()), buf);
+  memcpy(lengthPtr, &length, sizeof(int));
+  // Total size: everything written before the Huffman data plus its length.
+  outSize = static_cast<unsigned int>(
+      (reinterpret_cast<unsigned char *>(buf) - outPtr) +
+      static_cast<unsigned int>(length));
+  return true;
+}
+
+static bool DecompressPiz(unsigned char *outPtr, const unsigned char *inPtr,
+                          size_t tmpBufSize, int num_channels,
+                          const EXRChannelInfo *channels, int data_width,
+                          int num_lines) {
+  unsigned char bitmap[BITMAP_SIZE];
+  unsigned short minNonZero;
+  unsigned short maxNonZero;
+  // Inverse of CompressPiz: Huffman, wavelet, then LUT decoding into outPtr.
+#if !MINIZ_LITTLE_ENDIAN
+  // @todo { PIZ compression on BigEndian architecture. }
+  assert(0);
+  return false;
+#endif
+
+  memset(bitmap, 0, BITMAP_SIZE);
+  // Read header fields with memcpy: inPtr has no alignment guarantee.
+  const unsigned char *ptr = inPtr;
+  memcpy(&minNonZero, ptr, sizeof(unsigned short));
+  memcpy(&maxNonZero, ptr + 2, sizeof(unsigned short));
+  ptr += 4;
+
+  if (maxNonZero >= BITMAP_SIZE) {
+    return false;
+  }
+
+  if (minNonZero <= maxNonZero) {
+    memcpy(reinterpret_cast<char *>(&bitmap[0] + minNonZero), ptr,
+           maxNonZero - minNonZero + 1);
+    ptr += maxNonZero - minNonZero + 1;
+  }
+
+  unsigned short lut[USHORT_RANGE];
+  memset(lut, 0, sizeof(unsigned short) * USHORT_RANGE);
+  unsigned short maxValue = reverseLutFromBitmap(bitmap, lut);
+
+  //
+  // Huffman decoding
+  //
+
+  int length;
+  // The bitmap slice has arbitrary length, so ptr may be misaligned here.
+  memcpy(&length, ptr, sizeof(int));
+  ptr += sizeof(int);
+  // tmpBufSize is used as a count of 16-bit words, not bytes.
+  std::vector<unsigned short> tmpBuffer(tmpBufSize);
+  hufUncompress(reinterpret_cast<const char *>(ptr), length, &tmpBuffer.at(0),
+                static_cast<int>(tmpBufSize));
+
+  //
+  // Wavelet decoding
+  //
+
+  std::vector<PIZChannelData> channelData(static_cast<size_t>(num_channels));
+
+  unsigned short *tmpBufferEnd = &tmpBuffer.at(0);
+  // Each channel owns a contiguous nx * ny * size slice of tmpBuffer.
+  for (size_t i = 0; i < static_cast<size_t>(num_channels); ++i) {
+    const EXRChannelInfo &chan = channels[i];
+
+    size_t pixelSize = sizeof(int);  // UINT and FLOAT
+    if (chan.pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      pixelSize = sizeof(short);
+    }
+
+    channelData[i].start = tmpBufferEnd;
+    channelData[i].end = channelData[i].start;
+    channelData[i].nx = data_width;
+    channelData[i].ny = num_lines;
+    // channelData[i].ys = 1;
+    channelData[i].size = static_cast<int>(pixelSize / sizeof(short));
+
+    tmpBufferEnd += channelData[i].nx * channelData[i].ny * channelData[i].size;
+  }
+
+  for (size_t i = 0; i < channelData.size(); ++i) {
+    PIZChannelData &cd = channelData[i];
+
+    for (int j = 0; j < cd.size; ++j) {
+      wav2Decode(cd.start + j, cd.nx, cd.size, cd.ny, cd.nx * cd.size,
+                 maxValue);
+    }
+  }
+
+  //
+  // Expand the pixel data to their original range
+  //
+
+  applyLut(lut, &tmpBuffer.at(0), static_cast<int>(tmpBufSize));
+  // Emit rows interleaved by channel: for each line, one row per channel.
+  for (int y = 0; y < num_lines; y++) {
+    for (size_t i = 0; i < channelData.size(); ++i) {
+      PIZChannelData &cd = channelData[i];
+
+      // if (modp (y, cd.ys) != 0)
+      //    continue;
+
+      size_t n = static_cast<size_t>(cd.nx * cd.size);
+      memcpy(outPtr, cd.end, static_cast<size_t>(n * sizeof(unsigned short)));
+      outPtr += n * sizeof(unsigned short);
+      cd.end += n;
+    }
+  }
+
+  return true;
+}
+#endif  // TINYEXR_USE_PIZ
+
+#if TINYEXR_USE_ZFP
+struct ZFPCompressionParam {
+  double rate;       // handed to zfp_stream_set_rate()
+  int precision;     // handed to zfp_stream_set_precision()
+  double tolerance;  // handed to zfp_stream_set_accuracy()
+  int type;          // TINYEXR_ZFP_COMPRESSIONTYPE_*
+
+  // Defaults: rate-type compression with rate 2.0; other fields zeroed.
+  ZFPCompressionParam()
+      : rate(2.0),
+        precision(0),
+        tolerance(0.0),
+        type(TINYEXR_ZFP_COMPRESSIONTYPE_RATE) {}
+};
+
+bool FindZFPCompressionParam(ZFPCompressionParam *param,
+                             const EXRAttribute *attributes,
+                             int num_attributes) {
+  bool foundType = false;
+  // Fills *param from the "zfpCompression*" attributes; false if incomplete.
+  for (int i = 0; i < num_attributes; i++) {
+    if ((strcmp(attributes[i].name, "zfpCompressionType") == 0) &&
+        (attributes[i].size == 1)) {
+      param->type = static_cast<int>(attributes[i].value[0]);
+
+      foundType = true;
+    }
+  }
+
+  if (!foundType) {
+    return false;
+  }
+  // Values are copied with memcpy: the byte buffer may not be aligned.
+  if (param->type == TINYEXR_ZFP_COMPRESSIONTYPE_RATE) {
+    for (int i = 0; i < num_attributes; i++) {
+      if ((strcmp(attributes[i].name, "zfpCompressionRate") == 0) &&
+          (attributes[i].size == 8)) {
+        memcpy(&param->rate, attributes[i].value, sizeof(double));
+        return true;
+      }
+    }
+  } else if (param->type == TINYEXR_ZFP_COMPRESSIONTYPE_PRECISION) {
+    for (int i = 0; i < num_attributes; i++) {
+      if ((strcmp(attributes[i].name, "zfpCompressionPrecision") == 0) &&
+          (attributes[i].size == 4)) {
+        memcpy(&param->precision, attributes[i].value, sizeof(int));
+        return true;
+      }
+    }
+  } else if (param->type == TINYEXR_ZFP_COMPRESSIONTYPE_ACCURACY) {
+    for (int i = 0; i < num_attributes; i++) {
+      if ((strcmp(attributes[i].name, "zfpCompressionTolerance") == 0) &&
+          (attributes[i].size == 8)) {
+        memcpy(&param->tolerance, attributes[i].value, sizeof(double));
+        return true;
+      }
+    }
+  } else {
+    assert(0);
+  }
+  // The type was found but its matching value attribute was not.
+  return false;
+}
+
+// Assume pixel format is FLOAT for all channels.
+static bool DecompressZfp(float *dst, int dst_width, int dst_num_lines,
+                          int num_channels, const unsigned char *src,
+                          unsigned long src_size,
+                          const ZFPCompressionParam &param) {
+  // Decodes src in 4x4 blocks into dst, one full channel plane after another.
+  // Returns false (after asserting) if a dimension is not a multiple of 4.
+  zfp_stream *zfp = NULL;
+  zfp_field *field = NULL;
+
+  assert((dst_width % 4) == 0);
+  assert((dst_num_lines % 4) == 0);
+
+  if ((dst_width & 3U) || (dst_num_lines & 3U)) {
+    return false;
+  }
+
+  field =
+      zfp_field_2d(reinterpret_cast<void *>(const_cast<unsigned char *>(src)),
+                   zfp_type_float, dst_width, dst_num_lines * num_channels);
+  zfp = zfp_stream_open(NULL);
+
+  if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_RATE) {
+    zfp_stream_set_rate(zfp, param.rate, zfp_type_float, /* dimention */ 2,
+                        /* write random access */ 0);
+  } else if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_PRECISION) {
+    zfp_stream_set_precision(zfp, param.precision, zfp_type_float);
+  } else if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_ACCURACY) {
+    zfp_stream_set_accuracy(zfp, param.tolerance, zfp_type_float);
+  } else {
+    assert(0);
+  }
+  // Size buf for the larger of zfp's bound and src_size so the copy fits.
+  size_t buf_size = zfp_stream_maximum_size(zfp, field);
+  std::vector<unsigned char> buf(buf_size < src_size ? src_size : buf_size);
+  memcpy(&buf.at(0), src, src_size);
+
+  bitstream *stream = stream_open(&buf.at(0), buf_size);
+  zfp_stream_set_bit_stream(zfp, stream);
+  zfp_stream_rewind(zfp);
+
+  size_t image_size = dst_width * dst_num_lines;
+
+  for (int c = 0; c < num_channels; c++) {
+    // decompress 4x4 pixel block.
+    for (int y = 0; y < dst_num_lines; y += 4) {
+      for (int x = 0; x < dst_width; x += 4) {
+        float fblock[16];
+        zfp_decode_block_float_2(zfp, fblock);
+        for (int j = 0; j < 4; j++) {
+          for (int i = 0; i < 4; i++) {
+            dst[c * image_size + ((y + j) * dst_width + (x + i))] =
+                fblock[j * 4 + i];
+          }
+        }
+      }
+    }
+  }
+
+  zfp_field_free(field);
+  zfp_stream_close(zfp);
+  stream_close(stream);
+
+  return true;
+}
+
+// Assume pixel format is FLOAT for all channels.
+bool CompressZfp(std::vector<unsigned char> *outBuf, unsigned int *outSize,
+                 const float *inPtr, int width, int num_lines, int num_channels,
+                 const ZFPCompressionParam &param) {
+  zfp_stream *zfp = NULL;
+  zfp_field *field = NULL;
+  // Encodes inPtr (channel planes back to back) in 4x4 blocks into *outBuf.
+  assert((width % 4) == 0);
+  assert((num_lines % 4) == 0);
+
+  if ((width & 3U) || (num_lines & 3U)) {
+    return false;
+  }
+
+  // create input array.
+  field = zfp_field_2d(reinterpret_cast<void *>(const_cast<float *>(inPtr)),
+                       zfp_type_float, width, num_lines * num_channels);
+
+  zfp = zfp_stream_open(NULL);
+
+  if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_RATE) {
+    zfp_stream_set_rate(zfp, param.rate, zfp_type_float, 2, 0);
+  } else if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_PRECISION) {
+    zfp_stream_set_precision(zfp, param.precision, zfp_type_float);
+  } else if (param.type == TINYEXR_ZFP_COMPRESSIONTYPE_ACCURACY) {
+    zfp_stream_set_accuracy(zfp, param.tolerance, zfp_type_float);
+  } else {
+    assert(0);
+  }
+  // The field is only needed to obtain the maximum compressed size.
+  size_t buf_size = zfp_stream_maximum_size(zfp, field);
+
+  outBuf->resize(buf_size);
+
+  bitstream *stream = stream_open(&outBuf->at(0), buf_size);
+  zfp_stream_set_bit_stream(zfp, stream);
+  zfp_field_free(field);
+
+  size_t image_size = width * num_lines;
+
+  for (int c = 0; c < num_channels; c++) {
+    // compress 4x4 pixel block.
+    for (int y = 0; y < num_lines; y += 4) {
+      for (int x = 0; x < width; x += 4) {
+        float fblock[16];
+        for (int j = 0; j < 4; j++) {
+          for (int i = 0; i < 4; i++) {
+            fblock[j * 4 + i] =
+                inPtr[c * image_size + ((y + j) * width + (x + i))];
+          }
+        }
+        zfp_encode_block_float_2(zfp, fblock);
+      }
+    }
+  }
+
+  zfp_stream_flush(zfp);
+  (*outSize) = zfp_stream_compressed_size(zfp);
+  // Closing the zfp stream leaves the bit stream open; release it as well.
+  zfp_stream_close(zfp);
+  stream_close(stream);
+  return true;
+}
+
+#endif
+
+//
+// -----------------------------------------------------------------
+//
+// Decodes one block of pixel data into the per-channel planes of out_images.
+static void DecodePixelData(/* out */ unsigned char **out_images,
+                            const int *requested_pixel_types,
+                            const unsigned char *data_ptr, size_t data_len,
+                            int compression_type, int line_order, int width,
+                            int height, int x_stride, int y, int line_no,
+                            int num_lines, size_t pixel_data_size,
+                            size_t num_attributes,
+                            const EXRAttribute *attributes, size_t num_channels,
+                            const EXRChannelInfo *channels,
+                            const std::vector<size_t> &channel_offset_list) {
+  if (compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {  // PIZ
+#if TINYEXR_USE_PIZ
+    // Allocate original data size.
+    std::vector<unsigned char> outBuf(static_cast<size_t>(
+        static_cast<size_t>(width * num_lines) * pixel_data_size));
+    size_t tmpBufLen = static_cast<size_t>(
+        static_cast<size_t>(width * num_lines) * pixel_data_size);
+
+    bool ret = tinyexr::DecompressPiz(
+        reinterpret_cast<unsigned char *>(&outBuf.at(0)), data_ptr, tmpBufLen,
+        static_cast<int>(num_channels), channels, width, num_lines);
+
+    assert(ret);
+    (void)ret;
+
+    // For PIZ_COMPRESSION, outBuf is laid out scanline by scanline:
+    //   pixel sample data for channel 0 for scanline 0
+    //   pixel sample data for channel 1 for scanline 0
+    //   pixel sample data for channel ... for scanline 0
+    //   pixel sample data for channel n for scanline 0
+    //   pixel sample data for channel 0 for scanline 1
+    //   ...
+    // Each row of outBuf holds `width` samples per channel (outBuf is sized
+    // from `width`), so source offsets use `width`; `x_stride` is only the
+    // row pitch of the destination images.
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            FP16 hf;
+
+            hf.u = line_ptr[u];
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+            if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+              unsigned short *image =
+                  reinterpret_cast<unsigned short **>(out_images)[c];
+              if (line_order == 0) {  // rows written in decode order
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {  // rows written bottom-up (vertically flipped)
+                image += static_cast<size_t>(
+                             (height - 1 - (line_no + static_cast<int>(v)))) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = hf.u;
+            } else {  // HALF -> FLOAT
+              FP32 f32 = half_to_float(hf);
+              float *image = reinterpret_cast<float **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += static_cast<size_t>(
+                             (height - 1 - (line_no + static_cast<int>(v)))) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = f32.f;
+            }
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+        assert(requested_pixel_types[c] == TINYEXR_PIXELTYPE_UINT);
+
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned int *line_ptr = reinterpret_cast<unsigned int *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            unsigned int val = line_ptr[u];
+
+            tinyexr::swap4(&val);
+
+            unsigned int *image =
+                reinterpret_cast<unsigned int **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += static_cast<size_t>(
+                           (height - 1 - (line_no + static_cast<int>(v)))) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        assert(requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT);
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const float *line_ptr = reinterpret_cast<float *>(&outBuf.at(
+              v * pixel_data_size * static_cast<size_t>(width) +
+              channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            float val = line_ptr[u];
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            float *image = reinterpret_cast<float **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += static_cast<size_t>(
+                           (height - 1 - (line_no + static_cast<int>(v)))) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else {
+        assert(0);
+      }
+    }
+#else
+    assert(0 && "PIZ is disabled in this build");
+#endif
+
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_ZIPS ||
+             compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+    // Allocate original data size.
+    std::vector<unsigned char> outBuf(static_cast<size_t>(width) *
+                                      static_cast<size_t>(num_lines) *
+                                      pixel_data_size);
+
+    unsigned long dstLen = outBuf.size();
+    assert(dstLen > 0);
+    tinyexr::DecompressZip(reinterpret_cast<unsigned char *>(&outBuf.at(0)),
+                           &dstLen, data_ptr,
+                           static_cast<unsigned long>(data_len));
+
+    // For ZIP_COMPRESSION:
+    //   pixel sample data for channel 0 for scanline 0
+    //   pixel sample data for channel 1 for scanline 0
+    //   pixel sample data for channel ... for scanline 0
+    //   pixel sample data for channel n for scanline 0
+    //   pixel sample data for channel 0 for scanline 1
+    //   pixel sample data for channel 1 for scanline 1
+    //   pixel sample data for channel ... for scanline 1
+    //   pixel sample data for channel n for scanline 1
+    //   ...
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+              &outBuf.at(v * static_cast<size_t>(pixel_data_size) *
+                             static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            tinyexr::FP16 hf;
+
+            hf.u = line_ptr[u];
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+            if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+              unsigned short *image =
+                  reinterpret_cast<unsigned short **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += (static_cast<size_t>(height) - 1U -
+                          (static_cast<size_t>(line_no) + v)) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = hf.u;
+            } else {  // HALF -> FLOAT
+              tinyexr::FP32 f32 = half_to_float(hf);
+              float *image = reinterpret_cast<float **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += (static_cast<size_t>(height) - 1U -
+                          (static_cast<size_t>(line_no) + v)) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = f32.f;
+            }
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+        assert(requested_pixel_types[c] == TINYEXR_PIXELTYPE_UINT);
+
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned int *line_ptr = reinterpret_cast<unsigned int *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            unsigned int val = line_ptr[u];
+
+            tinyexr::swap4(&val);
+
+            unsigned int *image =
+                reinterpret_cast<unsigned int **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        assert(requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT);
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const float *line_ptr = reinterpret_cast<float *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            float val = line_ptr[u];
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            float *image = reinterpret_cast<float **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else {
+        assert(0);
+      }
+    }
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_RLE) {
+    // Allocate original data size.
+    std::vector<unsigned char> outBuf(static_cast<size_t>(width) *
+                                      static_cast<size_t>(num_lines) *
+                                      pixel_data_size);
+
+    unsigned long dstLen = outBuf.size();
+    assert(dstLen > 0);
+    tinyexr::DecompressRle(reinterpret_cast<unsigned char *>(&outBuf.at(0)),
+                           dstLen, data_ptr,
+                           static_cast<unsigned long>(data_len));
+
+    // For RLE_COMPRESSION:
+    //   pixel sample data for channel 0 for scanline 0
+    //   pixel sample data for channel 1 for scanline 0
+    //   pixel sample data for channel ... for scanline 0
+    //   pixel sample data for channel n for scanline 0
+    //   pixel sample data for channel 0 for scanline 1
+    //   pixel sample data for channel 1 for scanline 1
+    //   pixel sample data for channel ... for scanline 1
+    //   pixel sample data for channel n for scanline 1
+    //   ...
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+              &outBuf.at(v * static_cast<size_t>(pixel_data_size) *
+                             static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            tinyexr::FP16 hf;
+
+            hf.u = line_ptr[u];
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+            if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+              unsigned short *image =
+                  reinterpret_cast<unsigned short **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += (static_cast<size_t>(height) - 1U -
+                          (static_cast<size_t>(line_no) + v)) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = hf.u;
+            } else {  // HALF -> FLOAT
+              tinyexr::FP32 f32 = half_to_float(hf);
+              float *image = reinterpret_cast<float **>(out_images)[c];
+              if (line_order == 0) {
+                image += (static_cast<size_t>(line_no) + v) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              } else {
+                image += (static_cast<size_t>(height) - 1U -
+                          (static_cast<size_t>(line_no) + v)) *
+                             static_cast<size_t>(x_stride) +
+                         u;
+              }
+              *image = f32.f;
+            }
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+        assert(requested_pixel_types[c] == TINYEXR_PIXELTYPE_UINT);
+
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const unsigned int *line_ptr = reinterpret_cast<unsigned int *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            unsigned int val = line_ptr[u];
+
+            tinyexr::swap4(&val);
+
+            unsigned int *image =
+                reinterpret_cast<unsigned int **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        assert(requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT);
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const float *line_ptr = reinterpret_cast<float *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            float val = line_ptr[u];
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            float *image = reinterpret_cast<float **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else {
+        assert(0);
+      }
+    }
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+#if TINYEXR_USE_ZFP
+    tinyexr::ZFPCompressionParam zfp_compression_param;
+    if (!FindZFPCompressionParam(&zfp_compression_param, attributes,
+                                 num_attributes)) {
+      assert(0);
+      return;
+    }
+
+    // Allocate original data size.
+    std::vector<unsigned char> outBuf(static_cast<size_t>(width) *
+                                      static_cast<size_t>(num_lines) *
+                                      pixel_data_size);
+
+    unsigned long dstLen = outBuf.size();
+    assert(dstLen > 0);
+    tinyexr::DecompressZfp(reinterpret_cast<float *>(&outBuf.at(0)), width,
+                           num_lines, num_channels, data_ptr,
+                           static_cast<unsigned long>(data_len),
+                           zfp_compression_param);
+
+    // For ZFP_COMPRESSION:
+    //   pixel sample data for channel 0 for scanline 0
+    //   pixel sample data for channel 1 for scanline 0
+    //   pixel sample data for channel ... for scanline 0
+    //   pixel sample data for channel n for scanline 0
+    //   pixel sample data for channel 0 for scanline 1
+    //   pixel sample data for channel 1 for scanline 1
+    //   pixel sample data for channel ... for scanline 1
+    //   pixel sample data for channel n for scanline 1
+    //   ...
+    for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+      assert(channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT);
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        assert(requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT);
+        for (size_t v = 0; v < static_cast<size_t>(num_lines); v++) {
+          const float *line_ptr = reinterpret_cast<float *>(
+              &outBuf.at(v * pixel_data_size * static_cast<size_t>(width) +
+                         channel_offset_list[c] * static_cast<size_t>(width)));
+          for (size_t u = 0; u < static_cast<size_t>(width); u++) {
+            float val = line_ptr[u];
+
+            tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+            float *image = reinterpret_cast<float **>(out_images)[c];
+            if (line_order == 0) {
+              image += (static_cast<size_t>(line_no) + v) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            } else {
+              image += (static_cast<size_t>(height) - 1U -
+                        (static_cast<size_t>(line_no) + v)) *
+                           static_cast<size_t>(x_stride) +
+                       u;
+            }
+            *image = val;
+          }
+        }
+      } else {
+        assert(0);
+      }
+    }
+#else
+    (void)attributes;
+    (void)num_attributes;
+    (void)num_channels;
+    assert(0);
+#endif
+  } else if (compression_type == TINYEXR_COMPRESSIONTYPE_NONE) {
+    for (size_t c = 0; c < num_channels; c++) {
+      if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+        const unsigned short *line_ptr =
+            reinterpret_cast<const unsigned short *>(
+                data_ptr +  // byte offset honours mixed channel types
+                channel_offset_list[c] * static_cast<size_t>(width));
+
+        if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+          unsigned short *outLine =
+              reinterpret_cast<unsigned short *>(out_images[c]);
+          if (line_order == 0) {
+            outLine += y * x_stride;
+          } else {
+            outLine += (height - 1 - y) * x_stride;
+          }
+
+          for (int u = 0; u < width; u++) {
+            tinyexr::FP16 hf;
+
+            hf.u = line_ptr[u];
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+            outLine[u] = hf.u;
+          }
+        } else if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT) {
+          float *outLine = reinterpret_cast<float *>(out_images[c]);
+          if (line_order == 0) {
+            outLine += y * x_stride;
+          } else {
+            outLine += (height - 1 - y) * x_stride;
+          }
+
+          for (int u = 0; u < width; u++) {
+            tinyexr::FP16 hf;
+
+            hf.u = line_ptr[u];
+
+            tinyexr::swap2(reinterpret_cast<unsigned short *>(&hf.u));
+
+            tinyexr::FP32 f32 = half_to_float(hf);
+
+            outLine[u] = f32.f;
+          }
+        } else {
+          assert(0);
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+        const float *line_ptr = reinterpret_cast<const float *>(
+            data_ptr + channel_offset_list[c] * static_cast<size_t>(width));
+
+        float *outLine = reinterpret_cast<float *>(out_images[c]);
+        if (line_order == 0) {
+          outLine += y * x_stride;
+        } else {
+          outLine += (height - 1 - y) * x_stride;
+        }
+
+        for (int u = 0; u < width; u++) {
+          float val = line_ptr[u];
+
+          tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+          outLine[u] = val;
+        }
+      } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+        const unsigned int *line_ptr = reinterpret_cast<const unsigned int *>(
+            data_ptr + channel_offset_list[c] * static_cast<size_t>(width));
+
+        unsigned int *outLine = reinterpret_cast<unsigned int *>(out_images[c]);
+        if (line_order == 0) {
+          outLine += y * x_stride;
+        } else {
+          outLine += (height - 1 - y) * x_stride;
+        }
+
+        for (int u = 0; u < width; u++) {
+          unsigned int val = line_ptr[u];
+
+          tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+          outLine[u] = val;
+        }
+      }
+    }
+  }
+}
+
+static void DecodeTiledPixelData(
+    unsigned char **out_images, int *width, int *height,
+    const int *requested_pixel_types, const unsigned char *data_ptr,
+    size_t data_len, int compression_type, int line_order, int data_width,
+    int data_height, int tile_offset_x, int tile_offset_y, int tile_size_x,
+    int tile_size_y, size_t pixel_data_size, size_t num_attributes,
+    const EXRAttribute *attributes, size_t num_channels,
+    const EXRChannelInfo *channels,
+    const std::vector<size_t> &channel_offset_list) {
+  // Decodes one tile into `out_images` and reports the tile's actual extent
+  // through `*width` / `*height`. `tile_offset_x/y` are tile indices (they
+  // are multiplied by the tile size), not pixel coordinates.
+  const int origin_x = tile_offset_x * tile_size_x;
+  const int origin_y = tile_offset_y * tile_size_y;
+  assert(origin_x < data_width);
+  assert(origin_y < data_height);
+
+  // Tiles on the right/bottom border are clipped to the data window; every
+  // other tile covers the full tile size.
+  const bool clipped_x = (tile_offset_x + 1) * tile_size_x >= data_width;
+  const bool clipped_y = (tile_offset_y + 1) * tile_size_y >= data_height;
+  (*width) = clipped_x ? (data_width - origin_x) : tile_size_x;
+  (*height) = clipped_y ? (data_height - origin_y) : tile_size_y;
+
+  // The tile is decoded with a row stride of `tile_size_x`, starting at row
+  // 0; `*height` is passed as the number of rows to decode.
+  DecodePixelData(out_images, requested_pixel_types, data_ptr, data_len,
+                  compression_type, line_order, (*width), tile_size_y,
+                  /* stride */ tile_size_x, /* y */ 0, /* line_no */ 0,
+                  (*height), pixel_data_size, num_attributes, attributes,
+                  num_channels, channels, channel_offset_list);
+}
+
+static void ComputeChannelLayout(std::vector<size_t> *channel_offset_list,
+                                 int *pixel_data_size, size_t *channel_offset,
+                                 int num_channels,
+                                 const EXRChannelInfo *channels) {
+  // For each channel, stores the running byte offset in the list and adds
+  // the sample size to both `*pixel_data_size` and `*channel_offset`.
+  channel_offset_list->resize(static_cast<size_t>(num_channels));
+  (*pixel_data_size) = 0;
+  (*channel_offset) = 0;
+  for (size_t idx = 0; idx < static_cast<size_t>(num_channels); ++idx) {
+    (*channel_offset_list)[idx] = (*channel_offset);
+    size_t sample_bytes = 0;  // stays 0 when the pixel type is unknown
+    if (channels[idx].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      sample_bytes = sizeof(unsigned short);
+    } else if (channels[idx].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+      sample_bytes = sizeof(float);
+    } else if (channels[idx].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+      sample_bytes = sizeof(unsigned int);
+    } else {
+      assert(0);
+    }
+    (*pixel_data_size) += sample_bytes;
+    (*channel_offset) += sample_bytes;
+  }
+}
+
+static unsigned char **AllocateImage(int num_channels,
+                                     const EXRChannelInfo *channels,
+                                     const int *requested_pixel_types,
+                                     int data_width, int data_height) {
+  // Allocates a table of `num_channels` pointers and, per channel, an
+  // uninitialised malloc() buffer of `data_width * data_height` samples.
+  // HALF channels use `unsigned short` samples, or `float` when
+  // `requested_pixel_types[c]` is FLOAT; FLOAT and UINT channels keep their
+  // own type. Returns NULL if the table cannot be allocated. A channel with
+  // an unsupported type combination gets a NULL slot (after asserting), so
+  // the table never holds an indeterminate pointer.
+  const size_t channel_count = static_cast<size_t>(num_channels);
+  unsigned char **images = static_cast<unsigned char **>(
+      malloc(sizeof(unsigned char *) * channel_count));
+  if (images == NULL) {
+    return NULL;
+  }
+
+  // Every channel holds the same number of samples; compute it once.
+  const size_t num_samples =
+      static_cast<size_t>(data_width) * static_cast<size_t>(data_height);
+
+  for (size_t c = 0; c < channel_count; c++) {
+    size_t sample_size = 0;  // stays 0 for unsupported combinations
+    if (channels[c].pixel_type == TINYEXR_PIXELTYPE_HALF) {
+      if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+        sample_size = sizeof(unsigned short);
+      } else if (requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT) {
+        sample_size = sizeof(float);
+      }
+    } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_FLOAT) {
+      sample_size = sizeof(float);
+    } else if (channels[c].pixel_type == TINYEXR_PIXELTYPE_UINT) {
+      sample_size = sizeof(unsigned int);
+    }
+    assert(sample_size != 0);
+    images[c] = (sample_size == 0) ? NULL
+        : static_cast<unsigned char *>(malloc(sample_size * num_samples));
+  }
+
+  return images;
+}
+
+// Parses one EXR header (a list of attributes terminated by a NUL byte) from
+// `buf[0..size)` into `info`. Sets `*empty_header` (if non-NULL) to true when
+// a multipart file's header list ends immediately. Attributes decoded here
+// must carry at least the bytes they are read as; a shorter payload yields
+// TINYEXR_ERROR_INVALID_DATA instead of an out-of-range access. Returns
+// TINYEXR_SUCCESS, TINYEXR_ERROR_INVALID_DATA, TINYEXR_ERROR_INVALID_HEADER
+// (required attribute missing) or TINYEXR_ERROR_UNSUPPORTED_FORMAT.
+static int ParseEXRHeader(HeaderInfo *info, bool *empty_header,
+                          const EXRVersion *version, std::string *err,
+                          const unsigned char *buf, size_t size) {
+  const char *marker = reinterpret_cast<const char *>(&buf[0]);
+
+  if (empty_header) {
+    (*empty_header) = false;
+  }
+
+  if (version->multipart) {
+    if (size > 0 && marker[0] == '\0') {
+      // End of header list.
+      if (empty_header) {
+        (*empty_header) = true;
+      }
+      return TINYEXR_SUCCESS;
+    }
+  }
+
+  // Attributes the OpenEXR spec requires in every header (name: type):
+  //   channels: chlist, compression: compression, dataWindow: box2i,
+  //   displayWindow: box2i, lineOrder: lineOrder, pixelAspectRatio: float,
+  //   screenWindowCenter: v2f, screenWindowWidth: float
+  bool has_channels = false;
+  bool has_compression = false;
+  bool has_data_window = false;
+  bool has_display_window = false;
+  bool has_line_order = false;
+  bool has_pixel_aspect_ratio = false;
+  bool has_screen_window_center = false;
+  bool has_screen_window_width = false;
+
+  info->data_window[0] = 0;
+  info->data_window[1] = 0;
+  info->data_window[2] = 0;
+  info->data_window[3] = 0;
+  info->line_order = 0;  // @fixme
+  info->display_window[0] = 0;
+  info->display_window[1] = 0;
+  info->display_window[2] = 0;
+  info->display_window[3] = 0;
+  info->screen_window_center[0] = 0.0f;
+  info->screen_window_center[1] = 0.0f;
+  info->screen_window_width = -1.0f;
+  info->pixel_aspect_ratio = -1.0f;
+
+  info->tile_size_x = -1;
+  info->tile_size_y = -1;
+  info->tile_level_mode = -1;
+  info->tile_rounding_mode = -1;
+
+  info->attributes.clear();
+
+  // Read attributes
+  size_t orig_size = size;
+  for (;;) {
+    if (0 == size) {
+      return TINYEXR_ERROR_INVALID_DATA;
+    } else if (marker[0] == '\0') {
+      size--;
+      break;
+    }
+
+    std::string attr_name;
+    std::string attr_type;
+    std::vector<unsigned char> data;
+    size_t marker_size;
+    if (!tinyexr::ReadAttribute(&attr_name, &attr_type, &data, &marker_size,
+                                marker, size)) {
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+    marker += marker_size;
+    size -= marker_size;
+
+    if (version->tiled && attr_name.compare("tiles") == 0) {
+      unsigned int x_size, y_size;
+      unsigned char tile_mode;
+      if (data.size() < 9) return TINYEXR_ERROR_INVALID_DATA;  // 4 + 4 + 1
+      memcpy(&x_size, &data.at(0), sizeof(int));
+      memcpy(&y_size, &data.at(4), sizeof(int));
+      tile_mode = data[8];
+      tinyexr::swap4(&x_size);
+      tinyexr::swap4(&y_size);
+      info->tile_size_x = static_cast<int>(x_size);
+      info->tile_size_y = static_cast<int>(y_size);
+
+      // mode = levelMode + roundingMode * 16
+      info->tile_level_mode = tile_mode & 0x3;
+      info->tile_rounding_mode = (tile_mode >> 4) & 0x1;
+    } else if (attr_name.compare("compression") == 0) {
+      if (data.empty()) return TINYEXR_ERROR_INVALID_DATA;
+      // Types below PIZ are always accepted; PIZ/ZFP depend on TINYEXR_USE_*.
+      bool ok = (data[0] >= TINYEXR_COMPRESSIONTYPE_NONE) &&
+                (data[0] < TINYEXR_COMPRESSIONTYPE_PIZ);
+
+      if (data[0] == TINYEXR_COMPRESSIONTYPE_PIZ) {
+#if TINYEXR_USE_PIZ
+        ok = true;
+#else
+        if (err) {
+          (*err) = "PIZ compression is not supported.";
+        }
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+#endif
+      }
+
+      if (data[0] == TINYEXR_COMPRESSIONTYPE_ZFP) {
+#if TINYEXR_USE_ZFP
+        ok = true;
+#else
+        if (err) {
+          (*err) = "ZFP compression is not supported.";
+        }
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+#endif
+      }
+
+      if (!ok) {
+        if (err) {
+          (*err) = "Unknown compression type.";
+        }
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+      }
+
+      info->compression_type = static_cast<int>(data[0]);
+      has_compression = true;
+    } else if (attr_name.compare("channels") == 0) {
+      // name: zero-terminated string, from 1 to 255 bytes long
+      // pixel type: int, possible values are: UINT = 0 HALF = 1 FLOAT = 2
+      // pLinear: unsigned char, possible values are 0 and 1
+      // reserved: three chars, should be zero
+      // xSampling: int
+      // ySampling: int
+
+      ReadChannelInfo(info->channels, data);
+
+      if (info->channels.size() < 1) {
+        if (err) {
+          (*err) = "# of channels is zero.";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+      has_channels = true;
+    } else if (attr_name.compare("dataWindow") == 0) {
+      if (data.size() < 16) return TINYEXR_ERROR_INVALID_DATA;  // 4 ints
+      memcpy(&info->data_window[0], &data.at(0), sizeof(int));
+      memcpy(&info->data_window[1], &data.at(4), sizeof(int));
+      memcpy(&info->data_window[2], &data.at(8), sizeof(int));
+      memcpy(&info->data_window[3], &data.at(12), sizeof(int));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&info->data_window[0]));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&info->data_window[1]));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&info->data_window[2]));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&info->data_window[3]));
+      has_data_window = true;
+    } else if (attr_name.compare("displayWindow") == 0) {
+      if (data.size() < 16) return TINYEXR_ERROR_INVALID_DATA;  // 4 ints
+      memcpy(&info->display_window[0], &data.at(0), sizeof(int));
+      memcpy(&info->display_window[1], &data.at(4), sizeof(int));
+      memcpy(&info->display_window[2], &data.at(8), sizeof(int));
+      memcpy(&info->display_window[3], &data.at(12), sizeof(int));
+      tinyexr::swap4(
+          reinterpret_cast<unsigned int *>(&info->display_window[0]));
+      tinyexr::swap4(
+          reinterpret_cast<unsigned int *>(&info->display_window[1]));
+      tinyexr::swap4(
+          reinterpret_cast<unsigned int *>(&info->display_window[2]));
+      tinyexr::swap4(
+          reinterpret_cast<unsigned int *>(&info->display_window[3]));
+      has_display_window = true;
+    } else if (attr_name.compare("lineOrder") == 0) {
+      if (data.empty()) return TINYEXR_ERROR_INVALID_DATA;
+      info->line_order = static_cast<int>(data[0]);
+      has_line_order = true;
+    } else if (attr_name.compare("pixelAspectRatio") == 0) {
+      if (data.size() < 4) return TINYEXR_ERROR_INVALID_DATA;  // 1 float
+      memcpy(&info->pixel_aspect_ratio, &data.at(0), sizeof(float));
+      tinyexr::swap4(
+          reinterpret_cast<unsigned int *>(&info->pixel_aspect_ratio));
+      has_pixel_aspect_ratio = true;
+    } else if (attr_name.compare("screenWindowCenter") == 0) {
+      if (data.size() < 8) return TINYEXR_ERROR_INVALID_DATA;  // 2 floats
+      memcpy(&info->screen_window_center[0], &data.at(0), sizeof(float));
+      memcpy(&info->screen_window_center[1], &data.at(4), sizeof(float));
+      tinyexr::swap4(
+          reinterpret_cast<unsigned int *>(&info->screen_window_center[0]));
+      tinyexr::swap4(
+          reinterpret_cast<unsigned int *>(&info->screen_window_center[1]));
+      has_screen_window_center = true;
+    } else if (attr_name.compare("screenWindowWidth") == 0) {
+      if (data.size() < 4) return TINYEXR_ERROR_INVALID_DATA;  // 1 float
+      memcpy(&info->screen_window_width, &data.at(0), sizeof(float));
+      tinyexr::swap4(
+          reinterpret_cast<unsigned int *>(&info->screen_window_width));
+      has_screen_window_width = true;
+    } else if (attr_name.compare("chunkCount") == 0) {
+      if (data.size() < 4) return TINYEXR_ERROR_INVALID_DATA;  // 1 int
+      memcpy(&info->chunk_count, &data.at(0), sizeof(int));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&info->chunk_count));
+    } else {
+      // Custom attribute(up to TINYEXR_MAX_ATTRIBUTES)
+      if (info->attributes.size() < TINYEXR_MAX_ATTRIBUTES) {
+        EXRAttribute attrib;
+        strncpy(attrib.name, attr_name.c_str(), 255);
+        attrib.name[255] = '\0';
+        strncpy(attrib.type, attr_type.c_str(), 255);
+        attrib.type[255] = '\0';
+        attrib.value = static_cast<unsigned char *>(malloc(data.size()));
+        attrib.size = attrib.value ? static_cast<int>(data.size()) : 0;
+        if (attrib.value != NULL && !data.empty()) {  // empty: no data[0]
+          memcpy(attrib.value, &data[0], data.size());
+        }
+        info->attributes.push_back(attrib);
+      }
+    }
+  }
+
+  // Check if required attributes exist
+  {
+    std::stringstream ss_err;
+
+    if (!has_compression) {
+      ss_err << "\"compression\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_channels) {
+      ss_err << "\"channels\" attribute not found in the header." << std::endl;
+    }
+
+    if (!has_line_order) {
+      ss_err << "\"lineOrder\" attribute not found in the header." << std::endl;
+    }
+
+    if (!has_display_window) {
+      ss_err << "\"displayWindow\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_data_window) {
+      ss_err << "\"dataWindow\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_pixel_aspect_ratio) {
+      ss_err << "\"pixelAspectRatio\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_screen_window_width) {
+      ss_err << "\"screenWindowWidth\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!has_screen_window_center) {
+      ss_err << "\"screenWindowCenter\" attribute not found in the header."
+             << std::endl;
+    }
+
+    if (!(ss_err.str().empty())) {
+      if (err) {
+        (*err) += ss_err.str();
+      }
+      return TINYEXR_ERROR_INVALID_HEADER;
+    }
+  }
+
+  info->header_len = static_cast<unsigned int>(orig_size - size);
+
+  return TINYEXR_SUCCESS;
+}
+
+// Converts the C++ HeaderInfo into the C EXRHeader. Allocates (malloc) the
+// `channels`, `pixel_types` and `requested_pixel_types` arrays; custom
+// attribute `value` pointers are shared with `info`, not duplicated.
+static void ConvertHeader(EXRHeader *exr_header, const HeaderInfo &info) {
+  exr_header->pixel_aspect_ratio = info.pixel_aspect_ratio;
+  exr_header->screen_window_center[0] = info.screen_window_center[0];
+  exr_header->screen_window_center[1] = info.screen_window_center[1];
+  exr_header->screen_window_width = info.screen_window_width;
+  exr_header->chunk_count = info.chunk_count;
+  exr_header->display_window[0] = info.display_window[0];
+  exr_header->display_window[1] = info.display_window[1];
+  exr_header->display_window[2] = info.display_window[2];
+  exr_header->display_window[3] = info.display_window[3];
+  exr_header->data_window[0] = info.data_window[0];
+  exr_header->data_window[1] = info.data_window[1];
+  exr_header->data_window[2] = info.data_window[2];
+  exr_header->data_window[3] = info.data_window[3];
+  exr_header->line_order = info.line_order;
+  exr_header->compression_type = info.compression_type;
+
+  exr_header->tile_size_x = info.tile_size_x;
+  exr_header->tile_size_y = info.tile_size_y;
+  exr_header->tile_level_mode = info.tile_level_mode;
+  exr_header->tile_rounding_mode = info.tile_rounding_mode;
+
+  exr_header->num_channels = static_cast<int>(info.channels.size());
+  const size_t num_channels = static_cast<size_t>(exr_header->num_channels);
+  exr_header->channels = static_cast<EXRChannelInfo *>(
+      malloc(sizeof(EXRChannelInfo) * num_channels));
+  for (size_t c = 0; c < num_channels; c++) {
+    strncpy(exr_header->channels[c].name, info.channels[c].name.c_str(), 255);
+    // manually add '\0' for safety.
+    exr_header->channels[c].name[255] = '\0';
+    exr_header->channels[c].pixel_type = info.channels[c].pixel_type;
+    exr_header->channels[c].p_linear = info.channels[c].p_linear;
+    exr_header->channels[c].x_sampling = info.channels[c].x_sampling;
+    exr_header->channels[c].y_sampling = info.channels[c].y_sampling;
+  }
+
+  exr_header->pixel_types =
+      static_cast<int *>(malloc(sizeof(int) * num_channels));
+  // `requested_pixel_types` initially mirrors `pixel_types`.
+  exr_header->requested_pixel_types =
+      static_cast<int *>(malloc(sizeof(int) * num_channels));
+  for (size_t c = 0; c < num_channels; c++) {
+    exr_header->pixel_types[c] = info.channels[c].pixel_type;
+    exr_header->requested_pixel_types[c] = info.channels[c].pixel_type;
+  }
+
+  // ParseEXRHeader stores up to TINYEXR_MAX_ATTRIBUTES entries, so a full
+  // table (size == TINYEXR_MAX_ATTRIBUTES) is valid and must not trip here.
+  assert(info.attributes.size() <= TINYEXR_MAX_ATTRIBUTES);
+  exr_header->num_custom_attributes = static_cast<int>(info.attributes.size());
+
+  for (size_t i = 0; i < info.attributes.size(); i++) {
+    memcpy(exr_header->custom_attributes[i].name, info.attributes[i].name, 256);
+    memcpy(exr_header->custom_attributes[i].type, info.attributes[i].type, 256);
+    exr_header->custom_attributes[i].size = info.attributes[i].size;
+    // Just copy pointer
+    exr_header->custom_attributes[i].value = info.attributes[i].value;
+  }
+
+  exr_header->header_len = info.header_len;
+}
+
+// Decodes every chunk listed in `offsets` (byte offsets from `head`) into
+// `exr_image`: one EXRTile per chunk for tiled files, otherwise one image
+// buffer per channel filled scanline block by scanline block. Also copies
+// `requested_pixel_types` over `exr_header->pixel_types` before returning.
+static int DecodeChunk(EXRImage *exr_image, const EXRHeader *exr_header,
+                       const std::vector<tinyexr::tinyexr_uint64> &offsets,
+                       const unsigned char *head) {
+  int num_channels = exr_header->num_channels;
+
+  int num_scanline_blocks = 1;
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+    num_scanline_blocks = 16;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    num_scanline_blocks = 32;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    num_scanline_blocks = 16;
+  }
+
+  int data_width = exr_header->data_window[2] - exr_header->data_window[0] + 1;
+  int data_height = exr_header->data_window[3] - exr_header->data_window[1] + 1;
+
+  size_t num_blocks = offsets.size();
+
+  std::vector<size_t> channel_offset_list;
+  int pixel_data_size = 0;
+  size_t channel_offset = 0;
+  tinyexr::ComputeChannelLayout(&channel_offset_list, &pixel_data_size,
+                                &channel_offset, num_channels,
+                                exr_header->channels);
+
+  if (exr_header->tiled) {
+    if (exr_header->tile_size_x <= 0 || exr_header->tile_size_y <= 0) {
+      return TINYEXR_ERROR_INVALID_DATA;  // no usable `tiles` attribute
+    }
+    size_t num_tiles = offsets.size();  // = # of blocks
+
+    exr_image->tiles = static_cast<EXRTile *>(
+        malloc(sizeof(EXRTile) * static_cast<size_t>(num_tiles)));
+    for (size_t tile_idx = 0; tile_idx < num_tiles; tile_idx++) {
+      // Each tile owns a tile-sized buffer: rows are written with a stride
+      // of tile_size_x, so sizing it by the data window could overflow.
+      exr_image->tiles[tile_idx].images = tinyexr::AllocateImage(
+          num_channels, exr_header->channels, exr_header->requested_pixel_types,
+          exr_header->tile_size_x, exr_header->tile_size_y);
+
+      // 16 byte: tile coordinates
+      // 4 byte : data size
+      // ~      : data(uncompressed or compressed)
+      const unsigned char *data_ptr =
+          reinterpret_cast<const unsigned char *>(head + offsets[tile_idx]);
+
+      int tile_coordinates[4];
+      memcpy(tile_coordinates, data_ptr, sizeof(int) * 4);
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&tile_coordinates[0]));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&tile_coordinates[1]));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&tile_coordinates[2]));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&tile_coordinates[3]));
+
+      // @todo{ LoD }
+      assert(tile_coordinates[2] == 0);
+      assert(tile_coordinates[3] == 0);
+
+      int data_len;
+      memcpy(&data_len, data_ptr + 16,
+             sizeof(int));  // 16 = sizeof(tile_coordinates)
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&data_len));
+      assert(data_len >= 4);
+
+      // Move to data addr: 20 = 16 + 4;
+      data_ptr += 20;
+
+      tinyexr::DecodeTiledPixelData(
+          exr_image->tiles[tile_idx].images,
+          &(exr_image->tiles[tile_idx].width),
+          &(exr_image->tiles[tile_idx].height),
+          exr_header->requested_pixel_types, data_ptr,
+          static_cast<size_t>(data_len), exr_header->compression_type,
+          exr_header->line_order, data_width, data_height, tile_coordinates[0],
+          tile_coordinates[1], exr_header->tile_size_x, exr_header->tile_size_y,
+          static_cast<size_t>(pixel_data_size),
+          static_cast<size_t>(exr_header->num_custom_attributes),
+          exr_header->custom_attributes,
+          static_cast<size_t>(exr_header->num_channels), exr_header->channels,
+          channel_offset_list);
+
+      exr_image->tiles[tile_idx].offset_x = tile_coordinates[0];
+      exr_image->tiles[tile_idx].offset_y = tile_coordinates[1];
+      exr_image->tiles[tile_idx].level_x = tile_coordinates[2];
+      exr_image->tiles[tile_idx].level_y = tile_coordinates[3];
+    }
+    exr_image->num_tiles = static_cast<int>(num_tiles);
+  } else {  // scanline format
+    exr_image->images = tinyexr::AllocateImage(
+        num_channels, exr_header->channels, exr_header->requested_pixel_types,
+        data_width, data_height);
+
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+    for (int y = 0; y < static_cast<int>(num_blocks); y++) {
+      size_t y_idx = static_cast<size_t>(y);
+      const unsigned char *data_ptr =
+          reinterpret_cast<const unsigned char *>(head + offsets[y_idx]);
+      // 4 byte: scan line
+      // 4 byte: data size
+      // ~     : pixel data(uncompressed or compressed)
+      int line_no;
+      memcpy(&line_no, data_ptr, sizeof(int));
+      int data_len;
+      memcpy(&data_len, data_ptr + 4, sizeof(int));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&line_no));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&data_len));
+
+      int end_line_no = (std::min)(line_no + num_scanline_blocks,
+                                   (exr_header->data_window[3] + 1));
+
+      int num_lines = end_line_no - line_no;
+      assert(num_lines > 0);
+
+      // Move to data addr: 8 = 4 + 4;
+      data_ptr += 8;
+
+      // Adjust line_no with data_window.bmin.y
+      line_no -= exr_header->data_window[1];
+
+      tinyexr::DecodePixelData(
+          exr_image->images, exr_header->requested_pixel_types, data_ptr,
+          static_cast<size_t>(data_len), exr_header->compression_type,
+          exr_header->line_order, data_width, data_height, data_width, y,
+          line_no, num_lines, static_cast<size_t>(pixel_data_size),
+          static_cast<size_t>(exr_header->num_custom_attributes),
+          exr_header->custom_attributes,
+          static_cast<size_t>(exr_header->num_channels), exr_header->channels,
+          channel_offset_list);
+    }  // omp parallel
+  }
+
+  // Overwrite `pixel_type` with `requested_pixel_type`.
+  for (int c = 0; c < exr_header->num_channels; c++) {
+    exr_header->pixel_types[c] = exr_header->requested_pixel_types[c];
+  }
+
+  exr_image->num_channels = num_channels;
+  exr_image->width = data_width;
+  exr_image->height = data_height;
+
+  return TINYEXR_SUCCESS;
+}
+
+static void ReconstructLineOffsets(std::vector<tinyexr::tinyexr_uint64> *offsets, size_t n, const unsigned char *head, const unsigned char *marker, const size_t size)
+{
+  // Rebuilds the chunk offset table by walking the chunks sequentially from
+  // `marker`; each chunk is [4-byte y][4-byte data_len][data_len bytes].
+  assert(head < marker);
+  assert(offsets->size() == n);
+
+  const unsigned char *cursor = marker;
+  for (size_t idx = 0; idx < n; ++idx) {
+    const size_t chunk_start = static_cast<size_t>(cursor - head);
+    assert(chunk_start < size); // Offset should not exceed whole EXR file/data size.
+
+    int scanline_y;
+    int payload_len;
+    memcpy(&scanline_y, cursor, sizeof(int));
+    memcpy(&payload_len, cursor + 4, sizeof(unsigned int));
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&scanline_y));
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&payload_len));
+
+    (*offsets)[idx] = chunk_start;
+    cursor += payload_len + 8; // 8 = 4 bytes(y) + 4 bytes(data_len)
+  }
+}
+
+// Reads the chunk offset table at `marker` and decodes every chunk into
+// `exr_image`. `head` is the start of the EXR data and `size` its length; a
+// bad data window, tile size or offset gives TINYEXR_ERROR_INVALID_DATA.
+static int DecodeEXRImage(EXRImage *exr_image, const EXRHeader *exr_header,
+                          const unsigned char *head,
+                          const unsigned char *marker, const size_t size, const char **err) {
+  if (exr_image == NULL || exr_header == NULL || head == NULL ||
+      marker == NULL || (size <= tinyexr::kEXRVersionSize)) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  int num_scanline_blocks = 1;
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    num_scanline_blocks = 32;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZIP ||
+             exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    num_scanline_blocks = 16;
+  }
+
+  int data_width = exr_header->data_window[2] - exr_header->data_window[0] + 1;
+  int data_height = exr_header->data_window[3] - exr_header->data_window[1] + 1;
+  const bool tiles_ok = !exr_header->tiled || (exr_header->tile_size_x > 0 &&
+                                                exr_header->tile_size_y > 0);
+  if (data_width <= 0 || data_height <= 0 || !tiles_ok) {
+    if (err) (*err) = "Invalid data window or tile size.";
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  // Number of chunks = number of entries in the offset table.
+  size_t num_blocks;
+  if (exr_header->chunk_count > 0) {
+    num_blocks = static_cast<size_t>(exr_header->chunk_count);
+  } else if (exr_header->tiled) {
+    // @todo { LoD }
+    const size_t tile_w = static_cast<size_t>(exr_header->tile_size_x);
+    const size_t tile_h = static_cast<size_t>(exr_header->tile_size_y);
+    num_blocks = ((static_cast<size_t>(data_width) + tile_w - 1) / tile_w) *
+                 ((static_cast<size_t>(data_height) + tile_h - 1) / tile_h);
+  } else {
+    const size_t lines = static_cast<size_t>(num_scanline_blocks);
+    num_blocks = (static_cast<size_t>(data_height) + lines - 1) / lines;
+  }
+
+  // The whole offset table (8 bytes per chunk) must lie inside the buffer.
+  const size_t table_pos = static_cast<size_t>(marker - head);
+  if (table_pos > size ||
+      num_blocks > (size - table_pos) / sizeof(tinyexr::tinyexr_uint64)) {
+    if (err) (*err) = "Offset table exceeds the data size.";
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  std::vector<tinyexr::tinyexr_uint64> offsets(num_blocks);
+  for (size_t y = 0; y < num_blocks; y++) {
+    memcpy(&offsets[y], marker, sizeof(tinyexr::tinyexr_uint64));
+    tinyexr::swap8(&offsets[y]);
+    marker += sizeof(tinyexr::tinyexr_uint64);  // = 8
+  }
+
+  // If line offsets are invalid, we try to reconstruct it.
+  // See OpenEXR/IlmImf/ImfScanLineInputFile.cpp::readLineOffsets() for details.
+  for (size_t y = 0; y < num_blocks; y++) {
+    if (offsets[y] <= 0) {
+      // TODO(syoyo) Report "Incomplete lineOffsets." as a warning.
+      ReconstructLineOffsets(&offsets, num_blocks, head, marker, size);
+      break;
+    }
+  }
+
+  // Every chunk must start inside the buffer before DecodeChunk reads it.
+  for (size_t y = 0; y < num_blocks; y++) {
+    if (offsets[y] >= size) {
+      if (err) (*err) = "Chunk offset exceeds the data size.";
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+  }
+
+  return DecodeChunk(exr_image, exr_header, offsets, head);
+}
+
+}  // namespace tinyexr
+
+// Loads a single-part, scanline EXR file as interleaved RGBA floats.
+// On success `*out_rgba` points to a malloc()ed array of
+// `4 * width * height` floats that the caller must free(), and `*width` /
+// `*height` hold the image size. HALF channels are read as FLOAT and a
+// missing "A" channel is filled with 1.0; "R", "G" and "B" are required.
+// Returns TINYEXR_SUCCESS or a TINYEXR_ERROR_* code; the header and image
+// loaded so far are released before an error is returned.
+int LoadEXR(float **out_rgba, int *width, int *height, const char *filename,
+            const char **err) {
+  if (out_rgba == NULL || width == NULL || height == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.\n";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  EXRVersion exr_version;
+  EXRImage exr_image;
+  EXRHeader exr_header;
+  InitEXRHeader(&exr_header);
+  InitEXRImage(&exr_image);
+
+  {
+    int ret = ParseEXRVersionFromFile(&exr_version, filename);
+    if (ret != TINYEXR_SUCCESS) {
+      return ret;
+    }
+
+    if (exr_version.multipart || exr_version.non_image) {
+      if (err) {
+        (*err) = "Loading multipart or DeepImage is not supported yet.\n";
+      }
+      return TINYEXR_ERROR_INVALID_DATA;  // @fixme.
+    }
+    // Tiled files are decoded into tiles, so there are no planes to read.
+    if (exr_version.tiled) {
+      if (err) {
+        (*err) = "Loading tiled image is not supported yet.\n";
+      }
+      return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+    }
+  }
+
+  {
+    int ret = ParseEXRHeaderFromFile(&exr_header, &exr_version, filename, err);
+    if (ret != TINYEXR_SUCCESS) {
+      FreeEXRHeader(&exr_header);
+      return ret;
+    }
+  }
+
+  // Read HALF channel as FLOAT.
+  for (int i = 0; i < exr_header.num_channels; i++) {
+    if (exr_header.pixel_types[i] == TINYEXR_PIXELTYPE_HALF) {
+      exr_header.requested_pixel_types[i] = TINYEXR_PIXELTYPE_FLOAT;
+    }
+  }
+
+  {
+    int ret = LoadEXRImageFromFile(&exr_image, &exr_header, filename, err);
+    if (ret != TINYEXR_SUCCESS) {
+      FreeEXRHeader(&exr_header);
+      return ret;
+    }
+  }
+
+  // RGBA
+  int idxR = -1;
+  int idxG = -1;
+  int idxB = -1;
+  int idxA = -1;
+  for (int c = 0; c < exr_header.num_channels; c++) {
+    if (strcmp(exr_header.channels[c].name, "R") == 0) {
+      idxR = c;
+    } else if (strcmp(exr_header.channels[c].name, "G") == 0) {
+      idxG = c;
+    } else if (strcmp(exr_header.channels[c].name, "B") == 0) {
+      idxB = c;
+    } else if (strcmp(exr_header.channels[c].name, "A") == 0) {
+      idxA = c;
+    }
+  }
+
+  if (idxR == -1 || idxG == -1 || idxB == -1) {
+    if (err) {
+      if (idxR == -1) {
+        (*err) = "R channel not found\n";
+      } else if (idxG == -1) {
+        (*err) = "G channel not found\n";
+      } else {
+        (*err) = "B channel not found\n";
+      }
+    }
+    FreeEXRHeader(&exr_header);
+    FreeEXRImage(&exr_image);
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  float **planes = reinterpret_cast<float **>(exr_image.images);
+  // size_t keeps width * height from overflowing int on large images.
+  const size_t num_pixels = static_cast<size_t>(exr_image.width) *
+                            static_cast<size_t>(exr_image.height);
+  (*out_rgba) = static_cast<float *>(malloc(4 * sizeof(float) * num_pixels));
+  for (size_t i = 0; i < num_pixels; i++) {
+    (*out_rgba)[4 * i + 0] = planes[idxR][i];
+    (*out_rgba)[4 * i + 1] = planes[idxG][i];
+    (*out_rgba)[4 * i + 2] = planes[idxB][i];
+    (*out_rgba)[4 * i + 3] = (idxA != -1) ? planes[idxA][i] : 1.0f;
+  }
+
+  (*width) = exr_image.width;
+  (*height) = exr_image.height;
+
+  FreeEXRHeader(&exr_header);
+  FreeEXRImage(&exr_image);
+
+  return TINYEXR_SUCCESS;
+}
+
+// Parses the header that follows the version block in `memory[0..size)`.
+// `exr_header` is filled from whatever was parsed even when an error code is
+// returned, and `exr_header->tiled` is taken from `version`. On a parse
+// error `*err` may point to a strdup()ed message; other errors use literals.
+int ParseEXRHeaderFromMemory(EXRHeader *exr_header, const EXRVersion *version,
+                             const unsigned char *memory, size_t size,
+                             const char **err) {
+  if (memory == NULL || exr_header == NULL || version == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.\n";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (size < tinyexr::kEXRVersionSize) {
+    if (err) {
+      (*err) = "Data is too short to hold an EXR header.\n";
+    }
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  const unsigned char *marker = memory + tinyexr::kEXRVersionSize;
+  size_t marker_size = size - tinyexr::kEXRVersionSize;
+  tinyexr::HeaderInfo info;
+  info.clear();
+
+  std::string err_str;
+  int ret = ParseEXRHeader(&info, NULL, version, &err_str, marker, marker_size);
+  if (ret != TINYEXR_SUCCESS && err && !err_str.empty()) {
+    (*err) = strdup(err_str.c_str());  // May leak
+  }
+
+  ConvertHeader(exr_header, info);
+
+  // transfer `tiled` from version.
+  exr_header->tiled = version->tiled;
+  return ret;
+}
+
+// Decodes the EXR file held in `memory` (`size` bytes) into interleaved RGBA
+// floats in the caller-provided `out_rgba`, which is assumed to have room
+// for 4 * width * height floats. Alpha is 1.0 when there is no "A" channel.
+// Returns TINYEXR_SUCCESS or a TINYEXR_ERROR_* code; `*err` may be set on
+// failure when `err` is non-NULL.
+int LoadEXRFromMemory(float *out_rgba, const unsigned char *memory, size_t size,
+                      const char **err) {
+  if (out_rgba == NULL || memory == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.\n";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  EXRVersion exr_version;
+  EXRImage exr_image;
+  EXRHeader exr_header;
+
+  InitEXRHeader(&exr_header);
+
+  int ret = ParseEXRVersionFromMemory(&exr_version, memory, size);
+  if (ret != TINYEXR_SUCCESS) {
+    return ret;
+  }
+
+  ret = ParseEXRHeaderFromMemory(&exr_header, &exr_version, memory, size, err);
+  if (ret != TINYEXR_SUCCESS) {
+    FreeEXRHeader(&exr_header);
+    return ret;
+  }
+
+  // The copy loop below reads every plane as float, so request that HALF
+  // channels are decoded as FLOAT.
+  for (int c = 0; c < exr_header.num_channels; c++) {
+    if (exr_header.pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+      exr_header.requested_pixel_types[c] = TINYEXR_PIXELTYPE_FLOAT;
+    }
+  }
+
+  InitEXRImage(&exr_image);
+  ret = LoadEXRImageFromMemory(&exr_image, &exr_header, memory, size, err);
+  if (ret != TINYEXR_SUCCESS) {
+    FreeEXRHeader(&exr_header);
+    return ret;
+  }
+
+  // Locate the RGBA planes by channel name; -1 means "not present".
+  int idxR = -1, idxG = -1, idxB = -1, idxA = -1;
+  for (int c = 0; c < exr_header.num_channels; c++) {
+    if (strcmp(exr_header.channels[c].name, "R") == 0) {
+      idxR = c;
+    } else if (strcmp(exr_header.channels[c].name, "G") == 0) {
+      idxG = c;
+    } else if (strcmp(exr_header.channels[c].name, "B") == 0) {
+      idxB = c;
+    } else if (strcmp(exr_header.channels[c].name, "A") == 0) {
+      idxA = c;
+    }
+  }
+
+  // R, G and B are mandatory; A is optional.
+  const char *missing = NULL;
+  if (idxR == -1) {
+    missing = "R channel not found\n";
+  } else if (idxG == -1) {
+    missing = "G channel not found\n";
+  } else if (idxB == -1) {
+    missing = "B channel not found\n";
+  }
+
+  if (missing == NULL) {
+    float **planes = reinterpret_cast<float **>(exr_image.images);
+    for (int i = 0; i < exr_image.width * exr_image.height; i++) {
+      out_rgba[4 * i + 0] = planes[idxR][i];
+      out_rgba[4 * i + 1] = planes[idxG][i];
+      out_rgba[4 * i + 2] = planes[idxB][i];
+      // Index 0 is a valid alpha channel; only -1 means there is none.
+      out_rgba[4 * i + 3] = (idxA != -1) ? planes[idxA][i] : 1.0f;
+    }
+  } else if (err) {
+    (*err) = missing;
+  }
+
+  // Release the decoded header and image on success and failure alike.
+  FreeEXRHeader(&exr_header);
+  FreeEXRImage(&exr_image);
+  return (missing == NULL) ? TINYEXR_SUCCESS : TINYEXR_ERROR_INVALID_DATA;
+}
+
+// Reads all of `filename` into memory and decodes it into `exr_image` with
+// LoadEXRImageFromMemory(), using the already-parsed `exr_header`. Returns
+// TINYEXR_SUCCESS or a TINYEXR_ERROR_* code; `*err` may be set on failure.
+int LoadEXRImageFromFile(EXRImage *exr_image, const EXRHeader *exr_header,
+                         const char *filename, const char **err) {
+  if (exr_image == NULL || exr_header == NULL || filename == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    if (err) {
+      (*err) = "Cannot read file.";
+    }
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  // Compute size. ftell() returns a negative value on failure.
+  fseek(fp, 0, SEEK_END);
+  const long file_len = ftell(fp);
+  fseek(fp, 0, SEEK_SET);
+  std::vector<unsigned char> buf;  // @todo { use mmap }
+  size_t num_read = 0;
+  if (file_len > 0) {
+    buf.resize(static_cast<size_t>(file_len));
+    num_read = fread(&buf[0], 1, buf.size(), fp);
+  }
+  fclose(fp);
+
+  if (buf.empty() || num_read != buf.size()) {  // empty file or short read
+    if (err) {
+      (*err) = "Cannot read file.";
+    }
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+  return LoadEXRImageFromMemory(exr_image, exr_header, &buf.at(0), buf.size(), err);
+}
+
+// Decodes the EXR pixel data in `memory` into `exr_image` using a pre-parsed
+// header; the data starts header_len + 8 (magic number + version) bytes in.
+int LoadEXRImageFromMemory(EXRImage *exr_image, const EXRHeader *exr_header,
+                           const unsigned char *memory, const size_t size, const char **err) {
+  const char *problem = NULL;
+  if (exr_image == NULL || exr_header == NULL || memory == NULL ||
+      size < tinyexr::kEXRVersionSize) {
+    problem = "Invalid argument.";
+  } else if (exr_header->header_len == 0) {
+    problem = "EXRHeader is not initialized.";
+  } else if (static_cast<size_t>(exr_header->header_len) + 8 > size) {
+    problem = "EXRHeader does not match the given data.";
+  }
+  if (problem != NULL) {
+    if (err) {
+      (*err) = problem;
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+  const unsigned char *marker = memory + exr_header->header_len + 8;
+  return tinyexr::DecodeEXRImage(exr_image, exr_header, memory, marker, size, err);
+}
+
+// Serializes `exr_image` as a scanline EXR file (compression taken from
+// `exr_header`) into a malloc()ed buffer stored in `*memory_out`; the caller
+// must free() it. Returns its size in bytes, or 0 with `*err` set (if `err`
+// is non-NULL) on invalid arguments, unsupported compression or OOM.
+size_t SaveEXRImageToMemory(const EXRImage *exr_image,
+                            const EXRHeader *exr_header,
+                            unsigned char **memory_out, const char **err) {
+  if (exr_image == NULL || memory_out == NULL || exr_header == NULL ||
+      exr_header->compression_type < 0) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return 0;  // @fixme
+  }
+
+#if !TINYEXR_USE_PIZ
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    if (err) {
+      (*err) = "PIZ compression is not supported in this build.";
+    }
+    return 0;
+  }
+#endif
+
+#if !TINYEXR_USE_ZFP
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    if (err) {
+      (*err) = "ZFP compression is not supported in this build.";
+    }
+    return 0;
+  }
+#endif
+  // Only ZFP output requires every channel to be FLOAT.
+#if TINYEXR_USE_ZFP
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    for (size_t i = 0; i < static_cast<size_t>(exr_header->num_channels); i++) {
+      if (exr_header->requested_pixel_types[i] != TINYEXR_PIXELTYPE_FLOAT) {
+        if (err) {
+          (*err) = "Pixel type must be FLOAT for ZFP compression.";
+        }
+        return 0;
+      }
+    }
+  }
+#endif
+
+  std::vector<unsigned char> memory;
+
+  // Header
+  {
+    const char header[] = {0x76, 0x2f, 0x31, 0x01};
+    memory.insert(memory.end(), header, header + 4);
+  }
+
+  // Version, scanline.
+  {
+    char marker[] = {2, 0, 0, 0};
+    // @todo { set marker[1] bits from exr_header: tiled 0x2, long_name 0x4,
+    //         non_image 0x8, multipart 0x10 }
+    memory.insert(memory.end(), marker, marker + 4);
+  }
+
+  int num_scanlines = 1;
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+    num_scanlines = 16;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    num_scanlines = 32;
+  } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    num_scanlines = 16;
+  }
+
+  // Write attributes.
+  std::vector<tinyexr::ChannelInfo> channels;
+  {
+    std::vector<unsigned char> data;
+
+    for (int c = 0; c < exr_header->num_channels; c++) {
+      tinyexr::ChannelInfo info;
+      info.p_linear = 0;
+      info.pixel_type = exr_header->requested_pixel_types[c];
+      info.x_sampling = 1;
+      info.y_sampling = 1;
+      info.name = std::string(exr_header->channels[c].name);
+      channels.push_back(info);
+    }
+
+    tinyexr::WriteChannelInfo(data, channels);
+
+    tinyexr::WriteAttributeToMemory(&memory, "channels", "chlist", &data.at(0),
+                                    static_cast<int>(data.size()));
+  }
+
+  {
+    int comp = exr_header->compression_type;
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&comp));
+    tinyexr::WriteAttributeToMemory(
+        &memory, "compression", "compression",
+        reinterpret_cast<const unsigned char *>(&comp), 1);
+  }
+
+  {
+    int data[4] = {0, 0, exr_image->width - 1, exr_image->height - 1};
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&data[0]));
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&data[1]));
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&data[2]));
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&data[3]));
+    tinyexr::WriteAttributeToMemory(
+        &memory, "dataWindow", "box2i",
+        reinterpret_cast<const unsigned char *>(data), sizeof(int) * 4);
+    tinyexr::WriteAttributeToMemory(
+        &memory, "displayWindow", "box2i",
+        reinterpret_cast<const unsigned char *>(data), sizeof(int) * 4);
+  }
+
+  {
+    unsigned char line_order = 0;  // @fixme { read line_order from EXRHeader }
+    tinyexr::WriteAttributeToMemory(&memory, "lineOrder", "lineOrder",
+                                    &line_order, 1);
+  }
+
+  {
+    float aspectRatio = 1.0f;
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&aspectRatio));
+    tinyexr::WriteAttributeToMemory(
+        &memory, "pixelAspectRatio", "float",
+        reinterpret_cast<const unsigned char *>(&aspectRatio), sizeof(float));
+  }
+
+  {
+    float center[2] = {0.0f, 0.0f};
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&center[0]));
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&center[1]));
+    tinyexr::WriteAttributeToMemory(
+        &memory, "screenWindowCenter", "v2f",
+        reinterpret_cast<const unsigned char *>(center), 2 * sizeof(float));
+  }
+
+  {
+    float w = static_cast<float>(exr_image->width);
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&w));
+    tinyexr::WriteAttributeToMemory(&memory, "screenWindowWidth", "float",
+                                    reinterpret_cast<const unsigned char *>(&w),
+                                    sizeof(float));
+  }
+
+  // Custom attributes
+  if (exr_header->num_custom_attributes > 0) {
+    for (int i = 0; i < exr_header->num_custom_attributes; i++) {
+      tinyexr::WriteAttributeToMemory(
+          &memory, exr_header->custom_attributes[i].name,
+          exr_header->custom_attributes[i].type,
+          reinterpret_cast<const unsigned char *>(
+              exr_header->custom_attributes[i].value),
+          exr_header->custom_attributes[i].size);
+    }
+  }
+
+  {  // end of header
+    unsigned char e = 0;
+    memory.push_back(e);
+  }
+
+  int num_blocks = exr_image->height / num_scanlines;
+  if (num_blocks * num_scanlines < exr_image->height) {
+    num_blocks++;
+  }
+
+  std::vector<tinyexr::tinyexr_uint64> offsets(static_cast<size_t>(num_blocks));
+
+  size_t headerSize = memory.size();
+  tinyexr::tinyexr_uint64 offset =
+      headerSize +
+      static_cast<size_t>(num_blocks) *
+          sizeof(
+              tinyexr::tinyexr_int64);  // sizeof(header) + sizeof(offsetTable)
+
+  std::vector<unsigned char> data;
+
+  std::vector<std::vector<unsigned char> > data_list(
+      static_cast<size_t>(num_blocks));
+  std::vector<size_t> channel_offset_list(
+      static_cast<size_t>(exr_header->num_channels));
+
+  int pixel_data_size = 0;
+  size_t channel_offset = 0;
+  for (size_t c = 0; c < static_cast<size_t>(exr_header->num_channels); c++) {
+    channel_offset_list[c] = channel_offset;
+    if (exr_header->requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+      pixel_data_size += sizeof(unsigned short);
+      channel_offset += sizeof(unsigned short);
+    } else if (exr_header->requested_pixel_types[c] ==
+               TINYEXR_PIXELTYPE_FLOAT) {
+      pixel_data_size += sizeof(float);
+      channel_offset += sizeof(float);
+    } else if (exr_header->requested_pixel_types[c] == TINYEXR_PIXELTYPE_UINT) {
+      pixel_data_size += sizeof(unsigned int);
+      channel_offset += sizeof(unsigned int);
+    } else {
+      assert(0);
+    }
+  }
+
+#if TINYEXR_USE_ZFP
+  tinyexr::ZFPCompressionParam zfp_compression_param;
+
+  // Use ZFP compression parameter from custom attributes(if such a parameter
+  // exists)
+  {
+    bool ret = tinyexr::FindZFPCompressionParam(
+        &zfp_compression_param, exr_header->custom_attributes,
+        exr_header->num_custom_attributes);
+
+    if (!ret) {
+      // Use predefined compression parameter.
+      zfp_compression_param.type = 0;
+      zfp_compression_param.rate = 2;
+    }
+  }
+#endif
+
+// Use signed int since some OpenMP compiler doesn't allow unsigned type for
+// `parallel for`
+#ifdef _OPENMP
+#pragma omp parallel for
+#endif
+  for (int i = 0; i < num_blocks; i++) {
+    size_t ii = static_cast<size_t>(i);
+    int start_y = num_scanlines * i;
+    int endY = (std::min)(num_scanlines * (i + 1), exr_image->height);
+    int h = endY - start_y;
+
+    std::vector<unsigned char> buf(
+        static_cast<size_t>(exr_image->width * h * pixel_data_size));
+
+    for (size_t c = 0; c < static_cast<size_t>(exr_header->num_channels); c++) {
+      if (exr_header->pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+        if (exr_header->requested_pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT) {
+          for (int y = 0; y < h; y++) {
+            for (int x = 0; x < exr_image->width; x++) {
+              tinyexr::FP16 h16;
+              h16.u = reinterpret_cast<unsigned short **>(
+                  exr_image->images)[c][(y + start_y) * exr_image->width + x];
+
+              tinyexr::FP32 f32 = half_to_float(h16);
+
+              tinyexr::swap4(reinterpret_cast<unsigned int *>(&f32.f));
+
+              // Assume increasing Y
+              float *line_ptr = reinterpret_cast<float *>(&buf.at(
+                  static_cast<size_t>(pixel_data_size * y * exr_image->width) +
+                  channel_offset_list[c] *
+                      static_cast<size_t>(exr_image->width)));
+              line_ptr[x] = f32.f;
+            }
+          }
+        } else if (exr_header->requested_pixel_types[c] ==
+                   TINYEXR_PIXELTYPE_HALF) {
+          for (int y = 0; y < h; y++) {
+            for (int x = 0; x < exr_image->width; x++) {
+              unsigned short val = reinterpret_cast<unsigned short **>(
+                  exr_image->images)[c][(y + start_y) * exr_image->width + x];
+
+              tinyexr::swap2(&val);
+
+              // Assume increasing Y
+              unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+                  &buf.at(static_cast<size_t>(pixel_data_size * y *
+                                              exr_image->width) +
+                          channel_offset_list[c] *
+                              static_cast<size_t>(exr_image->width)));
+              line_ptr[x] = val;
+            }
+          }
+        } else {
+          assert(0);
+        }
+
+      } else if (exr_header->pixel_types[c] == TINYEXR_PIXELTYPE_FLOAT) {
+        if (exr_header->requested_pixel_types[c] == TINYEXR_PIXELTYPE_HALF) {
+          for (int y = 0; y < h; y++) {
+            for (int x = 0; x < exr_image->width; x++) {
+              tinyexr::FP32 f32;
+              f32.f = reinterpret_cast<float **>(
+                  exr_image->images)[c][(y + start_y) * exr_image->width + x];
+
+              tinyexr::FP16 h16;
+              h16 = float_to_half_full(f32);
+
+              tinyexr::swap2(reinterpret_cast<unsigned short *>(&h16.u));
+
+              // Assume increasing Y
+              unsigned short *line_ptr = reinterpret_cast<unsigned short *>(
+                  &buf.at(static_cast<size_t>(pixel_data_size * y *
+                                              exr_image->width) +
+                          channel_offset_list[c] *
+                              static_cast<size_t>(exr_image->width)));
+              line_ptr[x] = h16.u;
+            }
+          }
+        } else if (exr_header->requested_pixel_types[c] ==
+                   TINYEXR_PIXELTYPE_FLOAT) {
+          for (int y = 0; y < h; y++) {
+            for (int x = 0; x < exr_image->width; x++) {
+              float val = reinterpret_cast<float **>(
+                  exr_image->images)[c][(y + start_y) * exr_image->width + x];
+
+              tinyexr::swap4(reinterpret_cast<unsigned int *>(&val));
+
+              // Assume increasing Y
+              float *line_ptr = reinterpret_cast<float *>(&buf.at(
+                  static_cast<size_t>(pixel_data_size * y * exr_image->width) +
+                  channel_offset_list[c] *
+                      static_cast<size_t>(exr_image->width)));
+              line_ptr[x] = val;
+            }
+          }
+        } else {
+          assert(0);
+        }
+      } else if (exr_header->pixel_types[c] == TINYEXR_PIXELTYPE_UINT) {
+        for (int y = 0; y < h; y++) {
+          for (int x = 0; x < exr_image->width; x++) {
+            unsigned int val = reinterpret_cast<unsigned int **>(
+                exr_image->images)[c][(y + start_y) * exr_image->width + x];
+
+            tinyexr::swap4(&val);
+
+            // Assume increasing Y
+            unsigned int *line_ptr = reinterpret_cast<unsigned int *>(&buf.at(
+                static_cast<size_t>(pixel_data_size * y * exr_image->width) +
+                channel_offset_list[c] *
+                    static_cast<size_t>(exr_image->width)));
+            line_ptr[x] = val;
+          }
+        }
+      }
+    }
+
+    if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_NONE) {
+      // 4 byte: scan line
+      // 4 byte: data size
+      // ~     : pixel data(uncompressed)
+      std::vector<unsigned char> header(8);
+      unsigned int data_len = static_cast<unsigned int>(buf.size());
+      memcpy(&header.at(0), &start_y, sizeof(int));
+      memcpy(&header.at(4), &data_len, sizeof(unsigned int));
+
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(0)));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(4)));
+
+      data_list[ii].insert(data_list[ii].end(), header.begin(), header.end());
+      data_list[ii].insert(data_list[ii].end(), buf.begin(),
+                           buf.begin() + data_len);
+
+    } else if ((exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZIPS) ||
+               (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZIP)) {
+#if TINYEXR_USE_MINIZ
+      std::vector<unsigned char> block(
+          tinyexr::miniz::mz_compressBound(buf.size()));
+#else
+      std::vector<unsigned char> block(compressBound(buf.size()));
+#endif
+      tinyexr::tinyexr_uint64 outSize = block.size();
+
+      tinyexr::CompressZip(&block.at(0), outSize,
+                           reinterpret_cast<const unsigned char *>(&buf.at(0)),
+                           buf.size());
+
+      // 4 byte: scan line
+      // 4 byte: data size
+      // ~     : pixel data(compressed)
+      std::vector<unsigned char> header(8);
+      unsigned int data_len = static_cast<unsigned int>(outSize);  // truncate
+      memcpy(&header.at(0), &start_y, sizeof(int));
+      memcpy(&header.at(4), &data_len, sizeof(unsigned int));
+
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(0)));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(4)));
+
+      data_list[ii].insert(data_list[ii].end(), header.begin(), header.end());
+      data_list[ii].insert(data_list[ii].end(), block.begin(),
+                           block.begin() + data_len);
+
+    } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_RLE) {
+      // (buf.size() * 3) / 2 would be enough.
+      std::vector<unsigned char> block((buf.size() * 3) / 2);
+
+      tinyexr::tinyexr_uint64 outSize = block.size();
+
+      tinyexr::CompressRle(&block.at(0), outSize,
+                           reinterpret_cast<const unsigned char *>(&buf.at(0)),
+                           buf.size());
+
+      // 4 byte: scan line
+      // 4 byte: data size
+      // ~     : pixel data(compressed)
+      std::vector<unsigned char> header(8);
+      unsigned int data_len = static_cast<unsigned int>(outSize);  // truncate
+      memcpy(&header.at(0), &start_y, sizeof(int));
+      memcpy(&header.at(4), &data_len, sizeof(unsigned int));
+
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(0)));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(4)));
+
+      data_list[ii].insert(data_list[ii].end(), header.begin(), header.end());
+      data_list[ii].insert(data_list[ii].end(), block.begin(),
+                           block.begin() + data_len);
+
+    } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+#if TINYEXR_USE_PIZ
+      unsigned int bufLen =
+          1024 + static_cast<unsigned int>(
+                     1.2 * static_cast<unsigned int>(
+                               buf.size()));  // @fixme { compute good bound. }
+      std::vector<unsigned char> block(bufLen);
+      unsigned int outSize = static_cast<unsigned int>(block.size());
+
+      CompressPiz(&block.at(0), outSize,
+                  reinterpret_cast<const unsigned char *>(&buf.at(0)),
+                  buf.size(), channels, exr_image->width, h);
+
+      // 4 byte: scan line
+      // 4 byte: data size
+      // ~     : pixel data(compressed)
+      std::vector<unsigned char> header(8);
+      unsigned int data_len = outSize;
+      memcpy(&header.at(0), &start_y, sizeof(int));
+      memcpy(&header.at(4), &data_len, sizeof(unsigned int));
+
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(0)));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(4)));
+
+      data_list[ii].insert(data_list[ii].end(), header.begin(), header.end());
+      data_list[ii].insert(data_list[ii].end(), block.begin(),
+                           block.begin() + data_len);
+
+#else
+      assert(0);
+#endif
+    } else if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+#if TINYEXR_USE_ZFP
+      std::vector<unsigned char> block;
+      unsigned int outSize;
+
+      tinyexr::CompressZfp(
+          &block, &outSize, reinterpret_cast<const float *>(&buf.at(0)),
+          exr_image->width, h, exr_header->num_channels, zfp_compression_param);
+
+      // 4 byte: scan line
+      // 4 byte: data size
+      // ~     : pixel data(compressed)
+      std::vector<unsigned char> header(8);
+      unsigned int data_len = outSize;
+      memcpy(&header.at(0), &start_y, sizeof(int));
+      memcpy(&header.at(4), &data_len, sizeof(unsigned int));
+
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(0)));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&header.at(4)));
+
+      data_list[ii].insert(data_list[ii].end(), header.begin(), header.end());
+      data_list[ii].insert(data_list[ii].end(), block.begin(),
+                           block.begin() + data_len);
+
+#else
+      assert(0);
+#endif
+    } else {
+      assert(0);
+    }
+  }  // omp parallel
+
+  for (size_t i = 0; i < static_cast<size_t>(num_blocks); i++) {
+    data.insert(data.end(), data_list[i].begin(), data_list[i].end());
+
+    offsets[i] = offset;
+    tinyexr::swap8(reinterpret_cast<tinyexr::tinyexr_uint64 *>(&offsets[i]));
+    offset += data_list[i].size();
+  }
+
+  {
+    memory.insert(
+        memory.end(), reinterpret_cast<unsigned char *>(&offsets.at(0)),
+        reinterpret_cast<unsigned char *>(&offsets.at(0)) +
+            sizeof(tinyexr::tinyexr_uint64) * static_cast<size_t>(num_blocks));
+  }
+
+  { memory.insert(memory.end(), data.begin(), data.end()); }
+
+  assert(memory.size() > 0);
+
+  (*memory_out) = static_cast<unsigned char *>(malloc(memory.size()));
+  if ((*memory_out) == NULL) {
+    if (err) {
+      (*err) = "Out of memory.";
+    }
+    return 0;
+  }
+  memcpy((*memory_out), &memory.at(0), memory.size());
+
+  return memory.size();  // OK
+}
+
+// Encodes `exr_image` with SaveEXRImageToMemory() and writes the result to
+// `filename`. Returns TINYEXR_SUCCESS, or a TINYEXR_ERROR_* code with `*err`
+// set (when `err` is non-NULL) if the arguments, encoding or writing fail.
+int SaveEXRImageToFile(const EXRImage *exr_image, const EXRHeader *exr_header,
+                       const char *filename, const char **err) {
+  if (exr_image == NULL || filename == NULL || exr_header == NULL ||
+      exr_header->compression_type < 0) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+#if !TINYEXR_USE_PIZ
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_PIZ) {
+    if (err) {
+      (*err) = "PIZ compression is not supported in this build.";
+    }
+    return TINYEXR_ERROR_UNSUPPORTED_FORMAT;  // previously returned 0
+  }
+#endif
+#if !TINYEXR_USE_ZFP
+  if (exr_header->compression_type == TINYEXR_COMPRESSIONTYPE_ZFP) {
+    if (err) {
+      (*err) = "ZFP compression is not supported in this build.";
+    }
+    return TINYEXR_ERROR_UNSUPPORTED_FORMAT;  // previously returned 0
+  }
+#endif
+
+  // Encode before opening the file so a failed encode cannot truncate it.
+  unsigned char *mem = NULL;
+  size_t mem_size = SaveEXRImageToMemory(exr_image, exr_header, &mem, err);
+  if (mem_size == 0 || mem == NULL) {
+    free(mem);
+    return TINYEXR_ERROR_INVALID_ARGUMENT;  // reason is reported through `err`
+  }
+
+  FILE *fp = fopen(filename, "wb");
+  size_t written = fp ? fwrite(mem, 1, mem_size, fp) : 0;
+  // fclose() flushes buffered data, so its failure is a write failure too.
+  if (fp && fclose(fp) != 0) {
+    written = 0;
+  }
+  free(mem);
+  if (written != mem_size) {
+    if (err) {
+      (*err) = "Cannot write a file.";
+    }
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+  return TINYEXR_SUCCESS;
+}
+
+int LoadDeepEXR(DeepImage *deep_image, const char *filename, const char **err) {
+  if (deep_image == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    if (err) {
+      (*err) = "Cannot read file.";
+    }
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  size_t filesize;
+  // Compute size
+  fseek(fp, 0, SEEK_END);
+  filesize = static_cast<size_t>(ftell(fp));
+  fseek(fp, 0, SEEK_SET);
+
+  if (filesize == 0) {
+    fclose(fp);
+    if (err) {
+      (*err) = "File size is zero.";
+    }
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+
+  std::vector<char> buf(filesize);  // @todo { use mmap }
+  {
+    size_t ret;
+    ret = fread(&buf[0], 1, filesize, fp);
+    assert(ret == filesize);
+    (void)ret;
+  }
+  fclose(fp);
+
+  const char *head = &buf[0];
+  const char *marker = &buf[0];
+
+  // Header check.
+  {
+    const char header[] = {0x76, 0x2f, 0x31, 0x01};
+
+    if (memcmp(marker, header, 4) != 0) {
+      if (err) {
+        (*err) = "Invalid magic number.";
+      }
+      return TINYEXR_ERROR_INVALID_MAGIC_NUMBER;
+    }
+    marker += 4;
+  }
+
+  // Version, scanline.
+  {
+    // ver 2.0, scanline, deep bit on(0x800)
+    // must be [2, 8, 0, 0]
+    if (marker[0] != 2 || marker[1] != 8 || marker[2] != 0 || marker[3] != 0) {
+      if (err) {
+        (*err) = "Unsupported version or scanline.";
+      }
+      return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+    }
+
+    marker += 4;
+  }
+
+  int dx = -1;
+  int dy = -1;
+  int dw = -1;
+  int dh = -1;
+  int num_scanline_blocks = 1;  // 16 for ZIP compression.
+  int compression_type = -1;
+  int num_channels = -1;
+  std::vector<tinyexr::ChannelInfo> channels;
+
+  // Read attributes
+  size_t size = filesize - tinyexr::kEXRVersionSize;
+  for (;;) {
+    if (0 == size) {
+      return TINYEXR_ERROR_INVALID_DATA;
+    } else if (marker[0] == '\0') {
+      size--;
+      break;
+    }
+
+    std::string attr_name;
+    std::string attr_type;
+    std::vector<unsigned char> data;
+    size_t marker_size;
+    if (!tinyexr::ReadAttribute(&attr_name, &attr_type, &data, &marker_size,
+                                marker, size)) {
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+    marker += marker_size;
+    size -= marker_size;
+
+    if (attr_name.compare("compression") == 0) {
+      compression_type = data[0];
+      if (compression_type > TINYEXR_COMPRESSIONTYPE_PIZ) {
+        if (err) {
+          (*err) = "Unsupported compression type.";
+        }
+        return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+      }
+
+      if (compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) {
+        num_scanline_blocks = 16;
+      }
+
+    } else if (attr_name.compare("channels") == 0) {
+      // name: zero-terminated string, from 1 to 255 bytes long
+      // pixel type: int, possible values are: UINT = 0 HALF = 1 FLOAT = 2
+      // pLinear: unsigned char, possible values are 0 and 1
+      // reserved: three chars, should be zero
+      // xSampling: int
+      // ySampling: int
+
+      tinyexr::ReadChannelInfo(channels, data);
+
+      num_channels = static_cast<int>(channels.size());
+
+      if (num_channels < 1) {
+        if (err) {
+          (*err) = "Invalid channels format.";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+
+    } else if (attr_name.compare("dataWindow") == 0) {
+      memcpy(&dx, &data.at(0), sizeof(int));
+      memcpy(&dy, &data.at(4), sizeof(int));
+      memcpy(&dw, &data.at(8), sizeof(int));
+      memcpy(&dh, &data.at(12), sizeof(int));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&dx));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&dy));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&dw));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&dh));
+
+    } else if (attr_name.compare("displayWindow") == 0) {
+      int x;
+      int y;
+      int w;
+      int h;
+      memcpy(&x, &data.at(0), sizeof(int));
+      memcpy(&y, &data.at(4), sizeof(int));
+      memcpy(&w, &data.at(8), sizeof(int));
+      memcpy(&h, &data.at(12), sizeof(int));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&x));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&y));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&w));
+      tinyexr::swap4(reinterpret_cast<unsigned int *>(&h));
+    }
+  }
+
+  assert(dx >= 0);
+  assert(dy >= 0);
+  assert(dw >= 0);
+  assert(dh >= 0);
+  assert(num_channels >= 1);
+
+  int data_width = dw - dx + 1;
+  int data_height = dh - dy + 1;
+
+  std::vector<float> image(
+      static_cast<size_t>(data_width * data_height * 4));  // 4 = RGBA
+
+  // Read offset tables.
+  int num_blocks = data_height / num_scanline_blocks;
+  if (num_blocks * num_scanline_blocks < data_height) {
+    num_blocks++;
+  }
+
+  std::vector<tinyexr::tinyexr_int64> offsets(static_cast<size_t>(num_blocks));
+
+  for (size_t y = 0; y < static_cast<size_t>(num_blocks); y++) {
+    tinyexr::tinyexr_int64 offset;
+    memcpy(&offset, marker, sizeof(tinyexr::tinyexr_int64));
+    tinyexr::swap8(reinterpret_cast<tinyexr::tinyexr_uint64 *>(&offset));
+    marker += sizeof(tinyexr::tinyexr_int64);  // = 8
+    offsets[y] = offset;
+  }
+
+#if TINYEXR_USE_PIZ
+  if ((compression_type == TINYEXR_COMPRESSIONTYPE_NONE) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_RLE) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_ZIPS) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_ZIP) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_PIZ)) {
+#else
+  if ((compression_type == TINYEXR_COMPRESSIONTYPE_NONE) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_RLE) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_ZIPS) ||
+      (compression_type == TINYEXR_COMPRESSIONTYPE_ZIP)) {
+#endif
+    // OK
+  } else {
+    if (err) {
+      (*err) = "Unsupported format.";
+    }
+    return TINYEXR_ERROR_UNSUPPORTED_FORMAT;
+  }
+
+  deep_image->image = static_cast<float ***>(
+      malloc(sizeof(float **) * static_cast<size_t>(num_channels)));
+  for (int c = 0; c < num_channels; c++) {
+    deep_image->image[c] = static_cast<float **>(
+        malloc(sizeof(float *) * static_cast<size_t>(data_height)));
+    for (int y = 0; y < data_height; y++) {
+    }
+  }
+
+  deep_image->offset_table = static_cast<int **>(
+      malloc(sizeof(int *) * static_cast<size_t>(data_height)));
+  for (int y = 0; y < data_height; y++) {
+    deep_image->offset_table[y] = static_cast<int *>(
+        malloc(sizeof(int) * static_cast<size_t>(data_width)));
+  }
+
+  for (size_t y = 0; y < static_cast<size_t>(num_blocks); y++) {
+    const unsigned char *data_ptr =
+        reinterpret_cast<const unsigned char *>(head + offsets[y]);
+
+    // int: y coordinate
+    // int64: packed size of pixel offset table
+    // int64: packed size of sample data
+    // int64: unpacked size of sample data
+    // compressed pixel offset table
+    // compressed sample data
+    int line_no;
+    tinyexr::tinyexr_int64 packedOffsetTableSize;
+    tinyexr::tinyexr_int64 packedSampleDataSize;
+    tinyexr::tinyexr_int64 unpackedSampleDataSize;
+    memcpy(&line_no, data_ptr, sizeof(int));
+    memcpy(&packedOffsetTableSize, data_ptr + 4,
+           sizeof(tinyexr::tinyexr_int64));
+    memcpy(&packedSampleDataSize, data_ptr + 12,
+           sizeof(tinyexr::tinyexr_int64));
+    memcpy(&unpackedSampleDataSize, data_ptr + 20,
+           sizeof(tinyexr::tinyexr_int64));
+
+    tinyexr::swap4(reinterpret_cast<unsigned int *>(&line_no));
+    tinyexr::swap8(
+        reinterpret_cast<tinyexr::tinyexr_uint64 *>(&packedOffsetTableSize));
+    tinyexr::swap8(
+        reinterpret_cast<tinyexr::tinyexr_uint64 *>(&packedSampleDataSize));
+    tinyexr::swap8(
+        reinterpret_cast<tinyexr::tinyexr_uint64 *>(&unpackedSampleDataSize));
+
+    std::vector<int> pixelOffsetTable(static_cast<size_t>(data_width));
+
+    // decode pixel offset table.
+    {
+      unsigned long dstLen = pixelOffsetTable.size() * sizeof(int);
+      tinyexr::DecompressZip(
+          reinterpret_cast<unsigned char *>(&pixelOffsetTable.at(0)), &dstLen,
+          data_ptr + 28, static_cast<size_t>(packedOffsetTableSize));
+
+      assert(dstLen == pixelOffsetTable.size() * sizeof(int));
+      for (size_t i = 0; i < static_cast<size_t>(data_width); i++) {
+        deep_image->offset_table[y][i] = pixelOffsetTable[i];
+      }
+    }
+
+    std::vector<unsigned char> sample_data(
+        static_cast<size_t>(unpackedSampleDataSize));
+
+    // decode sample data.
+    {
+      unsigned long dstLen = static_cast<unsigned long>(unpackedSampleDataSize);
+      tinyexr::DecompressZip(
+          reinterpret_cast<unsigned char *>(&sample_data.at(0)), &dstLen,
+          data_ptr + 28 + packedOffsetTableSize,
+          static_cast<size_t>(packedSampleDataSize));
+      assert(dstLen == static_cast<unsigned long>(unpackedSampleDataSize));
+    }
+
+    // decode sample
+    int sampleSize = -1;
+    std::vector<int> channel_offset_list(static_cast<size_t>(num_channels));
+    {
+      int channel_offset = 0;
+      for (size_t i = 0; i < static_cast<size_t>(num_channels); i++) {
+        channel_offset_list[i] = channel_offset;
+        if (channels[i].pixel_type == TINYEXR_PIXELTYPE_UINT) {  // UINT
+          channel_offset += 4;
+        } else if (channels[i].pixel_type == TINYEXR_PIXELTYPE_HALF) {  // half
+          channel_offset += 2;
+        } else if (channels[i].pixel_type ==
+                   TINYEXR_PIXELTYPE_FLOAT) {  // float
+          channel_offset += 4;
+        } else {
+          assert(0);
+        }
+      }
+      sampleSize = channel_offset;
+    }
+    assert(sampleSize >= 2);
+
+    assert(static_cast<size_t>(
+               pixelOffsetTable[static_cast<size_t>(data_width - 1)] *
+               sampleSize) == sample_data.size());
+    int samples_per_line = static_cast<int>(sample_data.size()) / sampleSize;
+
+    //
+    // Alloc memory
+    //
+
+    //
+    // pixel data is stored as image[channels][pixel_samples]
+    //
+    {
+      tinyexr::tinyexr_uint64 data_offset = 0;
+      for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+        deep_image->image[c][y] = static_cast<float *>(
+            malloc(sizeof(float) * static_cast<size_t>(samples_per_line)));
+
+        if (channels[c].pixel_type == 0) {  // UINT
+          for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
+            unsigned int ui = *reinterpret_cast<unsigned int *>(
+                &sample_data.at(data_offset + x * sizeof(int)));
+            deep_image->image[c][y][x] = static_cast<float>(ui);  // @fixme
+          }
+          data_offset +=
+              sizeof(unsigned int) * static_cast<size_t>(samples_per_line);
+        } else if (channels[c].pixel_type == 1) {  // half
+          for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
+            tinyexr::FP16 f16;
+            f16.u = *reinterpret_cast<unsigned short *>(
+                &sample_data.at(data_offset + x * sizeof(short)));
+            tinyexr::FP32 f32 = half_to_float(f16);
+            deep_image->image[c][y][x] = f32.f;
+          }
+          data_offset += sizeof(short) * static_cast<size_t>(samples_per_line);
+        } else {  // float
+          for (size_t x = 0; x < static_cast<size_t>(samples_per_line); x++) {
+            float f = *reinterpret_cast<float *>(
+                &sample_data.at(data_offset + x * sizeof(float)));
+            deep_image->image[c][y][x] = f;
+          }
+          data_offset += sizeof(float) * static_cast<size_t>(samples_per_line);
+        }
+      }
+    }
+  }  // y
+
+  deep_image->width = data_width;
+  deep_image->height = data_height;
+
+  deep_image->channel_names = static_cast<const char **>(
+      malloc(sizeof(const char *) * static_cast<size_t>(num_channels)));
+  for (size_t c = 0; c < static_cast<size_t>(num_channels); c++) {
+#ifdef _WIN32
+    deep_image->channel_names[c] = _strdup(channels[c].name.c_str());
+#else
+    deep_image->channel_names[c] = strdup(channels[c].name.c_str());
+#endif
+  }
+  deep_image->num_channels = num_channels;
+
+  return TINYEXR_SUCCESS;
+}
+
+// Puts *exr_image into an empty state: zero sizes/counts and NULL buffers.
+void InitEXRImage(EXRImage *exr_image) {
+  if (exr_image == NULL) return;
+
+  exr_image->width = 0;
+  exr_image->height = 0;
+  exr_image->num_channels = 0;
+  exr_image->num_tiles = 0;  // was left uninitialized; FreeEXRImage() reads it
+
+  exr_image->images = NULL;
+  exr_image->tiles = NULL;
+}
+
+// Fills *exr_header with zero bytes.
+// A NULL `exr_header` is silently ignored.
+void InitEXRHeader(EXRHeader *exr_header) {
+  if (exr_header != NULL) {
+    memset(exr_header, 0, sizeof(*exr_header));
+  }
+}
+
+// Releases the heap buffers owned by *exr_header: `channels`, `pixel_types`,
+// `requested_pixel_types` and each custom attribute's `value`.
+// Freed pointers are reset to NULL, so calling this twice on the same header
+// is harmless. The EXRHeader object itself is not freed.
+// Returns TINYEXR_ERROR_INVALID_ARGUMENT when `exr_header` is NULL,
+// TINYEXR_SUCCESS otherwise.
+int FreeEXRHeader(EXRHeader *exr_header) {
+  if (exr_header == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  // free(NULL) is a no-op, so no per-pointer guards are needed.
+  free(exr_header->channels);
+  exr_header->channels = NULL;
+  free(exr_header->pixel_types);
+  exr_header->pixel_types = NULL;
+  free(exr_header->requested_pixel_types);
+  exr_header->requested_pixel_types = NULL;
+
+  for (int i = 0; i < exr_header->num_custom_attributes; i++) {
+    free(exr_header->custom_attributes[i].value);
+    exr_header->custom_attributes[i].value = NULL;
+  }
+  return TINYEXR_SUCCESS;
+}
+
+// Frees the pixel buffers owned by *exr_image: the per-channel `images`, each
+// tile's `images`, and the `tiles` array itself (which used to leak). Freed
+// pointers are reset to NULL so a repeated call is harmless.
+// NOTE(review): assumes `tiles` is malloc()-allocated like its contents.
+// Returns TINYEXR_ERROR_INVALID_ARGUMENT if `exr_image` is NULL, else success.
+int FreeEXRImage(EXRImage *exr_image) {
+  if (exr_image == NULL) return TINYEXR_ERROR_INVALID_ARGUMENT;
+
+  if (exr_image->images) {
+    for (int i = 0; i < exr_image->num_channels; i++) {
+      free(exr_image->images[i]);  // free(NULL) is a no-op
+    }
+    free(exr_image->images);
+    exr_image->images = NULL;
+  }
+  if (exr_image->tiles) {
+    for (int tid = 0; tid < exr_image->num_tiles; tid++) {
+      if (exr_image->tiles[tid].images) {
+        for (int i = 0; i < exr_image->num_channels; i++) {
+          free(exr_image->tiles[tid].images[i]);
+        }
+        free(exr_image->tiles[tid].images);
+        exr_image->tiles[tid].images = NULL;
+      }
+    }
+    free(exr_image->tiles);
+    exr_image->tiles = NULL;
+  }
+  return TINYEXR_SUCCESS;
+}
+
+// Loads `filename` into memory and parses its single-part EXR header with
+// ParseEXRHeaderFromMemory(). Returns TINYEXR_ERROR_INVALID_ARGUMENT for a
+// NULL argument, TINYEXR_ERROR_CANT_OPEN_FILE if fopen() fails,
+// TINYEXR_ERROR_INVALID_FILE if the file is empty or cannot be read in full,
+// and otherwise the parser's result. `err` (optional) receives a static
+// message for the failures detected here.
+int ParseEXRHeaderFromFile(EXRHeader *exr_header, const EXRVersion *exr_version,
+                           const char *filename, const char **err) {
+  if (exr_header == NULL || exr_version == NULL || filename == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    if (err) {
+      (*err) = "Cannot read file.";
+    }
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  // ftell() returns -1 on failure; only a positive length sizes the buffer.
+  fseek(fp, 0, SEEK_END);
+  const long file_len = ftell(fp);
+  fseek(fp, 0, SEEK_SET);
+
+  std::vector<unsigned char> buf;  // @todo { use mmap }
+  size_t num_read = 0;
+  if (file_len > 0) {
+    buf.resize(static_cast<size_t>(file_len));
+    num_read = fread(&buf[0], 1, buf.size(), fp);
+  }
+  fclose(fp);
+
+  if (file_len <= 0 || num_read != buf.size()) {
+    if (err) {
+      (*err) = "fread error.";
+    }
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+
+  return ParseEXRHeaderFromMemory(exr_header, exr_version, &buf[0], buf.size(),
+                                  err);
+}
+
+// Parses all part headers of the multipart EXR file image in `memory`
+// (`size` bytes). On success `*exr_headers` is a malloc()ed array of
+// malloc()ed EXRHeader objects and `*num_headers` its length. On failure an
+// error code is returned; `*err` (optional) may then point to a static
+// literal or to a strdup()ed string, which is why it "may leak".
+int ParseEXRMultipartHeaderFromMemory(EXRHeader ***exr_headers,
+                                      int *num_headers,
+                                      const EXRVersion *exr_version,
+                                      const unsigned char *memory, size_t size,
+                                      const char **err) {
+  if (memory == NULL || exr_headers == NULL || num_headers == NULL ||
+      exr_version == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (size < tinyexr::kEXRVersionSize) {
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  const unsigned char *marker = memory + tinyexr::kEXRVersionSize;
+  size_t marker_size = size - tinyexr::kEXRVersionSize;
+
+  std::vector<tinyexr::HeaderInfo> infos;
+  for (;;) {
+    tinyexr::HeaderInfo info;
+    info.clear();
+    std::string err_str;
+    bool empty_header = false;
+    int ret = ParseEXRHeader(&info, &empty_header, exr_version, &err_str,
+                             marker, marker_size);
+
+    if (ret != TINYEXR_SUCCESS) {
+      if (err) {
+        (*err) = strdup(err_str.c_str());  // may leak
+      }
+      return ret;
+    }
+
+    if (empty_header) {
+      break;  // reached the '\0' that terminates the header list
+    }
+
+    // `chunkCount` must exist in the header.
+    if (info.chunk_count == 0) {
+      if (err) {
+        (*err) = "`chunkCount' attribute is not found in the header.";
+      }
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+
+    // Move to the next header. The remaining-byte count handed to the parser
+    // has to shrink together with `marker`; the old code decremented `size`,
+    // which the parser never sees, so later headers could be over-read.
+    if (static_cast<size_t>(info.header_len) > marker_size) {
+      return TINYEXR_ERROR_INVALID_DATA;
+    }
+    infos.push_back(info);
+    marker += info.header_len;
+    marker_size -= static_cast<size_t>(info.header_len);
+  }
+
+  // allocate memory for EXRHeader and create array of EXRHeader pointers.
+  (*exr_headers) =
+      static_cast<EXRHeader **>(malloc(sizeof(EXRHeader *) * infos.size()));
+  for (size_t i = 0; i < infos.size(); i++) {
+    EXRHeader *exr_header = static_cast<EXRHeader *>(malloc(sizeof(EXRHeader)));
+    ConvertHeader(exr_header, infos[i]);
+    exr_header->tiled = exr_version->tiled;  // transfer `tiled` from version.
+    (*exr_headers)[i] = exr_header;
+  }
+
+  (*num_headers) = static_cast<int>(infos.size());
+  return TINYEXR_SUCCESS;
+}
+
+// Reads `filename` completely into memory and forwards the buffer to
+// ParseEXRMultipartHeaderFromMemory(). Returns
+// TINYEXR_ERROR_INVALID_ARGUMENT for a NULL argument,
+// TINYEXR_ERROR_CANT_OPEN_FILE if fopen() fails, TINYEXR_ERROR_INVALID_FILE
+// if the file is empty or cannot be read in full, and otherwise the parser's
+// result. `err` (optional) gets a static message for failures detected here.
+int ParseEXRMultipartHeaderFromFile(EXRHeader ***exr_headers, int *num_headers,
+                                    const EXRVersion *exr_version,
+                                    const char *filename, const char **err) {
+  if (exr_headers == NULL || num_headers == NULL || exr_version == NULL ||
+      filename == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    if (err) {
+      (*err) = "Cannot read file.";
+    }
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  // ftell() returns -1 on failure; only a positive length sizes the buffer.
+  fseek(fp, 0, SEEK_END);
+  const long file_len = ftell(fp);
+  fseek(fp, 0, SEEK_SET);
+
+  std::vector<unsigned char> buf;  // @todo { use mmap }
+  size_t num_read = 0;
+  if (file_len > 0) {
+    buf.resize(static_cast<size_t>(file_len));
+    num_read = fread(&buf[0], 1, buf.size(), fp);
+  }
+  fclose(fp);
+
+  if (file_len <= 0 || num_read != buf.size()) {
+    if (err) {
+      (*err) = "fread error.";
+    }
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+
+  return ParseEXRMultipartHeaderFromMemory(
+      exr_headers, num_headers, exr_version, &buf[0], buf.size(), err);
+}
+
+// Decodes the EXR preamble at `memory`: the 4-byte magic number followed by
+// the version field, whose first byte must be 2 and whose second byte
+// carries the tiled / long-name / non-image / multipart flag bits.
+// Returns TINYEXR_ERROR_INVALID_ARGUMENT if `version` or `memory` is NULL,
+// TINYEXR_ERROR_INVALID_DATA if `size` is below tinyexr::kEXRVersionSize,
+// TINYEXR_ERROR_INVALID_MAGIC_NUMBER / TINYEXR_ERROR_INVALID_EXR_VERSION for
+// a bad preamble, and TINYEXR_SUCCESS with `*version` filled in otherwise.
+int ParseEXRVersionFromMemory(EXRVersion *version, const unsigned char *memory,
+                              size_t size) {
+  if (version == NULL || memory == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (size < tinyexr::kEXRVersionSize) {
+    return TINYEXR_ERROR_INVALID_DATA;
+  }
+
+  const unsigned char *marker = memory;
+
+  // Header check.
+  {
+    const char header[] = {0x76, 0x2f, 0x31, 0x01};
+
+    if (memcmp(marker, header, 4) != 0) {
+      return TINYEXR_ERROR_INVALID_MAGIC_NUMBER;
+    }
+    marker += 4;
+  }
+
+  version->tiled = false;
+  version->long_name = false;
+  version->non_image = false;
+  version->multipart = false;
+
+  // Parse version header: the version number must be 2.
+  if (marker[0] != 2) {
+    return TINYEXR_ERROR_INVALID_EXR_VERSION;
+  }
+
+  version->version = 2;
+
+  if (marker[1] & 0x2) {  // 9th bit
+    version->tiled = true;
+  }
+  if (marker[1] & 0x4) {  // 10th bit
+    version->long_name = true;
+  }
+  if (marker[1] & 0x8) {        // 11th bit
+    version->non_image = true;  // (deep image)
+  }
+  if (marker[1] & 0x10) {  // 12th bit
+    version->multipart = true;
+  }
+
+  return TINYEXR_SUCCESS;
+}
+
+// Reads the leading tinyexr::kEXRVersionSize bytes of `filename` and decodes
+// them with ParseEXRVersionFromMemory().
+//
+//   version   receives the decoded flags; a NULL pointer is rejected by
+//             ParseEXRVersionFromMemory().
+//   filename  path of the EXR file to inspect.
+//
+// Returns TINYEXR_ERROR_INVALID_ARGUMENT if `filename` is NULL,
+// TINYEXR_ERROR_CANT_OPEN_FILE if fopen() fails, TINYEXR_ERROR_INVALID_FILE
+// if fewer than tinyexr::kEXRVersionSize bytes could be read, and otherwise
+// the result of ParseEXRVersionFromMemory().
+int ParseEXRVersionFromFile(EXRVersion *version, const char *filename) {
+  if (filename == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  // Read the version block straight away instead of probing the file size
+  // first: a file that is too short simply yields a short read. The previous
+  // size probe returned early without fclose() and leaked the stream; here
+  // the stream is closed on every path.
+  unsigned char buf[tinyexr::kEXRVersionSize];
+  const size_t num_read = fread(&buf[0], 1, tinyexr::kEXRVersionSize, fp);
+  fclose(fp);
+
+  if (num_read != tinyexr::kEXRVersionSize) {
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+
+  return ParseEXRVersionFromMemory(version, buf, tinyexr::kEXRVersionSize);
+}
+
+// Decodes all `num_parts` parts of the multipart EXR file image at `memory`
+// into `exr_images`, using the previously parsed `exr_headers`.
+// NOTE(review): no buffer size is passed in, so the chunk offsets read from
+// the file cannot be bounds-checked here.
+// Returns TINYEXR_SUCCESS or an error code; `err` (optional) may get a message.
+int LoadEXRMultipartImageFromMemory(EXRImage *exr_images,
+                                    const EXRHeader **exr_headers,
+                                    unsigned int num_parts,
+                                    const unsigned char *memory,
+                                    const char **err) {
+  if (exr_images == NULL || exr_headers == NULL || num_parts == 0 ||
+      memory == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  // compute total header size (a NULL entry counts as uninitialized).
+  size_t total_header_size = 0;
+  for (unsigned int i = 0; i < num_parts; i++) {
+    if (exr_headers[i] == NULL || exr_headers[i]->header_len == 0) {
+      if (err) {
+        (*err) = "EXRHeader is not initialized.";
+      }
+      return TINYEXR_ERROR_INVALID_ARGUMENT;
+    }
+    total_header_size += exr_headers[i]->header_len;
+  }
+
+  const char *marker = reinterpret_cast<const char *>(
+      memory + total_header_size + 4 +
+      4);  // +8 for magic number and version header.
+  marker += 1;  // Skip empty header.
+
+  // NOTE 1:
+  //   In multipart image, There is 'part number' before chunk data.
+  //   4 byte : part number
+  //   4+     : chunk
+  //
+  // NOTE 2:
+  //   EXR spec says 'part number' is 'unsigned long' but actually this is
+  //   'unsigned int(4 bytes)' in OpenEXR implementation...
+  //   http://www.openexr.com/openexrfilelayout.pdf
+
+  // Load chunk offset table.
+  std::vector<std::vector<tinyexr::tinyexr_uint64> > chunk_offset_table_list;
+  for (size_t i = 0; i < static_cast<size_t>(num_parts); i++) {
+    std::vector<tinyexr::tinyexr_uint64> offset_table(
+        static_cast<size_t>(exr_headers[i]->chunk_count));
+    for (size_t c = 0; c < offset_table.size(); c++) {
+      tinyexr::tinyexr_uint64 offset;
+      memcpy(&offset, marker, 8);
+      tinyexr::swap8(&offset);
+      offset_table[c] = offset + 4;  // +4 to skip 'part number'
+      marker += 8;
+    }
+    chunk_offset_table_list.push_back(offset_table);
+  }
+
+  // Decode image.
+  for (size_t i = 0; i < static_cast<size_t>(num_parts); i++) {
+    std::vector<tinyexr::tinyexr_uint64> &offset_table =
+        chunk_offset_table_list[i];
+    // First check 'part number' is identical to 'i'
+    for (size_t c = 0; c < offset_table.size(); c++) {
+      const unsigned char *part_number_addr =
+          memory + offset_table[c] - 4;  // -4 to move to 'part number' field.
+      unsigned int part_no;
+      memcpy(&part_no, part_number_addr, sizeof(unsigned int));  // 4
+      tinyexr::swap4(&part_no);
+      if (part_no != i) {
+        // Malformed input: report it instead of asserting.
+        if (err) {
+          (*err) = "Part number mismatch.";
+        }
+        return TINYEXR_ERROR_INVALID_DATA;
+      }
+    }
+
+    int ret = tinyexr::DecodeChunk(&exr_images[i], exr_headers[i], offset_table,
+                                   memory);
+    if (ret != TINYEXR_SUCCESS) {
+      return ret;
+    }
+  }
+  return TINYEXR_SUCCESS;
+}
+
+// Reads `filename` completely into memory and decodes all parts with
+// LoadEXRMultipartImageFromMemory(). A NULL `filename`, an empty file and a
+// short read are reported as errors instead of being passed on.
+int LoadEXRMultipartImageFromFile(EXRImage *exr_images,
+                                  const EXRHeader **exr_headers,
+                                  unsigned int num_parts, const char *filename,
+                                  const char **err) {
+  if (exr_images == NULL || exr_headers == NULL || num_parts == 0 ||
+      filename == NULL) {
+    if (err) {
+      (*err) = "Invalid argument.";
+    }
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  FILE *fp = fopen(filename, "rb");
+  if (!fp) {
+    if (err) {
+      (*err) = "Cannot read file.";
+    }
+    return TINYEXR_ERROR_CANT_OPEN_FILE;
+  }
+
+  fseek(fp, 0, SEEK_END);
+  const long file_len = ftell(fp);
+  fseek(fp, 0, SEEK_SET);
+  std::vector<unsigned char> buf;  //  @todo { use mmap }
+  size_t num_read = 0;
+  if (file_len > 0) {
+    buf.resize(static_cast<size_t>(file_len));
+    num_read = fread(&buf[0], 1, buf.size(), fp);
+  }
+  fclose(fp);
+  // ftell() failure (-1), an empty file and a short read all end up here.
+  if (file_len <= 0 || num_read != buf.size()) {
+    if (err) (*err) = "fread error.";
+    return TINYEXR_ERROR_INVALID_FILE;
+  }
+
+  return LoadEXRMultipartImageFromMemory(exr_images, exr_headers, num_parts,
+                                         &buf[0], err);
+}
+
+// Convenience writer: stores an interleaved RGB or RGBA float image as an EXR
+// file whose channels are requested to be saved as HALF.
+//
+//   data        `width * height * components` floats, pixel-interleaved
+//               (RGBRGB... or RGBARGBA...); must not be NULL.
+//   width       image width in pixels, at least 16.
+//   height      image height in pixels, at least 16.
+//   components  3 (RGB) or 4 (RGBA).
+//   outfilename destination path, handed to SaveEXRImageToFile().
+//
+// Apart from the channel tables filled in below, the header keeps the
+// all-zero state produced by InitEXRHeader().
+//
+// Returns TINYEXR_ERROR_INVALID_ARGUMENT when an argument is rejected here,
+// otherwise the result of SaveEXRImageToFile().
+int SaveEXR(const float *data, int width, int height, int components,
+            const char *outfilename) {
+  if (components == 3 || components == 4) {
+    // OK
+  } else {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  if (data == NULL) {
+    return TINYEXR_ERROR_INVALID_ARGUMENT;
+  }
+
+  // Assume at least 16x16 pixels.
+  if (width < 16) return TINYEXR_ERROR_INVALID_ARGUMENT;
+  if (height < 16) return TINYEXR_ERROR_INVALID_ARGUMENT;
+
+  const size_t num_channels = static_cast<size_t>(components);
+  // Multiply in size_t: `width * height` evaluated as int can overflow for
+  // very large images before the result is widened.
+  const size_t num_pixels =
+      static_cast<size_t>(width) * static_cast<size_t>(height);
+
+  EXRHeader header;
+  InitEXRHeader(&header);
+
+  EXRImage image;
+  InitEXRImage(&image);
+
+  // Split RGB(A)RGB(A)RGB(A)... into R, G and B(and A) layers.
+  // planes[0] = R, planes[1] = G, planes[2] = B, planes[3] = A (RGBA only).
+  std::vector<float> planes[4];
+  for (size_t c = 0; c < num_channels; c++) {
+    planes[c].resize(num_pixels);
+    for (size_t i = 0; i < num_pixels; i++) {
+      planes[c][i] = data[num_channels * i + c];
+    }
+  }
+
+  // The per-channel tables live on the stack, so no exit path can leak them.
+  // (The malloc()ed tables used previously were not freed when
+  // SaveEXRImageToFile() failed.) Unused 4th slots stay zero for RGB input.
+  float *image_ptr[4] = {0, 0, 0, 0};
+  int pixel_types[4] = {0, 0, 0, 0};
+  int requested_pixel_types[4] = {0, 0, 0, 0};
+  EXRChannelInfo channel_infos[4];
+  memset(channel_infos, 0, sizeof(channel_infos));
+
+  // Must be (A)BGR order, since most of EXR viewers expect this channel order.
+  // Output channel c therefore takes input component (num_channels - 1 - c):
+  //   RGBA -> A, B, G, R      RGB -> B, G, R
+  static const char kComponentNames[] = "RGBA";
+  for (size_t c = 0; c < num_channels; c++) {
+    const size_t src = num_channels - 1 - c;
+
+    image_ptr[c] = &(planes[src].at(0));
+
+    // Single-letter, NUL-terminated channel name.
+    channel_infos[c].name[0] = kComponentNames[src];
+    channel_infos[c].name[1] = '\0';
+
+    // pixel type of input image
+    pixel_types[c] = TINYEXR_PIXELTYPE_FLOAT;
+    // pixel type of output image to be stored in .EXR
+    requested_pixel_types[c] = TINYEXR_PIXELTYPE_HALF;
+  }
+
+  // Describe the planar image: one plane pointer per channel.
+  image.num_channels = components;
+  image.images = reinterpret_cast<unsigned char **>(image_ptr);
+  image.width = width;
+  image.height = height;
+
+  // Hook the channel tables into the header.
+  header.num_channels = components;
+  header.channels = channel_infos;
+  header.pixel_types = pixel_types;
+  header.requested_pixel_types = requested_pixel_types;
+
+  // The error message is not part of this function's interface; only the
+  // status code is passed back.
+  const char *err = NULL;
+  return SaveEXRImageToFile(&image, &header, outfilename, &err);
+}
+
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
+#endif
+
+#endif  // TINYEXR_H_
diff --git a/3rdparty/bimg/CMakeLists.txt b/3rdparty/bimg/CMakeLists.txt
new file mode 100644
index 0000000..27c1f30
--- /dev/null
+++ b/3rdparty/bimg/CMakeLists.txt
@@ -0,0 +1,26 @@
+# CMake expects the version requirement to be declared ahead of PROJECT().
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6)
+PROJECT (BIMG CXX)
+
+FIND_PACKAGE (OpenGL)
+
+ADD_DEFINITIONS ("-D__STDC_LIMIT_MACROS -D__STDC_FORMAT_MACROS -D__STDC_CONSTANT_MACROS")
+
+# NOTE(review): ../bimg/include duplicates include/ - was ../bx/include meant?
+INCLUDE_DIRECTORIES (
+	${CMAKE_CURRENT_SOURCE_DIR}/include
+	${CMAKE_CURRENT_SOURCE_DIR}/../bimg/include
+	${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/
+	${OPENGL_INCLUDE_DIR}
+	)
+
+SET (BIMG_SOURCES
+	#	../bx/src/amalgamated.cpp
+	#	src/image_encode.cpp
+	src/image_decode.cpp
+	src/image.cpp
+	)
+
+ADD_LIBRARY ( bimg SHARED ${BIMG_SOURCES} )
+TARGET_LINK_LIBRARIES (bimg
+	)
diff --git a/3rdparty/bimg/LICENSE b/3rdparty/bimg/LICENSE
new file mode 100644
index 0000000..dd17ed4
--- /dev/null
+++ b/3rdparty/bimg/LICENSE
@@ -0,0 +1,22 @@
+Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice, this
+      list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/3rdparty/bimg/README.md b/3rdparty/bimg/README.md
new file mode 100644
index 0000000..fee386f
--- /dev/null
+++ b/3rdparty/bimg/README.md
@@ -0,0 +1,45 @@
+bimg
+====
+
+Image library.
+
+[![Build Status](https://travis-ci.org/bkaradzic/bimg.svg?branch=master)](https://travis-ci.org/bkaradzic/bimg)
+[![Build status](https://ci.appveyor.com/api/projects/status/edras3mltmoy31g5?svg=true)](https://ci.appveyor.com/project/bkaradzic/bimg)
+[![License](https://img.shields.io/badge/license-BSD--2%20clause-blue.svg)](https://bkaradzic.github.io/bgfx/license.html)
+[![Join the chat at https://gitter.im/bkaradzic/bgfx](https://badges.gitter.im/bkaradzic/bgfx.svg)](https://gitter.im/bkaradzic/bgfx)
+
+Contact
+-------
+
+[@bkaradzic](https://twitter.com/bkaradzic)  
+
+Project page  
+https://github.com/bkaradzic/bimg
+
+[License (BSD 2-clause)](https://github.com/bkaradzic/bimg/blob/master/LICENSE)
+-------------------------------------------------------------------------------
+
+	Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+	
+	https://github.com/bkaradzic/bimg
+	
+	Redistribution and use in source and binary forms, with or without
+	modification, are permitted provided that the following conditions are met:
+	
+	   1. Redistributions of source code must retain the above copyright notice,
+	      this list of conditions and the following disclaimer.
+	
+	   2. Redistributions in binary form must reproduce the above copyright notice,
+	      this list of conditions and the following disclaimer in the documentation
+	      and/or other materials provided with the distribution.
+	
+	THIS SOFTWARE IS PROVIDED BY COPYRIGHT HOLDER ``AS IS'' AND ANY EXPRESS OR
+	IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+	MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+	EVENT SHALL COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+	INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+	BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+	LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+	OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+	ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/3rdparty/bimg/include/bimg/bimg.h b/3rdparty/bimg/include/bimg/bimg.h
new file mode 100644
index 0000000..863dc3b
--- /dev/null
+++ b/3rdparty/bimg/include/bimg/bimg.h
@@ -0,0 +1,519 @@
+/*
+ * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#ifndef BIMG_IMAGE_H_HEADER_GUARD
+#define BIMG_IMAGE_H_HEADER_GUARD
+
+#include <stdint.h> // uint32_t
+#include <stdlib.h> // NULL
+
+namespace bx
+{
+	struct AllocatorI;
+	class  Error;
+	struct ReaderSeekerI;
+	struct WriterI;
+
+} // namespace bx
+
+namespace bimg
+{
+	typedef void (*PackFn)(void*, const float*);
+	typedef void (*UnpackFn)(float*, const void*);
+
+	/// Texture format enum.
+	///
+	/// Notation:
+	///
+	///       RGBA16S
+	///       ^   ^ ^
+	///       |   | +-- [ ]Unorm
+	///       |   |     [F]loat
+	///       |   |     [S]norm
+	///       |   |     [I]nt
+	///       |   |     [U]int
+	///       |   +---- Number of bits per component
+	///       +-------- Components
+	///
+	/// @attention Availability depends on Caps (see: formats).
+	///
+	/// @attention C99 equivalent is `bgfx_texture_format_t`.
+	///
+	struct TextureFormat
+	{
+		/// Texture formats:
+		enum Enum
+		{
+			BC1,          //!< DXT1
+			BC2,          //!< DXT3
+			BC3,          //!< DXT5
+			BC4,          //!< LATC1/ATI1
+			BC5,          //!< LATC2/ATI2
+			BC6H,         //!< BC6H
+			BC7,          //!< BC7
+			ETC1,         //!< ETC1 RGB8
+			ETC2,         //!< ETC2 RGB8
+			ETC2A,        //!< ETC2 RGBA8
+			ETC2A1,       //!< ETC2 RGB8A1
+			PTC12,        //!< PVRTC1 RGB 2BPP
+			PTC14,        //!< PVRTC1 RGB 4BPP
+			PTC12A,       //!< PVRTC1 RGBA 2BPP
+			PTC14A,       //!< PVRTC1 RGBA 4BPP
+			PTC22,        //!< PVRTC2 RGBA 2BPP
+			PTC24,        //!< PVRTC2 RGBA 4BPP
+
+			Unknown,      // Compressed formats above.
+
+			R1,
+			A8,
+			R8,
+			R8I,
+			R8U,
+			R8S,
+			R16,
+			R16I,
+			R16U,
+			R16F,
+			R16S,
+			R32I,
+			R32U,
+			R32F,
+			RG8,
+			RG8I,
+			RG8U,
+			RG8S,
+			RG16,
+			RG16I,
+			RG16U,
+			RG16F,
+			RG16S,
+			RG32I,
+			RG32U,
+			RG32F,
+			RGB8,
+			RGB8I,
+			RGB8U,
+			RGB8S,
+			RGB9E5F,
+			BGRA8,
+			RGBA8,
+			RGBA8I,
+			RGBA8U,
+			RGBA8S,
+			RGBA16,
+			RGBA16I,
+			RGBA16U,
+			RGBA16F,
+			RGBA16S,
+			RGBA32I,
+			RGBA32U,
+			RGBA32F,
+			R5G6B5,
+			RGBA4,
+			RGB5A1,
+			RGB10A2,
+			R11G11B10F,
+
+			UnknownDepth, // Depth formats below.
+
+			D16,
+			D24,
+			D24S8,
+			D32,
+			D16F,
+			D24F,
+			D32F,
+			D0S8,
+
+			Count
+		};
+	};
+
+	/// Texture info.
+	///
+	/// @attention C99 equivalent is `bgfx_texture_info_t`.
+	///
+	struct TextureInfo
+	{
+		TextureFormat::Enum format; //!< Texture format.
+		uint32_t storageSize;       //!< Total amount of bytes required to store texture.
+		uint16_t width;             //!< Texture width.
+		uint16_t height;            //!< Texture height.
+		uint16_t depth;             //!< Texture depth.
+		uint16_t numLayers;         //!< Number of layers in texture array.
+		uint8_t numMips;            //!< Number of MIP maps.
+		uint8_t bitsPerPixel;       //!< Format bits per pixel.
+		bool    cubeMap;            //!< Texture is cubemap.
+	};
+
+	struct ImageContainer // Returned (by pointer) from imageAlloc() and the allocating imageConvert() overloads.
+	{
+		bx::AllocatorI* m_allocator; // NOTE(review): presumably the allocator the container came from - confirm.
+		void*           m_data;      // NOTE(review): presumably the image payload; ownership not visible here - confirm.
+
+		TextureFormat::Enum m_format;
+
+		uint32_t m_size;
+		uint32_t m_offset;
+		uint32_t m_width;
+		uint32_t m_height;
+		uint32_t m_depth;
+		uint16_t m_numLayers;
+		uint8_t  m_numMips;
+		bool     m_hasAlpha;
+		bool     m_cubeMap;
+		bool     m_ktx;   // NOTE(review): name suggests "data came from a KTX file" - confirm.
+		bool     m_ktxLE; // NOTE(review): name suggests a KTX little-endian flag - confirm.
+		bool     m_srgb;
+	};
+
+	struct ImageMip // NOTE(review): undocumented; by its name presumably describes one mip level - confirm.
+	{
+		TextureFormat::Enum m_format;
+		uint32_t m_width;
+		uint32_t m_height;
+		uint32_t m_blockSize;
+		uint32_t m_size; // NOTE(review): presumably the byte size of m_data - confirm.
+		uint8_t  m_bpp;  // NOTE(review): presumably bits per pixel (cf. getBitsPerPixel()) - confirm.
+		bool     m_hasAlpha;
+		const uint8_t* m_data; // Pointer to const: the bytes are read-only through this struct.
+	};
+
+	struct ImageBlockInfo // Per-format record; getBlockInfo() returns a const reference to one.
+	{
+		uint8_t bitsPerPixel;
+		uint8_t blockWidth;  // NOTE(review): block dimensions, presumably in pixels - confirm.
+		uint8_t blockHeight;
+		uint8_t blockSize;   // NOTE(review): presumably bytes per block - confirm.
+		uint8_t minBlockX;
+		uint8_t minBlockY;
+		uint8_t depthBits;
+		uint8_t stencilBits;
+		uint8_t rBits;
+		uint8_t gBits;
+		uint8_t bBits;
+		uint8_t aBits;
+		uint8_t encoding;    // NOTE(review): the set of valid values is not visible in this header - confirm.
+	};
+
+	/// Returns true if texture format is compressed.
+	bool isCompressed(TextureFormat::Enum _format);
+
+	/// Returns true if texture format is uncompressed.
+	bool isColor(TextureFormat::Enum _format);
+
+	/// Returns true if texture format is depth.
+	bool isDepth(TextureFormat::Enum _format);
+
+	/// Returns true if texture format is valid.
+	bool isValid(TextureFormat::Enum _format);
+
+	/// Returns bits per pixel.
+	uint8_t getBitsPerPixel(TextureFormat::Enum _format);
+
+	/// Returns texture block info.
+	const ImageBlockInfo& getBlockInfo(TextureFormat::Enum _format);
+
+	/// Converts format to string.
+	const char* getName(TextureFormat::Enum _format);
+
+	/// Converts string to format.
+	TextureFormat::Enum getFormat(const char* _name);
+
+	/// Returns number of mip-maps required for complete mip-map chain.
+	uint8_t imageGetNumMips(
+		  TextureFormat::Enum _format
+		, uint16_t _width
+		, uint16_t _height
+		, uint16_t _depth = 0
+		);
+
+	/// Returns image size.
+	uint32_t imageGetSize(
+		  TextureInfo* _info
+		, uint16_t _width
+		, uint16_t _height
+		, uint16_t _depth
+		, bool _cubeMap
+		, bool _hasMips
+		, uint16_t _numLayers
+		, TextureFormat::Enum _format
+		);
+
+	///
+	void imageSolid(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _solid
+		);
+
+	///
+	void imageCheckerboard(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _step
+		, uint32_t _0
+		, uint32_t _1
+		);
+
+	///
+	void imageRgba8Downsample2x2(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, const void* _src
+		);
+
+	///
+	void imageRgba32fToLinear(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, const void* _src
+		);
+
+	///
+	void imageRgba32fToGamma(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, const void* _src
+		);
+
+	///
+	void imageRgba32fLinearDownsample2x2(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, const void* _src
+		);
+
+	///
+	void imageRgba32fDownsample2x2NormalMap(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, const void* _src
+		);
+
+	///
+	void imageSwizzleBgra8(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, const void* _src
+		);
+
+	///
+	void imageCopy(
+		  void* _dst
+		, uint32_t _height
+		, uint32_t _srcPitch
+		, const void* _src
+		, uint32_t _dstPitch
+		);
+
+	///
+	void imageCopy(
+		  void* _dst
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _bpp
+		, uint32_t _pitch
+		, const void* _src
+		);
+
+	///
+	bool imageConvert(
+		  TextureFormat::Enum _dstFormat
+		, TextureFormat::Enum _srcFormat
+		);
+
+	///
+	void imageConvert(
+		  void* _dst
+		, uint32_t _bpp
+		, PackFn _pack
+		, const void* _src
+		, UnpackFn _unpack
+		, uint32_t _size
+		);
+
+	///
+	void imageConvert(
+		  void* _dst
+		, uint32_t _dstBpp
+		, PackFn _pack
+		, const void* _src
+		, uint32_t _srcBpp
+		, UnpackFn _unpack
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _srcPitch
+		);
+
+	///
+	bool imageConvert(
+		  void* _dst
+		, TextureFormat::Enum _dstFormat
+		, const void* _src
+		, TextureFormat::Enum _srcFormat
+		, uint32_t _width
+		, uint32_t _height
+		);
+
+	///
+	ImageContainer* imageConvert(
+		  bx::AllocatorI* _allocator
+		, TextureFormat::Enum _dstFormat
+		, const void* _src
+		, uint32_t _size
+		);
+
+	///
+	ImageContainer* imageConvert(
+		  bx::AllocatorI* _allocator
+		, TextureFormat::Enum _dstFormat
+		, const ImageContainer& _input
+		);
+
+	///
+	ImageContainer* imageAlloc(
+		  bx::AllocatorI* _allocator
+		, TextureFormat::Enum _format
+		, uint16_t _width
+		, uint16_t _height
+		, uint16_t _depth
+		, uint16_t _numLayers
+		, bool _cubeMap
+		, bool _hasMips
+		, const void* _data = NULL
+		);
+
+	///
+	void imageFree(
+		  ImageContainer* _imageContainer
+		);
+
+	///
+	void imageWriteTga(
+		  bx::WriterI* _writer
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, const void* _src
+		, bool _grayscale
+		, bool _yflip
+		, bx::Error* _err = NULL
+		);
+
+	///
+	void imageWriteKtx(
+		  bx::WriterI* _writer
+		, TextureFormat::Enum _format
+		, bool _cubeMap
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _depth
+		, uint8_t _numMips
+		, const void* _src
+		, bx::Error* _err = NULL
+		);
+
+	///
+	void imageWriteKtx(
+		  bx::WriterI* _writer
+		, ImageContainer& _imageContainer
+		, const void* _data
+		, uint32_t _size
+		, bx::Error* _err = NULL
+		);
+
+	///
+	bool imageParse(
+		  ImageContainer& _imageContainer
+		, bx::ReaderSeekerI* _reader
+		);
+
+	///
+	bool imageParse(
+		  ImageContainer& _imageContainer
+		, const void* _data
+		, uint32_t _size
+		);
+
+	///
+	ImageContainer* imageParseDds(
+		  bx::AllocatorI* _allocator
+		, const void* _src
+		, uint32_t _size
+		);
+
+	///
+	ImageContainer* imageParseKtx(
+		  bx::AllocatorI* _allocator
+		, const void* _src
+		, uint32_t _size
+		);
+
+	///
+	ImageContainer* imageParsePvr3(
+		  bx::AllocatorI* _allocator
+		, const void* _src
+		, uint32_t _size
+		);
+
+	///
+	void imageDecodeToBgra8(
+		  void* _dst
+		, const void* _src
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, TextureFormat::Enum _format
+		);
+
+	///
+	void imageDecodeToRgba8(
+		  void* _dst
+		, const void* _src
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, TextureFormat::Enum _format
+		);
+
+	///
+	void imageDecodeToRgba32f(
+		  bx::AllocatorI* _allocator
+		, void* _dst
+		, const void* _src
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _pitch
+		, TextureFormat::Enum _format
+		);
+
+	///
+	bool imageGetRawData(
+		  const ImageContainer& _imageContainer
+		, uint16_t _side
+		, uint8_t _lod
+		, const void* _data
+		, uint32_t _size
+		, ImageMip& _mip
+		);
+
+} // namespace bimg
+
+#endif // BIMG_IMAGE_H_HEADER_GUARD
diff --git a/3rdparty/bimg/include/bimg/decode.h b/3rdparty/bimg/include/bimg/decode.h
new file mode 100644
index 0000000..3b9e53b
--- /dev/null
+++ b/3rdparty/bimg/include/bimg/decode.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#ifndef BIMG_DECODE_H_HEADER_GUARD
+#define BIMG_DECODE_H_HEADER_GUARD
+
+#include "bimg.h"
+
+namespace bimg
+{
+	///
+	ImageContainer* imageParse(
+		  bx::AllocatorI* _allocator
+		, const void* _data
+		, uint32_t _size
+		, TextureFormat::Enum _dstFormat = TextureFormat::Count
+		);
+
+} // namespace bimg
+
+#endif // BIMG_DECODE_H_HEADER_GUARD
diff --git a/3rdparty/bimg/include/bimg/encode.h b/3rdparty/bimg/include/bimg/encode.h
new file mode 100644
index 0000000..db74eb9
--- /dev/null
+++ b/3rdparty/bimg/include/bimg/encode.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bimg#license-bsd-2-clause
+ */
+
+#ifndef BIMG_ENCODE_H_HEADER_GUARD
+#define BIMG_ENCODE_H_HEADER_GUARD
+
+#include "bimg.h"
+
+namespace bimg
+{
+	///
+	bool imageEncodeFromRgba8(void* _dst, const void* _src, uint32_t _width, uint32_t _height, TextureFormat::Enum _format);
+
+	///
+	bool imageEncodeFromRgba32f(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, TextureFormat::Enum _format);
+
+	///
+	void imageRgba32f11to01(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src);
+
+	///
+	void imageMakeDist(bx::AllocatorI* _allocator, void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, float _edge, const void* _src);
+
+} // namespace bimg
+
+#endif // BIMG_ENCODE_H_HEADER_GUARD
diff --git a/3rdparty/bimg/makefile b/3rdparty/bimg/makefile
new file mode 100644
index 0000000..02593d5
--- /dev/null
+++ b/3rdparty/bimg/makefile
@@ -0,0 +1,376 @@
+#
+# Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+# License: https://github.com/bkaradzic/bimg#license-bsd-2-clause
+#
+
+UNAME := $(shell uname)
+ifeq ($(UNAME),$(filter $(UNAME),Linux Darwin FreeBSD GNU/kFreeBSD))
+ifeq ($(UNAME),$(filter $(UNAME),Darwin))
+OS=darwin
+else
+ifeq ($(UNAME),$(filter $(UNAME),FreeBSD GNU/kFreeBSD))
+OS=bsd
+else
+OS=linux
+endif
+endif
+else
+OS=windows
+
+help: projgen
+
+endif
+
+# $(info $(OS))
+
+BX_DIR?=../bx
+GENIE?=$(BX_DIR)/tools/bin/$(OS)/genie
+NINJA?=$(BX_DIR)/tools/bin/$(OS)/ninja
+
+.PHONY: help
+
+help:
+	@echo Available targets:
+	@grep -E "^[a-zA-Z0-9_-]+:.*?## .*$$" $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+clean: ## Clean all intermediate files.
+	@echo Cleaning...
+	-@rm -rf .build
+	@mkdir .build
+
+projgen: ## Generate project files for all configurations.
+	$(GENIE) --with-tools                     vs2012
+	$(GENIE) --with-tools                     vs2013
+	$(GENIE) --with-tools                     vs2015
+	$(GENIE) --with-tools                     vs2017
+	$(GENIE) --with-tools --gcc=mingw-gcc     gmake
+	$(GENIE) --with-tools --gcc=linux-gcc     gmake
+	$(GENIE) --with-tools --gcc=osx           gmake
+	$(GENIE) --with-tools --xcode=osx         xcode4
+	$(GENIE) --with-tools --xcode=ios         xcode4
+	$(GENIE)              --gcc=freebsd       gmake
+	$(GENIE)                                --gcc=android-arm   gmake
+	$(GENIE)                                --gcc=android-mips  gmake
+	$(GENIE)                                --gcc=android-x86   gmake
+	$(GENIE)                                --gcc=asmjs         gmake
+	$(GENIE)                                --gcc=ios-arm       gmake
+	$(GENIE)                                --gcc=ios-arm64     gmake
+	$(GENIE)                                --gcc=ios-simulator gmake
+	$(GENIE)                                --gcc=nacl          gmake
+	$(GENIE)                                --gcc=nacl-arm      gmake
+	$(GENIE)                                --gcc=pnacl         gmake
+	$(GENIE)                                --gcc=rpi           gmake
+
+.build/projects/gmake-android-arm:
+	$(GENIE) --gcc=android-arm gmake
+android-arm-debug: .build/projects/gmake-android-arm ## Build - Android ARM Debug
+	$(MAKE) -R -C .build/projects/gmake-android-arm config=debug
+android-arm-release: .build/projects/gmake-android-arm ## Build - Android ARM Release
+	$(MAKE) -R -C .build/projects/gmake-android-arm config=release
+android-arm: android-arm-debug android-arm-release ## Build - Android ARM Debug and Release
+
+.build/projects/gmake-android-mips:
+	$(GENIE) --gcc=android-mips gmake
+android-mips-debug: .build/projects/gmake-android-mips ## Build - Android MIPS Debug
+	$(MAKE) -R -C .build/projects/gmake-android-mips config=debug
+android-mips-release: .build/projects/gmake-android-mips ## Build - Android MIPS Release
+	$(MAKE) -R -C .build/projects/gmake-android-mips config=release
+android-mips: android-mips-debug android-mips-release ## Build - Android MIPS Debug and Release
+
+.build/projects/gmake-android-x86:
+	$(GENIE) --gcc=android-x86 gmake
+android-x86-debug: .build/projects/gmake-android-x86 ## Build - Android x86 Debug
+	$(MAKE) -R -C .build/projects/gmake-android-x86 config=debug
+android-x86-release: .build/projects/gmake-android-x86 ## Build - Android x86 Release
+	$(MAKE) -R -C .build/projects/gmake-android-x86 config=release
+android-x86: android-x86-debug android-x86-release ## Build - Android x86 Debug and Release
+
+.build/projects/gmake-asmjs:
+	$(GENIE) --gcc=asmjs gmake
+asmjs-debug: .build/projects/gmake-asmjs ## Build - Emscripten Debug
+	$(MAKE) -R -C .build/projects/gmake-asmjs config=debug
+asmjs-release: .build/projects/gmake-asmjs ## Build - Emscripten Release
+	$(MAKE) -R -C .build/projects/gmake-asmjs config=release
+asmjs: asmjs-debug asmjs-release ## Build - Emscripten Debug and Release
+
+.build/projects/gmake-linux:
+	$(GENIE) --with-tools --gcc=linux-gcc gmake
+linux-debug32: .build/projects/gmake-linux ## Build - Linux x86 Debug
+	$(MAKE) -R -C .build/projects/gmake-linux config=debug32
+linux-release32: .build/projects/gmake-linux ## Build - Linux x86 Release
+	$(MAKE) -R -C .build/projects/gmake-linux config=release32
+linux-debug64: .build/projects/gmake-linux ## Build - Linux x64 Debug
+	$(MAKE) -R -C .build/projects/gmake-linux config=debug64
+linux-release64: .build/projects/gmake-linux ## Build - Linux x64 Release
+	$(MAKE) -R -C .build/projects/gmake-linux config=release64
+linux: linux-debug32 linux-release32 linux-debug64 linux-release64 ## Build - Linux x86/x64 Debug and Release
+
+.build/projects/gmake-freebsd:
+	$(GENIE) --with-tools --gcc=freebsd gmake
+freebsd-debug32: .build/projects/gmake-freebsd ## Build - FreeBSD x86 Debug
+	$(MAKE) -R -C .build/projects/gmake-freebsd config=debug32
+freebsd-release32: .build/projects/gmake-freebsd ## Build - FreeBSD x86 Release
+	$(MAKE) -R -C .build/projects/gmake-freebsd config=release32
+freebsd-debug64: .build/projects/gmake-freebsd ## Build - FreeBSD x64 Debug
+	$(MAKE) -R -C .build/projects/gmake-freebsd config=debug64
+freebsd-release64: .build/projects/gmake-freebsd ## Build - FreeBSD x64 Release
+	$(MAKE) -R -C .build/projects/gmake-freebsd config=release64
+freebsd: freebsd-debug32 freebsd-release32 freebsd-debug64 freebsd-release64 ## Build - FreeBSD x86/x64 Debug and Release
+
+.build/projects/gmake-mingw-gcc:
+	$(GENIE) --with-tools --gcc=mingw-gcc gmake
+mingw-gcc-debug32: .build/projects/gmake-mingw-gcc ## Build - MinGW GCC x86 Debug
+	$(MAKE) -R -C .build/projects/gmake-mingw-gcc config=debug32
+mingw-gcc-release32: .build/projects/gmake-mingw-gcc ## Build - MinGW GCC x86 Release
+	$(MAKE) -R -C .build/projects/gmake-mingw-gcc config=release32
+mingw-gcc-debug64: .build/projects/gmake-mingw-gcc ## Build - MinGW GCC x64 Debug
+	$(MAKE) -R -C .build/projects/gmake-mingw-gcc config=debug64
+mingw-gcc-release64: .build/projects/gmake-mingw-gcc ## Build - MinGW GCC x64 Release
+	$(MAKE) -R -C .build/projects/gmake-mingw-gcc config=release64
+mingw-gcc: mingw-gcc-debug32 mingw-gcc-release32 mingw-gcc-debug64 mingw-gcc-release64 ## Build - MinGW GCC x86/x64 Debug and Release
+
+.build/projects/gmake-mingw-clang:
+	$(GENIE) --gcc=mingw-clang gmake
+mingw-clang-debug32: .build/projects/gmake-mingw-clang ## Build - MinGW Clang x86 Debug
+	$(MAKE) -R -C .build/projects/gmake-mingw-clang config=debug32
+mingw-clang-release32: .build/projects/gmake-mingw-clang ## Build - MinGW Clang x86 Release
+	$(MAKE) -R -C .build/projects/gmake-mingw-clang config=release32
+mingw-clang-debug64: .build/projects/gmake-mingw-clang ## Build - MinGW Clang x64 Debug
+	$(MAKE) -R -C .build/projects/gmake-mingw-clang config=debug64
+mingw-clang-release64: .build/projects/gmake-mingw-clang ## Build - MinGW Clang x64 Release
+	$(MAKE) -R -C .build/projects/gmake-mingw-clang config=release64
+mingw-clang: mingw-clang-debug32 mingw-clang-release32 mingw-clang-debug64 mingw-clang-release64 ## Build - MinGW Clang x86/x64 Debug and Release
+
+.build/projects/vs2012:
+	$(GENIE) --with-tools vs2012
+vs2012-debug32: .build/projects/vs2012 ## Build - VS2012 x86 Debug
+	devenv .build/projects/vs2012/bimg.sln /Build "Debug|Win32"
+vs2012-release32: .build/projects/vs2012 ## Build - VS2012 x86 Release
+	devenv .build/projects/vs2012/bimg.sln /Build "Release|Win32"
+vs2012-debug64: .build/projects/vs2012 ## Build - VS2012 x64 Debug
+	devenv .build/projects/vs2012/bimg.sln /Build "Debug|x64"
+vs2012-release64: .build/projects/vs2012 ## Build - VS2012 x64 Release
+	devenv .build/projects/vs2012/bimg.sln /Build "Release|x64"
+vs2012: vs2012-debug32 vs2012-release32 vs2012-debug64 vs2012-release64 ## Build - VS2012 x86/x64 Debug and Release
+
+.build/projects/vs2013:
+	$(GENIE) --with-tools vs2013
+vs2013-debug32: .build/projects/vs2013 ## Build - VS2013 x86 Debug
+	devenv .build/projects/vs2013/bimg.sln /Build "Debug|Win32"
+vs2013-release32: .build/projects/vs2013 ## Build - VS2013 x86 Release
+	devenv .build/projects/vs2013/bimg.sln /Build "Release|Win32"
+vs2013-debug64: .build/projects/vs2013 ## Build - VS2013 x64 Debug
+	devenv .build/projects/vs2013/bimg.sln /Build "Debug|x64"
+vs2013-release64: .build/projects/vs2013 ## Build - VS2013 x64 Release
+	devenv .build/projects/vs2013/bimg.sln /Build "Release|x64"
+vs2013: vs2013-debug32 vs2013-release32 vs2013-debug64 vs2013-release64 ## Build - VS2013 x86/x64 Debug and Release
+
+.build/projects/vs2015:
+	$(GENIE) --with-tools vs2015
+vs2015-debug32: .build/projects/vs2015 ## Build - VS2015 x86 Debug
+	devenv .build/projects/vs2015/bimg.sln /Build "Debug|Win32"
+vs2015-release32: .build/projects/vs2015 ## Build - VS2015 x86 Release
+	devenv .build/projects/vs2015/bimg.sln /Build "Release|Win32"
+vs2015-debug64: .build/projects/vs2015 ## Build - VS2015 x64 Debug
+	devenv .build/projects/vs2015/bimg.sln /Build "Debug|x64"
+vs2015-release64: .build/projects/vs2015 ## Build - VS2015 x64 Release
+	devenv .build/projects/vs2015/bimg.sln /Build "Release|x64"
+vs2015: vs2015-debug32 vs2015-release32 vs2015-debug64 vs2015-release64 ## Build - VS2015 x86/x64 Debug and Release
+
+.build/projects/vs2017:
+	$(GENIE) --with-tools vs2017
+vs2017-debug32: .build/projects/vs2017 ## Build - vs2017 x86 Debug
+	devenv .build/projects/vs2017/bimg.sln /Build "Debug|Win32"
+vs2017-release32: .build/projects/vs2017 ## Build - vs2017 x86 Release
+	devenv .build/projects/vs2017/bimg.sln /Build "Release|Win32"
+vs2017-debug64: .build/projects/vs2017 ## Build - vs2017 x64 Debug
+	devenv .build/projects/vs2017/bimg.sln /Build "Debug|x64"
+vs2017-release64: .build/projects/vs2017 ## Build - vs2017 x64 Release
+	devenv .build/projects/vs2017/bimg.sln /Build "Release|x64"
+vs2017: vs2017-debug32 vs2017-release32 vs2017-debug64 vs2017-release64 ## Build - vs2017 x86/x64 Debug and Release
+
+.build/projects/gmake-nacl:
+	$(GENIE) --gcc=nacl gmake
+nacl-debug32: .build/projects/gmake-nacl ## Build - Native Client x86 Debug
+	$(MAKE) -R -C .build/projects/gmake-nacl config=debug32
+nacl-release32: .build/projects/gmake-nacl ## Build - Native Client x86 Release
+	$(MAKE) -R -C .build/projects/gmake-nacl config=release32
+nacl-debug64: .build/projects/gmake-nacl ## Build - Native Client x64 Debug
+	$(MAKE) -R -C .build/projects/gmake-nacl config=debug64
+nacl-release64: .build/projects/gmake-nacl ## Build - Native Client x64 Release
+	$(MAKE) -R -C .build/projects/gmake-nacl config=release64
+nacl: nacl-debug32 nacl-release32 nacl-debug64 nacl-release64 ## Build - Native Client x86/x64 Debug and Release
+
+.build/projects/gmake-nacl-arm:
+	$(GENIE) --gcc=nacl-arm gmake
+nacl-arm-debug: .build/projects/gmake-nacl-arm ## Build - Native Client ARM Debug
+	$(MAKE) -R -C .build/projects/gmake-nacl-arm config=debug
+nacl-arm-release: .build/projects/gmake-nacl-arm ## Build - Native Client ARM Release
+	$(MAKE) -R -C .build/projects/gmake-nacl-arm config=release
+nacl-arm: nacl-arm-debug nacl-arm-release ## Build - Native Client ARM Debug and Release
+
+.build/projects/gmake-pnacl:
+	$(GENIE) --gcc=pnacl gmake
+pnacl-debug: .build/projects/gmake-pnacl ## Build - Portable Native Client Debug
+	$(MAKE) -R -C .build/projects/gmake-pnacl config=debug
+pnacl-release: .build/projects/gmake-pnacl ## Build - Portable Native Client Release
+	$(MAKE) -R -C .build/projects/gmake-pnacl config=release
+pnacl: pnacl-debug pnacl-release ## Build - Portable Native Client Debug and Release
+
+.build/projects/gmake-osx:
+	$(GENIE) --with-tools --gcc=osx gmake
+osx-debug32: .build/projects/gmake-osx ## Build - OSX x86 Debug
+	$(MAKE) -C .build/projects/gmake-osx config=debug32
+osx-release32: .build/projects/gmake-osx ## Build - OSX x86 Release
+	$(MAKE) -C .build/projects/gmake-osx config=release32
+osx-debug64: .build/projects/gmake-osx ## Build - OSX x64 Debug
+	$(MAKE) -C .build/projects/gmake-osx config=debug64
+osx-release64: .build/projects/gmake-osx ## Build - OSX x64 Release
+	$(MAKE) -C .build/projects/gmake-osx config=release64
+osx: osx-debug32 osx-release32 osx-debug64 osx-release64 ## Build - OSX x86/x64 Debug and Release
+
+.build/projects/gmake-ios-arm:
+	$(GENIE) --gcc=ios-arm gmake
+ios-arm-debug: .build/projects/gmake-ios-arm ## Build - iOS ARM Debug
+	$(MAKE) -R -C .build/projects/gmake-ios-arm config=debug
+ios-arm-release: .build/projects/gmake-ios-arm ## Build - iOS ARM Release
+	$(MAKE) -R -C .build/projects/gmake-ios-arm config=release
+ios-arm: ios-arm-debug ios-arm-release ## Build - iOS ARM Debug and Release
+
+.build/projects/gmake-ios-arm64:
+	$(GENIE) --gcc=ios-arm64 gmake
+ios-arm64-debug: .build/projects/gmake-ios-arm64 ## Build - iOS ARM64 Debug
+	$(MAKE) -R -C .build/projects/gmake-ios-arm64 config=debug
+ios-arm64-release: .build/projects/gmake-ios-arm64 ## Build - iOS ARM64 Release
+	$(MAKE) -R -C .build/projects/gmake-ios-arm64 config=release
+ios-arm64: ios-arm64-debug ios-arm64-release ## Build - iOS ARM64 Debug and Release
+
+.build/projects/gmake-ios-simulator:
+	$(GENIE) --gcc=ios-simulator gmake
+ios-simulator-debug: .build/projects/gmake-ios-simulator ## Build - iOS Simulator Debug
+	$(MAKE) -R -C .build/projects/gmake-ios-simulator config=debug
+ios-simulator-release: .build/projects/gmake-ios-simulator ## Build - iOS Simulator Release
+	$(MAKE) -R -C .build/projects/gmake-ios-simulator config=release
+ios-simulator: ios-simulator-debug ios-simulator-release ## Build - iOS Simulator Debug and Release
+
+.build/projects/gmake-rpi:
+	$(GENIE) --gcc=rpi gmake
+rpi-debug: .build/projects/gmake-rpi ## Build - Raspberry Pi Debug
+	$(MAKE) -R -C .build/projects/gmake-rpi config=debug
+rpi-release: .build/projects/gmake-rpi ## Build - Raspberry Pi Release
+	$(MAKE) -R -C .build/projects/gmake-rpi config=release
+rpi: rpi-debug rpi-release ## Build - Raspberry Pi Debug and Release
+
+build-darwin: osx
+
+build-linux: linux-debug64 linux-release64
+
+build-windows: mingw-gcc
+
+build: build-$(OS)
+
+rebuild-shaders:
+	$(MAKE) -R -C examples rebuild
+
+assets: ## Build assets.
+	$(NINJA) -C scripts
+
+analyze:
+	cppcheck src/
+	cppcheck examples/
+
+docs:
+	doxygen scripts/bimg.doxygen
+	markdown README.md > .build/docs/readme.html
+
+###
+
+SILENT ?= @
+
+UNAME := $(shell uname)
+ifeq ($(UNAME),$(filter $(UNAME),Linux Darwin FreeBSD GNU/kFreeBSD))
+ifeq ($(UNAME),$(filter $(UNAME),Darwin))
+OS=darwin
+BUILD_PROJECT_DIR=gmake-osx
+BUILD_OUTPUT_DIR=osx64_clang
+BUILD_TOOLS_CONFIG=release64
+BUILD_TOOLS_SUFFIX=Release
+EXE=
+else
+ifeq ($(UNAME),$(filter $(UNAME),FreeBSD GNU/kFreeBSD))
+OS=bsd
+BUILD_PROJECT_DIR=gmake-freebsd
+BUILD_OUTPUT_DIR=freebsd64_gcc
+BUILD_TOOLS_CONFIG=release64
+BUILD_TOOLS_SUFFIX=Release
+EXE=
+else
+OS=linux
+BUILD_PROJECT_DIR=gmake-linux
+BUILD_OUTPUT_DIR=linux64_gcc
+BUILD_TOOLS_CONFIG=release64
+BUILD_TOOLS_SUFFIX=Release
+EXE=
+endif
+endif
+else
+OS=windows
+BUILD_PROJECT_DIR=gmake-mingw-gcc
+BUILD_OUTPUT_DIR=win64_mingw-gcc
+BUILD_TOOLS_CONFIG=release64
+BUILD_TOOLS_SUFFIX=Release
+EXE=.exe
+endif
+
+geometryc: .build/projects/$(BUILD_PROJECT_DIR) ## Build geometryc tool.
+	$(SILENT) $(MAKE) -C .build/projects/$(BUILD_PROJECT_DIR) geometryc config=$(BUILD_TOOLS_CONFIG)
+	$(SILENT) cp .build/$(BUILD_OUTPUT_DIR)/bin/geometryc$(BUILD_TOOLS_SUFFIX)$(EXE) tools/bin/$(OS)/geometryc$(EXE)
+
+shaderc: .build/projects/$(BUILD_PROJECT_DIR) ## Build shaderc tool.
+	$(SILENT) $(MAKE) -C .build/projects/$(BUILD_PROJECT_DIR) shaderc config=$(BUILD_TOOLS_CONFIG)
+	$(SILENT) cp .build/$(BUILD_OUTPUT_DIR)/bin/shaderc$(BUILD_TOOLS_SUFFIX)$(EXE) tools/bin/$(OS)/shaderc$(EXE)
+
+texturec: .build/projects/$(BUILD_PROJECT_DIR) ## Build texturec tool.
+	$(SILENT) $(MAKE) -C .build/projects/$(BUILD_PROJECT_DIR) texturec config=$(BUILD_TOOLS_CONFIG)
+	$(SILENT) cp .build/$(BUILD_OUTPUT_DIR)/bin/texturec$(BUILD_TOOLS_SUFFIX)$(EXE) tools/bin/$(OS)/texturec$(EXE)
+
+texturev: .build/projects/$(BUILD_PROJECT_DIR) ## Build texturev tool.
+	$(SILENT) $(MAKE) -C .build/projects/$(BUILD_PROJECT_DIR) texturev config=$(BUILD_TOOLS_CONFIG)
+	$(SILENT) cp .build/$(BUILD_OUTPUT_DIR)/bin/texturev$(BUILD_TOOLS_SUFFIX)$(EXE) tools/bin/$(OS)/texturev$(EXE)
+
+tools: geometryc shaderc texturec texturev ## Build tools.
+
+clean-tools: ## Clean tools projects.
+	-$(SILENT) rm -r .build/projects/$(BUILD_PROJECT_DIR)
+
+dist-windows: .build/projects/gmake-mingw-gcc
+	$(SILENT) $(MAKE) -C .build/projects/gmake-mingw-gcc config=release64 -j 6 geometryc
+	$(SILENT) cp .build/win64_mingw-gcc/bin/geometrycRelease.exe tools/bin/windows/geometryc.exe
+	$(SILENT) $(MAKE) -C .build/projects/gmake-mingw-gcc config=release64 -j 6 shaderc
+	$(SILENT) cp .build/win64_mingw-gcc/bin/shadercRelease.exe   tools/bin/windows/shaderc.exe
+	$(SILENT) $(MAKE) -C .build/projects/gmake-mingw-gcc config=release64 -j 6 texturec
+	$(SILENT) cp .build/win64_mingw-gcc/bin/texturecRelease.exe  tools/bin/windows/texturec.exe
+	$(SILENT) $(MAKE) -C .build/projects/gmake-mingw-gcc config=release64 -j 6 texturev
+	$(SILENT) cp .build/win64_mingw-gcc/bin/texturevRelease.exe tools/bin/windows/texturev.exe
+
+dist-linux: .build/projects/gmake-linux
+	$(SILENT) $(MAKE) -C .build/projects/gmake-linux     config=release64 -j 6 geometryc
+	$(SILENT) cp .build/linux64_gcc/bin/geometrycRelease tools/bin/linux/geometryc
+	$(SILENT) $(MAKE) -C .build/projects/gmake-linux     config=release64 -j 6 shaderc
+	$(SILENT) cp .build/linux64_gcc/bin/shadercRelease   tools/bin/linux/shaderc
+	$(SILENT) $(MAKE) -C .build/projects/gmake-linux     config=release64 -j 6 texturec
+	$(SILENT) cp .build/linux64_gcc/bin/texturecRelease  tools/bin/linux/texturec
+	$(SILENT) $(MAKE) -C .build/projects/gmake-linux     config=release64 -j 6 texturev
+	$(SILENT) cp .build/linux64_gcc/bin/texturevRelease  tools/bin/linux/texturev
+
+dist-darwin: .build/projects/gmake-osx
+	$(SILENT) $(MAKE) -C .build/projects/gmake-osx       config=release64 -j 6 geometryc
+	$(SILENT) cp .build/osx64_clang/bin/geometrycRelease tools/bin/darwin/geometryc
+	$(SILENT) $(MAKE) -C .build/projects/gmake-osx       config=release64 -j 6 shaderc
+	$(SILENT) cp .build/osx64_clang/bin/shadercRelease   tools/bin/darwin/shaderc
+	$(SILENT) $(MAKE) -C .build/projects/gmake-osx       config=release64 -j 6 texturec
+	$(SILENT) cp .build/osx64_clang/bin/texturecRelease  tools/bin/darwin/texturec
+	$(SILENT) $(MAKE) -C .build/projects/gmake-osx       config=release64 -j 6 texturev
+	$(SILENT) cp .build/osx64_clang/bin/texturevRelease  tools/bin/darwin/texturev
+
+dist: clean dist-windows dist-linux dist-darwin
diff --git a/3rdparty/bimg/scripts/bimg.lua b/3rdparty/bimg/scripts/bimg.lua
new file mode 100644
index 0000000..8bf6e3a
--- /dev/null
+++ b/3rdparty/bimg/scripts/bimg.lua
@@ -0,0 +1,23 @@
+--
+-- Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+-- License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+--
+
+project "bimg"
+	kind "StaticLib"
+
+	includedirs {
+		path.join(BX_DIR, "include"),
+		path.join(BIMG_DIR, "include"),
+	}
+
+	files {
+		path.join(BIMG_DIR, "src/image.*"),
+	}
+
+	configuration { "linux-*" }
+		buildoptions {
+			"-fPIC",
+		}
+
+	configuration {}
diff --git a/3rdparty/bimg/scripts/bimg_decode.lua b/3rdparty/bimg/scripts/bimg_decode.lua
new file mode 100644
index 0000000..934cbec
--- /dev/null
+++ b/3rdparty/bimg/scripts/bimg_decode.lua
@@ -0,0 +1,26 @@
+--
+-- Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+-- License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+--
+
+project "bimg_decode"
+	kind "StaticLib"
+
+	includedirs {
+		path.join(BX_DIR, "include"),
+		path.join(BIMG_DIR, "include"),
+		path.join(BIMG_DIR, "3rdparty"),
+		path.join(BIMG_DIR, "3rdparty/nvtt"),
+		path.join(BIMG_DIR, "3rdparty/iqa/include"),
+	}
+
+	files {
+		path.join(BIMG_DIR, "src/image_decode.*"),
+	}
+
+	configuration { "linux-*" }
+		buildoptions {
+			"-fPIC",
+		}
+
+	configuration {}
diff --git a/3rdparty/bimg/scripts/bimg_encode.lua b/3rdparty/bimg/scripts/bimg_encode.lua
new file mode 100644
index 0000000..1b853f6
--- /dev/null
+++ b/3rdparty/bimg/scripts/bimg_encode.lua
@@ -0,0 +1,41 @@
+--
+-- Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+-- License: https://github.com/bkaradzic/bx#license-bsd-2-clause
+--
+
+project "bimg_encode"
+	kind "StaticLib"
+
+	includedirs {
+		path.join(BX_DIR, "include"),
+		path.join(BIMG_DIR, "include"),
+		path.join(BIMG_DIR, "3rdparty"),
+		path.join(BIMG_DIR, "3rdparty/nvtt"),
+		path.join(BIMG_DIR, "3rdparty/iqa/include"),
+	}
+
+	files {
+		path.join(BIMG_DIR, "src/image_encode.*"),
+		path.join(BIMG_DIR, "3rdparty/libsquish/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/libsquish/**.h"),
+		path.join(BIMG_DIR, "3rdparty/edtaa3/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/edtaa3/**.h"),
+		path.join(BIMG_DIR, "3rdparty/etc1/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/etc1/**.h"),
+		path.join(BIMG_DIR, "3rdparty/etc2/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/etc2/**.hpp"),
+		path.join(BIMG_DIR, "3rdparty/nvtt/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/nvtt/**.h"),
+		path.join(BIMG_DIR, "3rdparty/pvrtc/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/pvrtc/**.h"),
+		path.join(BIMG_DIR, "3rdparty/tinyexr/**.h"),
+		path.join(BIMG_DIR, "3rdparty/iqa/include/**.h"),
+		path.join(BIMG_DIR, "3rdparty/iqa/source/**.c"),
+	}
+
+	configuration { "linux-*" }
+		buildoptions {
+			"-fPIC",
+		}
+
+	configuration {}
diff --git a/3rdparty/bimg/scripts/genie.lua b/3rdparty/bimg/scripts/genie.lua
new file mode 100644
index 0000000..b22c4dd
--- /dev/null
+++ b/3rdparty/bimg/scripts/genie.lua
@@ -0,0 +1,76 @@
+--
+-- Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+-- License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+--
+
+newoption {
+	trigger = "with-amalgamated",
+	description = "Enable amalgamated build.",
+}
+
+newoption {
+	trigger = "with-shared-lib",
+	description = "Enable building shared library.",
+}
+
+newoption {
+	trigger = "with-tools",
+	description = "Enable building tools.",
+}
+
+solution "bimg"
+	configurations {
+		"Debug",
+		"Release",
+	}
+
+	if _ACTION == "xcode4" then
+		platforms {
+			"Universal",
+		}
+	else
+		platforms {
+			"x32",
+			"x64",
+			"Native", -- for targets where bitness is not specified
+		}
+	end
+
+	language "C++"
+	startproject "example-00-helloworld"
+
+MODULE_DIR = path.getabsolute("..")
+BIMG_DIR   = path.getabsolute("..")
+BX_DIR     = os.getenv("BX_DIR")
+
+local BIMG_BUILD_DIR = path.join(BIMG_DIR, ".build")
+local BIMG_THIRD_PARTY_DIR = path.join(BIMG_DIR, "3rdparty")
+if not BX_DIR then
+	BX_DIR = path.getabsolute(path.join(BIMG_DIR, "../bx"))
+end
+
+if not os.isdir(BX_DIR) then
+	print("bx not found at " .. BX_DIR)
+	print("For more info see: https://bkaradzic.github.io/bgfx/build.html")
+	os.exit(1) -- non-zero status so the calling make/CI run sees the failure
+end
+
+dofile (path.join(BX_DIR, "scripts/toolchain.lua"))
+if not toolchain(BIMG_BUILD_DIR, BIMG_THIRD_PARTY_DIR) then
+	return -- no action specified
+end
+
+function copyLib()
+end
+
+group "libs"
+dofile "bimg.lua"
+dofile "bimg_decode.lua"
+dofile "bimg_encode.lua"
+
+dofile(path.join(BX_DIR, "scripts/bx.lua"))
+
+if _OPTIONS["with-tools"] then
+	group "tools"
+	dofile "texturec.lua"
+end
diff --git a/3rdparty/bimg/scripts/texturec.lua b/3rdparty/bimg/scripts/texturec.lua
new file mode 100644
index 0000000..02f6bb6
--- /dev/null
+++ b/3rdparty/bimg/scripts/texturec.lua
@@ -0,0 +1,57 @@
+--
+-- Copyright 2010-2017 Branimir Karadzic. All rights reserved.
+-- License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+--
+
+project "texturec"
+	kind "ConsoleApp"
+
+	includedirs {
+		path.join(BX_DIR,   "include"),
+		path.join(BIMG_DIR, "include"),
+		path.join(BIMG_DIR, "3rdparty"),
+		path.join(BIMG_DIR, "3rdparty/nvtt"),
+		path.join(BIMG_DIR, "3rdparty/iqa/include"),
+	}
+
+	files {
+		path.join(BIMG_DIR, "3rdparty/libsquish/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/libsquish/**.h"),
+		path.join(BIMG_DIR, "3rdparty/edtaa3/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/edtaa3/**.h"),
+		path.join(BIMG_DIR, "3rdparty/etc1/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/etc1/**.h"),
+		path.join(BIMG_DIR, "3rdparty/etc2/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/etc2/**.hpp"),
+		path.join(BIMG_DIR, "3rdparty/nvtt/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/nvtt/**.h"),
+		path.join(BIMG_DIR, "3rdparty/pvrtc/**.cpp"),
+		path.join(BIMG_DIR, "3rdparty/pvrtc/**.h"),
+		path.join(BIMG_DIR, "3rdparty/tinyexr/**.h"),
+		path.join(BIMG_DIR, "3rdparty/iqa/include/**.h"),
+		path.join(BIMG_DIR, "3rdparty/iqa/source/**.c"),
+		path.join(BIMG_DIR, "tools/texturec/**.cpp"),
+		path.join(BIMG_DIR, "tools/texturec/**.h"),
+	}
+
+	links {
+		"bimg_decode",
+		"bimg_encode",
+		"bimg",
+		"bx",
+	}
+
+	configuration { "mingw-*" }
+		targetextension ".exe"
+
+	configuration { "osx" }
+		links {
+			"Cocoa.framework",
+		}
+
+	configuration { "vs20* or mingw*" }
+		links {
+			"psapi",
+		}
+
+	configuration {}
diff --git a/3rdparty/bimg/src/bimg_p.h b/3rdparty/bimg/src/bimg_p.h
new file mode 100644
index 0000000..91bd31e
--- /dev/null
+++ b/3rdparty/bimg/src/bimg_p.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include <bimg/bimg.h>
+#include <bx/allocator.h>
+#include <bx/readerwriter.h>
+#include <bx/pixelformat.h>
+#include <bx/endian.h>
+#include <bx/error.h>
+#include <bx/simd_t.h>
+
+#define BIMG_CHUNK_MAGIC_TEX BX_MAKEFOURCC('T', 'E', 'X', 0x0)
+
+namespace bimg
+{
+	struct Memory // Raw buffer descriptor: a data pointer plus a size. No ownership is expressed by the type itself.
+	{
+		uint8_t* data; // Start of the buffer.
+		uint32_t size; // presumably the buffer length in bytes -- TODO confirm against users.
+	};
+
+	struct TextureCreate // Header that imageParse reads from the stream after a BIMG_CHUNK_MAGIC_TEX magic.
+	{
+		TextureFormat::Enum m_format;
+		uint16_t m_width;
+		uint16_t m_height;
+		uint16_t m_depth;
+		uint16_t m_numLayers;
+		uint8_t m_numMips;
+		bool m_cubeMap;
+		const Memory* m_mem; // NOTE(review): a raw pointer inside a struct that is read from a stream -- presumably only meaningful in-process; confirm.
+	};
+
+	inline uint8_t calcNumMips(bool _hasMips, uint16_t _width, uint16_t _height, uint16_t _depth = 1)
+	{
+		if (!_hasMips)
+		{
+			return 1; // No mip chain requested: only the base level exists.
+		}
+
+		// Chain length is 1 + floor(log2(largest dimension) ).
+		const uint32_t largest   = bx::uint32_max(_depth, bx::uint32_max(_width, _height) );
+		const uint32_t numLevels = uint32_t(bx::flog2(float(largest) ) ) + 1;
+		return uint8_t(numLevels);
+	}
+
+	/// Converts a flat run of texels from `_src` to `_dst` using the given unpack/pack function pair (defined in image.cpp). NOTE(review): `_bpp` and `_size` units are not visible here -- presumably bits per texel and source size in bytes; confirm.
+	void imageConvert(
+		  void* _dst
+		, uint32_t _bpp
+		, bx::PackFn _pack
+		, const void* _src
+		, bx::UnpackFn _unpack
+		, uint32_t _size
+		);
+
+	/// 2D variant of the conversion above: takes separate source/destination texel sizes, image dimensions and a source row pitch (defined in image.cpp). NOTE(review): presumably `_srcPitch` is in bytes and the destination is tightly packed; confirm.
+	void imageConvert(
+		  void* _dst
+		, uint32_t _dstBpp
+		, bx::PackFn _pack
+		, const void* _src
+		, uint32_t _srcBpp
+		, bx::UnpackFn _unpack
+		, uint32_t _width
+		, uint32_t _height
+		, uint32_t _srcPitch
+		);
+
+} // namespace bimg
diff --git a/3rdparty/bgfx/src/image.cpp b/3rdparty/bimg/src/image.cpp
similarity index 98%
rename from 3rdparty/bgfx/src/image.cpp
rename to 3rdparty/bimg/src/image.cpp
index e6c9e07..217ac96 100644
--- a/3rdparty/bgfx/src/image.cpp
+++ b/3rdparty/bimg/src/image.cpp
@@ -3,10 +3,9 @@
  * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
  */
 
-#include "bgfx_p.h"
-#include "image.h"
+#include "bimg_p.h"
 
-namespace bgfx
+namespace bimg
 {
 	static const ImageBlockInfo s_imageBlockInfo[] =
 	{
@@ -689,8 +688,8 @@ namespace bgfx
 
 	struct PackUnpack
 	{
-		bx::PackFn pack;
-		bx::UnpackFn unpack;
+		PackFn pack;
+		UnpackFn unpack;
 	};
 
 	static const PackUnpack s_packUnpack[] =
@@ -776,14 +775,14 @@ namespace bgfx
 
 	bool imageConvert(TextureFormat::Enum _dstFormat, TextureFormat::Enum _srcFormat)
 	{
-		bx::UnpackFn unpack = s_packUnpack[_srcFormat].unpack;
-		bx::PackFn   pack   = s_packUnpack[_dstFormat].pack;
+		UnpackFn unpack = s_packUnpack[_srcFormat].unpack;
+		PackFn   pack   = s_packUnpack[_dstFormat].pack;
 		return NULL != pack
 			&& NULL != unpack
 			;
 	}
 
-	void imageConvert(void* _dst, uint32_t _bpp, bx::PackFn _pack, const void* _src, bx::UnpackFn _unpack, uint32_t _size)
+	void imageConvert(void* _dst, uint32_t _bpp, PackFn _pack, const void* _src, UnpackFn _unpack, uint32_t _size)
 	{
 		const uint8_t* src = (uint8_t*)_src;
 		uint8_t* dst = (uint8_t*)_dst;
@@ -798,7 +797,7 @@ namespace bgfx
 		}
 	}
 
-	void imageConvert(void* _dst, uint32_t _dstBpp, bx::PackFn _pack, const void* _src, uint32_t _srcBpp, bx::UnpackFn _unpack, uint32_t _width, uint32_t _height, uint32_t _srcPitch)
+	void imageConvert(void* _dst, uint32_t _dstBpp, PackFn _pack, const void* _src, uint32_t _srcBpp, UnpackFn _unpack, uint32_t _width, uint32_t _height, uint32_t _srcPitch)
 	{
 		const uint8_t* src = (uint8_t*)_src;
 		uint8_t* dst = (uint8_t*)_dst;
@@ -818,8 +817,8 @@ namespace bgfx
 
 	bool imageConvert(void* _dst, TextureFormat::Enum _dstFormat, const void* _src, TextureFormat::Enum _srcFormat, uint32_t _width, uint32_t _height, uint32_t _srcPitch)
 	{
-		bx::UnpackFn unpack = s_packUnpack[_srcFormat].unpack;
-		bx::PackFn   pack   = s_packUnpack[_dstFormat].pack;
+		UnpackFn unpack = s_packUnpack[_srcFormat].unpack;
+		PackFn   pack   = s_packUnpack[_dstFormat].pack;
 		if (NULL == pack
 		||  NULL == unpack)
 		{
@@ -890,10 +889,19 @@ namespace bgfx
 		return output;
 	}
 
-	ImageContainer* imageParseBgfx(bx::AllocatorI* _allocator, const void* _src, uint32_t _size)
+	typedef bool (*ParseFn)(ImageContainer&, bx::ReaderSeekerI*);
+
+	template<uint32_t magicT, ParseFn parseFnT>
+	ImageContainer* imageParseT(bx::AllocatorI* _allocator, const void* _src, uint32_t _size)
 	{
+		bx::MemoryReader reader(_src, _size);
+
+		uint32_t magic;
+		bx::read(&reader, magic);
+
 		ImageContainer imageContainer;
-		if (!imageParse(imageContainer, _src, _size) )
+		if (magicT != magic
+		|| !parseFnT(imageContainer, &reader) )
 		{
 			return NULL;
 		}
@@ -2118,6 +2126,11 @@ namespace bgfx
 		return TextureFormat::Unknown != format;
 	}
 
+	ImageContainer* imageParseDds(bx::AllocatorI* _allocator, const void* _src, uint32_t _size)
+	{ // Memory-buffer entry point: imageParseT yields NULL unless the buffer starts with DDS_MAGIC and the reader-based imageParseDds overload succeeds.
+		return imageParseT<DDS_MAGIC, imageParseDds>(_allocator, _src, _size);
+	}
+
 // KTX
 #define KTX_MAGIC       BX_MAKEFOURCC(0xAB, 'K', 'T', 'X')
 #define KTX_HEADER_SIZE 64
@@ -2428,6 +2441,11 @@ namespace bgfx
 		return TextureFormat::Unknown != format;
 	}
 
+	ImageContainer* imageParseKtx(bx::AllocatorI* _allocator, const void* _src, uint32_t _size)
+	{ // Memory-buffer entry point: imageParseT yields NULL unless the buffer starts with KTX_MAGIC and the reader-based imageParseKtx overload succeeds.
+		return imageParseT<KTX_MAGIC, imageParseKtx>(_allocator, _src, _size);
+	}
+
 // PVR3
 #define PVR3_MAKE8CC(_a, _b, _c, _d, _e, _f, _g, _h) (uint64_t(BX_MAKEFOURCC(_a, _b, _c, _d) ) | (uint64_t(BX_MAKEFOURCC(_e, _f, _g, _h) )<<32) )
 
@@ -2579,6 +2597,11 @@ namespace bgfx
 		return TextureFormat::Unknown != format;
 	}
 
+	ImageContainer* imageParsePvr3(bx::AllocatorI* _allocator, const void* _src, uint32_t _size)
+	{ // Memory-buffer entry point: imageParseT yields NULL unless the buffer starts with PVR3_MAGIC and the reader-based imageParsePvr3 overload succeeds.
+		return imageParseT<PVR3_MAGIC, imageParsePvr3>(_allocator, _src, _size);
+	}
+
 	bool imageParse(ImageContainer& _imageContainer, bx::ReaderSeekerI* _reader)
 	{
 		uint32_t magic;
@@ -2596,7 +2619,7 @@ namespace bgfx
 		{
 			return imageParsePvr3(_imageContainer, _reader);
 		}
-		else if (BGFX_CHUNK_MAGIC_TEX == magic)
+		else if (BIMG_CHUNK_MAGIC_TEX == magic)
 		{
 			TextureCreate tc;
 			bx::read(_reader, tc);
@@ -3258,4 +3281,4 @@ namespace bgfx
 		}
 	}
 
-} // namespace bgfx
+} // namespace bimg
diff --git a/3rdparty/bimg/src/image_decode.cpp b/3rdparty/bimg/src/image_decode.cpp
new file mode 100644
index 0000000..a231567
--- /dev/null
+++ b/3rdparty/bimg/src/image_decode.cpp
@@ -0,0 +1,393 @@
+/*
+ * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "bimg_p.h"
+
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wunused-function")
+
+BX_PRAGMA_DIAGNOSTIC_PUSH()
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wtype-limits")
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wunused-parameter")
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wunused-value")
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG("-Wdeprecated-declarations")
+BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4100) // error C4100: '' : unreferenced formal parameter
+BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4505) // warning C4505: 'tinyexr::miniz::def_realloc_func': unreferenced local function has been removed
+#if BX_PLATFORM_EMSCRIPTEN
+#	include <compat/ctype.h>
+#endif // BX_PLATFORM_EMSCRIPTEN
+#define MINIZ_NO_ARCHIVE_APIS
+#define MINIZ_NO_STDIO
+#define TINYEXR_IMPLEMENTATION
+#include <tinyexr/tinyexr.h>
+BX_PRAGMA_DIAGNOSTIC_POP()
+
+BX_PRAGMA_DIAGNOSTIC_PUSH();
+BX_PRAGMA_DIAGNOSTIC_IGNORED_MSVC(4127) // warning C4127: conditional expression is constant
+#define LODEPNG_NO_COMPILE_ENCODER
+#define LODEPNG_NO_COMPILE_DISK
+#define LODEPNG_NO_COMPILE_ANCILLARY_CHUNKS
+#define LODEPNG_NO_COMPILE_ERROR_TEXT
+#define LODEPNG_NO_COMPILE_ALLOCATORS
+#define LODEPNG_NO_COMPILE_CPP
+#include <lodepng/lodepng.cpp>
+BX_PRAGMA_DIAGNOSTIC_POP();
+
+void* lodepng_malloc(size_t _size) // Allocation hook for lodepng (built with LODEPNG_NO_COMPILE_ALLOCATORS); STBI_MALLOC is mapped to it as well.
+{
+	return ::malloc(_size);
+}
+
+void* lodepng_realloc(void* _ptr, size_t _size) // Reallocation hook for lodepng; STBI_REALLOC is mapped to it as well.
+{
+	return ::realloc(_ptr, _size);
+}
+
+void lodepng_free(void* _ptr) // Release hook for lodepng; STBI_FREE is mapped to it as well.
+{
+	::free(_ptr);
+}
+
+BX_PRAGMA_DIAGNOSTIC_PUSH();
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wmissing-field-initializers");
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wshadow");
+BX_PRAGMA_DIAGNOSTIC_IGNORED_CLANG_GCC("-Wint-to-pointer-cast")
+BX_PRAGMA_DIAGNOSTIC_IGNORED_GCC("-Warray-bounds");
+#if BX_COMPILER_GCC >= 60000
+BX_PRAGMA_DIAGNOSTIC_IGNORED_GCC("-Wmisleading-indentation");
+BX_PRAGMA_DIAGNOSTIC_IGNORED_GCC("-Wshift-negative-value");
+#endif // BX_COMPILER_GCC >= 60000
+#define STBI_MALLOC(_size)        lodepng_malloc(_size)
+#define STBI_REALLOC(_ptr, _size) lodepng_realloc(_ptr, _size)
+#define STBI_FREE(_ptr)           lodepng_free(_ptr)
+#define STB_IMAGE_IMPLEMENTATION
+#include <stb/stb_image.h>
+BX_PRAGMA_DIAGNOSTIC_POP();
+
+namespace bimg
+{
+	static ImageContainer* imageParseLodePng(bx::AllocatorI* _allocator, const void* _data, uint32_t _size)
+	{ // Decodes a PNG via lodepng. Returns NULL when _data is not a PNG, fails to decode, or uses a layout not mapped below.
+		static const uint8_t pngMagic[] = { 0x89, 0x50, 0x4E, 0x47, 0x0d, 0x0a };
+
+		if (_size < sizeof(pngMagic) || 0 != bx::memCmp(_data, pngMagic, sizeof(pngMagic) ) )
+		{
+			return NULL; // Too short to hold the signature, or the signature does not match.
+		}
+
+		ImageContainer* output = NULL;
+		bimg::TextureFormat::Enum format = bimg::TextureFormat::Unknown; // Stays Unknown for layouts without a mapping.
+		uint32_t width  = 0;
+		uint32_t height = 0;
+
+		unsigned error;
+		LodePNGState state;
+		lodepng_state_init(&state);
+		state.decoder.color_convert = 0; // Keep the file's own color type/bit depth; `data` is then laid out as info_raw describes.
+
+		uint8_t* data = NULL;
+		error = lodepng_decode(&data, &width, &height, &state, (uint8_t*)_data, _size);
+
+		if (0 == error)
+		{
+			switch (state.info_raw.bitdepth)
+			{
+				case 8:
+					switch (state.info_raw.colortype)
+					{
+						case LCT_GREY:
+							format = bimg::TextureFormat::R8;
+							break;
+
+						case LCT_GREY_ALPHA:
+							format = bimg::TextureFormat::RG8;
+							break;
+
+						case LCT_RGB:
+							format = bimg::TextureFormat::RGB8;
+							break;
+
+						case LCT_RGBA:
+							format = bimg::TextureFormat::RGBA8;
+							break;
+
+						case LCT_PALETTE: // Indexed data has no direct texture format: leave Unknown.
+							break;
+					}
+					break;
+
+				case 16: // Samples are byte-swapped in place to host order before being used as 16-bit texels.
+					switch (state.info_raw.colortype)
+					{
+						case LCT_GREY:
+							for (uint32_t ii = 0, num = width*height; ii < num; ++ii)
+							{
+								uint16_t* rgba = (uint16_t*)data + ii;
+								rgba[0] = bx::toHostEndian(rgba[0], false);
+							}
+							format = bimg::TextureFormat::R16;
+							break;
+
+						case LCT_GREY_ALPHA:
+							for (uint32_t ii = 0, num = width*height; ii < num; ++ii)
+							{
+								uint16_t* rgba = (uint16_t*)data + ii*2;
+								rgba[0] = bx::toHostEndian(rgba[0], false);
+								rgba[1] = bx::toHostEndian(rgba[1], false);
+							}
+							format = bimg::TextureFormat::RG16;
+							break;
+
+						case LCT_RGBA:
+							for (uint32_t ii = 0, num = width*height; ii < num; ++ii)
+							{
+								uint16_t* rgba = (uint16_t*)data + ii*4;
+								rgba[0] = bx::toHostEndian(rgba[0], false);
+								rgba[1] = bx::toHostEndian(rgba[1], false);
+								rgba[2] = bx::toHostEndian(rgba[2], false);
+								rgba[3] = bx::toHostEndian(rgba[3], false);
+							}
+							format = bimg::TextureFormat::RGBA16;
+							break;
+
+						case LCT_RGB: // No mapping for 16-bit RGB or palette: leave Unknown.
+						case LCT_PALETTE:
+							break;
+					}
+					break;
+
+				default: // Bit depths other than 8/16 have no mapping: leave Unknown.
+					break;
+			}
+			// Never label unmapped data as some format: the copy would read past the decoded buffer. NULL lets the caller try another decoder.
+			output = bimg::TextureFormat::Unknown == format ? NULL : imageAlloc(_allocator
+				, format
+				, uint16_t(width)
+				, uint16_t(height)
+				, 0
+				, 1
+				, false
+				, false
+				, data
+				);
+		}
+
+		lodepng_state_cleanup(&state);
+		lodepng_free(data); // `data` is released here right after imageAlloc, so imageAlloc presumably copies it -- confirm.
+
+		return output;
+	}
+
+	static ImageContainer* imageParseTinyExr(bx::AllocatorI* _allocator, const void* _data, uint32_t _size)
+	{ // Decodes an EXR via tinyexr into an interleaved image. Returns NULL when _data is not an EXR, fails to load, or has no "R" channel.
+		EXRVersion exrVersion;
+		int result = ParseEXRVersionFromMemory(&exrVersion, (uint8_t*)_data, _size);
+		if (TINYEXR_SUCCESS != result)
+		{
+			return NULL;
+		}
+
+		bimg::TextureFormat::Enum format = bimg::TextureFormat::RGBA8;
+		uint32_t width  = 0;
+		uint32_t height = 0;
+
+		uint8_t* data = NULL; // Stays NULL unless an image with an "R" channel was loaded.
+		const char* err = NULL;
+		EXRHeader exrHeader;
+		result = ParseEXRHeaderFromMemory(&exrHeader, &exrVersion, (uint8_t*)_data, _size, &err);
+		if (TINYEXR_SUCCESS == result)
+		{
+			EXRImage exrImage;
+			InitEXRImage(&exrImage);
+
+			result = LoadEXRImageFromMemory(&exrImage, &exrHeader, (uint8_t*)_data, _size, &err);
+			if (TINYEXR_SUCCESS == result)
+			{
+				uint8_t idxR = UINT8_MAX; // UINT8_MAX marks "channel not present".
+				uint8_t idxG = UINT8_MAX;
+				uint8_t idxB = UINT8_MAX;
+				uint8_t idxA = UINT8_MAX;
+				for (uint8_t ii = 0, num = uint8_t(exrHeader.num_channels); ii < num; ++ii)
+				{
+					const EXRChannelInfo& channel = exrHeader.channels[ii];
+					if (UINT8_MAX == idxR
+					&&  0 == bx::strncmp(channel.name, "R") )
+					{
+						idxR = ii;
+					}
+					else if (UINT8_MAX == idxG
+					&&  0 == bx::strncmp(channel.name, "G") )
+					{
+						idxG = ii;
+					}
+					else if (UINT8_MAX == idxB
+					&&  0 == bx::strncmp(channel.name, "B") )
+					{
+						idxB = ii;
+					}
+					else if (UINT8_MAX == idxA
+					&&  0 == bx::strncmp(channel.name, "A") )
+					{
+						idxA = ii;
+					}
+				}
+
+				if (UINT8_MAX != idxR)
+				{
+					const bool asFloat = exrHeader.pixel_types[idxR] == TINYEXR_PIXELTYPE_FLOAT;
+					uint32_t dstBpp = asFloat ? 32 : 16;
+					format = asFloat ? TextureFormat::R32F : TextureFormat::R16F;
+					width  = uint32_t(exrImage.width);  // Record the decoded size; the container below is sized from it.
+					height = uint32_t(exrImage.height);
+					uint32_t stepG = 0; // A step of 0 keeps an absent channel pointing at the shared `zero` value.
+					uint32_t stepB = 0;
+					uint32_t stepA = 0;
+
+					if (UINT8_MAX != idxG)
+					{
+						// Green present: two-channel format.
+						dstBpp = asFloat ? 64 : 32;
+						format = asFloat ? TextureFormat::RG32F : TextureFormat::RG16F;
+						stepG  = 1;
+					}
+
+					if (UINT8_MAX != idxB)
+					{
+						// Blue present: promoted straight to a four-channel format.
+						dstBpp = asFloat ? 128 : 64;
+						format = asFloat ? TextureFormat::RGBA32F : TextureFormat::RGBA16F;
+						stepB  = 1;
+					}
+
+					if (UINT8_MAX != idxA)
+					{
+						// Alpha present: four-channel format.
+						dstBpp = asFloat ? 128 : 64;
+						format = asFloat ? TextureFormat::RGBA32F : TextureFormat::RGBA16F;
+						stepA  = 1;
+					}
+
+					data = (uint8_t*)BX_ALLOC(_allocator, exrImage.width * exrImage.height * dstBpp/8);
+
+					const float  zero = 0.0f;
+					const float* srcR = (const float*)(exrImage.images)[idxR]; // NOTE(review): planes are read as 32-bit floats even when asFloat is false; confirm tinyexr's layout for HALF channels.
+					const float* srcG = UINT8_MAX == idxG ? &zero : (const float*)(exrImage.images)[idxG];
+					const float* srcB = UINT8_MAX == idxB ? &zero : (const float*)(exrImage.images)[idxB];
+					const float* srcA = UINT8_MAX == idxA ? &zero : (const float*)(exrImage.images)[idxA];
+
+					const uint32_t bytesPerPixel = dstBpp/8;
+					for (uint32_t ii = 0, num = exrImage.width * exrImage.height; ii < num; ++ii)
+					{
+						float rgba[4] =
+						{
+							*srcR,
+							*srcG,
+							*srcB,
+							*srcA,
+						};
+						bx::memCopy(&data[ii * bytesPerPixel], rgba, bytesPerPixel); // Interleave: only the leading bytesPerPixel bytes are kept.
+
+						srcR += 1;
+						srcG += stepG;
+						srcB += stepB;
+						srcA += stepA;
+					}
+				}
+
+				FreeEXRImage(&exrImage);
+			}
+
+			FreeEXRHeader(&exrHeader);
+		}
+		// No pixels decoded: report failure with NULL rather than allocating an empty container.
+		ImageContainer* output = NULL == data ? NULL : imageAlloc(_allocator
+			, format
+			, uint16_t(width)
+			, uint16_t(height)
+			, 0
+			, 1
+			, false
+			, false
+			, data
+			);
+		BX_FREE(_allocator, data);
+
+		return output;
+	}
+
+	static ImageContainer* imageParseStbImage(bx::AllocatorI* _allocator, const void* _data, uint32_t _size)
+	{
+		const uint8_t* bytes = (const uint8_t*)_data;
+		const int numBytes   = (int)_size;
+		const bool hdr       = 0 != stbi_is_hdr_from_memory(bytes, numBytes);
+
+		// HDR input is requested as 4 float channels; everything else keeps the file's own channel count.
+		int imgWidth  = 0;
+		int imgHeight = 0;
+		int numComps  = 0;
+		void* pixels = hdr
+			? (void*)stbi_loadf_from_memory(bytes, numBytes, &imgWidth, &imgHeight, &numComps, 4)
+			: (void*)stbi_load_from_memory (bytes, numBytes, &imgWidth, &imgHeight, &numComps, 0)
+			;
+		if (NULL == pixels)
+		{
+			return NULL;
+		}
+
+		bimg::TextureFormat::Enum format = bimg::TextureFormat::RGBA8; // 4 components, and the fallback for any other count.
+		if (hdr)
+		{
+			format = bimg::TextureFormat::RGBA32F;
+		}
+		else
+		{
+			switch (numComps)
+			{
+			case 1:  format = bimg::TextureFormat::R8;   break;
+			case 2:  format = bimg::TextureFormat::RG8;  break;
+			case 3:  format = bimg::TextureFormat::RGB8; break;
+			default: break;
+			}
+		}
+
+		ImageContainer* output = imageAlloc(_allocator, format
+			, uint16_t(imgWidth), uint16_t(imgHeight)
+			, 0, 1, false, false
+			, pixels);
+		stbi_image_free(pixels);
+
+		return output;
+	}
+
+	ImageContainer* imageParse(bx::AllocatorI* _allocator, const void* _data, uint32_t _size, TextureFormat::Enum _dstFormat)
+	{
+		// Try each decoder in turn; the first one that returns a container wins.
+		ImageContainer* input = imageParseDds(_allocator, _data, _size);
+		if (NULL == input) { input = imageParseKtx     (_allocator, _data, _size); }
+		if (NULL == input) { input = imageParsePvr3    (_allocator, _data, _size); }
+		if (NULL == input) { input = imageParseLodePng (_allocator, _data, _size); }
+		if (NULL == input) { input = imageParseTinyExr (_allocator, _data, _size); }
+		if (NULL == input) { input = imageParseStbImage(_allocator, _data, _size); }
+		if (NULL == input)
+		{
+			return NULL;
+		}
+
+		// TextureFormat::Count means "keep whatever format the data was decoded to".
+		if (TextureFormat::Count == _dstFormat)
+		{
+			_dstFormat = input->m_format;
+		}
+
+		if (input->m_format == _dstFormat)
+		{
+			return input;
+		}
+
+		ImageContainer* converted = imageConvert(_allocator, _dstFormat, *input);
+		imageFree(input);
+		return converted;
+	}
+
+} // namespace bimg
diff --git a/3rdparty/bimg/src/image_encode.cpp b/3rdparty/bimg/src/image_encode.cpp
new file mode 100644
index 0000000..e06e5f9
--- /dev/null
+++ b/3rdparty/bimg/src/image_encode.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include "bimg_p.h"
+
+#include <libsquish/squish.h>
+#include <etc1/etc1.h>
+#include <etc2/ProcessRGB.hpp>
+#include <nvtt/nvtt.h>
+#include <pvrtc/PvrTcEncoder.h>
+#include <edtaa3/edtaa3func.h>
+
+namespace bimg
+{
+	bool imageEncodeFromRgba8(void* _dst, const void* _src, uint32_t _width, uint32_t _height, TextureFormat::Enum _format)
+	{ // Encodes an RGBA8 image (row stride _width*4 bytes) into _format at _dst. Returns true for formats handled here, otherwise the result of the generic imageConvert.
+		switch (_format)
+		{
+		case TextureFormat::BC1:
+		case TextureFormat::BC2:
+		case TextureFormat::BC3:
+		case TextureFormat::BC4:
+		case TextureFormat::BC5:
+			squish::CompressImage( (const uint8_t*)_src, _width, _height, _dst
+				, _format == TextureFormat::BC2 ? squish::kDxt3
+				: _format == TextureFormat::BC3 ? squish::kDxt5
+				: _format == TextureFormat::BC4 ? squish::kBc4
+				: _format == TextureFormat::BC5 ? squish::kBc5
+				:                                 squish::kDxt1 // Only BC1 reaches this last alternative.
+				);
+			return true;
+
+		case TextureFormat::BC6H:
+			nvtt::compressBC6H( (const uint8_t*)_src, _width, _height, 4, _dst);
+			return true;
+
+		case TextureFormat::BC7:
+			nvtt::compressBC7( (const uint8_t*)_src, _width, _height, 4, _dst);
+			return true;
+
+		case TextureFormat::ETC1:
+			etc1_encode_image( (const uint8_t*)_src, _width, _height, 4, _width*4, (uint8_t*)_dst); // presumably 4 = bytes per pixel and _width*4 = row stride -- confirm against etc1.h.
+			return true;
+
+		case TextureFormat::ETC2:
+			{
+				const uint32_t blockWidth  = (_width +3)/4; // Number of 4x4 blocks per row/column, rounded up.
+				const uint32_t blockHeight = (_height+3)/4;
+				const uint32_t pitch = _width*4;
+				const uint8_t* src = (const uint8_t*)_src;
+				uint64_t* dst = (uint64_t*)_dst; // One 64-bit word is written per block.
+				for (uint32_t yy = 0; yy < blockHeight; ++yy)
+				{
+					for (uint32_t xx = 0; xx < blockWidth; ++xx)
+					{
+						uint8_t block[4*4*4];
+						const uint8_t* ptr = &src[(yy*pitch+xx*4)*4]; // Top-left texel of block (xx, yy): 4 rows down per yy, 16 bytes across per xx.
+
+						for (uint32_t ii = 0; ii < 16; ++ii)
+						{ // BGRx
+							bx::memCopy(&block[ii*4], &ptr[(ii%4)*pitch + (ii&~3)], 4); // ii%4 picks the row, ii&~3 the byte offset of column ii/4: texels are gathered column by column.
+							bx::xchg(block[ii*4+0], block[ii*4+2]); // Swap the R and B bytes.
+						}
+						// NOTE(review): edge blocks are read in full even when _width/_height are not multiples of 4; confirm callers guarantee that.
+						*dst++ = ProcessRGB_ETC2(block);
+					}
+				}
+			}
+			return true;
+
+		case TextureFormat::PTC14:
+			{
+				using namespace Javelin;
+				RgbaBitmap bmp;
+				bmp.width  = _width;
+				bmp.height = _height;
+				bmp.data   = (uint8_t*)const_cast<void*>(_src); // The bitmap only borrows the caller's buffer.
+				PvrTcEncoder::EncodeRgb4Bpp(_dst, bmp);
+				bmp.data = NULL; // presumably prevents RgbaBitmap from releasing the borrowed buffer -- confirm.
+			}
+			return true;
+
+		case TextureFormat::PTC14A:
+			{
+				using namespace Javelin;
+				RgbaBitmap bmp;
+				bmp.width  = _width;
+				bmp.height = _height;
+				bmp.data   = (uint8_t*)const_cast<void*>(_src); // The bitmap only borrows the caller's buffer.
+				PvrTcEncoder::EncodeRgba4Bpp(_dst, bmp);
+				bmp.data = NULL; // presumably prevents RgbaBitmap from releasing the borrowed buffer -- confirm.
+			}
+			return true;
+
+		case TextureFormat::BGRA8:
+			imageSwizzleBgra8(_dst, _width, _height, _width*4, _src);
+			return true;
+
+		case TextureFormat::RGBA8:
+			bx::memCopy(_dst, _src, _width*_height*4); // Same format: plain copy.
+			return true;
+
+		default:
+			break;
+		}
+
+		return imageConvert(_dst, _format, _src, TextureFormat::RGBA8, _width, _height); // Everything else goes through the generic converter.
+	}
+
+	// Quantizes one float channel to 8 bits. Input is saturated to [0, 1] first (NaN maps to 0) because
+	// converting an out-of-range float to uint8_t is undefined; HDR sources can easily exceed 1.0.
+	static inline uint8_t unorm8FromFloat(float _value)
+	{
+		const float saturated = _value > 0.0f ? (_value < 1.0f ? _value : 1.0f) : 0.0f;
+		return uint8_t(saturated*255.0f + 0.5f);
+	}
+
+	// Converts _width*_height tightly packed RGBA32F texels at _src into RGBA8 texels at _dst.
+	static void rgba32fToRgba8(uint8_t* _dst, const void* _src, uint32_t _width, uint32_t _height)
+	{
+		const float* input = (const float*)_src;
+		for (uint32_t ii = 0, num = _width*_height*4; ii < num; ++ii)
+		{
+			_dst[ii] = unorm8FromFloat(input[ii]);
+		}
+	}
+
+	/// Encodes a tightly packed RGBA32F image into `_format`.
+	///
+	/// @param _allocator Used for the temporary RGBA8 copy needed by the BC5 path.
+	/// @param _dst Destination buffer for the encoded image.
+	/// @param _src Source texels, 4 floats (16 bytes) each.
+	/// @param _width,_height Image dimensions in texels.
+	/// @param _format Target format. RGBA8 and BC5 are quantized here; every other
+	///   format is forwarded to the generic imageConvert.
+	/// @returns true on success; for forwarded formats, whatever imageConvert returns.
+	///
+	bool imageEncodeFromRgba32f(bx::AllocatorI* _allocator, void* _dst, const void* _src, uint32_t _width, uint32_t _height, TextureFormat::Enum _format)
+	{
+		switch (_format)
+		{
+		case TextureFormat::RGBA8:
+			rgba32fToRgba8( (uint8_t*)_dst, _src, _width, _height);
+			return true;
+
+		case TextureFormat::BC5:
+			{
+				// imageEncodeFromRgba8 consumes 8-bit texels, so quantize into a scratch buffer first.
+				uint8_t* temp = (uint8_t*)BX_ALLOC(_allocator, _width*_height*4);
+				rgba32fToRgba8(temp, _src, _width, _height);
+
+				const bool encoded = imageEncodeFromRgba8(_dst, temp, _width, _height, _format);
+				BX_FREE(_allocator, temp);
+				return encoded;
+			}
+
+		default:
+			break;
+		}
+
+		return imageConvert(_dst, _format, _src, TextureFormat::RGBA32F, _width, _height);
+	}
+
+	void imageRgba32f11to01(void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, const void* _src)
+	{
+		// Applies out = in*0.5 + 0.5 to all four channels of every RGBA32F texel.
+		// Rows are _pitch bytes apart in both buffers; texels are 16 bytes apart within a row.
+		for (uint32_t row = 0; row < _height; ++row)
+		{
+			const uint32_t rowOffset = row*_pitch;
+			const float* srcTexel = (const float*)( (const uint8_t*)_src + rowOffset);
+			float*       dstTexel = (float*)( (uint8_t*)_dst + rowOffset);
+
+			for (uint32_t col = 0; col < _width; ++col, srcTexel += 4, dstTexel += 4)
+			{
+				for (uint32_t ch = 0; ch < 4; ++ch)
+				{
+					dstTexel[ch] = srcTexel[ch]*0.5f + 0.5f;
+				}
+			}
+		}
+	}
+
+	static void edtaa3(bx::AllocatorI* _allocator, double* _dst, uint32_t _width, uint32_t _height, double* _src)
+	{ // Wrapper around the external ::computegradient / ::edtaa3 routines: allocates their scratch buffers, runs them on _src, then clamps negative values in _dst to 0.
+		const uint32_t numPixels = _width*_height;
+
+		short* xdist = (short *)BX_ALLOC(_allocator, numPixels*sizeof(short) ); // Scratch buffers, one element per pixel, released before returning.
+		short* ydist = (short *)BX_ALLOC(_allocator, numPixels*sizeof(short) );
+		double* gx   = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
+		double* gy   = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
+
+		::computegradient(_src, _width, _height, gx, gy); // presumably fills gx/gy with the image gradient -- see edtaa3func.h.
+		::edtaa3(_src, gx, gy, _width, _height, xdist, ydist, _dst);
+
+		for (uint32_t ii = 0; ii < numPixels; ++ii)
+		{
+			if (_dst[ii] < 0.0)
+			{
+				_dst[ii] = 0.0;
+			}
+		}
+
+		BX_FREE(_allocator, xdist);
+		BX_FREE(_allocator, ydist);
+		BX_FREE(_allocator, gx);
+		BX_FREE(_allocator, gy);
+	}
+
+	inline double min(double _a, double _b)
+	{ // Smaller of the two; _a is returned when neither compares less (ties, NaN).
+		return _b < _a ? _b : _a;
+	}
+
+	inline double max(double _a, double _b)
+	{ // Larger of the two; _b is returned when _a does not compare greater (ties, NaN).
+		return _b < _a ? _a : _b;
+	}
+
+	inline double clamp(double _val, double _min, double _max)
+	{ // Caps _val at _max first, then raises the result to at least _min.
+		return min(_val, _max) > _min ? min(_val, _max) : _min;
+	}
+
+	void imageMakeDist(bx::AllocatorI* _allocator, void* _dst, uint32_t _width, uint32_t _height, uint32_t _pitch, float _edge, const void* _src)
+	{ // Builds an 8-bit distance field from an 8-bit source image using two edtaa3 passes (on the image and on its inverse).
+		const uint32_t numPixels = _width*_height;
+
+		double* imgIn   = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
+		double* outside = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
+		double* inside  = (double*)BX_ALLOC(_allocator, numPixels*sizeof(double) );
+
+		for (uint32_t yy = 0; yy < _height; ++yy)
+		{
+			const uint8_t* src = (const uint8_t*)_src + yy*_pitch; // _pitch is applied to source rows only.
+			double* dst = &imgIn[yy*_width];
+			for (uint32_t xx = 0; xx < _width; ++xx)
+			{
+				dst[xx] = double(src[xx])/255.0; // One byte per source pixel, scaled to [0, 1].
+			}
+		}
+
+		edtaa3(_allocator, outside, _width, _height, imgIn);
+
+		for (uint32_t ii = 0; ii < numPixels; ++ii)
+		{
+			imgIn[ii] = 1.0 - imgIn[ii]; // Invert the image for the second pass.
+		}
+
+		edtaa3(_allocator, inside, _width, _height, imgIn);
+
+		BX_FREE(_allocator, imgIn);
+
+		uint8_t* dst = (uint8_t*)_dst; // Output is written densely, one byte per pixel, without using _pitch.
+
+		double edgeOffset = _edge*0.5;
+		double invEdge = 1.0/_edge; // NOTE(review): _edge == 0 divides by zero here; confirm callers never pass it.
+
+		for (uint32_t ii = 0; ii < numPixels; ++ii)
+		{
+			double dist = clamp( ( (outside[ii] - inside[ii])+edgeOffset) * invEdge, 0.0, 1.0); // Signed difference, shifted by half the edge width and normalized to [0, 1].
+			dst[ii] = 255-uint8_t(dist * 255.0);
+		}
+
+		BX_FREE(_allocator, inside);
+		BX_FREE(_allocator, outside);
+	}
+
+} // namespace bimg
diff --git a/3rdparty/bimg/tools/texturec/texturec.cpp b/3rdparty/bimg/tools/texturec/texturec.cpp
new file mode 100644
index 0000000..e85566c
--- /dev/null
+++ b/3rdparty/bimg/tools/texturec/texturec.cpp
@@ -0,0 +1,427 @@
+/*
+ * Copyright 2011-2017 Branimir Karadzic. All rights reserved.
+ * License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause
+ */
+
+#include <stdio.h>
+#include <bx/allocator.h>
+#include <bx/readerwriter.h>
+#include <bx/endian.h>
+
+#include <bimg/decode.h>
+#include <bimg/encode.h>
+
+#if 0
+#	define BX_TRACE(_format, ...) fprintf(stderr, "" _format "\n", ##__VA_ARGS__)
+#endif // DEBUG
+
+#include <bx/bx.h>
+#include <bx/commandline.h>
+#include <bx/crtimpl.h>
+#include <bx/uint32_t.h>
+
+extern "C" {
+#include <iqa.h>
+}
+
+void help(const char* _error = NULL) // Prints the optional error message, then the banner and usage text, all to stderr.
+{
+	if (NULL != _error)
+	{
+		fprintf(stderr, "Error:\n%s\n\n", _error);
+	}
+
+	fprintf(stderr
+		, "texturec, bgfx texture compiler tool\n"
+		  "Copyright 2011-2017 Branimir Karadzic. All rights reserved.\n"
+		  "License: https://github.com/bkaradzic/bgfx#license-bsd-2-clause\n\n"
+		);
+
+	fprintf(stderr // NOTE(review): "Assesment" in the --iqa line below is misspelled in the emitted text; left untouched here.
+		, "Usage: texturec -f <in> -o <out> [-t <format>]\n"
+
+		  "\n"
+		  "Supported input file types:\n"
+		  "    *.png                  Portable Network Graphics\n"
+		  "    *.tga                  Targa\n"
+		  "    *.dds                  Direct Draw Surface\n"
+		  "    *.ktx                  Khronos Texture\n"
+		  "    *.pvr                  PowerVR\n"
+
+		  "\n"
+		  "Options:\n"
+		  "  -f <file path>           Input file path.\n"
+		  "  -o <file path>           Output file path (file will be written in KTX format).\n"
+		  "  -t <format>              Output format type (BC1/2/3/4/5, ETC1, PVR14, etc.).\n"
+		  "  -m, --mips               Generate mip-maps.\n"
+		  "  -n, --normalmap          Input texture is normal map.\n"
+		  "      --sdf <edge>         Compute SDF texture.\n"
+		  "      --iqa                Image Quality Assesment\n"
+
+		  "\n"
+		  "For additional information, see https://github.com/bkaradzic/bgfx\n"
+		);
+}
+
+int main(int _argc, const char* _argv[])
+{
+	bx::CommandLine cmdLine(_argc, _argv);
+
+	if (cmdLine.hasArg('h', "help") )
+	{
+		help();
+		return EXIT_FAILURE;
+	}
+
+	const char* inputFileName = cmdLine.findOption('f');
+	if (NULL == inputFileName)
+	{
+		help("Input file must be specified.");
+		return EXIT_FAILURE;
+	}
+
+	const char* outputFileName = cmdLine.findOption('o');
+	if (NULL == outputFileName)
+	{
+		help("Output file must be specified.");
+		return EXIT_FAILURE;
+	}
+
+	bool sdf = false;
+	double edge = 16.0;
+	const char* edgeOpt = cmdLine.findOption("sdf");
+	if (NULL != edgeOpt)
+	{
+		sdf  = true;
+		edge = atof(edgeOpt);
+	}
+	BX_UNUSED(sdf, edge);
+
+	const bool mips      = cmdLine.hasArg('m',  "mips");
+	const bool normalMap = cmdLine.hasArg('n',  "normalmap");
+	const bool iqa       = cmdLine.hasArg('\0', "iqa");
+
+	bx::CrtFileReader reader;
+	if (!bx::open(&reader, inputFileName) )
+	{
+		help("Failed to open input file.");
+		return EXIT_FAILURE;
+	}
+
+	bx::CrtAllocator allocator;
+
+	uint32_t inputSize = (uint32_t)bx::getSize(&reader);
+	uint8_t* inputData = (uint8_t*)BX_ALLOC(&allocator, inputSize);
+
+	bx::read(&reader, inputData, inputSize);
+	bx::close(&reader);
+
+	{
+		using namespace bimg;
+
+		ImageContainer* input = imageParse(&allocator, inputData, inputSize);
+
+		if (NULL != input)
+		{
+			BX_FREE(&allocator, inputData);
+
+			const char* type = cmdLine.findOption('t');
+			bimg::TextureFormat::Enum format = input->m_format;
+
+			if (NULL != type)
+			{
+				format = bimg::getFormat(type);
+
+				if (!isValid(format) )
+				{
+					help("Invalid format specified.");
+					return EXIT_FAILURE;
+				}
+			}
+
+			ImageContainer* output = NULL;
+
+			ImageMip mip;
+			if (imageGetRawData(*input, 0, 0, input->m_data, input->m_size, mip) )
+			{
+				uint8_t numMips = mips
+					? imageGetNumMips(format, mip.m_width, mip.m_height)
+					: 1
+					;
+
+				void* temp = NULL;
+
+				if (normalMap)
+				{
+					output = imageAlloc(&allocator, format, mip.m_width, mip.m_height, 0, 1, false, mips);
+
+					ImageMip dstMip;
+					imageGetRawData(*output, 0, 0, NULL, 0, dstMip);
+
+					if (mip.m_width  != dstMip.m_width
+					&&  mip.m_height != dstMip.m_height)
+					{
+						printf("Invalid input image size %dx%d, it must be at least %dx%d to be converted to %s format.\n"
+							, mip.m_width
+							, mip.m_height
+							, dstMip.m_width
+							, dstMip.m_height
+							, getName(format)
+							);
+						return EXIT_FAILURE;
+					}
+
+					uint32_t size = imageGetSize(
+						  NULL
+						, dstMip.m_width
+						, dstMip.m_height
+						, 0
+						, false
+						, false
+						, 1
+						, TextureFormat::RGBA32F
+						);
+					temp = BX_ALLOC(&allocator, size);
+					float* rgba = (float*)temp;
+					float* rgbaDst = (float*)BX_ALLOC(&allocator, size);
+
+					imageDecodeToRgba32f(&allocator
+						, rgba
+						, mip.m_data
+						, mip.m_width
+						, mip.m_height
+						, mip.m_width*mip.m_bpp/8
+						, mip.m_format
+						);
+
+					if (TextureFormat::BC5 != mip.m_format)
+					{
+						for (uint32_t yy = 0; yy < mip.m_height; ++yy)
+						{
+							for (uint32_t xx = 0; xx < mip.m_width; ++xx)
+							{
+								const uint32_t offset = (yy*mip.m_width + xx) * 4;
+								float* inout = &rgba[offset];
+								inout[0] = inout[0] * 2.0f - 1.0f;
+								inout[1] = inout[1] * 2.0f - 1.0f;
+								inout[2] = inout[2] * 2.0f - 1.0f;
+								inout[3] = inout[3] * 2.0f - 1.0f;
+							}
+						}
+					}
+
+					imageRgba32f11to01(rgbaDst, dstMip.m_width, dstMip.m_height, dstMip.m_width*16, rgba);
+					imageEncodeFromRgba32f(&allocator, output->m_data, rgbaDst, dstMip.m_width, dstMip.m_height, format);
+
+					for (uint8_t lod = 1; lod < numMips; ++lod)
+					{
+						imageRgba32fDownsample2x2NormalMap(rgba, dstMip.m_width, dstMip.m_height, dstMip.m_width*16, rgba);
+						imageRgba32f11to01(rgbaDst, dstMip.m_width, dstMip.m_height, dstMip.m_width*16, rgba);
+						imageGetRawData(*output, 0, lod, output->m_data, output->m_size, dstMip);
+						uint8_t* data = const_cast<uint8_t*>(dstMip.m_data);
+						imageEncodeFromRgba32f(&allocator, data, rgbaDst, dstMip.m_width, dstMip.m_height, format);
+					}
+
+					BX_FREE(&allocator, rgbaDst);
+				}
+				else if (8 != getBlockInfo(input->m_format).rBits)
+				{
+					output = imageAlloc(&allocator, format, mip.m_width, mip.m_height, 0, 1, false, mips);
+
+					ImageMip dstMip;
+					imageGetRawData(*output, 0, 0, NULL, 0, dstMip);
+
+					if (mip.m_width  != dstMip.m_width
+					&&  mip.m_height != dstMip.m_height)
+					{
+						printf("Invalid input image size %dx%d, it must be at least %dx%d to be converted to %s format.\n"
+							, mip.m_width
+							, mip.m_height
+							, dstMip.m_width
+							, dstMip.m_height
+							, getName(format)
+							);
+						return EXIT_FAILURE;
+					}
+
+					uint32_t size = imageGetSize(
+						  NULL
+						, dstMip.m_width
+						, dstMip.m_height
+						, 0
+						, false
+						, false
+						, 1
+						, TextureFormat::RGBA32F
+						);
+					temp = BX_ALLOC(&allocator, size);
+					float* rgba = (float*)temp;
+					float* rgbaDst = (float*)BX_ALLOC(&allocator, size);
+
+					imageDecodeToRgba32f(&allocator
+						, rgba
+						, mip.m_data
+						, mip.m_width
+						, mip.m_height
+						, mip.m_width*mip.m_bpp/8
+						, mip.m_format
+						);
+					imageEncodeFromRgba32f(&allocator, output->m_data, rgba, dstMip.m_width, dstMip.m_height, format);
+
+					imageRgba32fToLinear(rgba
+						, mip.m_width
+						, mip.m_height
+						, mip.m_width*mip.m_bpp/8
+						, rgba
+						);
+
+					for (uint8_t lod = 1; lod < numMips; ++lod)
+					{
+						imageRgba32fLinearDownsample2x2(rgba, dstMip.m_width, dstMip.m_height, dstMip.m_width*16, rgba);
+						imageGetRawData(*output, 0, lod, output->m_data, output->m_size, dstMip);
+						uint8_t* data = const_cast<uint8_t*>(dstMip.m_data);
+
+						imageRgba32fToGamma(rgbaDst
+							, mip.m_width
+							, mip.m_height
+							, mip.m_width*mip.m_bpp/8
+							, rgba
+							);
+
+						imageEncodeFromRgba32f(&allocator, data, rgbaDst, dstMip.m_width, dstMip.m_height, format);
+					}
+
+					BX_FREE(&allocator, rgbaDst);
+				}
+				else
+				{
+					output = imageAlloc(&allocator, format, mip.m_width, mip.m_height, 0, 1, false, mips);
+
+					ImageMip dstMip;
+					imageGetRawData(*output, 0, 0, NULL, 0, dstMip);
+
+					if (mip.m_width  != dstMip.m_width
+					&&  mip.m_height != dstMip.m_height)
+					{
+						printf("Invalid input image size %dx%d, it must be at least %dx%d to be converted to %s format.\n"
+							, mip.m_width
+							, mip.m_height
+							, dstMip.m_width
+							, dstMip.m_height
+							, getName(format)
+							);
+						return EXIT_FAILURE;
+					}
+
+					uint32_t size = imageGetSize(
+						  NULL
+						, dstMip.m_width
+						, dstMip.m_height
+						, 0
+						, false
+						, false
+						, 1
+						, TextureFormat::RGBA8
+						);
+					temp = BX_ALLOC(&allocator, size);
+					bx::memSet(temp, 0, size);
+					uint8_t* rgba = (uint8_t*)temp;
+
+					imageDecodeToRgba8(rgba
+						, mip.m_data
+						, mip.m_width
+						, mip.m_height
+						, mip.m_width*mip.m_bpp/8
+						, mip.m_format
+						);
+
+					void* ref = NULL;
+					if (iqa)
+					{
+						ref = BX_ALLOC(&allocator, size);
+						bx::memCopy(ref, rgba, size);
+					}
+
+					imageEncodeFromRgba8(output->m_data, rgba, dstMip.m_width, dstMip.m_height, format);
+
+					for (uint8_t lod = 1; lod < numMips; ++lod)
+					{
+						imageRgba8Downsample2x2(rgba, dstMip.m_width, dstMip.m_height, dstMip.m_width*4, rgba);
+						imageGetRawData(*output, 0, lod, output->m_data, output->m_size, dstMip);
+						uint8_t* data = const_cast<uint8_t*>(dstMip.m_data);
+						imageEncodeFromRgba8(data, rgba, dstMip.m_width, dstMip.m_height, format);
+					}
+
+					if (NULL != ref)
+					{
+						imageDecodeToRgba8(rgba
+							, output->m_data
+							, mip.m_width
+							, mip.m_height
+							, mip.m_width*mip.m_bpp/8
+							, format
+							);
+
+						static const iqa_ssim_args args =
+						{
+							0.39f,     // alpha
+							0.731f,    // beta
+							1.12f,     // gamma
+							187,       // L
+							0.025987f, // K1
+							0.0173f,   // K2
+							1          // factor
+						};
+
+						float result = iqa_ssim( (uint8_t*)ref
+								, rgba
+								, mip.m_width
+								, mip.m_height
+								, mip.m_width*mip.m_bpp/8
+								, 0
+								, &args
+								);
+						printf("%f\n", result);
+
+						BX_FREE(&allocator, ref);
+					}
+				}
+
+				BX_FREE(&allocator, temp);
+			}
+
+			if (NULL != output)
+			{
+				bx::CrtFileWriter writer;
+				if (bx::open(&writer, outputFileName) )
+				{
+					if (NULL != bx::stristr(outputFileName, ".ktx") )
+					{
+						imageWriteKtx(&writer, *output, output->m_data, output->m_size);
+					}
+
+					bx::close(&writer);
+				}
+				else
+				{
+					help("Failed to open output file.");
+					return EXIT_FAILURE;
+				}
+
+				imageFree(output);
+			}
+			else
+			{
+				help("No output generated.");
+				return EXIT_FAILURE;
+			}
+		}
+		else
+		{
+			help("Failed to load input file.");
+			return EXIT_FAILURE;
+		}
+	}
+
+	return EXIT_SUCCESS;
+}
diff --git a/3rdparty/bx/LICENSE b/3rdparty/bx/LICENSE
index f184463..dd17ed4 100644
--- a/3rdparty/bx/LICENSE
+++ b/3rdparty/bx/LICENSE
@@ -1,7 +1,5 @@
 Copyright 2010-2017 Branimir Karadzic. All rights reserved.
 
-https://github.com/bkaradzic/bx
-
 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
 
@@ -22,5 +20,3 @@ PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY
 WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 OF THE POSSIBILITY OF SUCH DAMAGE.
-
-https://github.com/bkaradzic/bx/blob/master/LICENSE
diff --git a/3rdparty/bx/include/bx/platform.h b/3rdparty/bx/include/bx/platform.h
index 8d8c340..7e7591d 100644
--- a/3rdparty/bx/include/bx/platform.h
+++ b/3rdparty/bx/include/bx/platform.h
@@ -239,7 +239,7 @@
 #	elif defined(__MINGW32__) || defined(__MINGW64__)
 #		undef  BX_CRT_MINGW
 #		define BX_CRT_MINGW 1
-#	elif defined(__apple_build_version__) || defined(__ORBIS__) || defined(__EMSCRIPTEN__)
+#	elif defined(__apple_build_version__) || defined(__ORBIS__) || defined(__EMSCRIPTEN__) || defined(__llvm__)
 #		undef  BX_CRT_LIBCXX
 #		define BX_CRT_LIBCXX 1
 #	endif //
diff --git a/3rdparty/bx/include/bx/readerwriter.h b/3rdparty/bx/include/bx/readerwriter.h
index 7fda6f0..0866c88 100644
--- a/3rdparty/bx/include/bx/readerwriter.h
+++ b/3rdparty/bx/include/bx/readerwriter.h
@@ -8,6 +8,7 @@
 
 #include "allocator.h"
 #include "error.h"
+#include "endian.h"
 #include "uint32_t.h"
 
 BX_ERROR_RESULT(BX_ERROR_READERWRITER_OPEN,         BX_MAKEFOURCC('R', 'W', 0, 1) );
diff --git a/3rdparty/bx/scripts/toolchain.lua b/3rdparty/bx/scripts/toolchain.lua
index ee461ed..0152ce7 100644
--- a/3rdparty/bx/scripts/toolchain.lua
+++ b/3rdparty/bx/scripts/toolchain.lua
@@ -1444,5 +1444,11 @@ function strip()
 --				.. "--preload-file ../../../examples/runtime@/"
 		}
 
+	configuration { "riscv" }
+		postbuildcommands {
+			"$(SILENT) echo Stripping symbols.",
+			"$(SILENT) $(FREEDOM_E_SDK)/toolchain/bin/riscv32-unknown-elf-strip -s \"$(TARGET)\""
+		}
+
 	configuration {} -- reset configuration
 end
diff --git a/3rdparty/bx/tools/bin/darwin/genie b/3rdparty/bx/tools/bin/darwin/genie
index 48552bab3e7f0af0e67f9792dcaa4e6ff3852328..3a2544bce09f28b30298e6aa64b1ba40b99c988a 100755
GIT binary patch
delta 32070
zcma*QcU)A*_y50pS#|+I6j4zTP_Q8yum@2gqGAV2>;-$T*wMwVSaH#@i;CD#QBdr$
zU`J!GSYu*0nqX9-;df@v>-tVUe|#T5ACJtL=j)t1ckayGxpQZi&HjyE`%ii;o?F<*
zV))NLpM;gQ+<ZI?1|!Mo&tz&a_;@5dsqc0@->B3Pni37y^KI3P^^1gSnvx%xSSGej
zH`XiSXJLKbYIYFYt{LmQZ#1j$B!s6Cn{3hMQnpa7jQbXL9!9Mqi8bY1OwKfRz8lUb
zDnx9joR72;@<LQ)eb{O6u{qf_aEDq5qJfwX8E;ocKO_{+LO!-_b`8F%b}UEw*z%|#
zUoQR_N?J_JwH{z9<6eWE%h52A#O!=$G}#WDN*in0?wVQ~Yuo(on-mU3MMvPYp|)xE
z6&xE{43+4NiznFj*f%hZwHR(EL=~|)m@5|gJ-b=ty#{qVt{o7)ZPWKrUbawklD+C7
zx^~}m2TN0<8<m+vM^~aEWd)NOS~4G(w|F~dzAJB8YqrHW)hjgH%VKCTx7-|MYR>ht
z9e1ixp^KtFBc|6iL@emyFy^m_dG?De7DL25L&VHuF<Sm*68c$eo&_3d6)d*i1<DsW
z^ILv?Slj$^Kbb8CS8qdpxufRv)dk#)+TFS7Y0e*v+ABNT=7K@moVm8U1@D)<qgf0w
z=6PWb(;vBZJJ~*B?ynKOPe&|p339hxalPXF)<|xjh`RQ_8ZEY+g_4|qkU}5{-y1FI
zoeM{l(=OdhPp<XQqPcWTpIQHeNh|Yndeepzjaum|>8HZS8H?7qVm2I=UH|-#=lS^&
z%c=0FNLy6nR@&#*>FXP>HEG|Uv4u5XuB|(rey{lfJ1zT!ZCPup*8D_z@yI@fwGXw^
zCr9se)EZSyuQK?kQJWcHdo$#dw!Nb5*-$U-(AIRzuyb|+$NcClbL3gdUChr9`s<)<
z=ZLuvpP$%FCtXl8GR15Nx+;ZL7j1=y6|uRFDyoGi*~*V9ulXg}T8+A=HCbn?Jo=)R
z?VFxAdV)oZ^0bYaR81@GnZ9+>J58%x!nSHkfHrq!`nf4L9kl4x>FZ~k?X*rS(znJR
z*EF+p`j7<?HMQ&?ZTn6%E4%obJ*x9zlB!Mts!of!`Ola9sH#(_gRShzA9~L3AZsP{
z3bFb5NB_ys&zCT%DFw~5f7{ATJslXapy>KgBMktTWHQn}Szg;b`<vhV3G81Ae>e80
zv`Wu9dDlrh(J(#f((iWKl?LevS9@#P>c+Oeu2;~upRjq~=&J=Zw#~W`pp`w5o^d15
zsLcttHM=!H^9oPTxK+-iJv*3w@1BdM-QJ)6@_sj?HX$>;>BCk=t<t{q6_17(wL=*;
z%abWu-Hi0vPiz(~B{kjiWfe^ux6Ri4^%5<8tL@=yFL%2@zRZu$7q-dI|HHMA{grLD
z!f(oId$-wYzB!=%u+8@FO}sXCt1b5JA;;re>{ZV-wiW)}E4};gUyNExN_yA#MortE
zZ0qr7Ep1Y=P5gOCTe~s6U2a#S^MDQZhEXM`SeJeq(hq;=Yb@@)o|^ymfp^rJAD<s8
z$LH+4&Q|AR8LfW>TbGYsnx$g;q>r_Y+S>Bz8J{BUw2;*{^OvF8#Z~EJzO-<1+Ld5$
zaNTDrWISuT;#D;Bk=eLT)2=KOQyq<sw7mJ^tfTQe?cDs#&yL1BChhI)%<Bb>G<NM~
zh(&HjFKv23vCGZ)o!8Y__J)40UG48G8++^z*Y5T^DK7`&Jfj?lnXW~QRgGG@i)i3(
zEU5{X%r5T6w{{-49BDwy2}_1PDTX^|<kZY)592~R%{V3VeJSIKVp_}onR}}lYne3r
z-kEP|8K1dphfinTY;G)P(fT$LU)vZfYJ(e!3T>%UrGCzA($;v;qV>qkwCiF#U0M?>
z#D(!jA8pJE@n$?-Q1}YrHG#P73K2@y&#=PYu;1WgF!+YYhrGB-wF#S^Z*`r$gK|xK
z{~R%|=!4~A21SM*UM?f!5V;VM<0f&`TIIE8@2k;4_rj*;=TpqM5lwraiJ0dyWx04d
z!5E`CEYIvb(O9d1w#8j6nQ7dmeJmmx&NBAbew!y!XBj)2<LB8MVhV`Ry+(K8Gu!B)
z8H$MD*~Zq|E;li6wy}aX%S~j=HfA`dxzd=ZEQi`oR}nMESW8=2I5TyQF~m-bD46+j
zp0TV^Yga%NTwrXe*%uJq78qM-nNDK+0%J|*I!<(KZFy`hC-HiL@u;@2KxXPfqlZz;
zHjC4Xj9$*GZT6T>Ht!O~Zbq-nf{TquowdZ-;?ip4743Ll=K3|popxG@&zVu{jn~cE
z;J2c}7Nb{@E>rX)8;gJB%vAD^%#K@(bL_RG*O^at7~5#Y4?i;*8U?-S=UUbN4rNDq
z%AtMhS!VMz<7cDRqGx8~J;o<xwTdS*{cjjOi+y+Zdn)qW^Wtq)d8cS`KvRZ&uoUIs
z$#0|dG}c|}iZQQ@Rkb%eGdI05jx}l@zRxWE##q-*Gkl-f`JFM^=<#8*Ne$Ys%Elgh
z<sjSQ%ocI&y|IIJtdBiCRmQ7yqDo^`Iz^>%DxId%87iHn(m5)PSLu9}E>!7al`c_f
zqDq&kbh%1bs&utV*Qzwhs>}wJCaW|>rCU_GO{J+S-Ko+vmF`h#hD!ITbiYaus`Ri*
zkE-;TN>8XXOQok(dRC?9eXR28vlU!Y>1CB(Rq1t=-c;#rmEKipj!GY>^pQ%RsPw5y
ze^cpmmA+KzYn8rL={uGFq0(F*dn=v(F9n}eny1n)D*Z>LhBC5>no3P7HLKJ?rA{h!
zR%t<%7E-C3O5IgjOr;(wEvZsZm3mQX%`fMppsY&$R9ar86;<l5(g2lKRcW9~tEsex
zN^7aKj!J`78luwrDh*ZX4=N3(w8etb_Fnde7mtelli$L+ptMoKbqNbfe=SY<vz-4d
z^M^VALFUsq|6b;kIsZoHmvjDw%+KfiuQDIU`NuLJ!}<F%KY;UhWZv49;f91r&R>!F
zaL!+p`8u3GC-VWEKPB@%oIft}?wmg&^A4OpAoE{}stemI^Y1vnTjrl~-nv7=ZHBEf
zpUwGAGJlNo>t#NJ^J`>&3+GqJ{A$iGmHEY-PmuW;oTtV%yco~<xiUY5^D||>XHl!F
z-Bbzf*fCk=BRD@n=0i9?M&<)KKT_uXI6qY8Jvcv5=AAj;SLO|zkCypdcXh6AGXI?O
zol4XF_acX(y>wjWe3Z;*alWO@@8^6|nNQ_>BbiU)e3;B9a=xC-&*A*{G9SzNnle9v
z^FcD-hw@e$bX6pDWJe{LZ^8L;G9SwMGBRI-^QC3JBIirUyeH?2%6uWt7nXSw=Urs}
zQxSFbjxzt6^Y)atzIeo7l#c71|LRHkvz-4d^M^VALFUsq|6b;kIsZoHmvjDw%+Kfi
zuQDIU`NuLJ!}<F%KS1TJrSC}S%8nZ{AIbSEG9S+Qi!xt_^XFtffb*wh-iPzYW!|0h
zM`YfC^9N-9iyPksGXIYAyJh~Vm0^d3+nnDj^Vyu=B=g5Ozh34uIKM{bw{U)i%&+GB
zQkh@O`2?At!TAL;Kc4e*Wqt_fXSSEnlVPgNx8wX|nUCQ71ep)v{1}-J<ormP_v8Ff
znfKuQK+nvPpN*HxR1XUBsn~n?(4hlke5&=US-W4~TGgX_^{HN~cJIKxy?XWO7abT_
zvqp4a?`k#r)u>UUcAq|dYh}ii(mMVm7TwicOk<;mM2n~#twQFxyIRYVcF`jSih|Fy
zs+n#dw7uJfPm*29LSsqnGjP=CipYqxb#|5MND3V>*Uqazg(_9xsVCmgvGWmKm)kjJ
zUQV*Ju5kD<EV5~M(>85IhhS4__irWj?$^JnURrptsZSw2u9A;m#mMGO+AfVVl@e!y
zO@l?7B4#&B4_VJjl`<#f*;UcR<*Fu=I9uA}DlUJqGl@=trc#-<dL~-N`-nLkO~c82
z+-O=wW=XQCmz__A5K(xO$z3>YG96SDm|`j^9&IvJ`5#qEG0DRVZ>G2nDW>{*J~zeW
zN%=yXO;Kc`H=B0W^^vvjHzd0M=&Efe4DB_1$Y`hNVSRjhgf(o}v~{cC(PMg5tX|1y
zN<E*l)dGWRmvxF6(07<qj}aBC2l$A}TTDZZPU8I*QxWm*w7t9dX^Y9;=#!bX#q^`|
z|Id=^{l7}Ccg(&dEhtSTM30Ro^Xz7OO_|O1+V^>onPxLDH5ogK#>>q=6qK6{k+hnY
z|6R||T(;c2(&Feg;4{5V=I6f=Vf)SAqGOWT$MHI}c#*Kv?CDJ5wFYvOvlzMFT*_q#
zNhKjQVJVTM?;&}JsaMV3j+KDk9PyMQd?5KN$?Qz;`(+1;LF>)s9gD#=M!Y|4_SSk2
z6bGf;b`YO=5*Lz6N2EX+D++I;>XDKQrLwrPkE-{Dq>qrw@|pbT73uvT(fPF5*O{^p
z;6>G9NV);3sEE!pd)kw7Vvxw&U@os+93*R4*5x&6$NzhvJMGr&htx?{r<6+wNiQHp
ziLmojLsHxZ^JOd*>o=QyT@pze2dR=UW^vGXD6PfF99GsrX(b*%qDs+T+o{1~P%^d4
zNqFXQfqls3HH6O*#!WS!q}q@wv6Mkl9Y_Hz<&YEtsRT<VdI75fX#}@u07;48q7t@K
z?MRskrL;QE981!~A>!^Ps@^cT&?wpDS_i2g7nw`aeMt4W={(H_!=I2+#Yjg>@nt*B
zCYNxM#|`Cc+rW`SNO}*+musC!(rZX1<N)+^Cgna98gR+fIZsJ?2x%O*oICANmLJBY
z*A&OMm`j;MNUA<etlVs_>No(dMqKMea?ONPk0rUuxe`)quHI9Ujzj7!uB6kMogL`f
zAdM7L_fYjn2^r2O?k^r6r4x50Y0_|Ua0|8IBDh9#H8aR{6;hC@rb`Y<_aOD-C}&55
zq3{TO7^s_}21)*qVz|a#Ng4}ju&S|n21y4-h$mZVycZiOoVHO7=8WWmx~R&ze4&s-
zkcNnauXIs<P6k6hB(s?M6OHV4B;6Rr2NvZ<k0t2@qzpOD_`0N$)Oa*URptJ>PErd<
zPU=AO7m})u7M-`7t7`3HMErKDRLvMZsy<gLmNv|qL+T}h4pFm`G80NA@m{kiWfl}3
zsxG+{Gzn5+KEIsj`ao(T+sV_Jl<82qa4!xaX&<Cy9`%VNRU0GzNTrs$Hb#`(L6vkG
z%avRuqVCb8`lx`#L&WVcJ7_(}iufHgs$3^<z-JNVVks`tKGHCcq51I<^h5gScW@j%
zkwaf{M`ck|%}Ju#PCCt`N$i@fI^XO`)85rdB3`<h#IkED_xKQUB|!?{(UVA0GNf@L
z>X_M2&ZMuP)aCJ%OVVGEmWVPtsLwrV7VJ5hOW(-Xm$u97;CKyMC$3B^g?*VUUhbmG
zl$^rhkzAQ<a{U0QIo~k3B=v<<jiWqiiX0B96}NskNlPJxqC-g90cjYQCFj;ZAvI8S
zHXkO*$twJJQ`4`oa%Jjpv^&jzTOl=MDTJgfNOWfTM3K|$OGpKIRLkl0GNhHV9eiC5
zlN21s#p@3}l7b-dBy5&5XSFynaSyf3X}AV+J-d?2ajH1Khw6D`8W%Kzd*U#;zCv2h
z^?XXw*y$Wavy7~#obt9nDlf;Ungmmzc*{qtr#;22nJ&(!)A{$qGl@$~C0BtNTw-r2
zslyvX=^&fDxagTd19{90F(`v-Ie#WcXY=4_M-yM*ESCJl;|tU%Gf48A&C)8qmJE^>
zL0ZNK=8*IilHgG_VlSPi&m1vvFP&%W94_H^F?Fxm*F1!#z5#Q^UFn)1&#sz0hh~v$
zMZBoCk1j6(u7*5axzh}m3aNsK-cGk>q$}N;Ba-M$^X7^3@|g1TxwyJK!3>e>8c1<0
z$)!#LqzrDj!z7hnzy}WDu6RmPG$cGl<qftK5<Ssr#&@Yf($A0vaV@)&RCpnmHG(g9
zK1opvMeu&QT-QbHYKzm5D;UyrwMvldo25_&@V#Dxq#KY*h@eZ<EqzEjx|oZK;JVBw
z=>ep+EM<`7n7~oZm1NE#DIh__AE37C3fBhi*BW%2U4pbsU5vRaNg+#w(?P0vy+kgk
zD#|8TLrAmuVsc1Y56Ok6WfR>%vmoK=8bHzxNPW5e+mZAV(pXiNIhLe4ONIX-s^{gU
zeCAM*aMS#q>hEt_5%K;Pb8Rhosklm^+7HWku_ybz)-paUjOQBJ@2w%Z^Afi%usK)u
zDMfULREj0p=TVT<>e*YXxlFt~Otq>F*@GvP85CIo(o~*MGDsQ^shuj%oI_IbGO^?c
z6>ti!pSckNsAq-<u5wQ$ncI<+EQHfhicW>giyxEZ*#M;?S3u6oy407S=3hv<wVaC#
z;tKdtA3a(w?#dz?tzcJMzSbCWjewLPcAuf?gA|XI9MqmiL>5VnR*H!~$y;|7yV`O0
zds3I}hvXy5oTKAONrS>Os(A=Wn^%dl$EdSXSBqxHsGPQIxSWIBY`GM2AJQZgO><@7
zTD>B&XnRt+trdAxj$@s5?7@ASNUk_Y)z#`;2G3h3MjfY<CMJoM$7x0KVuLtO>^OcS
zAKYAAxk)#cT$(;lX6Y^0XEjMFn^-C;j2n4cwL#g;z4w}e+)_9wj+<I8O$Vij^C##m
ziJRHgo13~1xqgO3!&A=v^GSLRsjDoLrs|XQHfP$xM>Y|=FVkC@FOaG$kCNlOF{GJ1
zXyiEGxJBfhq|>g0YrR^Kh|ny0(tB*>0_i?}L<M>Q%kph4XEZ&69szQalQWt-qU!Kz
z<zV-N^eZ<&DoKwaP2n3g?xxvMOg%~S^oUb5Mc3aZCZ3}1?!KKXP*=QnqEE+10!xeI
z>*=E3$<ThOB9B6~iMxdVX|k=-<hx&a$GT~JXdSLu44tz%Bv<v~C-2v4P&RVKvPjwi
z3CqscB%OrRoL?H;=~nqIO*A`WuId!GhtKl6K}@`BE^Ce<UxRe9?F`jmXgd3Rd1}rg
z*8oUMFnmcm52+!BFWof1LYgimb!UZUa8b?p!7+q_dO?a)4-S{rB<+FJQ%=(Ve&S_k
zh?3{1Mt>pzGp-yFxBn~25pe{PGtYeMfpvL2KBb5ZNE7(VWJ}ub<r+R$S7r_&>BwFY
ze4fsF53X<#cA0LQL~<n^<mexG93Cd=;6ZUvMrR&k7q5a{<TGx<VU~R4?Bywykx-iP
zONM;J&4ctKcjbJN4nV@jg?z-_hBTDBQa<8J9}#yi(7>s6R5)eRn0tPV&(=+b&>(wA
zcb5He);7u>@pg;@_UDOCw&PF7#d&#j!4n+$w+PCmg*JKaK$*k?F_)x(lN^++mP7?T
z>GHQgx?~W+ug$*pBsI<w+b&YYHl7i8FH*}?Jj+L|;j?B_$eOdl{}P?`(m8g$<8#P8
zkY(pt>d51&1xcqNxpPm-vAPG+G=4CwCdv7NUO9OMa(g81g7E*DDiD#)o-N$Oo;2jn
zLaOy`sNRIqpU*3YYSD{&Rn%Nn=(X8Lt8!5|U8ak3{#gW*bsTk#V@~qYGtSQ9qkXt0
zwq2(4=E8@Ehiq=2>m1&Vt0ISOdq}fck_)vANCUVoa_Htl8qaMYhi>={E-RU%<g|7S
zQm9%oIO-T6A64FtS@7oYnPhzq-{dk%s$$KuK09xUiC3xiX>h4kI&DlS7f09UGs;S2
zA&Pdf<QOOAD3nutgZCk+?JYj;052Bhos@G+OuR<zn0QBQyGCbQcZWk(iUiH#tKQ)P
z9<WqNKDPYi?Nja{E4_F%7fVtHNWbv=Nis=pk3{@+>Y9+p99>L}5wlz=_&*j;r0eN#
z?CQ(~$u(fXXDrp=Ynwq*Nl4Xs_DUtG9HjZYMwQFKsn7Yq5FW#FIk*&3KYn$P%fV}q
zo{Fg7<<sH}y-#-~efA4Jdj9{N`!2i?@i)!&wW}}0`I}Vb%P%>w53ZfM!}N-!0et3U
zlENX?l=}+aj!{4_9yRhFZUX5d-;MGfe*H?EzeRm<<h97VMSYR}n#<k9r%I-fif>q|
z%=;CjREC24C6}Z@kOp&oJgLEVd^?UF?P>~4gMu+Mgn~{(>WrI$q)(8hatj_NDe5g(
zAwdM~=KFikTe0#E)%O@Y!F=8j>XfUHqWSnfBo+9bkMAq4*wF}+FX5r@SPByfpJ;}c
z3%{sbmKuo1^cv@qOH$7dEcx+VBiDdkAz}QqAZZq)Zd{2Nk`6(t#*_AHl1_aP&F)ce
z+Wz7)@unfygDW6mNkX5h7?ysMRO^wF07c*yCdu%Ti&C4-rCj9Fuoa~0YAl=On|Y^?
zVo(mX)k3&B@S7Md>5f2Y`R%#ETS1Q5aE#=KehWI4%O^fpd(6Zn1w*1&2im_huO`Xs
zlbCp)E^-T8v$(Qy^qD_%0kIe&G-^6SY9@^NwCE)z_OmGafQ}Ee&r>`9p1e{gj~|o?
zT%ug}jDaNhCVfrPQ%D&+-N<n`z%25pNbROY_&=m($hC0Mg+<|m)Xj3*xZ%iBPo4m>
zNpf*w2{%|SNi89D|JGVQj)S4ykw<zvdKTcLy7M*4X|@fd0G8x5`{Or>HrKr!I|F%V
zU3Tl#0^;r?s%p}=z;Ey4$&kL|;=0l`Z~t%943c(1;@4uARFbYjYQhIzC+R(;tNaT2
zg``!^d;xrmn&ne@x3ehugep_cg<V*^#gc2xH%WbPkV?`iNDD-nC$tnf_Jr2t{ssBS
z@6^rU@`b{JAkhSGXYpRPou<S9y5Ia=IkJpU+xVpTLP7Jzl2idwWxkT+e;pUVa}+66
z5!6owndQwq#8tff#a!R96I?xb*o0GqFNTCy+Zd7_LvrI+@N`#k;wiO4a$&9^FB;Xp
zXjeCudhqCxw}B0kjpv{&lKz3ThgX~Oj)*HFR{lz5-YLQbMDx>*y1}U^D}#6{iX|z$
zn0^+ukxa@YDBUq?N!kr*B%fE#%z2P5i^A{uQM$4?mo!w(@#;1FR&kN{8`Z?ugI)Xh
z0WLdqE2K*zh(4cEJ1k8~ifPZNA!~SYbTzfcu&eKxCkC%Kmr<{6)!?hmbAufAVV)xR
zIo;cXO7mfOk$g?AFQvtj=XCxwFLwEgE5~_Bu^md%{|3qsx~ft>95@KYlhn;e1izsB
z{G5*%MAq?!ABW7q+$~34IhN{*;{_}Y9R12O;^ng;up!Uc^5*ScUUYs*Wo@X$k$9Vx
z{douyfA(UQS8=<Nxcic>qLe>J*WdvfLY1ok32&HkWmFqdQ|`X`Bz1(uBgvdW(jtGc
z;T4s+r81W};M;v=axu{*J%>I<PK(8bT&Ozw1aNFyk+6dn97z|=_RdtjhE-YqTaG|)
zK^x;HmsrvZe#d%i-XlmQsY)PA^kI<P(aa%fSfKdhH8n+KFuUf7uopCa%8T0$$;wX)
z**|%Z{<WX~52PV#KVMGv!F9#mH`HT0Lxj^?y2ceDT=W;-oNYmu*S9`Pjd*P+CmrX8
zELnsxm+B_R=r7?S?@zP8OAh5{H(_^o-XwCS8-H&EOSO63Q-h?e2+{0!x|-Og><U3A
zkZUodNFG`lB<*i1&QrA0lV%(}$RO^XGM99bJ3xNTS!*bw&dBy4sdfvN=5iCsO`*k*
z%B#hrMXm>CwagQxcF^sTMR24gAF@pK&9e8jOW@l|IK8JTc5TJ3-+4*ajvf%DTC-G$
zpT)5x*+E(;-tVG2hm?U(>i%oQ$d5W>q1WWrkRNr<Z7rPsppMubDT4o?j_`=$@@dZa
zVy?|wl1Xh?E6Ohv3xL&ljLVxvKx)AAs=VcQLh8hy;gE6*ioe|ArbQM>SK4yPOSmXc
z8X0xlu@o*p<o9;$0BnpqlOo2nlU-g_E73vt=Ta;E(t(dq8xW;j<jhyUqh4dRiCUzi
zIFT#wPRO-*UXV4f1j&IbC1-GBCqB3)kAf_c3U(5MKG3PsJF}|-j~Q9tJ&@Y`YiW=L
z?XX<jcxxTI2>-um4At$*M`svBGr9eeOeO!+Rm79eu~s+sx$;TBkgIez@kF{Jd$X$x
zH$gi(>Ga+r_#?Tl_h%QD1oB~j4^rH>kMJBEs}JBXUahI`Aryzylpiwzbo&Gj<fw_f
zrxy5$M&h)ABKVWJJbl|F;Ui5PaZ(#UNW_1llZOxI_?!H4^GNz$4i_(_&oG*OG)(2^
z+wxWG`6!nD;;HG~9<#kY$vel0QJ?9ELKE3N`d?GjYx1{+K9*+~xei|ei8mx%!bvJR
ziH}yF{P~*YI(*I~(K(OyMR!jU)5to0n#>_v#O`Z!n^45~DXg^r*D2)!d=2#4;>sQx
zs4YlJwsPp*f4#`bd#_a->pQr1sU%&1bY2*b)1_Z0$zwW4VXFB;QUIg|+~4%)ABKk0
zh0_;vV@Ky198EWc{Gh4@4TWJdS(?u?jhwpA&tfV4UzL~hcgxwV_u<VZIe$-u)Qs29
zCK@FhA(i84B!Hx(Z&7jqCMW3=vqk(@YPqlQB#Y>mR3SOaM$Y52c;X^S8b6<<eq7`i
zlJ-HO<*ghAa!Gj~QZ7%B?MPa`fDio2v)l}lhAm{N-v4eYZSX>o_Yc)39zF~AP7GaO
z^db(wBYTI|^&~khW@#=jhq6f;4Cy@g;pKc<pV}vI*bAj4S{$^a2}_+URUM;k?1+~O
zWVOR_5=xG`_sy~u#w9L|x6pT95;?XhZyCyGbIx*>w(+xBZFo0c!CL2UUtbyzr6x}&
zaz2^~sVldloJkKrdc>D<n52;_xv2l#ETm^ZKf;|a=clk$92)VjrfjiF^wKQlwd!j`
zyhaxrvz}vmtM*g#(&qKzC51TdfU6yk9XV;=fmG#xllB8h^m&>5C|2%=Ir?qjLWlDt
zIG8?`u(Qw_ZKKE|>-bADyXT54+4L;GP6dZ-5}i%ts-D6wnw#V{y&RnFAZ=D}2Xb&;
zOA&WXbPZopgl2E4ug%#kez2zs-`~s!W3H2JdK}Uv?s%VEUKI7%!jT2|rjkP>cq>ao
z@G%ofW4E#NoLk?IPF-s|OLRBNkJsgdyadwU!uX7bOW#xu@(_jPKC&DxsXJMURLd8a
z9FnrrSUSNAa%UPk-FJ&>7P_#xyG3WRj+=LL2z?(#zG}<cW5;f>%|dM%n=bBJER7w%
zT;>q0W=maREW@{*Yyx*k>YE?FE~`nhLi+cIkGeMWUrWkV#3jor{(GO^;i?$qXsPe`
z-Bmsvzb-i%fj{!%!@-HhLUCxF^zV<*x2LEI5s-uAi`IAQt0<1&L&IlOEhy|Gq=srd
znB^Vsc1^r=qDCrlO}G`HMp}H0>k#*^`|UEcQUALbO}j1*7N8b$y}|LgP<4NXKy&2J
zYkcgf;`TQ=jy7}Uz)o@|tpv2+#A6qFmB_rsA+%_8wA9wl+!D=Ps8!xU_UA>@AHW`R
z!YnQ>yU<iq>5jPTLY1-J<r4nW6VTrB-P7ovh%ZPz?tfn#EJ)Wo^Z_4oLEb&y!oij5
z^$wc)W`^o0A4iXe9Jhj>67-E9AIE{vV)%o%y}<f%#wot6kR{AYe{4u&^V>iDPjDmg
zr?_2V2f)U_#=*{qT@ITJn+AIr_AKmm*hjF|*T{T=HPK)9$O;vL^@ObmTLU%}wgqfQ
z*gmi$U}ItDz$U^bv9%gfA?%0Eg1roz1N$5{7uG;?zpSV;tOu+gY#?k1Yy@mO*q+K-
zX~P1-c-R@Ri(yy8Zh_5!JqDW%dmHvC>^s;munzQziL9_YtdF#GenS9+I<Voek+5B1
z2f)U_#=*{qT@ITJn+AIr_AKmm*hgfovPQ2Ve1bKZF$Q2gVJpJcfDMIh0oxI_59|oo
zSlBtRiLgm#&8nIt6~cblEZEDiIk3-Rb72km!*pj@4_KQMUqDn_{_FRZ;49#*;F;~H
z{bdWJL3rPec^^2sJ@ZNMF7SD9i67a24Llco8$7N9`|pF#gMR_Hw07ix7ZCP>e+OUc
z#Qsm<Vx5`u!2`kO0(`|8U>C4+7mjxa?*x|tw+!#f0sauibZ4#(-UwcZ6H;TTe-r-?
z&y+0xguh-392o`M2{szGKkN|LF|cvc(x1u?t>D7fa$yozr7E~sEOSlpDsVmUPvCHH
zPx|9$*||-@3&E|yX_D!L)|(LaVKn4`%~QF67vM1PdvNz@?EehjFq2s;$QO_@i&-B{
zhrk8l|C8BjC;>s6%?FeNZvuY@{skNYt~!U~!@-~CGV8tlES|X~{PpHBcLeuToY2P2
zYJ;*Ib{(wVX~|&pzCrK)wr#k?orU?5N7EaRZ1a8K8R)}f;NEN4e;T~<vO2z%;T(hj
z9}dU{XZ2*h2A-71C%mou_p)E_!{M?IWv80mxJCi=x5%<a&fr9_8~7a91N;oU#(|H|
zxW>nO!M~T5&$7I7;A`Mi#E-wB{8bH=5wMX3gMJ4&-Q)tQ!#_sy|EtCYp6~&B4ZjEL
zXY2+pP=sqd6<i4H@ymZtxDf0C|8eGmhAX%PucxR{X+)HPEeETQf=b{3Sbfy!{$bEZ
z!H$QW1RDpdm-`GAn9aJ?Fb@HVuq$9!!}cn~m;43?u7kc2HU(B6);qvY5Wfe!ANDBh
zab;;JWkEO#dja+m>=oGSu(x3E!rq7d1@<-UU$9?c^%iSelrN~=-_!+kenSTcd!#_z
z75o4k4R-p%{(j(2;KAS|@NjT8cr^HL@HlXVuk_j?PdF9AC=w*k1|I;=2haP5{foip
zz=`0#`Rrc~{uR7h_fzSz{Po~J<n>Eo6NEBG4oC$z)0i{CgTaTu0(=a70<7PMZ^1R;
zr&Fnab-r;a7V3Qym?I?1N1b(9aqg{a7Ou)#a6bp;tzZvF=7nG`hrU<)f4D<2E>HKT
z{U867|FYQSXL0kSUpA1<^x2iqF%5jCFfmOlrTaLu<0Nbu7v?3>%6X@A2`-@rY+u+R
zuw!5+!_I<T1iKt|J?u8vblAhN$6-&wo?~k@T!e51_6F=7*axt`z&?Y04f_r@7xokE
zS6KS74An&Tu+FfBl%>0@2!vv=C1AZ^%fnWNtpQsXwjpeD*mkhpVEe)jg^ht#{VebD
z$q+{SaNkS^?*q>P`_rC??Cb>aU~qZNXKCOS@Rusb@k!v;;LYIK#6hyaoe(Y~U_bb~
z@?5|X@Q4b;G-$g02|@pDs1l{PjZ49nh4q8|5w<gIFW3RFBVpBv<*2OX#RbMia}DZ(
zZM~R7!A*KIHv+!}Hv@kKM}d0|;`ol>L*Q=Ukm1U2HAF)g%z~jm_!XFbQb_g1NAO5+
z=tw@n1n@!dWN_Rl_D=^JMl;U=e;Fs)l{y*1)$t@ylMFTa@)@^b5mf-Zd+mQqj9<sB
z7f>XLSsw-W!EQMI^m_Ie2d~&jEc@V<H=j^{ynaT21Ag4g=*ND2v8J!`^vAXy=YTj@
zST*h`S%#&*C(6}VB(q&rxmH6_98e0@2ev$H0BkkbI<WO&!(p4lwt?*k+XJ>Q>|ofD
z%C@GRb_kPUr@_vHt?AD#u<jE#VJP?~`e;nvS1rJ<AJC*7A@q330o}pb;QruSuh>5V
z`~o}{JOVr!+~76G&j4q>VO|KX^j5Leuo6NZ0+PXJIl!<Jd=3R307v}J1^xs+2tE&X
zeb4^O;8^et@XSA?pN9242zR7Fqs-+IggO7P(0+S4Zq(}gxX3-=<C)C*R8ZgmvwrI)
zf)6A9&_VXEldK-t6)JPdtAg1!Z^1UN727{LvVGT$ZTY@zHw<FyJ)CXtacoP|N9D4c
z3vFUMZwuRMzp?E=zXT`!69SdB8X7cb;pP~&OD3`%y^w9ta<=_owYAI>QrOPg#kTh$
zw%w1j?N48=lP4N{o^9C+Y<(`X-Fa17dels~#)9Pz+f~1?E&ht_P+yZO$fE(<we;0H
zS#Zf-Y=cL!eLtP;*24D6UxYr*lX3EgVx%>fXS=a7+kru3<)bdJHiQOj`>kR7a2s3e
zF1A%quys1g_UtLPr{AzOX}ohfmHwDbp0H+Nw#B^Jz6fUfu?gGg9%e%s8Cb0s3!h=D
z%wlf1f-L{oUIJ|nCIx0{ftD0I>ksrd^YS?QOH_jwWdchu`+~#4wTKIn|0XyLT-}r7
zTY_g+VWzDl6+b12xfgg+9p<sjR>O}WEKEhf-bT#ya<2*~*O*y=PlMNh8#iJ9M)134
z%<15B^lJcAZ^L2m3-C$B)Mo)v9B>H%#juTb2dr<i{RY;z+2|ETRd6PD)o4|$Sl?xH
z2kX0Rw6;)weV46<Wcd=JZ?iQ-fWFPv3aoFlbpq?#Y<<Bkv8zT;Vs!$2mu)gw-({N*
z)_2)f5L;!R>f3B72++6L(!u&R+cB`d&2|l}@3K7s>$_|;N2&_xyKFQADz4OnTil6u
zXq8Y40=;@Do(ZPMkm8r%s^FeIxq$j$pJ?Xh;2yo1JArHTVeSQ<3?525wCV(9`*FZ%
z1RNR2EWaxxTgWh+c@DTeI1wB$iv3AohjGl?z<b6s?*~_#z<kCEVa7xju7RI{e+5^Z
z#Qs0Q`tG4-#}(9f4-13!-NQ0qefN-NYIQ;S?xD3F1bzFkDOhbE(uH>dU!Tks=m~b7
z!aNxK9y|f8?>0^co2PPoA~=lMs(uetHgWf993a0YFInGiJOI|W8_$9D?Z&%cY&RNS
zfc4$R&tQGG(PHBAdAE`J-vffa-RK9_w;O}N`gY?FV12tW3e4M$215_<AJ~o@1g<%s
zFK{GyzhpYUfu1$$inRqCun0U2ybA2Hi2W&Gv5a{axc*A!Ltr18BB>z_Cv;xR{4==x
z7Hoptf>3NL2RsBX-_HC39Gc4f5j++we@{bp*?lnm<e=j4T^vt8(4_b-&B?MtmB6nv
z>He2OT?p+CaX@46@FUD^!0k^m_XV%aVjc|6J;gj0>~xxWrtZJMycpano7!JifPR)y
z)o@xia|(Db_y9PJ9uhJ>3p^Qo8~h9SSMZsiIsP5^>=ovGaDWy2vGgOCssj4{Z4t1(
ze_I-?@84Dc>-)EXV156#E?D2cZ3M>tt$}_qRh`eO@85QYpzq)I2CMyBy5gbWB)OkU
zJO=y#90#s*i~aH71K_3LX1CeDp4n=MgOG}VgWybXDS8g8CI`peWzGg4f53bTY<R@{
z0PF{T37!IeubBG(9)vFlaCyuH*ju=X^bP8Q;P0QXzc{!X*c+^GP*(u!8`Ran3xDC`
z>q?f-f9y~j!VsV~s15W5Eby48d_X7gvR|2dgI7Oe9;ox5%)`M&bD5`rQ^9kH)$_mS
z2M$<?fPemC-V6@^$h;Ta{xkDW;Jx6p;L>^QzX4X;y>vkj!TN6ROADU=`gZRJ1nAqn
z2KuQ3IkD*5y#>Jfc5hLzzTN8$*0*~rf#r6u978q1`fhKS1NFZ=fxg|_0s;DV?~h=8
zySFE}Z$1||2>dHJM)%X{<O!`{+U`|67p(90F4F__?cU7@(6@VcgZ1s+!(e^8_cU1F
z?!5wTXU8YJ51wyk{tX<1JzeW-2>K@PC$PTBYoVW%kS(Nd^16XnVAIzdoae+Rs0`Nk
zW$S_U{n_SV?9*Be9U!O;TWYfY;9hiA*@Pp(KZ9ez4PDtk3#{)DCxF!sF&)1Ke6=XY
zZwAjVCYkzwHw1lm@(@_xoy-F3yOUSI`tIa?u)aI_8(80+d=J)lC*{u!$rhEnlj`|z
zp?5DS=)03e!TRo`7g+61(iQoGZ_C|E;u_$M{>-7^wUwD$fZqf#cLaB@Lf0<~=nY{+
zZ4MX&{u?|FJh2Y@XMt0|i@|Q+vwuB!Uof-$F(i3`=fL~Hr$XrZWd)8ySRKX(Tmo;T
zS0d?`Kb<5C7}uEjG5AIk=GS272<E@Qx!`}mflb-(T7b)oYf9HIPf!}d-Db=cz~RlA
zgTS*}Fo%E(wPbDvuGNaUEqE!o7r1h3_74RoS|N;wFg=n3rh#qX1>iDK?6-ltf;WN_
zz^ULDZ8-iAczj#tGhk~}2NteDsL+x50XVD+^9yj@uFQXd^$l!;GdHQef$ap=H?ZBo
z`UbWav(=#QVEZFL-@&d1)_1T&z}UeyGy>}z*pXm;1G@`Y-@xtz);F+6DyI8i-@%@U
z0DT90CRpFWUIgYHY?@fWY6F|b&{nWVEVs~Za7FM>U@wfh(~{-=mo}XbxPbuQ8O%??
zE5L8T^=7i)=)z6@2wV^xG>iSEz)5qM1Hch;nQIfP=l>cA4H0lXo&#Egcg|z(2<}2V
z3G%M#4ZggPc?8&b5%YL(bMSQVL>v2;xZwF8MOzWFz$65e$43C$!1_l5>0tdMfx}?^
zBY_iO{Ud?%VErS3Yhe8&fx9l$|8incp9xR}9wERtnJe@XoDa?gkKV-oui(R5nH>sp
zd53l}7Xt@mF#CZQfh$`f9Dz_9To_;c_yJrW+zPC39(M-oo5y{@`sVR4u)cXb7OZa`
zTc<%V9pV};0&fSe27dvkfTNCZ{9f>s<IF#STYhE!Su(xt)Jf)j$fw{)W}5$9IUr^e
z2RMPpgNuV_fGdFCfvbXj@Lkh};JD3PUVCsFxCi*TV(S0F5cH38CV~U;G0;NEG~3Dl
zm5LmsE5SbHm@~j5z(>LFz^A|ldLfX<-vkFrrt=$~KzM|J7vQ9de8BJE&`Qk4LR_Jq
z;9}r7urD~xpW_3-BLbLffelq?{Vi)03L%gL$vwf3sxl7%Cw<2}8GIR>2rd-J{?*{X
zAm+`wpI(7v`N#F+t6BK@e?<?d&H)eg0%|b7mrS#t{2$iH0zOHmf#gt|*<9FQaCIqU
zcwURSAov})7}$XLvcz_>8R_S1nps?}D7J|Vqw<&L^Vp39FMrM41;u<qv3=m*AV2=1
z!p4D(ADQF9<KbToK9<RE58J_3=L0Mpg>Vym4IBx+4_<YU<Np9>VqYX5JQnfJZhS!@
z*hldK2b|#JD=}LQ<#9kg1Z03)gDa-+0iD6dB<4Y2pY_aRz@}}?Q^AuqF)sqYMf@7Y
zbQ5gB0qF>sjlHUKVDmaIAO{?a3cLkR#$Hz*xLbSf0>>iU0*k=Kz`tPGQ$e!42_g_s
z69JdN;ovWOxdM^kG2ouyoPF#c0ggi-PX~J-J`ubogX2?(t#Z(1LD-LgJ}B@Ec&gw7
z?ttekXMPS|5C3Oy82nD|+ybMPa(oH!E5!S|(?eKR-~$c_MnE7AXa;V!l$*3YxaKP6
z9^kl@%tOGHRx^(S{|5gIumR;Qbf^B871#k`6#}|0<^oc{Zxfi)!S@oGPk|G_KZC1)
zbHJ&6xdq;UgL^UmZG|8pI2Pp!wZH*I!Q0_43tm2e3#bJ?Gnly{xCXc#cn-Kfc)&o8
zw@!et1p%|ce~sgSrQpK7nUldCM=)oAXAWaN2|f$|74YsD_TLB3?yuNtcn;x5T;ZSK
zm}oxWEBFyAP_P)cP-(CixCuBAJYWnTUl05@D%1>|qgt5yzbk~lP=S8n3q!epk>CR3
znXTYP=z>My{d`4+ByjE!j^72&Lzf-_zYirpo!@W?LZvW1-~sq7_zn0>E%tu__xgdk
zKyhx7sbDX#Zv^|RfKSzD4gt?>K=(flenV>rPwR0&FYrek5CeXW&r4^4{oxnjiQvuP
zQi$IV-qnK3&juf>P4~a7z-<VfgE`;@_}cf(dEhNgm|Z;higQ{rdxJw;Gy8*Qp#^Gy
zeQ<nZ@Rf$t{;~o;Lg<Tt{@{<O&_r-zB*)JMyW@%kxb0>3r-1*YCYI%Ag6E-y&w?9S
z5pWwqwNO6b71*yC^B3@tFlLt$+yV!|Wx?Ap1_HoQtvJ3exDi^c8Q40$E(dgjkPIFQ
zzK<rH41Qjn;}?MYHDq1^?)yFSCh%1324;XCflo5ix4}$2m0m``1b5~f@DJcu;P>=Z
z0okOV!R6sMm*f^1P@Mh6z_~@3eI?t|H%hQruc7?Z|3gt=V;m4vk`L$vo`(YagO`?I
z|5$Kk@C?ax0w1jQmq;!^OIj>#lMsIm+Zekf%lpr+8y9#G0nK|dXMx9dXTAy^1^;7k
z82oR+-=n<0!BN}!c!yGSLONeiDzgW%dj3c6V8I^&$F?!o0)GHE0rvy90VkutXvuWN
zX;^%Zg1_bt9ut$nsW@MJDLnsUPw@!_0v4WSUJtfE$(#mmcY*mRxcxcii(v1g%sF7E
z6U?u{%Z@SsU5ff&Hpye0(B6|d`wR!Tf&G7CE(5NB6IKN`!YgS5@Y%y0-x{2S&rv#q
z<No9p8DxbJ`5Oxp!EK%~&jjCuKM@@NEBjNxHNpG9krlZDC&8XCm@k2QgROTUc)#R;
z7vLEP$OZSL!6a`xtu(jLR4-;1a3%Olf{O&OzZ|#)-LCTZn&6{+7g!CU5U!%YmSEpN
zKA;nLCsjli&>P&P0&@(wRaNFG;GFN6=YnU!p8(#e%BTKMf^eZ6AFvbb4L$;{zL5Rr
z!7af#;85@zaA0|k{|K(TnAt=ZD#u)+jHmM(+#qD*iu}NxP+&FiGE^uO9FHb%0rm~z
z6LbYvugE-1vOV3F^uLjnEjR)GSyVCg{EvsQi3Az20(=m>87!|%`uBnz$x1#29=4VF
zCU_>g@F93*752XX5A8_TFAMkxp~8<m2+iKyL^&v+IQUoeX&JB;gRBbpfFGZrKDanI
z5<J+K{n6kyWvwiXf{=p)=7NWnW&cXBCwMcsCO8v(27Ctm0p;BS`=JGY16Q%4MxP*5
zq?V8^V)o${7=uCN2Hpeq1{*PE0>BT*%H!*R4_0Ch2UqoHjsja(K<EzPR0;<S0+*sG
zQx-TDd;wiD6I=u*uz`nT&}{%eUBJih0#88v0cNY=!y*nig8;9}%y+;~P~bCg3P$~(
z;Ih>?-mVO{*nS*e6dYTX{bj)Ysioxw)l_VyAB&=bq!5k(Z?clxgCF96zTmcqj{z@+
ze>!*pStA|41T5nuCxIitX_Dpr*BXQ3AOc2$&w^j!3U7g%VF~se9EKM80Dgk0)<A<%
zPBhbSTRKaod0+(gLrV~==fD29h81u?&uFf3EwKK#hE2ihPXXzQI)d@HhK4?1{Z9!;
zf%QK<oC4PWmT^&8JpZ$L@_oM!0s0>^?gVG;W&dHY$3M&`!1{N-v%&gzzVCqb?{q%}
z>)!!?C+~mRMEaM&^AMna>D!)ePwCgc^j!puFMS()!TNW;gTVTCz8iq`?|ioc>)-kA
zDDQuH0{u(h{lWT|zDI)fFMUr0^OwF2h8bY>op0)r1z`Pa^6S9rYx3mZ4c5O_-}wZD
ti=VkkuYn(aVtx#M@;CE)u=-9p6{yh+CoiB(KKot4J|3CDJuFkI{eO_*IVb=C

delta 31643
zcmaK#cU)9Q_y2dV%PxzcprELLs3?l4pje{V8yZwpOw?FVu>>)eV8gQ5h1eGzYs6k+
zUla>=G-`+?Vj~z!G?u8ruBhR6X3pp4Nxpyl@_Ie@%=>fB%$++kckbP@i_iX(e*4$?
zElBoow;2BOpZmh)^-3hT7z{>|9S`KuU`TLTc(+N3sxOR69ib`FQ1ykqwy{acrLS1N
zZDN_&KF!#;<hb{&*ImvIV*6!d6HmKYwUUPLUBo7PKXW;IGp&N>Qg&vdS5d^8axNfe
z8awlc^NHMvizw%9t&F@7)meXb8r<zJMVfj-b%$sm<|^ZgRM4w5(`&ffw-#yok6MJ~
zD0h1SHB^$DKZceT5Oc4Gm@0S%uyZK}CW@Gy`9_mH)8uQcV$U^oG6vd%oZ5PXpra#j
z+7SCxr$B2Ri=i5walzO2Jx)zcgDi&Y3p<yzJDaPz|MGI*H@Cm39kg*|vU95?-Ok&a
znbVxq4AHe$ayVPs8B0)`X;ix!H7ProR>$HP>FlA|{k<Ez*YU9!B9kkJDO0zukA0YT
zSfEAG%ZO=>3=wm`avuFA;yb5CP8LJNcZP@=#}c%{izJkAvY+v8tz9<Ri<hZV^4nL1
zh0VJbRvv1y7>buS6jqKiIU>rGFlsYrIeL|SZPbL(-qj~m3!7!1>2tH}41>jxVE(ST
z^R(N=d!OtUk$gX5!0Cv&ZZ|#blYB3^E-y4268s|?IV~(S+k5(@xjy@wgb)($|7~`d
z%STk!5^p%7THUf}2f8^zBTtyLpR*mWqY{nU+4GK(UB(){Zl5<Bj>tKG_~Kz<VZ>5u
z{8I<}r>-5f4ILaEqQ5t3_p|IzyD!x~&vMM_aiEAc?wGwrZ=3e^sN-bsSPyMYT}QRy
zJFMFC>W&-Zju^GjAp5e!cfns)p=M{xX5B9p7KXk#DA!^{@~wy4f1tC>tvh&=*%11R
z6lPwsuj%IH_+`S@B3gV!`(&G+_Hmv4ds}TSf0g|gTNUl%Dtm$LSMB9W`?aK>wQ&_3
z8>fu7XrId1D<s#}&X#d>OU~D{Yo+XM=LBnYmpVqzx$3OBEq8QSY%ZcX2}if3$2Dz_
z)#1G&qOLYB+TQ<SdoBEueaA&VEi>AF{$dON2Hj*|B9jXr22Lz2Jo0y8VWETx?Wm{U
zIW3R0hg|a0rbgPM$aINxjJTBRqJ7ic5t;W_5p7aqN8{TAG%cc)eeK;q?aO2KpY9IS
zuD7(;{ykVbf7B84d#X_jYhizLZ-{oTxg+L&Ws_##@0j(-P1C0Db1eC@w^6IS*YWyE
zN27Md;b{FV-lz@UV|V;DS$nYCQ77MS(YkDPoOxSA(<*JYzkR<*>$}PRU4ftH&Klh5
zw-4uaDJ*<Z+}&x?X8YlSidw(T_B#a!v}c>_D?g-Y6*t+demvwdY@?GJ{Wg{D7eDwq
zTt0m?YF*YloC}Sb_T^ep)M%`y1+En>jmAS-`!)MCV=tqtXPT2?R2gd7twdVJkRrx`
z#?n81?_@Y~{XjmA&)bKGD)V{GtHeE%v4U2zvamQA{WM4Aj6f%2eWTVsAS1?Xj4Gl%
zTqgEdjl;C@%Q7ms7$aS@ujXeA_cUf|+N3$6MrmVfZDWc^C~d5zjZVqfP}+FIq%EJ3
zk>q2f<+gK*Xyi|=)^ZhJ`5SAMnmpafFsOJhr<ux5Joch^AE%zvGt&Hx?TlLAqT<gA
z#<E(=q8XNo#%D#mr&(wj$_+)Xw7aFSz<q5QZdHu)ifG#>WUQ)YJW)z}ADhvynX#Tp
z+Z~g!td;SBr#2)jV`>j$WsBy~Qf!GaR;^gNIbFM2(HH;xTdrsiG&}6{-*QF&n4R%5
z#(2=872TV$bBOV@uhvqCu``YSS_L7N&7_OVUn0&)KEFgfk#^e>QEV33lqDjBtpAWD
zPKN!4B!i(+tCa9Z^Qige(+X{4C$t-IA>uo)_DjTwSu_wqOT>a%#sn>WafW%ev0hOv
zx`b$AH}2Hdd5FL4#=+X6WYIm<*i);QEYeerA=)-~@kgq$llGOnsK3}4sMU5CF^i4q
zuD#r7eyhus)6-4(FQFCCuy{uICC2a~+N+`&OO_cc8a0De99(YfpzU@R))mG`ZGf}*
za)q(3>pctAtuO1|wTPuFj7PMF){O2ejo!xKaVCA7LU*ch((nB9$!eS;7CPb1-?URS
zuWT$Sj>Q>^+M_y|Gmfn??swIi%@B!OjhD1x?=m|4VBAqeJM|{x(+*>fxlZXPCVdK)
zsHV;O6pU0&TPEsFw-HJ>{a1!*pK-R67Wp`1-XUWbt@M!lCPVAcr-O>ubeciMQ9g2g
zPrIM-_OS82QG3@r<Hb?q-HKYyv5c#CjXtGn&8$yNo_$!lt7`9*H?5?(Ub~ejmsDXF
zrSD?>Km$?Xy|JdYY+FX>_r@_sZB6}*vjxURMT%~#NB7^9ECVe?;lI*YDx>n>#ui5J
zH5*K7F>g_J;;}!S<&b{6QP>KNU)n1AIne_mKpm^7V^wtwQpaF*tf`J6>R4MH!_={!
zIyO+phUyruj!o3DnL4&m$5!gtMja#6vAsG*+LY<2j#28^MIEEn@e6hAu8uv`v6njb
zQO6i{?5~co>NrRphp1zmI>xKxH|jV-9Y?8Sf;x`zv&pL;ryx-sC#vIQbxcynsp>dg
z9cQZJY;{ag$2saaPaPMi<05rTRmUajxKtgNtK&*_{9YZ?{G4ob`n3wytK&v>+^mjU
z)p46T?oh{F>bOT8)75dGI__7;gX);6jz`q-m^z+N$1HU`t&V5Z@f;m(g_W}voL9$-
z>iCN~=BVRUb-b>Qx$2mwj=!nnZFRh>j`!5@4|RN~j*r#xi8?-0$9y_Q&h_10&dKoT
zcB#J$BW-hiw@b*8FxPjBET7@>23gMJ@)}v*#pPA9yq?R;WO*r<7t8V-E-#ejBreaD
z<peG#%kmH|&yZzXFNP@+qPRRsmRoUoyev21@@QEO=JH5c_UH02S@z`e*Rt%)<$<#N
z(Mw%eKUvP_a&K9_$7Nd&3D+6A$#OQAJInGhE_aaSbS}4(<;`4fEz2vp++3CyaJjK8
zPv`PyvOJE<b!9o8%b~Iy<7HF5t05tp9o1wxg3Fa<Ih@NCWI2S(zOo#^<ubDD&1ElH
zcIC2%EE~A&Cd;op)w!&){E*8|<>>x<l*eF{j*DFWRF=wFTz)Uh`?>sDmbY>Fxh$t~
z`Kc_Ya`}-g&*t)dS)Rz{JF+~2%Qs~?mdZ9-bT=e)XU7#;j^y$sS#HMVpJh3W%V%Y|
zDwj{mvJaP!%d$I{56iNN%LionT}gHIdu90vmv>Xy_UJalcIn9B@)lV>!{rUKoXO=i
zvb>ASt7LgSmzT-%QZ6r+<vCnlD9cG)o-4}<TuzqdA*yWiogtwYJEq8T6qhH-aw{&6
zm*oar9xcnkTplUQ{#+g=%br~RT9%!;JW!TDmf*WUmh-vXTbA$H7<x#!&gE{hoXzFV
zvV4ro9b`G3%k5-&GnZS-@=7i@m*oXqZY;~wx%`<dkK=M(S&rv&s5)5;Lk;PO=5jSz
zj^J`-Sq|rN1z8T^vac)$aJh^udvn>VY=*BzyI7%4NPYjR1BMS9_H}}PgE|8T#SZ9K
zXF#0+_38}hA6CDAY-s<m*na&Q^siqhwr=fu4Z=bPeHIdu;a^Sbel)}JP-|Z{qjr%Z
zyMM@-v9rkRWukLCQxCDPohc>b%l4)(hKL<|Ov8nr!?Z#ibeQ@J-*nSO@hsicP9A(j
z^j_0I;eFCnT%6r&YRM&^eWnVc$v#tOvG0VbnD~C5DNVGV>r~cMCpa)%T+A|gi=$bl
zuA=#VQ%O;fVJeg1x!?4Ks~COHDP&gUDN{z|DW}-qGIp#p3zM;dXtUYeqL}<}8%6I&
z@_$udWGvZiUT(1tihEC=ZVC&ZiaQ6*<wf^hW`C{fP?5OHT*aC)lzpzEv%_4@EuQLq
z1F0=bsU*#X<Sja9o6BpHhKlwMb49m%<Vt`mh@)NUQ{@;)J}iZi6bEUvlFYqG$`~pR
z?xr4Wisv3oVAnQs6~qgdJ>-fQ#;)q(QU-PIBe_0@RFT^bpm(d*ki1xmB&jx}@*?!9
z*~gWXYX1cFbFnC;6qGt#EsKIGK;n*@pOEA=OkQdgt1DbR#FSmsm2gTKG=|h!96v)+
zEJ=ePEf$yFm@B!ZlJpa#YGPqNb%m5{D4m4IX{vXRq?eF7vgAq$o-*I?wO0|x-%(T;
zNu?ou$F0YbGzwC4zKA&_eFv!;OX(!dffTGHa~??xz7Y-g(sfONYXlE*uoHcFhg60e
zh$iU<Bwv>3joWY@5+->)Nhcs-jIv33326{F|B9s0;e26D)JU3r=yksBaFMu=I`b1;
zYk834$<=xUM_1xAr;^kR67?^eMwFCLC_y|{_eiP(X)KSGCw(4CgjAR3E1aZRkOrzK
zb393zBgDB3>dOtd8gtJx$@L6UClPv*PE3l|NbX%vvG5p8lB+Xa3#5^J4PhiLfHatA
zw--qlAcb>ZrjztLq;I*$=_FMd#m$FuR31sSAkl!*3NyP}4TgcEMBD-D@npCXxW~Q7
zbq>-{9_#5OxqU0{%ILUnx$Wh`V-H;ndGerq#ohc!Qt<>1iWgI^)Aa?=?$Q;KSxmV=
zOFWvSYNI*IOAWtyB1xr2i`;|en%3oT)#T~RAy*nC7mof&($vwS=OK#TJcb*xV&xSz
z7?L41;Z98?X*Hz&e4Xn_x&f)0cwWNd<3vjC7!j07mw5pm4{p-O)nM2UsV&!UMbb4$
zU-3M~ljJg%`?6kDG^dg@bF6qF+YB7XuD)VTmb`(<)d$iFvG6EuvA4@vyhYMsv$K{y
zPNW<**VG0~5C;!a-LQ#V_mG&fkLs?Z9rQ;?k9m@^NJ^e0Y9EokpTyC#cun}w23Tb>
zORHqGkKAroK$#+TUuNYMlwe*rsTA}M(pa(klR41Z)W)qe;)W>?ZD<c^k-93g4{d(w
zHj#3a<|QzRqkC}lL~=zzYQxKXJxSj{isEt2CTR(z&pGN9Ne3X+7P~K)eVl0XJDDU>
zj?u+Ff~OfKfn3HZeBy6Jg-3L0q_l<7l*c}kq&|?2a_^py^b}G9KCdTjc%LA(RFXNI
zB=4zW>v1}9lc{``V!VCGO*8-!-5t4fn`M$TZz>*oRGtAh_E=Zi)@DM=<Wq)`RCXG-
zJCsMX7fHRRiE}4tL_KD(i?&^P8)uTM^9&Jml6rFvuAV%Ma)0{>sS2;fFp>&paBq51
z`8nJzdElm#<T{hvpCF#+(7=(>7s>#>-$_{x<x5$uw1~-~<$89eh|8kodV7{wo<*IR
zKAYPK5O;p1yL~$CdE=5<TEWAePSWp?mhhzHk<>YbqlCO6Dyf}uKO}kw_?@J1+RZ*e
z`b$aXc#<-|6S=2k59SD$)8;DL(>bCxS?m2d+#t=L+=@JDcQVXnDNyt~L^oYjdAjLF
z>@quBZ^3aAcQ}P6&g0ldypP1o+K`f1lJ7b9A*J&qW|A~=K39zAw(gO%7ZN`+&GH`n
zG+!M2k$PEa0Y}GiFXemAKu9C_vgeSr0n#)1LRQHwlcWI)xnftGhNO9rrm6m!<y*|7
zh2r2DI!(1j99>3){zeVNlB<NBr3k*7IV9DF)Rm=ll6ph>oTWUHk|1^G&B8>t&n`%7
zd5XhGs+h{nFHsj}?nTmqRN-=t*7Cw7?5c^=kV`<CsjkMHN7CyhBIO))Jr}M{;`q<B
zGU;YA3a&qp2R)jkCXmMPXiOw&4y0o&ttTm9DOYSJrP5-=PiB9su9XzeUz+P{?}Yf}
zCpzmAgyL;QPJard=IX97%jsXYRJ6~g==pGc&JT@y<k|zN97}S}H$cLxvwY1>hJ=@4
zIsM;3^5)HDI!O~CO;MYRTRKVEkfQkx&m-yGQnBb~x&g{869>s!k1gY?jo}R}nx^jE
zGU0NbT!!WB^26AW>kXo-@@eH}tV=Y`^2z;?r0OfUVklo#0L@bvq`5qskt8jH)Ro_+
zNy&neE}oaAw`o%1S8_|;cztBaYAZ$J1$ig05?e1&FI~Uq5P!bLRy2dA@6|45j;Hd6
zRiffWc`>Uw?3#R11;_=m3Cck+^0hg@iImVZu~o)SPt(sTKfRb+(Kfm}O%zDioHguH
zH>r=@p>v?rQE%c-q}*B~MqQ#Iez;aFzeFp!@dmClKn|MPHM607E-w8+V<z87!#8oz
zGoHPbBo#oyCy*?X9zfc}i~0#kgEn);BzgTl^o^n6$Yycw7wYT7E&6%Yhp<?3Roco@
zbDq*UB!xrjC7Y!E`m)*I*j;qKLR(+jW!k&PZWUWEQ)|l+H-jIVw0Pe_36NJ_-ue=_
zhTlQ%QAGR?d^+62a(Rz}R8buNkt*pC0lY5c(jJ0{2C9!{xwOaqARgyXAL?%7_9p-9
z*8CP)TeZ=cThVD3Z4-%CXyEs3=kP{yJ(ahf0{V*MyXi{vuh7*U+#w1mR6D<01YM;n
zYxi&!{Zk*Ed_JTG!uTgmA1SM$6c_4KA1O1TtmC^VizK5%6kMe~mv(TApL72_>HcWw
z5bdwg$#T-!^_S>pw$N~2qlb)rFKb`$JwVaNp;Y3XRDPn&gtSO4UAcw(?c-`K)%%aU
zbv}bKjfb%nNvZop;&tlMKDcPf$VcW%a+x!@evFj<{m}D=qZ8lDa(LT9a$`yEdcpsd
z<gVudiS{x2aBqE$b2j2ia&<T(VmyzG9N8Wj!X=kR#$!Ksrww0oEV*hyY9*TArcTSZ
z&sUio)q+P|KJbbi76)_b5{n*T7r*_w$szg~l0WZVVI-Y`)SjQ1@_}~?(ieP($p@a>
zQEm&F3;DnchBS=3E+2R!kBZ!1X(go`6E1mV!%vEadFCe89w)i>I=<!P0Omj%EO#{@
zH&RMwagfa*dcHOLiTpfT3jHB|5XM`yK~Y$ZQzGalU0u>C4qwl2Mm}`mA0eHWYpIes
zlBDIQ#nziNm~YQ=^me|a_2im(jwO2Cr`H~HHc8LViJ;#ox?(oF^7)Popi6rUsXJdn
zBuQm|<|xm9QXyt2Q`PXAS5i>S&!Rv!nFCioze&C#*UIx;e>3;VhnAqv1(xbzphyaW
zG?+hwkn%N@?y_EK;r@}5PE#%jm)kVrF_%O`X-{7fakpvY+g;@b1Nlni$VWhmmJOD-
zCIEZ?v!`u=GK>2qU)7wiaWg}>UvlL}KpLkOvwBrqdQB|9LsR2<ouiwnH-&$Yb0luE
z58jB#<5S4n$NdIZD$6&AynQrC^vsvHj|s@{>MBL1M>OGn$=k;pQM9?L><&pKAf4iO
zVR<KQx*^*CPU94kD-y|CAO6Z`ufwb935C3bv|KDKVX5TiLo=3io1?19XI6l`p~l}~
zg+4LMwA4hBet`5le>Pc9(xAH{<sO}7!S5Vhir2?Sa!vkS+?B2dkJv?@O=LszEwDeN
zFuu6yBz*&^j#{&B+ejJ@X%4@z$``^bkGZXIHR)!PiC%9Xi^unAY6Aac*FAYZl@}wf
znN4oJ$p0(!IsbR>EB}NmsE_pOO%NZyt2`0s{-BNrJmr8`i~!ABcSu8c(XA(GDWtkG
zjZj|O@Khu|pzf@I{4?K!@+NNfOq_c_?Rxyh?fUUNCXmYlX#<~1-NcZptMr7Ne;bs+
zJZi5<Is$1Z_eaj8F`rwnj{AZn6Qm~UMsp&iY`!S_h<X_b&lFx9ndI64X(4yz2}$PX
z++;m*r+}W3G!LFo8uCfPNvZ;=A2$_CQeQ}vDJx0qOlCGyUvN|O0_{waeC6BlhNY(B
z&M8_SuSiON%TfT3kPl7cE=atJ%#kGJz7++J=^~21<LKJ_I9W-qGLZh}_pNM_9zd!o
zLT~bCh^J8ScH%>O@T31q^4;)0B%y9-b394W??u@sbmFz|`NVa2LgWYX?T~u&z}zG0
zE~GD6lAo(R3b<ki(LB#wOC^1*{?JBp_8^j$R=)xf^pwss4=z_84w?TM1?eC^+p<ae
z2x%toJg-RV`GL<fkxwlb;SNac`PtTrq>CR!#b>lGr&|hC%HM}yPRKJ3%6M)vi=?xV
zXgK7vMQ$ezoW(h+uVp%m0@+TG3%7&EdjhSX2uL2n;~dSg+&QYba#RctUN%YnAPwOu
zB4rJfK60SyTaN;7aGh3ki6h*&nLd1pa=+aODOgGBL)un1k&;g%xdSdejZV>5AhpHx
z&;`BvrvddA`wm!(BYIIp;Xe^79Y39<BE`5P_>{bjq$-fOX>$%qt&53s&#5DS!NuEq
zUHKe#D$Wha=~e$VFXlc|Yw2cgJ&kS&dMly-$%pfJ#Nqu;Zp>%@kyK`A8%br{#o8Bi
zqRH;uYAwDw<i<P$67BURE#;RSqKz_`&M?`7BP;M@Dw?FRka#5BCXzG(61JE1|Ed_w
zI~6HY5j03mq|DbH^AL|;(o=goJbihamos<|5<b}`kW{w>cclb>Ah(ndCtjJW6nR&&
zKore2S96mwyF59jFRukTY7ZdU`T3MZQWq}~_nOvZju%Hy;bD+>LXA=^_2Wk#DSb+_
z6308yM3R<z|K}`nW4-{X4;C&-PGvZ1B=3E4XYL5;B0rhs1V1Yy&b^_&R4>cX2^=ja
za4e*KD%wp>;3r7uMd($^_T`bZ!bePfOM^Juj~l4XGbIOc5u|_RDrQ2e&yQsJ+OgPA
zG<-)rI8vS)P@gL4CMH*uzgYB+KGxFLiwDVC3;j7H?SG3To-S*6MV8}uD@rA4H>8#P
zO<~M?`rzeTNyNRU6IBc3@afpd=^|!8Y9x-Avoy7itI83${p7U>NG*AblDG7ps-kBB
zwfCkvNAo9Ki=5%&!3E-{408o_6~VzGw}5&-JecFc_?8W)Zcc#IK<qxwAN0P1(vBx`
z4h3z4#EZ(DPSWqeV(kZN@?#Bda){XSf^vx_x|HymERB}iBfqX%6QFe!3lGZ=**DEj
zuJW2fIP?RjHYbwQKa{0ToF>>t(v;dPjpRj>N79Mf;>Ab0^7Y~D`i@6A<Tl;4^0Eq{
z+xST*=cr?24&_X~S<X?H#$xLy+M?$*7P-=DO@+(fbWu;6aOg)))ynPVKr@zF^UI-J
zoiVLhvIwJzK7Gj5S--6)C}Ih6lgskgcI@uMIU@OfVEvq>`h1(qWS}>shw}Z5(vKuv
zgcObmAn6{YC|<PbB)LY4bA{%bF7-Nan{ft_d&OMVO=fw<L2D^GU!z9DN&2oMhb8}O
zpyElt2fYfvP0F`~+)f<I-^rM>Nb-nUoNK9R9UH}-ulatIAJ29{`b)kS1<2gc@Xj3M
z&bQ%2lD>pAj~|%pNjeOv5wDwUk}g83%g<2x1?SDq!bP*x)SSAChMJ{{HNGpieNlM0
zTIzF#`c*V*$X>Lf)o}+>ZC>Z{R{0ZBQyzMG+iTspVh_%aky0H>kj#S9I~Pfnzu=%n
z+?3oQ=R;~GuSVrYet^P{cC%b<XTOjup{6#hhX^v!(ADqB?QmwoP40z@AbIh2KDehi
zVWMjs4!It04zlx;AUSit<fh*3D{hazfu&D9<}8x>e<k9a=v0>8><Z+x2%X6cr3=3=
zcmp%>vO{&Z^%g;9>hAnLq9<9Gl6|?2P3p6GMYkM^Nr6_JPx+Cgk$uHo3b(Ew$l+gc
zi_vt>YXe0?3%RPsv5Q|5oVJmxmgwh9sh?dIi?ekW+<1xmNbaGK+VS%ynC_+NL%ESe
zm1Z+XlXP{cXy{DCl@QOaBys#O?KVly7H2IxUZgnFxtES)-&Owbb6fhFj1rHfFM2fl
z+<BPg=dW-`Z+Mf+KVf!~4{2?j7-gjzu@kw*x4c<BA=f5IV|bg9Z{$xQAx$AuvtLi*
zitG3tHi4wPNusBVrHbY>SxhBsjZET@&HV6tLLu3Zy7B4cd-!um^~I$Vv_2zAdN+kD
z=KkyRoxJ7NPGx;N_iP(U<)^W9P8cuIR-Hpq{0x?`)qEr=2~ty@<p6pw7c+#5tEG)K
zW+q23!nQ~Y;KVGJ=wqbZY2*g(m&}rb`z<%{HIQQYwOnrCS0J_LEx|<d{uWYY-Z+9u
zdi76~N=nOhRVqcKxY3ApPT|w67ySz8%yM0wn#+<8E|8?`c`Oa$Hs$hina>h^Ya;Un
z@)aZ$(ktE^lZw&8d=2drf8d)=VJ8-F&Bl1`267Tsr8$0sWZ?-)pbOo*kSpEb4SqdI
zeHO8l%#)o>(osk>H}Yfp#p2Y19(E3Uq_idOw0KL<_eAcr6!$J>R|>wSq{}M3grz+7
z)G*8O_=&zJa;KpxE!fqLvxxE;9kPO@t^ACZ32^IjXg#G?-kJ@p%W0z<z+OC(aszc+
z$&K9R>&PVO6eRr~A<20aM;+$LmpkZUND*ovwX{{De+jx7W~~+}CFttTtm7I3)R39w
zkbPJu9#e?bxSn0nylmtST?0~$|LxGVA<YycZ`0?aalqmK+wYF97w1aSe&@SU6p(eP
zzlqyTMlLeSlSX^NCehQA`ZH@YJKOOY<igwxX_Fcavs{>gTSTrWT}9Lup?O)FXn9*i
z3oq(&=vJ<c9Zv4&UXaf7Z2Oxm>TSpYxhUUEa$U^(fh*ImM${KgB%RyF(nB8g06KNb
zc9!Pw%`SWM2c!=iHIbwPJ2=W)c*tb3Tp@<tEJdkT7`Hr<{2eTvQ0rcORl3I^YL})f
zyXg=;$yz@kq$Y0>^42h>i>;+;JTLAQxut2Vh`P+J{OkR7IHU@E-^p<p0%-;JdnHMi
zA^rP{NL?Goi&f@7t%e*v0sW@rKe>CYSB{7)V`*ZYh5&q6>XsuOOL-_{`eD^RdU(?r
zS3(Juk6~mT=0d|4Q;`(*`4v84OZANiZIp&VSH$D8G%Uleh!W*!SneSr>0kF$z*Vj_
z>VI9=zPKt5mZLEmaE*PqEOieqfM(^dYy7<|{?;DXIWAhRW^`~Ev_HfhU;4OEG?zo7
z#K<z1`dYbM(cYIXwiRT2huVrd-3C%$xkr{37kz1+Px@8l`cj83=W#3l8GC5Y{&(*z
zyD3uqXs##U5(oY0`j6ihcm1e(t2<m>eLJtdPVu+KLt7>v2I|K={?^0L68LtKy|Uir
zs!inWRC<ZOrMYbh|CuE@)qncmcdmcMf6uuKHWT&?Y!2*g*e9^>U`?;NzB{ZBY*pAW
zSX(m`B4NA3#=?$(od`P{HWfAvb{p(|*euwKuz9c#VPCPe(U;8JQ&(7T*Z|lN*l^eg
z*l5@o*m&4+u+w1|z^;Vd44bYjo&OkwY}o6t_h9p3Kf*e{<qmto`ojjpHh^sf8wJ}7
zb_i^Ov~+$$5`;OhOJUc;?t;yPJp-EqdmHu%>^oS~JFEd%AK0p}VPtJ`cWeeB61F>R
zEbIu_iLkR_Q(@C!x54g*&4RrMn+N+4_SHMOe%S#7{(9LJ)*CheHUu^tHUc&pHU>5x
z_5r;B$#yrirY)Q7GjQcL%=BlCO25&D#$N`QKC#fcEwc-FFSs<gNCf-+z{9|S;G5uJ
z@K5bHJ{0WNp1D4FjSWI$2-eRzpe48>?a8u(k>EIRSMXtQPq1?bj_(JK0S^Id9oat`
z++it%$q>%c@4)2=XMx{>?_$WvrT#nrfB1%E`S;<C(@=8*>~`3_u!mre!JdW9k(T}r
zd)S0Ro{aL}a95^+<G{(_zrYK@CFxJqW!IO2)4{92cfsqxrV$iRC$t68-`mNGE~B`C
z5OB|LnHz%_gIk05(2K9E*8%)yEb|xOf^p3HYAQaSxj+1EnQew)5WYabc<_7hG;sYy
zZeRg;DtIY4l0KNo4(YSqXcF@p_!ok=fzuT)>{7z^0Ll~Cm$3Szy#rI@>OXz@H_>BF
zo>8N>ZE1ge!|VdSK8)EDoH3v0wk-JB3FYr-@P&|6f&=`)-d&jk!55xzgTcCgGyC;9
zJUNp5a#DNh4Q6qL81O^zAh7Q#_Q!)8fu9#}!_KGKKMH>9kIdu2fndW2j?ZSUX|N&S
zEdumAsLvU0U>5vmCIA0=+&70S=slbZ*3Y;b+?T%FlXGwd91D)W^4}BQ1;@khbyfL`
z87kuvMoB^TXe1&=!;XhlOMniOz)7(Bs?q%?pr3}#hP?or1FN^&2p#y9b(`T90v^IX
zh0TZEr|~5>N5z-W-@+Ea>dV?_=J<MuH-lYaOTc<5OAiNc2tKgoVJpH`hOG))9kwQH
zC~SS$X0YvHJHzTDw#mXB-29ZLK)xLO0Kxo>Cvg|JHh3?%519V2TrGm_;G^JI;FDng
zzqq|0!JWWAfhQ2#)M|oo8Ueq8-SWAC+u&Q^dtl$^?0*P80Dc1g>;?PtbwBtuxXnug
zzZ|@WF#07|FgWuSe*cQODEKJ2IQUPnC%DvWj@R$Q7T{#GuTR08f4Ce=3;z+AS4x(T
zI@@C_cO|fZn|=X42>t|)|A142^;7l!Kl}r_s{2hY+`i7g{5Re{)a>DV0Wlpk-hZxw
z%TcaL-o(E9|K^jIE@V3z_77?0UB06zpTz-t0QMN{S=dXkxv;;(K7oA=`!}q`mD_cP
z^@1$}>&w<=r~sidY&F;#u(e_9!#09#2HOfY0yYx1Gi*259<VX6vC7h2HW)%2>@e6-
zu!*oX*x9i2VFm1J*v+uJVGqC_hdl$U=9SKGxCG%xJpHbLT}tw#=7A@dV!jVP3Qoks
z%2b;DPvIX9eg$3!{s8`!I8;5q-FU77ytx5a@HDUoII9f#Y0>m<3qk*fhlYWN!;aDY
zuv=kw!0v<1ggphTPApetiaYlx2QL8g!4IOjhpFHd-I$kwTYSO%J-9P?130}G`?rCM
z_h#M=UeHgm&9E23Q5Fn`z)kyeh2!AQ!Kc8f1K9sFxL7RnC2$V-8aR3&`}4q2agvME
zZ%-gp8A<{T$*}HTJS2bPC8{sjX(6|uxA+@aZ(#5u_Uo%4)XqEz^~<I*4+TG6Of2W1
zX$d}|{&?+BirE?ea#T04{<5aO<>`-YJ+2qx2Egk3*<-x=kHCrbH<DlJUlgHLX)}C{
z3d3O&U=v}JU}wV4fn5Z<6m~W2M%Znz4%h>*M`2GX+ldnE5H7)9g}nuv?877S@-Yu#
zD!3&5V_x#UN&^p&EQfR(1V<hR>;e1VWIhD0{u^@^xH0$~I1BNYz;^hrgNxqg`gg&T
zm~Dn<5IQ2@9oXj%SI~U<ihOS|yMb52UlLsGF8lq!1Ab=?0AB=G1K+qO{j{uWL8x(`
z1R4^zFbH}1EOag`&0Uy<cYCupv)5KWg}xQ^{efA(bsvJ=5np^e`(KJX!4`iTPV-kV
zx3;<o+m6lIw)vcGt0=aKU$TAEi|x1l*k*jqcH}o~V+Gq=%h}Gn&bCpxB6Pv@2Q(Gg
z+A1~X!ok68$HcS!GKsBM3fuOuf6Qm@y_D@4`d7zfgB!N7P20nE%U-tI53!BSWZVBZ
z+x(Nt+6;&22Y0f8cIVjM%wgLjkL^zSt2gQI8NzmaW45omu=N_iHk-cMlX07jPRe@F
z_u-OVJ=mH`v0X#|9E8*dSCE#Tb2S4Y)M6Vum+j5vY$vT|8@z|DvxDvFy==2?vejO&
zwY_Fr$H-Y!&*E$!1hIWxpKaexWc{fpAzfH_3tN2*bMs`f^sl&8vTCn9Gqh%wjYVur
z=+8`L@5vbas4$)WMpSYo@KSI+;$q|oa%F#W@T_9Ylz~zH8|9h1fq$WfWPAO=7lN3_
zFxw1UYqBr}0oDf0Dd6#70WSL)`&WUNgV%vuHD<p9?AwxAexW5VpfUKQVw$cb2<H(H
zhjiKvu%2G~1FWal=xvv;80j`=YFwT`&#rlb_3T<8SkJD7NtPcz^z>Rw1nB9tj$l2#
z)&s1k*XW6=PPnE$U*Slwo?V*+*0XDKz<PFV8L>_FSWmBQM1Y=NbAa{q+A*-6Ub_s|
zvum_hst)PdwdY_xyY@GDQfKZky~Ep7g%k+%fK+?~8~|?8g)7tqr-Ny4Q}GF1*-v|$
zVh620I+vjbc=i{}{lS;O!!(+IIRaz4bHKL<@aV-n5ggr*c{X?pI2D`}%l<TQudkW6
zg3Uvj_k(AGf3!ik9?t=n!Htknx(}WRehJpIhFTH606lBy0oJpI6~KDdFa)e;4Q;er
zs}a!ChV8&AZ77EjTvetKiDSSq;Gy8wBiTP5tY;bJ?>glO>siKBFtUs`^^Z)-06on}
z+rFxxrx_1`^)%yIu%2ej1tZO9cm&q7jPJpEmeFG3_BqQ)^Y0BoPcsI9^)zEBSWh#y
z0PAVS&R|Y6(l5fnZ6@=G#DSB+Bf+k8Zk7C)2H^`S&=AiDUj?rK_nX50jo?3LGw%d1
zn!|huoG_pHgw6|?FMt!5Bl&R+Lfi@txCMSfd#voxBXH{X%x}Txz$Pai($LlHFAiS1
zhS?Wfb0c##aI>vQel&uxc{>NR0iWE-+y%U4FY`d~vwh4%!4dnJ$AJ5QXXyST>|X$0
zdjvzg6v9;m&_58V7NPYhS2zG(e2h5@d<lFVT>m)x?}N*oV9p2oWHA?llWY*IW<H^w
zw=D_Q^R~WVJ#QNb*7LR@U_Ec!2(0IATZ56eHFN~q^t^3P2zuUj09fU1X$=emzd~Mj
zG`RLzzM>@X9B>NQ?Hv0TgTDu_VYV4^AZ$ZGv7fj?26#AF{_&{XCvvjcpAGi9#C#1L
z{R{JN;Bnx`;9tPc71R9Jy37?mBA_2w{voP7p`Ms72G$ePrNMe)x;$7<Ob3GX#B^=2
zo|tYVSw8=fnKm>>fJ#i$kaqx|y~-Wz0e*aqc>p+{-c+a=!`C|BXC4mz`VZ#GV8a9E
z*~IGkpAKO;0=hinfKA|~;Jx52f3p85*!qO|40t5?3RtCise`w`dY1RG1<!vy&HEYw
zdYaeZ%u}GJd5ePeG_My}PxF=s>uKI<V43EXBUBfxXL*}D)BMXh*3-O^2+-5KUx4*A
zZwy#Z^TvVoG;f0Lf59iTfho<aI*<(3v%E|606opS2?2VVcQ;s1^JaqeH1BD!p60y-
z-i);GP4Mk^+<`y9XWpZOPax>Y-FIL;xoffV2<ge)65yvu_Lc{C{KO4b2kZIR#$Y`!
z`#Bi-S)1WY2r5xab2J#dj|NQ6*+_6DBlASC0M7*Lnc;<Cl^Le`tH4$0{am)U34B{I
z&Hru)dRFofSkFpkf%UB9C9s~Ayb0E`l7E2ptmJdBo|TmUevn*qGApT`{}vY(^sJ;8
zSkFrOfmK$Lt{?~;T$ZmW4E)xIxf%F{FLNZgd3omU;6220|1k`JkX4NX;=rAPn8$+8
zgJ*&Zzze{Gs&o7ruuBc*ZD3#UesGyu_+`X#2>Erm!g=u9y3F#QN0LMKQv>EZ;A#z-
zpMYb)Z@>|a*#9?pI=FaIZZD@1UB4UwUkEiDGY5i~f<wW-Her7_IJPNsdvHoK=C0sJ
z;QnA+bM_AdKh%YB5U#c0fT`dI;JM(@^dV7p*bd$WUI)Gp-Ue>mn&S_Fv%x=tZ5!Hg
zz-0&%+B5$KUfhBC5qN$_<~LwHacywrA=MMtE?_-z?FrTs*M7`4gPysTi3K^wdgi(|
zSkGLCgORy5v<B;m>nO0Exc&;PC$3|`dg6MdV!8h#Q=NzaJ##$+tY@z0gE@0e8w*$^
zu4ziPfaAa65!wx&2tEoPMW55<6rPqW??2NR7Oo&*ELi^gS#m_4f}epGj^%ix8~ek?
zF&6{R0G9*5N@Ra9c;y7<`o!w_{~SU~1XP{G0iD3wWajSRo!|lBfGO-B0geHW1Fr^8
z1D~J4@r&H>{NL~$3uy>QMCyMlSpO=(0oK0?$OP+O1)KouUj>{4>t6+22J2r1<htSc
zuf7YQIk}C1u}io^kHK99^DFR=;7?%pmF#z>Y?z#~;%k{pfs-h;BIi5+{5!b14T8re
z4yX?v2yOvh1nvmdlg2&4deV3xSWg;%1J;wqW59aS*fteHcVv<0gA2hc!BIQ7gB!sc
zb~5h;|3b+kd4)&8YyM)sAelbxG@yT>Q2OtIqnPQ-Z%UTSfCT!4AOl>$<G`iC)4_q@
zd~i*$KYnA{5}YKsy>8%L;J)C8ifR6bLeRg)Nd$-B>z{d&X}6RA{~WS|%fa+>Ud8F)
z5#S@>eDEo-fj%(E`d7grlIi@0yAW<8;1M{@n=AYUZdQib=*}IA0ha<Nfh&P`mF4(g
z@CYB~dSHVuy?@IdHG>dBg5((RZ9nEA;57P(BmI-W7s07ucYpS;1cy{$-lY30G9TCL
zSG4etA1>(ul{ny*-ar8JbIG*($^RD=*@1VGX(2h&XDrEP54z>t+zk(_Fc$;ogG+%8
zh_6UoL=K}tB=)ovx1rg#GK|`LM2Sn;Y!vv(Z_HnznRjS57XCL6*gqEB?GbYdI2-<z
zV9%}muKWYo7V`rOM<4`kW4;Vt5C2W@U+}*G7eyYV5PS~tt|j<_7VPBq{J=@+%+;7}
zhD20oi~whxpc8oFQm)Vw+-(ta95}(wJR01ce*G^;a0>VWW$z`=2e(LNUZt3Bf{&=+
zK)|mHIN&U}=R)Q@@Wn;U&%l@FGZ%n&x8*6YmgEum9b5`rAFnxqlI2aX5&?A)P%(lV
zXa$bi%qNHfpWVb90}k23JOZ2po(7Ibd@A_)MvmV|Y?F)58v*+fupcM<5qu??E8GC*
zr7%APzlQ%kcrpAgo;(7lXK{QPa8vk$Jn11UJJ9Ys4rqvg>8Q{i{QWE*k#6ARxy*gR
zIrRO4oa1=#q<PF^!3_~V9UP7J=6TZm%MKW4bA=TM*fo`TBe=zMW(T+y8aM^MKa>3z
zz*E3^U_*Dlz^CAO(aaxg5dMT<_2LeteaQh{;6nH-f}iwce?4%yKFlq_vwJf~gY&?H
z!I@vN-!>k?M+D3Qw~yn1#o&S6nAd~1^=D28--uy83HBMld<pC{nE597*B**(hKCTg
z;tF4a&vfMqpTJ@G;-pw99-)!l*zX5k0S*CY;)IRCozS87;1D$>H2=LIv_}Jjz~ym8
zBf)(!A~x_cOu>AxD>w}t(U&{86WkGN;4rv#9rDxp4d)?Dk^=E>V4pDNr{Hqcm_LH|
z!C$mAkH{6UA9!p-j;{eO6UrP8zCo-W!kr*A2;qSK;LlMZ0UTb7{WHOn5ih{!!JELt
zYjXU4aFM3W*<jBgB;l_^*b&SDkHCT80`SMr*ze}eR~*uexjZ<vIdc#=7b6e`PC)%O
z;L7!A{N<2;0pS1w27^DZ!Tv<>!xqfR;2}+z1$YyZ?i;~vYqLKCd<!FZ2E5FMA-)b_
zCMx_19v99Hd;}lE8gVPbBTx);UJ?8^I2gPEBh(1I3{%h^Y|F+K_J;5dJPaJ#h&wO|
z9A1TaF8E+w=4Idm)tNVdue@eX2Zz03KFLhK0)uc70Y960E6oEh0sjeZ?ZN){;6(V%
zWqCw0t?Vxaj)1?CWO~_m$IE({^3(hucjpRiP+^8Ea}V$>G&mUis3`l#fNkLElIaBg
zc-vnjxhTET;?*_{@qwMV{!YpA{`(TbK?JPs!U0*}b5YE{fKS7J2fP^mXW+SL?*n+l
zDz5Kbj!sDDoAEufH?eyD?_JG85CS|=p&qy$xGne~K5}*eze9ulB-0h|!prw4_>))j
zoKFH9()fHS<?#H!n8AWTz}<t)YrsABGVcO!KFoXsyyXz{&){!&G3SB%q%%JOKi<Rq
zp&ZS>9Fn?dz{!W%9~~$Gp1hkYQ~*yvd`<AOo$PN4_5pVSdtc)b=n2lb&m3oiu>Lv=
ziQtVlm}h{4z^UNh(BMXJGI$?&eOd0%N$`kYna_hWz_uF@zQxbw9)YhT;1xLCl{=*Q
z@(5jVXLbWmg1;<yupj#?gVShT$ep$>xI`z_A)BEYgeo2!&;dNQB6AP0R-AbNcqiTe
zvcUxKT7Txr;1JYL2Is=R5Uf>_@ihNw5adHdHn;=)Ej@H39|q4#Vm=361I`1df}et?
zqrJD_`BS;RiAF}QxrZ{I&Tl9I!JqC|dBOnjb~IQU{1_c-2L2609tj?+u83Y}z_ZZa
zH<D>%+J*nG((&-;(uk_(e+q>67~*B%VwmGi;O5}H;NIX<;1etO0<MB@U<z)5pOt6-
zBk=L|bp7&*-$IyxPd;V}ltUDPD=H2C3=LKQU#5Dpehsi2MyLsRD7`63e-!wr7jr-G
zM*03P14cmzK?BL)6Q1l}4juvC1WpELfXm?oKZ4t#y=&lc7~wy_Q*5a44#GqXso9@L
z;4BtV39uQfzC5@aI2c^JBzL$0xES5O@&a3dr-D0!ZBHTefly`{SBL`-cViv{E{`dh
z0Uk_Ssyu-md=iUpEx17v`*(un7A@lsFxw36a9jL{fKhb2$bcJQ`4E!)09=4o{}MbV
zfc-@(@QAr$O?ZJXQk<+`0elGiOkKq`P87(1RtWeOL)Z;mrvx`J5WEST0KSI?r-ARF
z{vz-ha2j|ec$Z{(|E<HKIEVn5wIkVZ2HdnV^EL4I;D_MF7?IcDdSC<HE%Gk9irdmv
zGVKE+kPj_Gte*e+-xvm>LV8y|VLh<^H-_!N>Q4ZvgWbW;PH=oISpO5kQDFT~4kv^4
zzhRtT5zl}9ZxB}_K>s7g9boUx{PfBM$LBMj0PDZ?&Iaqh^_Clu90C2e-1orxZ@=>^
z()`N+{m0)02+)7*?L@0x`t=`smjvU--iAtG{kPtsVEwn=O~Lway*q;S-+FhqLC}Bf
zJs7P2*n1>c|FL%>n1Ad|znKNA-+I%O%mwQ|k6#T|KaVH>Zm|Ay`kp5sRCvNedKp~j
mPv$$|de4}jgVk@zsX>iyKRH69k+>`lPH@R+INUP1_WuLD-69GA

diff --git a/3rdparty/bx/tools/bin/linux/genie b/3rdparty/bx/tools/bin/linux/genie
index 021bc6ea4d21c1514b94ff40d15c735469e8e523..8b602ec23c8ba3d16081774bfa1deb443e586db0 100755
GIT binary patch
delta 16862
zcmZvk30xIb7r@8yF@uP>Vd4`I5y1sX5zz>DLPIn))FN{WEfv$0{K_;+$pvZAtYb=L
zN}J)5VVKgQ(ITRmVJV@RQ5m6`QuZF$eE<92lhm5u&;IwFbMAKTxp(f&Tw42L(ApP+
zo;mCjcC)FeE9~Y~<sY;t-fp;l6>m@U@Y&4`%KdNI`PkNAbIs?27tKG@z&m%;1HV1=
z@{+asvbB1H!nL}7tdZVncHvxK{rQF};nPgNBdlkS9wD9XnLTUP^o)?`X}x++n;P45
z@|3imvAyr<K6T2Jv}u#OckdN5x%)lQG1FpVVtS{gO^q#lE=GT*VS_sg!~5uI@5_c7
zy@_r9<n+n%yQOz2%>F}9vNxPOXS$4R+^|RCS4|u4c<cFR8a9{Tb~kL)Y(9##>2v3G
z@+4&PriOtMxu;=M^NM8EriMX{x_N%}m2I0Eh7>-rr{VG!yvGh4d27P0qi>hJEp7fm
zs>*w!?wy)ZU*<j2?v1)qiay4cW_mKn<T19VW!1}GEnV#_JU_-Z_Pz!olK6pbHrV-r
zZ9N$Ip=}D-@uBT-!;px038~fF$crD@qW<~y(?_;xm?ZDDjRr66wRH!F_S!~)cKd8w
zZw^sSn3g{I-g!~CKRj#7?DTm)lkZOpnK*E8%B{DJh@1D|lukW6hdkOhBs98vkKUm^
z8TU=S-)G{SPCX+-<d=Q6Sq*&T@egb*W$X7|{)Ss#xcFn+Sl=7|yJY!(v8|2ozlt$@
zI{Qq$KTRIrWosq<cj`@LeWA@~nSR(-s2}#4aWYfW8nkVA^FysPZL6+nrB6GoV$utr
zcUbWQk-?l4P5T?xOwqJE#bGU6sA-SE<Q1A$4cB0Qdac8%hT~swShgE9?H(8i=fD^!
zFc}7{b66R0G|YihGfCu;cmx)~7hxGJfYq=D+FEPc?dvHB7Ql4)HCzLKgZc0>EQYOf
z9aaU5fHiOq^bOFo<uD9xgz@kc91k;tU!+GQR>FMvIxK|&FVO(F6#BN&wAWx5?EbRD
zN`bRrI$Q;_;7*te_rn6%dV|BNgk51B902|8nwA11;Ttg7u4QWaD-?v_CYS?TZKMGB
zJgkJ<VI3@n{%tkw7Z?fe-sG^-;BuG+D`5c)%fo{(1=hi5U}T`C*<PjL@NSp`cftHX
z#=nw8F@luW$N(RKfkB#f3?{>X*J&V}57)q>un6A!1|EVA7#OT+gWtrpa0^U>Uu<S*
z;j?eiAXo&e;a+HKr)dZBvCky&C5aSh!3=oM7Fq@u!&0~oR>Id{9XtyIL+II7hm`=g
zZ6hDN2-m<B+o=$4g4OWFx3O=pX%Fq7L+}h-2>ZO_url*Vj48knu7tKwO?wpv!mc~X
z0CyHLws7DsJO#tvBe-Du-MF@crscx%FnA9QfX~AsxD!^uPDKn|n5O*?Cc~Fu7EE~G
zk!cl?nDsslfXkq*qoy5%@$f5{0_T0en8BxE9^3_s;FJ$(2;2t4!kJEc87i0$vtTLA
zh28hjaQFtSfhS>P1hZyeCKZtA_AwQ}DX;+UgSJkZmRQVK!Z%?KJOzv4q)#Xau7r`D
zk;4>t`=_`T&iRaTVEE?@9oz;ZyJ(vKemn&)z<gMqS;Cl-Xm9|Rch$5fVLEJ6N)KTY
ztbi%71}=wzkwo)B#vV?DX>bY5f=v!lF-(OOaMWQc?#3*FDe!NY0sDW+P{GWlBnn6r
z!!lTXgt3a!w4PtF;K0)`9R`(=0WO6_u=7!d488-yZq&4<U(+Dy_YEF^|APhaJy-?<
zkFkz)CpcjwY<Qel=+69qheQT~4`B{GTTaVilW*xU>;-*$XxdyD2ET;y@Nbv`TU5|M
z7zDH6AealYVF9d!HE{h2@<+1*!g%-x%!tmUf|FE?U@**s<6sdigq83htb=La;hCPS
z127Uk0TW;;%zzG<1N}}>F}w?w!X2;@cKx1m;3LpKM$_tGawdspKj1PLQAu#Y4`2=a
z3kLRL+WkmHa1<O5C&3Ij=O-ErZ9h{16c`h$X^qa{5%?fn1HXhtuohOp-oH>j90D_a
zd(*>TDH#4*O%Gt(a})?4hyHzV9gKnRz!doCZ@3z+g?aEKEQQ*6Vgk;GVK>pR-{~PN
zhYMk=3k(g6fCccm3(WsY5?c}2;+WSk5c*xDB6tVPfD2(R`~Vii)36$Ls3A6PX2@V9
zJOjtW8~&g|I0WXxNq^En_!6|mYufNjbO_$|S0)vZ2y#*|Os-{&;Nm(A`Z7i^9lm4H
zQ+N&*LsN5F6|fVmfxp0*egt1br?n7f!(6z`=Cn%TN3arp4(s4y=-*${zR4sJNy6L9
zX(hl_Fb&qgELhmcX_dhSK2EC|1~ew)02Ue;1Mi2)uoz~*P+uyB6JY_I--HIjgD`L)
z9`$ot<KfFN2WF0F>a>bUSg;a)(u@WT(zF|!Q!v~Cv*3mn^aKuTNe^Jm4d@4J+EXwc
z9)oM(%GOS+2sQ|yJUFTi<t5M{m;jH$@o=WyY2_s_|DSA24-w?U8u&Z(y@gp3=(NJ%
z2AB*-wxa^LGK31?8?XX?59?sJ_EeOpX?buw)Iw=Ed>-b)*ba0Q&VUuLUzpRf4auY>
zBm#$UpN28;JD3dng;OD10*m1qSOJg28u&Z(9ZEDu&`@|&Cn|<hyHO!r2#b_H3VZl8
z47`=u0F&Y0FfEfr%Nq$A7z5Y9F)$zIz%uv-tcJDSomPAjp6J0?!zW=ryc9!B4C8j&
z%W0*+h*)|KN5BI3H>`t`d(+TlZr3mwR>I7MBx?H5QrPk)dI*O?+i-TdFbwX13GgIL
zgZ4NofJ0#(%!I|z`(}Cyb70^IHmNWkdiN#RU=NrN$G}o}w68PM3cQV;A&7^a`%wYR
zg1IocKRt)JFmR-%y$j=EzyN{{&VxB{H_V4!2NHB}LIOiMO4Am>6eutQ?zjby!W!s1
znsp&DlQAQ4GLeGdS(pdwU=iFg1dqUOLviu#n)WA*f$eX_g>V8)hws5WXu%@5D~X1{
zkYVIY(X_W=0&I{>2jO!tAO4m}qLjq?5x9H|(F+4%huf$C?ioqL;IUL%4oBU^Si;3H
za4bOy<KZ5d0#CsV=rx{Lf)TI)-U`d$OjrX~K;Ll;-OG1V5sAhV7*qH^SOmi+(sMWg
z+U_70U>Mv86W|{(4Yr;{K6n$%gR5XMJOC@<?UTuWC&3EC;ANNqQ>T#sPUe4`d*~s8
zQ!pQ%g~f1I8ZAs^y1__z!!$C$pnI{0f8K{3oIah3;2!9E7fbL=JOP)c;{o_LEP@+m
zp&!pK8OFem@5fW)ng2~@6Kn{!!&2CM4lTZ$r4z=(UJqahpN0AG!FhNF9)>mWBJ`iY
z{vd;bVHQk=>tPyv4=#k!^Jy>~1}os_|HC7h6Y0@IR0OMFI$ZxSJ%n`&F@%3EqJl|`
z;iK5WCt(`=2<E^lm=8x~GRE*3=sTH>35<c~;dt2YF**X*!8NeM<IMkJ5;GUm6FBz?
z#%2mnFfa{%0T;qLxCS;~N=5Kl7<doU4#vRKFd6zTBOiPh=0U&ZbO62%{nLn*71+V0
za1GqKg883MV%SRNIn2tThf|4am;wu)q#(E-7Q<g*B@B6r9!?{QVLbd9ro&@T(?D4G
z3>CwBR#EZ2nzs4{8U{axS@6YNJO#IJz%_yQkzoTpfz@z4-0})B0Y896Fd~nbn9l7M
z#=wtX#g*`{*O*@L%Qx^K?EEGb&d{_B7z00tDX_s?bOeULVt7+N<;`R^Wp2fQM5k>O
z2xGQm2oqotOotV46SSqXq{1*b_HA4Vy$gr|m<Wqt+D<$P*FyhU+&f?-{2C@e?|11S
zEP**N(?KGS#LQjv04|5V_Y>VP61IPjieQ`Fj2V0h7Q_B~u$#>i4a49WI3CU~qG0&e
z`?wem`2f2)H26b?3YNhP*x@6}o5TD+L81~tkG%}V18hJ(Wh~)om<208qe8glb5=h1
z4fLPOtl3Wk;YOGNYheNGR>CxfCt)27K0vwi*!3QyBQWX^6~ne)Q0}};T=gXdW@uX9
zqx1k~A18=las>sz+oAu1yf}c7@U2tK3Rw0%1;EEn(*d~g2P&G+tby_HTqSn!#viE=
zE{8?%BrJn%e`2UINmP-D{GX;x`<W<(-G8A6a0IM`OREW{hcxXom=2#hi)Y~Fa})$0
z{|y&E%q|+n!(r#C7$#n%9QZ1%gXuL4)dI%+59)#U{mJ~#C2`v&3WBkJQP3lrb^^x0
z00+SdC&4VZ0_MYVSPZkA^Z@=2!xnNsuf>DVw~ijeZm<B}4@==TSP6Sv##4)!)|Z+8
z2_%~QP0!%o2DMfZe4$~jRSivBtz~<Z*#N^}kXNmh4DW^uVI3@hZH!v06!tc2EnkLo
zy*GBSqEW522Hxz09Sm+<YgNM4F!C{GLF3F?D}lrm-&!jJPKP;ga+6xC2JZ8#wbCDF
z$!uC{<-%3Xs0cpXf`%>T`QE?QN`b$^BG{o-tyKo^hSe|!+LrKi+?smegWYKmEQbZ~
z&rA}fBu4kBwd!CF49g<wVKST>O;6!Pm<RQqR0JP~m9QMzp5SphhJ4Tg(_wTkIs(VR
zeAq0u)~bLHLfcYJI|3tNX0twIAaNgD19NVoLC^uKVT(92EMt3qGj^~|Ja+I!SPT!s
z3fR0Ku3yfUtUnEhufTM8-vAl_qXtqgY&W>p3S2=45^Ali6<P<a#Vr^hh=av22Ufy5
z7@18lCCcfKjc|SBV(}?9qI7M6#1$LindcTeEMm6)@+UQ(e<PPTEVEw9D;LttVy!>&
zCfCVx_#J~h;wpI<={)`oM$Sm{`0dwhsXovzU`fLyzwY`RzktDh%?A3(cg04d%v_A(
zl#%CBl~wZZ1LQ5Q2%(bK@$Wt4H(V$8Z@~QrdFXZWNaTBwx4urEfczum1K6<Ft0GB_
zM{#C}EcgU3wa${wpBS<FfGj!v3Eqm!a#(HZt=$c>>C6n|e%HwZk@rAuzfK;5yeINz
z*U6KScSU}yk|+3u=m~z_s>5l>$0H9Q%<8i!#X^tb>IwppuR%V64E>b#Airiaef+$K
zHy-Tg%}=E-K)>*cKEbcq68&M9PMIqQl%ZdTp4*r!zkVwP+|qcckDnNy8TQO?(5Z6N
zkS@_v>+>pESVLYnqUR3d(GQ^~_o|`xR{8|=VNW<LZhcqj(_DI2XVK3^AENZ0OSNes
z@&(9=WO?~Bqw|z9e)LraG~yAOpfVR@<Up=s@5-g*70A7o{##yyyg71ZbLCQ%`Px`Z
zmr9q<ji}77*XfmQ3i=4-t*&H(N}i59>^ghZ=`7?S$WxTP%W29fN%c<2L!W{^^&0(1
zrSIzUS}_JsVbJ?(gK2)vh!8CdSta^cmx{ih34Z4?>9XI58oqhize6v@ixmj@)g5*<
z9Qhl_JF4<MR}9QTz7u(d^gUn%%HI7(YyJ0SQoi2^*6WsulX~<{%cV;RzlScDq!P8j
zESI?@$j2|2jq3Ls%cZ2mh>hF7++p3N%62(Gzw=7isQPWJG`1DId{TPPrRonv9<V~<
z4j8eaBhkAS7+3zOSF*bzHUqnHD`fQnBPuity8)^Um))cFcFIr1*yXK|(+6l*`3mtV
zh0U`ix|H_L%a$pMN3*5mjA1v{FkP%{*`aI(u9Wgpqo3X(N9+fUsOZ8Rht*lt;ki^F
z5*o2jMUKnrFV%<f$am(*f`dk^ec6)^%TQX{rl5i&AN_+*O36Wdxa3Kx!Y*?gcI}jb
zs}FbA_d(SY=ED{RJ<CD;rR4D*xo^GHU49Dkjo1%Sa#wi--y&D+xnd^+{eJYDl%Cb%
zDSe>4e8^~{Cp;y)58;rWPfOJy8aM7~@%h5&XW#PlwSlDuEuk^nlBZ?L7uY}ijI34+
zTP6Dx(^koaFK}YcDzP6n5~C-tc382hG8&UX*Hq)j)9Os*uBB|0U$ddeRk<0+A73r2
z4;xX&H|Vo<nQ+W#FFzkPI_u4!lNMi6@Pp4u+?Q0;>3Ny)C5!dh=Vhb%-ACkz`n_1h
z=ZF#B@rYdQ)%pO`)F{QUdVh(2o;%VXI6|cRu9fT~<nORn_9?bpD^A77)=KnOMwI<w
zzE)nn_NsEmH|5RrT3PTFqrC#XYqXK8OX>5`|8K37pzrk^dKCdzF6D?i^gkfKI=WOo
zf8wPE`DVF%#0Ze#Wky?l+6ywHjD9Y8LDrSw*x%QAMmd2S>>&5eqk?=P>#to;)l8^C
z-f6wGIEw$$)=S(`%E@0ZGmg@~&Ff{OlE1QEO4RQc){Fi%zh7G~UA{I(=<lwV1z#Hz
z^z!vmuH>z9MgImn|6GathH}z!Wrq4aE>|{wgFY))N}%yYuEPpyApH&)t)<B^I#BSU
zbU8+bxR)gLn31SI@{(-ix7YTU9M(zc_O;Pg&L1<n8~t8(SZ6euV3D!!aieGG*9iV=
zEl8$k8LY}DUY6CzjS)s0zK-T-(yxu#My%sTkGL@idSK+92(z!5Sn31wH1x~|mp&1#
za+Uhve8&cvQBGIVHpptlT^nR~IdkIT1}Rs(v_YKZ^kmd4(&bwt+-v?T4(pO!4mSg2
z*0)AaJ!q3`M6Rc7l9F$+U%W}G6gO=Wp9)yANxCSW*(6C7EHsDnBwPKylqZ`jaOM54
z%IOO7KmMxtoPhgYmFN@H*XuQzasvIb*JRxZ>_2`@cB|jnuge9+J+DiblgLlJE=ebi
z*g@WJcy8;ig<y>9zOD~<-<q?C<uT|Nc>2FY*B`p#AmxWV^xNK$eJ73Z;NEZY#X#A4
zF6DUjmANbOs0M1JI!Nd5jEKxee7z01Vg*%<C%5Ici@eiya`hRyJ#yD=-J?_X3y~)x
zpLm`98stwR?{=L$ANfY)<h`q2r3xxW@e&FatNKgHE0C{3e)a87$!k3JLzVSZzh)zF
zM?6ViX6>QP(&ZE*b!M~7IK^Txo6phQUFxqGQfsiA@Rsa2WyB^ujDCR9j`nLdk#bb|
zmFQnVe*Ml-eLffD%I3<YM!-KveFm54?~U*&2hbCk^_Q|uLBAil)Jr}4668bk9af9$
z<XOlEA!p60zamNHBEKnLc7IR5Mxu8)IMv;M)s9N^Q}abXO*=-TAFgt%MTccuwIeK;
zPd3QMx#a`&(Qdi=tp6!;9-`|nSG$o%Y;jm4>!t3+G9CHEEs}kjNPc6B>^sd}AZIIE
zc_Qq*(NWs|U<B!RZIiekm<s#0$&4SE8fDvLHH`ja+rPKi1>986BEP=F>UcoqHmOqO
z6mJ)MB?BAqwxm|F+THcGY_6mM&%G_DD=Gi@+v4-15k98L4u{o6HP|(k?z=KmTmu`^
zp0{-9<FD$F>Zm$4a5Hii-}*~UoiyY<kuQ}+ZOkxv^G72>pI#tGexyC)c8dO!F;>6v
zT}l0k-&5a}?4Ov@6W^81isRmu5+%Rzt~k~2>UYKdvk@NZQ|NgXa<y+#efwglFQEfL
zTqsk1re8^gvJQKFd!g+6nR3SN5_`2_53a%HfnA=Up^TA~pn}7f2$;D`x}3rG&0Ugu
z#t07Dh@ETM^z=&Ak%8SB<YVRRUo7O?&lmyvj$LYR6|7h6lBzS*d2E-o_yu0rC2@-9
zkbi5~!_%0FP2c0&p(@YS{<-z-S8d2b-{C#k_=^!WC=0u&db{*`JLZs9hTUrP|Fz_+
za@3)RP47vID*BSMTLx9(kJY<nf%@Hik8G|Y4!+tWRaL~nsP`rASM)<akhzM9AIfHE
zEdS7973$LOS8h(1f90-b?o$p7jt=E?#P!dLF|2tVkTc%(mkN?(<dOSiMl}^~*(aN;
zjl|FoiXE0~<#9ct>F(z@Y+Vh<{-<JTah4~PcRq1gH`SNsp7HVQAIgw>Unf_GD~kU^
zo{qc#`PD0!3fL^<n~=L4=qXD%HWzsj@)aypAD(5i*zr>-Kg-D8_o+C~8YArA?sr&8
zDuZkESQ1DkFkPCI$dq%8bX|#LpCfWwA8=T#vMxIXWtWGY9r;*!v5VPOPMtHn^`Qr(
zihTA*k@r=Gl&hee=i8M=9*{1-;k*T<GUGQRe9)Vv4r`?{z|Qm9fk$fhj(83Bea1Mf
z=>O0w=N6!^DU~C?vD`F1=&-hFvbD-+Bkj)9(ZvTP?mQiBc}P;vGfJxtIjk9;g1BR*
zU5P&xx_OlL?IGEBo^T2OLY&y^rC-RP-zj(qCrEzL<?plHDPQ>AhzLD(<mv}NbyHTQ
zx;;|N3T*#MPXCVQ;=dA~3-GhABwDdmnWSDY5@UlnL2~_#B$tWN68U^p5PN%lo}YID
z$pYjLAb(zyb5(4oFI_O&>mME!`$Yo(?XM;2BJsPOqbRqijOty)0MEOK8`)eTcRidS
zR}cSc$kb7l%CBYJMYeg#-#D!8x}0oo2Fhs!dckq=sbNe$<46j34&Di=_Xhv$zg0lR
zbSE&;y8?Nr($Bf#09U}Lpg&PAb8Bdzqg*yB4)|8B#KCgD#_+ahaE|49RX78#ukrI%
zty1S+E_^F3{vdE>aDK(LoG2SErH>)3msZGV^dq7<$Ktxrk$)yttLV=_KLk0G%~kF+
zk30wYIONGn?zxm(^N_DZKCywP_Y|yeJSjeZ8c|c!Ihf{Gtf8tPh7jD2+~sDEPRWyz
zpFn>7GhUiU9;<A5X`;5Ms#6P**PfJhf3hAO{f_5oFLwhj{AmPt+*|2+`9;r`=ws^L
zsxqpBIJ?ll<$mlLb&1EGu0P6*OEi4ukFxp_>&wy~|NZ8qh<vM&U;hTA4EfW@uU_9&
zhpLgUy-u!1z(ylpL4M`MhG(h{_6wQq7eIS?Yb5>uG8zSM$4JG=l}oukfy@QSZ*Sn9
zMl1d@BD@NJa#&;B&kl$G;+9kPlT`hMTaF{2(a;_4Lmb9H{o7w8+rf%)XO)yVh^F;b
z;#6E+C0(3MRA-gUb>fCzzsfq;aOkfNIndecC>vUvwnk+P!K7cglgo=`JdZW0H9CZL
zum11nOIN7T!yBt*bS)*uSIdH0mXMTcS*JLyT6QbWt(J1dr>e!Nm|HFOI`~et#MRL<
z-?K8K&Ik{`@2qESov8-&u8*&bsPf60J`4={XxApZy?j$=bRLw!fg=`dR}3%F|9PFD
z9BO06Ekdsv>0YewxKf^`3RDM>9zQ38E*m34-^MPy-tM0*h041IyTWs_<1!t5|D2Rx
zX3aZwPFnoUt@DoGWXj*Xnt6x=PCO+1tDZ=eKaLlFzsU||mxtY5^>+WP6so<I*nRh#
zdOET<vfZp{28W~`aBndNSd4f7^O9;&XXr)QY~j527o|k8?M11wcxe6l;&Vx6tk)@y
zKBc?D<hc!IcfIc7b4Sdm(McS9x_WP@LRXz*8hD*tHN>CWP#@&){?pKcE1n}THFfUk
zP>qakV8-gr|B&nkW^mu#e>f~|F7=lxN1cZ{h#Y9zEowwF>PJ+Sqs~cHp}$MsKS88j
zZeX^xH{k$QqO$i~Y5@HEvA6tFqIEM|Fa1+ebu&u0U6O3Ysh4E8ZU*bmU6OM2gY1Jm
zn{B4p^>;^Ge+~m(y5?TQ_T62PYEKyYwtvZ>hGwk&X@|pVq*RonpawDn{bLSU-O!Bk
z`U3r0IoOCh!S@Z#c18r}!h(F{r84$PK{m6iUT2den;D)u+{@|SV@y&G7+LQC<)DlK
zJVl{zLE0UmYAfo#)Bvp+z}E`&t``CxxyqN1e4>|>*ywngmsBZE_Yxm3IMYkID9-Yd
zBxs*aejYYl<&UTIc#>%Yc`@fDi@nT#`Y=PBUS^b>_cFcpOxKUt(02@{wZ)TZiJr_|
z@&(tD?3(X`IIC^SXoJxhZOQ_J%2G^Or#RM>-HIzrDOX%+ic|4PQ|u=E#FRM2?@UQm
z9O5kt6eoMjI>ooWWw+uvZz(s;@VF-$Ijzs?+o6AQ#XHJ5wgg6nLvKwydIdyUApdFa
zBXQnlV(6Xd(>?knJgbH1<6JAR8tp9XUiFcU-bC@{#!~K0e^&U4-iY5_{3Nas?O5a|
zsg2BV<8?o$wXA^zS8`9zYh;G$KQ)z-MtHBPnfUma;a+jgoz_No7!UU`d+IsOWifKS
zu(@pZ!EQ(ksZzhEw~!W%37-lM%S~2yrU1FSvDsGlYAFjEQ*m@l*`fGeOQ}-)uBC+f
zno;&ut(@+4Lv@BrMM3gVcEYV>hA$2Mrj=yF5uI*uS|Q57b16sVVb>jb>nlR2<m%|z
zjmTZm;n67vmLczj{9#Xr{hOF!LDO10ui5e|!<bt+<BoihoG3MdWNs5PJSaE7X^rs|
zrr=rtu{)0ZpadOcdXzRX{ln+CaawOHgCTyW8t89c8CciCT#3DD$3YF{WJi<NEq-Q%
z9uOp>{qRV6kYxMeNv~kpq1ZlH%3)|^u=773TEnQrHjGc#!4ldO`v-$1sVU_w43@>}
z_cOt=8QOEm&sNX1C!9fZl!r^nw<%bvn&RQVgT<#A3(mN95~nyjL}oNIN9f@lWOp+>
zKd^)IUmxholKIw-+_n2sWpgRdS0kT<+_lF@^lLVZ`jkU$!}&0be6j0|V0#(T+-$Gw
zF*2jM8J;v7g=-;jZFbxR<&fb~<egN>p361l$e%*a<3|0ZCVmm}4IQMsIU(s2CiWI)
ztkEURX}zFH)Nh7e=Cm*a^p#<fjY9u8O!l=f6N9TTaCzn~zh<}7QdN-cHjb=xbXw1Q
z_Jf03;-YckGNq+C!ag9v>3$DD*KQ#Lz18x`&?iDl(Cglv#O{y%<p_yWv?3(cpIPSH
z$!Sez-`>W&L2~^~fBlwDvcsR*G7rV?a``+#rne$PSZ4`sWkwlab>_H%=OyUuR%RQ0
zNEgX&r3NKh%9Ua7XmP6Fdt)T&1~bu~)5m$ugP;lyb*k@?KC<}+rtxz0d*wuPGe|Dn
zU<QXby2<I@*}8mw8+~zKYOvI4KPUFnc|r~{gC(Uk>q6U`WkGAABl>39+*-~1c&Sp1
z?<-vb;L(0EBY=iA=`Y!eefrC0#q|DC5<o-83=n%8ey0qOxHi~#8z>9fAWs}DB}$$@
zSgP8XQE|%?JTCS$NJXpvSZ+_qUBTdLu-etTFV)c)^ve@uw4JDz1X*CGyv+%+j^BoU
zi__Y{7l^LBPX5kL)MX42y)7MCJVc@u=M0r8ZHdB#L$9q^MVmSmn1P%}FP<LUFVmDf
z7kM^vw(#|rS`P}4=M9x3lxvT@)e~GE3l%GM=!f4b_CRyQ-LsRN?k6jcUUf9>4(dZb
zRLO>^+v>vljw*c?`dsv*l-}hC?qGZuQqL4><nquTME{rE9mQ+x6M@{W7Y!4iAo_V^
zn8XDUDP5EQxwEM08AHB>ck;y{S=I$HIkJ-F2=-p5vFqjDDF+2p&KJWaDHwgv5wbWK
zeT&<i)}!uC+~Hs**B!S>RWOcUcANOLV_<*3O;X!23!9AOM2wf(#J83$?aZF`QDd&{
zxSDkpY#qmr5obFhBVeq=g_yC?Z;rionk&brkLR=_@~byes^eM6e;F$qL(D`ye4JE;
znEmYi$NhVN!tUk|*~ZDB_GVOQIr;(QbvyE(kB({})uBh{I9ZLI@xz@?>yjq--^g>s
zukFnseRZn%gyPZL?vl7rdb;p#$qqFWjpLJ?)^MBqxyBJ{Mi^hGIjw!fp6eS(2eW7F
z(0i|)rLHbb<nRu1AJr7krD7!ud1vG!C^4LS#r_Uvgjd8&r`1_5w__T&2qW6P(<Lqp
zr>{wu)G#x)<D;`YYx12ccy6!XlPMpsnZ%PW`gZQk-IrlzJNu0Lofcarmk%-t1~r?i
zv3vG@@#$!WM;D`Ka@Aj|!I6_Wi!<AK%`1OZegg7{*)pXgWBBfD+1Qa_an6?Vj^;G`
z_y?}N8><e+PvP*<1Ckn!3zITrb2yvR+c_}m+7@|gRCX2E=?{ui*|nc9NfDGgYrZUw
zU<hW+mmLviKjX9cPOBDAN11%4>clo>?!z*u69%t5EOR^Ir~wOPT_+mW@)0TTL;=N*
zNK$7uhXXpZIh?mp{fN~=7s=wz7{9VeHh0FWMT?{aBfV^qIFZ|ravIe&^*pUp-U^(?
zVb4b;t_zFoGmkp0S?;~hBHn`OpFAq-x|p$X4{+47x60#sNpgSvlvds>BmYtR)t|=P
zd!TCMb27!Js~H@v^1DWfw>N4*Q2D~{<-o^v@)+dt$Un}M)UNcW$z!s*EB#sYnCymL
zFFeMfG`W1%2$M#U>^v?nk?2U4s#l+Ix(|1dUtNLxQ%6B-pO6KS_#t4atc#?a{YzyZ
z_WJRqav{=;4IZ%U-!nOEI!{E%gZXF}V%lX=H#5ZOu$*Ie9>E3#UPD&UBROkv-#gt+
z4NtcCL{U>_wseW2hUk@&8pUEYVWrdJ>A3z<b2FD$caN@=-Pi^Hgx;*zvVfAVM1K_d
zU#>YDCS7i1+^6Qqj2q4Q`iLjx^o>;T=abT+JAD5s8Pr_`)HAZ6I{{X*N=mw$(~=@q
zdn#rkC3{@14wDW+&eJSk-Tay*dF1MF>2&0-t%RpBYC@>PrjM<b1wE+$<7Z`e56VBD
zFFw&cyaaFI9H7TmL3J$be*SV}iwugUT_d*0+-Mv#X^X5=e1=acN`7vOT;R8T$X2KO
zsh7KedhMt}(SJ67xv^EEdt(3GR!QxNLl1401&ZHnm32MM#EwsFbGqNxx$YkSBd7aP
zZK)*xOXxew2@Z`IwmIw(w@GLW)kbcY(J>S|eY-4Bd}+IEjG<e(Z>#17Ki`X4RLVz|
zUPSw@9n1n%s;i1y`NTBY^=P7^HsJwI^}KWKhFgu<c;v=AvY?kaP4816r+ZPyq5|=W
zB|5+4d@nBr>o4W1n7OR?1u`1F*X5l~Yl>{C;)BJ8STm$!yF$+m+g1NR9~ab+6qBzT
z`p?|6^m44(A-sU!pQ_Sb_ImLZpQyDbY#zt;cd<uvy{>96_x3hB>$UI6>fTfw>F2Pf
zxnIJ4-`i}h|FK(~y&0PcoCoft3iMpcw}Ba)1>GZYeHe=kdnC1wnHcmrb{yNPzm$V=
zvHJ*lv%<Z7%y0F=^L@?7jLhk=gI!iD`?@*m4mZy%b~A68n@_BFbMUKfKIEbQTkhX0
z-*I!T`ZFc22S0an&LKA+{?g4w-?$lb%*9MC`h;8X(RXgHJmqGapWNJX-p$T`xtZBL
z!Bs(*WH-0p>1O0?H<L5nY`DtJJ;CnRsU17J84~4YKu^U?PUG}(3m)m`=AZ=6Z}s<4
zTn}w_^Y=nGR~5NA;0rec54%}@#LZ()H*aW~=*s_W3pa-ZyV*(oJr!5&5ZH}GqcLv5
zymU7cX1m#Vj+_0Tb#u&Xh4w^qf|tJcJy|f!jPm*Cf6pL0hMBQG13q;d4*0b2!Z7nS
F{r?lv`4a#D

delta 16776
zcmaK!30PIt`iD2$!`Xz0XgD1P5fK#;6%h^5>wuPsmPVRHrHGb>Ib>Z2ZkUn-Vwq8&
z8L3xo&@hKc6RDd+S&BnWC83#ODxs#Sc@EhAzw4X>srUcf{XF#U^ZVBDt#1u`?ZKA5
z7qs-fpv8y0OS&1FI!d~|QTk=Wj8Cl2Um2hDck|749;N;RZ#*^HICp&H;plI!zCCI{
z_eo0+4V<%7pSM)^*t%5Lf4ooMFk$NyAN}n*>crbme>|*n$F3o5Mo*kDVcgV^Zk@-B
zO&>j~^XSf_V>^#di|Lw{-YG36eblIKX<a*~cZrVe7Sn0$106eVeJe&^T*sr^*6?n6
z`WIWjzM&`DwkG)1+4S*NDXugB9T`2$GQ#r6zy}|dM*=MYa(<ZQxvk@dTZTO=U;W22
zQ96EQc~?rlvZP6e9hO6H@3aiyFHpwrw2YC6Lzen7a+k$ZPVcnzyyIu*T_kzzki}0~
zd&{>6ExxjGmu00KO|shb&XKZkw`GEaer#zZzF%9KYz_a~GR$XQ)M3lksKcIlr?NZ(
z>U5vhNYg&iH7#t3-Q3Oj(WQ1XzPF}X-?5vy$(l9{UQg4sO)xW4(|&|W3p6cZncd7I
zecZctvl13UOJhyj4g=x0Fa}<MNpMoW-JA;7!Cd%7zCFua!^shZVt4}{g+A|5AWVXm
zCYts!jDbGO?Pdmyf_d;kSO6b`yWvDw2D9LG_$~Bls%fWT7<9mR*lmT~OocD4$f8A@
zoJA;rCM<!IR^k9W1$~-n+FvjX&i}XFOom^>40sV{!+NXiW<G2I3*kgq4(GsXxD@)^
zG_3$eLTxn>fhk#>WN`8Z%!L!yPyoCP%c0l%6aWLEe}Ja-hLP|ym<~_FY#9Fm`CukI
z3JYL0yZ|EuH7)f+9EYF5T<G@^4Tf3qob2YL09L~zFfd5dI<Cb-I0<IJ127M^Uq@HL
z|G-K(<YUqYYudXo9(oqw7;L@XZmxm9!K2Xs6Vk(G(9&GfT5ceHb1h42%SkfAa5xoi
zhXt?{mcVPU9R3BXVZV)37(&Zn0yH+04-SNR@E3R#R>4Ym<5SYN(6mE^Gzj+kj4p?Z
z;hGj%nznum2@%dhOQ@#ZhJkR-Rx-ePMf5Fv2kwTM+ZkMNdNHAGscF|?Dtze+9DtW$
zF|79`gAvYx{$ZMS5GKJtVK&VF59zWv`I?hT_%pP$(zI4z5gOPICc}L&7oLY}px+KM
z!XkJb8ao-B;Y_DpbQQGhrUDoS^Wl8B8){#Z9>&7R*3251mCVVjoaDkHSO{(3;7A)y
zTMpBq#~$pk8{7>)hu7g*7}*y6TO5TS!%X<?ca#HP-b>d(V;|`vG%W+B!o(7~7)I}B
z{#SD{Vm~2or)lS42Aq6=7Q$7q3>Lua@H7mJWHcY7@8MRM4u68#aLgephMQp-Tzi;`
z+cS%f;2<0Zr$T{w@YInkGH?=jltBX%j?q_9nzj(ez_{Z$1Yd-C@DwbDulzul!MZ;Z
z`VN}*3`~GyPY?lk5EjA)rIZV&LZ6NdP8bQF_=&Lq>t>bV5hqPxF6?&_&*2z&9WH`C
zoiuGP41;Y?F*e{Zm<-3m4EQ3<hRa|+JOc}1{LhpF|A1l9tbi~c4mwS}Fl!MfxtzQU
z*T4<17}o!VzJsk`H7thyommH9Bs>WdVAvT7f<s^~919EKr?3QipQRi)2Uf!)&_71g
zlFNxe4D<ilbA${b>sJOBY<wP%;Uh4x3)2oJz_l<Heh#O?Z!h35Oua}2@CuBH)wEF+
zL<H`Kd9dwoGzdNl%i!zqI$QyLx-$P;U8dl!n%4IUEr2ilPJ!?Q^zTOKU<|B#m5{^Z
zFdP03*TC3ov=lxLE8zhc_5hCkK?`B@bs7jKz<ihm3*jYLp2dmv2GcE$c?|>MSeOhq
z!m02m%!iF{5+N7|E8$G&)15AZk+9dFGz2~mGvNxD4?n+!gYYJ_#B197b{YgfbTI#?
za`Iv|1;aJB=_6SB7YTaMM<$_!b#;fi2KI-$;nT1T&Vtur?>Y`MrYD2X(_v=9GcX_i
zY;~9=u&LoN%V7wthHaq#KQyg_;m9&0IT?wN058IHnB?U!vtfN7hj|o^fR%7kJu>!U
zp@A{*8<+$GeI4dhI0NRxt*{UtsE>oNm7l{5>`g>rD*O}X!Vel`(IQTUH^f62bic!l
z?4xP(U=sB9#}Rl7mcZ4GXaUS^ioLI<{R%T+$7T*Q51xg^aD<KW;MxGnOTZzR0NV%R
z0Ne%Fz;jtaw2%`^FrLBz(5IiKJqyF&Etmv9YC#3?Y$z2#ttDLmAAr^HRT$Y{)2d-A
zd^`-t;boW)Uu#7};Z9fv-wt<}mIoP(yEqBt<Tn@tySAo6_;wpAgg?REuo9NRPHk}%
z4uC!nQ2-2sZ$wZr{GtOD!lSTQ**lURo`-=0m<^pM9}a`*1DOBMagvFU3-jQ5SOCl6
zQK&`J_wdoq4l_QHNWiJ^94vqjcV$cr<Z|21VJ5>Ym<>OGg>cvdL<nw!VM$!BVG@jw
zqu!(}23vPLMR*RD!IjW5h+QrWgWmD<9gKzPa0<+ZE8!aW1KbTq_MoM(90m?%liCx9
z;mCh5*x&+K0N2A3*uEF(vL2>ooW#RdU^*;=`EX5dS`M$nz(+LAw-4#yB$xsB!Ccs|
zF9pLnupDmbPgf4nv|}(CUV&4g_d`S!CPAN}tP3zJm6O;36a@RhH82?#L+?Z)0$+vx
zk80ZBffNj<!z8!`X21qX<b%UuG4vaRBXAn@Pu8^i1`|Oz0;a=DumC>zF!R5JlRq9I
z<c~3WVIZ72gbLvOkK!2Y_#~dgwXhnN!oXn+N*E9Ce~OmEZg47m66V1ySP1_OkHTH>
zI{XFt3}+p9n)#o^$>@>vDLe>^Vdf}W4!1zd<BSCu1|2W~4oagXa3aixZ@@M1BHRsI
zjwS-|V`v${V1;4uv2^mo%`hEK&Kg4tIq5c*kidR$H~bp<q%hr{VP3=MpCto)aUAL4
z;O9sOcffLZe+KEFU<sZ;B;cv%i2xinkp{tA&?l8$@+7(t223VWFl!7a`J8x7VN}9#
z(DEcpCya-SULYa-4Hm%tFA^Ep<|RT22SWd+*dM?cSO$~eA21y@_$Sj0=1jw3xEhwh
zkQqeeY3BcnoWwt^X?<ognBX5U7bep+WpMB;Di}#0!gzQNro*PMkPh~N1#m4q3NJvP
zQEW_R;}CoZrow;1sqh-igEQyg(5Ng1MK&!#*gKEDN#hP?J{7>$FcT)jJU9*(z~5lt
zXr|o)Du8h?38uqz=$pe>fMa19G@*YwV`U-f;3=2~>*Z2j7ALD0G0$PyYqW3-qZ%ed
zpT!geTfp70FD!@CpwC!FF^q@*c!S8nj!P&I)_;?V;dbc%jHXq*i(@b#pTP=mEGJUX
zYc=yf@L4i&5(5)pD)d~#n1GF8G0ggaF)@zIEsTKy9}!CU$XccsZ2K`0gs(vV@jQ%y
zF)*ZnhQJZ)X$YJKcf&V6p}gmqO&gj2$(+pEM1e4OGYMfnEQY&b8LWbq43^YS88q+{
zm;^^|VGO|KuoxC^C6e%W=s!VYiDUM_D3}08!c-W#jeKy(HU`-mPIeX10(ctwJkRLf
zPDVH#Cc(+Y^clPfcSHGtbQ4*kVHoW7B?ZF+Fc;SOiV(vU&}R}3@1Uz-<W3q2XTm(#
zC2JRb%E<y4IGGK|cl0HUgV`{CFBL-1eXM-21N5K5IshY~15SmHme3XORagRJ_mdvJ
z1Os1S*L#qLz`2L07{2g5<-)Ark5J%LO?&GHS^&@d#2|uePEi2-82Z1+g98`|>zrj)
zz{qk6fIq@==r~73FEML=rKPa{dD6joFb|%F#W3~)k%2G3%9oh`eJ)b)KQ--3m=5Rv
zMhoBvupFMc%wU?PY0a+?8Tc#AhL2sPAb0}$OlKE;jf&xFm<gBPpd5G`R>R#l>8crA
zuK%PS_|+{2`3&a&hjt1=c+EjUGwG8mDuk0@GW;B7!(U(ljIO5Tunbnh0k<hQlk53k
zL=dLKJoqXsgx|mtXqYqzE`WjmVp?leW<nMxV>rox%`8=BF}&(oWmdwctyQMwWo82m
zgD)CYW)l1iX2N8zDzgwyh9&Uz`>ISI8uW)Z>0pdcm6-?MgoW^>dR5$TupYq3S<C|8
zDl;pAlOj&0!W}RdZmVBqUWc{@IQ9xlCd`Ky8&VPc-XF(ibAR8s%1nlRo8UN{36H|h
zU?nVvmO0!VH>0I+e>4uk=*~C<2Y0UG`p?Naglbq0!?GFmF|+{gg_&?*mnw4&Oo7Gl
z1T2Tqu~hIXx6?2T4(W=+FbC$s4X^+{+pWqhgZrUnuBJsiKm_2kaam;G<f}M5gXP_6
zAsiA<3*dNYnaB3LC+Xm1m<Mmb-LTa^C>W0GMd;_VCF_mj&<-=;SAB2*&h1ON@Spvv
z%)kXSpnsK_4ac{5kOZ78fxBTjEQiSt;b9Jg=^=^x#u~1Flr3rBSfljEGi3?C)B3+^
zHyN|Fhd*bj{C$LO)T%so<D6Nn33|PI^jv=DqPM<75953dfA793wmsJHtn1k0?y#%!
z%K2+N*KXc_k6z912I!6N(fvKR#Gr@Xqer4QLvM1Ao`BvIy%*b>T2mxvsTg|AmBV`o
z#KgIxe`}4^m(G>AZ*g=98Jg9mb~W6hGx*Vc@6iL%7ogkj(PPjTqWj&WC!x<lAE5LE
z-w-{)*GmN=9la2}DbMO^vnWHR+i+(Ef#`YYTjq-Qch+G2=v;|Vyf#-7zoS7N=E)58
zyWKolrkFWTilA-pJi8fETRID`D!-1V9iAuG$d?s4-){C$_TIjJGkv|BJ-!?J5Omd>
zHHY%64E@RfN576f4qc_GIaED97FH+p!Ak7o>-U=8+qdbQI(lz!U%x)SUVZBJ^_3NS
ztql_A+>==qluYI<^hPSP`%roYdgeXpRdBM=r_C3~UK(F8Uu^r}uk$5N@%DU4*~c{4
zv_NL>vqmNAIse^z$)3zp^gA2jtPfp7Z{_wuA%?$f^m;ks*k_IP9Du!@JQ-^=v6dkx
z=g80!YgFhg<X$R0v7Mt&Bf7353`APF1<a9EC2Dr&$X?Rxg%qcLYYQcGKZCQyLP^<g
z4Gv0O$P*fOUIkTsAk%ab`Xsr%&l)J3_FJ3iA1#!<q}Mkrl#2b7SGG`W2l!pRP~r|y
zZgj4sDUQpPoCDbR<w}ua`$bZr(hXlE-UqGWp6@JTw#oYXMvx3WXbtw9`5F&+q@JJA
zL~;*W{iEx@Za3TB;g1Sa!hLMm&{ec*4&`SmdcD`>@Ih;A+|Sr)wlf|Is@n>%??;bP
zf-_DTwQ*AUOR)d+y7W0jJdDLM<B&Bt+Pc^|X;cp9q3VzK<{FOf<JJ^Z`N`;x*JUsH
z^cIVy;t=i6Un~v3r+*f{!Lvb?&{?m}kW@b?i@!?pZFob{zPI+Yc`mtoxTwBJsLLgJ
zi4;+e{{0fEP|SQ&Y=>d-o052#ddlCFDTl57qqpVR%~(}1^-Oiou~Z!W0o^&rhxqzE
zgs#e+ihd$bDpWZg-V)mp>tKDH$oM0aw_jw*5%gJ0W#<v%(eE8`sNaj<k%*)GE`3MR
zj*>27ndBU$y=|AtI;E#ClQW7lmx=8d!}YmkcJqA+@-hNt@-fEEkEB0ljk0}@e&?Fw
z?7#XvbX_Kg$rtnsc4z;ows0tW0rrFFE7dq`Dm{){1ERa~!cN7$=1~1pO}g&rZEHlR
z^82$|CcP^;$MNsWcV*{sYn10tq}w7V>oWct{$LG{9`K&sd{E^}@a?F2NtIVdzG>(o
zN_QV>UR_6@^`49;pI-c)Ect=*EX#?%@-IM6|6pyZ>&wNV?DqF0^hfHw`ktix$hiND
z@<vMgpGddzM{7sjcexz?(fX7gy<DPCkbc5)NjX9KjODWAgzD$zvQz!uKzYhuwp>C>
zVe1u=Sc>kmQf8N0qpV9-+RX);1Rt@skRM8|E%YNR#eqTh`nN><L<hh9Z<(POwMzE>
zM8|bmB^B!T<W&+^2G_5WoH7>kFIU-3#-(#0^=H7S2CFxt8&}EBGAb@6AME3|+HOAK
zPB%x_AFCPIs#Da9(t+4jUe~}+t1&wVU>Wv@SIf|o__ubo%sxrE2Ug3vllbxC`{Lkt
zXu<pLMcp-Vl4=T2<(F_hE`48$ezV#VD><(st+Vr7Y5U!oR=r#e{lISato0?c#_XJg
zk=T1-Z{s1a9k(`-FHTuo>TN!h6c58@&Bgfehob*Xq`&!4;(o@_6CX;NqU|He`I!bR
z`bdhfxBB@bo^7j2`l^*!f4rs^H3HR(*8pBmn$p8!G?n(JtpWO(wUT(+8XJ25I=k7Y
z);Yb`o%z&@*e>g273uVm>!kEF9k+g+I24VK#rBIe-1hp%cdtlEl%?Lw&iPnUez6YD
z8pZ3*5LLeWP$N4jfU7Bbn;H=+UA?cJj_$l#yKPF(L|=~nv|Ce9dLH^Y^!E4Y1?UcR
z(mYXXQW<t*xQT(4t@coQ8Tv)^JFkFBzwS=|kV-wq*KaV<iRaXZVGzDvW}mV4)O)R$
zoo85D_V6B;D^~3h!&x5bwyYQLv)0(e@3Hq%)}g+BPg9O6zZ|<A{oehTde3|XU8Six
z)M)k(Qtyi;=PdiVmY@9hMlu<D3v{W;r1J2m-cPSYZ+MTMjlK+>m8<qpOLRW^8|cna
z{g|&`uG-ejt<%>xWR7M0F5jlTzwwn0<<<tak4WT<eu^tN%E@vvfANW=l;h+&?1NO%
zYF*<3pqvQ{=EVp4aF^aoAL`Q8%kkh1c9UDt+C!yJM$bZjq*isUs~PB9(O*$|QtjSL
zZ9hkQ<Kr%0KQYU#&2(*}*v>Ik$~Q{lIctCY(@nBO{kCnEo#&WJk(;FgMi1Kj-#gpF
z<~(vlzc)ha#e4i_>GLa#U*M;_NYmt0T_afXezgXAPC|IowaOm))!NbXQ+`iz?P6M-
z=W2B6Q;9oI#hnTz;!kU^%sOxN(#I4^&Ur5V6AO6(c}G)K2-Iu&H_=)7YY#Qi($N<d
zO6hr4t{qz>;sUMOuvJnnScmEJK9|k>?(@ax?n|C?b*fz#oLh(7P(p~Eb;i9?K2kIP
zohxMl_6?s)=tZ^?iQE4Fa#WXBlP+PKOu0y-(zeO6i!2c7+ob3s3&*N$a!u)8MPjSK
zzfX!Jv4WNUyCU~^#4lB=8h1VvwyQ{HlP=C<yZe5{>Gz|xeyMn*lFmVX=aqur>WJZQ
zDtcv+R8&x}_ja}Avsp{fVzx`4-|(U1b{VgjxLuYg_TR2n@nHR#?NX{ZbGtZx!^g7i
z5>d(TitUo9SVg`{^zvd^R!P|v#d4V6p5wl-o4TCrW&}vcC2NSa-IsRrfQLlg;4=8k
zC2L#j+W*+i4j$6;GE4jVODwx9zmmO|C}H^y(J#Z(J0wo=>D?+Swzxpn#2I{!aPOU3
z4EjuTI=uE!qa+DE8-0g!`~}LH%hq74=ht>~gf4?`Q)|!_YdgL19!b1HJ9_Ss@mIJd
zs{5_oe4w@**BTMez9kaf>mFTww-EUM=o#of=y$Gps&BH<tM=H<PIr{Ua;D{@`;&eF
z&tu(#Es?33>Fd|q*GnZRCc(c*Fj{4FcKU<ZM>y?ju$N)~3j3})a^MeZOPT(=wY9Bt
ziQP=B&A{Tqc|rt>b%_-HZViu1-fuU%yX|vbH)jv}c67Bkmvj@cv&cIuR8aZXpikK^
zwyTVgmHVX+3_eb}9x4HjDX7@{a8Fx>-o!&b@i%O;@~SmNKYT!Ts*D~7rQ#|fUp!pA
zufe2)5^;?oH2R>VT(ic8mmaj6n^Z{;`YzV>wY9@LK^4wic<zwh{6>?(=d7*dr)$<A
z{o)~UsDe6wFSb9};=cU7-MlP4ny`hO@&^@$9hPN(P{E4BviA?xgKo#fcAfN7j!E2g
z?0Lr|O|kiL$${1h$5~PtnoZavA4Z6Nkxw@6y%6Pd)AAhpOUhI3Z}k^^y_AQA=->Sy
z5jR*)oBSwKZdjv+-p3~$tct2d>_CR^s%esQ0qelibnN|<U9HgUGnAf=-lZlVdJ36T
z7pPA};!nu68#uh?gxGFUk58GT-Bb-YDLFT3Sl~%nr`YAB?7d0Dhny6L;?k39RSmb5
zoV1&7Q<my7J<r!mm8-rONjxRvm3`+aUg9ge`%v~6M(wFnvJQJ<4qt*euWjUi4!@MY
zQ?ai=XVy8(9qZO}(Kn#0aawbzIITfH`?J_?;lJawq}^ijsSigQ)Fgtc+?Z(MgYJxv
z+otp+^e*Vm1^Ny>-L1zeE#22oZIxA9GSMF;eM1j^&|izVnP{3@hlw=T(%ZMq9M1$_
zsc+|+*#3+}*x9`_J1^ty1kvxj<k(q<=A4(Ec7nX?yj0k&QE8|6m_!Ap=1@)*cjgf(
z`n}IBj-sDOzjKXJ=_}EHzeiVPTJYF@US>Gx46h4!-%zPQCSdouAVm&FbN!3rUB#%6
z<Rg|Ia;btFn2}Z7=ybX$Gpej(_0E-Yri%7#s+7=b2EiYdGE}jmQf5~Z{Gpd*Z#C^&
zbV;r$uDm3++t@$9B#F1lH~+Ha+$Ik5F3Y;xY+Uj#+s$tH;97`=_=XT<)+y&zGqWo<
zh1iL;+ddJC>u|1g*uTWiEOy(+-(@exz8AY%LtN$0xXWJFRqLz$N&+UCJHg*KFsg!T
z3b>9Pxa~>4A<w!TRFwwedEG0r<S!bSenocvMT5V(A{Bqpz=|v4Z4%Iu-zCN5_p#q)
zhT@6eWtr0NyDAmxcm1nxc^l!LjrfM=3FlVktw)Tedbg`@jW-7C8?U}qWJKwQuD(@a
z1nYaRinj+ptFKC&2fw>sQ@1a{`cv0rhKCVr6~6cBTt}Yzg+=>>htay<48GTKt`yEy
ze#BjkFJQHvj@@~e?zAV@+EZzp`cCHs>;V$gk)<z9H+t$(H)WM>#OmE{N~w-}$u}jU
zju92Titm2z+_<U1s6O9$=N?_f)SuOT@lDC8gX7_U%DOs6Y}`1$5Bd(9tCRFSHIArq
za!KEi?}h$;uIYLjqdo&_bxUj(%IR}U;w(n2Z4Bx9t8(0j8vOn}+0xyT9E-uyZI{g!
z0+?!-GcfcEyL-3mY`?DVI-G~fS4_T3c8T^R-;@MN@uZ#y95O@k1&1v2q`vc2cJn@!
zgZv6=OibmX`(u?<cv4U6YOz^~Th?t!u^RpLWJ6Y2Derwl_F5@tiy;+o#7BnXu4msW
zP6fT#GNL<gm7M;Ku5r&fIZCh>{||d5_8s!Qml5FU)0^F%>ok(nuhnUjP!-%He(s?V
zcKFUlq*4X_rlq6rH>A|S`PE)x^P<y#a573}c^O`M-TNfRixw=sPu6)E+<)BXa6MIb
zdOGqhPcymgE%+byeC(fKf7I>79GzK~<yvc1B#N>B@;~yI_2K2++5eHCngY&Yf4sH=
zeOQf$s)E42yzcaNxLz+f8}Ro7R%MUJ-WB`&+Wa`EPjz~z?5TaTxLW;>0x~IJ8}><e
z6)>uHN0yKMzPb+c$y&SqM2&;av537t_W!u;?hRHF_j+r6oA&4aYmm25S3g--QoN1e
zpllz9>lF)aQ8^O$3*<l3(PzoOe&XTUdT*nhzR5>QNw44b5r;QDtourIU54i`zLHYc
zhz*_X=P+MzSIMJ}$#=RG&lU8TpKMm?O+P8EYxMW**^rmp5@cng)7!^ruJ^fL(tNNl
zxnFX8ST4%%mr{OP>+=oUA@@Eyv>pzu@RvUINVvma#@A!s#59s+^>FCPMsip&r;&L3
z!Um0{k7C!xGQ-!1%DULZ;d(fwoa9iQk@OJru&KjrsRU<dj;it7Y0tpk0sCZSXM4nY
z?!|_P7b)(|D)w2}LzLZpsHw6BeLi}V8j){noLdU@ZP`2w&N<{Zs)!#&U({3*>l^)T
zUp8~xb*G{1F$4H^y_u|2_Ukr>`PknDI~Nl530kKB@%AGi2LdF{k3c;gC@FqKa#C|y
z=SLu-L*$GffqODUyc@vjArh^a9U>_WNdH=h%uv7ghsdf1q(2%WrRw*E5YZbN;c?|5
z4wD_AbE~MXohpDO1KDk(&s4fIfJwf+^bF@<RSR4t_F*j~ts#DGXdz1!_k_yMhQ{DJ
zFSl~YzzS=#!Rj-%clg3DR2AYrR1}Y*uSUQ3Nm3>H=jhI@4g-21&Zr<+c&PDNE17XW
z!z(3LT_Ro2Uhg-&2Jaz#YgM+>Nq6s&{y2IYrMnMj@1g&S&K*bX5zkpM`YrTzJX5{N
z%a5V{MzCjEIBSLLF~S@EMiYH@xUBOxVm;5IyI+!cG(rz(Ew)DVUZ2+PC_drqr|zFs
z!Ip>l+N8D2Xv7M-p^X$ZG6n}NZR>D7S)hUH;U#U>3P`sYJw%@T#TqIPHa6P!85!XY
zgtK7}lhSpl-cg_5J%-&G*oV{&z~7hkFzo8Hyx|eDxiP(*9wDWT>7Dl?#JdUH7a?&?
z@F%mK%uv7EMasG+>>E}@;=h;cZrW&Sv<-hb#^JiHbUKjc?d#RAu6nDW94;He`+e-C
zx@)5`y{XaK+N`U?yx*dp%mm8Lrbf7}JkD{~4XbiZeb{#-PV{EDb~;WXn&HZ*?lQg^
z19fP3$!W$F@`WC9xS0`U&FR5}@Q}ausXJ?%(bBr2r^D>xAx~YiHj}4qMhk1+KOE)>
z4|%?w(M(n#jA`Hd&J~`UPp;=Rx4aq2+^J$`=(>lCveP>EEwTEyMT@Zmw|#)~IhwLN
zdkMpC><_qmDS#lY@8h^*bM<6xFR9*FpAHsaZ^nIJ86Iy4Fj{!N+Sg%@lauGIVPX$3
zT8A(1=P>8iR;Xv!ln{wlLIIZkd<Nnnry3iLB{k6Sx1C0CF6b%~hZ+z5!`N>0mz+R?
zc)Gu=3pBz*)n}I63a}Qc@$p3M4WjCVOw#KQ$~DsIDG!P*h)yUOASpqNhtdI(qxfy2
z6a^XKiANLfuFs$R>f6K;bmx|Nxa(o0((}>JpgZ^K{8oo*NFjQ4qC^B6QNgbbbk7r~
z7588@_V>}Jc*x5?(b<cFjn1|`Ne(l}ok&47G@Yw=Ns^QX)6nZl;!tcgNNmj+HOB@?
zT60EC#9)W{Z%tlfL`h+DBg{4jp`I#%u2xG*45yhRc-=Ett~F;pEE_BlA+$K|VMz(0
zpw^Gbsu1*rk4UM~#}5(j7DjBVjYAwJ6V*L7^g%UUrCLy#!edtKMbiETGk9JLqpdz*
zsB9*mK4z#?v|#419e#I%)EFsyf^84FE`!3=vxQJ2*gENP2OD8o-hrna8$*qz(H<l2
zUb9rFGgA2+0sYRMu39Uz(R+^&N2t-?`ZD$r?n`W1OC#7O2>13)82gP)BV<`iS{j`q
zdr7AcO_2)qyL+nG!iY#ks-%VS`@<(?Nf=k2qfg5jrFR-F5v}+g^^DAb(JRN@J>i{>
zKFv~r?u~Obhl*G>`YYqaq4H-<ki>AKf9S~P`KC!FaxOXgx|#^99*ZAIl%JQ);Y9TK
zM5zeJ&$AQ7yS35NwsVrhWV7nbmqjKu#w$tx+ayVAZG=Y$PIj0K`PxJI6*-Czc+k0n
zs6AAVB%o(amLkg4eW!?{HEZM0DGqZl>yDq%Qbx2f{t;aG!rhm8)u8w^K3ziZR!2QY
z;>}ze#=xqVL~m=v27k==r+aGaagF*iGNk;|VNPU;YemV^+ZwGsr%&VaF4w!%9c{V1
zd_7G{+Y<Vn)5JRhmzqzP#0WyRccv_hK)277y%B7umS(c}vd~y~sa~(0(bDs|e>u#H
zGNB!d@sM^D6!@}?Z%6F*IT<B~+OdP5ktL<=jPRCrK0M@B&mH*bHF0-N-hweKFtZ#I
zaNF9-TSc}EA##=sjbw>$Kg(fqEvP+|J#Z{vo6eFY*!2suq$rZBX3#5qZ6-lid6MAQ
zo?F}RUXh6QMsQa9AomA9&K`G7@=Ef5OZpb7RHv@q;J6Oeb3V`TI1Ib9$Jw>1cYI0@
zL%(*9?i@Mj0ka+EY*obHk7>v`S2_e4UUIO#(M%sRTPoV)@v+&`FbZCsEpbr<`mwo^
z6J<oj-Ja(#pI0Tgk3gutPJNW`Z=GYv>8bkU#dWAiWMZE*U#_Y0S}YJ-2Rf|N0!i$^
zZR64f|GgT7jnlOE7Ra&=bo9&|DOFs-M;9HL8%uK?u2*udxlwzmiKHH%@x1)cmDwGQ
zD9@Zle1a)K&3W~>zoXGEXbZoc&sN-isr31zKaRdsPBvz|M0Vo3((|>u=WaUb%1PJ$
zHA(4YjJ1A&eUyg;hZ(Kpw@$_&{e#yfE}8}nUM%Ak8@(aRqUq|dmpIJzGDtm2bwtzf
z&^!t4Y>XY8mFISZhs^2?uX7Sk;GPLRPC3V`uSB=5zK-32-a+Z^LrqTgwd{}RbKEa*
zFLh>D5b(Cc9NWybbLkpmw2kYu!C^8+-GfIx*Je+~Mcg@f=);$=tM02gR7k?mXKavV
zF|_1^4YD@|&p+QFab3BJ_;q2bT-YGCE*SgsUaJcuf5k?J>v0GDp)Ligzx^ljFI6^5
z4)#8muyc`74`?@f=tFA9g>&4HE_{>2_3?tUKcA_!Cy*r*dxuSOtqW`H$xRX$%UXAH
zlcXqCZIT(W_>sL?*2UuFj?Hpd@#bc6#4=v4^SY}mzrzY8tt<20?=x9OBWzE8cK6+f
z>fcoEf1dhG4tF)i>WjBXTsQI^+ahV*j3`^%tq!xJ^51=^AjM1}rdwqlcKxx>rL-IM
z{d1ex9>A|xw@KUs{Pro58S3}uB3bqT^XJiGsdzy7>nFB2{2BCx42?6wdu-ub<u)~i
zLlu|6seGS`o~U%!I9EGP=b_>fkNp<*F|Pa9{c()%kT1D<x{^5mt2#NKbWPFywzlYQ
z9MrdV>~5s%TYv9i%(7<LyKo=P5zxcMUxvFlZMKUs^IUB7u8T+3xajr1i)+@o81Si!
zbGN%VdXI}w?RW8+LoPmd+{MrzoXpZf%3Q)Hr(B%=vx~+#7vH_&Vv|2zoX|lvodykb
zvFqb5emcR$BmZ(S=XDpy1aeo-aWLG)z3p7w*-<fz4|!u<LMwIq&QaOh{X5ac(2rbv
z_cIr}7r9tb;^H^^UCch@;_O>4e&yrZjP&qx@oIpJhugaNem~fr3O^m{5`xCNcsawx
z&nCEdaj}c$>aDi^##5g9b6aJ`KqJcg@Bfj4Y#wODdSBS(N_b({)@uWe5A^>7>DbR4

diff --git a/3rdparty/bx/tools/bin/windows/genie.exe b/3rdparty/bx/tools/bin/windows/genie.exe
index b70a00cb408bd28105cb95d897bae81162d9f0be..701de893737b1ef2545e677ba6a52466b8336696 100644
GIT binary patch
delta 2253
zcmZ`*4Nz3q6~6cEd#o<tE`nf)KMSk^3Uv7quz?2U?XDt%2m&Go*xmPdyC7<zsO=O<
zK!FeyL-5uiokUAIMiVEQ#NJqK1C><lBr~+*SWrOx8M7oLVuK23*xu*RTARt6Irn?#
zobR6Vy?gI_^J*_ItnFLa(ik|b`!XNRT*FAK^288`UIm2Y!^h*H^6}xT4>XMzIfOo7
zv7XkeI2#wVMnNc+TXWgy{HO;0se%q`U7iQc^T1VC;7u$2VgT>L0Vk{$)GQcMnss$w
z;kqe)m%7=jt=mSn4B=q-#7g!K;hivGqyHMhFJMTwlb;OZMgH^c7StpdI9K(w+uc-X
z;4X582^+BuW81pMT^1Cn40RJ~?ZI{?yKcf$d$0&2bW92F>(uT$z@o=qn{jHH)12JG
zymN0TA-A?ZfsxNO?jk-nurDxNdIJlACs<S0*6-c!1LF9Wmx{JtU<?a-`4P+4-@u^?
z@BJJ24Fywg;#>vWZ(`e1v7bF+Vwq|tw#rWZZs9~vxNN5x-{KA&^q5s2EHvnug}KhO
zAv7o$^cPVO6L*o#8M71LG5jJ-*h$V9HUbjq{xQ5nrJE&M&{G;Gl5^kTbYM7p95?vQ
zXW9Ns*G2~WwGG3p)F7`HNyj+e1AZb|d>@B{P9$mf@hY9)lRCjO?O3$^J}zfjI{N`$
zjzf(sUDus{;Xu_=MW)fU{UAd9yb^xeJxnB99^#duasN-&J|Rau(;`fy=N{rk7&eIH
z+61126p>t+#6gfKlF3QzfFzOFr*I9Vi{$nc{xt|9u|2|{My_FHM#9;~x6Fu1d?M&q
zO^TqTX9OjRH1jbo09`%{UmwxF^1_tc-Om?`<m@y)t}B0rEQ_Sf&DKs9=~*}afQQ#a
zTE@dt6*P&obrzIkn2-t9Kt2>pWQzvkAjLrsYoHP<Hy&Qlr<&6%v27F##r>UqOS&aG
zs)frKik;*yKF|g!PO^C}++}Q%<_l|~Q=)CY5UGN0iFErz81zWwTYva7oR{e704N28
z84w5t&^t+TAhZK=lB<F6sb`AAg2KB7yuM(Bj_08!u$Dba(*lF*m13lv@cL!nxZTRR
zsS|0^0x+v#R3h~uumpxBaw-JggK>#$Ho%MCV@&YK-`HyQ^Dc?LV}MHm^Cnm<y3_UL
z*KYSnw3{8;{y#pJNL><C!n{&4kp$BaQA#J0;d>rXDSem<yD`iuB|A-!z>uCa0nft;
znS7WAci<hF99;+Ta7-qDUk5Sp8=3IwU`;+RThM@aCmRR(9yShjT@wsC_JZi5dzQ0t
zz`K`4BUiBR%D&E_u9_z)g)-pXCzFnJSf?!ey>!@tVO%D*3^2iCnVigkKuC0uzh*!Z
zxMad*LWp*hRr)&WKsD;`>Wg&AG(HnvdNS+C1_)BN_3Q@NsjPNH7A#{v&Ch~M0K+m3
z$$>*0jLP)md^nA@iEQCN_L7*&!hU{KCOZpYJ?lz43m{0PQ`)ed_a0-|$B)bO-d50i
z!yE_kErVcCJ4iwq>xUe)pbQ+dVS$5wRt42QzO_yZa?d;eI(x}RIc49DI7!BSNc>^q
zT_-uZAKrTI{Qu6_*IBiu>4GxvA5N0hzz)nEC*9QmLF&DND-}t1Q&EIQKo{Z!w0EU|
ziq{KhQJ#RZncXcEkR%GIvO+)`ekq`JwF1hi6HtDWfV8g)$or6h+7C0%>jD}l0%|-i
zAnv4q3T_IBpB7NFrwOTpOei$kgoaj|(2-sfLf1^_(RCC0?zSlpwR~klxnm}@@BzbV
z6Ur_$qt6{?R9<05&%bO&51E~*H=~c<GoyQ*W>nE_M!~&iq#81#E0bn)+rxq~7Fp2Z
zAPc&~EYM^@#(Y|F2sW>bjgHob*|u-n=BU!gNGlQ~F+R4$YLAUiuo*?G)h?A7jVt0x
zjJB9KDK0K9!EP7h=~NH60Mdk3AH3iXpM&@(@=c@4o2xD<FCkfts%2DcQl$l|N_IGi
zPn#-=p8B2Y%qg-xg!3W4UC+(-u4Wm#<L8xO$_*)r<Q5ZkCZ|=;#7R&lw{UjkjF(JP
zGPx5+XXUQiuvS>RF`JCJxP^ZIEn$;Nqy95*(8bx+Gdz+s!W|;3Kjh|m6e+^O>H7m*
zG0=!n?!JiiBxobQok7b+zMaAHY~IS?KsMj2(uXDy{Wg#1$nG3I>Z#Y2!%K`3x{2S&
zV9zGr$l%>gd=`VLP5kL(y&^>_FDb2z%HFfhy1l$oQ&M5q7p+RoS(};h)5>31!(zkr
zHOczm7-Muou%=2DD>Owr!eS$oj(2nU`D7@U&&C=OoySLDJ>8ne=XwSFNLli>Kku*m
z@iHEoaE(}DCn-PU=aK7K{NjdNHlE(HdDw>Nn$_ws?0=j6IH+tEq-U1GY&WwvZFGOC
ZItH^Z214W$K`qnyCUxW7-{-66{u_J~5p)0m

delta 2017
zcmZWq4Nz3q6+ZXuJ~sSy7g31BD99?1NCGbMQ_*O6yNgN@M}8IK?%T&>9W+8iFqt;6
z6%~krP&79Vc8V!ZQj<_KHMud;h*6?;v}31EW*31h#%Vweq5dG^&-Ombj7|Gy&OPt^
zfA^mA-gUX3bzOe8xjrH%{vU-f^>{(A2_zT+ZzF)x@?J9j@kUGUSU_x#7T_q?ecjqi
z){?^37chn>t@#+!nuno(ZpI!B=GbZr{8)h>sr&{rZR}zn*+#;1ZC2PV8kg-h)LtjG
z6T&v#?x6kUcDjCm%thHoYX(RK*4x>w0kW9jxShV}C9}iM*sahY8nr`JkA1#|Vx#t~
z*4SaE4li-6ihbP*@#<1LuG2TQap1M%f%>M61mKXW-r23+*~qnr7EB2)F6>^Cz@-yO
zs>-L|k;0u%#J)~L21zJ#K7Wvi5xJ5ThBiF%`5H^^tbb+WhSS``3cq>4?K=iZjOzFC
zAlaw#jH@JH<!x7q;|bWF2OKO%&A~=WZ2EU3H4t4A%N`*eB<cg6y}#HP$tiZ|)BOPb
zqA~I;L~(FudCWnHhK`b#utTD`qa*=8lGv_MGEZYT#^av29hK;b+awD)_rF8xrgw9z
z=lqr7eviJ+%Tx6W=Oo&3hrEeh5}k9G#Np=>U3r%*H*`(rnLAaEYuoOU&D_fT$H?;}
z#?8&OXR=N=R?b&pCJb$D0`Lf``bqUm5?z0fq{a08ud{yA`SMRnT$0#{dt^3mszk4h
zllwR%(QodPDD+D7{(Z6uuSry%Ak}zBqSq$KTX<8VjtAsq{3uT|9LF2qJOw6o640+2
zR6y5$0>~?|)ep%!G=Rej-wx}yFP`xEJi=2Bdi*hY&oG^9pP)B5Xqk_nox#D5`^ZrN
z7dTj%fb%sN=U}Z*;bwyGDwG6ZA%aZT2VfF<71k1fuMzddF9_Y5pOwqJjiS-!>2}YP
zWp-1KmkENCeiefI(OXK3rs0p=o30GS6=;*${!on9pe)lfVHk@}nT~|vmsl*b&TuS6
z)iXQ-jo4jEmqcJ2wwKc02s|0+Rjd$qsxRmwcj$Nqt0N-$RvKm*hql{5o$wvc{owPd
za}(`g>9f$H!#y(H5smZECDRY1@gv+P(;_3j6x_@K5C4^)rbnok*<mA|51;Wa*Cloj
zeSgE}8&34`q4j)mP^Pu%_!@pH)A4kCj2C1!z62i$*e<hs8TbalPMKDiF@<w>#EgP~
znF{@6CH@o53O%q2lX1C1|Go+rVyZ%gER>cM^F8zh+xXfqIQcqo>WXMI@DIe0c<y<=
z_63)7ZTuqsXW7|3aH@LJq`LG4zrlB%g{#zK|5p~46WpgzM>d-AphAyiV+6Vt`gJy@
zW4%JPt1()?hiAI+LSq$pPPyaj6_&gjUzv=0CkLa{wjR&H3iY%Xtik8F%nH}wc|@1O
zqH}St7WXLZSRr;2y_+XG7Nl^L#U5dgLMztcFL+hfy$+)^2DJ?Dd~h@8Zebrsw*e!A
zu~VU;WjGh#S7=HZuh*`yb!E88AG;LRQ;Ag}p^;82_-1tdmVab7wQApzoiuwFrvATS
zg_9oGg}<L!{NE8fyDKvrPOEXhbka3-d|-At*|s{2($!r4J_E)^GN8*}gtbW`=+i_P
z%@UzAUxe>C)f9^`CW~O-BEr%)MM$d=;pOcjSn5T1#OXn!2o235<hF=#^&Jta4vBE<
z0}(O@MY#P?glPe0Xqsb&&KJ#4vfK>I{$_?FZZjP2H^c9)nhW6PLuP;xGn~E6`9m{2
zDzZRTsRhC|TcD@f0*7iWuyMNuHXN})+m{x2`YQ`uxM+couUNo!&jL+aD~vy5h4Ztm
z&^*TqXI{2KZytMfFBYYl5>g^#9b30-*;E;sYAR7=hu!2bIg(8dDJeyg6Qv~CZcmj`
zO!A_I$*D<+N?Jk!n>dF<h%3a_5Hjnt3s~|O*50658NudsXg>Lnjx5y%uwUnBs}Inn
z*R@6T57)JClQ?#=Uu#2l_y_IX656&_*h&L)g*N(rt{~BXJmC_?q|?25!c01rC-AFp
zN?4XJ@L87yJ^d_S$oS8je}NE16J)_a7Z(W2>01RtC!LWcgzGkz$C?)K$d*FkY3eEz
z*3w&vLMZ)qo8NR{(#A-UkRL=BRrrNaZ<a8#&ZG#;r1)(bAi&y0T^tE(=C|peI9=d$
glarC(lbLqbYt}6!)L_*)naiqkO=ImvI@`2=18EJ>{{R30

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf8ce26..1c1c794 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,7 @@ SUBDIRS (
 	tests/
 
 	3rdparty/glfw
+	3rdparty/bimg
 	3rdparty/bgfx
 	3rdparty/rbdl
 	3rdparty/googletest
@@ -144,6 +145,7 @@ TARGET_LINK_LIBRARIES ( protot
 	glfw
 	${glfw_dependencies}
 	${OPENGL_LIBRARIES}
+	bimg
 	bgfx
 	bgfx_aux
 	glsl-optimizer
diff --git a/src/main.cc b/src/main.cc
index 1e6b923..40a3960 100644
--- a/src/main.cc
+++ b/src/main.cc
@@ -10,7 +10,7 @@
 #include <unistd.h>
 #include <iostream>
 
-#include "bgfx/bgfxplatform.h"
+#include "bgfx/platform.h"
 #include "bx/timer.h"
 #include "Timer.h"
 #include "RuntimeModuleManager.h"
diff --git a/src/modules/RenderModule.cc b/src/modules/RenderModule.cc
index bec7a85..163e07b 100644
--- a/src/modules/RenderModule.cc
+++ b/src/modules/RenderModule.cc
@@ -13,7 +13,7 @@
 
 #include <assert.h>
 
-#include <bgfx/bgfxplatform.h>
+#include <bgfx/platform.h>
 #include <bx/thread.h>
 
 #include <bx/timer.h>