diff --git a/CMakeLists.txt b/CMakeLists.txt index bba47ce..cdda6db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,11 +9,11 @@ option(VTFLIB_SHARED "If enabled, builds VTFLIB as a shared library" ON) set(CMAKE_CXX_STANDARD 17) -include_directories(thirdparty/include thirdparty/miniz) +include_directories(thirdparty/include thirdparty/half/include thirdparty/miniz) if (UNIX) - set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-undefined") -endif() + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-undefined") +endif () # MT/MTd specification for Windows set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>") @@ -23,69 +23,69 @@ set(CMAKE_POSITION_INDEPENDENT_CODE ON) # VTFLIB # set(VTFLIB_SRCS - VTFLib/Resource.rc - - VTFLib/Error.cpp - VTFLib/FileReader.cpp - VTFLib/FileWriter.cpp - VTFLib/Float16.cpp - VTFLib/MemoryReader.cpp - VTFLib/MemoryWriter.cpp - VTFLib/Proc.cpp - VTFLib/ProcReader.cpp - VTFLib/ProcWriter.cpp - VTFLib/VMTFile.cpp - VTFLib/VMTGroupNode.cpp - VTFLib/VMTIntegerNode.cpp - VTFLib/VMTNode.cpp - VTFLib/VMTSingleNode.cpp - VTFLib/VMTStringNode.cpp - VTFLib/VMTValueNode.cpp - VTFLib/VMTWrapper.cpp - VTFLib/VTFFile.cpp - VTFLib/VTFLib.cpp - VTFLib/VTFMathlib.cpp - VTFLib/VTFWrapper.cpp + VTFLib/Resource.rc - # Miniz sources - thirdparty/miniz/miniz.c) + VTFLib/Error.cpp + VTFLib/FileReader.cpp + VTFLib/FileWriter.cpp + VTFLib/Float16.cpp + VTFLib/MemoryReader.cpp + VTFLib/MemoryWriter.cpp + VTFLib/Proc.cpp + VTFLib/ProcReader.cpp + VTFLib/ProcWriter.cpp + VTFLib/VMTFile.cpp + VTFLib/VMTGroupNode.cpp + VTFLib/VMTIntegerNode.cpp + VTFLib/VMTNode.cpp + VTFLib/VMTSingleNode.cpp + VTFLib/VMTStringNode.cpp + VTFLib/VMTValueNode.cpp + VTFLib/VMTWrapper.cpp + VTFLib/VTFFile.cpp + VTFLib/VTFLib.cpp + VTFLib/VTFMathlib.cpp + VTFLib/VTFWrapper.cpp + + # Miniz sources + thirdparty/miniz/miniz.c) if (VTFLIB_STATIC) - add_library(vtflib_static STATIC ${VTFLIB_SRCS}) - target_compile_definitions(vtflib_static PUBLIC VTFLIB_STATIC) -endif() + add_library(vtflib_static STATIC ${VTFLIB_SRCS}) + target_compile_definitions(vtflib_static PUBLIC VTFLIB_STATIC) +endif () if (VTFLIB_SHARED) - add_library(vtflib SHARED ${VTFLIB_SRCS}) -endif() + add_library(vtflib SHARED ${VTFLIB_SRCS}) +endif () # Link against compressonator libs & set runtime lib if (UNIX) - if (VTFLIB_SHARED) - target_link_directories(vtflib PUBLIC thirdparty/lib/x64) - target_link_libraries(vtflib PUBLIC CMP_Compressonator pthread) - endif() - if (VTFLIB_STATIC) - target_link_directories(vtflib_static PUBLIC thirdparty/lib/x64) - target_link_libraries(vtflib_static PUBLIC CMP_Compressonator pthread) - endif() -else() - if (VTFLIB_SHARED) - target_link_directories(vtflib PUBLIC thirdparty/lib) - target_link_libraries(vtflib PUBLIC "Compressonator_MT$<$:d>") - endif() - if (VTFLIB_STATIC) - target_link_libraries(vtflib_static PUBLIC "Compressonator_MT$<$:d>") - target_link_directories(vtflib_static PUBLIC thirdparty/lib) - endif() -endif() + if (VTFLIB_SHARED) + target_link_directories(vtflib PUBLIC thirdparty/lib/x64) + target_link_libraries(vtflib PUBLIC CMP_Compressonator pthread) + endif () + if (VTFLIB_STATIC) + target_link_directories(vtflib_static PUBLIC thirdparty/lib/x64) + target_link_libraries(vtflib_static PUBLIC CMP_Compressonator pthread) + endif () +else () + if (VTFLIB_SHARED) + target_link_directories(vtflib PUBLIC thirdparty/lib) + target_link_libraries(vtflib PUBLIC "Compressonator_MT$<$:d>") + endif () + if (VTFLIB_STATIC) + target_link_libraries(vtflib_static 
PUBLIC "Compressonator_MT$<$:d>") + target_link_directories(vtflib_static PUBLIC thirdparty/lib) + endif () +endif () if (VTFLIB_SHARED) - target_compile_definitions(vtflib PRIVATE VTFLIB_EXPORTS) - target_precompile_headers(vtflib PRIVATE "$<$:${CMAKE_CURRENT_SOURCE_DIR}/VTFLib/stdafx.h>") - target_include_directories(vtflib PUBLIC lib) -endif() + target_compile_definitions(vtflib PRIVATE VTFLIB_EXPORTS) + target_precompile_headers(vtflib PRIVATE "$<$:${CMAKE_CURRENT_SOURCE_DIR}/VTFLib/stdafx.h>") + target_include_directories(vtflib PUBLIC lib) +endif () if (VTFLIB_STATIC) - target_compile_definitions(vtflib_static PRIVATE VTFLIB_EXPORTS) - target_precompile_headers(vtflib_static PRIVATE "$<$:${CMAKE_CURRENT_SOURCE_DIR}/VTFLib/stdafx.h>") - target_include_directories(vtflib_static PUBLIC lib) -endif() + target_compile_definitions(vtflib_static PRIVATE VTFLIB_EXPORTS) + target_precompile_headers(vtflib_static PRIVATE "$<$:${CMAKE_CURRENT_SOURCE_DIR}/VTFLib/stdafx.h>") + target_include_directories(vtflib_static PUBLIC lib) +endif () diff --git a/VTFLib/VTFFile.cpp b/VTFLib/VTFFile.cpp index 8ceae34..b44dce3 100644 --- a/VTFLib/VTFFile.cpp +++ b/VTFLib/VTFFile.cpp @@ -9,16 +9,20 @@ * version. */ -#include "VTFLib.h" #include "VTFFile.h" -#include "VTFFormat.h" -#include "VTFDXTn.h" -#include "VTFMathlib.h" #include "Compressonator.h" +#include "VTFDXTn.h" +#include "VTFFormat.h" +#include "VTFLib.h" +#include "VTFMathlib.h" #include "miniz.h" #define STB_IMAGE_RESIZE_IMPLEMENTATION +#define STB_IMAGE_IMPLEMENTATION +#undef STBI_NO_FAILURE_STRINGS + +#include "stb_image.h" #include "stb_image_resize.h" using namespace VTFLib; @@ -27,6 +31,8 @@ using namespace VTFLib; #undef min #undef max +#include "half.hpp" + // Class construction // ------------------ CVTFFile::CVTFFile() @@ -44,7 +50,7 @@ CVTFFile::CVTFFile() // CVTFFile() // Copy constructor. // -CVTFFile::CVTFFile(const CVTFFile &VTFFile) +CVTFFile::CVTFFile( const CVTFFile &VTFFile ) { this->Header = 0; @@ -54,23 +60,23 @@ CVTFFile::CVTFFile(const CVTFFile &VTFFile) this->uiThumbnailBufferSize = 0; this->lpThumbnailImageData = 0; - if(VTFFile.IsLoaded()) + if ( VTFFile.IsLoaded() ) { this->Header = new SVTFHeader; - memcpy(this->Header, VTFFile.Header, sizeof(SVTFHeader)); + memcpy( this->Header, VTFFile.Header, sizeof( SVTFHeader ) ); - if(VTFFile.GetHasImage()) + if ( VTFFile.GetHasImage() ) { this->uiImageBufferSize = VTFFile.uiImageBufferSize; this->lpImageData = new vlByte[this->uiImageBufferSize]; - memcpy(this->lpImageData, VTFFile.lpImageData, this->uiImageBufferSize); + memcpy( this->lpImageData, VTFFile.lpImageData, this->uiImageBufferSize ); } - if(VTFFile.GetHasThumbnail()) + if ( VTFFile.GetHasThumbnail() ) { this->uiThumbnailBufferSize = VTFFile.uiThumbnailBufferSize; this->lpThumbnailImageData = new vlByte[this->uiThumbnailBufferSize]; - memcpy(this->lpThumbnailImageData, VTFFile.lpThumbnailImageData, this->uiThumbnailBufferSize); + memcpy( this->lpThumbnailImageData, VTFFile.lpThumbnailImageData, this->uiThumbnailBufferSize ); } } } @@ -79,7 +85,7 @@ CVTFFile::CVTFFile(const CVTFFile &VTFFile) // CVTFFile() // Copy constructor. Converts VTFFile to ImageFormat. 
// -CVTFFile::CVTFFile(const CVTFFile &VTFFile, VTFImageFormat ImageFormat) +CVTFFile::CVTFFile( const CVTFFile &VTFFile, VTFImageFormat ImageFormat ) { this->Header = 0; @@ -89,16 +95,16 @@ CVTFFile::CVTFFile(const CVTFFile &VTFFile, VTFImageFormat ImageFormat) this->uiThumbnailBufferSize = 0; this->lpThumbnailImageData = 0; - if(VTFFile.IsLoaded()) + if ( VTFFile.IsLoaded() ) { this->Header = new SVTFHeader; - memcpy(this->Header, VTFFile.Header, sizeof(SVTFHeader)); + memcpy( this->Header, VTFFile.Header, sizeof( SVTFHeader ) ); // Set new format. this->Header->ImageFormat = ImageFormat; // Check flags. - //if(this->Header->Version[0] < VTF_MAJOR_VERSION || (this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] <= VTF_MINOR_VERSION_MIN_RESOURCE)) + // if(this->Header->Version[0] < VTF_MAJOR_VERSION || (this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] <= VTF_MINOR_VERSION_MIN_RESOURCE)) //{ // if(!this->GetImageFormatInfo(ImageFormat).bIsCompressed) // { @@ -110,7 +116,7 @@ CVTFFile::CVTFFile(const CVTFFile &VTFFile, VTFImageFormat ImageFormat) // } //} - if(this->GetImageFormatInfo(ImageFormat).uiAlphaBitsPerPixel == 1) + if ( this->GetImageFormatInfo( ImageFormat ).uiAlphaBitsPerPixel == 1 ) { this->Header->Flags |= TEXTUREFLAGS_ONEBITALPHA; } @@ -119,7 +125,7 @@ CVTFFile::CVTFFile(const CVTFFile &VTFFile, VTFImageFormat ImageFormat) this->Header->Flags &= ~TEXTUREFLAGS_ONEBITALPHA; } - if(this->GetImageFormatInfo(ImageFormat).uiAlphaBitsPerPixel > 1) + if ( this->GetImageFormatInfo( ImageFormat ).uiAlphaBitsPerPixel > 1 ) { this->Header->Flags |= TEXTUREFLAGS_EIGHTBITALPHA; } @@ -129,46 +135,46 @@ CVTFFile::CVTFFile(const CVTFFile &VTFFile, VTFImageFormat ImageFormat) } // Convert image data. - if(VTFFile.GetHasImage()) + if ( VTFFile.GetHasImage() ) { vlUInt uiFrames = VTFFile.GetFrameCount(); vlUInt uiFaces = VTFFile.GetFaceCount(); vlUInt uiMipmaps = VTFFile.GetMipmapCount(); vlUInt uiSlices = VTFFile.GetDepth(); - this->uiImageBufferSize = this->ComputeImageSize(this->Header->Width, this->Header->Height, uiMipmaps, this->Header->ImageFormat) * uiFrames * uiFaces; + this->uiImageBufferSize = this->ComputeImageSize( this->Header->Width, this->Header->Height, uiMipmaps, this->Header->ImageFormat ) * uiFrames * uiFaces; this->lpImageData = new vlByte[this->uiImageBufferSize]; - //vlByte *lpImageData = new vlByte[this->ComputeImageSize(this->Header->Width, this->Header->Height, 1, IMAGE_FORMAT_RGBA8888)]; + // vlByte *lpImageData = new vlByte[this->ComputeImageSize(this->Header->Width, this->Header->Height, 1, IMAGE_FORMAT_RGBA8888)]; - for(vlUInt i = 0; i < uiFrames; i++) + for ( vlUInt i = 0; i < uiFrames; i++ ) { - for(vlUInt j = 0; j < uiFaces; j++) + for ( vlUInt j = 0; j < uiFaces; j++ ) { - for(vlUInt k = 0; k < uiSlices; k++) + for ( vlUInt k = 0; k < uiSlices; k++ ) { - for(vlUInt l = 0; l < uiMipmaps; l++) + for ( vlUInt l = 0; l < uiMipmaps; l++ ) { vlUInt uiMipmapWidth, uiMipmapHeight, uiMipmapDepth; - this->ComputeMipmapDimensions(this->Header->Width, this->Header->Height, 1, l, uiMipmapWidth, uiMipmapHeight, uiMipmapDepth); + this->ComputeMipmapDimensions( this->Header->Width, this->Header->Height, 1, l, uiMipmapWidth, uiMipmapHeight, uiMipmapDepth ); - //this->ConvertToRGBA8888(VTFFile.GetData(i, j, k, l), lpImageData, uiMipmapWidth, uiMipmapHeight, VTFFile.GetFormat()); - //this->ConvertFromRGBA8888(lpImageData, this->GetData(i, j, k, l), uiMipmapWidth, uiMipmapHeight, this->GetFormat()); - this->Convert(VTFFile.GetData(i, j, 
k, l), this->GetData(i, j, k, l), uiMipmapWidth, uiMipmapHeight, VTFFile.GetFormat(), this->GetFormat()); + // this->ConvertToRGBA8888(VTFFile.GetData(i, j, k, l), lpImageData, uiMipmapWidth, uiMipmapHeight, VTFFile.GetFormat()); + // this->ConvertFromRGBA8888(lpImageData, this->GetData(i, j, k, l), uiMipmapWidth, uiMipmapHeight, this->GetFormat()); + this->Convert( VTFFile.GetData( i, j, k, l ), this->GetData( i, j, k, l ), uiMipmapWidth, uiMipmapHeight, VTFFile.GetFormat(), this->GetFormat() ); } } } } - //delete []lpImageData; + // delete []lpImageData; } // Convert thumbnail data. - if(VTFFile.GetHasThumbnail()) + if ( VTFFile.GetHasThumbnail() ) { this->uiThumbnailBufferSize = VTFFile.uiThumbnailBufferSize; this->lpThumbnailImageData = new vlByte[this->uiThumbnailBufferSize]; - memcpy(this->lpThumbnailImageData, VTFFile.lpThumbnailImageData, this->uiThumbnailBufferSize); + memcpy( this->lpThumbnailImageData, VTFFile.lpThumbnailImageData, this->uiThumbnailBufferSize ); } } } @@ -186,18 +192,18 @@ CVTFFile::~CVTFFile() // options must be set after creation. Essential format flags are automatically // generated. // -vlBool CVTFFile::Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, VTFImageFormat ImageFormat, vlBool bThumbnail, vlBool bMipmaps, vlBool bNullImageData) +vlBool CVTFFile::Create( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, VTFImageFormat ImageFormat, vlBool bThumbnail, vlBool bMipmaps, vlBool bNullImageData ) { - return this->Init(uiWidth, uiHeight, uiFrames, uiFaces, uiSlices, ImageFormat, bThumbnail, bMipmaps ? -1 : 1, bNullImageData); + return this->Init( uiWidth, uiHeight, uiFrames, uiFaces, uiSlices, ImageFormat, bThumbnail, bMipmaps ? -1 : 1, bNullImageData ); } // // Init() // Helper with struct as param instead // -vlBool CVTFFile::Init(const SVTFInitOptions& o) +vlBool CVTFFile::Init( const SVTFInitOptions &o ) { - return this->Init(o.uiWidth, o.uiHeight, o.uiFrames, o.uiFaces, o.uiSlices, o.ImageFormat, o.bThumbnail, o.nMipMaps, o.bNullImageData); + return this->Init( o.uiWidth, o.uiHeight, o.uiFrames, o.uiFaces, o.uiSlices, o.ImageFormat, o.bThumbnail, o.nMipMaps, o.bNullImageData ); } // @@ -206,7 +212,7 @@ vlBool CVTFFile::Init(const SVTFInitOptions& o) // options must be set after creation. Essential format flags are automatically // generated. // -vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, VTFImageFormat ImageFormat, vlBool bThumbnail, vlInt nMipmaps, vlBool bNullImageData) +vlBool CVTFFile::Init( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, VTFImageFormat ImageFormat, vlBool bThumbnail, vlInt nMipmaps, vlBool bNullImageData ) { this->Destroy(); @@ -215,77 +221,77 @@ vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt u // // Check if width is valid (power of 2 and fits in a short). - if(!this->IsPowerOfTwo(uiWidth) || uiWidth > 0xffff) + if ( !this->IsPowerOfTwo( uiWidth ) || uiWidth > 0xffff ) { - if(uiWidth == 0) + if ( uiWidth == 0 ) { - LastError.Set("Invalid image width. Width must be nonzero."); + LastError.Set( "Invalid image width. Width must be nonzero." ); } else { - vlUInt uiNextPowerOfTwo = this->NextPowerOfTwo(uiWidth); - LastError.SetFormatted("Invalid image width %u. 
Width must be a power of two (nearest powers are %u and %u).", uiWidth, uiNextPowerOfTwo >> 1, uiNextPowerOfTwo); + vlUInt uiNextPowerOfTwo = this->NextPowerOfTwo( uiWidth ); + LastError.SetFormatted( "Invalid image width %u. Width must be a power of two (nearest powers are %u and %u).", uiWidth, uiNextPowerOfTwo >> 1, uiNextPowerOfTwo ); } return vlFalse; } // Check if height is valid (power of 2 and fits in a short). - if(!this->IsPowerOfTwo(uiHeight) || uiHeight > 0xffff) + if ( !this->IsPowerOfTwo( uiHeight ) || uiHeight > 0xffff ) { - if(uiHeight == 0) + if ( uiHeight == 0 ) { - LastError.Set("Invalid image height. Height must be nonzero."); + LastError.Set( "Invalid image height. Height must be nonzero." ); } else { - vlUInt uiNextPowerOfTwo = this->NextPowerOfTwo(uiHeight); - LastError.SetFormatted("Invalid image height %u. Height must be a power of two (nearest powers are %u and %u).", uiHeight, uiNextPowerOfTwo >> 1, uiNextPowerOfTwo); + vlUInt uiNextPowerOfTwo = this->NextPowerOfTwo( uiHeight ); + LastError.SetFormatted( "Invalid image height %u. Height must be a power of two (nearest powers are %u and %u).", uiHeight, uiNextPowerOfTwo >> 1, uiNextPowerOfTwo ); } return vlFalse; } // Check if height is valid (power of 2 and fits in a short). - if(!this->IsPowerOfTwo(uiSlices) || uiSlices > 0xffff) + if ( !this->IsPowerOfTwo( uiSlices ) || uiSlices > 0xffff ) { - if(uiHeight == 0) + if ( uiHeight == 0 ) { - LastError.Set("Invalid image depth. Depth must be nonzero."); + LastError.Set( "Invalid image depth. Depth must be nonzero." ); } else { - vlUInt uiNextPowerOfTwo = this->NextPowerOfTwo(uiSlices); - LastError.SetFormatted("Invalid image depth %u. Depth must be a power of two (nearest powers are %u and %u).", uiSlices, uiNextPowerOfTwo >> 1, uiNextPowerOfTwo); + vlUInt uiNextPowerOfTwo = this->NextPowerOfTwo( uiSlices ); + LastError.SetFormatted( "Invalid image depth %u. Depth must be a power of two (nearest powers are %u and %u).", uiSlices, uiNextPowerOfTwo >> 1, uiNextPowerOfTwo ); } return vlFalse; } - if(ImageFormat <= IMAGE_FORMAT_NONE || ImageFormat >= IMAGE_FORMAT_COUNT) + if ( ImageFormat <= IMAGE_FORMAT_NONE || ImageFormat >= IMAGE_FORMAT_COUNT ) { - LastError.Set("Invalid image format."); + LastError.Set( "Invalid image format." ); return vlFalse; } - if(!this->GetImageFormatInfo(ImageFormat).bIsSupported) + if ( !this->GetImageFormatInfo( ImageFormat ).bIsSupported ) { - LastError.Set("Image format not supported."); + LastError.Set( "Image format not supported." 
); return vlFalse; } - if(uiFrames < 1 || uiFrames > 0xffff) + if ( uiFrames < 1 || uiFrames > 0xffff ) { - LastError.SetFormatted("Invalid image frame count %u.", uiFrames); + LastError.SetFormatted( "Invalid image frame count %u.", uiFrames ); return vlFalse; } - if(uiFaces != 1 && uiFaces != 6 && uiFaces != 7) + if ( uiFaces != 1 && uiFaces != 6 && uiFaces != 7 ) { - LastError.SetFormatted("Invalid image face count %u.", uiFaces); + LastError.SetFormatted( "Invalid image face count %u.", uiFaces ); return vlFalse; } - if(uiFaces != 1 && uiFaces != 6 && VTF_MINOR_VERSION_DEFAULT >= VTF_MINOR_VERSION_MIN_NO_SPHERE_MAP) + if ( uiFaces != 1 && uiFaces != 6 && VTF_MINOR_VERSION_DEFAULT >= VTF_MINOR_VERSION_MIN_NO_SPHERE_MAP ) { - LastError.SetFormatted("Invalid image face count %u for version %d.%d.", uiFaces, VTF_MAJOR_VERSION, VTF_MINOR_VERSION_DEFAULT); + LastError.SetFormatted( "Invalid image face count %u for version %d.%d.", uiFaces, VTF_MAJOR_VERSION, VTF_MINOR_VERSION_DEFAULT ); return vlFalse; } @@ -293,7 +299,7 @@ vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt u // A image cannot have multiple frames and faces. // Logic: StartFrame is used as a flag when the texture is a TEXTUREFLAGS_ENVMAP. - //if(uiFrames != 1 && uiFaces != 1) + // if(uiFrames != 1 && uiFaces != 1) //{ // LastError.Set("Invalid image frame and face count. An image cannot have multiple frames and faces."); // return vlFalse; @@ -304,23 +310,20 @@ vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt u // this->Header = new SVTFHeader; - memset(this->Header, 0, sizeof(SVTFHeader)); - + memset( this->Header, 0, sizeof( SVTFHeader ) ); + // Compute mipmap count if requested by user - if (nMipmaps < 0) - nMipmaps = this->ComputeMipmapCount(uiWidth, uiHeight, uiSlices); + if ( nMipmaps < 0 ) + nMipmaps = this->ComputeMipmapCount( uiWidth, uiHeight, uiSlices ); nMipmaps = nMipmaps < 0 ? 1 : nMipmaps; // Make sure at least 1 - strcpy(this->Header->TypeString, "VTF"); + strcpy( this->Header->TypeString, "VTF" ); this->Header->Version[0] = VTF_MAJOR_VERSION; this->Header->Version[1] = VTF_MINOR_VERSION_DEFAULT; this->Header->HeaderSize = 0; this->Header->Width = (vlShort)uiWidth; this->Header->Height = (vlShort)uiHeight; - this->Header->Flags = (this->GetImageFormatInfo(ImageFormat).uiAlphaBitsPerPixel == 1 ? TEXTUREFLAGS_ONEBITALPHA : 0) - | (this->GetImageFormatInfo(ImageFormat).uiAlphaBitsPerPixel > 1 ? TEXTUREFLAGS_EIGHTBITALPHA : 0) - | (uiFaces == 1 ? 0 : TEXTUREFLAGS_ENVMAP) - | (nMipmaps > 1 ? 0 : TEXTUREFLAGS_NOMIP | TEXTUREFLAGS_NOLOD); + this->Header->Flags = ( this->GetImageFormatInfo( ImageFormat ).uiAlphaBitsPerPixel == 1 ? TEXTUREFLAGS_ONEBITALPHA : 0 ) | ( this->GetImageFormatInfo( ImageFormat ).uiAlphaBitsPerPixel > 1 ? TEXTUREFLAGS_EIGHTBITALPHA : 0 ) | ( uiFaces == 1 ? 0 : TEXTUREFLAGS_ENVMAP ) | ( nMipmaps > 1 ? 0 : TEXTUREFLAGS_NOMIP | TEXTUREFLAGS_NOLOD ); this->Header->Frames = (vlShort)uiFrames; this->Header->StartFrame = uiFaces != 6 || VTF_MINOR_VERSION_DEFAULT >= VTF_MINOR_VERSION_MIN_NO_SPHERE_MAP ? 0 : 0xffff; this->Header->Reflectivity[0] = 1.0f; @@ -336,7 +339,7 @@ vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt u // Generate thumbnail. // - if(bThumbnail) + if ( bThumbnail ) { // Note: Valve informs us that DXT1 is the correct format. 
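
As a readability aid only: the Init() hunk above folds the whole Header->Flags computation into one long conditional expression. The sketch below restates the same decision logic as a small standalone function; ComposeInitFlags and its parameter names are hypothetical, and the flag bit values are quoted from the public VTF format documentation rather than from this repository's VTFFormat.h, which remains authoritative.

    #include <cstdint>

    // Bit values as given in public VTF format documentation (assumed here;
    // the project's VTFFormat.h is the authoritative source).
    constexpr uint32_t TEXTUREFLAGS_NOMIP         = 0x00000100;
    constexpr uint32_t TEXTUREFLAGS_NOLOD         = 0x00000200;
    constexpr uint32_t TEXTUREFLAGS_ONEBITALPHA   = 0x00001000;
    constexpr uint32_t TEXTUREFLAGS_EIGHTBITALPHA = 0x00002000;
    constexpr uint32_t TEXTUREFLAGS_ENVMAP        = 0x00004000;

    // Hypothetical helper mirroring the Header->Flags assignment in Init():
    // alpha depth, face count and mipmap count each contribute independent bits.
    uint32_t ComposeInitFlags( uint32_t uiAlphaBits, uint32_t uiFaces, int nMipmaps )
    {
        uint32_t uiFlags = 0;
        if ( uiAlphaBits == 1 ) uiFlags |= TEXTUREFLAGS_ONEBITALPHA;
        if ( uiAlphaBits > 1 )  uiFlags |= TEXTUREFLAGS_EIGHTBITALPHA;
        if ( uiFaces != 1 )     uiFlags |= TEXTUREFLAGS_ENVMAP;
        if ( nMipmaps <= 1 )    uiFlags |= TEXTUREFLAGS_NOMIP | TEXTUREFLAGS_NOLOD;
        return uiFlags;
    }

Called as ComposeInitFlags( GetImageFormatInfo( ImageFormat ).uiAlphaBitsPerPixel, uiFaces, nMipmaps ), this produces the same bit pattern that Init() assigns to this->Header->Flags in the hunk above.
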
@@ -350,27 +353,27 @@ vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt u vlUInt uiThumbnailWidth = this->Header->Width, uiThumbnailHeight = this->Header->Height; - while(vlTrue) + while ( vlTrue ) { - if(uiThumbnailWidth <= 16 && uiThumbnailHeight <= 16) + if ( uiThumbnailWidth <= 16 && uiThumbnailHeight <= 16 ) { break; } uiThumbnailWidth >>= 1; uiThumbnailHeight >>= 1; - - if(uiThumbnailWidth < 1) + + if ( uiThumbnailWidth < 1 ) uiThumbnailWidth = 1; - if(uiThumbnailHeight < 1) + if ( uiThumbnailHeight < 1 ) uiThumbnailHeight = 1; } this->Header->LowResImageWidth = (vlByte)uiThumbnailWidth; this->Header->LowResImageHeight = (vlByte)uiThumbnailHeight; - this->uiThumbnailBufferSize = this->ComputeImageSize(this->Header->LowResImageWidth, this->Header->LowResImageHeight, 1, this->Header->LowResImageFormat); + this->uiThumbnailBufferSize = this->ComputeImageSize( this->Header->LowResImageWidth, this->Header->LowResImageHeight, 1, this->Header->LowResImageFormat ); this->lpThumbnailImageData = new vlByte[this->uiThumbnailBufferSize]; this->Header->Resources[this->Header->ResourceCount++].Type = VTF_LEGACY_RSRC_LOW_RES_IMAGE; @@ -389,7 +392,7 @@ vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt u // Generate image. // - this->uiImageBufferSize = this->ComputeImageSize(this->Header->Width, this->Header->Height, this->Header->Depth, this->Header->MipCount, this->Header->ImageFormat) * uiFrames * uiFaces; + this->uiImageBufferSize = this->ComputeImageSize( this->Header->Width, this->Header->Height, this->Header->Depth, this->Header->MipCount, this->Header->ImageFormat ) * uiFrames * uiFaces; this->lpImageData = new vlByte[this->uiImageBufferSize]; this->Header->Resources[this->Header->ResourceCount++].Type = VTF_LEGACY_RSRC_IMAGE; @@ -398,10 +401,10 @@ vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt u // Null image data. // - if(bNullImageData) + if ( bNullImageData ) { - memset(this->lpThumbnailImageData, 0, this->uiThumbnailBufferSize); - memset(this->lpImageData, 0, this->uiImageBufferSize); + memset( this->lpThumbnailImageData, 0, this->uiThumbnailBufferSize ); + memset( this->lpImageData, 0, this->uiImageBufferSize ); } this->ComputeResources(); @@ -415,9 +418,9 @@ vlBool CVTFFile::Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt u // Can also generate mipmaps and a thumbnail. Recommended function for high level single // face/frame VTF file creation. 
// -vlBool CVTFFile::Create(vlUInt uiWidth, vlUInt uiHeight, vlByte *lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions) +vlBool CVTFFile::Create( vlUInt uiWidth, vlUInt uiHeight, vlByte *lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions ) { - return this->Create(uiWidth, uiHeight, 1, 1, 1, &lpImageDataRGBA8888, VTFCreateOptions); + return this->Create( uiWidth, uiHeight, 1, 1, 1, &lpImageDataRGBA8888, VTFCreateOptions ); } static CMP_FORMAT GetCMPFormat( VTFImageFormat imageFormat, bool bDXT5GA ) @@ -427,22 +430,34 @@ static CMP_FORMAT GetCMPFormat( VTFImageFormat imageFormat, bool bDXT5GA ) switch ( imageFormat ) { - case IMAGE_FORMAT_BGR888: return CMP_FORMAT_BGR_888; - case IMAGE_FORMAT_RGB888: return CMP_FORMAT_RGB_888; - case IMAGE_FORMAT_RGBA8888: return CMP_FORMAT_RGBA_8888; - case IMAGE_FORMAT_BGRA8888: return CMP_FORMAT_BGRA_8888; + case IMAGE_FORMAT_BGR888: + return CMP_FORMAT_BGR_888; + case IMAGE_FORMAT_RGB888: + return CMP_FORMAT_RGB_888; + case IMAGE_FORMAT_RGBA8888: + return CMP_FORMAT_RGBA_8888; + case IMAGE_FORMAT_BGRA8888: + return CMP_FORMAT_BGRA_8888; - case IMAGE_FORMAT_DXT1_ONEBITALPHA: return CMP_FORMAT_DXT1; - case IMAGE_FORMAT_DXT1: return CMP_FORMAT_DXT1; - case IMAGE_FORMAT_DXT3: return CMP_FORMAT_DXT3; - case IMAGE_FORMAT_DXT5: return CMP_FORMAT_DXT5; - case IMAGE_FORMAT_ATI1N: return CMP_FORMAT_ATI1N; - // Swizzle is technically wrong for below but we reverse it in the shader! - case IMAGE_FORMAT_ATI2N: return CMP_FORMAT_ATI2N; + case IMAGE_FORMAT_DXT1_ONEBITALPHA: + return CMP_FORMAT_DXT1; + case IMAGE_FORMAT_DXT1: + return CMP_FORMAT_DXT1; + case IMAGE_FORMAT_DXT3: + return CMP_FORMAT_DXT3; + case IMAGE_FORMAT_DXT5: + return CMP_FORMAT_DXT5; + case IMAGE_FORMAT_ATI1N: + return CMP_FORMAT_ATI1N; + // Swizzle is technically wrong for below but we reverse it in the shader! 
+ case IMAGE_FORMAT_ATI2N: + return CMP_FORMAT_ATI2N; - case IMAGE_FORMAT_BC7: return CMP_FORMAT_BC7; + case IMAGE_FORMAT_BC7: + return CMP_FORMAT_BC7; - default: return CMP_FORMAT_Unknown; + default: + return CMP_FORMAT_Unknown; } } @@ -450,197 +465,603 @@ static const char *GetCMPErrorString( CMP_ERROR error ) { switch ( error ) { - case CMP_OK: return "Ok."; - case CMP_ABORTED: return "The conversion was aborted."; - case CMP_ERR_INVALID_SOURCE_TEXTURE: return "The source texture is invalid."; - case CMP_ERR_INVALID_DEST_TEXTURE: return "The destination texture is invalid."; - case CMP_ERR_UNSUPPORTED_SOURCE_FORMAT: return "The source format is not a supported format."; - case CMP_ERR_UNSUPPORTED_DEST_FORMAT: return "The destination format is not a supported format."; - case CMP_ERR_UNSUPPORTED_GPU_ASTC_DECODE: return "The gpu hardware is not supported."; - case CMP_ERR_UNSUPPORTED_GPU_BASIS_DECODE: return "The gpu hardware is not supported."; - case CMP_ERR_SIZE_MISMATCH: return "The source and destination texture sizes do not match."; - case CMP_ERR_UNABLE_TO_INIT_CODEC: return "Compressonator was unable to initialize the codec needed for conversion."; - case CMP_ERR_UNABLE_TO_INIT_DECOMPRESSLIB: return "GPU_Decode Lib was unable to initialize the codec needed for decompression ."; - case CMP_ERR_UNABLE_TO_INIT_COMPUTELIB: return "Compute Lib was unable to initialize the codec needed for compression."; - case CMP_ERR_CMP_DESTINATION: return "Error in compressing destination texture"; - case CMP_ERR_MEM_ALLOC_FOR_MIPSET: return "Memory Error: allocating MIPSet compression level data buffer"; - case CMP_ERR_UNKNOWN_DESTINATION_FORMAT: return "The destination Codec Type is unknown! In SDK refer to GetCodecType()"; - case CMP_ERR_FAILED_HOST_SETUP: return "Failed to setup Host for processing"; - case CMP_ERR_PLUGIN_FILE_NOT_FOUND: return "The required plugin library was not found"; - case CMP_ERR_UNABLE_TO_LOAD_FILE: return "The requested file was not loaded"; - case CMP_ERR_UNABLE_TO_CREATE_ENCODER: return "Request to create an encoder failed"; - case CMP_ERR_UNABLE_TO_LOAD_ENCODER: return "Unable to load an encode library"; - case CMP_ERR_NOSHADER_CODE_DEFINED: return "No shader code is available for the requested framework"; - case CMP_ERR_GPU_DOESNOT_SUPPORT_COMPUTE: return "The GPU device selected does not support compute"; - case CMP_ERR_NOPERFSTATS: return "No Performance Stats are available"; - case CMP_ERR_GPU_DOESNOT_SUPPORT_CMP_EXT: return "The GPU does not support the requested compression extension!"; - case CMP_ERR_GAMMA_OUTOFRANGE: return "Gamma value set for processing is out of range"; - case CMP_ERR_PLUGIN_SHAREDIO_NOT_SET: return "The plugin C_PluginSetSharedIO call was not set and is required for this plugin to operate"; - case CMP_ERR_UNABLE_TO_INIT_D3DX: return "Unable to initialize DirectX SDK or get a specific DX API"; + case CMP_OK: + return "Ok."; + case CMP_ABORTED: + return "The conversion was aborted."; + case CMP_ERR_INVALID_SOURCE_TEXTURE: + return "The source texture is invalid."; + case CMP_ERR_INVALID_DEST_TEXTURE: + return "The destination texture is invalid."; + case CMP_ERR_UNSUPPORTED_SOURCE_FORMAT: + return "The source format is not a supported format."; + case CMP_ERR_UNSUPPORTED_DEST_FORMAT: + return "The destination format is not a supported format."; + case CMP_ERR_UNSUPPORTED_GPU_ASTC_DECODE: + return "The gpu hardware is not supported."; + case CMP_ERR_UNSUPPORTED_GPU_BASIS_DECODE: + return "The gpu hardware is not supported."; + case 
CMP_ERR_SIZE_MISMATCH: + return "The source and destination texture sizes do not match."; + case CMP_ERR_UNABLE_TO_INIT_CODEC: + return "Compressonator was unable to initialize the codec needed for conversion."; + case CMP_ERR_UNABLE_TO_INIT_DECOMPRESSLIB: + return "GPU_Decode Lib was unable to initialize the codec needed for decompression ."; + case CMP_ERR_UNABLE_TO_INIT_COMPUTELIB: + return "Compute Lib was unable to initialize the codec needed for compression."; + case CMP_ERR_CMP_DESTINATION: + return "Error in compressing destination texture"; + case CMP_ERR_MEM_ALLOC_FOR_MIPSET: + return "Memory Error: allocating MIPSet compression level data buffer"; + case CMP_ERR_UNKNOWN_DESTINATION_FORMAT: + return "The destination Codec Type is unknown! In SDK refer to GetCodecType()"; + case CMP_ERR_FAILED_HOST_SETUP: + return "Failed to setup Host for processing"; + case CMP_ERR_PLUGIN_FILE_NOT_FOUND: + return "The required plugin library was not found"; + case CMP_ERR_UNABLE_TO_LOAD_FILE: + return "The requested file was not loaded"; + case CMP_ERR_UNABLE_TO_CREATE_ENCODER: + return "Request to create an encoder failed"; + case CMP_ERR_UNABLE_TO_LOAD_ENCODER: + return "Unable to load an encode library"; + case CMP_ERR_NOSHADER_CODE_DEFINED: + return "No shader code is available for the requested framework"; + case CMP_ERR_GPU_DOESNOT_SUPPORT_COMPUTE: + return "The GPU device selected does not support compute"; + case CMP_ERR_NOPERFSTATS: + return "No Performance Stats are available"; + case CMP_ERR_GPU_DOESNOT_SUPPORT_CMP_EXT: + return "The GPU does not support the requested compression extension!"; + case CMP_ERR_GAMMA_OUTOFRANGE: + return "Gamma value set for processing is out of range"; + case CMP_ERR_PLUGIN_SHAREDIO_NOT_SET: + return "The plugin C_PluginSetSharedIO call was not set and is required for this plugin to operate"; + case CMP_ERR_UNABLE_TO_INIT_D3DX: + return "Unable to initialize DirectX SDK or get a specific DX API"; default: - case CMP_ERR_GENERIC: return "An unknown error occurred."; + case CMP_ERR_GENERIC: + return "An unknown error occurred."; } } +// +// Create() +// determines rather to Create() or CreateFloat() depending on the create options format. +// RGBA8888 based Create() cannot handle floating point images, CreateFloat +// is used for this, so if SVTFCreateOptions::Format is a floating point format +// it'll redirect to CreateFloat passing the source format to preserve +// quality if the source format has floating point data. +// + +vlBool CVTFFile::Create( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, vlByte **lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions, const VTFImageFormat &SourceFormat ) +{ + if ( VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGBA32323232F || VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGB323232F || VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGBA16161616F ) + return CreateFloat( uiWidth, uiHeight, uiFrames, uiFaces, uiSlices, lpImageDataRGBA8888, VTFCreateOptions, SourceFormat ); + return Create( uiWidth, uiHeight, uiFrames, uiFaces, uiSlices, lpImageDataRGBA8888, VTFCreateOptions ); +} + // // Create() // Creates a VTF file of the specified format and size using the provided image RGBA data. // Can also generate mipmaps and a thumbnail. Recommended function for high level multiple // face/frame VTF file creation. 
// -vlBool CVTFFile::Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, vlByte **lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions) +vlBool CVTFFile::Create( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, vlByte **lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions ) { + if ( VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGBA32323232F || VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGB323232F || VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGBA16161616F ) + return false; + vlUInt uiCount = 0; - if(uiFrames > uiCount) + if ( uiFrames > uiCount ) uiCount = uiFrames; - if(uiFaces > uiCount) + if ( uiFaces > uiCount ) uiCount = uiFaces; - if(uiSlices > uiCount) + if ( uiSlices > uiCount ) uiCount = uiSlices; - vlByte **lpNewImageDataRGBA8888 = 0; - if((uiFrames == 1 && uiFaces > 1 && uiSlices > 1) || (uiFrames > 1 && uiFaces == 1 && uiSlices > 1) || (uiFrames > 1 && uiFaces > 1 && uiSlices == 1)) + if ( ( uiFrames == 1 && uiFaces > 1 && uiSlices > 1 ) || ( uiFrames > 1 && uiFaces == 1 && uiSlices > 1 ) || ( uiFrames > 1 && uiFaces > 1 && uiSlices == 1 ) ) { - LastError.Set("Invalid image frame, face and slice count combination. Function does not support images with any combination of multiple frames or faces or slices."); + LastError.Set( "Invalid image frame, face and slice count combination. Function does not support images with any combination of multiple frames or faces or slices." ); return vlFalse; } - if(VTFCreateOptions.uiVersion[0] != VTF_MAJOR_VERSION || (VTFCreateOptions.uiVersion[1] < 0 || VTFCreateOptions.uiVersion[1] > VTF_MINOR_VERSION)) + if ( VTFCreateOptions.uiVersion[0] != VTF_MAJOR_VERSION || ( VTFCreateOptions.uiVersion[1] < 0 || VTFCreateOptions.uiVersion[1] > VTF_MINOR_VERSION ) ) { - LastError.SetFormatted("File version %u.%u does not match %d.%d to %d.%d.", VTFCreateOptions.uiVersion[0], VTFCreateOptions.uiVersion[1], VTF_MAJOR_VERSION, 0, VTF_MAJOR_VERSION, VTF_MINOR_VERSION); + LastError.SetFormatted( "File version %u.%u does not match %d.%d to %d.%d.", VTFCreateOptions.uiVersion[0], VTFCreateOptions.uiVersion[1], VTF_MAJOR_VERSION, 0, VTF_MAJOR_VERSION, VTF_MINOR_VERSION ); return vlFalse; } - if(VTFCreateOptions.uiVersion[0] == VTF_MAJOR_VERSION && VTFCreateOptions.uiVersion[1] < VTF_MINOR_VERSION_MIN_VOLUME && uiSlices > 1) + if ( VTFCreateOptions.uiVersion[0] == VTF_MAJOR_VERSION && VTFCreateOptions.uiVersion[1] < VTF_MINOR_VERSION_MIN_VOLUME && uiSlices > 1 ) { - LastError.SetFormatted("Volume textures are only supported in version %d.%d and up.", VTF_MAJOR_VERSION, VTF_MINOR_VERSION_MIN_VOLUME); + LastError.SetFormatted( "Volume textures are only supported in version %d.%d and up.", VTF_MAJOR_VERSION, VTF_MINOR_VERSION_MIN_VOLUME ); return vlFalse; } - if(VTFCreateOptions.uiVersion[0] == VTF_MAJOR_VERSION && VTFCreateOptions.uiVersion[1] < VTF_MINOR_VERSION_MIN_SPHERE_MAP && uiFaces == 7) + if ( VTFCreateOptions.uiVersion[0] == VTF_MAJOR_VERSION && VTFCreateOptions.uiVersion[1] < VTF_MINOR_VERSION_MIN_SPHERE_MAP && uiFaces == 7 ) { - LastError.SetFormatted("Sphere maps are only supported in version %d.%d and up.", VTF_MAJOR_VERSION, VTF_MINOR_VERSION_MIN_SPHERE_MAP); + LastError.SetFormatted( "Sphere maps are only supported in version %d.%d and up.", VTF_MAJOR_VERSION, VTF_MINOR_VERSION_MIN_SPHERE_MAP ); return vlFalse; } - if(VTFCreateOptions.bMipmaps && uiSlices > 1) + if ( VTFCreateOptions.bMipmaps && uiSlices > 1 ) { - LastError.Set("Mipmap generation for depth 
textures is not supported."); + LastError.Set( "Mipmap generation for depth textures is not supported." ); return vlFalse; } + vlByte **lpNewImageDataRGBA8888 = 0; + try { - if(VTFCreateOptions.bResize) + lpNewImageDataRGBA8888 = new vlByte *[uiCount]; + + if ( VTFCreateOptions.bResize ) { vlUInt uiNewWidth = uiWidth; vlUInt uiNewHeight = uiHeight; - switch(VTFCreateOptions.ResizeMethod) + switch ( VTFCreateOptions.ResizeMethod ) { - case RESIZE_NEAREST_POWER2: - case RESIZE_BIGGEST_POWER2: - case RESIZE_SMALLEST_POWER2: - // Find the best width. - if(this->IsPowerOfTwo(uiWidth)) - { - // Width already a power of 2. - uiNewWidth = uiWidth; - } - else - { - // Find largest power of 2. - uiNewWidth = this->NextPowerOfTwo(uiWidth); - - if(VTFCreateOptions.ResizeMethod == RESIZE_NEAREST_POWER2) + case RESIZE_NEAREST_POWER2: + case RESIZE_BIGGEST_POWER2: + case RESIZE_SMALLEST_POWER2: + // Find the best width. + if ( this->IsPowerOfTwo( uiWidth ) ) + { + // Width already a power of 2. + uiNewWidth = uiWidth; + } + else { - if(uiWidth - (uiNewWidth >> 1) < uiNewWidth - uiWidth) + // Find largest power of 2. + uiNewWidth = this->NextPowerOfTwo( uiWidth ); + + if ( VTFCreateOptions.ResizeMethod == RESIZE_NEAREST_POWER2 ) + { + if ( uiWidth - ( uiNewWidth >> 1 ) < uiNewWidth - uiWidth ) + { + uiNewWidth >>= 1; + } + } + else if ( VTFCreateOptions.ResizeMethod == RESIZE_SMALLEST_POWER2 ) { uiNewWidth >>= 1; } + + if ( uiNewWidth == 0 ) + { + uiNewWidth = 1; + } } - else if(VTFCreateOptions.ResizeMethod == RESIZE_SMALLEST_POWER2) + if ( VTFCreateOptions.bResizeClamp && uiNewWidth > VTFCreateOptions.uiResizeClampWidth ) { - uiNewWidth >>= 1; + uiNewWidth = VTFCreateOptions.uiResizeClampWidth; } - if(uiNewWidth == 0) + // Find the best height. + if ( this->IsPowerOfTwo( uiHeight ) ) { - uiNewWidth = 1; + // Height already a power of 2. + uiNewHeight = uiHeight; } - } - if(VTFCreateOptions.bResizeClamp && uiNewWidth > VTFCreateOptions.uiResizeClampWidth) + else + { + // Find largest power of 2. + uiNewHeight = this->NextPowerOfTwo( uiHeight ); + + if ( VTFCreateOptions.ResizeMethod == RESIZE_NEAREST_POWER2 ) + { + if ( uiHeight - ( uiNewHeight >> 1 ) < uiNewHeight - uiHeight ) + { + uiNewHeight >>= 1; + } + } + else if ( VTFCreateOptions.ResizeMethod == RESIZE_SMALLEST_POWER2 ) + { + uiNewHeight >>= 1; + } + + if ( uiNewHeight == 0 ) + { + uiNewHeight = 1; + } + } + if ( VTFCreateOptions.bResizeClamp && uiNewHeight > VTFCreateOptions.uiResizeClampHeight ) + { + uiNewHeight = VTFCreateOptions.uiResizeClampHeight; + } + break; + case RESIZE_SET: + uiNewWidth = VTFCreateOptions.uiResizeWidth; + uiNewHeight = VTFCreateOptions.uiResizeHeight; + break; + } + + assert( ( uiNewWidth & ( uiNewWidth - 1 ) ) == 0 ); + assert( ( uiNewHeight & ( uiNewHeight - 1 ) ) == 0 ); + + // Resize the input. + if ( uiWidth != uiNewWidth || uiHeight != uiNewHeight ) + { + for ( vlUInt i = 0; i < uiCount; i++ ) { - uiNewWidth = VTFCreateOptions.uiResizeClampWidth; + lpNewImageDataRGBA8888[i] = new vlByte[this->ComputeImageSize( uiNewWidth, uiNewHeight, 1, IMAGE_FORMAT_RGBA8888 )]; + + if ( !this->Resize( lpImageDataRGBA8888[i], lpNewImageDataRGBA8888[i], uiWidth, uiHeight, uiNewWidth, uiNewHeight, VTFCreateOptions.ResizeFilter, VTFCreateOptions.bSRGB ) ) + { + throw 0; + } } - // Find the best height. - if(this->IsPowerOfTwo(uiHeight)) + uiWidth = uiNewWidth; + uiHeight = uiNewHeight; + } + else + { + for ( vlUInt i = 0; i < uiCount; i++ ) { - // Height already a power of 2. 
- uiNewHeight = uiHeight; + vlUInt size = this->ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888 ); + lpNewImageDataRGBA8888[i] = new vlByte[size]; + memcpy( lpNewImageDataRGBA8888[i], lpImageDataRGBA8888[i], size ); } - else + } + } + else + { + for ( vlUInt i = 0; i < uiCount; i++ ) + { + vlUInt size = this->ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888 ); + lpNewImageDataRGBA8888[i] = new vlByte[size]; + memcpy( lpNewImageDataRGBA8888[i], lpImageDataRGBA8888[i], size ); + } + } + + // Create image (allocate and setup structures). + if ( !this->Init( uiWidth, uiHeight, uiFrames, uiFaces + ( VTFCreateOptions.bSphereMap && uiFaces == 6 ? 1 : 0 ), uiSlices, VTFCreateOptions.ImageFormat, VTFCreateOptions.bThumbnail, VTFCreateOptions.bMipmaps ? -1 : 1, vlFalse ) ) + { + throw 0; + } + + // Update version, for the current versions with the current checking this should be sufficient. + this->Header->Version[0] = VTFCreateOptions.uiVersion[0]; + this->Header->Version[1] = VTFCreateOptions.uiVersion[1]; + + this->ComputeResources(); + + // Do gamma correction. + if ( VTFCreateOptions.bGammaCorrection ) + { + for ( vlUInt i = 0; i < uiFrames; i++ ) + { + for ( vlUInt j = 0; j < uiFaces; j++ ) { - // Find largest power of 2. - uiNewHeight = this->NextPowerOfTwo(uiHeight); + for ( vlUInt k = 0; k < uiSlices; k++ ) + { + this->CorrectImageGamma( lpNewImageDataRGBA8888[i + j + k], this->Header->Width, this->Header->Height, VTFCreateOptions.sGammaCorrection ); + } + } + } + } - if(VTFCreateOptions.ResizeMethod == RESIZE_NEAREST_POWER2) + // Generate mipmaps off source image. + if ( VTFCreateOptions.bMipmaps && this->Header->MipCount != 1 ) + { + auto temp = std::vector( this->Header->Width * this->Header->Height * 4 ); + + for ( vlUInt i = 0; i < uiFrames; i++ ) + { + for ( vlUInt j = 0; j < uiFaces; j++ ) + { + for ( vlUInt k = 0; k < uiSlices; k++ ) { - if(uiHeight - (uiNewHeight >> 1) < uiNewHeight - uiHeight) + vlByte *pSource = lpNewImageDataRGBA8888[i + j + k]; + + if ( !this->ConvertFromRGBA8888( pSource, this->GetData( i, j, k, 0 ), this->Header->Width, this->Header->Height, this->Header->ImageFormat ) ) { - uiNewHeight >>= 1; + throw 0; + } + + for ( vlUInt m = 1; m < this->Header->MipCount; m++ ) + { + vlUShort usWidth = std::max( 1, this->Header->Width >> m ); + vlUShort usHeight = std::max( 1, this->Header->Height >> m ); + + if ( !stbir_resize_uint8_generic( + pSource, this->Header->Width, this->Header->Height, 0, + temp.data(), usWidth, usHeight, 0, + 4, 3, 0, STBIR_EDGE_CLAMP, STBIR_FILTER_BOX, VTFCreateOptions.bSRGB ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, NULL ) ) + { + throw 0; + } + + if ( !this->ConvertFromRGBA8888( temp.data(), this->GetData( i, j, k, m ), usWidth, usHeight, this->Header->ImageFormat ) ) + { + throw 0; + } } } - else if(VTFCreateOptions.ResizeMethod == RESIZE_SMALLEST_POWER2) + } + } + } + else + { + for ( vlUInt i = 0; i < uiFrames; i++ ) + { + for ( vlUInt j = 0; j < uiFaces; j++ ) + { + for ( vlUInt k = 0; k < uiSlices; k++ ) { - uiNewHeight >>= 1; + if ( !this->ConvertFromRGBA8888( lpNewImageDataRGBA8888[i + j + k], this->GetData( i, j, k, 0 ), this->Header->Width, this->Header->Height, this->Header->ImageFormat ) ) + { + throw 0; + } } + } + } + } + + // Generate thumbnail off mipmaps. 
+ if ( VTFCreateOptions.bThumbnail ) + { + if ( !this->GenerateThumbnail( VTFCreateOptions.bSRGB ) ) + { + throw 0; + } + } - if(uiNewHeight == 0) + if ( VTFCreateOptions.bSphereMap && uiFaces == 6 ) + { + if ( !this->GenerateSphereMap() ) + { + throw 0; + } + } + + if ( VTFCreateOptions.bReflectivity ) + { + this->Header->Reflectivity[0] = 0.0f; + this->Header->Reflectivity[1] = 0.0f; + this->Header->Reflectivity[2] = 0.0f; + + for ( vlUInt i = 0; i < uiFrames; i++ ) + { + for ( vlUInt j = 0; j < uiFaces; j++ ) + { + for ( vlUInt k = 0; k < uiSlices; k++ ) { - uiNewHeight = 1; + vlSingle sX, sY, sZ; + this->ComputeImageReflectivity( lpNewImageDataRGBA8888[i + j + k], uiWidth, uiHeight, sX, sY, sZ ); + + this->Header->Reflectivity[0] += sX; + this->Header->Reflectivity[1] += sY; + this->Header->Reflectivity[2] += sZ; } } - if(VTFCreateOptions.bResizeClamp && uiNewHeight > VTFCreateOptions.uiResizeClampHeight) - { - uiNewHeight = VTFCreateOptions.uiResizeClampHeight; - } - break; - case RESIZE_SET: - uiNewWidth = VTFCreateOptions.uiResizeWidth; - uiNewHeight = VTFCreateOptions.uiResizeHeight; - break; } - assert((uiNewWidth & (uiNewWidth - 1)) == 0); - assert((uiNewHeight & (uiNewHeight - 1)) == 0); + vlSingle sInverse = 1.0f / (vlSingle)( uiFrames * uiFaces * uiSlices ); - // Resize the input. - if(uiWidth != uiNewWidth || uiHeight != uiNewHeight) + this->Header->Reflectivity[0] *= sInverse; + this->Header->Reflectivity[1] *= sInverse; + this->Header->Reflectivity[2] *= sInverse; + } + else + { + this->SetReflectivity( VTFCreateOptions.sReflectivity[0], VTFCreateOptions.sReflectivity[1], VTFCreateOptions.sReflectivity[2] ); + } + + // Set the flags, call SetFlag() to make sure we don't set anything we shouldn't. + for ( vlUInt i = 0, uiFlag = 0x00000001; i < TEXTUREFLAGS_COUNT; i++, uiFlag <<= 1 ) + { + if ( VTFCreateOptions.uiFlags & uiFlag ) { - lpNewImageDataRGBA8888 = new vlByte *[uiCount]; - memset(lpNewImageDataRGBA8888, 0, uiCount * sizeof(vlByte *)); + this->SetFlag( (VTFImageFlag)uiFlag, vlTrue ); + } + } + this->SetStartFrame( VTFCreateOptions.uiStartFrame ); + this->SetBumpmapScale( VTFCreateOptions.sBumpScale ); - for(vlUInt i = 0; i < uiCount; i++) + for ( int i = 0; i < uiCount; i++ ) + delete[] lpNewImageDataRGBA8888[i]; + delete[] lpNewImageDataRGBA8888; + + return vlTrue; + } + catch ( ... ) + { + if ( lpNewImageDataRGBA8888 != 0 ) + { + for ( vlUInt i = 0; i < uiCount; i++ ) + { + delete[] lpNewImageDataRGBA8888[i]; + } + delete[] lpNewImageDataRGBA8888; + } + + this->Destroy(); + + return vlFalse; + } +} + +// +// CreateFloat() +// Normal Create() is strictly tied to RGBA8888, as the highest quality among anything +// not floating lesser formats can be converted to and from RGBA8888 without losses. +// FP16/32 However is above RGBA8888, so it needs to be handled separately from Create(). 
+// +vlBool CVTFFile::CreateFloat( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, vlByte **lpImageDataFP, const SVTFCreateOptions &VTFCreateOptions, const VTFImageFormat &SourceFormat ) +{ + if ( !( VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGBA32323232F || VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGB323232F || VTFCreateOptions.ImageFormat == IMAGE_FORMAT_RGBA16161616F ) ) + return false; + + vlUInt uiCount = 0; + if ( uiFrames > uiCount ) + uiCount = uiFrames; + if ( uiFaces > uiCount ) + uiCount = uiFaces; + if ( uiSlices > uiCount ) + uiCount = uiSlices; + + if ( ( uiFrames == 1 && uiFaces > 1 && uiSlices > 1 ) || ( uiFrames > 1 && uiFaces == 1 && uiSlices > 1 ) || ( uiFrames > 1 && uiFaces > 1 && uiSlices == 1 ) ) + { + LastError.Set( "Invalid image frame, face and slice count combination. Function does not support images with any combination of multiple frames or faces or slices." ); + return vlFalse; + } + + if ( VTFCreateOptions.uiVersion[0] != VTF_MAJOR_VERSION || ( VTFCreateOptions.uiVersion[1] < 0 || VTFCreateOptions.uiVersion[1] > VTF_MINOR_VERSION ) ) + { + LastError.SetFormatted( "File version %u.%u does not match %d.%d to %d.%d.", VTFCreateOptions.uiVersion[0], VTFCreateOptions.uiVersion[1], VTF_MAJOR_VERSION, 0, VTF_MAJOR_VERSION, VTF_MINOR_VERSION ); + return vlFalse; + } + + if ( VTFCreateOptions.uiVersion[0] == VTF_MAJOR_VERSION && VTFCreateOptions.uiVersion[1] < VTF_MINOR_VERSION_MIN_VOLUME && uiSlices > 1 ) + { + LastError.SetFormatted( "Volume textures are only supported in version %d.%d and up.", VTF_MAJOR_VERSION, VTF_MINOR_VERSION_MIN_VOLUME ); + return vlFalse; + } + + if ( VTFCreateOptions.uiVersion[0] == VTF_MAJOR_VERSION && VTFCreateOptions.uiVersion[1] < VTF_MINOR_VERSION_MIN_SPHERE_MAP && uiFaces == 7 ) + { + LastError.SetFormatted( "Sphere maps are only supported in version %d.%d and up.", VTF_MAJOR_VERSION, VTF_MINOR_VERSION_MIN_SPHERE_MAP ); + return vlFalse; + } + + if ( VTFCreateOptions.bMipmaps && uiSlices > 1 ) + { + LastError.Set( "Mipmap generation for depth textures is not supported." ); + return vlFalse; + } + + vlByte **lpNewImageDataFP = new vlByte *[sizeof( vlByte * ) * uiCount]; + + vlUInt fp32ImageSize = CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA32323232F ); + + for ( int i = 0; i < uiCount; i++ ) + { + lpNewImageDataFP[i] = new vlByte[fp32ImageSize]; + + if ( !Convert( lpImageDataFP[i], lpNewImageDataFP[i], uiWidth, uiHeight, SourceFormat, IMAGE_FORMAT_RGBA32323232F ) ) + { + for ( int j = 0; j < i; j++ ) + { + delete[] lpNewImageDataFP[i]; + } + delete[] lpNewImageDataFP; + } + } + + try + { + if ( VTFCreateOptions.bResize ) + { + vlUInt uiNewWidth = uiWidth; + vlUInt uiNewHeight = uiHeight; + + switch ( VTFCreateOptions.ResizeMethod ) + { + case RESIZE_NEAREST_POWER2: + case RESIZE_BIGGEST_POWER2: + case RESIZE_SMALLEST_POWER2: + // Find the best width. + if ( this->IsPowerOfTwo( uiWidth ) ) + { + // Width already a power of 2. + uiNewWidth = uiWidth; + } + else + { + // Find largest power of 2. 
+ uiNewWidth = this->NextPowerOfTwo( uiWidth ); + + if ( VTFCreateOptions.ResizeMethod == RESIZE_NEAREST_POWER2 ) + { + if ( uiWidth - ( uiNewWidth >> 1 ) < uiNewWidth - uiWidth ) + { + uiNewWidth >>= 1; + } + } + else if ( VTFCreateOptions.ResizeMethod == RESIZE_SMALLEST_POWER2 ) + { + uiNewWidth >>= 1; + } + + if ( uiNewWidth == 0 ) + { + uiNewWidth = 1; + } + } + if ( VTFCreateOptions.bResizeClamp && uiNewWidth > VTFCreateOptions.uiResizeClampWidth ) + { + uiNewWidth = VTFCreateOptions.uiResizeClampWidth; + } + + // Find the best height. + if ( this->IsPowerOfTwo( uiHeight ) ) + { + // Height already a power of 2. + uiNewHeight = uiHeight; + } + else + { + // Find largest power of 2. + uiNewHeight = this->NextPowerOfTwo( uiHeight ); + + if ( VTFCreateOptions.ResizeMethod == RESIZE_NEAREST_POWER2 ) + { + if ( uiHeight - ( uiNewHeight >> 1 ) < uiNewHeight - uiHeight ) + { + uiNewHeight >>= 1; + } + } + else if ( VTFCreateOptions.ResizeMethod == RESIZE_SMALLEST_POWER2 ) + { + uiNewHeight >>= 1; + } + + if ( uiNewHeight == 0 ) + { + uiNewHeight = 1; + } + } + if ( VTFCreateOptions.bResizeClamp && uiNewHeight > VTFCreateOptions.uiResizeClampHeight ) + { + uiNewHeight = VTFCreateOptions.uiResizeClampHeight; + } + break; + case RESIZE_SET: + uiNewWidth = VTFCreateOptions.uiResizeWidth; + uiNewHeight = VTFCreateOptions.uiResizeHeight; + break; + } + + assert( ( uiNewWidth & ( uiNewWidth - 1 ) ) == 0 ); + assert( ( uiNewHeight & ( uiNewHeight - 1 ) ) == 0 ); + + // Resize the input. + if ( uiWidth != uiNewWidth || uiHeight != uiNewHeight ) + { + for ( vlUInt i = 0; i < uiCount; i++ ) { - lpNewImageDataRGBA8888[i] = new vlByte[this->ComputeImageSize(uiNewWidth, uiNewHeight, 1, IMAGE_FORMAT_RGBA8888)]; + vlByte *lpScaledFP = new vlByte[this->ComputeImageSize( uiNewWidth, uiNewHeight, 1, IMAGE_FORMAT_RGBA32323232F )]; - if(!this->Resize(lpImageDataRGBA8888[i], lpNewImageDataRGBA8888[i], uiWidth, uiHeight, uiNewWidth, uiNewHeight, VTFCreateOptions.ResizeFilter, VTFCreateOptions.bSRGB)) + if ( !this->ResizeFloat( lpNewImageDataFP[i], lpScaledFP, uiWidth, uiHeight, uiNewWidth, uiNewHeight, VTFCreateOptions.ResizeFilter, VTFCreateOptions.bSRGB ) ) { throw 0; } + + delete[] lpNewImageDataFP[i]; + lpNewImageDataFP[i] = lpScaledFP; } uiWidth = uiNewWidth; uiHeight = uiNewHeight; - - lpImageDataRGBA8888 = lpNewImageDataRGBA8888; } } // Create image (allocate and setup structures). - if(!this->Init(uiWidth, uiHeight, uiFrames, uiFaces + (VTFCreateOptions.bSphereMap && uiFaces == 6 ? 1 : 0), uiSlices, VTFCreateOptions.ImageFormat, VTFCreateOptions.bThumbnail, VTFCreateOptions.bMipmaps ? -1 : 1, vlFalse)) + if ( !this->Init( uiWidth, uiHeight, uiFrames, uiFaces + ( VTFCreateOptions.bSphereMap && uiFaces == 6 ? 1 : 0 ), uiSlices, VTFCreateOptions.ImageFormat, VTFCreateOptions.bThumbnail, VTFCreateOptions.bMipmaps ? -1 : 1, vlTrue ) ) { throw 0; } @@ -652,52 +1073,58 @@ vlBool CVTFFile::Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt this->ComputeResources(); // Do gamma correction. 
- if(VTFCreateOptions.bGammaCorrection) + if ( VTFCreateOptions.bGammaCorrection ) { - for(vlUInt i = 0; i < uiFrames; i++) + for ( vlUInt i = 0; i < uiFrames; i++ ) { - for(vlUInt j = 0; j < uiFaces; j++) + for ( vlUInt j = 0; j < uiFaces; j++ ) { - for(vlUInt k = 0; k < uiSlices; k++) + for ( vlUInt k = 0; k < uiSlices; k++ ) { - this->CorrectImageGamma(lpImageDataRGBA8888[i + j + k], this->Header->Width, this->Header->Height, VTFCreateOptions.sGammaCorrection); + auto lpSource = lpNewImageDataFP[i + j + k]; + auto lpSourceFP = reinterpret_cast( lpSource ); + auto lpLastFP = reinterpret_cast( lpSource + CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA32323232F ) ); + + for ( int test = 0; lpSourceFP < lpLastFP; lpSourceFP += 4, test++ ) + { + lpSourceFP[0] = pow( lpSourceFP[0], 1.0f / VTFCreateOptions.sGammaCorrection ); + lpSourceFP[1] = pow( lpSourceFP[1], 1.0f / VTFCreateOptions.sGammaCorrection ); + lpSourceFP[2] = pow( lpSourceFP[2], 1.0f / VTFCreateOptions.sGammaCorrection ); + } } } } } // Generate mipmaps off source image. - if(VTFCreateOptions.bMipmaps && this->Header->MipCount != 1) + if ( VTFCreateOptions.bMipmaps && this->Header->MipCount != 1 ) { - auto temp = std::vector(this->Header->Width * this->Header->Height * 4); + auto temp = std::vector( ComputeImageSize( this->Header->Width, this->Header->Height, 1, this->Header->ImageFormat ) ); - for(vlUInt i = 0; i < uiFrames; i++) + for ( vlUInt i = 0; i < uiFrames; i++ ) { - for(vlUInt j = 0; j < uiFaces; j++) + for ( vlUInt j = 0; j < uiFaces; j++ ) { - for(vlUInt k = 0; k < uiSlices; k++) + for ( vlUInt k = 0; k < uiSlices; k++ ) { - vlByte* pSource = lpImageDataRGBA8888[i + j + k]; + vlByte *pSource = lpNewImageDataFP[i + j + k]; - if(!this->ConvertFromRGBA8888(pSource, this->GetData(i, j, k, 0), this->Header->Width, this->Header->Height, this->Header->ImageFormat)) + if ( !this->Convert( pSource, this->GetData( i, j, k, 0 ), this->Header->Width, this->Header->Height, IMAGE_FORMAT_RGBA32323232F, VTFCreateOptions.ImageFormat ) ) { throw 0; } - for (vlUInt m = 1; m < this->Header->MipCount; m++) + for ( vlUInt m = 1; m < this->Header->MipCount; m++ ) { - vlUShort usWidth = std::max(1, this->Header->Width >> m); - vlUShort usHeight = std::max(1, this->Header->Height >> m); + vlUShort usWidth = std::max( 1, this->Header->Width >> m ); + vlUShort usHeight = std::max( 1, this->Header->Height >> m ); - if (!stbir_resize_uint8_generic( - pSource, this->Header->Width, this->Header->Height, 0, - temp.data(), usWidth, usHeight, 0, - 4, 3, 0, STBIR_EDGE_CLAMP, STBIR_FILTER_BOX, VTFCreateOptions.bSRGB ? 
STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, NULL)) + if ( !this->ResizeFloat( lpNewImageDataFP[i], temp.data(), uiWidth, uiHeight, usWidth, usHeight, VTFCreateOptions.ResizeFilter, VTFCreateOptions.bSRGB ) ) { throw 0; } - if (!this->ConvertFromRGBA8888(temp.data(), this->GetData(i, j, k, m), usWidth, usHeight, this->Header->ImageFormat)) + if ( !this->Convert( temp.data(), this->GetData( i, j, k, m ), usWidth, usHeight, IMAGE_FORMAT_RGBA32323232F, VTFCreateOptions.ImageFormat ) ) { throw 0; } @@ -708,13 +1135,13 @@ vlBool CVTFFile::Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt } else { - for(vlUInt i = 0; i < uiFrames; i++) + for ( vlUInt i = 0; i < uiFrames; i++ ) { - for(vlUInt j = 0; j < uiFaces; j++) + for ( vlUInt j = 0; j < uiFaces; j++ ) { - for(vlUInt k = 0; k < uiSlices; k++) + for ( vlUInt k = 0; k < uiSlices; k++ ) { - if(!this->ConvertFromRGBA8888(lpImageDataRGBA8888[i + j + k], this->GetData(i, j, k, 0), this->Header->Width, this->Header->Height, this->Header->ImageFormat)) + if ( !this->Convert( lpNewImageDataFP[i + j + k], this->GetData( i, j, k, 0 ), this->Header->Width, this->Header->Height, IMAGE_FORMAT_RGBA32323232F, this->Header->ImageFormat ) ) { throw 0; } @@ -724,45 +1151,56 @@ vlBool CVTFFile::Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt } // Generate thumbnail off mipmaps. - if(VTFCreateOptions.bThumbnail) + if ( VTFCreateOptions.bThumbnail ) { - if(!this->GenerateThumbnail(VTFCreateOptions.bSRGB)) + if ( !this->GenerateThumbnail( VTFCreateOptions.bSRGB ) ) { throw 0; } } - if(VTFCreateOptions.bSphereMap && uiFaces == 6) + if ( VTFCreateOptions.bSphereMap && uiFaces == 6 ) { - if(!this->GenerateSphereMap()) + if ( !this->GenerateSphereMap() ) { throw 0; } } - if(VTFCreateOptions.bReflectivity) + if ( VTFCreateOptions.bReflectivity ) { this->Header->Reflectivity[0] = 0.0f; this->Header->Reflectivity[1] = 0.0f; this->Header->Reflectivity[2] = 0.0f; - for(vlUInt i = 0; i < uiFrames; i++) + for ( vlUInt i = 0; i < uiFrames; i++ ) { - for(vlUInt j = 0; j < uiFaces; j++) + for ( vlUInt j = 0; j < uiFaces; j++ ) { - for(vlUInt k = 0; k < uiSlices; k++) + for ( vlUInt k = 0; k < uiSlices; k++ ) { - vlSingle sX, sY, sZ; - this->ComputeImageReflectivity(lpImageDataRGBA8888[i + j + k], uiWidth, uiHeight, sX, sY, sZ); + // For reflectivity, we don't need to be 100% accurate, just good enough. + // So we can get away with this instead of rewriting ComputeImageReflectivity + // from scratch to work on floating point images. 
+ vlUInt size = ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888 ); + vlByte *reflectRGBA8888Data = reinterpret_cast( malloc( size ) ); + + ConvertToRGBA8888( this->GetData( i, j, k, 0 ), reflectRGBA8888Data, uiWidth, uiHeight, VTFCreateOptions.ImageFormat ); + + vlSingle sX, + sY, sZ; + this->ComputeImageReflectivity( reflectRGBA8888Data, uiWidth, uiHeight, sX, sY, sZ ); this->Header->Reflectivity[0] += sX; this->Header->Reflectivity[1] += sY; this->Header->Reflectivity[2] += sZ; + + free( reflectRGBA8888Data ); } } } - vlSingle sInverse = 1.0f / (vlSingle)(uiFrames * uiFaces * uiSlices); + vlSingle sInverse = 1.0f / (vlSingle)( uiFrames * uiFaces * uiSlices ); this->Header->Reflectivity[0] *= sInverse; this->Header->Reflectivity[1] *= sInverse; @@ -770,36 +1208,31 @@ vlBool CVTFFile::Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt } else { - this->SetReflectivity(VTFCreateOptions.sReflectivity[0], VTFCreateOptions.sReflectivity[1], VTFCreateOptions.sReflectivity[2]); + this->SetReflectivity( VTFCreateOptions.sReflectivity[0], VTFCreateOptions.sReflectivity[1], VTFCreateOptions.sReflectivity[2] ); } // Set the flags, call SetFlag() to make sure we don't set anything we shouldn't. - for(vlUInt i = 0, uiFlag = 0x00000001; i < TEXTUREFLAGS_COUNT; i++, uiFlag <<= 1) + for ( vlUInt i = 0, uiFlag = 0x00000001; i < TEXTUREFLAGS_COUNT; i++, uiFlag <<= 1 ) { - if(VTFCreateOptions.uiFlags & uiFlag) + if ( VTFCreateOptions.uiFlags & uiFlag ) { - this->SetFlag((VTFImageFlag)uiFlag, vlTrue); + this->SetFlag( (VTFImageFlag)uiFlag, vlTrue ); } } - this->SetStartFrame(VTFCreateOptions.uiStartFrame); - this->SetBumpmapScale(VTFCreateOptions.sBumpScale); + this->SetStartFrame( VTFCreateOptions.uiStartFrame ); + this->SetBumpmapScale( VTFCreateOptions.sBumpScale ); + + for ( int i = 0; i < uiCount; i++ ) + delete[] lpNewImageDataFP[i]; + delete[] lpNewImageDataFP; return vlTrue; } - catch(...) + catch ( ... 
) { - if(lpNewImageDataRGBA8888 != 0) - { - for(vlUInt i = 0; i < uiCount; i++) - { - delete []lpNewImageDataRGBA8888[i]; - } - delete []lpNewImageDataRGBA8888; - } - - this->Destroy(); - - return vlFalse; + for ( int i = 0; i < uiCount; i++ ) + delete[] lpNewImageDataFP[i]; + delete[] lpNewImageDataFP; } } @@ -809,11 +1242,11 @@ vlBool CVTFFile::Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt // vlVoid CVTFFile::Destroy() { - if(this->Header != 0) + if ( this->Header != 0 ) { - for(vlUInt i = 0; i < this->Header->ResourceCount; i++) + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) { - delete []this->Header->Data[i].Data; + delete[] this->Header->Data[i].Data; } } @@ -821,35 +1254,35 @@ vlVoid CVTFFile::Destroy() this->Header = 0; this->uiImageBufferSize = 0; - delete []this->lpImageData; - this->lpImageData= 0; + delete[] this->lpImageData; + this->lpImageData = 0; this->uiThumbnailBufferSize = 0; - delete []this->lpThumbnailImageData; + delete[] this->lpThumbnailImageData; this->lpThumbnailImageData = 0; } -vlBool CVTFFile::IsPowerOfTwo(vlUInt uiSize) +vlBool CVTFFile::IsPowerOfTwo( vlUInt uiSize ) { - return uiSize > 0 && (uiSize & (uiSize - 1)) == 0; + return uiSize > 0 && ( uiSize & ( uiSize - 1 ) ) == 0; } -vlUInt CVTFFile::NextPowerOfTwo(vlUInt uiSize) +vlUInt CVTFFile::NextPowerOfTwo( vlUInt uiSize ) { - if(uiSize == 0) + if ( uiSize == 0 ) { return 1; } - if(this->IsPowerOfTwo(uiSize)) + if ( this->IsPowerOfTwo( uiSize ) ) { return uiSize; } uiSize--; - for(vlUInt i = 1; i <= sizeof(vlUInt) * 4; i <<= 1) + for ( vlUInt i = 1; i <= sizeof( vlUInt ) * 4; i <<= 1 ) { - uiSize = uiSize | (uiSize >> i); + uiSize = uiSize | ( uiSize >> i ); } uiSize++; @@ -867,47 +1300,47 @@ vlBool CVTFFile::IsLoaded() const return this->Header != 0; } -vlBool CVTFFile::Load(const vlChar *cFileName, vlBool bHeaderOnly) +vlBool CVTFFile::Load( const vlChar *cFileName, vlBool bHeaderOnly ) { - IO::Readers::CFileReader reader(cFileName); - return this->Load(&reader, bHeaderOnly); + IO::Readers::CFileReader reader( cFileName ); + return this->Load( &reader, bHeaderOnly ); } -vlBool CVTFFile::Load(const vlVoid *lpData, vlUInt uiBufferSize, vlBool bHeaderOnly) +vlBool CVTFFile::Load( const vlVoid *lpData, vlUInt uiBufferSize, vlBool bHeaderOnly ) { - IO::Readers::CMemoryReader reader(lpData, uiBufferSize); - return this->Load(&reader, bHeaderOnly); + IO::Readers::CMemoryReader reader( lpData, uiBufferSize ); + return this->Load( &reader, bHeaderOnly ); } -vlBool CVTFFile::Load(vlVoid *pUserData, vlBool bHeaderOnly) +vlBool CVTFFile::Load( vlVoid *pUserData, vlBool bHeaderOnly ) { - IO::Readers::CProcReader reader(pUserData); - return this->Load(&reader, bHeaderOnly); + IO::Readers::CProcReader reader( pUserData ); + return this->Load( &reader, bHeaderOnly ); } -vlBool CVTFFile::Save(const vlChar *cFileName) const +vlBool CVTFFile::Save( const vlChar *cFileName ) const { - IO::Writers::CFileWriter writer(cFileName); - return this->Save(&writer); + IO::Writers::CFileWriter writer( cFileName ); + return this->Save( &writer ); } -vlBool CVTFFile::Save(vlVoid *lpData, vlUInt uiBufferSize, vlUInt &uiSize) const +vlBool CVTFFile::Save( vlVoid *lpData, vlUInt uiBufferSize, vlUInt &uiSize ) const { uiSize = 0; - IO::Writers::CMemoryWriter MemoryWriter = IO::Writers::CMemoryWriter(lpData, uiBufferSize); + IO::Writers::CMemoryWriter MemoryWriter = IO::Writers::CMemoryWriter( lpData, uiBufferSize ); - vlBool bResult = this->Save(&MemoryWriter); + vlBool bResult = this->Save( &MemoryWriter ); 
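IsPowerOfTwo()/NextPowerOfTwo() above use the standard bit-smearing trick: clearing the lowest set bit tests for a power of two, and OR-ing successively shifted copies of a decremented value sets every bit below the most significant one, so adding one lands on the next power of two. A self-contained version with a couple of worked values, using uint32_t in place of vlUInt:

    #include <cassert>
    #include <cstdint>

    // A value is a power of two iff exactly one bit is set; clearing the lowest
    // set bit with n & (n - 1) then leaves zero.
    static bool IsPow2(uint32_t n)
    {
        return n > 0 && (n & (n - 1)) == 0;
    }

    // Round up to the next power of two by smearing the highest set bit into all
    // lower positions, then adding one, the same technique as NextPowerOfTwo above.
    static uint32_t NextPow2(uint32_t n)
    {
        if (n == 0)
            return 1;
        if (IsPow2(n))
            return n;
        n--;
        for (uint32_t shift = 1; shift <= 16; shift <<= 1)
            n |= n >> shift;    // after the loop every bit below the MSB is set
        return n + 1;
    }

    int main()
    {
        assert(NextPow2(300) == 512);   // 299 smears to 0b111111111 = 511, +1 = 512
        assert(NextPow2(512) == 512);   // already a power of two, returned unchanged
        assert(NextPow2(1) == 1);
        return 0;
    }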
uiSize = MemoryWriter.GetStreamSize(); return bResult; } -vlBool CVTFFile::Save(vlVoid *pUserData) const +vlBool CVTFFile::Save( vlVoid *pUserData ) const { - IO::Writers::CProcWriter writer(pUserData); - return this->Save(&writer); + IO::Writers::CProcWriter writer( pUserData ); + return this->Save( &writer ); } // ----------------------------------------------------------------------------------- @@ -917,216 +1350,216 @@ vlBool CVTFFile::Save(vlVoid *pUserData) const // Reader - The stream to read from. // bHeaderOnly - only read in the header if true (dont allocate and read image data in) // ------------------------------------------------------------------------------------ -vlBool CVTFFile::Load(IO::Readers::IReader *Reader, vlBool bHeaderOnly) +vlBool CVTFFile::Load( IO::Readers::IReader *Reader, vlBool bHeaderOnly ) { this->Destroy(); try { - if(!Reader->Open()) + if ( !Reader->Open() ) throw 0; // Get the size of the .vtf file. vlUInt uiFileSize = Reader->GetStreamSize(); // Check we at least have enough bytes for a header. - if(uiFileSize < sizeof(SVTFFileHeader)) + if ( uiFileSize < sizeof( SVTFFileHeader ) ) { - LastError.Set("File is corrupt; file to small for it's header."); + LastError.Set( "File is corrupt; file to small for it's header." ); throw 0; } SVTFFileHeader FileHeader; // read the file header - memset(&FileHeader, 0, sizeof(SVTFFileHeader)); - if(Reader->Read(&FileHeader, sizeof(SVTFFileHeader)) != sizeof(SVTFFileHeader)) + memset( &FileHeader, 0, sizeof( SVTFFileHeader ) ); + if ( Reader->Read( &FileHeader, sizeof( SVTFFileHeader ) ) != sizeof( SVTFFileHeader ) ) { throw 0; } - if(memcmp(FileHeader.TypeString, "VTF\0", 4) != 0) + if ( memcmp( FileHeader.TypeString, "VTF\0", 4 ) != 0 ) { - LastError.Set("File signature does not match 'VTF'."); + LastError.Set( "File signature does not match 'VTF'." 
); throw 0; } - if(FileHeader.Version[0] != VTF_MAJOR_VERSION || (FileHeader.Version[1] < 0 || FileHeader.Version[1] > VTF_MINOR_VERSION)) + if ( FileHeader.Version[0] != VTF_MAJOR_VERSION || ( FileHeader.Version[1] < 0 || FileHeader.Version[1] > VTF_MINOR_VERSION ) ) { - LastError.SetFormatted("File version %u.%u does not match %d.%d to %d.%d.", FileHeader.Version[0], FileHeader.Version[1], VTF_MAJOR_VERSION, 0, VTF_MAJOR_VERSION, VTF_MINOR_VERSION); + LastError.SetFormatted( "File version %u.%u does not match %d.%d to %d.%d.", FileHeader.Version[0], FileHeader.Version[1], VTF_MAJOR_VERSION, 0, VTF_MAJOR_VERSION, VTF_MINOR_VERSION ); throw 0; } - if(FileHeader.HeaderSize > sizeof(SVTFHeader)) + if ( FileHeader.HeaderSize > sizeof( SVTFHeader ) ) { - LastError.SetFormatted("File header size %d B is larger than the %d B maximum expected.", FileHeader.HeaderSize, sizeof(SVTFHeader)); + LastError.SetFormatted( "File header size %d B is larger than the %d B maximum expected.", FileHeader.HeaderSize, sizeof( SVTFHeader ) ); throw 0; } - Reader->Seek(0, SEEK_SET); + Reader->Seek( 0, SEEK_SET ); this->Header = new SVTFHeader; - memset(this->Header, 0, sizeof(SVTFHeader)); + memset( this->Header, 0, sizeof( SVTFHeader ) ); // read the header - if(Reader->Read(this->Header, FileHeader.HeaderSize) != FileHeader.HeaderSize) + if ( Reader->Read( this->Header, FileHeader.HeaderSize ) != FileHeader.HeaderSize ) { throw 0; } - if(this->Header->Version[0] < VTF_MAJOR_VERSION || (this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] < VTF_MINOR_VERSION_MIN_VOLUME)) + if ( this->Header->Version[0] < VTF_MAJOR_VERSION || ( this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] < VTF_MINOR_VERSION_MIN_VOLUME ) ) { // set depth if version is lower than 7.2 this->Header->Depth = 1; } - if(!this->GetSupportsResources()) + if ( !this->GetSupportsResources() ) { // set resource count if version is lower than 7.3 - this->Header->ResourceCount = 0; - } - - // if we just want the header loaded, bail here - if(bHeaderOnly) - { - Reader->Close(); - return vlTrue; - } - - // work out how big out buffers need to be - this->uiImageBufferSize = this->ComputeImageSize(this->Header->Width, this->Header->Height, this->Header->Depth, this->Header->MipCount, this->Header->ImageFormat) * this->GetFaceCount() * this->GetFrameCount(); - - if(this->Header->LowResImageFormat != IMAGE_FORMAT_NONE) - { - this->uiThumbnailBufferSize = this->ComputeImageSize(this->Header->LowResImageWidth, this->Header->LowResImageHeight, 1, this->Header->LowResImageFormat); - } - else - { - this->uiThumbnailBufferSize = 0; - } - - // read the resource directory if version > 7.3 - vlUInt uiThumbnailBufferOffset = 0, uiImageDataOffset = 0, uiRealImageSize = this->uiImageBufferSize; - vlBool bHasAuxCompression = false; - vlByte* lpCompressionInfo = 0; - if(this->Header->ResourceCount) - { - if(this->Header->ResourceCount > VTF_RSRC_MAX_DICTIONARY_ENTRIES) - { - LastError.SetFormatted("File may be corrupt; directory length %u exceeds maximum dictionary length of %u.", this->Header->ResourceCount, VTF_RSRC_MAX_DICTIONARY_ENTRIES); - throw 0; - } - - for(vlUInt i = 0; i < this->Header->ResourceCount; i++) - { - switch(this->Header->Resources[i].Type) - { - case VTF_LEGACY_RSRC_LOW_RES_IMAGE: - if(this->Header->LowResImageFormat == IMAGE_FORMAT_NONE) - { - LastError.Set("File may be corrupt; unexpected low resolution image directory entry."); - throw 0; - } - if(uiThumbnailBufferOffset != 0) - { - LastError.Set("File may 
be corrupt; multiple low resolution image directory entries."); - throw 0; - } - uiThumbnailBufferOffset = this->Header->Resources[i].Data; - break; - case VTF_LEGACY_RSRC_IMAGE: - if(uiImageDataOffset != 0) - { - LastError.Set("File may be corrupt; multiple image directory entries."); - throw 0; - } - uiImageDataOffset = this->Header->Resources[i].Data; - break; - case VTF_RSRC_AUX_COMPRESSION_INFO: // If no data chunk, compression = 0 and so we don't need to deal with this case specially. - { - if (this->Header->Resources[i].Data + sizeof(vlUInt) > uiFileSize) - { - LastError.Set("File may be corrupt; file too small for its resource data."); - } - - vlUInt uiSize = 0; - Reader->Seek(this->Header->Resources[i].Data, SEEK_SET); - if (Reader->Read(&uiSize, sizeof(vlUInt)) != sizeof(vlUInt)) - { - LastError.Set("File may be corrupt; file too small for its resource data."); - throw 0; - } - - if (this->Header->Resources[i].Data + sizeof(vlUInt) + uiSize > uiFileSize) - { - LastError.Set("File may be corrupt; file too small for its resource data."); - throw 0; - } - - this->Header->Data[i].Size = uiSize; - lpCompressionInfo = this->Header->Data[i].Data = new vlByte[uiSize]; - if (Reader->Read(lpCompressionInfo, uiSize) != uiSize) - { - throw 0; - } - - if (uiSize > sizeof(SVTFAuxCompressionInfoHeader)) - { - vlUInt32 CompressionLevel = ((SVTFAuxCompressionInfoHeader*)lpCompressionInfo)->CompressionLevel; - bHasAuxCompression = CompressionLevel != 0; - } + this->Header->ResourceCount = 0; + } - if (!bHasAuxCompression) - break; + // if we just want the header loaded, bail here + if ( bHeaderOnly ) + { + Reader->Close(); + return vlTrue; + } - uiRealImageSize = 0; + // work out how big out buffers need to be + this->uiImageBufferSize = this->ComputeImageSize( this->Header->Width, this->Header->Height, this->Header->Depth, this->Header->MipCount, this->Header->ImageFormat ) * this->GetFaceCount() * this->GetFrameCount(); - for (vlInt iMip = this->Header->MipCount - 1; iMip >= 0; --iMip) - { - for (vlUInt uiFrame = 0; uiFrame < this->Header->Frames; ++uiFrame) - { - for (vlUInt uiFace = 0; uiFace < GetFaceCount(); ++uiFace) - { - vlUInt infoOffset = GetAuxInfoOffset(uiFrame, uiFace, iMip); + if ( this->Header->LowResImageFormat != IMAGE_FORMAT_NONE ) + { + this->uiThumbnailBufferSize = this->ComputeImageSize( this->Header->LowResImageWidth, this->Header->LowResImageHeight, 1, this->Header->LowResImageFormat ); + } + else + { + this->uiThumbnailBufferSize = 0; + } - SVTFAuxCompressionInfoEntry* infoEntry = (SVTFAuxCompressionInfoEntry*)(lpCompressionInfo + infoOffset); + // read the resource directory if version > 7.3 + vlUInt uiThumbnailBufferOffset = 0, uiImageDataOffset = 0, uiRealImageSize = this->uiImageBufferSize; + vlBool bHasAuxCompression = false; + vlByte *lpCompressionInfo = 0; + if ( this->Header->ResourceCount ) + { + if ( this->Header->ResourceCount > VTF_RSRC_MAX_DICTIONARY_ENTRIES ) + { + LastError.SetFormatted( "File may be corrupt; directory length %u exceeds maximum dictionary length of %u.", this->Header->ResourceCount, VTF_RSRC_MAX_DICTIONARY_ENTRIES ); + throw 0; + } - uiRealImageSize += infoEntry->CompressedSize; - } + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) + { + switch ( this->Header->Resources[i].Type ) + { + case VTF_LEGACY_RSRC_LOW_RES_IMAGE: + if ( this->Header->LowResImageFormat == IMAGE_FORMAT_NONE ) + { + LastError.Set( "File may be corrupt; unexpected low resolution image directory entry." 
); + throw 0; } - } - break; - } - default: - if((this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK) == 0) - { - if(this->Header->Resources[i].Data + sizeof(vlUInt) > uiFileSize) + if ( uiThumbnailBufferOffset != 0 ) + { + LastError.Set( "File may be corrupt; multiple low resolution image directory entries." ); + throw 0; + } + uiThumbnailBufferOffset = this->Header->Resources[i].Data; + break; + case VTF_LEGACY_RSRC_IMAGE: + if ( uiImageDataOffset != 0 ) { - LastError.Set("File may be corrupt; file too small for its resource data."); + LastError.Set( "File may be corrupt; multiple image directory entries." ); throw 0; } + uiImageDataOffset = this->Header->Resources[i].Data; + break; + case VTF_RSRC_AUX_COMPRESSION_INFO: // If no data chunk, compression = 0 and so we don't need to deal with this case specially. + { + if ( this->Header->Resources[i].Data + sizeof( vlUInt ) > uiFileSize ) + { + LastError.Set( "File may be corrupt; file too small for its resource data." ); + } vlUInt uiSize = 0; - Reader->Seek(this->Header->Resources[i].Data, SEEK_SET); - if(Reader->Read(&uiSize, sizeof(vlUInt)) != sizeof(vlUInt)) + Reader->Seek( this->Header->Resources[i].Data, SEEK_SET ); + if ( Reader->Read( &uiSize, sizeof( vlUInt ) ) != sizeof( vlUInt ) ) { + LastError.Set( "File may be corrupt; file too small for its resource data." ); throw 0; } - if(this->Header->Resources[i].Data + sizeof(vlUInt) + uiSize > uiFileSize) + if ( this->Header->Resources[i].Data + sizeof( vlUInt ) + uiSize > uiFileSize ) { - LastError.Set("File may be corrupt; file too small for its resource data."); + LastError.Set( "File may be corrupt; file too small for its resource data." ); throw 0; } this->Header->Data[i].Size = uiSize; - this->Header->Data[i].Data = new vlByte[uiSize]; - if(Reader->Read(this->Header->Data[i].Data, uiSize) != uiSize) + lpCompressionInfo = this->Header->Data[i].Data = new vlByte[uiSize]; + if ( Reader->Read( lpCompressionInfo, uiSize ) != uiSize ) { throw 0; } + + if ( uiSize > sizeof( SVTFAuxCompressionInfoHeader ) ) + { + vlUInt32 CompressionLevel = ( (SVTFAuxCompressionInfoHeader *)lpCompressionInfo )->CompressionLevel; + bHasAuxCompression = CompressionLevel != 0; + } + + if ( !bHasAuxCompression ) + break; + + uiRealImageSize = 0; + + for ( vlInt iMip = this->Header->MipCount - 1; iMip >= 0; --iMip ) + { + for ( vlUInt uiFrame = 0; uiFrame < this->Header->Frames; ++uiFrame ) + { + for ( vlUInt uiFace = 0; uiFace < GetFaceCount(); ++uiFace ) + { + vlUInt infoOffset = GetAuxInfoOffset( uiFrame, uiFace, iMip ); + + SVTFAuxCompressionInfoEntry *infoEntry = (SVTFAuxCompressionInfoEntry *)( lpCompressionInfo + infoOffset ); + + uiRealImageSize += infoEntry->CompressedSize; + } + } + } + break; } - break; + default: + if ( ( this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK ) == 0 ) + { + if ( this->Header->Resources[i].Data + sizeof( vlUInt ) > uiFileSize ) + { + LastError.Set( "File may be corrupt; file too small for its resource data." ); + throw 0; + } + + vlUInt uiSize = 0; + Reader->Seek( this->Header->Resources[i].Data, SEEK_SET ); + if ( Reader->Read( &uiSize, sizeof( vlUInt ) ) != sizeof( vlUInt ) ) + { + throw 0; + } + + if ( this->Header->Resources[i].Data + sizeof( vlUInt ) + uiSize > uiFileSize ) + { + LastError.Set( "File may be corrupt; file too small for its resource data." 
); + throw 0; + } + + this->Header->Data[i].Size = uiSize; + this->Header->Data[i].Data = new vlByte[uiSize]; + if ( Reader->Read( this->Header->Data[i].Data, uiSize ) != uiSize ) + { + throw 0; + } + } + break; } } } @@ -1135,100 +1568,100 @@ vlBool CVTFFile::Load(IO::Readers::IReader *Reader, vlBool bHeaderOnly) uiThumbnailBufferOffset = this->Header->HeaderSize; uiImageDataOffset = uiThumbnailBufferOffset + this->uiThumbnailBufferSize; } - + // sanity check // headersize + lowbuffersize + buffersize *should* equal the filesize - if(this->Header->HeaderSize > uiFileSize || uiThumbnailBufferOffset + this->uiThumbnailBufferSize > uiFileSize || uiImageDataOffset + uiRealImageSize > uiFileSize) + if ( this->Header->HeaderSize > uiFileSize || uiThumbnailBufferOffset + this->uiThumbnailBufferSize > uiFileSize || uiImageDataOffset + uiRealImageSize > uiFileSize ) { - LastError.Set("File may be corrupt; file too small for its image data."); + LastError.Set( "File may be corrupt; file too small for its image data." ); throw 0; } - if(uiThumbnailBufferOffset == 0) + if ( uiThumbnailBufferOffset == 0 ) { this->Header->LowResImageFormat = IMAGE_FORMAT_NONE; } // assuming all is well, size our data buffers - if(this->Header->LowResImageFormat != IMAGE_FORMAT_NONE) + if ( this->Header->LowResImageFormat != IMAGE_FORMAT_NONE ) { this->lpThumbnailImageData = new vlByte[this->uiThumbnailBufferSize]; // load the low res data - Reader->Seek(uiThumbnailBufferOffset, SEEK_SET); - if(Reader->Read(this->lpThumbnailImageData, this->uiThumbnailBufferSize) != this->uiThumbnailBufferSize) + Reader->Seek( uiThumbnailBufferOffset, SEEK_SET ); + if ( Reader->Read( this->lpThumbnailImageData, this->uiThumbnailBufferSize ) != this->uiThumbnailBufferSize ) { throw 0; } } - if(uiImageDataOffset == 0) + if ( uiImageDataOffset == 0 ) { this->Header->ImageFormat = IMAGE_FORMAT_NONE; } - if(this->Header->ImageFormat != IMAGE_FORMAT_NONE) + if ( this->Header->ImageFormat != IMAGE_FORMAT_NONE ) { this->lpImageData = new vlByte[this->uiImageBufferSize]; - Reader->Seek(uiImageDataOffset, SEEK_SET); + Reader->Seek( uiImageDataOffset, SEEK_SET ); // Load the compressed image - if (bHasAuxCompression) + if ( bHasAuxCompression ) { // Prepare decompression stream z_stream zStream = { 0 }; - if (bHasAuxCompression && (inflateInit(&zStream) != Z_OK)) + if ( bHasAuxCompression && ( inflateInit( &zStream ) != Z_OK ) ) { - LastError.Set("Unable to initialise VTF decompression stream!\n"); + LastError.Set( "Unable to initialise VTF decompression stream!\n" ); throw 0; } - vlByte* lpCompressionBuf = new vlByte[uiRealImageSize]; - if (Reader->Read(lpCompressionBuf, uiRealImageSize) != uiRealImageSize) + vlByte *lpCompressionBuf = new vlByte[uiRealImageSize]; + if ( Reader->Read( lpCompressionBuf, uiRealImageSize ) != uiRealImageSize ) { - LastError.Set("Unable to read compressed VTF!\n"); - inflateEnd(&zStream); + LastError.Set( "Unable to read compressed VTF!\n" ); + inflateEnd( &zStream ); throw 0; } vlInt totalRead = 0; - for (vlInt iMip = this->Header->MipCount - 1; iMip >= 0; --iMip) + for ( vlInt iMip = this->Header->MipCount - 1; iMip >= 0; --iMip ) { - vlInt iMipSize = ComputeMipmapSize(this->Header->Width, this->Header->Height, 1, iMip, this->Header->ImageFormat); + vlInt iMipSize = ComputeMipmapSize( this->Header->Width, this->Header->Height, 1, iMip, this->Header->ImageFormat ); - for (vlUInt uiFrame = 0; uiFrame < this->Header->Frames; ++uiFrame) + for ( vlUInt uiFrame = 0; uiFrame < this->Header->Frames; ++uiFrame ) { - for 
(vlUInt uiFace = 0; uiFace < GetFaceCount(); ++uiFace) + for ( vlUInt uiFace = 0; uiFace < GetFaceCount(); ++uiFace ) { - vlByte *lpMipBits = GetData(uiFrame, uiFace, 0, iMip); + vlByte *lpMipBits = GetData( uiFrame, uiFace, 0, iMip ); - vlUInt uiInfoOffset = GetAuxInfoOffset(uiFrame, uiFace, iMip); + vlUInt uiInfoOffset = GetAuxInfoOffset( uiFrame, uiFace, iMip ); - SVTFAuxCompressionInfoEntry* pInfoEntry = (SVTFAuxCompressionInfoEntry*)(lpCompressionInfo + uiInfoOffset); + SVTFAuxCompressionInfoEntry *pInfoEntry = (SVTFAuxCompressionInfoEntry *)( lpCompressionInfo + uiInfoOffset ); // Decompress zStream.next_in = lpCompressionBuf + totalRead; zStream.avail_in = pInfoEntry->CompressedSize; zStream.total_out = 0; - while (zStream.avail_in) + while ( zStream.avail_in ) { zStream.next_out = lpMipBits + zStream.total_out; zStream.avail_out = iMipSize - zStream.total_out; - vlInt zRet = inflate(&zStream, Z_NO_FLUSH); - vlBool zFailure = (zRet != Z_OK) && (zRet != Z_STREAM_END); - if (zFailure || ((zRet == Z_STREAM_END) && (zStream.total_out != iMipSize))) + vlInt zRet = inflate( &zStream, Z_NO_FLUSH ); + vlBool zFailure = ( zRet != Z_OK ) && ( zRet != Z_STREAM_END ); + if ( zFailure || ( ( zRet == Z_STREAM_END ) && ( zStream.total_out != iMipSize ) ) ) { - LastError.Set("Unable to decompress VTF!\n"); - inflateEnd(&zStream); + LastError.Set( "Unable to decompress VTF!\n" ); + inflateEnd( &zStream ); throw 0; } } - inflateReset(&zStream); + inflateReset( &zStream ); totalRead += pInfoEntry->CompressedSize; } @@ -1236,9 +1669,9 @@ vlBool CVTFFile::Load(IO::Readers::IReader *Reader, vlBool bHeaderOnly) } delete[] lpCompressionBuf; - inflateEnd(&zStream); + inflateEnd( &zStream ); } - else if (Reader->Read(this->lpImageData, this->uiImageBufferSize) != this->uiImageBufferSize) // load the high-res data + else if ( Reader->Read( this->lpImageData, this->uiImageBufferSize ) != this->uiImageBufferSize ) // load the high-res data { throw 0; } @@ -1247,7 +1680,7 @@ vlBool CVTFFile::Load(IO::Readers::IReader *Reader, vlBool bHeaderOnly) // Fixup resource offsets for writing. this->ComputeResources(); } - catch(...) + catch ( ... ) { Reader->Close(); @@ -1265,19 +1698,19 @@ vlBool CVTFFile::Load(IO::Readers::IReader *Reader, vlBool bHeaderOnly) // Save() // Saves the current image. Basic format checking is done. // -vlBool CVTFFile::Save(IO::Writers::IWriter *Writer) const +vlBool CVTFFile::Save( IO::Writers::IWriter *Writer ) const { - if(!this->IsLoaded() || !this->GetHasImage()) + if ( !this->IsLoaded() || !this->GetHasImage() ) { - LastError.Set("No image to save."); + LastError.Set( "No image to save." ); return vlFalse; } // Check for aux compression in case we should use the compressed path vlInt iCompressionLevel = GetAuxCompressionLevel(); - if (iCompressionLevel != 0) + if ( iCompressionLevel != 0 ) { - return SaveCompressed(Writer, iCompressionLevel); + return SaveCompressed( Writer, iCompressionLevel ); } // ToDo: Check if the image buffer is ok. @@ -1285,74 +1718,73 @@ vlBool CVTFFile::Save(IO::Writers::IWriter *Writer) const try { - if(!Writer->Open()) + if ( !Writer->Open() ) throw 0; // Write the header. 
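The decompression loop above and SaveCompressed() further below both follow the usual raw-zlib pattern: size the output with deflateBound(), deflate() each mip with Z_FINISH while recording the per-chunk compressed size, and on load inflate() each chunk into a destination whose size is already known from ComputeMipmapSize(). A minimal single-buffer round trip, assuming only zlib; the real code does this once per mip/frame/face and reuses one z_stream via deflateReset()/inflateReset():

    #include <vector>
    #include <zlib.h>

    // Compress src into a buffer sized with deflateBound(), returning the
    // compressed bytes; mirrors the upper-bound allocation in SaveCompressed().
    static std::vector<unsigned char> Compress(const std::vector<unsigned char>& src, int level)
    {
        z_stream zs = {};
        if (deflateInit(&zs, level) != Z_OK)
            return {};

        std::vector<unsigned char> dst(deflateBound(&zs, static_cast<uLong>(src.size())));
        zs.next_in = const_cast<unsigned char*>(src.data());
        zs.avail_in = static_cast<uInt>(src.size());
        zs.next_out = dst.data();
        zs.avail_out = static_cast<uInt>(dst.size());

        int ret = deflate(&zs, Z_FINISH);   // one shot: everything fits in the bound
        dst.resize(ret == Z_STREAM_END ? zs.total_out : 0);
        deflateEnd(&zs);
        return dst;
    }

    // Inflate into a destination whose uncompressed size is already known,
    // just as the loader knows each mip's size up front.
    static bool Decompress(const std::vector<unsigned char>& src, std::vector<unsigned char>& dst)
    {
        z_stream zs = {};
        if (inflateInit(&zs) != Z_OK)
            return false;

        zs.next_in = const_cast<unsigned char*>(src.data());
        zs.avail_in = static_cast<uInt>(src.size());
        zs.next_out = dst.data();
        zs.avail_out = static_cast<uInt>(dst.size());

        int ret = inflate(&zs, Z_FINISH);
        inflateEnd(&zs);
        return ret == Z_STREAM_END && zs.total_out == dst.size();
    }

    int main()
    {
        std::vector<unsigned char> original(4096, 0x7F);
        std::vector<unsigned char> packed = Compress(original, 6);
        std::vector<unsigned char> unpacked(original.size());
        return (Decompress(packed, unpacked) && unpacked == original) ? 0 : 1;
    }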
- if(Writer->Write(this->Header, this->Header->HeaderSize) != this->Header->HeaderSize) + if ( Writer->Write( this->Header, this->Header->HeaderSize ) != this->Header->HeaderSize ) { throw 0; } - if(this->GetSupportsResources()) + if ( this->GetSupportsResources() ) { - - for(vlUInt i = 0; i < this->Header->ResourceCount; i++) + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) { - switch(this->Header->Resources[i].Type) - { - case VTF_LEGACY_RSRC_LOW_RES_IMAGE: - if(Writer->Write(this->lpThumbnailImageData, this->uiThumbnailBufferSize) != this->uiThumbnailBufferSize) - { - throw 0; - } - break; - case VTF_LEGACY_RSRC_IMAGE: + switch ( this->Header->Resources[i].Type ) { - if (Writer->Write(this->lpImageData, this->uiImageBufferSize) != this->uiImageBufferSize) - { - throw 0; - } - break; - } - default: - if((this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK) == 0) - { - if(Writer->Write(&this->Header->Data[i].Size, sizeof(vlUInt)) != sizeof(vlUInt)) + case VTF_LEGACY_RSRC_LOW_RES_IMAGE: + if ( Writer->Write( this->lpThumbnailImageData, this->uiThumbnailBufferSize ) != this->uiThumbnailBufferSize ) { throw 0; } - - if(Writer->Write(this->Header->Data[i].Data, this->Header->Data[i].Size) != this->Header->Data[i].Size) + break; + case VTF_LEGACY_RSRC_IMAGE: + { + if ( Writer->Write( this->lpImageData, this->uiImageBufferSize ) != this->uiImageBufferSize ) { throw 0; } + break; } + default: + if ( ( this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK ) == 0 ) + { + if ( Writer->Write( &this->Header->Data[i].Size, sizeof( vlUInt ) ) != sizeof( vlUInt ) ) + { + throw 0; + } + + if ( Writer->Write( this->Header->Data[i].Data, this->Header->Data[i].Size ) != this->Header->Data[i].Size ) + { + throw 0; + } + } } } } else { - if(this->Header->LowResImageFormat != IMAGE_FORMAT_NONE) + if ( this->Header->LowResImageFormat != IMAGE_FORMAT_NONE ) { // write the thumbnail image data - if(Writer->Write(this->lpThumbnailImageData, this->uiThumbnailBufferSize) != this->uiThumbnailBufferSize) + if ( Writer->Write( this->lpThumbnailImageData, this->uiThumbnailBufferSize ) != this->uiThumbnailBufferSize ) { throw 0; } } - if(this->Header->ImageFormat != IMAGE_FORMAT_NONE) + if ( this->Header->ImageFormat != IMAGE_FORMAT_NONE ) { // write the image data - if(Writer->Write(this->lpImageData, this->uiImageBufferSize) != this->uiImageBufferSize) + if ( Writer->Write( this->lpImageData, this->uiImageBufferSize ) != this->uiImageBufferSize ) { throw 0; } } } } - catch(...) + catch ( ... ) { Writer->Close(); @@ -1368,86 +1800,84 @@ vlBool CVTFFile::Save(IO::Writers::IWriter *Writer) const // SaveCompressed() // Saves the current image with a certain compression level. Basic format checking is done. // -vlBool CVTFFile::SaveCompressed(IO::Writers::IWriter* Writer, vlInt iCompressionLevel) const +vlBool CVTFFile::SaveCompressed( IO::Writers::IWriter *Writer, vlInt iCompressionLevel ) const { - if (!this->IsLoaded() || !this->GetHasImage()) + if ( !this->IsLoaded() || !this->GetHasImage() ) { - LastError.Set("No image to save."); + LastError.Set( "No image to save." ); return vlFalse; } - if (this->GetMajorVersion() < 7 || (this->GetMajorVersion() == 7 && this->GetMinorVersion() < 6)) + if ( this->GetMajorVersion() < 7 || ( this->GetMajorVersion() == 7 && this->GetMinorVersion() < 6 ) ) { - LastError.Set("VTF Version <7.6 does not support auxiliary compression."); + LastError.Set( "VTF Version <7.6 does not support auxiliary compression." 
); return vlFalse; } - if (iCompressionLevel <= 0 && iCompressionLevel != SVTFAuxCompressionInfoHeader::DEFAULT_COMPRESSION) + if ( iCompressionLevel <= 0 && iCompressionLevel != SVTFAuxCompressionInfoHeader::DEFAULT_COMPRESSION ) { - LastError.Set("Invalid compression level while saving."); + LastError.Set( "Invalid compression level while saving." ); return vlFalse; } // Initialise new compression info - vlULong ulCompressionInfoSize = sizeof(SVTFAuxCompressionInfoHeader) - + (this->Header->MipCount * this->Header->Frames * GetFaceCount()) - * sizeof(SVTFAuxCompressionInfoEntry); + vlULong ulCompressionInfoSize = sizeof( SVTFAuxCompressionInfoHeader ) + ( this->Header->MipCount * this->Header->Frames * GetFaceCount() ) * sizeof( SVTFAuxCompressionInfoEntry ); - vlByte* lpCompressionInfo = new vlByte[ulCompressionInfoSize]; + vlByte *lpCompressionInfo = new vlByte[ulCompressionInfoSize]; - SVTFAuxCompressionInfoHeader* pInfoHeader = (SVTFAuxCompressionInfoHeader*)lpCompressionInfo; + SVTFAuxCompressionInfoHeader *pInfoHeader = (SVTFAuxCompressionInfoHeader *)lpCompressionInfo; pInfoHeader->CompressionLevel = iCompressionLevel; - if (iCompressionLevel == SVTFAuxCompressionInfoHeader::DEFAULT_COMPRESSION) + if ( iCompressionLevel == SVTFAuxCompressionInfoHeader::DEFAULT_COMPRESSION ) iCompressionLevel = Z_DEFAULT_COMPRESSION; - vlByte* lpCompressedImage = nullptr; + vlByte *lpCompressedImage = nullptr; try { // Pre-emptively compress the image z_stream zStream = { 0 }; - if (deflateInit(&zStream, iCompressionLevel) != Z_OK) + if ( deflateInit( &zStream, iCompressionLevel ) != Z_OK ) { - LastError.Set("Unable to initialise VTF decompression stream!\n"); + LastError.Set( "Unable to initialise VTF decompression stream!\n" ); throw 0; } // Create upper-bound buffer for deflate - vlULong ulMaxDeflateSize = deflateBound(&zStream, this->uiImageBufferSize); + vlULong ulMaxDeflateSize = deflateBound( &zStream, this->uiImageBufferSize ); lpCompressedImage = new vlByte[ulMaxDeflateSize]; - memset(lpCompressedImage, 0, ulMaxDeflateSize); + memset( lpCompressedImage, 0, ulMaxDeflateSize ); vlULong ulActualDeflateSize = 0; // Actually do the compression - for (vlInt iMip = this->Header->MipCount - 1; iMip >= 0; --iMip) + for ( vlInt iMip = this->Header->MipCount - 1; iMip >= 0; --iMip ) { - vlInt iMipSize = ComputeMipmapSize(this->Header->Width, this->Header->Height, 1, iMip, this->Header->ImageFormat); + vlInt iMipSize = ComputeMipmapSize( this->Header->Width, this->Header->Height, 1, iMip, this->Header->ImageFormat ); - for (vlUInt uiFrame = 0; uiFrame < this->Header->Frames; ++uiFrame) + for ( vlUInt uiFrame = 0; uiFrame < this->Header->Frames; ++uiFrame ) { - for (vlUInt uiFace = 0; uiFace < GetFaceCount(); ++uiFace) + for ( vlUInt uiFace = 0; uiFace < GetFaceCount(); ++uiFace ) { - vlByte* lpMipBits = GetData(uiFrame, uiFace, 0, iMip); + vlByte *lpMipBits = GetData( uiFrame, uiFace, 0, iMip ); // Compress lpMipBits -> Next free data of lpCompressedImage zStream.next_in = lpMipBits; zStream.avail_in = iMipSize; zStream.total_out = 0; - while (zStream.avail_in) + while ( zStream.avail_in ) { vlULong ulTotalWritten = ulActualDeflateSize + zStream.total_out; zStream.next_out = lpCompressedImage + ulTotalWritten; zStream.avail_out = ulMaxDeflateSize - ulTotalWritten; - vlInt zRet = deflate(&zStream, Z_FINISH); - if ((zRet != Z_OK) && (zRet != Z_STREAM_END)) + vlInt zRet = deflate( &zStream, Z_FINISH ); + if ( ( zRet != Z_OK ) && ( zRet != Z_STREAM_END ) ) { - LastError.Set("Unable to compress VTF!\n"); - 
deflateEnd(&zStream); + LastError.Set( "Unable to compress VTF!\n" ); + deflateEnd( &zStream ); throw 0; } } @@ -1455,22 +1885,22 @@ vlBool CVTFFile::SaveCompressed(IO::Writers::IWriter* Writer, vlInt iCompression // Update info and size ulActualDeflateSize += zStream.total_out; - vlUInt uiInfoOffset = GetAuxInfoOffset(uiFrame, uiFace, iMip); - SVTFAuxCompressionInfoEntry* pInfoEntry = (SVTFAuxCompressionInfoEntry*)(lpCompressionInfo + uiInfoOffset); + vlUInt uiInfoOffset = GetAuxInfoOffset( uiFrame, uiFace, iMip ); + SVTFAuxCompressionInfoEntry *pInfoEntry = (SVTFAuxCompressionInfoEntry *)( lpCompressionInfo + uiInfoOffset ); pInfoEntry->CompressedSize = zStream.total_out; - deflateReset(&zStream); + deflateReset( &zStream ); } } } // We now have a compressed image and filled out aux compression info, so we can continue saving in a slightly modified way. - if (!Writer->Open()) + if ( !Writer->Open() ) throw 0; // Write the header to reserve space. This will be recalculated later, but we save with dummy data at first. - if (Writer->Write(this->Header, this->Header->HeaderSize) != this->Header->HeaderSize) + if ( Writer->Write( this->Header, this->Header->HeaderSize ) != this->Header->HeaderSize ) { throw 0; } @@ -1481,74 +1911,74 @@ vlBool CVTFFile::SaveCompressed(IO::Writers::IWriter* Writer, vlInt iCompression vlULong ulFileOffset = this->Header->HeaderSize; // Resources are guaranteed for compression-compatible VTF - for (vlUInt i = 0; i < this->Header->ResourceCount; i++) + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) { modHeader.Resources[i].Data = ulFileOffset; - switch (this->Header->Resources[i].Type) - { - case VTF_LEGACY_RSRC_LOW_RES_IMAGE: - if (Writer->Write(this->lpThumbnailImageData, this->uiThumbnailBufferSize) != this->uiThumbnailBufferSize) - { - throw 0; - } - - ulFileOffset += this->uiThumbnailBufferSize; - break; - case VTF_LEGACY_RSRC_IMAGE: + switch ( this->Header->Resources[i].Type ) { - if (Writer->Write(lpCompressedImage, ulActualDeflateSize) != ulActualDeflateSize) - { - throw 0; - } + case VTF_LEGACY_RSRC_LOW_RES_IMAGE: + if ( Writer->Write( this->lpThumbnailImageData, this->uiThumbnailBufferSize ) != this->uiThumbnailBufferSize ) + { + throw 0; + } - ulFileOffset += ulActualDeflateSize; - break; - } - case VTF_RSRC_AUX_COMPRESSION_INFO: - case VTF_RSRC_AUX_COMPRESSION_INFO | RSRCF_HAS_NO_DATA_CHUNK: - { - vlUInt uiCompressionInfoSize = ulCompressionInfoSize; - if (Writer->Write(&uiCompressionInfoSize, sizeof(vlUInt)) != sizeof(vlUInt)) + ulFileOffset += this->uiThumbnailBufferSize; + break; + case VTF_LEGACY_RSRC_IMAGE: { - throw 0; - } + if ( Writer->Write( lpCompressedImage, ulActualDeflateSize ) != ulActualDeflateSize ) + { + throw 0; + } - if (Writer->Write(lpCompressionInfo, ulCompressionInfoSize) != ulCompressionInfoSize) - { - throw 0; + ulFileOffset += ulActualDeflateSize; + break; } - - ulFileOffset += ulCompressionInfoSize; - break; - } - default: - if ((this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK) == 0) + case VTF_RSRC_AUX_COMPRESSION_INFO: + case VTF_RSRC_AUX_COMPRESSION_INFO | RSRCF_HAS_NO_DATA_CHUNK: { - if (Writer->Write(&this->Header->Data[i].Size, sizeof(vlUInt)) != sizeof(vlUInt)) + vlUInt uiCompressionInfoSize = ulCompressionInfoSize; + if ( Writer->Write( &uiCompressionInfoSize, sizeof( vlUInt ) ) != sizeof( vlUInt ) ) { throw 0; } - if (Writer->Write(this->Header->Data[i].Data, this->Header->Data[i].Size) != this->Header->Data[i].Size) + if ( Writer->Write( lpCompressionInfo, ulCompressionInfoSize ) != 
ulCompressionInfoSize ) { throw 0; } - ulFileOffset += sizeof(vlUInt) + this->Header->Data[i].Size; + ulFileOffset += ulCompressionInfoSize; + break; } + default: + if ( ( this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK ) == 0 ) + { + if ( Writer->Write( &this->Header->Data[i].Size, sizeof( vlUInt ) ) != sizeof( vlUInt ) ) + { + throw 0; + } + + if ( Writer->Write( this->Header->Data[i].Data, this->Header->Data[i].Size ) != this->Header->Data[i].Size ) + { + throw 0; + } + + ulFileOffset += sizeof( vlUInt ) + this->Header->Data[i].Size; + } } } // Write modified header. - Writer->Seek(0, SEEK_SET); + Writer->Seek( 0, SEEK_SET ); - if (Writer->Write(&modHeader, this->Header->HeaderSize) != this->Header->HeaderSize) + if ( Writer->Write( &modHeader, this->Header->HeaderSize ) != this->Header->HeaderSize ) { throw 0; } } - catch (...) + catch ( ... ) { delete[] lpCompressionInfo; delete[] lpCompressedImage; @@ -1571,7 +2001,7 @@ vlBool CVTFFile::SaveCompressed(IO::Writers::IWriter* Writer, vlInt iCompression // vlBool CVTFFile::GetHasImage() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; return this->lpImageData != 0; @@ -1583,7 +2013,7 @@ vlBool CVTFFile::GetHasImage() const // vlUInt CVTFFile::GetMajorVersion() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->Version[0]; @@ -1595,7 +2025,7 @@ vlUInt CVTFFile::GetMajorVersion() const // vlUInt CVTFFile::GetMinorVersion() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->Version[1]; @@ -1605,20 +2035,21 @@ vlUInt CVTFFile::GetMinorVersion() const // SetVersion // Sets the version of the VTF // -bool CVTFFile::SetVersion(vlUInt major, vlUInt minor) +bool CVTFFile::SetVersion( vlUInt major, vlUInt minor ) { - if (major != 7 || minor < 1 || minor > 6) + if ( major != 7 || minor < 1 || minor > 6 ) return false; bool didSupportResources = GetSupportsResources(); - + Header->Version[0] = major; Header->Version[1] = minor; bool doesSupportResources = GetSupportsResources(); // Add new resources for compatibility if we didn't previously - if (!didSupportResources && doesSupportResources) { + if ( !didSupportResources && doesSupportResources ) + { this->Header->Resources[this->Header->ResourceCount++].Type = VTF_LEGACY_RSRC_LOW_RES_IMAGE; this->Header->Resources[this->Header->ResourceCount++].Type = VTF_LEGACY_RSRC_IMAGE; } @@ -1632,69 +2063,69 @@ bool CVTFFile::SetVersion(vlUInt major, vlUInt minor) // vlVoid CVTFFile::ComputeResources() { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return; // Correct resource count. - if(!this->GetSupportsResources()) + if ( !this->GetSupportsResources() ) { this->Header->ResourceCount = 0; } // Correct header size. - STATIC_ASSERT(VTF_MAJOR_VERSION == 7, "HeaderSize needs calculation for new major version."); - STATIC_ASSERT(VTF_MINOR_VERSION == 6, "HeaderSize needs calculation for new minor version."); - switch(this->Header->Version[0]) + STATIC_ASSERT( VTF_MAJOR_VERSION == 7, "HeaderSize needs calculation for new major version." ); + STATIC_ASSERT( VTF_MINOR_VERSION == 6, "HeaderSize needs calculation for new minor version." 
); + switch ( this->Header->Version[0] ) { - case 7: - switch(this->Header->Version[1]) - { - case 0: - this->Header->HeaderSize = sizeof(SVTFHeader_70_A); - break; - case 1: - this->Header->HeaderSize = sizeof(SVTFHeader_71_A); - break; - case 2: - this->Header->HeaderSize = sizeof(SVTFHeader_72_A); - break; - case 3: - this->Header->HeaderSize = sizeof(SVTFHeader_73_A) + this->Header->ResourceCount * sizeof(SVTFResource); - break; - case 4: - this->Header->HeaderSize = sizeof(SVTFHeader_74_A) + this->Header->ResourceCount * sizeof(SVTFResource); - break; - case 5: - this->Header->HeaderSize = sizeof(SVTFHeader_75_A) + this->Header->ResourceCount * sizeof(SVTFResource); - break; - case 6: - this->Header->HeaderSize = sizeof(SVTFHeader_76_A) + this->Header->ResourceCount * sizeof(SVTFResource); + case 7: + switch ( this->Header->Version[1] ) + { + case 0: + this->Header->HeaderSize = sizeof( SVTFHeader_70_A ); + break; + case 1: + this->Header->HeaderSize = sizeof( SVTFHeader_71_A ); + break; + case 2: + this->Header->HeaderSize = sizeof( SVTFHeader_72_A ); + break; + case 3: + this->Header->HeaderSize = sizeof( SVTFHeader_73_A ) + this->Header->ResourceCount * sizeof( SVTFResource ); + break; + case 4: + this->Header->HeaderSize = sizeof( SVTFHeader_74_A ) + this->Header->ResourceCount * sizeof( SVTFResource ); + break; + case 5: + this->Header->HeaderSize = sizeof( SVTFHeader_75_A ) + this->Header->ResourceCount * sizeof( SVTFResource ); + break; + case 6: + this->Header->HeaderSize = sizeof( SVTFHeader_76_A ) + this->Header->ResourceCount * sizeof( SVTFResource ); + break; + } break; - } - break; } // Correct resource offsets. vlUInt uiOffset = this->Header->HeaderSize; - for(vlUInt i = 0; i < this->Header->ResourceCount; i++) + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) { - switch(this->Header->Resources[i].Type) + switch ( this->Header->Resources[i].Type ) { - case VTF_LEGACY_RSRC_LOW_RES_IMAGE: - this->Header->Resources[i].Data = uiOffset; - uiOffset += this->uiThumbnailBufferSize; - break; - case VTF_LEGACY_RSRC_IMAGE: - this->Header->Resources[i].Data = uiOffset; - uiOffset += this->uiImageBufferSize; - break; - default: - if((this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK) == 0) - { + case VTF_LEGACY_RSRC_LOW_RES_IMAGE: this->Header->Resources[i].Data = uiOffset; - uiOffset += sizeof(vlUInt) + this->Header->Data[i].Size; - } - break; + uiOffset += this->uiThumbnailBufferSize; + break; + case VTF_LEGACY_RSRC_IMAGE: + this->Header->Resources[i].Data = uiOffset; + uiOffset += this->uiImageBufferSize; + break; + default: + if ( ( this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK ) == 0 ) + { + this->Header->Resources[i].Data = uiOffset; + uiOffset += sizeof( vlUInt ) + this->Header->Data[i].Size; + } + break; } } } @@ -1705,25 +2136,25 @@ vlVoid CVTFFile::ComputeResources() // vlUInt CVTFFile::GetSize() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; vlUInt uiResourceSize = 0; - if(this->GetSupportsResources()) + if ( this->GetSupportsResources() ) { - for(vlUInt i = 0; i < this->Header->ResourceCount; i++) + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) { - switch(this->Header->Resources[i].Type) + switch ( this->Header->Resources[i].Type ) { - case VTF_LEGACY_RSRC_LOW_RES_IMAGE: - case VTF_LEGACY_RSRC_IMAGE: - break; - default: - if((this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK) == 0) - { - uiResourceSize += sizeof(vlUInt) + this->Header->Data[i].Size; - } - break; + case 
VTF_LEGACY_RSRC_LOW_RES_IMAGE: + case VTF_LEGACY_RSRC_IMAGE: + break; + default: + if ( ( this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK ) == 0 ) + { + uiResourceSize += sizeof( vlUInt ) + this->Header->Data[i].Size; + } + break; } } } @@ -1737,7 +2168,7 @@ vlUInt CVTFFile::GetSize() const // vlUInt CVTFFile::GetWidth() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->Width; @@ -1749,7 +2180,7 @@ vlUInt CVTFFile::GetWidth() const // vlUInt CVTFFile::GetHeight() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->Height; @@ -1761,7 +2192,7 @@ vlUInt CVTFFile::GetHeight() const // vlUInt CVTFFile::GetDepth() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->Depth; @@ -1773,7 +2204,7 @@ vlUInt CVTFFile::GetDepth() const // vlUInt CVTFFile::GetFrameCount() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->Frames; @@ -1787,10 +2218,10 @@ vlUInt CVTFFile::GetFrameCount() const //--------------------------------------------------------------------------------- vlUInt CVTFFile::GetFaceCount() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; - return this->Header->Flags & TEXTUREFLAGS_ENVMAP ? (this->Header->StartFrame != 0xffff && this->Header->Version[1] < VTF_MINOR_VERSION_MIN_NO_SPHERE_MAP ? CUBEMAP_FACE_COUNT : CUBEMAP_FACE_COUNT - 1) : 1; + return this->Header->Flags & TEXTUREFLAGS_ENVMAP ? ( this->Header->StartFrame != 0xffff && this->Header->Version[1] < VTF_MINOR_VERSION_MIN_NO_SPHERE_MAP ? CUBEMAP_FACE_COUNT : CUBEMAP_FACE_COUNT - 1 ) : 1; } // @@ -1799,7 +2230,7 @@ vlUInt CVTFFile::GetFaceCount() const // vlUInt CVTFFile::GetMipmapCount() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->MipCount; @@ -1813,7 +2244,7 @@ vlUInt CVTFFile::GetMipmapCount() const // vlUInt CVTFFile::GetStartFrame() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->StartFrame; @@ -1823,9 +2254,9 @@ vlUInt CVTFFile::GetStartFrame() const // SetStartFrame() // Sets the first frame in the animation sequence. // -vlVoid CVTFFile::SetStartFrame(vlUInt uiStartFrame) +vlVoid CVTFFile::SetStartFrame( vlUInt uiStartFrame ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return; // Note: Valve informs us that animated enviroment maps ARE possible. @@ -1833,12 +2264,12 @@ vlVoid CVTFFile::SetStartFrame(vlUInt uiStartFrame) // maps without sphere maps. This is trivial... // Don't let the user set the start frame of an enviroment map. - if(this->Header->Flags & TEXTUREFLAGS_ENVMAP) + if ( this->Header->Flags & TEXTUREFLAGS_ENVMAP ) { return; } - if(uiStartFrame >= (vlUInt)this->Header->Frames) + if ( uiStartFrame >= (vlUInt)this->Header->Frames ) { uiStartFrame = (vlUInt)this->Header->Frames - 1; } @@ -1853,7 +2284,7 @@ vlVoid CVTFFile::SetStartFrame(vlUInt uiStartFrame) // vlUInt CVTFFile::GetFlags() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->Flags; @@ -1863,13 +2294,13 @@ vlUInt CVTFFile::GetFlags() const // Sets the flags associated with the image. These flags // are stored in the VTFImageFlag enumeration. // -vlVoid CVTFFile::SetFlags(vlUInt uiFlags) +vlVoid CVTFFile::SetFlags( vlUInt uiFlags ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return; // Don't let the user set flags critical to the image's format. 
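SetFlags() below guards format-critical bits by copying them back from the header before storing the caller's mask; note that its second TEXTUREFLAGS_ENVMAP block looks like a copy-paste slip and was presumably meant for TEXTUREFLAGS_ONEBITALPHA, which SetFlag() does treat as critical. The same guard can be written with a single mask. The flag values below are illustrative placeholders, not the real VTFImageFlag constants; only the masking pattern is taken from the code:

    #include <cstdint>

    // Placeholder bit values for illustration; the real ones live in the
    // VTFImageFlag enumeration.
    constexpr uint32_t FLAG_ONEBITALPHA   = 0x00001000;
    constexpr uint32_t FLAG_EIGHTBITALPHA = 0x00002000;
    constexpr uint32_t FLAG_ENVMAP        = 0x00004000;

    // Bits that describe the stored image itself; SetFlag() refuses to toggle
    // exactly these three.
    constexpr uint32_t FORMAT_CRITICAL = FLAG_ONEBITALPHA | FLAG_EIGHTBITALPHA | FLAG_ENVMAP;

    // Mask-based equivalent of the per-flag if/else chain: take the caller's bits
    // for everything non-critical, keep the header's bits for the critical ones.
    static uint32_t ApplyFlags(uint32_t currentHeaderFlags, uint32_t requestedFlags)
    {
        return (requestedFlags & ~FORMAT_CRITICAL) | (currentHeaderFlags & FORMAT_CRITICAL);
    }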
- //if(this->Header->Version[0] < VTF_MAJOR_VERSION || (this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] <= VTF_MINOR_VERSION_MIN_RESOURCE)) + // if(this->Header->Version[0] < VTF_MAJOR_VERSION || (this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] <= VTF_MINOR_VERSION_MIN_RESOURCE)) //{ // if(this->Header->Flags & TEXTUREFLAGS_DEPRECATED_NOCOMPRESS) // uiFlags |= TEXTUREFLAGS_DEPRECATED_NOCOMPRESS; @@ -1877,17 +2308,17 @@ vlVoid CVTFFile::SetFlags(vlUInt uiFlags) // uiFlags &= ~TEXTUREFLAGS_DEPRECATED_NOCOMPRESS; //} - if(this->Header->Flags & TEXTUREFLAGS_EIGHTBITALPHA) + if ( this->Header->Flags & TEXTUREFLAGS_EIGHTBITALPHA ) uiFlags |= TEXTUREFLAGS_EIGHTBITALPHA; else uiFlags &= ~TEXTUREFLAGS_EIGHTBITALPHA; - if(this->Header->Flags & TEXTUREFLAGS_ENVMAP) + if ( this->Header->Flags & TEXTUREFLAGS_ENVMAP ) uiFlags |= TEXTUREFLAGS_ENVMAP; else uiFlags &= ~TEXTUREFLAGS_ENVMAP; - if(this->Header->Flags & TEXTUREFLAGS_ENVMAP) + if ( this->Header->Flags & TEXTUREFLAGS_ENVMAP ) uiFlags |= TEXTUREFLAGS_ENVMAP; else uiFlags &= ~TEXTUREFLAGS_ENVMAP; @@ -1899,12 +2330,12 @@ vlVoid CVTFFile::SetFlags(vlUInt uiFlags) // GetFlag() // Gets the status of the specified flag in the image. // -vlBool CVTFFile::GetFlag(VTFImageFlag ImageFlag) const +vlBool CVTFFile::GetFlag( VTFImageFlag ImageFlag ) const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; - return (this->Header->Flags & ImageFlag) != 0; + return ( this->Header->Flags & ImageFlag ) != 0; } // @@ -1912,26 +2343,26 @@ vlBool CVTFFile::GetFlag(VTFImageFlag ImageFlag) const // Sets the flag ImageFlag to bState (set or not set). Flags critical // to the image's format cannot be set. // -vlVoid CVTFFile::SetFlag(VTFImageFlag ImageFlag, vlBool bState) +vlVoid CVTFFile::SetFlag( VTFImageFlag ImageFlag, vlBool bState ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return; - //if(this->Header->Version[0] < VTF_MAJOR_VERSION || (this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] <= VTF_MINOR_VERSION_MIN_RESOURCE)) + // if(this->Header->Version[0] < VTF_MAJOR_VERSION || (this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] <= VTF_MINOR_VERSION_MIN_RESOURCE)) //{ // if(ImageFlag == TEXTUREFLAGS_DEPRECATED_NOCOMPRESS) // { // return; // } - //} + // } // Don't let the user set flags critical to the image's format. - if(ImageFlag == TEXTUREFLAGS_ONEBITALPHA || ImageFlag == TEXTUREFLAGS_EIGHTBITALPHA || ImageFlag == TEXTUREFLAGS_ENVMAP) + if ( ImageFlag == TEXTUREFLAGS_ONEBITALPHA || ImageFlag == TEXTUREFLAGS_EIGHTBITALPHA || ImageFlag == TEXTUREFLAGS_ENVMAP ) { return; } - if(bState) + if ( bState ) { this->Header->Flags |= ImageFlag; } @@ -1947,7 +2378,7 @@ vlVoid CVTFFile::SetFlag(VTFImageFlag ImageFlag, vlBool bState) // vlSingle CVTFFile::GetBumpmapScale() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0.0f; return this->Header->BumpScale; @@ -1957,9 +2388,9 @@ vlSingle CVTFFile::GetBumpmapScale() const // SetBumpmapScale() // Sets the bumpmap scale of the image. // -vlVoid CVTFFile::SetBumpmapScale(vlSingle sBumpmapScale) +vlVoid CVTFFile::SetBumpmapScale( vlSingle sBumpmapScale ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return; this->Header->BumpScale = sBumpmapScale; @@ -1969,9 +2400,9 @@ vlVoid CVTFFile::SetBumpmapScale(vlSingle sBumpmapScale) // GetReflectivity() // Gets the reflectivity of the image. 
// -vlVoid CVTFFile::GetReflectivity(vlSingle &sX, vlSingle &sY, vlSingle &sZ) const +vlVoid CVTFFile::GetReflectivity( vlSingle &sX, vlSingle &sY, vlSingle &sZ ) const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return; sX = this->Header->Reflectivity[0]; @@ -1983,9 +2414,9 @@ vlVoid CVTFFile::GetReflectivity(vlSingle &sX, vlSingle &sY, vlSingle &sZ) const // SetReflectivity() // Sets the reflectivity of the image. // -vlVoid CVTFFile::SetReflectivity(vlSingle sX, vlSingle sY, vlSingle sZ) +vlVoid CVTFFile::SetReflectivity( vlSingle sX, vlSingle sY, vlSingle sZ ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return; this->Header->Reflectivity[0] = sX; @@ -1999,7 +2430,7 @@ vlVoid CVTFFile::SetReflectivity(vlSingle sX, vlSingle sY, vlSingle sZ) // VTFImageFormat CVTFFile::GetFormat() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return IMAGE_FORMAT_NONE; return this->Header->ImageFormat; @@ -2010,13 +2441,13 @@ VTFImageFormat CVTFFile::GetFormat() const // Gets the image data of the specified frame, face and mipmap in the format // of the image. // -vlByte *CVTFFile::GetData(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel) const +vlByte *CVTFFile::GetData( vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel ) const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; - vlUInt uiOffset = this->ComputeDataOffset(uiFrame, uiFace, uiSlice, uiMipmapLevel, this->Header->ImageFormat); - assert(uiOffset < this->uiImageBufferSize); + vlUInt uiOffset = this->ComputeDataOffset( uiFrame, uiFace, uiSlice, uiMipmapLevel, this->Header->ImageFormat ); + assert( uiOffset < this->uiImageBufferSize ); return this->lpImageData + uiOffset; } @@ -2026,15 +2457,15 @@ vlByte *CVTFFile::GetData(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt // Sets the image data of the specified frame, face and mipmap. Image data // must be in the format of the image. 
// -vlVoid CVTFFile::SetData(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel, vlByte *lpData) +vlVoid CVTFFile::SetData( vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel, vlByte *lpData ) { - if(!this->IsLoaded() || this->lpImageData == 0) + if ( !this->IsLoaded() || this->lpImageData == 0 ) return; - vlUInt uiOffset = this->ComputeDataOffset(uiFrame, uiFace, uiSlice, uiMipmapLevel, this->Header->ImageFormat); - assert(uiOffset < this->uiImageBufferSize); + vlUInt uiOffset = this->ComputeDataOffset( uiFrame, uiFace, uiSlice, uiMipmapLevel, this->Header->ImageFormat ); + assert( uiOffset < this->uiImageBufferSize ); - memcpy(this->lpImageData + uiOffset, lpData, CVTFFile::ComputeMipmapSize(this->Header->Width, this->Header->Height, 1, uiMipmapLevel, this->Header->ImageFormat)); + memcpy( this->lpImageData + uiOffset, lpData, CVTFFile::ComputeMipmapSize( this->Header->Width, this->Header->Height, 1, uiMipmapLevel, this->Header->ImageFormat ) ); } // @@ -2044,7 +2475,7 @@ vlVoid CVTFFile::SetData(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt u // vlBool CVTFFile::GetHasThumbnail() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; return this->Header->LowResImageFormat != IMAGE_FORMAT_NONE; @@ -2056,7 +2487,7 @@ vlBool CVTFFile::GetHasThumbnail() const // vlUInt CVTFFile::GetThumbnailWidth() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->LowResImageWidth; @@ -2068,7 +2499,7 @@ vlUInt CVTFFile::GetThumbnailWidth() const // vlUInt CVTFFile::GetThumbnailHeight() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->Header->LowResImageHeight; @@ -2080,7 +2511,7 @@ vlUInt CVTFFile::GetThumbnailHeight() const // VTFImageFormat CVTFFile::GetThumbnailFormat() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return IMAGE_FORMAT_NONE; return this->Header->LowResImageFormat; @@ -2094,7 +2525,7 @@ VTFImageFormat CVTFFile::GetThumbnailFormat() const // vlByte *CVTFFile::GetThumbnailData() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return 0; return this->lpThumbnailImageData; @@ -2104,49 +2535,49 @@ vlByte *CVTFFile::GetThumbnailData() const // SetThumbnailData() // Sets the thumbnail image data. Image data must be in the format of the image. 
// -vlVoid CVTFFile::SetThumbnailData(vlByte *lpData) +vlVoid CVTFFile::SetThumbnailData( vlByte *lpData ) { - if(!this->IsLoaded() || this->lpThumbnailImageData == 0) + if ( !this->IsLoaded() || this->lpThumbnailImageData == 0 ) return; - memcpy(this->lpThumbnailImageData, lpData, this->uiThumbnailBufferSize/*CVTFFile::ComputeImageSize(this->Header->LowResImageWidth, this->Header->LowResImageHeight, this->Header->LowResImageFormat)*/); + memcpy( this->lpThumbnailImageData, lpData, this->uiThumbnailBufferSize /*CVTFFile::ComputeImageSize(this->Header->LowResImageWidth, this->Header->LowResImageHeight, this->Header->LowResImageFormat)*/ ); } vlBool CVTFFile::GetSupportsResources() const { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; - return this->Header->Version[0] > VTF_MAJOR_VERSION || (this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] >= VTF_MINOR_VERSION_MIN_RESOURCE); + return this->Header->Version[0] > VTF_MAJOR_VERSION || ( this->Header->Version[0] == VTF_MAJOR_VERSION && this->Header->Version[1] >= VTF_MINOR_VERSION_MIN_RESOURCE ); } vlUInt CVTFFile::GetResourceCount() const { - if(!this->GetSupportsResources()) + if ( !this->GetSupportsResources() ) return 0; return this->Header->ResourceCount; } -vlUInt CVTFFile::GetResourceType(vlUInt uiIndex) const +vlUInt CVTFFile::GetResourceType( vlUInt uiIndex ) const { - if(!this->GetSupportsResources()) + if ( !this->GetSupportsResources() ) return 0; - if(uiIndex >= this->Header->ResourceCount) + if ( uiIndex >= this->Header->ResourceCount ) return 0; return this->Header->Resources[uiIndex].Type; } -vlBool CVTFFile::GetHasResource(vlUInt uiType) const +vlBool CVTFFile::GetHasResource( vlUInt uiType ) const { - if(!this->GetSupportsResources()) + if ( !this->GetSupportsResources() ) return vlFalse; - for(vlUInt i = 0; i < this->Header->ResourceCount; i++) + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) { - if(this->Header->Resources[i].Type == uiType) + if ( this->Header->Resources[i].Type == uiType ) { return vlTrue; } @@ -2155,45 +2586,45 @@ vlBool CVTFFile::GetHasResource(vlUInt uiType) const return vlFalse; } -vlVoid *CVTFFile::GetResourceData(vlUInt uiType, vlUInt &uiSize) const +vlVoid *CVTFFile::GetResourceData( vlUInt uiType, vlUInt &uiSize ) const { - if(this->IsLoaded()) + if ( this->IsLoaded() ) { - if(this->GetSupportsResources()) + if ( this->GetSupportsResources() ) { - switch(uiType) + switch ( uiType ) { - case VTF_LEGACY_RSRC_LOW_RES_IMAGE: - uiSize = this->uiThumbnailBufferSize; - return this->lpThumbnailImageData; - break; - case VTF_LEGACY_RSRC_IMAGE: - uiSize = this->uiImageBufferSize; - return this->lpImageData; - break; - default: - for(vlUInt i = 0; i < this->Header->ResourceCount; i++) - { - if(this->Header->Resources[i].Type == uiType) + case VTF_LEGACY_RSRC_LOW_RES_IMAGE: + uiSize = this->uiThumbnailBufferSize; + return this->lpThumbnailImageData; + break; + case VTF_LEGACY_RSRC_IMAGE: + uiSize = this->uiImageBufferSize; + return this->lpImageData; + break; + default: + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) { - if(this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK) - { - uiSize = sizeof(vlUInt); - return &this->Header->Resources[i].Data; - } - else + if ( this->Header->Resources[i].Type == uiType ) { - uiSize = this->Header->Data[i].Size; - return this->Header->Data[i].Data; + if ( this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK ) + { + uiSize = sizeof( vlUInt ); + return &this->Header->Resources[i].Data; + } + 
else + { + uiSize = this->Header->Data[i].Size; + return this->Header->Data[i].Data; + } } } - } - break; + break; } } else { - LastError.Set("Resources require VTF file version v7.3 and up."); + LastError.Set( "Resources require VTF file version v7.3 and up." ); } } @@ -2201,138 +2632,138 @@ vlVoid *CVTFFile::GetResourceData(vlUInt uiType, vlUInt &uiSize) const return 0; } -vlVoid *CVTFFile::SetResourceData(vlUInt uiType, vlUInt uiSize, vlVoid *lpData) +vlVoid *CVTFFile::SetResourceData( vlUInt uiType, vlUInt uiSize, vlVoid *lpData ) { - if(this->IsLoaded()) + if ( this->IsLoaded() ) { - if(this->GetSupportsResources()) + if ( this->GetSupportsResources() ) { - switch(uiType) + switch ( uiType ) { - case VTF_LEGACY_RSRC_LOW_RES_IMAGE: - LastError.Set("Low resolution image resource cannot be modified through resource interface."); - break; - case VTF_LEGACY_RSRC_IMAGE: - LastError.Set("Image resource cannot be modified through resource interface."); - break; - default: - for(vlUInt i = 0; i < this->Header->ResourceCount; i++) - { - if(this->Header->Resources[i].Type == uiType) + case VTF_LEGACY_RSRC_LOW_RES_IMAGE: + LastError.Set( "Low resolution image resource cannot be modified through resource interface." ); + break; + case VTF_LEGACY_RSRC_IMAGE: + LastError.Set( "Image resource cannot be modified through resource interface." ); + break; + default: + for ( vlUInt i = 0; i < this->Header->ResourceCount; i++ ) { - if(uiSize == 0) - { - delete []this->Header->Data[i].Data; - for(vlUInt j = i + 1; j < this->Header->ResourceCount; j++) - { - this->Header->Resources[j - 1] = this->Header->Resources[j]; - this->Header->Data[j - 1] = this->Header->Data[j]; - } - this->Header->ResourceCount--; - this->ComputeResources(); - return 0; - } - else + if ( this->Header->Resources[i].Type == uiType ) { - if(this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK) + if ( uiSize == 0 ) { - if(uiSize != sizeof(vlUInt)) - { - LastError.Set("Resources with no data chunk must have size 4."); - return 0; - } - if(lpData == 0) - { - this->Header->Resources[i].Data= 0; - } - else if(&this->Header->Resources[i].Data != lpData) + delete[] this->Header->Data[i].Data; + for ( vlUInt j = i + 1; j < this->Header->ResourceCount; j++ ) { - this->Header->Resources[i].Data = *(vlUInt *)lpData; + this->Header->Resources[j - 1] = this->Header->Resources[j]; + this->Header->Data[j - 1] = this->Header->Data[j]; } - return &this->Header->Resources[i].Data; + this->Header->ResourceCount--; + this->ComputeResources(); + return 0; } else { - if(this->Header->Data[i].Size != uiSize) - { - delete []this->Header->Data[i].Data; - this->Header->Data[i].Size = uiSize; - this->Header->Data[i].Data = new vlByte[uiSize]; - this->ComputeResources(); - } - if(lpData == 0) + if ( this->Header->Resources[i].Flags & RSRCF_HAS_NO_DATA_CHUNK ) { - memset(this->Header->Data[i].Data, 0, this->Header->Data[i].Size); + if ( uiSize != sizeof( vlUInt ) ) + { + LastError.Set( "Resources with no data chunk must have size 4." 
); + return 0; + } + if ( lpData == 0 ) + { + this->Header->Resources[i].Data = 0; + } + else if ( &this->Header->Resources[i].Data != lpData ) + { + this->Header->Resources[i].Data = *(vlUInt *)lpData; + } + return &this->Header->Resources[i].Data; } - else if(this->Header->Data[i].Data != lpData) + else { - memcpy(this->Header->Data[i].Data, lpData, this->Header->Data[i].Size); + if ( this->Header->Data[i].Size != uiSize ) + { + delete[] this->Header->Data[i].Data; + this->Header->Data[i].Size = uiSize; + this->Header->Data[i].Data = new vlByte[uiSize]; + this->ComputeResources(); + } + if ( lpData == 0 ) + { + memset( this->Header->Data[i].Data, 0, this->Header->Data[i].Size ); + } + else if ( this->Header->Data[i].Data != lpData ) + { + memcpy( this->Header->Data[i].Data, lpData, this->Header->Data[i].Size ); + } + return this->Header->Data[i].Data; } - return this->Header->Data[i].Data; - } - } - } - } - - // Resource not found. - if(uiSize != 0) - { - if(this->Header->ResourceCount == VTF_RSRC_MAX_DICTIONARY_ENTRIES) - { - LastError.SetFormatted("Maximum directory entry count %u reached.", VTF_RSRC_MAX_DICTIONARY_ENTRIES); - return 0; + } + } } - vlUInt uiIndex = this->Header->ResourceCount; - - this->Header->Resources[uiIndex].Type = uiType; - this->Header->Resources[uiIndex].Data = 0; - - this->Header->Data[uiIndex].Size = 0; - this->Header->Data[uiIndex].Data = 0; - - if(this->Header->Resources[uiIndex].Flags & RSRCF_HAS_NO_DATA_CHUNK) + // Resource not found. + if ( uiSize != 0 ) { - if(uiSize != sizeof(vlUInt)) + if ( this->Header->ResourceCount == VTF_RSRC_MAX_DICTIONARY_ENTRIES ) { - LastError.Set("Resources with no data chunk must have size 4."); + LastError.SetFormatted( "Maximum directory entry count %u reached.", VTF_RSRC_MAX_DICTIONARY_ENTRIES ); return 0; } - if(lpData != 0) - { - this->Header->Resources[uiIndex].Data = *(vlUInt *)lpData; - } - else - { - this->Header->Resources[uiIndex].Data = 0; - } - this->Header->ResourceCount++; - this->ComputeResources(); - return &this->Header->Resources[uiIndex].Data; - } - else - { - this->Header->Data[uiIndex].Size = uiSize; - this->Header->Data[uiIndex].Data = new vlByte[uiSize]; - if(lpData != 0) + + vlUInt uiIndex = this->Header->ResourceCount; + + this->Header->Resources[uiIndex].Type = uiType; + this->Header->Resources[uiIndex].Data = 0; + + this->Header->Data[uiIndex].Size = 0; + this->Header->Data[uiIndex].Data = 0; + + if ( this->Header->Resources[uiIndex].Flags & RSRCF_HAS_NO_DATA_CHUNK ) { - memcpy(this->Header->Data[uiIndex].Data, lpData, this->Header->Data[uiIndex].Size); + if ( uiSize != sizeof( vlUInt ) ) + { + LastError.Set( "Resources with no data chunk must have size 4." 
); + return 0; + } + if ( lpData != 0 ) + { + this->Header->Resources[uiIndex].Data = *(vlUInt *)lpData; + } + else + { + this->Header->Resources[uiIndex].Data = 0; + } + this->Header->ResourceCount++; + this->ComputeResources(); + return &this->Header->Resources[uiIndex].Data; } else { - memset(this->Header->Data[uiIndex].Data, 0, this->Header->Data[uiIndex].Size); + this->Header->Data[uiIndex].Size = uiSize; + this->Header->Data[uiIndex].Data = new vlByte[uiSize]; + if ( lpData != 0 ) + { + memcpy( this->Header->Data[uiIndex].Data, lpData, this->Header->Data[uiIndex].Size ); + } + else + { + memset( this->Header->Data[uiIndex].Data, 0, this->Header->Data[uiIndex].Size ); + } + this->Header->ResourceCount++; + this->ComputeResources(); + return this->Header->Data[uiIndex].Data; } - this->Header->ResourceCount++; - this->ComputeResources(); - return this->Header->Data[uiIndex].Data; } - } - break; + break; } } else { - LastError.Set("Resources require VTF file version v7.3 and up."); + LastError.Set( "Resources require VTF file version v7.3 and up." ); } } @@ -2347,11 +2778,11 @@ vlInt CVTFFile::GetAuxCompressionLevel() const { // Find the compression info and get data out of it vlUInt uiDataSize; - SVTFAuxCompressionInfoHeader* pInfoHeader = (SVTFAuxCompressionInfoHeader*)this->GetResourceData(VTF_RSRC_AUX_COMPRESSION_INFO, uiDataSize); + SVTFAuxCompressionInfoHeader *pInfoHeader = (SVTFAuxCompressionInfoHeader *)this->GetResourceData( VTF_RSRC_AUX_COMPRESSION_INFO, uiDataSize ); - if (!pInfoHeader) + if ( !pInfoHeader ) return 0; - + return pInfoHeader->CompressionLevel; } @@ -2359,41 +2790,41 @@ vlInt CVTFFile::GetAuxCompressionLevel() const // SetAuxCompressionLevel() // Sets the auxiliary compression level of the VTF. Valid levels are 0-9 and SVTFAuxCompressionInfoHeader::DEFAULT_COMPRESSION // -vlBool CVTFFile::SetAuxCompressionLevel(vlInt iCompressionLevel) +vlBool CVTFFile::SetAuxCompressionLevel( vlInt iCompressionLevel ) { - if (this->GetMajorVersion() < 7 || (this->GetMajorVersion() == 7 && this->GetMinorVersion() < 6)) + if ( this->GetMajorVersion() < 7 || ( this->GetMajorVersion() == 7 && this->GetMinorVersion() < 6 ) ) { - LastError.Set("VTF Version <7.6 does not support auxiliary compression."); + LastError.Set( "VTF Version <7.6 does not support auxiliary compression." ); return vlFalse; } SVTFAuxCompressionInfoHeader compressionHeader; compressionHeader.CompressionLevel = iCompressionLevel; - this->SetResourceData(VTF_RSRC_AUX_COMPRESSION_INFO, sizeof(SVTFAuxCompressionInfoHeader), &compressionHeader); + this->SetResourceData( VTF_RSRC_AUX_COMPRESSION_INFO, sizeof( SVTFAuxCompressionInfoHeader ), &compressionHeader ); return vlTrue; } // -// GenerateMipmaps() +// GenerateMipmaps()malloc() // Generate mipmaps from the first mipmap level. 
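Before the GenerateMipmaps() implementation that follows, a brief usage sketch of the setters reformatted above. SetAuxCompressionLevel() is a thin wrapper that stores an SVTFAuxCompressionInfoHeader through SetResourceData(), so enabling DEFLATE on an already-loaded v7.6 file and then rebuilding its mip chain looks roughly like this (method and enum names are taken from this diff; how the CVTFFile was loaded is assumed):

#include "VTFFile.h"

// Hedged sketch: raise the auxiliary (DEFLATE) compression level to 9 and
// rebuild every frame/face mip with a box filter in sRGB space. Both calls
// return vlFalse on failure; SetAuxCompressionLevel() also sets LastError
// for pre-7.6 files, mirroring the version check above.
vlBool RecompressAndRebuildMips( VTFLib::CVTFFile &VTFFile )
{
	if ( !VTFFile.SetAuxCompressionLevel( 9 ) ) // valid levels are 0-9
		return vlFalse;

	return VTFFile.GenerateMipmaps( MIPMAP_FILTER_BOX, vlTrue );
}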
// -vlBool CVTFFile::GenerateMipmaps(VTFMipmapFilter MipmapFilter, vlBool bSRGB) +vlBool CVTFFile::GenerateMipmaps( VTFMipmapFilter MipmapFilter, vlBool bSRGB ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; - if(this->Header->MipCount == 0) + if ( this->Header->MipCount == 0 ) return vlTrue; vlUInt uiFrameCount = this->GetFrameCount(); vlUInt uiFaceCount = this->GetFaceCount(); - for(vlUInt i = 0; i < uiFrameCount; i++) + for ( vlUInt i = 0; i < uiFrameCount; i++ ) { - for(vlUInt j = 0; j < uiFaceCount; j++) + for ( vlUInt j = 0; j < uiFaceCount; j++ ) { - if(!this->GenerateMipmaps(i, j, MipmapFilter, bSRGB)) + if ( !this->GenerateMipmaps( i, j, MipmapFilter, bSRGB ) ) { return vlFalse; } @@ -2407,27 +2838,27 @@ vlBool CVTFFile::GenerateMipmaps(VTFMipmapFilter MipmapFilter, vlBool bSRGB) // GenerateMipmaps() // Generate mipmaps from the first mipmap level of the specified frame and face. // -vlBool CVTFFile::GenerateMipmaps(vlUInt uiFace, vlUInt uiFrame, VTFMipmapFilter MipmapFilter, vlBool bSRGB) +vlBool CVTFFile::GenerateMipmaps( vlUInt uiFace, vlUInt uiFrame, VTFMipmapFilter MipmapFilter, vlBool bSRGB ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; - auto formatInfo = GetImageFormatInfo(GetFormat()); + auto formatInfo = GetImageFormatInfo( GetFormat() ); VTFImageFormat actualFormat = GetFormat(); - vlByte* lpData = (vlByte*)GetData(uiFrame, uiFace, 0, 0); + vlByte *lpData = (vlByte *)GetData( uiFrame, uiFace, 0, 0 ); bool bConverted = false; // If the image is compressed or one of the other unsupported stbir types, we'll convert it to RGBA8888 for processing - if (formatInfo.bIsCompressed || formatInfo.uiAlphaBitsPerPixel < 8 || formatInfo.uiBlueBitsPerPixel < 8 || - formatInfo.uiGreenBitsPerPixel < 8 || formatInfo.uiRedBitsPerPixel < 8) + if ( formatInfo.bIsCompressed || formatInfo.uiAlphaBitsPerPixel < 8 || formatInfo.uiBlueBitsPerPixel < 8 || + formatInfo.uiGreenBitsPerPixel < 8 || formatInfo.uiRedBitsPerPixel < 8 ) { bConverted = true; lpData = new vlByte[GetWidth() * GetHeight() * 4]; - if (!ConvertToRGBA8888(GetData(uiFrame, uiFace, 0, 0), lpData, GetWidth(), GetHeight(), GetFormat())) + if ( !ConvertToRGBA8888( GetData( uiFrame, uiFace, 0, 0 ), lpData, GetWidth(), GetHeight(), GetFormat() ) ) return false; actualFormat = IMAGE_FORMAT_RGBA8888; - formatInfo = GetImageFormatInfo(actualFormat); + formatInfo = GetImageFormatInfo( actualFormat ); } auto uiWidth = GetWidth(); @@ -2436,70 +2867,79 @@ vlBool CVTFFile::GenerateMipmaps(vlUInt uiFace, vlUInt uiFrame, VTFMipmapFilter auto uiMipHeight = uiHeight >> 1; // Alloc a working buffer that will fit all of our mips - vlByte* lpWorkBuffer = new vlByte[uiMipWidth * uiMipHeight * formatInfo.uiBytesPerPixel]; + vlByte *lpWorkBuffer = new vlByte[uiMipWidth * uiMipHeight * formatInfo.uiBytesPerPixel]; // Determine datatype + channel count stbir_datatype iDataType = STBIR_TYPE_UINT8; - if (actualFormat == IMAGE_FORMAT_RGB323232F || actualFormat == IMAGE_FORMAT_RGBA32323232F) + if ( actualFormat == IMAGE_FORMAT_RGB323232F || actualFormat == IMAGE_FORMAT_RGBA32323232F ) iDataType = STBIR_TYPE_FLOAT; - else if (actualFormat == IMAGE_FORMAT_RGBA16161616 || actualFormat == IMAGE_FORMAT_RGBA16161616 || - actualFormat == IMAGE_FORMAT_RGBA16161616F) + else if ( actualFormat == IMAGE_FORMAT_RGBA16161616 || actualFormat == IMAGE_FORMAT_RGBA16161616 || + actualFormat == IMAGE_FORMAT_RGBA16161616F ) iDataType = STBIR_TYPE_UINT16; int iNumChannels = 0; - if (formatInfo.uiAlphaBitsPerPixel > 0) iNumChannels++; 
- if (formatInfo.uiGreenBitsPerPixel > 0) iNumChannels++; - if (formatInfo.uiBlueBitsPerPixel > 0) iNumChannels++; - if (formatInfo.uiRedBitsPerPixel > 0) iNumChannels++; + if ( formatInfo.uiAlphaBitsPerPixel > 0 ) + iNumChannels++; + if ( formatInfo.uiGreenBitsPerPixel > 0 ) + iNumChannels++; + if ( formatInfo.uiBlueBitsPerPixel > 0 ) + iNumChannels++; + if ( formatInfo.uiRedBitsPerPixel > 0 ) + iNumChannels++; // Determine mip filter stbir_filter iMipFilter; - switch(MipmapFilter) - { - case MIPMAP_FILTER_BOX: - iMipFilter = STBIR_FILTER_BOX; break; - case MIPMAP_FILTER_TRIANGLE: - iMipFilter = STBIR_FILTER_TRIANGLE; break; - case MIPMAP_FILTER_CUBIC: - iMipFilter = STBIR_FILTER_CUBICBSPLINE; break; - case MIPMAP_FILTER_CATROM: - iMipFilter = STBIR_FILTER_CATMULLROM; break; - case MIPMAP_FILTER_MITCHELL: - iMipFilter = STBIR_FILTER_MITCHELL; break; - default: - iMipFilter = STBIR_FILTER_DEFAULT; break; + switch ( MipmapFilter ) + { + case MIPMAP_FILTER_BOX: + iMipFilter = STBIR_FILTER_BOX; + break; + case MIPMAP_FILTER_TRIANGLE: + iMipFilter = STBIR_FILTER_TRIANGLE; + break; + case MIPMAP_FILTER_CUBIC: + iMipFilter = STBIR_FILTER_CUBICBSPLINE; + break; + case MIPMAP_FILTER_CATROM: + iMipFilter = STBIR_FILTER_CATMULLROM; + break; + case MIPMAP_FILTER_MITCHELL: + iMipFilter = STBIR_FILTER_MITCHELL; + break; + default: + iMipFilter = STBIR_FILTER_DEFAULT; + break; } bool bOk = true; - for (vlUInt32 i = 1; i < GetMipmapCount(); ++i) + for ( vlUInt32 i = 1; i < GetMipmapCount(); ++i ) { bOk &= stbir_resize( lpData, uiWidth, uiHeight, 0, lpWorkBuffer, uiMipWidth, uiMipHeight, 0, iDataType, iNumChannels, formatInfo.uiAlphaBitsPerPixel > 0, STBIR_FLAG_ALPHA_PREMULTIPLIED, STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, - iMipFilter, iMipFilter, bSRGB ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, nullptr - ); + iMipFilter, iMipFilter, bSRGB ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, nullptr ); - if (bConverted) + if ( bConverted ) { - vlUInt32 uiOffset = ComputeDataOffset(uiFrame, uiFace, 0, i, GetFormat()); - assert(uiOffset < this->uiImageBufferSize); + vlUInt32 uiOffset = ComputeDataOffset( uiFrame, uiFace, 0, i, GetFormat() ); + assert( uiOffset < this->uiImageBufferSize ); - bOk &= Convert(lpWorkBuffer, this->lpImageData + uiOffset, uiMipWidth, uiMipHeight, actualFormat, GetFormat()); + bOk &= Convert( lpWorkBuffer, this->lpImageData + uiOffset, uiMipWidth, uiMipHeight, actualFormat, GetFormat() ); } else // Data can be set directly { - SetData(uiFrame, uiFace, 0, i, lpWorkBuffer); + SetData( uiFrame, uiFace, 0, i, lpWorkBuffer ); } uiMipWidth >>= 1; uiMipHeight >>= 1; } - delete [] lpWorkBuffer; - if (bConverted) + delete[] lpWorkBuffer; + if ( bConverted ) { - delete [] lpData; + delete[] lpData; } return bOk; @@ -2510,40 +2950,40 @@ vlBool CVTFFile::GenerateMipmaps(vlUInt uiFace, vlUInt uiFrame, VTFMipmapFilter // We should have a mipmap that matches the thumbnail size. This function finds it and // copies it over to the mipmap data, converting it if need be. // -vlBool CVTFFile::GenerateThumbnail(vlBool bSRGB) +vlBool CVTFFile::GenerateThumbnail( vlBool bSRGB ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; - if(!this->GetHasThumbnail()) + if ( !this->GetHasThumbnail() ) { - LastError.Set("VTF file does not have a thumbnail."); + LastError.Set( "VTF file does not have a thumbnail." 
); return vlFalse; } - if(this->lpImageData == 0) + if ( this->lpImageData == 0 ) { - LastError.Set("No image data to generate thumbnail from."); + LastError.Set( "No image data to generate thumbnail from." ); return vlFalse; } // Find a mipmap that matches the size of the thumbnail. - for(vlUInt i = 0; i < this->Header->MipCount; i++) + for ( vlUInt i = 0; i < this->Header->MipCount; i++ ) { vlUInt uiMipmapWidth, uiMipmapHeight, uiMipmapDepth; - CVTFFile::ComputeMipmapDimensions(this->Header->Width, this->Header->Height, 1, i, uiMipmapWidth, uiMipmapHeight, uiMipmapDepth); + CVTFFile::ComputeMipmapDimensions( this->Header->Width, this->Header->Height, 1, i, uiMipmapWidth, uiMipmapHeight, uiMipmapDepth ); - if(uiMipmapWidth == (vlUInt)this->Header->LowResImageWidth && uiMipmapHeight == (vlUInt)this->Header->LowResImageHeight) + if ( uiMipmapWidth == (vlUInt)this->Header->LowResImageWidth && uiMipmapHeight == (vlUInt)this->Header->LowResImageHeight ) { // Check if it is the same format (in which case copy it) otherwise convert // it to the right format and copy it. - if(this->Header->ImageFormat == this->Header->LowResImageFormat) + if ( this->Header->ImageFormat == this->Header->LowResImageFormat ) { - this->SetThumbnailData(this->GetData(0, 0, 0, i)); + this->SetThumbnailData( this->GetData( 0, 0, 0, i ) ); } else { - if(!CVTFFile::Convert(this->GetData(0, 0, 0, i), this->GetThumbnailData(), uiMipmapWidth, uiMipmapHeight, this->Header->ImageFormat, this->Header->LowResImageFormat)) + if ( !CVTFFile::Convert( this->GetData( 0, 0, 0, i ), this->GetThumbnailData(), uiMipmapWidth, uiMipmapHeight, this->Header->ImageFormat, this->Header->LowResImageFormat ) ) { return vlFalse; } @@ -2553,37 +2993,37 @@ vlBool CVTFFile::GenerateThumbnail(vlBool bSRGB) } // We don't have a matching mipmap (maybe we have no mipmaps) so generate one. 
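The loop above only copies an existing mip when some level's dimensions exactly equal LowResImageWidth x LowResImageHeight; otherwise execution falls through to the resize path that follows. A standalone arithmetic sketch of that match, using assumed sizes rather than VTFLib code:

#include <cstdio>

int main()
{
	unsigned uiWidth = 1024, uiHeight = 1024;         // assumed high-res dimensions
	unsigned uiLowResWidth = 16, uiLowResHeight = 16; // assumed thumbnail dimensions

	for ( unsigned i = 0;; i++ )
	{
		// Same halving rule as ComputeMipmapDimensions(), clamped to 1x1.
		unsigned uiMipWidth = ( uiWidth >> i ) ? ( uiWidth >> i ) : 1;
		unsigned uiMipHeight = ( uiHeight >> i ) ? ( uiHeight >> i ) : 1;

		if ( uiMipWidth == uiLowResWidth && uiMipHeight == uiLowResHeight )
		{
			std::printf( "matching mip level: %u\n", i ); // prints 6, since 1024 >> 6 == 16
			return 0;
		}
		if ( uiMipWidth == 1 && uiMipHeight == 1 )
		{
			std::printf( "no match, fall back to the resize below\n" );
			return 0;
		}
	}
}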
- vlByte *lpImageData = new vlByte[CVTFFile::ComputeImageSize(this->Header->Width, this->Header->Height, 1, IMAGE_FORMAT_RGBA8888)]; - vlByte *lpThumbnailImageData = new vlByte[CVTFFile::ComputeImageSize(this->Header->LowResImageWidth, this->Header->LowResImageHeight, 1, IMAGE_FORMAT_RGBA8888)]; + vlByte *lpImageData = new vlByte[CVTFFile::ComputeImageSize( this->Header->Width, this->Header->Height, 1, IMAGE_FORMAT_RGBA8888 )]; + vlByte *lpThumbnailImageData = new vlByte[CVTFFile::ComputeImageSize( this->Header->LowResImageWidth, this->Header->LowResImageHeight, 1, IMAGE_FORMAT_RGBA8888 )]; - if(!CVTFFile::ConvertToRGBA8888(this->GetData(0, 0, 0, 0), lpImageData, this->Header->Width, this->Header->Height, this->Header->ImageFormat)) + if ( !CVTFFile::ConvertToRGBA8888( this->GetData( 0, 0, 0, 0 ), lpImageData, this->Header->Width, this->Header->Height, this->Header->ImageFormat ) ) { - delete []lpImageData; - delete []lpThumbnailImageData; + delete[] lpImageData; + delete[] lpThumbnailImageData; return vlFalse; } - if(!CVTFFile::Resize(lpImageData, lpThumbnailImageData, this->Header->Width, this->Header->Height, this->Header->LowResImageWidth, this->Header->LowResImageHeight, MIPMAP_FILTER_CATROM, bSRGB)) + if ( !CVTFFile::Resize( lpImageData, lpThumbnailImageData, this->Header->Width, this->Header->Height, this->Header->LowResImageWidth, this->Header->LowResImageHeight, MIPMAP_FILTER_CATROM, bSRGB ) ) { - delete []lpImageData; - delete []lpThumbnailImageData; + delete[] lpImageData; + delete[] lpThumbnailImageData; return vlFalse; } - if(!CVTFFile::ConvertFromRGBA8888(lpThumbnailImageData, this->GetThumbnailData(), this->Header->LowResImageWidth, this->Header->LowResImageHeight, this->Header->LowResImageFormat)) + if ( !CVTFFile::ConvertFromRGBA8888( lpThumbnailImageData, this->GetThumbnailData(), this->Header->LowResImageWidth, this->Header->LowResImageHeight, this->Header->LowResImageFormat ) ) { - delete []lpImageData; - delete []lpThumbnailImageData; + delete[] lpImageData; + delete[] lpThumbnailImageData; return vlFalse; } - delete []lpImageData; - delete []lpThumbnailImageData; + delete[] lpImageData; + delete[] lpThumbnailImageData; - //LastError.Set("VTF file does not have a mipmap that matches the thumbnail size."); + // LastError.Set("VTF file does not have a mipmap that matches the thumbnail size."); return vlTrue; } @@ -2591,16 +3031,16 @@ vlBool CVTFFile::GenerateThumbnail(vlBool bSRGB) // GenerateNormalMap() // Convert the first level mipmap of each frame to a normal map. // -vlBool CVTFFile::GenerateNormalMap(VTFKernelFilter KernelFilter, VTFHeightConversionMethod HeightConversionMethod, VTFNormalAlphaResult NormalAlphaResult) +vlBool CVTFFile::GenerateNormalMap( VTFKernelFilter KernelFilter, VTFHeightConversionMethod HeightConversionMethod, VTFNormalAlphaResult NormalAlphaResult ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; vlUInt uiFrameCount = this->GetFrameCount(); - for(vlUInt i = 0; i < uiFrameCount; i++) + for ( vlUInt i = 0; i < uiFrameCount; i++ ) { - if(!this->GenerateNormalMap(i, KernelFilter, HeightConversionMethod, NormalAlphaResult)) + if ( !this->GenerateNormalMap( i, KernelFilter, HeightConversionMethod, NormalAlphaResult ) ) { return vlFalse; } @@ -2613,52 +3053,52 @@ vlBool CVTFFile::GenerateNormalMap(VTFKernelFilter KernelFilter, VTFHeightConver // GenerateNormalMap() // Convert the first level mipmap of the specified frame to a normal map. 
// -vlBool CVTFFile::GenerateNormalMap(vlUInt uiFrame, VTFKernelFilter KernelFilter, VTFHeightConversionMethod HeightConversionMethod, VTFNormalAlphaResult NormalAlphaResult) +vlBool CVTFFile::GenerateNormalMap( vlUInt uiFrame, VTFKernelFilter KernelFilter, VTFHeightConversionMethod HeightConversionMethod, VTFNormalAlphaResult NormalAlphaResult ) { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; - if(this->Header->Flags & TEXTUREFLAGS_ENVMAP) + if ( this->Header->Flags & TEXTUREFLAGS_ENVMAP ) { - LastError.Set("Image is an enviroment map."); + LastError.Set( "Image is an enviroment map." ); return vlFalse; } - if(this->lpImageData == 0) + if ( this->lpImageData == 0 ) { - LastError.Set("No image data to generate normal map from."); + LastError.Set( "No image data to generate normal map from." ); return vlFalse; } - vlByte *lpData = this->GetData(0, uiFrame, 0, 0); + vlByte *lpData = this->GetData( 0, uiFrame, 0, 0 ); // Will hold frame's converted image data. - vlByte *lpSource = new vlByte[this->ComputeImageSize(this->Header->Width, this->Header->Height, 1, IMAGE_FORMAT_RGBA8888)]; + vlByte *lpSource = new vlByte[this->ComputeImageSize( this->Header->Width, this->Header->Height, 1, IMAGE_FORMAT_RGBA8888 )]; // Get the frame's image data. - if(!this->ConvertToRGBA8888(lpData, lpSource, this->Header->Width, this->Header->Height, this->Header->ImageFormat)) + if ( !this->ConvertToRGBA8888( lpData, lpSource, this->Header->Width, this->Header->Height, this->Header->ImageFormat ) ) { - delete []lpSource; + delete[] lpSource; return vlFalse; } // Will hold normal image data. - //vlByte *lpDest = new vlByte[this->ComputeImageSize(this->Header->Width, this->Header->Height, IMAGE_FORMAT_RGBA8888)]; + // vlByte *lpDest = new vlByte[this->ComputeImageSize(this->Header->Width, this->Header->Height, IMAGE_FORMAT_RGBA8888)]; - //delete []lpSource; + // delete []lpSource; // Set the frame's image data. - if(!this->ConvertFromRGBA8888(lpSource/*lpDest*/, lpData, this->Header->Width, this->Header->Height, this->Header->ImageFormat)) + if ( !this->ConvertFromRGBA8888( lpSource /*lpDest*/, lpData, this->Header->Width, this->Header->Height, this->Header->ImageFormat ) ) { - delete []lpSource; // Moved from above. - //delete []lpDest; + delete[] lpSource; // Moved from above. + // delete []lpDest; return vlFalse; } - delete []lpSource; // Moved from above. - //delete []lpDest; + delete[] lpSource; // Moved from above. + // delete []lpDest; return vlTrue; } @@ -2667,20 +3107,20 @@ vlBool CVTFFile::GenerateNormalMap(vlUInt uiFrame, VTFKernelFilter KernelFilter, // ----------------------------------------------------------- struct SphereMapFace { - vlUInt *buf; // pointer to the address where the image data is. - Vector u, v, n, o; // vectors for plane equations + vlUInt *buf; // pointer to the address where the image data is. + Vector u, v, n, o; // vectors for plane equations }; // Define our faces and vectors (don't moan about the order!) 
// ---------------------------------------------------------- SphereMapFace SFace[6] = -{ - {0, {0, 0, -1}, {0, 1, 0}, {-1, 0, 0}, {-0.5, -0.5, 0.5}}, // left (lf) - {0, {1, 0, 0}, {0, 1, 0}, {0, 0, -1}, {-0.5, -0.5, -0.5}}, // down (dn) - {0, {0, 0, 1}, {0, 1, 0}, {1, 0, 0}, {0.5, -0.5, -0.5}}, // right (rt) - {0, {-1, 0, 0}, {0, 1, 0}, {0, 0, 1}, {0.5, -0.5, 0.5}}, // up (up) - {0, {1, 0, 0}, {0, 0, 1}, {0, 1, 0}, {-0.5, 0.5, -0.5}}, // front (ft) - {0, {1, 0, 0}, {0, 0, -1}, {0, -1, 0}, {-0.5, -0.5, 0.5}} // back (bk) + { + { 0, { 0, 0, -1 }, { 0, 1, 0 }, { -1, 0, 0 }, { -0.5, -0.5, 0.5 } }, // left (lf) + { 0, { 1, 0, 0 }, { 0, 1, 0 }, { 0, 0, -1 }, { -0.5, -0.5, -0.5 } }, // down (dn) + { 0, { 0, 0, 1 }, { 0, 1, 0 }, { 1, 0, 0 }, { 0.5, -0.5, -0.5 } }, // right (rt) + { 0, { -1, 0, 0 }, { 0, 1, 0 }, { 0, 0, 1 }, { 0.5, -0.5, 0.5 } }, // up (up) + { 0, { 1, 0, 0 }, { 0, 0, 1 }, { 0, 1, 0 }, { -0.5, 0.5, -0.5 } }, // front (ft) + { 0, { 1, 0, 0 }, { 0, 0, -1 }, { 0, -1, 0 }, { -0.5, -0.5, 0.5 } } // back (bk) }; // Normalised pixel colour struct @@ -2696,24 +3136,24 @@ struct NColour // vlBool CVTFFile::GenerateSphereMap() { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; - if(!(this->Header->Flags & TEXTUREFLAGS_ENVMAP)) + if ( !( this->Header->Flags & TEXTUREFLAGS_ENVMAP ) ) { - LastError.Set("Image is not an enviroment map."); + LastError.Set( "Image is not an enviroment map." ); return vlFalse; } - if(this->Header->StartFrame == 0xffff) + if ( this->Header->StartFrame == 0xffff ) { - LastError.Set("Enviroment map does not have a sphere map."); + LastError.Set( "Enviroment map does not have a sphere map." ); return vlFalse; } - if(this->lpImageData == 0) + if ( this->lpImageData == 0 ) { - LastError.Set("No image data to generate sphere map from."); + LastError.Set( "No image data to generate sphere map from." ); return vlFalse; } @@ -2721,36 +3161,36 @@ vlBool CVTFFile::GenerateSphereMap() vlUInt uiHeight = (vlUInt)this->Header->Height; // lets go! - vlByte *lpImageData[6] = { 0, 0, 0, 0, 0, 0 }; // 6 pointers to memory for our faces. - vlByte *lpSphereMapData = 0; // SphereMap buffer - vlUInt map[6] = {2, 0, 5, 4, 3, 1}; // used to remap valves face order to my face order. - vlUInt samples = 4; // pixel samples for rendering + vlByte *lpImageData[6] = { 0, 0, 0, 0, 0, 0 }; // 6 pointers to memory for our faces. + vlByte *lpSphereMapData = 0; // SphereMap buffer + vlUInt map[6] = { 2, 0, 5, 4, 3, 1 }; // used to remap valves face order to my face order. + vlUInt samples = 4; // pixel samples for rendering vlUInt i, j, x, y, f; NColour c, texel, average; Vector v, r, p; vlSingle s, t, temp, k; - + // load the faces into the buffers and convert as needed - for( i = 0; i < 6; i ++) - { - vlUInt j = map[i]; // Valve face order to my face order map. - - lpImageData[j] = new vlByte[this->ComputeImageSize(uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888)]; - - if(!this->ConvertToRGBA8888(this->GetData(0, i, 0, 0), lpImageData[j], uiWidth, uiHeight, this->Header->ImageFormat)) - { - for(vlUInt l = 0; l < 6; l++) - delete[] lpImageData[l]; - - LastError.Set("Could not convert source to RGBA8888 format"); - return vlFalse; - } - SFace[j].buf = (vlUInt *)lpImageData[j]; // save the address + for ( i = 0; i < 6; i++ ) + { + vlUInt j = map[i]; // Valve face order to my face order map. 
+ + lpImageData[j] = new vlByte[this->ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888 )]; + + if ( !this->ConvertToRGBA8888( this->GetData( 0, i, 0, 0 ), lpImageData[j], uiWidth, uiHeight, this->Header->ImageFormat ) ) + { + for ( vlUInt l = 0; l < 6; l++ ) + delete[] lpImageData[l]; + + LastError.Set( "Could not convert source to RGBA8888 format" ); + return vlFalse; + } + SFace[j].buf = (vlUInt *)lpImageData[j]; // save the address } // Assuming at this point our faces have loaded fine, create a buffer for the SphereMap - lpSphereMapData = new vlByte[this->ComputeImageSize(uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888)]; + lpSphereMapData = new vlByte[this->ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888 )]; // At this point we need to flip 4 of the faces as follows as their "Valve" orientation // is different to what the SphereMap rendering code needs. @@ -2759,32 +3199,32 @@ vlBool CVTFFile::GenerateSphereMap() // ft - flip vertical // bk - flip vertical - this->MirrorImage(lpImageData[0], this->Header->Width, this->Header->Height); - this->MirrorImage(lpImageData[2], this->Header->Width, this->Header->Height); - this->MirrorImage(lpImageData[3], this->Header->Width, this->Header->Height); - this->FlipImage(lpImageData[4], this->Header->Width, this->Header->Height); - this->FlipImage(lpImageData[5], this->Header->Width, this->Header->Height); - + this->MirrorImage( lpImageData[0], this->Header->Width, this->Header->Height ); + this->MirrorImage( lpImageData[2], this->Header->Width, this->Header->Height ); + this->MirrorImage( lpImageData[3], this->Header->Width, this->Header->Height ); + this->FlipImage( lpImageData[4], this->Header->Width, this->Header->Height ); + this->FlipImage( lpImageData[5], this->Header->Width, this->Header->Height ); + // disable conversion warning - //#pragma warning(disable: 4244) + // #pragma warning(disable: 4244) // calculate the average colour for the forward face // using just the forward face is quicker and seems fairly // consistent with what Valves own SphereMaps look like. vlUInt uiAvgR = 0, uiAvgG = 0, uiAvgB = 0; vlUInt uiPixelCount = uiWidth * uiHeight; - - vlByte *src = lpImageData[3]; // 3 = up or forward face - vlByte *lpSourceEnd = src + (uiWidth * uiHeight * 4); - - for( ; src < lpSourceEnd; src += 4) + + vlByte *src = lpImageData[3]; // 3 = up or forward face + vlByte *lpSourceEnd = src + ( uiWidth * uiHeight * 4 ); + + for ( ; src < lpSourceEnd; src += 4 ) { uiAvgR += src[0]; uiAvgG += src[1]; uiAvgB += src[2]; } - uiAvgR /= uiPixelCount; + uiAvgR /= uiPixelCount; uiAvgG /= uiPixelCount; uiAvgB /= uiPixelCount; @@ -2796,58 +3236,58 @@ vlBool CVTFFile::GenerateSphereMap() vlByte *lpSphereMapDataPointer = lpSphereMapData; // Calculate sphere-map by rendering a perfectly reflective solid sphere. 
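For reference, the per-sample math inside the rendering loop that follows is the textbook mirror reflection plus a ray/plane intersection against the cube face planes defined in SFace. A minimal standalone sketch of those two formulas (not VTFLib's VecReflect()/VecDot() themselves, just the equations they implement):

struct Vec3 { float x, y, z; };

static inline float Dot( const Vec3 &a, const Vec3 &b )
{
	return a.x * b.x + a.y * b.y + a.z * b.z;
}

// Mirror the incoming ray v about the unit sphere normal n: r = v - 2(v.n)n
static inline Vec3 Reflect( const Vec3 &v, const Vec3 &n )
{
	float d = 2.0f * Dot( v, n );
	return { v.x - d * n.x, v.y - d * n.y, v.z - d * n.z };
}

// A ray from the origin along r hits the face plane through o with normal n at
// the scale factor k = (o.n) / (r.n) -- the same k computed per sample below.
static inline float PlaneScale( const Vec3 &r, const Vec3 &o, const Vec3 &n )
{
	return Dot( o, n ) / Dot( r, n );
}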
- for (y = 0; y < uiHeight; y++) + for ( y = 0; y < uiHeight; y++ ) { - for (x = 0; x < uiWidth; x++) + for ( x = 0; x < uiWidth; x++ ) { texel.r = texel.g = texel.b = 0.0f; - - for (j = 0; j < samples; j++) + + for ( j = 0; j < samples; j++ ) { - s = ((vlSingle)x + (vlSingle)drand48()) / (vlSingle)uiWidth - 0.5f; - t = ((vlSingle)y + (vlSingle)drand48()) / (vlSingle)uiHeight - 0.5f; + s = ( (vlSingle)x + (vlSingle)drand48() ) / (vlSingle)uiWidth - 0.5f; + t = ( (vlSingle)y + (vlSingle)drand48() ) / (vlSingle)uiHeight - 0.5f; temp = s * s + t * t; - //point not on sphere so use the average colour - if (temp >= 0.25f) + // point not on sphere so use the average colour + if ( temp >= 0.25f ) { - texel.r += average.r; - texel.g += average.g; - texel.b += average.b; + texel.r += average.r; + texel.g += average.g; + texel.b += average.b; continue; } - //get point on sphere + // get point on sphere p.x = s; p.y = t; - p.z = sqrt(0.25f - temp); - VecScale(&p, 2.0f); + p.z = sqrt( 0.25f - temp ); + VecScale( &p, 2.0f ); - //ray from infinity (eyepoint) to surface + // ray from infinity (eyepoint) to surface v.x = 0.0f; v.y = 0.0f; v.z = 1.0f; - //get reflected ray - VecReflect(&p, &v, &r); + // get reflected ray + VecReflect( &p, &v, &r ); - //Intersect reflected ray with cube - f = Intersect(&r); - k = VecDot(&SFace[f].o, &SFace[f].n) / VecDot(&r, &SFace[f].n); - VecScale(&r, k); - VecSub(&r, &SFace[f].o, &v); + // Intersect reflected ray with cube + f = Intersect( &r ); + k = VecDot( &SFace[f].o, &SFace[f].n ) / VecDot( &r, &SFace[f].n ); + VecScale( &r, k ); + VecSub( &r, &SFace[f].o, &v ); - //Get texture map-indices - s = VecDot(&v, &SFace[f].u); - t = VecDot(&v, &SFace[f].v); + // Get texture map-indices + s = VecDot( &v, &SFace[f].u ); + t = VecDot( &v, &SFace[f].v ); - //Sample to get color + // Sample to get color SphereMapFace *pf = &SFace[f]; vlUInt xpos, ypos; vlByte *p; - - xpos = (vlUInt)(s * (vlSingle)uiWidth); - ypos = (vlUInt)(t * (vlSingle)uiHeight); + + xpos = (vlUInt)( s * (vlSingle)uiWidth ); + ypos = (vlUInt)( t * (vlSingle)uiHeight ); p = (vlByte *)&pf->buf[ypos * uiWidth + xpos]; c.r = (vlSingle)p[0] / 255.0f; @@ -2858,35 +3298,35 @@ vlBool CVTFFile::GenerateSphereMap() texel.g += c.g; texel.b += c.b; } - + // punch the pixel into our SphereMap image buffer - lpSphereMapDataPointer[0] = (vlByte)(255.0f * texel.r / (vlSingle)samples); - lpSphereMapDataPointer[1] = (vlByte)(255.0f * texel.g / (vlSingle)samples); - lpSphereMapDataPointer[2] = (vlByte)(255.0f * texel.b / (vlSingle)samples); + lpSphereMapDataPointer[0] = (vlByte)( 255.0f * texel.r / (vlSingle)samples ); + lpSphereMapDataPointer[1] = (vlByte)( 255.0f * texel.g / (vlSingle)samples ); + lpSphereMapDataPointer[2] = (vlByte)( 255.0f * texel.b / (vlSingle)samples ); lpSphereMapDataPointer[3] = 0xff; lpSphereMapDataPointer += 4; } } - //#pragma warning(default: 4244) + // #pragma warning(default: 4244) - if (!this->ConvertFromRGBA8888(lpSphereMapData, - this->GetData(0, CUBEMAP_FACE_SphereMap, 0, 0), - this->Header->Width, - this->Header->Height, - this->Header->ImageFormat) ) + if ( !this->ConvertFromRGBA8888( lpSphereMapData, + this->GetData( 0, CUBEMAP_FACE_SphereMap, 0, 0 ), + this->Header->Width, + this->Header->Height, + this->Header->ImageFormat ) ) { - for(i = 0; i < 6; i++) + for ( i = 0; i < 6; i++ ) { delete[] lpImageData[i]; } delete[] lpSphereMapData; - return vlFalse; + return vlFalse; }; // delete the memory buffers - for(i = 0; i < 6; i++) + for ( i = 0; i < 6; i++ ) { delete[] lpImageData[i]; } @@ 
-2901,12 +3341,12 @@ vlBool CVTFFile::GenerateSphereMap() // vlBool CVTFFile::ComputeReflectivity() { - if(!this->IsLoaded()) + if ( !this->IsLoaded() ) return vlFalse; - if(this->lpImageData == 0) + if ( this->lpImageData == 0 ) { - LastError.Set("No image data to compute reflectivity from."); + LastError.Set( "No image data to compute reflectivity from." ); return vlFalse; } @@ -2915,42 +3355,42 @@ vlBool CVTFFile::ComputeReflectivity() this->Header->Reflectivity[1] = 0.0f; this->Header->Reflectivity[2] = 0.0f; - vlByte *lpImageData = new vlByte[this->ComputeImageSize(this->Header->Width, this->Header->Height, 1, IMAGE_FORMAT_RGBA8888)]; + vlByte *lpImageData = new vlByte[this->ComputeImageSize( this->Header->Width, this->Header->Height, 1, IMAGE_FORMAT_RGBA8888 )]; vlUInt uiFrameCount = this->GetFrameCount(); vlUInt uiFaceCount = this->GetFaceCount(); vlUInt uiSliceCount = this->GetDepth(); - for(vlUInt uiFrame = 0; uiFrame < uiFrameCount; uiFrame++) - { - for(vlUInt uiFace = 0; uiFace < uiFaceCount; uiFace++) - { - for(vlUInt uiSlice = 0; uiSlice < uiSliceCount; uiSlice++) + for ( vlUInt uiFrame = 0; uiFrame < uiFrameCount; uiFrame++ ) + { + for ( vlUInt uiFace = 0; uiFace < uiFaceCount; uiFace++ ) + { + for ( vlUInt uiSlice = 0; uiSlice < uiSliceCount; uiSlice++ ) { - if(!this->ConvertToRGBA8888(this->GetData(uiFrame, uiFace, uiSlice, 0), lpImageData, this->Header->Width, this->Header->Height, this->Header->ImageFormat)) + if ( !this->ConvertToRGBA8888( this->GetData( uiFrame, uiFace, uiSlice, 0 ), lpImageData, this->Header->Width, this->Header->Height, this->Header->ImageFormat ) ) { - delete []lpImageData; + delete[] lpImageData; return vlFalse; } vlSingle sX, sY, sZ; - this->ComputeImageReflectivity(lpImageData, this->Header->Width, this->Header->Height, sX, sY, sZ); + this->ComputeImageReflectivity( lpImageData, this->Header->Width, this->Header->Height, sX, sY, sZ ); this->Header->Reflectivity[0] += sX; this->Header->Reflectivity[1] += sY; this->Header->Reflectivity[2] += sZ; } - } - } + } + } - vlSingle sInverse = 1.0f / (vlSingle)(uiFrameCount * uiFaceCount * uiSliceCount); + vlSingle sInverse = 1.0f / (vlSingle)( uiFrameCount * uiFaceCount * uiSliceCount ); this->Header->Reflectivity[0] *= sInverse; this->Header->Reflectivity[1] *= sInverse; this->Header->Reflectivity[2] *= sInverse; - delete []lpImageData; + delete[] lpImageData; return vlTrue; } @@ -2959,121 +3399,121 @@ vlBool CVTFFile::ComputeReflectivity() // (taken from imageloader.cpp, Valve Source SDK) //------------------------------------------------------ static SVTFImageFormatInfo VTFImageFormatInfo[] = -{ - { "RGBA8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_RGBA8888, - { "ABGR8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_ABGR8888, - { "RGB888", 24, 3, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_RGB888, - { "BGR888", 24, 3, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGR888, - { "RGB565", 16, 2, 5, 6, 5, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_RGB565, - { "I8", 8, 1, 0, 0, 0, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_I8, - { "IA88", 16, 2, 0, 0, 0, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_IA88 - { "P8", 8, 1, 0, 0, 0, 0, vlFalse, vlFalse }, // IMAGE_FORMAT_P8 - { "A8", 8, 1, 0, 0, 0, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_A8 - { "RGB888 Bluescreen", 24, 3, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_RGB888_BLUESCREEN - { "BGR888 Bluescreen", 24, 3, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGR888_BLUESCREEN - { "ARGB8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // 
IMAGE_FORMAT_ARGB8888 - { "BGRA8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRA8888 - { "DXT1", 4, 0, 0, 0, 0, 0, vlTrue, vlTrue }, // IMAGE_FORMAT_DXT1 - { "DXT3", 8, 0, 0, 0, 0, 8, vlTrue, vlTrue }, // IMAGE_FORMAT_DXT3 - { "DXT5", 8, 0, 0, 0, 0, 8, vlTrue, vlTrue }, // IMAGE_FORMAT_DXT5 - { "BGRX8888", 32, 4, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRX8888 - { "BGR565", 16, 2, 5, 6, 5, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGR565 - { "BGRX5551", 16, 2, 5, 5, 5, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRX5551 - { "BGRA4444", 16, 2, 4, 4, 4, 4, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRA4444 - { "DXT1 One Bit Alpha", 4, 0, 0, 0, 0, 1, vlTrue, vlTrue }, // IMAGE_FORMAT_DXT1_ONEBITALPHA - { "BGRA5551", 16, 2, 5, 5, 5, 1, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRA5551 - { "UV88", 16, 2, 8, 8, 0, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_UV88 - { "UVWQ8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_UVWQ8899 - { "RGBA16161616F", 64, 8, 16, 16, 16, 16, vlFalse, vlTrue }, // IMAGE_FORMAT_RGBA16161616F - { "RGBA16161616", 64, 8, 16, 16, 16, 16, vlFalse, vlTrue }, // IMAGE_FORMAT_RGBA16161616 - { "UVLX8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_UVLX8888 - { "R32F", 32, 4, 32, 0, 0, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_R32F - { "RGB323232F", 96, 12, 32, 32, 32, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_RGB323232F - { "RGBA32323232F", 128, 16, 32, 32, 32, 32, vlFalse, vlTrue }, // IMAGE_FORMAT_RGBA32323232F - {}, - {}, - {}, - { "NULL", 32, 4, 0, 0, 0, 0, vlFalse, vlFalse }, // IMAGE_FORMAT_NV_NULL - { "ATI2N", 8, 0, 0, 0, 0, 0, vlTrue, vlTrue }, // IMAGE_FORMAT_ATI2N - { "ATI1N", 4, 0, 0, 0, 0, 0, vlTrue, vlTrue }, // IMAGE_FORMAT_ATI1N - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - { "BC7", 8, 0, 0, 0, 0, 0, vlTrue, vlTrue } // IMAGE_FORMAT_BC7 + { + { "RGBA8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_RGBA8888, + { "ABGR8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_ABGR8888, + { "RGB888", 24, 3, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_RGB888, + { "BGR888", 24, 3, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGR888, + { "RGB565", 16, 2, 5, 6, 5, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_RGB565, + { "I8", 8, 1, 0, 0, 0, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_I8, + { "IA88", 16, 2, 0, 0, 0, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_IA88 + { "P8", 8, 1, 0, 0, 0, 0, vlFalse, vlFalse }, // IMAGE_FORMAT_P8 + { "A8", 8, 1, 0, 0, 0, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_A8 + { "RGB888 Bluescreen", 24, 3, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_RGB888_BLUESCREEN + { "BGR888 Bluescreen", 24, 3, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGR888_BLUESCREEN + { "ARGB8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_ARGB8888 + { "BGRA8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRA8888 + { "DXT1", 4, 0, 0, 0, 0, 0, vlTrue, vlTrue }, // IMAGE_FORMAT_DXT1 + { "DXT3", 8, 0, 0, 0, 0, 8, vlTrue, vlTrue }, // IMAGE_FORMAT_DXT3 + { "DXT5", 8, 0, 0, 0, 0, 8, vlTrue, vlTrue }, // IMAGE_FORMAT_DXT5 + { "BGRX8888", 32, 4, 8, 8, 8, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRX8888 + { "BGR565", 16, 2, 5, 6, 5, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGR565 + { "BGRX5551", 16, 2, 5, 5, 5, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRX5551 + { "BGRA4444", 16, 2, 4, 4, 4, 4, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRA4444 + { "DXT1 One Bit Alpha", 4, 0, 0, 0, 0, 1, vlTrue, 
vlTrue }, // IMAGE_FORMAT_DXT1_ONEBITALPHA + { "BGRA5551", 16, 2, 5, 5, 5, 1, vlFalse, vlTrue }, // IMAGE_FORMAT_BGRA5551 + { "UV88", 16, 2, 8, 8, 0, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_UV88 + { "UVWQ8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_UVWQ8899 + { "RGBA16161616F", 64, 8, 16, 16, 16, 16, vlFalse, vlTrue }, // IMAGE_FORMAT_RGBA16161616F + { "RGBA16161616", 64, 8, 16, 16, 16, 16, vlFalse, vlTrue }, // IMAGE_FORMAT_RGBA16161616 + { "UVLX8888", 32, 4, 8, 8, 8, 8, vlFalse, vlTrue }, // IMAGE_FORMAT_UVLX8888 + { "R32F", 32, 4, 32, 0, 0, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_R32F + { "RGB323232F", 96, 12, 32, 32, 32, 0, vlFalse, vlTrue }, // IMAGE_FORMAT_RGB323232F + { "RGBA32323232F", 128, 16, 32, 32, 32, 32, vlFalse, vlTrue }, // IMAGE_FORMAT_RGBA32323232F + {}, + {}, + {}, + { "NULL", 32, 4, 0, 0, 0, 0, vlFalse, vlFalse }, // IMAGE_FORMAT_NV_NULL + { "ATI2N", 8, 0, 0, 0, 0, 0, vlTrue, vlTrue }, // IMAGE_FORMAT_ATI2N + { "ATI1N", 4, 0, 0, 0, 0, 0, vlTrue, vlTrue }, // IMAGE_FORMAT_ATI1N + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + { "BC7", 8, 0, 0, 0, 0, 0, vlTrue, vlTrue } // IMAGE_FORMAT_BC7 }; -SVTFImageFormatInfo const &CVTFFile::GetImageFormatInfo(VTFImageFormat ImageFormat) +SVTFImageFormatInfo const &CVTFFile::GetImageFormatInfo( VTFImageFormat ImageFormat ) { - assert(ImageFormat >= 0 && ImageFormat < IMAGE_FORMAT_COUNT); + assert( ImageFormat >= 0 && ImageFormat < IMAGE_FORMAT_COUNT ); return VTFImageFormatInfo[ImageFormat]; } //------------------------------------------------------------------------------------ // ComputeImageSize(vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat ImageFormat) -// +// // Returns how many bytes are needed to store an image of width * height in the chosen // image format. 
If bMipMaps is true, the total will reflect the space needed to store // the original image plus all the mipmaps down to a size of 1 x 1 //------------------------------------------------------------------------------------ -vlUInt CVTFFile::ComputeImageSize(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, VTFImageFormat ImageFormat) +vlUInt CVTFFile::ComputeImageSize( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, VTFImageFormat ImageFormat ) { - switch(ImageFormat) + switch ( ImageFormat ) { - case IMAGE_FORMAT_DXT1: - case IMAGE_FORMAT_DXT1_ONEBITALPHA: - case IMAGE_FORMAT_ATI1N: - if(uiWidth < 4 && uiWidth > 0) - uiWidth = 4; + case IMAGE_FORMAT_DXT1: + case IMAGE_FORMAT_DXT1_ONEBITALPHA: + case IMAGE_FORMAT_ATI1N: + if ( uiWidth < 4 && uiWidth > 0 ) + uiWidth = 4; - if(uiHeight < 4 && uiHeight > 0) - uiHeight = 4; + if ( uiHeight < 4 && uiHeight > 0 ) + uiHeight = 4; - return ((uiWidth + 3) / 4) * ((uiHeight + 3) / 4) * 8 * uiDepth; - case IMAGE_FORMAT_DXT3: - case IMAGE_FORMAT_DXT5: - case IMAGE_FORMAT_ATI2N: - case IMAGE_FORMAT_BC7: - if(uiWidth < 4 && uiWidth > 0) - uiWidth = 4; + return ( ( uiWidth + 3 ) / 4 ) * ( ( uiHeight + 3 ) / 4 ) * 8 * uiDepth; + case IMAGE_FORMAT_DXT3: + case IMAGE_FORMAT_DXT5: + case IMAGE_FORMAT_ATI2N: + case IMAGE_FORMAT_BC7: + if ( uiWidth < 4 && uiWidth > 0 ) + uiWidth = 4; - if(uiHeight < 4 && uiHeight > 0) - uiHeight = 4; + if ( uiHeight < 4 && uiHeight > 0 ) + uiHeight = 4; - return ((uiWidth + 3) / 4) * ((uiHeight + 3) / 4) * 16 * uiDepth; - default: - return uiWidth * uiHeight * uiDepth * CVTFFile::GetImageFormatInfo(ImageFormat).uiBytesPerPixel; + return ( ( uiWidth + 3 ) / 4 ) * ( ( uiHeight + 3 ) / 4 ) * 16 * uiDepth; + default: + return uiWidth * uiHeight * uiDepth * CVTFFile::GetImageFormatInfo( ImageFormat ).uiBytesPerPixel; } } @@ -3082,27 +3522,27 @@ vlUInt CVTFFile::ComputeImageSize(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDept // Gets the size in bytes of the data needed to store an image of size uiWidth x uiHeight // with uiMipmaps mipmap levels and ImageFormat format. // -vlUInt CVTFFile::ComputeImageSize(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmaps, VTFImageFormat ImageFormat) +vlUInt CVTFFile::ComputeImageSize( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmaps, VTFImageFormat ImageFormat ) { vlUInt uiImageSize = 0; - assert(uiWidth != 0 && uiHeight != 0 && uiDepth != 0); + assert( uiWidth != 0 && uiHeight != 0 && uiDepth != 0 ); - for(vlUInt i = 0; i < uiMipmaps; i++) + for ( vlUInt i = 0; i < uiMipmaps; i++ ) { - uiImageSize += CVTFFile::ComputeImageSize(uiWidth, uiHeight, uiDepth, ImageFormat); - + uiImageSize += CVTFFile::ComputeImageSize( uiWidth, uiHeight, uiDepth, ImageFormat ); + uiWidth >>= 1; uiHeight >>= 1; uiDepth >>= 1; - if(uiWidth < 1) + if ( uiWidth < 1 ) uiWidth = 1; - if(uiHeight < 1) + if ( uiHeight < 1 ) uiHeight = 1; - if(uiDepth < 1) + if ( uiDepth < 1 ) uiDepth = 1; } @@ -3114,21 +3554,21 @@ vlUInt CVTFFile::ComputeImageSize(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDept // Gets the number of mipmaps an image of size uiWidth x uiHeight will have including // the mipmap of size uiWidth x uiHeight. 
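As a quick numeric check of the block-compression rules above, here is a standalone sketch with assumed dimensions that mirrors the two ComputeImageSize() overloads rather than calling them: DXT5 stores 16 bytes per 4x4 block, levels smaller than 4x4 are clamped to a single block, and the full 11-level chain of a 1024x1024 texture (11 being what ComputeMipmapCount(), defined next, returns for 1024x1024x1) therefore totals 1,398,128 bytes.

#include <cstdio>

// 16 bytes per 4x4 DXT5 block; levels narrower than 4 pixels still pay for one block.
static unsigned DXT5LevelSize( unsigned uiWidth, unsigned uiHeight )
{
	if ( uiWidth < 4 && uiWidth > 0 )
		uiWidth = 4;
	if ( uiHeight < 4 && uiHeight > 0 )
		uiHeight = 4;
	return ( ( uiWidth + 3 ) / 4 ) * ( ( uiHeight + 3 ) / 4 ) * 16;
}

int main()
{
	unsigned uiWidth = 1024, uiHeight = 1024, uiTotal = 0;

	for ( ;; )
	{
		uiTotal += DXT5LevelSize( uiWidth, uiHeight );
		if ( uiWidth == 1 && uiHeight == 1 )
			break;
		if ( uiWidth > 1 )
			uiWidth >>= 1;
		if ( uiHeight > 1 )
			uiHeight >>= 1;
	}

	std::printf( "%u\n", uiTotal ); // 1398128 bytes across 11 mip levels
	return 0;
}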
// -vlUInt CVTFFile::ComputeMipmapCount(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth) +vlUInt CVTFFile::ComputeMipmapCount( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth ) { vlUInt uiCount = 0; - assert(uiWidth != 0 && uiHeight != 0 && uiDepth != 0); + assert( uiWidth != 0 && uiHeight != 0 && uiDepth != 0 ); - while(vlTrue) + while ( vlTrue ) { uiCount++; - + uiWidth >>= 1; uiHeight >>= 1; uiDepth >>= 1; - if(uiWidth == 0 && uiHeight == 0 && uiDepth == 0) + if ( uiWidth == 0 && uiHeight == 0 && uiDepth == 0 ) break; /*if(uiWidth < 1) @@ -3149,49 +3589,49 @@ vlUInt CVTFFile::ComputeMipmapCount(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDe // // Computes the dimensions of a particular mip level //----------------------------------------------------------------------------- -vlVoid CVTFFile::ComputeMipmapDimensions(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmapLevel, vlUInt &uiMipmapWidth, vlUInt &uiMipmapHeight, vlUInt &uiMipmapDepth) +vlVoid CVTFFile::ComputeMipmapDimensions( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmapLevel, vlUInt &uiMipmapWidth, vlUInt &uiMipmapHeight, vlUInt &uiMipmapDepth ) { // work out the width/height by taking the orignal dimension // and bit shifting them down uiMipmapLevel times uiMipmapWidth = uiWidth >> uiMipmapLevel; uiMipmapHeight = uiHeight >> uiMipmapLevel; uiMipmapDepth = uiDepth >> uiMipmapLevel; - + // stop the dimension being less than 1 x 1 - if(uiMipmapWidth < 1) + if ( uiMipmapWidth < 1 ) uiMipmapWidth = 1; - if(uiMipmapHeight < 1) + if ( uiMipmapHeight < 1 ) uiMipmapHeight = 1; - if(uiMipmapDepth < 1) + if ( uiMipmapDepth < 1 ) uiMipmapDepth = 1; } //----------------------------------------------------------------------------- // ComputeMIPSize( vlInt iMipLevel, VTFImageFormat fmt ) // -// Computes the size (in bytes) of a single mipmap of a single face of a single frame +// Computes the size (in bytes) of a single mipmap of a single face of a single frame //----------------------------------------------------------------------------- -vlUInt CVTFFile::ComputeMipmapSize(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmapLevel, VTFImageFormat ImageFormat) +vlUInt CVTFFile::ComputeMipmapSize( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmapLevel, VTFImageFormat ImageFormat ) { // figure out the width/height of this MIP level vlUInt uiMipmapWidth, uiMipmapHeight, uiMipmapDepth; - CVTFFile::ComputeMipmapDimensions(uiWidth, uiHeight, uiDepth, uiMipmapLevel, uiMipmapWidth, uiMipmapHeight, uiMipmapDepth); - + CVTFFile::ComputeMipmapDimensions( uiWidth, uiHeight, uiDepth, uiMipmapLevel, uiMipmapWidth, uiMipmapHeight, uiMipmapDepth ); + // return the memory requirements - return CVTFFile::ComputeImageSize(uiMipmapWidth, uiMipmapHeight, uiMipmapDepth, ImageFormat); + return CVTFFile::ComputeImageSize( uiMipmapWidth, uiMipmapHeight, uiMipmapDepth, ImageFormat ); } //--------------------------------------------------------------------------------- // ComputeDataOffset(vlUInt uiFrame, vlUInt uiFace, vlUInt uiMipLevel, VTFImageFormat ImageFormat) // -// Returns the offset in our HiResDataBuffer of the data for an image at the +// Returns the offset in our HiResDataBuffer of the data for an image at the // chose frame, face, and mip level. 
Frame number starts at 0, Face starts at 0 // MIP level 0 is the largest moving up to MIP count-1 for the smallest // To get the first, and largest image, you would use 0, 0, 0 //--------------------------------------------------------------------------------- -vlUInt CVTFFile::ComputeDataOffset(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipLevel, VTFImageFormat ImageFormat) const +vlUInt CVTFFile::ComputeDataOffset( vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipLevel, VTFImageFormat ImageFormat ) const { vlUInt uiOffset = 0; @@ -3200,51 +3640,51 @@ vlUInt CVTFFile::ComputeDataOffset(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice vlUInt uiSliceCount = this->GetDepth(); vlUInt uiMipCount = this->GetMipmapCount(); - if(uiFrame >= uiFrameCount) + if ( uiFrame >= uiFrameCount ) { uiFrame = uiFrameCount - 1; } - - if(uiFace >= uiFaceCount) + + if ( uiFace >= uiFaceCount ) { uiFace = uiFaceCount - 1; } - if(uiSlice >= uiSliceCount) + if ( uiSlice >= uiSliceCount ) { uiSlice = uiSliceCount - 1; } - if(uiMipLevel >= uiMipCount) + if ( uiMipLevel >= uiMipCount ) { uiMipLevel = uiMipCount - 1; } // Transverse past all frames and faces of each mipmap (up to the requested one). - for(vlInt i = (vlInt)uiMipCount - 1; i > (vlInt)uiMipLevel; i--) + for ( vlInt i = (vlInt)uiMipCount - 1; i > (vlInt)uiMipLevel; i-- ) { - uiOffset += this->ComputeMipmapSize(this->Header->Width, this->Header->Height, this->Header->Depth, i, ImageFormat) * uiFrameCount * uiFaceCount; + uiOffset += this->ComputeMipmapSize( this->Header->Width, this->Header->Height, this->Header->Depth, i, ImageFormat ) * uiFrameCount * uiFaceCount; } - vlUInt uiTemp1 = this->ComputeMipmapSize(this->Header->Width, this->Header->Height, this->Header->Depth, uiMipLevel, ImageFormat); - vlUInt uiTemp2 = this->ComputeMipmapSize(this->Header->Width, this->Header->Height, 1, uiMipLevel, ImageFormat); + vlUInt uiTemp1 = this->ComputeMipmapSize( this->Header->Width, this->Header->Height, this->Header->Depth, uiMipLevel, ImageFormat ); + vlUInt uiTemp2 = this->ComputeMipmapSize( this->Header->Width, this->Header->Height, 1, uiMipLevel, ImageFormat ); // Transverse past requested frames and faces of requested mipmap. uiOffset += uiTemp1 * uiFrame * uiFaceCount * uiSliceCount; uiOffset += uiTemp1 * uiFace * uiSliceCount; uiOffset += uiTemp2 * uiSlice; - + return uiOffset; } -vlUInt CVTFFile::GetAuxInfoOffset(vlUInt iFrame, vlUInt iFace, vlUInt iMipLevel) const +vlUInt CVTFFile::GetAuxInfoOffset( vlUInt iFrame, vlUInt iFace, vlUInt iMipLevel ) const { vlUInt faceCount = GetFaceCount(); - return sizeof(SVTFAuxCompressionInfoHeader) + - ( (this->Header->MipCount - 1 - iMipLevel) * this->Header->Frames * faceCount + - iFrame * faceCount + - iFace ) * - sizeof(SVTFAuxCompressionInfoEntry); + return sizeof( SVTFAuxCompressionInfoHeader ) + + ( ( this->Header->MipCount - 1 - iMipLevel ) * this->Header->Frames * faceCount + + iFrame * faceCount + + iFace ) * + sizeof( SVTFAuxCompressionInfoEntry ); } //----------------------------------------------------------------------------------------------------- @@ -3253,9 +3693,9 @@ vlUInt CVTFFile::GetAuxInfoOffset(vlUInt iFrame, vlUInt iFace, vlUInt iMipLevel) // Converts data from the source format to RGBA8888 format. Data is read from *src // and written to *dst. 
Width and height are needed to it knows how much data to process //----------------------------------------------------------------------------------------------------- -vlBool CVTFFile::ConvertToRGBA8888(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat) +vlBool CVTFFile::ConvertToRGBA8888( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat ) { - return CVTFFile::Convert(lpSource, lpDest, uiWidth, uiHeight, SourceFormat, IMAGE_FORMAT_RGBA8888); + return CVTFFile::Convert( lpSource, lpDest, uiWidth, uiHeight, SourceFormat, IMAGE_FORMAT_RGBA8888 ); } //----------------------------------------------------------------------------------------------------- @@ -3264,33 +3704,33 @@ vlBool CVTFFile::ConvertToRGBA8888(vlByte *lpSource, vlByte *lpDest, vlUInt uiWi // Converts data from the BCn to RGBA8888 format. Data is read from *src // and written to *dst. Width and height are needed to it knows how much data to process //----------------------------------------------------------------------------------------------------- -vlBool CVTFFile::DecompressBCn(vlByte *src, vlByte *dst, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat) +vlBool CVTFFile::DecompressBCn( vlByte *src, vlByte *dst, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat ) { - CMP_Texture srcTexture = {0}; - srcTexture.dwSize = sizeof( srcTexture ); - srcTexture.dwWidth = uiWidth; - srcTexture.dwHeight = uiHeight; - srcTexture.dwPitch = 0; - srcTexture.format = GetCMPFormat( SourceFormat, false ); + CMP_Texture srcTexture = { 0 }; + srcTexture.dwSize = sizeof( srcTexture ); + srcTexture.dwWidth = uiWidth; + srcTexture.dwHeight = uiHeight; + srcTexture.dwPitch = 0; + srcTexture.format = GetCMPFormat( SourceFormat, false ); srcTexture.dwDataSize = CMP_CalculateBufferSize( &srcTexture ); - srcTexture.pData = (CMP_BYTE*) src; + srcTexture.pData = (CMP_BYTE *)src; - CMP_CompressOptions options = {0}; - options.dwSize = sizeof(options); - options.dwnumThreads = 0; + CMP_CompressOptions options = { 0 }; + options.dwSize = sizeof( options ); + options.dwnumThreads = 0; options.bDXT1UseAlpha = false; - CMP_Texture destTexture = {0}; - destTexture.dwSize = sizeof( destTexture ); - destTexture.dwWidth = uiWidth; - destTexture.dwHeight = uiHeight; - destTexture.dwPitch = 4 * uiWidth; - destTexture.format = CMP_FORMAT_RGBA_8888; + CMP_Texture destTexture = { 0 }; + destTexture.dwSize = sizeof( destTexture ); + destTexture.dwWidth = uiWidth; + destTexture.dwHeight = uiHeight; + destTexture.dwPitch = 4 * uiWidth; + destTexture.format = CMP_FORMAT_RGBA_8888; destTexture.dwDataSize = destTexture.dwPitch * uiHeight; - destTexture.pData = (CMP_BYTE*) dst; + destTexture.pData = (CMP_BYTE *)dst; CMP_ERROR cmp_status = CMP_ConvertTexture( &srcTexture, &destTexture, &options, NULL ); - if (cmp_status != CMP_OK) + if ( cmp_status != CMP_OK ) { LastError.Set( GetCMPErrorString( cmp_status ) ); return vlFalse; @@ -3303,9 +3743,9 @@ vlBool CVTFFile::DecompressBCn(vlByte *src, vlByte *dst, vlUInt uiWidth, vlUInt // ConvertFromRGBA8888() // Convert input image data (lpSource) to output image data (lpDest) of format DestFormat. 
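A short usage sketch tying this hunk's helpers together (every VTFLib name used below appears in this diff; failure reporting through LastError is assumed to follow the BCn paths above): size a destination buffer with ComputeImageSize() and let the static ConvertToRGBA8888(), which simply delegates to Convert(), unpack the largest mip of a loaded file.

#include "VTFFile.h"

// Hedged sketch: unpack mip 0 of frame 0 / face 0 into a caller-owned
// RGBA8888 buffer. The caller is responsible for delete[]-ing the result.
vlByte *ExtractRGBA8888( VTFLib::CVTFFile &VTFFile )
{
	vlUInt uiSize = VTFLib::CVTFFile::ComputeImageSize( VTFFile.GetWidth(), VTFFile.GetHeight(), 1, IMAGE_FORMAT_RGBA8888 );
	vlByte *lpDest = new vlByte[uiSize];

	if ( !VTFLib::CVTFFile::ConvertToRGBA8888( VTFFile.GetData( 0, 0, 0, 0 ), lpDest, VTFFile.GetWidth(), VTFFile.GetHeight(), VTFFile.GetFormat() ) )
	{
		delete[] lpDest; // conversion failed; see LastError
		return 0;
	}

	return lpDest;
}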
// -vlBool CVTFFile::ConvertFromRGBA8888(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat DestFormat) +vlBool CVTFFile::ConvertFromRGBA8888( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat DestFormat ) { - return CVTFFile::Convert(lpSource, lpDest, uiWidth, uiHeight, IMAGE_FORMAT_RGBA8888, DestFormat); + return CVTFFile::Convert( lpSource, lpDest, uiWidth, uiHeight, IMAGE_FORMAT_RGBA8888, DestFormat ); } // @@ -3313,33 +3753,33 @@ vlBool CVTFFile::ConvertFromRGBA8888(vlByte *lpSource, vlByte *lpDest, vlUInt ui // Compress input image data (lpSource) to output image data (lpDest) of format DestFormat // where DestFormat is of format BCn. Uses Compressonator library. // -vlBool CVTFFile::CompressBCn(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat DestFormat) +vlBool CVTFFile::CompressBCn( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat DestFormat ) { - CMP_Texture srcTexture = {0}; - srcTexture.dwSize = sizeof( srcTexture ); - srcTexture.dwWidth = uiWidth; - srcTexture.dwHeight = uiHeight; - srcTexture.dwPitch = 4 * uiWidth; - srcTexture.format = CMP_FORMAT_RGBA_8888; + CMP_Texture srcTexture = { 0 }; + srcTexture.dwSize = sizeof( srcTexture ); + srcTexture.dwWidth = uiWidth; + srcTexture.dwHeight = uiHeight; + srcTexture.dwPitch = 4 * uiWidth; + srcTexture.format = CMP_FORMAT_RGBA_8888; srcTexture.dwDataSize = uiHeight * srcTexture.dwPitch; - srcTexture.pData = (CMP_BYTE*) lpSource; + srcTexture.pData = (CMP_BYTE *)lpSource; - CMP_CompressOptions options = {0}; - options.dwSize = sizeof(options); - options.dwnumThreads = 0; + CMP_CompressOptions options = { 0 }; + options.dwSize = sizeof( options ); + options.dwnumThreads = 0; options.bDXT1UseAlpha = DestFormat == IMAGE_FORMAT_DXT1_ONEBITALPHA; - CMP_Texture destTexture = {0}; - destTexture.dwSize = sizeof( destTexture ); - destTexture.dwWidth = uiWidth; - destTexture.dwHeight = uiHeight; - destTexture.dwPitch = 0; - destTexture.format = GetCMPFormat( DestFormat, false ); + CMP_Texture destTexture = { 0 }; + destTexture.dwSize = sizeof( destTexture ); + destTexture.dwWidth = uiWidth; + destTexture.dwHeight = uiHeight; + destTexture.dwPitch = 0; + destTexture.format = GetCMPFormat( DestFormat, false ); destTexture.dwDataSize = CMP_CalculateBufferSize( &destTexture ); - destTexture.pData = (CMP_BYTE*) lpDest; + destTexture.pData = (CMP_BYTE *)lpDest; CMP_ERROR cmp_status = CMP_ConvertTexture( &srcTexture, &destTexture, &options, NULL ); - if (cmp_status != CMP_OK) + if ( cmp_status != CMP_OK ) { LastError.Set( GetCMPErrorString( cmp_status ) ); return vlFalse; @@ -3348,21 +3788,19 @@ vlBool CVTFFile::CompressBCn(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, v return vlTrue; } -typedef vlVoid (*TransformProc)(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A); - -vlVoid ToLuminance(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A) +vlVoid ToLuminance( vlUInt16 &R, vlUInt16 &G, vlUInt16 &B, vlUInt16 &A ) { - R = G = B = (vlUInt16)(sLuminanceWeightR * (vlSingle)R + sLuminanceWeightG * (vlSingle)G + sLuminanceWeightB * (vlSingle)B); + R = G = B = (vlUInt16)( sLuminanceWeightR * (vlSingle)R + sLuminanceWeightG * (vlSingle)G + sLuminanceWeightB * (vlSingle)B ); } -vlVoid FromLuminance(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A) +vlVoid FromLuminance( vlUInt16 &R, vlUInt16 &G, vlUInt16 &B, vlUInt16 &A ) { B = G = R; } -vlVoid ToBlueScreen(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A) +vlVoid 
ToBlueScreen( vlUInt16 &R, vlUInt16 &G, vlUInt16 &B, vlUInt16 &A ) { - if(A == 0x0000) + if ( A == 0x0000 ) { R = uiBlueScreenMaskR; G = uiBlueScreenMaskG; @@ -3371,9 +3809,9 @@ vlVoid ToBlueScreen(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A) A = 0xffff; } -vlVoid FromBlueScreen(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A) +vlVoid FromBlueScreen( vlUInt16 &R, vlUInt16 &G, vlUInt16 &B, vlUInt16 &A ) { - if(R == uiBlueScreenMaskR && G == uiBlueScreenMaskG && B == uiBlueScreenMaskB) + if ( R == uiBlueScreenMaskR && G == uiBlueScreenMaskG && B == uiBlueScreenMaskB ) { R = uiBlueScreenClearR; G = uiBlueScreenClearG; @@ -3386,7 +3824,7 @@ vlVoid FromBlueScreen(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A) } } -static inline vlSingle FP16ToFP32(vlUInt16 input) +inline vlSingle CVTFFile::FP16ToFP32( vlUInt16 input ) { const vlUInt32 uiF32Bias = 127; const vlUInt32 uiF16Bias = 15; @@ -3398,248 +3836,204 @@ static inline vlSingle FP16ToFP32(vlUInt16 input) vlUInt16 uiExponent : 5; vlUInt16 uiSign : 1; } fp16; - std::memcpy(&fp16, &input, sizeof(vlUInt16)); + std::memcpy( &fp16, &input, sizeof( vlUInt16 ) ); - if (fp16.uiExponent == 31) + if ( fp16.uiExponent == 31 ) { - if (fp16.uiMantissa == 0) // Check for Infinity - return sMaxFloat16Bits * ((fp16.uiSign == 1) ? -1.0f : 1.0f); - else if (fp16.uiMantissa != 0) // Check for NaN + if ( fp16.uiMantissa == 0 ) // Check for Infinity + return sMaxFloat16Bits * ( ( fp16.uiSign == 1 ) ? -1.0f : 1.0f ); + else if ( fp16.uiMantissa != 0 ) // Check for NaN return 0.0f; } - if (fp16.uiExponent == 0 && fp16.uiMantissa != 0) + if ( fp16.uiExponent == 0 && fp16.uiMantissa != 0 ) { // Denorm... - const vlSingle sHalfDenorm = 1.0f / vlSingle(1 << 14); - const vlSingle sMantissa = vlSingle(fp16.uiMantissa) / vlSingle(1 << 10); - const vlSingle sSign = fp16.uiSign ? -1.0f : 1.0f; + const vlSingle sHalfDenorm = 1.0f / vlSingle( 1 << 14 ); + const vlSingle sMantissa = vlSingle( fp16.uiMantissa ) / vlSingle( 1 << 10 ); + const vlSingle sSign = fp16.uiSign ? -1.0f : 1.0f; return sSign * sMantissa * sHalfDenorm; } else { const vlUInt32 uiMantissa = fp16.uiMantissa; - const vlUInt32 uiExponent = fp16.uiExponent != 0 - ? fp16.uiExponent - uiF16Bias + uiF32Bias - : 0; - const vlUInt32 uiSign = fp16.uiSign; + const vlUInt32 uiExponent = fp16.uiExponent != 0 ? fp16.uiExponent - uiF16Bias + uiF32Bias : 0; + const vlUInt32 uiSign = fp16.uiSign; - vlUInt32 uiBits = (uiMantissa << 13) | (uiExponent << 23) | (uiSign << 31); + vlUInt32 uiBits = ( uiMantissa << 13 ) | ( uiExponent << 23 ) | ( uiSign << 31 ); vlSingle sValue; - std::memcpy(&sValue, &uiBits, sizeof(sValue)); + std::memcpy( &sValue, &uiBits, sizeof( sValue ) ); return sValue; } } -// A very very basic Reinhard implementation for -// previewing cubemaps... -// (Feel free to use something better with proper luminance -// and a white point!) -vlSingle Reinhard(vlSingle sValue) -{ - return sValue / (1.0f + sValue); -} - -vlVoid ToFP16(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A) +inline unsigned short CVTFFile::FP32ToFP16( float input ) { + //! IMPORTANT! + // We're using a modified version of the half_float library + // that allows the return for raw data. + // any conversion methods corrupt the data. 
+ return half_float::half( input ).getData(); } -vlUInt16 FP16ToUnorm(vlUInt16 uiValue) -{ - vlSingle sValue = FP16ToFP32(uiValue); - - sValue *= sFP16HDRExposure; - sValue = Reinhard(sValue); - sValue *= 65535.0f; - sValue = std::min(std::max(sValue, 0.0f), 65535.0f); - return (vlUInt16) sValue; -} - -vlVoid FromFP16(vlUInt16& R, vlUInt16& G, vlUInt16& B, vlUInt16& A) -{ - R = FP16ToUnorm(R); - G = FP16ToUnorm(G); - B = FP16ToUnorm(B); - A = FP16ToUnorm(A); -} - -typedef struct tagSVTFImageConvertInfo -{ - vlUInt uiBitsPerPixel; // Format bytes per pixel. - vlUInt uiBytesPerPixel; // Format bytes per pixel. - vlUInt uiRBitsPerPixel; // Format conversion red bits per pixel. 0 for N/A. - vlUInt uiGBitsPerPixel; // Format conversion green bits per pixel. 0 for N/A. - vlUInt uiBBitsPerPixel; // Format conversion blue bits per pixel. 0 for N/A. - vlUInt uiABitsPerPixel; // Format conversion alpha bits per pixel. 0 for N/A. - vlInt iR; // "Red" index. - vlInt iG; // "Green" index. - vlInt iB; // "Blue" index. - vlInt iA; // "Alpha" index. - vlBool bIsCompressed; // Format is compressed (DXT). - vlBool bIsSupported; // Format is supported by VTFLib. - TransformProc pToTransform; // Custom transform to function. - TransformProc pFromTransform; // Custom transform from function. - VTFImageFormat Format; -} SVTFImageConvertInfo; - static SVTFImageConvertInfo VTFImageConvertInfo[IMAGE_FORMAT_COUNT] = -{ - { 32, 4, 8, 8, 8, 8, 0, 1, 2, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGBA8888}, - { 32, 4, 8, 8, 8, 8, 3, 2, 1, 0, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_ABGR8888}, - { 24, 3, 8, 8, 8, 0, 0, 1, 2, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGB888}, - { 24, 3, 8, 8, 8, 0, 2, 1, 0, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGR888}, - { 16, 2, 5, 6, 5, 0, 0, 1, 2, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGB565}, - { 8, 1, 8, 8, 8, 0, 0, -1, -1, -1, vlFalse, vlTrue, ToLuminance, FromLuminance, IMAGE_FORMAT_I8}, - { 16, 2, 8, 8, 8, 8, 0, -1, -1, 1, vlFalse, vlTrue, ToLuminance, FromLuminance, IMAGE_FORMAT_IA88}, - { 8, 1, 0, 0, 0, 0, -1, -1, -1, -1, vlFalse, vlFalse, NULL, NULL, IMAGE_FORMAT_P8}, - { 8, 1, 0, 0, 0, 8, -1, -1, -1, 0, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_A8}, - { 24, 3, 8, 8, 8, 8, 0, 1, 2, -1, vlFalse, vlTrue, ToBlueScreen, FromBlueScreen, IMAGE_FORMAT_RGB888_BLUESCREEN}, - { 24, 3, 8, 8, 8, 8, 2, 1, 0, -1, vlFalse, vlTrue, ToBlueScreen, FromBlueScreen, IMAGE_FORMAT_BGR888_BLUESCREEN}, - { 32, 4, 8, 8, 8, 8, 3, 0, 1, 2, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_ARGB8888}, - { 32, 4, 8, 8, 8, 8, 2, 1, 0, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRA8888}, - { 4, 0, 0, 0, 0, 0, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_DXT1}, - { 8, 0, 0, 0, 0, 8, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_DXT3}, - { 8, 0, 0, 0, 0, 8, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_DXT5}, - { 32, 4, 8, 8, 8, 0, 2, 1, 0, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRX8888}, - { 16, 2, 5, 6, 5, 0, 2, 1, 0, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGR565}, - { 16, 2, 5, 5, 5, 0, 2, 1, 0, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRX5551}, - { 16, 2, 4, 4, 4, 4, 2, 1, 0, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRA4444}, - { 4, 0, 0, 0, 0, 1, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_DXT1_ONEBITALPHA}, - { 16, 2, 5, 5, 5, 1, 2, 1, 0, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRA5551}, - { 16, 2, 8, 8, 0, 0, 0, 1, -1, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_UV88}, - { 32, 4, 8, 8, 8, 8, 0, 1, 2, 3, 
vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_UVWQ8888}, - { 64, 8, 16, 16, 16, 16, 0, 1, 2, 3, vlFalse, vlTrue, ToFP16, FromFP16, IMAGE_FORMAT_RGBA16161616F}, - { 64, 8, 16, 16, 16, 16, 0, 1, 2, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGBA16161616}, - { 32, 4, 8, 8, 8, 8, 0, 1, 2, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_UVLX8888}, - { 32, 4, 32, 0, 0, 0, 0, -1, -1, -1, vlFalse, vlFalse, NULL, NULL, IMAGE_FORMAT_R32F}, - { 96, 12, 32, 32, 32, 0, 0, 1, 2, -1, vlFalse, vlFalse, NULL, NULL, IMAGE_FORMAT_RGB323232F}, - { 128, 16, 32, 32, 32, 32, 0, 1, 2, 3, vlFalse, vlFalse, NULL, NULL, IMAGE_FORMAT_RGBA32323232F}, - {}, - {}, - {}, - { 32, 4, 0, 0, 0, 0, -1, -1, -1, -1, vlFalse, vlFalse, NULL, NULL, IMAGE_FORMAT_NV_NULL}, - { 8, 0, 0, 0, 0, 0, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_ATI2N}, - { 4, 0, 0, 0, 0, 0, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_ATI1N}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - {}, - { 8, 0, 0, 0, 0, 0, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_BC7}, + { + { 32, 4, 8, 8, 8, 8, 0, 1, 2, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGBA8888 }, + { 32, 4, 8, 8, 8, 8, 3, 2, 1, 0, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_ABGR8888 }, + { 24, 3, 8, 8, 8, 0, 0, 1, 2, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGB888 }, + { 24, 3, 8, 8, 8, 0, 2, 1, 0, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGR888 }, + { 16, 2, 5, 6, 5, 0, 0, 1, 2, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGB565 }, + { 8, 1, 8, 8, 8, 0, 0, -1, -1, -1, vlFalse, vlTrue, ToLuminance, FromLuminance, IMAGE_FORMAT_I8 }, + { 16, 2, 8, 8, 8, 8, 0, -1, -1, 1, vlFalse, vlTrue, ToLuminance, FromLuminance, IMAGE_FORMAT_IA88 }, + { 8, 1, 0, 0, 0, 0, -1, -1, -1, -1, vlFalse, vlFalse, NULL, NULL, IMAGE_FORMAT_P8 }, + { 8, 1, 0, 0, 0, 8, -1, -1, -1, 0, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_A8 }, + { 24, 3, 8, 8, 8, 8, 0, 1, 2, -1, vlFalse, vlTrue, ToBlueScreen, FromBlueScreen, IMAGE_FORMAT_RGB888_BLUESCREEN }, + { 24, 3, 8, 8, 8, 8, 2, 1, 0, -1, vlFalse, vlTrue, ToBlueScreen, FromBlueScreen, IMAGE_FORMAT_BGR888_BLUESCREEN }, + { 32, 4, 8, 8, 8, 8, 3, 0, 1, 2, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_ARGB8888 }, + { 32, 4, 8, 8, 8, 8, 2, 1, 0, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRA8888 }, + { 4, 0, 0, 0, 0, 0, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_DXT1 }, + { 8, 0, 0, 0, 0, 8, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_DXT3 }, + { 8, 0, 0, 0, 0, 8, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_DXT5 }, + { 32, 4, 8, 8, 8, 0, 2, 1, 0, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRX8888 }, + { 16, 2, 5, 6, 5, 0, 2, 1, 0, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGR565 }, + { 16, 2, 5, 5, 5, 0, 2, 1, 0, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRX5551 }, + { 16, 2, 4, 4, 4, 4, 2, 1, 0, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRA4444 }, + { 4, 0, 0, 0, 0, 1, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_DXT1_ONEBITALPHA }, + { 16, 2, 5, 5, 5, 1, 2, 1, 0, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_BGRA5551 }, + { 16, 2, 8, 8, 0, 0, 0, 1, -1, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_UV88 }, + { 32, 4, 8, 8, 8, 8, 0, 1, 2, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_UVWQ8888 }, + { 64, 8, 16, 16, 16, 16, 0, 1, 2, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGBA16161616F }, + { 64, 8, 16, 16, 16, 16, 0, 1, 2, 3, 
vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGBA16161616 }, + { 32, 4, 8, 8, 8, 8, 0, 1, 2, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_UVLX8888 }, + { 32, 4, 32, 0, 0, 0, 0, -1, -1, -1, vlFalse, vlFalse, NULL, NULL, IMAGE_FORMAT_R32F }, + { 96, 12, 32, 32, 32, 0, 0, 1, 2, -1, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGB323232F }, + { 128, 16, 32, 32, 32, 32, 0, 1, 2, 3, vlFalse, vlTrue, NULL, NULL, IMAGE_FORMAT_RGBA32323232F }, + {}, + {}, + {}, + { 32, 4, 0, 0, 0, 0, -1, -1, -1, -1, vlFalse, vlFalse, NULL, NULL, IMAGE_FORMAT_NV_NULL }, + { 8, 0, 0, 0, 0, 0, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_ATI2N }, + { 4, 0, 0, 0, 0, 0, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_ATI1N }, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + {}, + { 8, 0, 0, 0, 0, 0, -1, -1, -1, -1, vlTrue, vlTrue, NULL, NULL, IMAGE_FORMAT_BC7 }, }; // Get each channels shift and mask (for encoding and decoding). -template -vlVoid GetShiftAndMask(const SVTFImageConvertInfo& Info, T &uiRShift, T &uiGShift, T &uiBShift, T &uiAShift, T &uiRMask, T &uiGMask, T &uiBMask, T &uiAMask) +template +vlVoid GetShiftAndMask( const SVTFImageConvertInfo &Info, T &uiRShift, T &uiGShift, T &uiBShift, T &uiAShift, T &uiRMask, T &uiGMask, T &uiBMask, T &uiAMask ) { - if(Info.iR >= 0) + if ( Info.iR >= 0 ) { - if(Info.iG >= 0 && Info.iG < Info.iR) + if ( Info.iG >= 0 && Info.iG < Info.iR ) uiRShift += (T)Info.uiGBitsPerPixel; - if(Info.iB >= 0 && Info.iB < Info.iR) + if ( Info.iB >= 0 && Info.iB < Info.iR ) uiRShift += (T)Info.uiBBitsPerPixel; - if(Info.iA >= 0 && Info.iA < Info.iR) + if ( Info.iA >= 0 && Info.iA < Info.iR ) uiRShift += (T)Info.uiABitsPerPixel; - uiRMask = (T)(~0) >> (T)((sizeof(T) * 8) - Info.uiRBitsPerPixel); // Mask is for down shifted values. + uiRMask = (T)( ~0 ) >> (T)( ( sizeof( T ) * 8 ) - Info.uiRBitsPerPixel ); // Mask is for down shifted values. 
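+	// Worked example, assuming the IMAGE_FORMAT_BGRA4444 entry above
+	// (4/4/4/4 bits, iB = 0, iG = 1, iR = 2, iA = 3): these blocks yield
+	//   uiBShift = 0, uiGShift = 4, uiRShift = 8, uiAShift = 12, all masks = 0xF,
+	// so a packed texel decodes as B | G << 4 | R << 8 | A << 12, i.e. B sits in the
+	// least-significant nibble and A in the most-significant. The G, B and A blocks
+	// below follow the same pattern.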
} - if(Info.iG >= 0) + if ( Info.iG >= 0 ) { - if(Info.iR >= 0 && Info.iR < Info.iG) + if ( Info.iR >= 0 && Info.iR < Info.iG ) uiGShift += (T)Info.uiRBitsPerPixel; - if(Info.iB >= 0 && Info.iB < Info.iG) + if ( Info.iB >= 0 && Info.iB < Info.iG ) uiGShift += (T)Info.uiBBitsPerPixel; - if(Info.iA >= 0 && Info.iA < Info.iG) + if ( Info.iA >= 0 && Info.iA < Info.iG ) uiGShift += (T)Info.uiABitsPerPixel; - uiGMask = (T)(~0) >> (T)((sizeof(T) * 8) - Info.uiGBitsPerPixel); + uiGMask = (T)( ~0 ) >> (T)( ( sizeof( T ) * 8 ) - Info.uiGBitsPerPixel ); } - if(Info.iB >= 0) + if ( Info.iB >= 0 ) { - if(Info.iR >= 0 && Info.iR < Info.iB) + if ( Info.iR >= 0 && Info.iR < Info.iB ) uiBShift += (T)Info.uiRBitsPerPixel; - if(Info.iG >= 0 && Info.iG < Info.iB) + if ( Info.iG >= 0 && Info.iG < Info.iB ) uiBShift += (T)Info.uiGBitsPerPixel; - if(Info.iA >= 0 && Info.iA < Info.iB) + if ( Info.iA >= 0 && Info.iA < Info.iB ) uiBShift += (T)Info.uiABitsPerPixel; - uiBMask = (T)(~0) >> (T)((sizeof(T) * 8) - Info.uiBBitsPerPixel); + uiBMask = (T)( ~0 ) >> (T)( ( sizeof( T ) * 8 ) - Info.uiBBitsPerPixel ); } - if(Info.iA >= 0) + if ( Info.iA >= 0 ) { - if(Info.iR >= 0 && Info.iR < Info.iA) + if ( Info.iR >= 0 && Info.iR < Info.iA ) uiAShift += (T)Info.uiRBitsPerPixel; - if(Info.iG >= 0 && Info.iG < Info.iA) + if ( Info.iG >= 0 && Info.iG < Info.iA ) uiAShift += (T)Info.uiGBitsPerPixel; - if(Info.iB >= 0 && Info.iB < Info.iA) + if ( Info.iB >= 0 && Info.iB < Info.iA ) uiAShift += (T)Info.uiBBitsPerPixel; - uiAMask = (T)(~0) >> (T)((sizeof(T) * 8) - Info.uiABitsPerPixel); + uiAMask = (T)( ~0 ) >> (T)( ( sizeof( T ) * 8 ) - Info.uiABitsPerPixel ); } } // Downsample a channel. -template -T Shrink(T S, T SourceBits, T DestBits) +template +T Shrink( T S, T SourceBits, T DestBits ) { - if(SourceBits == 0 || DestBits == 0) + if ( SourceBits == 0 || DestBits == 0 ) return 0; - return S >> (SourceBits - DestBits); + return S >> ( SourceBits - DestBits ); } // Upsample a channel. -template -T Expand(T S, T SourceBits, T DestBits) +template +T Expand( T S, T SourceBits, T DestBits ) { - if(SourceBits == 0 || DestBits == 0) + if ( SourceBits == 0 || DestBits == 0 ) return 0; T D = 0; // Repeat source bit pattern as much as possible. - while(DestBits >= SourceBits) + while ( DestBits >= SourceBits ) { D <<= SourceBits; D |= S; @@ -3647,7 +4041,7 @@ T Expand(T S, T SourceBits, T DestBits) } // Add most significant part of source bit pattern to least significant part of dest bit pattern. - if(DestBits) + if ( DestBits ) { S >>= SourceBits - DestBits; D <<= DestBits; @@ -3658,33 +4052,33 @@ T Expand(T S, T SourceBits, T DestBits) } // Run custom transformation functions. -template -vlVoid Transform(TransformProc pTransform1, TransformProc pTransform2, T SR, T SG, T SB, T SA, T SRBits, T SGBits, T SBBits, T SABits, U& DR, U& DG, U& DB, U& DA, U DRBits, U DGBits, U DBBits, U DABits) +template +vlVoid Transform( TransformProc pTransform1, TransformProc pTransform2, T SR, T SG, T SB, T SA, T SRBits, T SGBits, T SBBits, T SABits, U &DR, U &DG, U &DB, U &DA, U DRBits, U DGBits, U DBBits, U DABits ) { vlUInt16 TR, TG, TB, TA; // Expand from source to 16 bits for transform functions. - SRBits && SRBits < 16 ? TR = (vlUInt16)Expand(SR, SRBits, 16) : TR = (vlUInt16)SR; - SGBits && SGBits < 16 ? TG = (vlUInt16)Expand(SG, SGBits, 16) : TG = (vlUInt16)SG; - SBBits && SBBits < 16 ? TB = (vlUInt16)Expand(SB, SBBits, 16) : TB = (vlUInt16)SB; - SABits && SABits < 16 ? 
TA = (vlUInt16)Expand(SA, SABits, 16) : TA = (vlUInt16)SA; + SRBits &&SRBits < 16 ? TR = (vlUInt16)Expand( SR, SRBits, 16 ) : TR = (vlUInt16)SR; + SGBits &&SGBits < 16 ? TG = (vlUInt16)Expand( SG, SGBits, 16 ) : TG = (vlUInt16)SG; + SBBits &&SBBits < 16 ? TB = (vlUInt16)Expand( SB, SBBits, 16 ) : TB = (vlUInt16)SB; + SABits &&SABits < 16 ? TA = (vlUInt16)Expand( SA, SABits, 16 ) : TA = (vlUInt16)SA; // Source transform then dest transform. - if(pTransform1) - pTransform1(TR, TG, TB, TA); - if(pTransform2) - pTransform2(TR, TG, TB, TA); + if ( pTransform1 ) + pTransform1( TR, TG, TB, TA ); + if ( pTransform2 ) + pTransform2( TR, TG, TB, TA ); // Shrink to dest from 16 bits. - DRBits && DRBits < 16 ? DR = (U)Shrink(TR, 16, (vlUInt16)DRBits) : DR = (U)TR; - DGBits && DGBits < 16 ? DG = (U)Shrink(TG, 16, (vlUInt16)DGBits) : DG = (U)TG; - DBBits && DBBits < 16 ? DB = (U)Shrink(TB, 16, (vlUInt16)DBBits) : DB = (U)TB; - DABits && DABits < 16 ? DA = (U)Shrink(TA, 16, (vlUInt16)DABits) : DA = (U)TA; + DRBits &&DRBits < 16 ? DR = (U)Shrink( TR, 16, (vlUInt16)DRBits ) : DR = (U)TR; + DGBits &&DGBits < 16 ? DG = (U)Shrink( TG, 16, (vlUInt16)DGBits ) : DG = (U)TG; + DBBits &&DBBits < 16 ? DB = (U)Shrink( TB, 16, (vlUInt16)DBBits ) : DB = (U)TB; + DABits &&DABits < 16 ? DA = (U)Shrink( TA, 16, (vlUInt16)DABits ) : DA = (U)TA; } // Convert source to dest using required storage requirments (hence the template). -template -vlBool ConvertTemplated(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, const SVTFImageConvertInfo& SourceInfo, const SVTFImageConvertInfo& DestInfo) +template +vlBool ConvertTemplated( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, const SVTFImageConvertInfo &SourceInfo, const SVTFImageConvertInfo &DestInfo ) { vlUInt16 uiSourceRShift = 0, uiSourceGShift = 0, uiSourceBShift = 0, uiSourceAShift = 0; vlUInt16 uiSourceRMask = 0, uiSourceGMask = 0, uiSourceBMask = 0, uiSourceAMask = 0; @@ -3692,125 +4086,221 @@ vlBool ConvertTemplated(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt vlUInt16 uiDestRShift = 0, uiDestGShift = 0, uiDestBShift = 0, uiDestAShift = 0; vlUInt16 uiDestRMask = 0, uiDestGMask = 0, uiDestBMask = 0, uiDestAMask = 0; - GetShiftAndMask(SourceInfo, uiSourceRShift, uiSourceGShift, uiSourceBShift, uiSourceAShift, uiSourceRMask, uiSourceGMask, uiSourceBMask, uiSourceAMask); - GetShiftAndMask(DestInfo, uiDestRShift, uiDestGShift, uiDestBShift, uiDestAShift, uiDestRMask, uiDestGMask, uiDestBMask, uiDestAMask); + GetShiftAndMask( SourceInfo, uiSourceRShift, uiSourceGShift, uiSourceBShift, uiSourceAShift, uiSourceRMask, uiSourceGMask, uiSourceBMask, uiSourceAMask ); + GetShiftAndMask( DestInfo, uiDestRShift, uiDestGShift, uiDestBShift, uiDestAShift, uiDestRMask, uiDestGMask, uiDestBMask, uiDestAMask ); - vlByte *lpSourceEnd = lpSource + (uiWidth * uiHeight * SourceInfo.uiBytesPerPixel); - for(; lpSource < lpSourceEnd; lpSource += SourceInfo.uiBytesPerPixel, lpDest += DestInfo.uiBytesPerPixel) + vlByte *lpSourceEnd = lpSource + ( uiWidth * uiHeight * SourceInfo.uiBytesPerPixel ); + for ( ; lpSource < lpSourceEnd; lpSource += SourceInfo.uiBytesPerPixel, lpDest += DestInfo.uiBytesPerPixel ) { // read source into single variable vlUInt i; T Source = 0; - for(i = 0; i < SourceInfo.uiBytesPerPixel; i++) + for ( i = 0; i < SourceInfo.uiBytesPerPixel; i++ ) { - Source |= (T)lpSource[i] << ((T)i * 8); + Source |= (T)lpSource[i] << ( (T)i * 8 ); } vlUInt16 SR = 0, SG = 0, SB = 0, SA = ~0; - vlUInt16 DR = 0, DG = 0, DB = 0, DA = ~0; // default 
values + vlUInt16 DR = 0, DG = 0, DB = 0, DA = ~0; // default values // read source values - if(uiSourceRMask) - SR = (vlUInt16)(Source >> (T)uiSourceRShift) & uiSourceRMask; // isolate R channel + if ( uiSourceRMask ) + SR = (vlUInt16)( Source >> (T)uiSourceRShift ) & uiSourceRMask; // isolate R channel - if(uiSourceGMask) - SG = (vlUInt16)(Source >> (T)uiSourceGShift) & uiSourceGMask; // isolate G channel + if ( uiSourceGMask ) + SG = (vlUInt16)( Source >> (T)uiSourceGShift ) & uiSourceGMask; // isolate G channel - if(uiSourceBMask) - SB = (vlUInt16)(Source >> (T)uiSourceBShift) & uiSourceBMask; // isolate B channel + if ( uiSourceBMask ) + SB = (vlUInt16)( Source >> (T)uiSourceBShift ) & uiSourceBMask; // isolate B channel - if(uiSourceAMask) - SA = (vlUInt16)(Source >> (T)uiSourceAShift) & uiSourceAMask; // isolate A channel + if ( uiSourceAMask ) + SA = (vlUInt16)( Source >> (T)uiSourceAShift ) & uiSourceAMask; // isolate A channel - if(SourceInfo.pFromTransform || DestInfo.pToTransform) + if ( SourceInfo.pFromTransform || DestInfo.pToTransform ) { // transform values - Transform(SourceInfo.pFromTransform, DestInfo.pToTransform, SR, SG, SB, SA, SourceInfo.uiRBitsPerPixel, SourceInfo.uiGBitsPerPixel, SourceInfo.uiBBitsPerPixel, SourceInfo.uiABitsPerPixel, DR, DG, DB, DA, DestInfo.uiRBitsPerPixel, DestInfo.uiGBitsPerPixel, DestInfo.uiBBitsPerPixel, DestInfo.uiABitsPerPixel); + Transform( SourceInfo.pFromTransform, DestInfo.pToTransform, SR, SG, SB, SA, SourceInfo.uiRBitsPerPixel, SourceInfo.uiGBitsPerPixel, SourceInfo.uiBBitsPerPixel, SourceInfo.uiABitsPerPixel, DR, DG, DB, DA, DestInfo.uiRBitsPerPixel, DestInfo.uiGBitsPerPixel, DestInfo.uiBBitsPerPixel, DestInfo.uiABitsPerPixel ); } else { // default value transform - if(uiSourceRMask && uiDestRMask) + if ( uiSourceRMask && uiDestRMask ) { - if(DestInfo.uiRBitsPerPixel < SourceInfo.uiRBitsPerPixel) // downsample - DR = Shrink(SR, SourceInfo.uiRBitsPerPixel, DestInfo.uiRBitsPerPixel); - else if(DestInfo.uiRBitsPerPixel > SourceInfo.uiRBitsPerPixel) // upsample - DR = Expand(SR, SourceInfo.uiRBitsPerPixel, DestInfo.uiRBitsPerPixel); + if ( DestInfo.uiRBitsPerPixel < SourceInfo.uiRBitsPerPixel ) // downsample + DR = Shrink( SR, SourceInfo.uiRBitsPerPixel, DestInfo.uiRBitsPerPixel ); + else if ( DestInfo.uiRBitsPerPixel > SourceInfo.uiRBitsPerPixel ) // upsample + DR = Expand( SR, SourceInfo.uiRBitsPerPixel, DestInfo.uiRBitsPerPixel ); else DR = SR; } - if(uiSourceGMask && uiDestGMask) + if ( uiSourceGMask && uiDestGMask ) { - if(DestInfo.uiGBitsPerPixel < SourceInfo.uiGBitsPerPixel) // downsample - DG = Shrink(SG, SourceInfo.uiGBitsPerPixel, DestInfo.uiGBitsPerPixel); - else if(DestInfo.uiGBitsPerPixel > SourceInfo.uiGBitsPerPixel) // upsample - DG = Expand(SG, SourceInfo.uiGBitsPerPixel, DestInfo.uiGBitsPerPixel); + if ( DestInfo.uiGBitsPerPixel < SourceInfo.uiGBitsPerPixel ) // downsample + DG = Shrink( SG, SourceInfo.uiGBitsPerPixel, DestInfo.uiGBitsPerPixel ); + else if ( DestInfo.uiGBitsPerPixel > SourceInfo.uiGBitsPerPixel ) // upsample + DG = Expand( SG, SourceInfo.uiGBitsPerPixel, DestInfo.uiGBitsPerPixel ); else DG = SG; } - if(uiSourceBMask && uiDestBMask) + if ( uiSourceBMask && uiDestBMask ) { - if(DestInfo.uiBBitsPerPixel < SourceInfo.uiBBitsPerPixel) // downsample - DB = Shrink(SB, SourceInfo.uiBBitsPerPixel, DestInfo.uiBBitsPerPixel); - else if(DestInfo.uiBBitsPerPixel > SourceInfo.uiBBitsPerPixel) // upsample - DB = Expand(SB, SourceInfo.uiBBitsPerPixel, DestInfo.uiBBitsPerPixel); + if ( DestInfo.uiBBitsPerPixel < 
SourceInfo.uiBBitsPerPixel ) // downsample + DB = Shrink( SB, SourceInfo.uiBBitsPerPixel, DestInfo.uiBBitsPerPixel ); + else if ( DestInfo.uiBBitsPerPixel > SourceInfo.uiBBitsPerPixel ) // upsample + DB = Expand( SB, SourceInfo.uiBBitsPerPixel, DestInfo.uiBBitsPerPixel ); else DB = SB; } - if(uiSourceAMask && uiDestAMask) + if ( uiSourceAMask && uiDestAMask ) { - if(DestInfo.uiABitsPerPixel < SourceInfo.uiABitsPerPixel) // downsample - DA = Shrink(SA, SourceInfo.uiABitsPerPixel, DestInfo.uiABitsPerPixel); - else if(DestInfo.uiABitsPerPixel > SourceInfo.uiABitsPerPixel) // upsample - DA = Expand(SA, SourceInfo.uiABitsPerPixel, DestInfo.uiABitsPerPixel); + if ( DestInfo.uiABitsPerPixel < SourceInfo.uiABitsPerPixel ) // downsample + DA = Shrink( SA, SourceInfo.uiABitsPerPixel, DestInfo.uiABitsPerPixel ); + else if ( DestInfo.uiABitsPerPixel > SourceInfo.uiABitsPerPixel ) // upsample + DA = Expand( SA, SourceInfo.uiABitsPerPixel, DestInfo.uiABitsPerPixel ); else DA = SA; } } // write source to single variable - U Dest = ((U)(DR & uiDestRMask) << (U)uiDestRShift) | ((U)(DG & uiDestGMask) << (U)uiDestGShift) | ((U)(DB & uiDestBMask) << (U)uiDestBShift) | ((U)(DA & uiDestAMask) << (U)uiDestAShift); - for(i = 0; i < DestInfo.uiBytesPerPixel; i++) + U Dest = ( (U)( DR & uiDestRMask ) << (U)uiDestRShift ) | ( (U)( DG & uiDestGMask ) << (U)uiDestGShift ) | ( (U)( DB & uiDestBMask ) << (U)uiDestBShift ) | ( (U)( DA & uiDestAMask ) << (U)uiDestAShift ); + for ( i = 0; i < DestInfo.uiBytesPerPixel; i++ ) { - lpDest[i] = (vlByte)((Dest >> ((T)i * 8)) & 0xff); + lpDest[i] = (vlByte)( ( Dest >> ( (T)i * 8 ) ) & 0xff ); } } return vlTrue; } -vlBool CVTFFile::Convert(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat, VTFImageFormat DestFormat) +vlBool CVTFFile::HALF_HDR_TO_LDR( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, const SVTFImageConvertInfo &SourceInfo, const SVTFImageConvertInfo &DestInfo ) +{ + int count = 0; + count += SourceInfo.uiRBitsPerPixel > 0; + count += SourceInfo.uiGBitsPerPixel > 0; + count += SourceInfo.uiBBitsPerPixel > 0; + count += SourceInfo.uiABitsPerPixel > 0; + + vlUInt srcSize = CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA16161616F ); + + vlUInt midSize = CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA32323232F ); + + // due to the nature of stbi__hdr_to_ldr + // tmp is freed inside the function. + // so no need to free it ourselves. + float *tmp = static_cast( malloc( midSize ) ); + + unsigned short *fp16Data = reinterpret_cast( lpSource ); + + float *tmpStart = tmp; + + for ( int i = 0; i < ( srcSize / sizeof( unsigned short ) ); i++ ) + { + *tmp = CVTFFile::FP16ToFP32( fp16Data[i] ); + tmp++; + } + + auto data = stbi__hdr_to_ldr( tmpStart, uiWidth, uiHeight, 4 ); + + if ( !data ) + return vlFalse; + + Convert( data, lpDest, uiWidth, uiHeight, count > 3 ? 
IMAGE_FORMAT_RGBA8888 : IMAGE_FORMAT_RGB888, DestInfo.Format ); + + stbi_image_free( data ); + return vlTrue; +} + +vlBool CVTFFile::HDR_TO_LDR( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, const SVTFImageConvertInfo &SourceInfo, const SVTFImageConvertInfo &DestInfo ) +{ + int count = 0; + count += SourceInfo.uiRBitsPerPixel > 0; + count += SourceInfo.uiGBitsPerPixel > 0; + count += SourceInfo.uiBBitsPerPixel > 0; + count += SourceInfo.uiABitsPerPixel > 0; + + vlUInt srcSize = CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, SourceInfo.Format ); + + // due to the nature of stbi__hdr_to_ldr + // tmp is freed inside the function. + // so no need to free it ourselves. + float *tmp = static_cast( malloc( srcSize ) ); + + memcpy( tmp, lpSource, srcSize ); + + auto data = stbi__hdr_to_ldr( reinterpret_cast( tmp ), uiWidth, uiHeight, count ); + + if ( !data ) + return vlFalse; + + Convert( data, lpDest, uiWidth, uiHeight, count > 3 ? IMAGE_FORMAT_RGBA8888 : IMAGE_FORMAT_RGB888, DestInfo.Format ); + + stbi_image_free( data ); + return vlTrue; +} + +vlBool CVTFFile::LDR_TO_HDR( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, const SVTFImageConvertInfo &SourceInfo, const SVTFImageConvertInfo &DestInfo ) { - assert(lpSource != 0); - assert(lpDest != 0); + int count = 0; + count += SourceInfo.uiRBitsPerPixel > 0; + count += SourceInfo.uiGBitsPerPixel > 0; + count += SourceInfo.uiBBitsPerPixel > 0; + count += SourceInfo.uiABitsPerPixel > 0; - assert(SourceFormat >= 0 && SourceFormat < IMAGE_FORMAT_COUNT); - assert(DestFormat >= 0 && DestFormat < IMAGE_FORMAT_COUNT); + vlUInt srcSize = CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, SourceInfo.Format ); - const SVTFImageConvertInfo& SourceInfo = VTFImageConvertInfo[SourceFormat]; - const SVTFImageConvertInfo& DestInfo = VTFImageConvertInfo[DestFormat]; + // due to the nature of stbi__ldr_to_hdr + // tmp is freed inside the function. + // so no need to free it ourselves. + vlByte *tmp = static_cast( malloc( srcSize ) ); - if(!SourceInfo.bIsSupported || !DestInfo.bIsSupported) + memcpy( tmp, lpSource, srcSize ); + + auto data = stbi__ldr_to_hdr( reinterpret_cast( tmp ), uiWidth, uiHeight, count ); + + if ( !data ) + return vlFalse; + + vlUInt destSize = CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, DestInfo.Format ); + + memcpy( lpDest, data, destSize ); + + stbi_image_free( data ); + return vlTrue; +} + +vlBool CVTFFile::Convert( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat, VTFImageFormat DestFormat ) +{ + assert( lpSource != 0 ); + assert( lpDest != 0 ); + + assert( SourceFormat >= 0 && SourceFormat < IMAGE_FORMAT_COUNT ); + assert( DestFormat >= 0 && DestFormat < IMAGE_FORMAT_COUNT ); + + const SVTFImageConvertInfo &SourceInfo = VTFImageConvertInfo[SourceFormat]; + const SVTFImageConvertInfo &DestInfo = VTFImageConvertInfo[DestFormat]; + + if ( !SourceInfo.bIsSupported || !DestInfo.bIsSupported ) { - LastError.Set("Image format conversion not supported."); + LastError.Set( "Image format conversion not supported." ); return vlFalse; } // Optimize common convertions. 
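+	// The branches below cover the cheap cases first: identical formats (straight
+	// memcpy), RGB888 <-> RGBA8888, and the half-float / float32 HDR conversions;
+	// everything else falls through to the BCn or templated per-channel paths
+	// further down. A typical call might look like this, where lpHalfData and
+	// lpPreview are hypothetical buffers pre-sized with ComputeImageSize():
+	//   CVTFFile::Convert( lpHalfData, lpPreview, uiW, uiH,
+	//                      IMAGE_FORMAT_RGBA16161616F, IMAGE_FORMAT_RGBA8888 );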
- if(SourceFormat == DestFormat) + if ( SourceFormat == DestFormat ) { - memcpy( lpDest, lpSource, CVTFFile::ComputeImageSize(uiWidth, uiHeight, 1, DestFormat)); + memcpy( lpDest, lpSource, CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, DestFormat ) ); return vlTrue; } - if(SourceFormat == IMAGE_FORMAT_RGB888 && DestFormat == IMAGE_FORMAT_RGBA8888) + if ( SourceFormat == IMAGE_FORMAT_RGB888 && DestFormat == IMAGE_FORMAT_RGBA8888 ) { - vlByte *lpLast = lpSource + CVTFFile::ComputeImageSize(uiWidth, uiHeight, 1, SourceFormat); - for(; lpSource < lpLast; lpSource += 3, lpDest += 4) + vlByte *lpLast = lpSource + CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, SourceFormat ); + for ( ; lpSource < lpLast; lpSource += 3, lpDest += 4 ) { lpDest[0] = lpSource[0]; lpDest[1] = lpSource[1]; @@ -3820,10 +4310,10 @@ vlBool CVTFFile::Convert(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUIn return vlTrue; } - if(SourceFormat == IMAGE_FORMAT_RGBA8888 && DestFormat == IMAGE_FORMAT_RGB888) + if ( SourceFormat == IMAGE_FORMAT_RGBA8888 && DestFormat == IMAGE_FORMAT_RGB888 ) { - vlByte *lpLast = lpSource + CVTFFile::ComputeImageSize(uiWidth, uiHeight, 1, SourceFormat); - for(; lpSource < lpLast; lpSource += 4, lpDest += 3) + vlByte *lpLast = lpSource + CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, SourceFormat ); + for ( ; lpSource < lpLast; lpSource += 4, lpDest += 3 ) { lpDest[0] = lpSource[0]; lpDest[1] = lpSource[1]; @@ -3832,42 +4322,162 @@ vlBool CVTFFile::Convert(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUIn return vlTrue; } + if ( SourceFormat == IMAGE_FORMAT_RGBA16161616F && ( DestFormat == IMAGE_FORMAT_RGBA32323232F || DestFormat == IMAGE_FORMAT_RGB323232F ) ) + { + unsigned short *lpSourceHFP = reinterpret_cast( lpSource ); + unsigned short *lpLastHFP = reinterpret_cast( lpSource + CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, SourceFormat ) ); + float *lpDestFP = reinterpret_cast( lpDest ); + + int channelCount = DestFormat == IMAGE_FORMAT_RGBA32323232F ? 4 : 3; + + for ( ; lpSourceHFP < lpLastHFP; lpSourceHFP += 4, lpDestFP += channelCount ) + { + lpDestFP[0] = FP16ToFP32( lpSourceHFP[0] ); + lpDestFP[1] = FP16ToFP32( lpSourceHFP[1] ); + lpDestFP[2] = FP16ToFP32( lpSourceHFP[2] ); + if ( channelCount == 4 ) + lpDestFP[3] = FP16ToFP32( lpSourceHFP[3] ); + } + return vlTrue; + } + + if ( SourceFormat == IMAGE_FORMAT_RGBA16161616F ) + { + auto lpIntermediateRGBA = new vlByte[CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888 )]; + + if ( !HALF_HDR_TO_LDR( lpSource, lpIntermediateRGBA, uiWidth, uiHeight, SourceInfo, DestInfo ) ) + { + delete[] lpIntermediateRGBA; + return vlFalse; + } + + vlBool didConvert = ConvertFromRGBA8888( lpIntermediateRGBA, lpDest, uiWidth, uiHeight, DestFormat ); + + delete[] lpIntermediateRGBA; + + return didConvert; + } + + if ( DestFormat == IMAGE_FORMAT_RGBA16161616F ) + { + if ( SourceFormat == IMAGE_FORMAT_RGBA32323232F || SourceFormat == IMAGE_FORMAT_RGB323232F ) + { + vlBool hasAlpha = SourceFormat == IMAGE_FORMAT_RGBA32323232F; + + float *lpSourceFP = reinterpret_cast( lpSource ); + float *lpLastFP = reinterpret_cast( lpSource + CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, SourceFormat ) ); + unsigned short *lpDestFP = reinterpret_cast( lpDest ); + + for ( ; lpSourceFP < lpLastFP; lpSourceFP += hasAlpha ? 
4 : 3, lpDestFP += 4 ) + { + float test = lpSourceFP[0]; + float test2 = CVTFFile::FP32ToFP16( lpSourceFP[0] ); + + lpDestFP[0] = CVTFFile::FP32ToFP16( lpSourceFP[0] ); + lpDestFP[1] = CVTFFile::FP32ToFP16( lpSourceFP[1] ); + lpDestFP[2] = CVTFFile::FP32ToFP16( lpSourceFP[2] ); + lpDestFP[3] = hasAlpha ? CVTFFile::FP32ToFP16( lpSourceFP[3] ) : CVTFFile::FP32ToFP16( 1.f ); + } + + return vlTrue; + } + + auto lpIntermediateRGBA = new vlByte[CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA32323232F )]; + + if ( !LDR_TO_HDR( lpSource, lpIntermediateRGBA, uiWidth, uiHeight, SourceInfo, VTFImageConvertInfo[IMAGE_FORMAT_RGBA32323232F] ) ) + { + delete[] lpIntermediateRGBA; + return vlFalse; + } + + vlBool didConvert = Convert( lpIntermediateRGBA, lpDest, uiWidth, uiHeight, IMAGE_FORMAT_RGBA32323232F, DestFormat ); + + delete[] lpIntermediateRGBA; + + return didConvert; + } + + if ( SourceFormat == IMAGE_FORMAT_RGBA32323232F && DestFormat == IMAGE_FORMAT_RGB323232F ) + { + float *lpSourceFP = reinterpret_cast( lpSource ); + float *lpLastFP = reinterpret_cast( lpSource + CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, SourceFormat ) ); + float *lpDestFP = reinterpret_cast( lpDest ); + + for ( ; lpSourceFP < lpLastFP; lpSourceFP += 4, lpDestFP += 3 ) + { + lpDestFP[0] = lpSourceFP[0]; + lpDestFP[1] = lpSourceFP[1]; + lpDestFP[2] = lpSourceFP[2]; + } + return vlTrue; + } + + if ( SourceFormat == IMAGE_FORMAT_RGB323232F && DestFormat == IMAGE_FORMAT_RGBA32323232F ) + { + float *lpSourceFP = reinterpret_cast( lpSource ); + float *lpLastFP = reinterpret_cast( lpSource + CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, SourceFormat ) ); + float *lpDestFP = reinterpret_cast( lpDest ); + + for ( ; lpSourceFP < lpLastFP; lpSourceFP += 3, lpDestFP += 4 ) + { + lpDestFP[0] = lpSourceFP[0]; + lpDestFP[1] = lpSourceFP[1]; + lpDestFP[2] = lpSourceFP[2]; + lpDestFP[3] = 1.f; + } + + return vlTrue; + } + + if ( DestFormat == IMAGE_FORMAT_RGBA32323232F || DestFormat == IMAGE_FORMAT_RGB323232F ) + { + auto lpRGBA8888Data = new vlByte[ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888 )]; + if ( !ConvertToRGBA8888( lpSource, lpRGBA8888Data, uiWidth, uiHeight, SourceFormat ) ) + { + delete[] lpRGBA8888Data; + return vlFalse; + } + + if ( DestFormat == IMAGE_FORMAT_RGBA32323232F ) + LDR_TO_HDR( lpRGBA8888Data, lpDest, uiWidth, uiHeight, VTFImageConvertInfo[IMAGE_FORMAT_RGBA8888], DestInfo ); + else + { + auto lpRGBAFP32Data = new vlByte[ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA32323232F )]; + + LDR_TO_HDR( lpRGBA8888Data, lpRGBAFP32Data, uiWidth, uiHeight, VTFImageConvertInfo[IMAGE_FORMAT_RGBA8888], VTFImageConvertInfo[IMAGE_FORMAT_RGBA32323232F] ); + + if ( !Convert( lpRGBAFP32Data, lpSource, uiWidth, uiHeight, IMAGE_FORMAT_RGBA32323232F, DestFormat ) ) + { + delete[] lpRGBAFP32Data; + delete[] lpRGBA8888Data; + return vlFalse; + } + + delete[] lpRGBAFP32Data; + } + + delete[] lpRGBA8888Data; + + return vlTrue; + } + // Do general convertions. 
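+	// General path: compressed formats are bounced through an RGBA8888 intermediate
+	// (DecompressBCn / CompressBCn), uncompressed formats dispatch to a
+	// ConvertTemplated() instantiation keyed on source and dest bytes per pixel, and
+	// float32 sources (12 or 16 bytes per pixel) that reach this point are handed to
+	// HDR_TO_LDR(), which relies on stb_image's HDR-to-LDR mapping. For example,
+	// DXT5 -> BGR888 becomes DecompressBCn() followed by a ConvertTemplated() pass
+	// from RGBA8888.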
- if(SourceInfo.bIsCompressed || DestInfo.bIsCompressed) + if ( SourceInfo.bIsCompressed || DestInfo.bIsCompressed ) { vlByte *lpSourceRGBA = lpSource; vlBool bResult = vlTrue; // allocate temp data for intermittent conversions - if(SourceFormat != IMAGE_FORMAT_RGBA8888) + if ( SourceFormat != IMAGE_FORMAT_RGBA8888 ) { - lpSourceRGBA = new vlByte[CVTFFile::ComputeImageSize(uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888)]; + lpSourceRGBA = new vlByte[CVTFFile::ComputeImageSize( uiWidth, uiHeight, 1, IMAGE_FORMAT_RGBA8888 )]; } // decompress the source or convert it to RGBA for compressing - switch(SourceFormat) - { - case IMAGE_FORMAT_RGBA8888: - break; - case IMAGE_FORMAT_DXT1: - case IMAGE_FORMAT_DXT1_ONEBITALPHA: - case IMAGE_FORMAT_DXT3: - case IMAGE_FORMAT_DXT5: - case IMAGE_FORMAT_ATI2N: - case IMAGE_FORMAT_ATI1N: - case IMAGE_FORMAT_BC7: - bResult = CVTFFile::DecompressBCn(lpSource, lpSourceRGBA, uiWidth, uiHeight, SourceFormat); - break; - default: - bResult = CVTFFile::Convert(lpSource, lpSourceRGBA, uiWidth, uiHeight, SourceFormat, IMAGE_FORMAT_RGBA8888); - break; - } - - if(bResult) + switch ( SourceFormat ) { - // compress the source or convert it to the dest format if it is not compressed - switch(DestFormat) - { + case IMAGE_FORMAT_RGBA8888: + break; case IMAGE_FORMAT_DXT1: case IMAGE_FORMAT_DXT1_ONEBITALPHA: case IMAGE_FORMAT_DXT3: @@ -3875,18 +4485,37 @@ vlBool CVTFFile::Convert(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUIn case IMAGE_FORMAT_ATI2N: case IMAGE_FORMAT_ATI1N: case IMAGE_FORMAT_BC7: - bResult = CVTFFile::CompressBCn(lpSourceRGBA, lpDest, uiWidth, uiHeight, DestFormat); + bResult = CVTFFile::DecompressBCn( lpSource, lpSourceRGBA, uiWidth, uiHeight, SourceFormat ); break; default: - bResult = CVTFFile::Convert(lpSourceRGBA, lpDest, uiWidth, uiHeight, IMAGE_FORMAT_RGBA8888, DestFormat); + bResult = CVTFFile::Convert( lpSource, lpSourceRGBA, uiWidth, uiHeight, SourceFormat, IMAGE_FORMAT_RGBA8888 ); break; + } + + if ( bResult ) + { + // compress the source or convert it to the dest format if it is not compressed + switch ( DestFormat ) + { + case IMAGE_FORMAT_DXT1: + case IMAGE_FORMAT_DXT1_ONEBITALPHA: + case IMAGE_FORMAT_DXT3: + case IMAGE_FORMAT_DXT5: + case IMAGE_FORMAT_ATI2N: + case IMAGE_FORMAT_ATI1N: + case IMAGE_FORMAT_BC7: + bResult = CVTFFile::CompressBCn( lpSourceRGBA, lpDest, uiWidth, uiHeight, DestFormat ); + break; + default: + bResult = CVTFFile::Convert( lpSourceRGBA, lpDest, uiWidth, uiHeight, IMAGE_FORMAT_RGBA8888, DestFormat ); + break; } } // free temp data - if(lpSourceRGBA != lpSource) + if ( lpSourceRGBA != lpSource ) { - delete []lpSourceRGBA; + delete[] lpSourceRGBA; } return bResult; @@ -3894,49 +4523,53 @@ vlBool CVTFFile::Convert(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUIn else { // convert from one variable order and bit format to another - if(SourceInfo.uiBytesPerPixel <= 1) + if ( SourceInfo.uiBytesPerPixel <= 1 ) + { + if ( DestInfo.uiBytesPerPixel <= 1 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 2 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 4 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 8 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + } + else if ( SourceInfo.uiBytesPerPixel <= 2 ) { - if(DestInfo.uiBytesPerPixel <= 1) 
- return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 2) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 4) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 8) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); + if ( DestInfo.uiBytesPerPixel <= 1 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 2 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 4 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 8 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); } - else if(SourceInfo.uiBytesPerPixel <= 2) + else if ( SourceInfo.uiBytesPerPixel <= 4 ) { - if(DestInfo.uiBytesPerPixel <= 1) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 2) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 4) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 8) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); + if ( DestInfo.uiBytesPerPixel <= 1 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 2 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 4 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 8 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); } - else if(SourceInfo.uiBytesPerPixel <= 4) + else if ( SourceInfo.uiBytesPerPixel <= 8 ) { - if(DestInfo.uiBytesPerPixel <= 1) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 2) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 4) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 8) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); + if ( DestInfo.uiBytesPerPixel <= 1 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 2 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 4 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); + else if ( DestInfo.uiBytesPerPixel <= 8 ) + return ConvertTemplated( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); } - else if(SourceInfo.uiBytesPerPixel <= 8) + else if ( SourceInfo.uiBytesPerPixel <= 16 ) { - if(DestInfo.uiBytesPerPixel <= 1) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 2) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, 
SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 4) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); - else if(DestInfo.uiBytesPerPixel <= 8) - return ConvertTemplated(lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo); + return HDR_TO_LDR( lpSource, lpDest, uiWidth, uiHeight, SourceInfo, DestInfo ); } return vlFalse; } @@ -3944,16 +4577,32 @@ vlBool CVTFFile::Convert(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUIn return vlFalse; } -vlBool CVTFFile::Resize(vlByte *lpSourceRGBA8888, vlByte *lpDestRGBA8888, vlUInt uiSourceWidth, vlUInt uiSourceHeight, vlUInt uiDestWidth, vlUInt uiDestHeight, VTFMipmapFilter ResizeFilter, vlBool bSRGB) +vlBool CVTFFile::Resize( vlByte *lpSourceRGBA8888, vlByte *lpDestRGBA8888, vlUInt uiSourceWidth, vlUInt uiSourceHeight, vlUInt uiDestWidth, vlUInt uiDestHeight, VTFMipmapFilter ResizeFilter, vlBool bSRGB ) +{ + assert( ResizeFilter >= 0 && ResizeFilter < MIPMAP_FILTER_COUNT ); + + if ( !stbir_resize_uint8_generic( + lpSourceRGBA8888, uiSourceWidth, uiSourceHeight, 0, + lpDestRGBA8888, uiDestWidth, uiDestHeight, 0, + 4, 3, 0, STBIR_EDGE_CLAMP, STBIR_FILTER_BOX, bSRGB ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, NULL ) ) + { + LastError.Set( "Error resizing image." ); + return vlFalse; + } + + return vlTrue; +} + +vlBool CVTFFile::ResizeFloat( vlByte *lpSourceRGBAFP32, vlByte *lpDestRGBFP32, vlUInt uiSourceWidth, vlUInt uiSourceHeight, vlUInt uiDestWidth, vlUInt uiDestHeight, VTFMipmapFilter ResizeFilter, vlBool bSRGB ) { - assert(ResizeFilter >= 0 && ResizeFilter < MIPMAP_FILTER_COUNT); + assert( ResizeFilter >= 0 && ResizeFilter < MIPMAP_FILTER_COUNT ); - if (!stbir_resize_uint8_generic( - lpSourceRGBA8888, uiSourceWidth, uiSourceHeight, 0, - lpDestRGBA8888, uiDestWidth, uiDestHeight, 0, - 4, 3, 0, STBIR_EDGE_CLAMP, STBIR_FILTER_BOX, bSRGB ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, NULL)) + if ( !stbir_resize_float_generic( + reinterpret_cast( lpSourceRGBAFP32 ), uiSourceWidth, uiSourceHeight, 0, + reinterpret_cast( lpDestRGBFP32 ), uiDestWidth, uiDestHeight, 0, + 4, 3, 0, STBIR_EDGE_CLAMP, STBIR_FILTER_BOX, bSRGB ? STBIR_COLORSPACE_SRGB : STBIR_COLORSPACE_LINEAR, NULL ) ) { - LastError.Set("Error resizing image."); + LastError.Set( "Error resizing image." ); return vlFalse; } @@ -3964,9 +4613,9 @@ vlBool CVTFFile::Resize(vlByte *lpSourceRGBA8888, vlByte *lpDestRGBA8888, vlUInt // CorrectImageGamma() // Do gamma correction on the image data. // -vlVoid CVTFFile::CorrectImageGamma(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight, vlSingle sGammaCorrection) +vlVoid CVTFFile::CorrectImageGamma( vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight, vlSingle sGammaCorrection ) { - if(sGammaCorrection == 1.0f) + if ( sGammaCorrection == 1.0f ) { return; } @@ -3976,15 +4625,15 @@ vlVoid CVTFFile::CorrectImageGamma(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, sGammaCorrection = 1.0f / sGammaCorrection; // Precalculate all possible gamma correction values. - for(vlUInt i = 0; i < 256; i++) + for ( vlUInt i = 0; i < 256; i++ ) { - bTable[i] = (vlByte)(pow((vlSingle)i / 255.0f, sGammaCorrection) * 255.0f); + bTable[i] = (vlByte)( pow( (vlSingle)i / 255.0f, sGammaCorrection ) * 255.0f ); } vlByte *lpImageDataRGBA8888End = lpImageDataRGBA8888 + uiWidth * uiHeight * 4; // Do gamma correction on RGB channels. 
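+	// For example, with sGammaCorrection = 2.2 the table above is built with an
+	// exponent of 1 / 2.2 ~= 0.4545, so bTable[128] ~= pow( 128 / 255.0f, 0.4545f ) * 255 ~= 186,
+	// while bTable[0] and bTable[255] stay pinned at 0 and 255; the loop below only
+	// rewrites the RGB channels and leaves alpha untouched.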
- for(; lpImageDataRGBA8888 < lpImageDataRGBA8888End; lpImageDataRGBA8888 += 4) + for ( ; lpImageDataRGBA8888 < lpImageDataRGBA8888End; lpImageDataRGBA8888 += 4 ) { lpImageDataRGBA8888[0] = bTable[lpImageDataRGBA8888[0]]; lpImageDataRGBA8888[1] = bTable[lpImageDataRGBA8888[1]]; @@ -3996,7 +4645,7 @@ vlVoid CVTFFile::CorrectImageGamma(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, // ComputeImageReflectivity() // Compute the image data reflectivity value. // -vlVoid CVTFFile::ComputeImageReflectivity(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight, vlSingle &sX, vlSingle &sY, vlSingle &sZ) +vlVoid CVTFFile::ComputeImageReflectivity( vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight, vlSingle &sX, vlSingle &sY, vlSingle &sZ ) { sX = sY = sZ = 0.0f; @@ -4006,9 +4655,9 @@ vlVoid CVTFFile::ComputeImageReflectivity(vlByte *lpImageDataRGBA8888, vlUInt ui // Precalculate all possible reflectivity values. // - for(vlUInt i = 0; i < 256; i++) + for ( vlUInt i = 0; i < 256; i++ ) { - sTable[i] = pow((vlSingle)i / 255.0f, 2.2f); + sTable[i] = pow( (vlSingle)i / 255.0f, 2.2f ); } // @@ -4036,13 +4685,13 @@ vlVoid CVTFFile::ComputeImageReflectivity(vlByte *lpImageDataRGBA8888, vlUInt ui vlSingle sTempX, sTempY, sTempZ, sInverse; - for(vlUInt j = 0; j < uiHeight; j++) + for ( vlUInt j = 0; j < uiHeight; j++ ) { sTempX = sTempY = sTempZ = 0.0f; - for(vlUInt i = 0; i < uiWidth; i++) + for ( vlUInt i = 0; i < uiWidth; i++ ) { - vlUInt uiIndex = (i + j * uiWidth) * 4; + vlUInt uiIndex = ( i + j * uiWidth ) * 4; sTempX += sTable[lpImageDataRGBA8888[uiIndex + 0]]; sTempY += sTable[lpImageDataRGBA8888[uiIndex + 1]]; @@ -4067,16 +4716,16 @@ vlVoid CVTFFile::ComputeImageReflectivity(vlByte *lpImageDataRGBA8888, vlUInt ui // FlipImage() // Flips image data over the X axis. // -vlVoid CVTFFile::FlipImage(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight) +vlVoid CVTFFile::FlipImage( vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight ) { vlUInt *lpImageData = (vlUInt *)lpImageDataRGBA8888; - for(vlUInt i = 0; i < uiWidth; i++) + for ( vlUInt i = 0; i < uiWidth; i++ ) { - for(vlUInt j = 0; j < uiHeight / 2; j++) + for ( vlUInt j = 0; j < uiHeight / 2; j++ ) { - vlUInt *pOne = lpImageData + (i + j * uiWidth); - vlUInt *pTwo = lpImageData + (i + (uiHeight - j - 1) * uiWidth); + vlUInt *pOne = lpImageData + ( i + j * uiWidth ); + vlUInt *pTwo = lpImageData + ( i + ( uiHeight - j - 1 ) * uiWidth ); vlUInt uiTemp = *pOne; *pOne = *pTwo; @@ -4089,16 +4738,16 @@ vlVoid CVTFFile::FlipImage(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt u // MirrorImage() // Flips image data over the Y axis. 
// -vlVoid CVTFFile::MirrorImage(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight) +vlVoid CVTFFile::MirrorImage( vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight ) { vlUInt *lpImageData = (vlUInt *)lpImageDataRGBA8888; - for(vlUInt i = 0; i < uiWidth / 2; i++) + for ( vlUInt i = 0; i < uiWidth / 2; i++ ) { - for(vlUInt j = 0; j < uiHeight; j++) + for ( vlUInt j = 0; j < uiHeight; j++ ) { - vlUInt *pOne = lpImageData + (i + j * uiWidth); - vlUInt *pTwo = lpImageData + ((uiWidth - i - 1) + j * uiWidth); + vlUInt *pOne = lpImageData + ( i + j * uiWidth ); + vlUInt *pTwo = lpImageData + ( ( uiWidth - i - 1 ) + j * uiWidth ); vlUInt uiTemp = *pOne; *pOne = *pTwo; @@ -4111,8 +4760,8 @@ vlVoid CVTFFile::MirrorImage(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt // ConvertInPlace // Convert the image to format in place // -vlBool CVTFFile::ConvertInPlace(VTFImageFormat format) -{ +vlBool CVTFFile::ConvertInPlace( VTFImageFormat format ) +{ const vlUInt uiSrcWidth = GetWidth(); const vlUInt uiSrcHeight = GetHeight(); const vlUInt uiSrcDepth = GetDepth(); @@ -4122,44 +4771,44 @@ vlBool CVTFFile::ConvertInPlace(VTFImageFormat format) const vlUInt uiSliceCount = GetDepth(); // Compute and allocate a working buffer- will replace lpImageData at the end - const vlUInt usBufferSize = this->ComputeImageSize(this->Header->Width, this->Header->Height, uiMipCount, format) * uiFrameCount * uiFaceCount; - auto* buffer = new vlByte[usBufferSize]; + const vlUInt usBufferSize = this->ComputeImageSize( this->Header->Width, this->Header->Height, uiMipCount, format ) * uiFrameCount * uiFaceCount; + auto *buffer = new vlByte[usBufferSize]; // Holy sweet mother of nested loops... - for(vlUInt uiFrame = 0; uiFrame < uiFrameCount; ++uiFrame) + for ( vlUInt uiFrame = 0; uiFrame < uiFrameCount; ++uiFrame ) { - for(vlUInt uiFace = 0; uiFace < uiFaceCount; ++uiFace) + for ( vlUInt uiFace = 0; uiFace < uiFaceCount; ++uiFace ) { - for(vlUInt uiSlice = 0; uiSlice < uiSliceCount; ++uiSlice) + for ( vlUInt uiSlice = 0; uiSlice < uiSliceCount; ++uiSlice ) { - for(vlUInt uiMip = 0; uiMip < uiMipCount; ++uiMip) + for ( vlUInt uiMip = 0; uiMip < uiMipCount; ++uiMip ) { - auto* lpSrcData = GetData(uiFrame, uiFace, uiSlice, uiMip); + auto *lpSrcData = GetData( uiFrame, uiFace, uiSlice, uiMip ); - const vlUInt uiOffset = ComputeDataOffset(uiFrame, uiFace, uiSlice, uiMip, format); - assert(uiOffset < usBufferSize); + const vlUInt uiOffset = ComputeDataOffset( uiFrame, uiFace, uiSlice, uiMip, format ); + assert( uiOffset < usBufferSize ); + + auto *lpDstData = (vlByte *)buffer + uiOffset; - auto* lpDstData = (vlByte*)buffer + uiOffset; - vlUInt uiMipWidth, uiMipHeight, uiMipDepth; - ComputeMipmapDimensions(uiSrcWidth, uiSrcHeight, uiSrcDepth, uiMip, uiMipWidth, uiMipHeight, uiMipDepth); - if (!Convert(lpSrcData, lpDstData, uiMipWidth, uiMipHeight, GetFormat(), format)) + ComputeMipmapDimensions( uiSrcWidth, uiSrcHeight, uiSrcDepth, uiMip, uiMipWidth, uiMipHeight, uiMipDepth ); + if ( !Convert( lpSrcData, lpDstData, uiMipWidth, uiMipHeight, GetFormat(), format ) ) { - delete [] buffer; + delete[] buffer; return false; } } } } } - + // Recompute image buffer size this->uiImageBufferSize = usBufferSize; - - auto* oldData = this->lpImageData; + + auto *oldData = this->lpImageData; this->lpImageData = buffer; this->Header->ImageFormat = format; - - delete [] oldData; + + delete[] oldData; return true; } diff --git a/VTFLib/VTFFile.h b/VTFLib/VTFFile.h index dde56e5..f7f5efb 100644 --- a/VTFLib/VTFFile.h +++ 
b/VTFLib/VTFFile.h @@ -20,102 +20,122 @@ #ifndef VTFFILE_H #define VTFFILE_H -#include "stdafx.h" #include "Readers.h" -#include "Writers.h" #include "VTFFormat.h" +#include "Writers.h" +#include "stdafx.h" #ifdef __cplusplus -extern "C" { +extern "C" +{ #endif //! VTFImageFormat info struct. -/*! +/*! The SVTFImageFormatInfo struct provides information on VTF image formats. \see VTFImageFormat */ -#pragma pack(1) -typedef struct tagSVTFImageFormatInfo -{ - const vlChar *lpName; //!< Enumeration text equivalent. - vlUInt uiBitsPerPixel; //!< Format bits per pixel. - vlUInt uiBytesPerPixel; //!< Format bytes per pixel. - vlUInt uiRedBitsPerPixel; //!< Format red bits per pixel. 0 for N/A. - vlUInt uiGreenBitsPerPixel; //!< Format green bits per pixel. 0 for N/A. - vlUInt uiBlueBitsPerPixel; //!< Format blue bits per pixel. 0 for N/A. - vlUInt uiAlphaBitsPerPixel; //!< Format alpha bits per pixel. 0 for N/A. - vlBool bIsCompressed; //!< Format is compressed (DXT). - vlBool bIsSupported; //!< Format is supported by VTFLib. -} SVTFImageFormatInfo; +#pragma pack( 1 ) + typedef struct tagSVTFImageFormatInfo + { + const vlChar *lpName; //!< Enumeration text equivalent. + vlUInt uiBitsPerPixel; //!< Format bits per pixel. + vlUInt uiBytesPerPixel; //!< Format bytes per pixel. + vlUInt uiRedBitsPerPixel; //!< Format red bits per pixel. 0 for N/A. + vlUInt uiGreenBitsPerPixel; //!< Format green bits per pixel. 0 for N/A. + vlUInt uiBlueBitsPerPixel; //!< Format blue bits per pixel. 0 for N/A. + vlUInt uiAlphaBitsPerPixel; //!< Format alpha bits per pixel. 0 for N/A. + vlBool bIsCompressed; //!< Format is compressed (DXT). + vlBool bIsSupported; //!< Format is supported by VTFLib. + } SVTFImageFormatInfo; #pragma pack() + typedef vlVoid ( *TransformProc )( vlUInt16 &R, vlUInt16 &G, vlUInt16 &B, vlUInt16 &A ); + + typedef struct tagSVTFImageConvertInfo + { + vlUInt uiBitsPerPixel; // Format bytes per pixel. + vlUInt uiBytesPerPixel; // Format bytes per pixel. + vlUInt uiRBitsPerPixel; // Format conversion red bits per pixel. 0 for N/A. + vlUInt uiGBitsPerPixel; // Format conversion green bits per pixel. 0 for N/A. + vlUInt uiBBitsPerPixel; // Format conversion blue bits per pixel. 0 for N/A. + vlUInt uiABitsPerPixel; // Format conversion alpha bits per pixel. 0 for N/A. + vlInt iR; // "Red" index. + vlInt iG; // "Green" index. + vlInt iB; // "Blue" index. + vlInt iA; // "Alpha" index. + vlBool bIsCompressed; // Format is compressed (DXT). + vlBool bIsSupported; // Format is supported by VTFLib. + TransformProc pToTransform; // Custom transform to function. + TransformProc pFromTransform; // Custom transform from function. + VTFImageFormat Format; + } SVTFImageConvertInfo; + //! VTF Creation options struct. -/*! +/*! The SVTFCreateOptions struct defines options and settings to be used when creating VTF images with methods such as CVTFFile::Create(). \see CVTFFile::Create() */ -#pragma pack(1) -typedef struct tagSVTFCreateOptions -{ - vlUInt uiVersion[2]; //!< Output image version. - VTFImageFormat ImageFormat; //!< Output image output storage format. +#pragma pack( 1 ) + typedef struct tagSVTFCreateOptions + { + vlUInt uiVersion[2]; //!< Output image version. + VTFImageFormat ImageFormat; //!< Output image output storage format. - vlUInt uiFlags; //!< Output image header flags. - vlUInt uiStartFrame; //!< Output image start frame. - vlSingle sBumpScale; //!< Output image bump scale. - vlSingle sReflectivity[3]; //!< Output image reflectivity. (Only used if bReflectivity is false.) 
+ vlUInt uiFlags; //!< Output image header flags. + vlUInt uiStartFrame; //!< Output image start frame. + vlSingle sBumpScale; //!< Output image bump scale. + vlSingle sReflectivity[3]; //!< Output image reflectivity. (Only used if bReflectivity is false.) - vlBool bMipmaps; //!< Generate MIPmaps. (Space is always allocated.) - VTFMipmapFilter MipmapFilter; //!< MIP map re-size filter. + vlBool bMipmaps; //!< Generate MIPmaps. (Space is always allocated.) + VTFMipmapFilter MipmapFilter; //!< MIP map re-size filter. - vlBool bThumbnail; //!< Generate thumbnail image. - vlBool bReflectivity; //!< Compute image reflectivity. + vlBool bThumbnail; //!< Generate thumbnail image. + vlBool bReflectivity; //!< Compute image reflectivity. - vlBool bResize; //!< Resize the input image. - VTFResizeMethod ResizeMethod; //!< New size compution method. - VTFMipmapFilter ResizeFilter; //!< Re-size filter. - vlUInt uiResizeWidth; //!< New width after re-size if method is RESIZE_SET. - vlUInt uiResizeHeight; //!< New height after re-size if method is RESIZE_SET. + vlBool bResize; //!< Resize the input image. + VTFResizeMethod ResizeMethod; //!< New size compution method. + VTFMipmapFilter ResizeFilter; //!< Re-size filter. + vlUInt uiResizeWidth; //!< New width after re-size if method is RESIZE_SET. + vlUInt uiResizeHeight; //!< New height after re-size if method is RESIZE_SET. - vlBool bResizeClamp; //!< Clamp re-size size. - vlUInt uiResizeClampWidth; //!< Maximum width to re-size to. - vlUInt uiResizeClampHeight; //!< Maximum height to re-size to. + vlBool bResizeClamp; //!< Clamp re-size size. + vlUInt uiResizeClampWidth; //!< Maximum width to re-size to. + vlUInt uiResizeClampHeight; //!< Maximum height to re-size to. - vlBool bGammaCorrection; //!< Gamma correct input image. - vlSingle sGammaCorrection; //!< Gamma correction to apply. + vlBool bGammaCorrection; //!< Gamma correct input image. + vlSingle sGammaCorrection; //!< Gamma correction to apply. - vlBool bSphereMap; //!< Generate a sphere map for six faced environment maps. - vlBool bSRGB; //!< Texture is in the SRGB color space. -} SVTFCreateOptions; + vlBool bSphereMap; //!< Generate a sphere map for six faced environment maps. + vlBool bSRGB; //!< Texture is in the SRGB color space. + } SVTFCreateOptions; #pragma pack() -//! VTF Init options struct. -/*! - The SVTFCreateOptions struct defines basic parameters of the texture + //! VTF Init options struct. + /*! + The SVTFCreateOptions struct defines basic parameters of the texture - \see CVTFFile::Init() -*/ -typedef struct tagSVTFInitOptions -{ - vlUInt uiWidth; - vlUInt uiHeight; - vlUInt uiSlices; - - vlUInt uiFrames; - vlUInt uiFaces; - - VTFImageFormat ImageFormat; - - vlBool bThumbnail; - vlUInt nMipMaps; - - vlBool bNullImageData; -} SVTFInitOptions; + \see CVTFFile::Init() + */ + typedef struct tagSVTFInitOptions + { + vlUInt uiWidth; + vlUInt uiHeight; + vlUInt uiSlices; + + vlUInt uiFrames; + vlUInt uiFaces; + + VTFImageFormat ImageFormat; + vlBool bThumbnail; + vlUInt nMipMaps; + vlBool bNullImageData; + } SVTFInitOptions; #ifdef __cplusplus } @@ -135,7 +155,7 @@ namespace VTFLib in short, uncompressed 32-bit image data. There are functions for converting the data to other formats internally, however for image creation you are probably sticking best with RGBA8888 for simplicity. - + The majority of functions return a vlBool value. This is simply a test as to whether a function has succeeded or failed to execute properly. 
In the case of functions for checking flags, the vlBool indicates @@ -145,18 +165,16 @@ namespace VTFLib class VTFLIB_API CVTFFile { private: + SVTFHeader *Header; // VTF header - SVTFHeader *Header; // VTF header - - vlUInt uiImageBufferSize; // Size of VTF image data buffer - vlByte *lpImageData; // VTF image buffer + vlUInt uiImageBufferSize; // Size of VTF image data buffer + vlByte *lpImageData; // VTF image buffer - vlUInt uiThumbnailBufferSize; // Size of VTF thumbnail image data buffer - vlByte *lpThumbnailImageData; // VTF thumbnail image buffer + vlUInt uiThumbnailBufferSize; // Size of VTF thumbnail image data buffer + vlByte *lpThumbnailImageData; // VTF thumbnail image buffer public: - - CVTFFile(); //!< Default constructor + CVTFFile(); //!< Default constructor //! Create a new VTFFile class as a copy of another. /*! @@ -165,7 +183,7 @@ namespace VTFLib \param VTFFile is the CVTFFile class you want to copy. */ - CVTFFile(const CVTFFile &VTFFile); + CVTFFile( const CVTFFile &VTFFile ); //! Create a new VTFFile class as a duplicate of another. /*! @@ -175,16 +193,15 @@ namespace VTFLib \param VTFFile is the CVTFFile class you want to copy. \param ImageFormat the format you want to convert the copied image data to. */ - CVTFFile(const CVTFFile &VTFFile, VTFImageFormat ImageFormat); + CVTFFile( const CVTFFile &VTFFile, VTFImageFormat ImageFormat ); - ~CVTFFile(); //!< Deconstructor + ~CVTFFile(); //!< Deconstructor public: - //! Inits a new empty VTF image /*! Inits a new empty VTF image. This is almost the same as the old Create function, but with a new name and takes total mip count - + \param uiWidth is the width in pixels of the main VTF image. \param uiHeight is the height in pixels of the main VTF image. \param uiFrames is the number of frames in the VTF image (default 1). @@ -195,16 +212,16 @@ namespace VTFLib \param bThumbnail sets if the VTF image will contain an additional thumbnail (default true). \param bNullImageData sets if the image data should be zero'd out on creation (default false). */ - vlBool Init(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames = 1, vlUInt uiFaces = 1, vlUInt uiSlices = 1, VTFImageFormat ImageFormat = IMAGE_FORMAT_RGBA8888, vlBool bThumbnail = vlTrue, vlInt nMipmaps = -1, vlBool bNullImageData = vlFalse); - + vlBool Init( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames = 1, vlUInt uiFaces = 1, vlUInt uiSlices = 1, VTFImageFormat ImageFormat = IMAGE_FORMAT_RGBA8888, vlBool bThumbnail = vlTrue, vlInt nMipmaps = -1, vlBool bNullImageData = vlFalse ); + //! Inits a new empty VTF image /*! Inits a new empty VTF image. Same as the other variant of Init but takes a struct as a param - + \param initOpts is a struct containing init options for the texture \see tagSVTFInitOptions */ - vlBool Init(const SVTFInitOptions& initOpts); + vlBool Init( const SVTFInitOptions &initOpts ); //! Creates a new empty VTF image.. /*! @@ -223,13 +240,13 @@ namespace VTFLib \note Animated and static textures have 1 face. Cubemaps have 6, one for each side of the cube. 
\see tagSVTFCreateOptions */ - [[deprecated]] vlBool Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames = 1, vlUInt uiFaces = 1, vlUInt uiSlices = 1, VTFImageFormat ImageFormat = IMAGE_FORMAT_RGBA8888, vlBool bThumbnail = vlTrue, vlBool bMipmaps = vlTrue, vlBool bNullImageData = vlFalse); + [[deprecated]] vlBool Create( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames = 1, vlUInt uiFaces = 1, vlUInt uiSlices = 1, VTFImageFormat ImageFormat = IMAGE_FORMAT_RGBA8888, vlBool bThumbnail = vlTrue, vlBool bMipmaps = vlTrue, vlBool bNullImageData = vlFalse ); //! Create a new VTF image from existing data. /*! Creates a new VTF image using image data already stored in memory. The existing image data should be stored in RGBA8888 format. - + \param uiWidth is the width in pixels of the main VTF image. \param uiHeight is the height in pixels of the main VTF image. \param lpImageDataRGBA8888 is a pointer to the source RGBA8888 data. @@ -237,13 +254,13 @@ namespace VTFLib \return true on successful creation, otherwise false. \see tagSVTFCreateOptions */ - vlBool Create(vlUInt uiWidth, vlUInt uiHeight, vlByte *lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions); + vlBool Create( vlUInt uiWidth, vlUInt uiHeight, vlByte *lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions ); //! Create a new VTF multi-frame or cubemap image from existing data. /*! Creates a new multi-frame or cubemap VTF image using image data already stored in memory. The existing image data should be stored in RGBA8888 format. - + \param uiWidth is the width in pixels of the main VTF image. \param uiHeight is the height in pixels of the main VTF image. \param uiFrames is the number of frames in the VTF image. @@ -255,8 +272,8 @@ namespace VTFLib \note Animated and static textures have 1 face. Cubemaps have 6, one for each side of the cube. \see tagSVTFCreateOptions */ - vlBool Create(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt vlSlices, vlByte **lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions); - + vlBool Create( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt vlSlices, vlByte **lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions ); + //! Destroys the current VTF image by setting the header, thumbnail and image data to zero. vlVoid Destroy(); @@ -279,9 +296,9 @@ namespace VTFLib \param bHeaderOnly sets whether to load just the VTF header or not (default false). \return true on sucessful load, otherwise false. */ - vlBool Load(const vlChar *cFileName, vlBool bHeaderOnly = vlFalse); + vlBool Load( const vlChar *cFileName, vlBool bHeaderOnly = vlFalse ); - //! Loads a VTF image from memory. + //! Loads a VTF image from memory. /*! Loads a VTF image file stored in memory into the current VTFFile class. You may choose to load just the header only if you want to get info about the file @@ -292,9 +309,9 @@ namespace VTFLib \param bHeaderOnly sets whether to load just the VTF header or not (default false). \return true on sucessful load, otherwise false. */ - vlBool Load(const vlVoid *lpData, vlUInt uiBufferSize, vlBool bHeaderOnly = vlFalse); + vlBool Load( const vlVoid *lpData, vlUInt uiBufferSize, vlBool bHeaderOnly = vlFalse ); - //! Loads a VTF image using callback functions. + //! Loads a VTF image using callback functions. /*! Loads a VTF image file into the current VTFFile class. 
You may choose to load just the header only if you want to get info about the file @@ -304,7 +321,7 @@ namespace VTFLib \param bHeaderOnly sets whether to load just the VTF header or not (default false). \return true on sucessful load, otherwise false. */ - vlBool Load(vlVoid *pUserData, vlBool bHeaderOnly = vlFalse); + vlBool Load( vlVoid *pUserData, vlBool bHeaderOnly = vlFalse ); //! Save a VTF image from disk. /*! @@ -313,7 +330,7 @@ namespace VTFLib \param cFileName is the path and filename of the file to load. \return true on sucessful save, otherwise false. */ - vlBool Save(const vlChar *cFileName) const; + vlBool Save( const vlChar *cFileName ) const; //! Save a VTF image to memory. /*! @@ -323,7 +340,7 @@ namespace VTFLib \param uiBufferSize is the size of the VTF file in bytes. \return true on sucessful save, otherwise false. */ - vlBool Save(vlVoid *lpData, vlUInt uiBufferSize, vlUInt &uiSize) const; + vlBool Save( vlVoid *lpData, vlUInt uiBufferSize, vlUInt &uiSize ) const; //! Save a VTF image using callback functions. /*! @@ -332,54 +349,53 @@ namespace VTFLib \param pUserData is a pointer to custom user data. \return true on sucessful save, otherwise false. */ - vlBool Save(vlVoid *pUserData) const; - + vlBool Save( vlVoid *pUserData ) const; + //! Convert the internal storage of the VTF to the specified format - vlBool ConvertInPlace(VTFImageFormat format); + vlBool ConvertInPlace( VTFImageFormat format ); private: - vlBool IsPowerOfTwo(vlUInt uiSize); - vlUInt NextPowerOfTwo(vlUInt uiSize); + vlBool IsPowerOfTwo( vlUInt uiSize ); + vlUInt NextPowerOfTwo( vlUInt uiSize ); - vlVoid ComputeResources(); //!< Computes header VTF directory resources. + vlVoid ComputeResources(); //!< Computes header VTF directory resources. // Interface with out reader/writer classes - vlBool Load(IO::Readers::IReader *Reader, vlBool bHeaderOnly); - vlBool Save(IO::Writers::IWriter *Writer) const; + vlBool Load( IO::Readers::IReader *Reader, vlBool bHeaderOnly ); + vlBool Save( IO::Writers::IWriter *Writer ) const; // Saves a VTF with a given compression level - vlBool SaveCompressed(IO::Writers::IWriter* Writer, vlInt iCompressionLevel) const; + vlBool SaveCompressed( IO::Writers::IWriter *Writer, vlInt iCompressionLevel ) const; public: - //! Check if image data has been loaded. /*! Check to see if the image buffer has data in it. If a VTF file was loaded into the class with the bHeaderOnly option, this will return false. - + \return true if image data is present, otherwise false. */ vlBool GetHasImage() const; - vlUInt GetMajorVersion() const; //!< Returns the VTF file major version number. - vlUInt GetMinorVersion() const; //!< Returns the VTF file minor version number. - bool SetVersion(vlUInt major, vlUInt minor); - - vlUInt GetSize() const; //!< Returns the VTF file size in bytes. + vlUInt GetMajorVersion() const; //!< Returns the VTF file major version number. + vlUInt GetMinorVersion() const; //!< Returns the VTF file minor version number. + bool SetVersion( vlUInt major, vlUInt minor ); + + vlUInt GetSize() const; //!< Returns the VTF file size in bytes. - vlUInt GetWidth() const; //!< Returns the width of the image in pixels from the VTF header. - vlUInt GetHeight() const; //!< Returns the height of the image in pixels from the VTF header. - vlUInt GetDepth() const; //!< Returns the depth of the image in pixels from the VTF header. + vlUInt GetWidth() const; //!< Returns the width of the image in pixels from the VTF header. 
+ vlUInt GetHeight() const; //!< Returns the height of the image in pixels from the VTF header. + vlUInt GetDepth() const; //!< Returns the depth of the image in pixels from the VTF header. - vlUInt GetFrameCount() const; //!< Returns the frame count from the VTF header. - vlUInt GetFaceCount() const; //!< Returns the face count from the VTF header. - vlUInt GetMipmapCount() const; //!< Returns the number of MIP levels in the image from the VTF header. + vlUInt GetFrameCount() const; //!< Returns the frame count from the VTF header. + vlUInt GetFaceCount() const; //!< Returns the face count from the VTF header. + vlUInt GetMipmapCount() const; //!< Returns the number of MIP levels in the image from the VTF header. - vlUInt GetStartFrame() const; //!< Returns the start frame from the VTF header. - vlVoid SetStartFrame(vlUInt uiStartFrame); //!< Sets the start frame in the VTF header. + vlUInt GetStartFrame() const; //!< Returns the start frame from the VTF header. + vlVoid SetStartFrame( vlUInt uiStartFrame ); //!< Sets the start frame in the VTF header. - vlUInt GetFlags() const; //!< Returns the image flags from the VTF header. - vlVoid SetFlags(vlUInt uiFlags); //!< Sets the image flags in the VTF header. + vlUInt GetFlags() const; //!< Returns the image flags from the VTF header. + vlVoid SetFlags( vlUInt uiFlags ); //!< Sets the image flags in the VTF header. //! Check if a specific flag is set in the VTF header. /*! @@ -388,7 +404,7 @@ namespace VTFLib \param ImageFlag is the flag you wish to check for. \return true if the flag is set, otherwise false. */ - vlBool GetFlag(VTFImageFlag ImageFlag) const; + vlBool GetFlag( VTFImageFlag ImageFlag ) const; //! Set the state of a specific flag in the VTF header. /*! @@ -397,16 +413,16 @@ namespace VTFLib \param ImageFlag is the flag you wish to set. \param bState is the state you wish to set for the flag. */ - vlVoid SetFlag(VTFImageFlag ImageFlag, vlBool bState); + vlVoid SetFlag( VTFImageFlag ImageFlag, vlBool bState ); + + vlSingle GetBumpmapScale() const; //!< Get the bump scale value. - vlSingle GetBumpmapScale() const; //!< Get the bump scale value. - //! Set the bump scale value. /*! Sets the bump scale in the VTF header to the given floating point value. \param sBumpmapScale is the scale value to set. */ - vlVoid SetBumpmapScale(vlSingle sBumpmapScale); + vlVoid SetBumpmapScale( vlSingle sBumpmapScale ); //! Get the reflectivity values. /*! @@ -414,7 +430,7 @@ namespace VTFLib \param sX, sY, sZ are the variables to hold the values reflectivity vector. */ - vlVoid GetReflectivity(vlSingle &sX, vlSingle &sY, vlSingle &sZ) const; + vlVoid GetReflectivity( vlSingle &sX, vlSingle &sY, vlSingle &sZ ) const; //! Set the reflectivity values. /*! @@ -422,10 +438,10 @@ namespace VTFLib \param sX, sY, sZ are the values for each reflectivity vector axis. */ - vlVoid SetReflectivity(vlSingle sX, vlSingle sY, vlSingle sZ); + vlVoid SetReflectivity( vlSingle sX, vlSingle sY, vlSingle sZ ); + + VTFImageFormat GetFormat() const; //!< Returns the storage format of the main image data set in the VTF header. - VTFImageFormat GetFormat() const; //!< Returns the storage format of the main image data set in the VTF header. - //! Get a pointer to the image data for a specific image. /*! Returns a pointer to the image data for a given frame, face and MIP level. @@ -439,13 +455,13 @@ namespace VTFLib at index 0 for the largest image moving down in size. 
\see GetFormat() */ - vlByte *GetData(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel) const; - + vlByte *GetData( vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel ) const; + //! Set the image data for a specific image. /*! Sets the image data for a given frame, face and MIP level. The source image data pointed to by lpData must be in the format specified in the VTF header. - + \param uiFrame is the desired frame. \param uiFace is the desired face. \param uiSlice is the desired z slice. @@ -457,16 +473,15 @@ namespace VTFLib at index 0 for the largest image moving down in size. \see GetFormat() */ - vlVoid SetData(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel, vlByte *lpData); + vlVoid SetData( vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel, vlByte *lpData ); public: - - vlBool GetHasThumbnail() const; //!< Returns if a the current VTF image image contains a thumbnail version. + vlBool GetHasThumbnail() const; //!< Returns if a the current VTF image image contains a thumbnail version. - vlUInt GetThumbnailWidth() const; //!< Returns the width in pixels of the current images thumbnail. - vlUInt GetThumbnailHeight() const; //!< Returns the heught in pixels of the current images thumbnail. + vlUInt GetThumbnailWidth() const; //!< Returns the width in pixels of the current images thumbnail. + vlUInt GetThumbnailHeight() const; //!< Returns the heught in pixels of the current images thumbnail. - VTFImageFormat GetThumbnailFormat() const; //!< Returns the image format of the current images thumbnail. + VTFImageFormat GetThumbnailFormat() const; //!< Returns the image format of the current images thumbnail. //! Get a pointer to the thumbnail image data for the current image. /*! @@ -481,18 +496,18 @@ namespace VTFLib Sets the thumbnail image data for the current image. The source image data pointed to by lpData must be in the format specified for the thumbnail in the VTF header. - + \param lpData is a pointer to the image data. \see GetThumbnailFormat() */ - vlVoid SetThumbnailData(vlByte *lpData); + vlVoid SetThumbnailData( vlByte *lpData ); public: vlBool GetSupportsResources() const; //!< Returns true if the current VTF file version supports resources. vlUInt GetResourceCount() const; //!< Returns the number of resources contained within the VTF file. - vlUInt GetResourceType(vlUInt uiIndex) const; //!< Returns the resource type; - vlBool GetHasResource(vlUInt uiType) const; //!< Returns true if the resource exists. + vlUInt GetResourceType( vlUInt uiIndex ) const; //!< Returns the resource type; + vlBool GetHasResource( vlUInt uiType ) const; //!< Returns true if the resource exists. //! Get a VTF resource type's data. /*! @@ -502,7 +517,7 @@ namespace VTFLib \param uiSize is the size of the resource data. \return a pointer to the resource data buffer if the resource exists. */ - vlVoid *GetResourceData(vlUInt uiType, vlUInt &uiSize) const; + vlVoid *GetResourceData( vlUInt uiType, vlUInt &uiSize ) const; //! Set a VTF resource type's data. /*! @@ -514,10 +529,10 @@ namespace VTFLib \param lpData is the resource data; if null the resource data is zeroed. \return a pointer to the resource data buffer if the resource exists or was created. 
*/ - vlVoid *SetResourceData(vlUInt uiType, vlUInt uiSize, vlVoid *lpData); + vlVoid *SetResourceData( vlUInt uiType, vlUInt uiSize, vlVoid *lpData ); public: - vlInt GetAuxCompressionLevel() const; //!< Gets the auxiliary compression level of the VTF + vlInt GetAuxCompressionLevel() const; //!< Gets the auxiliary compression level of the VTF //!< Returns true if the compression level was successfully set /*! @@ -527,10 +542,9 @@ namespace VTFLib 0 compression means no compression, 1-9 are increasing levels of compression and SVTFAuxCompressionInfoHeader::DEFAULT_COMPRESSION lets the algorithm decide. */ - vlBool SetAuxCompressionLevel(vlInt iCompressionLevel); + vlBool SetAuxCompressionLevel( vlInt iCompressionLevel ); public: - //! Generate MIP maps from the main image data. /*! Generates MIP maps for the image down to 1 x 1 pixel using the data in @@ -540,7 +554,7 @@ namespace VTFLib \param bSRGB is whether we are generating mips for color data or not. \return true on sucessful creation, otherwise false. */ - vlBool GenerateMipmaps(VTFMipmapFilter MipmapFilter, vlBool bSRGB); + vlBool GenerateMipmaps( VTFMipmapFilter MipmapFilter, vlBool bSRGB ); //! Generate MIP maps from a specific face and frame. /*! @@ -555,7 +569,7 @@ namespace VTFLib for the first face. Cubemaps have 6 faces, others only 1. \return true on sucessful creation, otherwise false. */ - vlBool GenerateMipmaps(vlUInt uiFace, vlUInt uiFrame, VTFMipmapFilter MipmapFilter, vlBool bSRGB); + vlBool GenerateMipmaps( vlUInt uiFace, vlUInt uiFrame, VTFMipmapFilter MipmapFilter, vlBool bSRGB ); //! Generate a thumbnail image. /*! @@ -565,7 +579,7 @@ namespace VTFLib \return true on sucessful creation, otherwise false. \see SetThumbnailData() */ - vlBool GenerateThumbnail(vlBool bSRGB); + vlBool GenerateThumbnail( vlBool bSRGB ); //! Convert image to a normal map. /*! @@ -578,13 +592,13 @@ namespace VTFLib \return true on sucessful creation, otherwise false. \note The options for conversion are the same used in the nVidea NormalMap Photoshop plug-in. */ - vlBool GenerateNormalMap(VTFKernelFilter KernelFilter = KERNEL_FILTER_3X3, VTFHeightConversionMethod HeightConversionMethod = HEIGHT_CONVERSION_METHOD_AVERAGE_RGB, VTFNormalAlphaResult NormalAlphaResult = NORMAL_ALPHA_RESULT_WHITE); - + vlBool GenerateNormalMap( VTFKernelFilter KernelFilter = KERNEL_FILTER_3X3, VTFHeightConversionMethod HeightConversionMethod = HEIGHT_CONVERSION_METHOD_AVERAGE_RGB, VTFNormalAlphaResult NormalAlphaResult = NORMAL_ALPHA_RESULT_WHITE ); + //! Convert image to a normal map from a specific frame. /*! Converts the image to a normal map using the image data in the given frame as the source. - + \param uiFrame is the frame index to use. \param KernelFilter is the kernel filter to use (default 3x3). \param HeightConversionMethod is the method of determining the height data from the source (default average RGB). @@ -592,16 +606,14 @@ namespace VTFLib \return true on sucessful creation, otherwise false. \note The options for conversion are the same used in the nVidea NormalMap Photoshop plug-in. 
*/ - vlBool GenerateNormalMap(vlUInt uiFrame, VTFKernelFilter KernelFilter = KERNEL_FILTER_3X3, VTFHeightConversionMethod HeightConversionMethod = HEIGHT_CONVERSION_METHOD_AVERAGE_RGB, VTFNormalAlphaResult NormalAlphaResult = NORMAL_ALPHA_RESULT_WHITE); + vlBool GenerateNormalMap( vlUInt uiFrame, VTFKernelFilter KernelFilter = KERNEL_FILTER_3X3, VTFHeightConversionMethod HeightConversionMethod = HEIGHT_CONVERSION_METHOD_AVERAGE_RGB, VTFNormalAlphaResult NormalAlphaResult = NORMAL_ALPHA_RESULT_WHITE ); - vlBool GenerateSphereMap(); //!< Creates a spheremap from using the 6 faces of the image making up its cubemap. + vlBool GenerateSphereMap(); //!< Creates a spheremap from using the 6 faces of the image making up its cubemap. public: + vlBool ComputeReflectivity(); //!< Calculates and sets the reflectivity vector values for the VTF image based on the colour averages of each pixel. - vlBool ComputeReflectivity(); //!< Calculates and sets the reflectivity vector values for the VTF image based on the colour averages of each pixel. - public: - //! Get VTFImageFormat info. /*! Returns a SImageFormatInfo info struct for the specified VTFImageFormat. @@ -609,7 +621,7 @@ namespace VTFLib \param ImageFormat is the format to get info on. \return SImageFormatInfo info struct. */ - static SVTFImageFormatInfo const &GetImageFormatInfo(VTFImageFormat ImageFormat); + static SVTFImageFormatInfo const &GetImageFormatInfo( VTFImageFormat ImageFormat ); //! Calculate data buffer size for an image /*! @@ -623,7 +635,7 @@ namespace VTFLib \param ImageFormat is the storage format of the image data. \return size of the image data in bytes. */ - static vlUInt ComputeImageSize(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, VTFImageFormat ImageFormat); + static vlUInt ComputeImageSize( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, VTFImageFormat ImageFormat ); //! Calculate data buffer size for an image with MIP maps /*! @@ -638,7 +650,7 @@ namespace VTFLib \param ImageFormat is the storage format of the image data. \return size of the image data in bytes. */ - static vlUInt ComputeImageSize(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmaps, VTFImageFormat ImageFormat); + static vlUInt ComputeImageSize( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmaps, VTFImageFormat ImageFormat ); //! Compute the number of MIP maps needed by an image /*! @@ -650,7 +662,7 @@ namespace VTFLib \param uiDepth is the depth in pixels of the original image. \return number of MIP maps needed. */ - static vlUInt ComputeMipmapCount(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth); //!< Returns how many MIP map levels are required for an image uiWidth and uiHeight in size, down to 1x1 pixel. + static vlUInt ComputeMipmapCount( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth ); //!< Returns how many MIP map levels are required for an image uiWidth and uiHeight in size, down to 1x1 pixel. //! Compute the dimensions of a specific MIP level. /*! @@ -665,11 +677,11 @@ namespace VTFLib \param uiMipmapHeight is the variable to hold the calculated height. \param uiMipmapDepth is the variable to hold the calculated depth. */ - static vlVoid ComputeMipmapDimensions(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmapLevel, vlUInt &uiMipmapWidth, vlUInt &uiMipmapHeight, vlUInt &uiMipmapDepth); - + static vlVoid ComputeMipmapDimensions( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmapLevel, vlUInt &uiMipmapWidth, vlUInt &uiMipmapHeight, vlUInt &uiMipmapDepth ); + //! 
Compute how much memory a specific MIP map level needs. /*! - Computers the total memory needed in bytes for the a specific MIP map level + Computers the total memory needed in bytes for the a specific MIP map level of an image of a given width and height stored in the specified image format. \param uiWidth is the width in pixels of the source image. @@ -679,17 +691,15 @@ namespace VTFLib \param ImageFormat is the image format the MIP map image data is stored in. \return size of the MIP map image data in bytes. */ - static vlUInt ComputeMipmapSize(vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmapLevel, VTFImageFormat ImageFormat); + static vlUInt ComputeMipmapSize( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiDepth, vlUInt uiMipmapLevel, VTFImageFormat ImageFormat ); private: - // Calculates where in the VTF image the data begins - vlUInt ComputeDataOffset(vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel, VTFImageFormat ImageFormat) const; + vlUInt ComputeDataOffset( vlUInt uiFrame, vlUInt uiFace, vlUInt uiSlice, vlUInt uiMipmapLevel, VTFImageFormat ImageFormat ) const; - vlUInt GetAuxInfoOffset(vlUInt iFrame, vlUInt iFace, vlUInt iMipLevel) const; + vlUInt GetAuxInfoOffset( vlUInt iFrame, vlUInt iFace, vlUInt iMipLevel ) const; public: - //! Convert an image to RGBA8888 format. /*! Converts image data stored in the given format to RGBA8888 format. @@ -701,7 +711,7 @@ namespace VTFLib \param SourceFormat is the image format of the source data. \return true on sucessful conversion, otherwise false. */ - static vlBool ConvertToRGBA8888(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat); + static vlBool ConvertToRGBA8888( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat ); //! Convert an image from RGBA8888 format. /*! @@ -714,7 +724,7 @@ namespace VTFLib \param DestFormat is the image format you wish to convert to. \return true on sucessful conversion, otherwise false. */ - static vlBool ConvertFromRGBA8888(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat DestFormat); + static vlBool ConvertFromRGBA8888( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat DestFormat ); //! Convert an image from any format to any format. /*! @@ -728,7 +738,7 @@ namespace VTFLib \param DestFormat is the image format you wish to convert to. \return true on sucessful conversion, otherwise false. */ - static vlBool Convert(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat, VTFImageFormat DestFormat); + static vlBool Convert( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat, VTFImageFormat DestFormat ); //! Re-sizes an image. /*! @@ -744,18 +754,16 @@ namespace VTFLib \param bRGB is whether we are generating mips for color data or not. \return true on sucessful re-size, otherwise false. 
*/ - static vlBool Resize(vlByte *lpSourceRGBA8888, vlByte *lpDestRGBA8888, vlUInt uiSourceWidth, vlUInt uiSourceHeight, vlUInt uiDestWidth, vlUInt uiDestHeight, VTFMipmapFilter ResizeFilter, vlBool bSRGB); + static vlBool Resize( vlByte *lpSourceRGBA8888, vlByte *lpDestRGBA8888, vlUInt uiSourceWidth, vlUInt uiSourceHeight, vlUInt uiDestWidth, vlUInt uiDestHeight, VTFMipmapFilter ResizeFilter, vlBool bSRGB ); private: - // BCn format decompression function - static vlBool DecompressBCn(vlByte *src, vlByte *dst, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat); + static vlBool DecompressBCn( vlByte *src, vlByte *dst, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat SourceFormat ); // BCn format compression function - static vlBool CompressBCn(vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat DestFormat); + static vlBool CompressBCn( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, VTFImageFormat DestFormat ); public: - //! Correct and images gamma. /*! Applies gamma correction to an image. @@ -765,7 +773,7 @@ namespace VTFLib \param uiHeight is the height of the source image in pixels. \param sGammaCorrection is the amount of gamma correction to apply. */ - static vlVoid CorrectImageGamma(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight, vlSingle sGammaCorrection); + static vlVoid CorrectImageGamma( vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight, vlSingle sGammaCorrection ); //! Computes the reflectivity for an image. /*! @@ -780,11 +788,19 @@ namespace VTFLib \see GetReflectivity() \see SetReflectivity() */ - static vlVoid ComputeImageReflectivity(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight, vlSingle &sX, vlSingle &sY, vlSingle &sZ); - - static vlVoid FlipImage(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight); //!< Flips an image vertically along its X-axis. - static vlVoid MirrorImage(vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight); //!< Flips an image horizontally along its Y-axis. + static vlVoid ComputeImageReflectivity( vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight, vlSingle &sX, vlSingle &sY, vlSingle &sZ ); + + static vlVoid FlipImage( vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight ); //!< Flips an image vertically along its X-axis. + static vlVoid MirrorImage( vlByte *lpImageDataRGBA8888, vlUInt uiWidth, vlUInt uiHeight ); //!< Flips an image horizontally along its Y-axis. 
+ vlBool CreateFloat( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, vlByte **lpImageDataFP, const SVTFCreateOptions &VTFCreateOptions, const VTFImageFormat &SourceFormat ); + static vlBool ResizeFloat( vlByte *lpSourceRGBAFP32, vlByte *lpDestRGBFP32, vlUInt uiSourceWidth, vlUInt uiSourceHeight, vlUInt uiDestWidth, vlUInt uiDestHeight, VTFMipmapFilter ResizeFilter, vlBool bSRGB ); + static vlSingle FP16ToFP32( vlUInt16 input ); + static vlBool HALF_HDR_TO_LDR( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, const tagSVTFImageConvertInfo &SourceInfo, const tagSVTFImageConvertInfo &DestInfo ); + static vlBool HDR_TO_LDR( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, const tagSVTFImageConvertInfo &SourceInfo, const tagSVTFImageConvertInfo &DestInfo ); + static vlBool LDR_TO_HDR( vlByte *lpSource, vlByte *lpDest, vlUInt uiWidth, vlUInt uiHeight, const SVTFImageConvertInfo &SourceInfo, const SVTFImageConvertInfo &DestInfo ); + static unsigned short FP32ToFP16( float input ); + vlBool Create( vlUInt uiWidth, vlUInt uiHeight, vlUInt uiFrames, vlUInt uiFaces, vlUInt uiSlices, vlByte **lpImageDataRGBA8888, const SVTFCreateOptions &VTFCreateOptions, const VTFImageFormat &SourceFormat ); }; -} +} // namespace VTFLib #endif diff --git a/thirdparty/half/ChangeLog.txt b/thirdparty/half/ChangeLog.txt new file mode 100644 index 0000000..37f3dbf --- /dev/null +++ b/thirdparty/half/ChangeLog.txt @@ -0,0 +1,213 @@ +Release Notes {#changelog} +============= + +2.2.0 release (2021-06-12): +--------------------------- + +- Added `rsqrt` function for inverse square root. +- Improved performance of `pow` function. +- Fixed bug that forgot to include `` for F16C intrinsics. + + +2.1.0 release (2019-08-05): +--------------------------- + +- Added detection of IEEE floating-point exceptions to operators and functions. +- Added configuration options for automatic exception handling. +- Added functions for explicitly managing floating-point exception flags. +- Improved accuracy of `pow` and `atan2` functions. + + +2.0.0 release (2019-07-23): +--------------------------- + +- Made internal implementation independent from built-in floating point + facilities for increased reliability and IEEE-conformance. +- Changed default rounding mode to rounding to nearest. +- Always round ties to even when rounding to nearest. +- Extended `constexpr` support to comparison and classification functions. +- Added support for F16C compiler intrinsics for conversions. +- Enabled C++11 feature detection for Intel compilers. + + +1.12.0 release (2017-03-06): +---------------------------- + +- Changed behaviour of `half_cast` to perform conversions to/from `double` + and `long double` directly according to specified rounding mode, without an + intermediate `float` conversion. +- Added `noexcept` specifiers to constructors. +- Fixed minor portability problem with `logb` and `ilogb`. +- Tested for *VC++ 2015*. + + +1.11.0 release (2013-11-16): +---------------------------- + +- Made tie-breaking behaviour in round to nearest configurable by + `HALF_ROUND_TIES_TO_EVEN` macro. +- Completed support for all C++11 mathematical functions even if single- + precision versions from `` are unsupported. +- Fixed inability to disable support for C++11 mathematical functions on + *VC++ 2013*. + + +1.10.0 release (2013-11-09): +---------------------------- + +- Made default rounding mode configurable by `HALF_ROUND_STYLE` macro. 
+- Added support for non-IEEE single-precision implementations. +- Added `HALF_ENABLE_CPP11_TYPE_TRAITS` preprocessor flag for checking + support for C++11 type traits and TMP features. +- Restricted `half_cast` to support built-in arithmetic types only. +- Changed behaviour of `half_cast` to respect rounding mode when casting + to/from integer types. + + +1.9.2 release (2013-11-01): +--------------------------- + +- Tested for *gcc 4.8*. +- Tested and fixed for *VC++ 2013*. +- Removed unnecessary warnings in *MSVC*. + + +1.9.1 release (2013-08-08): +--------------------------- + +- Fixed problems with older gcc and MSVC versions. +- Small fix to non-C++11 implementations of `remainder` and `remquo`. + + +1.9.0 release (2013-08-07): +--------------------------- + +- Changed behaviour of `nearbyint`, `rint`, `lrint` and `llrint` to use + rounding mode of half-precision implementation (which is + truncating/indeterminate) instead of single-precision rounding mode. +- Added support for more C++11 mathematical functions even if single- + precision versions from `` are unsupported, in particular + `remainder`, `remquo` and `cbrt`. +- Minor implementation changes. + + +1.8.1 release (2013-01-22): +--------------------------- + +- Fixed bug resulting in multiple definitions of the `nanh` function due to + a missing `inline` specification. + + +1.8.0 release (2013-01-19): +--------------------------- + +- Added support for more C++11 mathematical functions even if single- + precision versions from `` are unsupported, in particular + exponential and logarithm functions, hyperbolic area functions and the + hypotenuse function. +- Made `fma` function use default implementation if single-precision version + from `` is not faster and thus `FP_FAST_FMAH` to be defined always. +- Fixed overload resolution issues when invoking certain mathematical + functions by unqualified calls. + + +1.7.0 release (2012-10-26): +--------------------------- + +- Added support for C++11 `noexcept` specifiers. +- Changed C++11 `long long` to be supported on *VC++ 2003* and up. + + +1.6.1 release (2012-09-13): +--------------------------- + +- Made `fma` and `fdim` functions available even if corresponding + single-precision functions are not. + + +1.6.0 release (2012-09-12): +--------------------------- + +- Added `HALF_ENABLE_CPP11_LONG_LONG` to control support for `long long` + integers and corresponding mathematical functions. +- Fixed C++98 compatibility on non-VC compilers. + + +1.5.1 release (2012-08-17): +--------------------------- + +- Recorrected `std::numeric_limits::round_style` to always return + `std::round_indeterminate`, due to overflow-handling deviating from + correct round-toward-zero behaviour. + + +1.5.0 release (2012-08-16): +--------------------------- + +- Added `half_cast` for explicitly casting between half and any type + convertible to/from `float` and allowing the explicit specification of + the rounding mode to use. + + +1.4.0 release (2012-08-12): +--------------------------- + +- Added support for C++11 generalized constant expressions (`constexpr`). + + +1.3.1 release (2012-08-11): +--------------------------- + +- Fixed requirement for `std::signbit` and `std::isnan` (even if C++11 + `` functions disabled) on non-VC compilers. + + +1.3.0 release (2012-08-10): +--------------------------- + +- Made requirement for `` and `static_assert` optional and thus + made the library C++98-compatible. 
+- Made support for C++11 features user-overridable through explicit + definition of corresponding preprocessor symbols to either 0 or 1. +- Renamed `HALF_ENABLE_HASH` to `HALF_ENABLE_CPP11_HASH` in correspondence + with other C++11 preprocessor symbols. + + +1.2.0 release (2012-08-07): +--------------------------- + +- Added proper preprocessor definitions for `HUGE_VALH` and `FP_FAST_FMAH` + in correspondence with their single-precision counterparts from ``. +- Fixed internal preprocessor macros to be properly undefined after use. + + +1.1.2 release (2012-08-07): +--------------------------- + +- Revised `std::numeric_limits::round_style` to return + `std::round_toward_zero` if the `float` version also does and + `std::round_indeterminate` otherwise. +- Fixed `std::numeric_limits::round_error` to reflect worst-case round + toward zero behaviour. + + +1.1.1 release (2012-08-06): +--------------------------- + +- Fixed `std::numeric_limits::min` to return smallest positive normal + number, instead of subnormal number. +- Fixed `std::numeric_limits::round_style` to return + `std::round_indeterminate` due to mixture of separately rounded + single-precision arithmetics with truncating single-to-half conversions. + + +1.1.0 release (2012-08-06): +--------------------------- + +- Added half-precision literals. + + +1.0.0 release (2012-08-05): +--------------------------- + +- First release. diff --git a/thirdparty/half/LICENSE.txt b/thirdparty/half/LICENSE.txt new file mode 100644 index 0000000..45f55db --- /dev/null +++ b/thirdparty/half/LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License + +Copyright (c) 2012-2021 Christian Rau + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/thirdparty/half/README.txt b/thirdparty/half/README.txt new file mode 100644 index 0000000..3dd0d1c --- /dev/null +++ b/thirdparty/half/README.txt @@ -0,0 +1,317 @@ +HALF-PRECISION FLOATING-POINT LIBRARY (Version 2.2.0) +----------------------------------------------------- + +This is a C++ header-only library to provide an IEEE 754 conformant 16-bit +half-precision floating-point type along with corresponding arithmetic +operators, type conversions and common mathematical functions. It aims for both +efficiency and ease of use, trying to accurately mimic the behaviour of the +built-in floating-point types at the best performance possible. 
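The FP16ToFP32 / FP32ToFP16 helpers declared on CVTFFile earlier in this patch are exactly the kind of raw-bit half conversions this vendored header covers. What follows is only a hypothetical sketch of such helpers written on top of half.hpp (plain float / unsigned short stand in for the vl* typedefs, a 16-bit unsigned short is assumed, and this is not necessarily how VTFFile.cpp actually implements them):

    #include <cstring>   // std::memcpy
    #include "half.hpp"

    // Hypothetical helpers mirroring the FP16ToFP32 / FP32ToFP16 declarations.
    // half_float::half stores a single 16-bit value, so its bit pattern can be
    // moved in and out with memcpy.
    static float FP16ToFP32( unsigned short uiInput )
    {
        half_float::half h;
        std::memcpy( &h, &uiInput, sizeof( h ) ); // adopt the raw half bits
        return static_cast<float>( h );           // widen to single precision
    }

    static unsigned short FP32ToFP16( float sInput )
    {
        const half_float::half h( sInput );       // explicit narrowing construction
        unsigned short uiBits = 0;
        std::memcpy( &uiBits, &h, sizeof( uiBits ) );
        return uiBits;
    }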
+
+
+INSTALLATION AND REQUIREMENTS
+-----------------------------
+
+Conveniently, the library consists of just a single header file containing all
+the functionality, which can be directly included by your projects, without the
+necessity to build anything or link to anything.
+
+Whereas this library is fully C++98-compatible, it can profit from certain
+C++11 features. Support for those features is checked automatically at compile
+(or rather preprocessing) time, but can be explicitly enabled or disabled by
+predefining the corresponding preprocessor symbols to either 1 or 0 yourself
+before including half.hpp. This is useful when the automatic detection fails
+(for more exotic implementations) or when a feature should be explicitly
+disabled:
+
+ - 'long long' integer type for mathematical functions returning 'long long'
+   results (enabled for VC++ 2003 and icc 11.1 and newer, gcc and clang,
+   overridable with 'HALF_ENABLE_CPP11_LONG_LONG').
+
+ - Static assertions for extended compile-time checks (enabled for VC++ 2010,
+   gcc 4.3, clang 2.9, icc 11.1 and newer, overridable with
+   'HALF_ENABLE_CPP11_STATIC_ASSERT').
+
+ - Generalized constant expressions (enabled for VC++ 2015, gcc 4.6, clang 3.1,
+   icc 14.0 and newer, overridable with 'HALF_ENABLE_CPP11_CONSTEXPR').
+
+ - noexcept exception specifications (enabled for VC++ 2015, gcc 4.6,
+   clang 3.0, icc 14.0 and newer, overridable with 'HALF_ENABLE_CPP11_NOEXCEPT').
+
+ - User-defined literals for half-precision literals to work (enabled for
+   VC++ 2015, gcc 4.7, clang 3.1, icc 15.0 and newer, overridable with
+   'HALF_ENABLE_CPP11_USER_LITERALS').
+
+ - Thread-local storage for per-thread floating-point exception flags (enabled
+   for VC++ 2015, gcc 4.8, clang 3.3, icc 15.0 and newer, overridable with
+   'HALF_ENABLE_CPP11_THREAD_LOCAL').
+
+ - Type traits and template meta-programming features from <type_traits>
+   (enabled for VC++ 2010, libstdc++ 4.3, libc++ and newer, overridable with
+   'HALF_ENABLE_CPP11_TYPE_TRAITS').
+
+ - Special integer types from <cstdint> (enabled for VC++ 2010, libstdc++ 4.3,
+   libc++ and newer, overridable with 'HALF_ENABLE_CPP11_CSTDINT').
+
+ - Certain C++11 single-precision mathematical functions from <cmath> for
+   floating-point classification during conversions from higher precision types
+   (enabled for VC++ 2013, libstdc++ 4.3, libc++ and newer, overridable with
+   'HALF_ENABLE_CPP11_CMATH').
+
+ - Floating-point environment control from <cfenv> for possible exception
+   propagation to the built-in floating-point platform (enabled for VC++ 2013,
+   libstdc++ 4.3, libc++ and newer, overridable with 'HALF_ENABLE_CPP11_CFENV').
+
+ - Hash functor 'std::hash' from <functional> (enabled for VC++ 2010,
+   libstdc++ 4.3, libc++ and newer, overridable with 'HALF_ENABLE_CPP11_HASH').
+
+The library has been tested successfully with Visual C++ 2005-2015, gcc 4-8
+and clang 3-8 on 32- and 64-bit x86 systems. Please contact me if you have any
+problems, suggestions or even just success testing it on other platforms.
+
+
+DOCUMENTATION
+-------------
+
+What follows are some general words about the usage of the library and its
+implementation. For a complete documentation of its interface consult the
+corresponding website http://half.sourceforge.net. You may also generate the
+complete developer documentation from the library's only include file's doxygen
+comments, but this is more relevant to developers rather than mere users.
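Tying the feature list under INSTALLATION AND REQUIREMENTS back to code: overriding detection simply means defining the listed symbols before the header is seen. A hypothetical translation unit pinning two of them (the particular symbols and values chosen here are only an example, not a recommendation) could read:

    #define HALF_ENABLE_CPP11_USER_LITERALS 1 // force-enable 1.0_h style literals
    #define HALF_ENABLE_CPP11_CFENV 0         // force-disable <cfenv> based support

    #include "half.hpp"                       // must come after the overrides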
+ +BASIC USAGE + +To make use of the library just include its only header file half.hpp, which +defines all half-precision functionality inside the 'half_float' namespace. The +actual 16-bit half-precision data type is represented by the 'half' type, which +uses the standard IEEE representation with 1 sign bit, 5 exponent bits and 11 +mantissa bits (including the hidden bit) and supports all types of special +values, like subnormal values, infinity and NaNs. This type behaves like the +built-in floating-point types as much as possible, supporting the usual +arithmetic, comparison and streaming operators, which makes its use pretty +straight-forward: + + using half_float::half; + half a(3.4), b(5); + half c = a * b; + c += 3; + if(c > a) + std::cout << c << std::endl; + +Additionally the 'half_float' namespace also defines half-precision versions +for all mathematical functions of the C++ standard library, which can be used +directly through ADL: + + half a(-3.14159); + half s = sin(abs(a)); + long l = lround(s); + +You may also specify explicit half-precision literals, since the library +provides a user-defined literal inside the 'half_float::literal' namespace, +which you just need to import (assuming support for C++11 user-defined literals): + + using namespace half_float::literal; + half x = 1.0_h; + +Furthermore the library provides proper specializations for +'std::numeric_limits', defining various implementation properties, and +'std::hash' for hashing half-precision numbers (assuming support for C++11 +'std::hash'). Similar to the corresponding preprocessor symbols from +the library also defines the 'HUGE_VALH' constant and maybe the 'FP_FAST_FMAH' +symbol. + +CONVERSIONS AND ROUNDING + +The half is explicitly constructible/convertible from a single-precision float +argument. Thus it is also explicitly constructible/convertible from any type +implicitly convertible to float, but constructing it from types like double or +int will involve the usual warnings arising when implicitly converting those to +float because of the lost precision. On the one hand those warnings are +intentional, because converting those types to half neccessarily also reduces +precision. But on the other hand they are raised for explicit conversions from +those types, when the user knows what he is doing. So if those warnings keep +bugging you, then you won't get around first explicitly converting to float +before converting to half, or use the 'half_cast' described below. In addition +you can also directly assign float values to halfs. + +In contrast to the float-to-half conversion, which reduces precision, the +conversion from half to float (and thus to any other type implicitly +convertible from float) is implicit, because all values represetable with +half-precision are also representable with single-precision. This way the +half-to-float conversion behaves similar to the builtin float-to-double +conversion and all arithmetic expressions involving both half-precision and +single-precision arguments will be of single-precision type. This way you can +also directly use the mathematical functions of the C++ standard library, +though in this case you will invoke the single-precision versions which will +also return single-precision values, which is (even if maybe performing the +exact same computation, see below) not as conceptually clean when working in a +half-precision environment. 
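A minimal sketch of the conversion rules just described, using only what the text above states (explicit construction from float, implicit widening back to float, mixed expressions evaluated in single precision):

    using half_float::half;

    half h( 0.25f );     // explicit construction from a float value
    float f = h;         // implicit widening conversion back to float
    float s = h + 1.0f;  // mixed half/float expression, evaluated in single precision
    half g( 0.25 );      // compiles, but the double literal is narrowed to float first (may warn)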
+
+The default rounding mode for conversions between half and more precise types
+as well as for rounding results of arithmetic operations and mathematical
+functions rounds to the nearest representable value. But by predefining the
+'HALF_ROUND_STYLE' preprocessor symbol this default can be overridden with one
+of the other standard rounding modes using their respective constants or the
+equivalent values of 'std::float_round_style' (it can even be synchronized with
+the built-in single-precision implementation by defining it to
+'std::numeric_limits<float>::round_style'):
+
+ - 'std::round_indeterminate' (-1) for the fastest rounding.
+
+ - 'std::round_toward_zero' (0) for rounding toward zero.
+
+ - 'std::round_to_nearest' (1) for rounding to the nearest value (default).
+
+ - 'std::round_toward_infinity' (2) for rounding toward positive infinity.
+
+ - 'std::round_toward_neg_infinity' (3) for rounding toward negative infinity.
+
+In addition to changing the overall default rounding mode one can also use the
+'half_cast'. This converts between half and any built-in arithmetic type using
+a configurable rounding mode (or the default rounding mode if none is
+specified). In addition to a configurable rounding mode, 'half_cast' has
+another big difference to a mere 'static_cast': Any conversions are performed
+directly using the given rounding mode, without any intermediate conversion
+to/from 'float'. This is especially relevant for conversions to integer types,
+which don't necessarily truncate anymore. But also for conversions from
+'double' or 'long double' this may produce more precise results than a
+pre-conversion to 'float' using the single-precision implementation's current
+rounding mode would.
+
+    half a = half_cast<half>(4.2);
+    half b = half_cast<half,std::numeric_limits<float>::round_style>(4.2f);
+    assert( half_cast<int>( 0.7_h ) == 1 );
+    assert( half_cast<half,std::round_toward_zero>( 4097 ) == 4096.0_h );
+    assert( half_cast<half,std::round_toward_infinity>( 4097 ) == 4100.0_h );
+    assert( half_cast<half,std::round_toward_infinity>( std::numeric_limits<double>::min() ) > 0.0_h );
+
+ACCURACY AND PERFORMANCE
+
+From version 2.0 onward the library is implemented without employing the
+underlying floating-point implementation of the system (except for conversions,
+of course), providing an entirely self-contained half-precision implementation
+with results independent from the system's existing single- or double-precision
+implementation and its rounding behaviour.
+
+As to accuracy, many of the operators and functions provided by this library
+are exact to rounding for all rounding modes, i.e. the error to the exact
+result is at most 0.5 ULP (unit in the last place) for rounding to nearest and
+less than 1 ULP for all other rounding modes. This holds for all the operations
+required by the IEEE 754 standard and many more. Specifically the following
+functions might exhibit a deviation from the correctly rounded exact result by
+1 ULP for a select few input values: 'expm1', 'log1p', 'pow', 'atan2', 'erf',
+'erfc', 'lgamma', 'tgamma' (for more details see the documentation of the
+individual functions). All other functions and operators are always exact to
+rounding or independent of the rounding mode altogether.
+
+The increased IEEE-conformance and cleanliness of this implementation comes
+with a certain performance cost compared to doing computations and mathematical
+functions in hardware-accelerated single-precision.
On average and depending on +the platform, the arithemtic operators are about 75% as fast and the +mathematical functions about 33-50% as fast as performing the corresponding +operations in single-precision and converting between the inputs and outputs. +However, directly computing with half-precision values is a rather rare +use-case and usually using actual 'float' values for all computations and +temproraries and using 'half's only for storage is the recommended way. But +nevertheless the goal of this library was to provide a complete and +conceptually clean IEEE-confromant half-precision implementation and in the few +cases when you do need to compute directly in half-precision you do so for a +reason and want accurate results. + +If necessary, this internal implementation can be overridden by predefining the +'HALF_ARITHMETIC_TYPE' preprocessor symbol to one of the built-in +floating-point types ('float', 'double' or 'long double'), which will cause the +library to use this type for computing arithmetic operations and mathematical +functions (if available). However, due to using the platform's floating-point +implementation (and its rounding behaviour) internally, this might cause +results to deviate from the specified half-precision rounding mode. It will of +course also inhibit the automatic exception detection described below. + +The conversion operations between half-precision and single-precision types can +also make use of the F16C extension for x86 processors by using the +corresponding compiler intrinsics from . Support for this is +checked at compile-time by looking for the '__F16C__' macro which at least gcc +and clang define based on the target platform. It can also be enabled manually +by predefining the 'HALF_ENABLE_F16C_INTRINSICS' preprocessor symbol to 1, or 0 +for explicitly disabling it. However, this will directly use the corresponding +intrinsics for conversion without checking if they are available at runtime +(possibly crashing if they are not), so make sure they are supported on the +target platform before enabling this. + +EXCEPTION HANDLING + +The half-precision implementation supports all 5 required floating-point +exceptions from the IEEE standard to indicate erroneous inputs or inexact +results during operations. These are represented by exception flags which +actually use the same values as the corresponding 'FE_...' flags defined in +C++11's header if supported, specifically: + + - 'FE_INVALID' for invalid inputs to an operation. + - 'FE_DIVBYZERO' for finite inputs producing infinite results. + - 'FE_OVERFLOW' if a result is too large to represent finitely. + - 'FE_UNDERFLOW' for a subnormal or zero result after rounding. + - 'FE_INEXACT' if a result needed rounding to be representable. + - 'FE_ALL_EXCEPT' as a convenient OR of all possible exception flags. + +The internal exception flag state will start with all flags cleared and is +maintained per thread if C++11 thread-local storage is supported, otherwise it +will be maintained globally and will theoretically NOT be thread-safe (while +practically being as thread-safe as a simple integer variable can be). These +flags can be managed explicitly using the library's error handling functions, +which again try to mimic the built-in functions for handling floating-point +exceptions from . 
You can clear them with 'feclearexcept' (which is the +only way a flag can be cleared), test them with 'fetestexcept', explicitly +raise errors with 'feraiseexcept' and save and restore their state using +'fegetexceptflag' and 'fesetexceptflag'. You can also throw corresponding C++ +exceptions based on the current flag state using 'fethrowexcept'. + +However, any automatic exception detection and handling during half-precision +operations and functions is DISABLED by default, since it comes with a minor +performance overhead due to runtime checks, and reacting to IEEE floating-point +exceptions is rarely ever needed in application code. But the library fully +supports IEEE-conformant detection of floating-point exceptions and various +ways for handling them, which can be enabled by pre-defining the corresponding +preprocessor symbols to 1. They can be enabled individually or all at once and +they will be processed in the order they are listed here: + + - 'HALF_ERRHANDLING_FLAGS' sets the internal exception flags described above + whenever the corresponding exception occurs. + - 'HALF_ERRHANDLING_ERRNO' sets the value of 'errno' from similar to + the behaviour of the built-in floating-point types when 'MATH_ERRNO' is used. + - 'HALF_ERRHANDLING_FENV' will propagate exceptions to the built-in + floating-point implementation using 'std::feraiseexcept' if support for + C++11 floating-point control is enabled. However, this does not synchronize + exceptions: neither will clearing propagate nor will it work in reverse. + - 'HALF_ERRHANDLING_THROW_...' can be defined to a string literal which will + be used as description message for a C++ exception that is thrown whenever + a 'FE_...' exception occurs, similar to the behaviour of 'fethrowexcept'. + +If any of the above error handling is activated, non-quiet operations on +half-precision values will also raise a 'FE_INVALID' exception whenever +they encounter a signaling NaN value, in addition to transforming the value +into a quiet NaN. If error handling is disabled, signaling NaNs will be +treated like quiet NaNs (while still getting explicitly quieted if propagated +to the result). There can also be additional treatment of overflow and +underflow errors after they have been processed as above, which is ENABLED by +default (but of course only takes effect if any other exception handling is +activated) unless overridden by pre-defining the corresponding preprocessor +symbol to 0: + + - 'HALF_ERRHANDLING_OVERFLOW_TO_INEXACT' will cause overflow errors to also + raise a 'FE_INEXACT' exception. + - 'HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT' will cause underflow errors to also + raise a 'FE_INEXACT' exception. This will also slightly change the + behaviour of the underflow exception, which will ONLY be raised if the + result is actually inexact due to underflow. If this is disabled, underflow + exceptions will be raised for ANY (possibly exact) subnormal result. + + +CREDITS AND CONTACT +------------------- + +This library is developed by CHRISTIAN RAU and released under the MIT License +(see LICENSE.txt). If you have any questions or problems with it, feel free to +contact me at rauy@users.sourceforge.net. + +Additional credit goes to JEROEN VAN DER ZIJP for his paper on "Fast Half Float +Conversions", whose algorithms have been used in the library for converting +between half-precision and single-precision values. 
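To make the error handling described above concrete, here is a small hypothetical sketch. It assumes 'HALF_ERRHANDLING_FLAGS' is predefined to 1 (the README stresses that exception detection is disabled by default) and that the 'FE_...' constants are available:

    #define HALF_ERRHANDLING_FLAGS 1 // opt in to the internal exception flags
    #include <limits>
    #include "half.hpp"

    using half_float::half;

    void overflow_demo()
    {
        half_float::feclearexcept( FE_ALL_EXCEPT );    // start from a clean flag state
        half h = std::numeric_limits<half>::max();
        h = h * half( 2.0f );                          // overflows to infinity
        if ( half_float::fetestexcept( FE_OVERFLOW ) ) // the operation raised the flag
            half_float::feclearexcept( FE_OVERFLOW );  // flags stay set until cleared
    }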
diff --git a/thirdparty/half/include/half.hpp b/thirdparty/half/include/half.hpp new file mode 100644 index 0000000..78e52cc --- /dev/null +++ b/thirdparty/half/include/half.hpp @@ -0,0 +1,4956 @@ +// half - IEEE 754-based half-precision floating-point library. +// +// Copyright (c) 2012-2021 Christian Rau +// +// Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, +// modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +// WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +// Version 2.2.0 + +/// \file +/// Main header file for half-precision functionality. + +#ifndef HALF_HALF_HPP +#define HALF_HALF_HPP + +#define HALF_GCC_VERSION ( __GNUC__ * 100 + __GNUC_MINOR__ ) + +#if defined( __INTEL_COMPILER ) +#define HALF_ICC_VERSION __INTEL_COMPILER +#elif defined( __ICC ) +#define HALF_ICC_VERSION __ICC +#elif defined( __ICL ) +#define HALF_ICC_VERSION __ICL +#else +#define HALF_ICC_VERSION 0 +#endif + +// check C++11 language features +#if defined( __clang__ ) // clang +#if __has_feature( cxx_static_assert ) && !defined( HALF_ENABLE_CPP11_STATIC_ASSERT ) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if __has_feature( cxx_constexpr ) && !defined( HALF_ENABLE_CPP11_CONSTEXPR ) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if __has_feature( cxx_noexcept ) && !defined( HALF_ENABLE_CPP11_NOEXCEPT ) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if __has_feature( cxx_user_literals ) && !defined( HALF_ENABLE_CPP11_USER_LITERALS ) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if __has_feature( cxx_thread_local ) && !defined( HALF_ENABLE_CPP11_THREAD_LOCAL ) +#define HALF_ENABLE_CPP11_THREAD_LOCAL 1 +#endif +#if ( defined( __GXX_EXPERIMENTAL_CXX0X__ ) || __cplusplus >= 201103L ) && !defined( HALF_ENABLE_CPP11_LONG_LONG ) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#elif HALF_ICC_VERSION && defined( __INTEL_CXX11_MODE__ ) // Intel C++ +#if HALF_ICC_VERSION >= 1500 && !defined( HALF_ENABLE_CPP11_THREAD_LOCAL ) +#define HALF_ENABLE_CPP11_THREAD_LOCAL 1 +#endif +#if HALF_ICC_VERSION >= 1500 && !defined( HALF_ENABLE_CPP11_USER_LITERALS ) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if HALF_ICC_VERSION >= 1400 && !defined( HALF_ENABLE_CPP11_CONSTEXPR ) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_ICC_VERSION >= 1400 && !defined( HALF_ENABLE_CPP11_NOEXCEPT ) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_ICC_VERSION >= 1110 && !defined( HALF_ENABLE_CPP11_STATIC_ASSERT ) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if HALF_ICC_VERSION >= 1110 && !defined( HALF_ENABLE_CPP11_LONG_LONG ) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#elif defined( __GNUC__ ) // gcc 
+#if defined( __GXX_EXPERIMENTAL_CXX0X__ ) || __cplusplus >= 201103L +#if HALF_GCC_VERSION >= 408 && !defined( HALF_ENABLE_CPP11_THREAD_LOCAL ) +#define HALF_ENABLE_CPP11_THREAD_LOCAL 1 +#endif +#if HALF_GCC_VERSION >= 407 && !defined( HALF_ENABLE_CPP11_USER_LITERALS ) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if HALF_GCC_VERSION >= 406 && !defined( HALF_ENABLE_CPP11_CONSTEXPR ) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if HALF_GCC_VERSION >= 406 && !defined( HALF_ENABLE_CPP11_NOEXCEPT ) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined( HALF_ENABLE_CPP11_STATIC_ASSERT ) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if !defined( HALF_ENABLE_CPP11_LONG_LONG ) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#endif +#define HALF_TWOS_COMPLEMENT_INT 1 +#elif defined( _MSC_VER ) // Visual C++ +#if _MSC_VER >= 1900 && !defined( HALF_ENABLE_CPP11_THREAD_LOCAL ) +#define HALF_ENABLE_CPP11_THREAD_LOCAL 1 +#endif +#if _MSC_VER >= 1900 && !defined( HALF_ENABLE_CPP11_USER_LITERALS ) +#define HALF_ENABLE_CPP11_USER_LITERALS 1 +#endif +#if _MSC_VER >= 1900 && !defined( HALF_ENABLE_CPP11_CONSTEXPR ) +#define HALF_ENABLE_CPP11_CONSTEXPR 1 +#endif +#if _MSC_VER >= 1900 && !defined( HALF_ENABLE_CPP11_NOEXCEPT ) +#define HALF_ENABLE_CPP11_NOEXCEPT 1 +#endif +#if _MSC_VER >= 1600 && !defined( HALF_ENABLE_CPP11_STATIC_ASSERT ) +#define HALF_ENABLE_CPP11_STATIC_ASSERT 1 +#endif +#if _MSC_VER >= 1310 && !defined( HALF_ENABLE_CPP11_LONG_LONG ) +#define HALF_ENABLE_CPP11_LONG_LONG 1 +#endif +#define HALF_TWOS_COMPLEMENT_INT 1 +#define HALF_POP_WARNINGS 1 +#pragma warning( push ) +#pragma warning( disable : 4099 4127 4146 ) // struct vs class, constant in if, negative unsigned +#endif + +// check C++11 library features +#include +#if defined( _LIBCPP_VERSION ) // libc++ +#if defined( __GXX_EXPERIMENTAL_CXX0X__ ) || __cplusplus >= 201103 +#ifndef HALF_ENABLE_CPP11_TYPE_TRAITS +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#ifndef HALF_ENABLE_CPP11_CSTDINT +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#ifndef HALF_ENABLE_CPP11_CMATH +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#ifndef HALF_ENABLE_CPP11_HASH +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#ifndef HALF_ENABLE_CPP11_CFENV +#define HALF_ENABLE_CPP11_CFENV 1 +#endif +#endif +#elif defined( __GLIBCXX__ ) // libstdc++ +#if defined( __GXX_EXPERIMENTAL_CXX0X__ ) || __cplusplus >= 201103 +#ifdef __clang__ +#if __GLIBCXX__ >= 20080606 && !defined( HALF_ENABLE_CPP11_TYPE_TRAITS ) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined( HALF_ENABLE_CPP11_CSTDINT ) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined( HALF_ENABLE_CPP11_CMATH ) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined( HALF_ENABLE_CPP11_HASH ) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#if __GLIBCXX__ >= 20080606 && !defined( HALF_ENABLE_CPP11_CFENV ) +#define HALF_ENABLE_CPP11_CFENV 1 +#endif +#else +#if HALF_GCC_VERSION >= 403 && !defined( HALF_ENABLE_CPP11_TYPE_TRAITS ) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined( HALF_ENABLE_CPP11_CSTDINT ) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined( HALF_ENABLE_CPP11_CMATH ) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined( HALF_ENABLE_CPP11_HASH ) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#if HALF_GCC_VERSION >= 403 && !defined( HALF_ENABLE_CPP11_CFENV ) +#define 
HALF_ENABLE_CPP11_CFENV 1 +#endif +#endif +#endif +#elif defined( _CPPLIB_VER ) // Dinkumware/Visual C++ +#if _CPPLIB_VER >= 520 && !defined( HALF_ENABLE_CPP11_TYPE_TRAITS ) +#define HALF_ENABLE_CPP11_TYPE_TRAITS 1 +#endif +#if _CPPLIB_VER >= 520 && !defined( HALF_ENABLE_CPP11_CSTDINT ) +#define HALF_ENABLE_CPP11_CSTDINT 1 +#endif +#if _CPPLIB_VER >= 520 && !defined( HALF_ENABLE_CPP11_HASH ) +#define HALF_ENABLE_CPP11_HASH 1 +#endif +#if _CPPLIB_VER >= 610 && !defined( HALF_ENABLE_CPP11_CMATH ) +#define HALF_ENABLE_CPP11_CMATH 1 +#endif +#if _CPPLIB_VER >= 610 && !defined( HALF_ENABLE_CPP11_CFENV ) +#define HALF_ENABLE_CPP11_CFENV 1 +#endif +#endif +#undef HALF_GCC_VERSION +#undef HALF_ICC_VERSION + +// any error throwing C++ exceptions? +#if defined( HALF_ERRHANDLING_THROW_INVALID ) || defined( HALF_ERRHANDLING_THROW_DIVBYZERO ) || defined( HALF_ERRHANDLING_THROW_OVERFLOW ) || defined( HALF_ERRHANDLING_THROW_UNDERFLOW ) || defined( HALF_ERRHANDLING_THROW_INEXACT ) +#define HALF_ERRHANDLING_THROWS 1 +#endif + +// any error handling enabled? +#define HALF_ERRHANDLING ( HALF_ERRHANDLING_FLAGS || HALF_ERRHANDLING_ERRNO || HALF_ERRHANDLING_FENV || HALF_ERRHANDLING_THROWS ) + +#if HALF_ERRHANDLING +#define HALF_UNUSED_NOERR( name ) name +#else +#define HALF_UNUSED_NOERR( name ) +#endif + +// support constexpr +#if HALF_ENABLE_CPP11_CONSTEXPR +#define HALF_CONSTEXPR constexpr +#define HALF_CONSTEXPR_CONST constexpr +#if HALF_ERRHANDLING +#define HALF_CONSTEXPR_NOERR +#else +#define HALF_CONSTEXPR_NOERR constexpr +#endif +#else +#define HALF_CONSTEXPR +#define HALF_CONSTEXPR_CONST const +#define HALF_CONSTEXPR_NOERR +#endif + +// support noexcept +#if HALF_ENABLE_CPP11_NOEXCEPT +#define HALF_NOEXCEPT noexcept +#define HALF_NOTHROW noexcept +#else +#define HALF_NOEXCEPT +#define HALF_NOTHROW throw() +#endif + +// support thread storage +#if HALF_ENABLE_CPP11_THREAD_LOCAL +#define HALF_THREAD_LOCAL thread_local +#else +#define HALF_THREAD_LOCAL static +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if HALF_ENABLE_CPP11_TYPE_TRAITS +#include +#endif +#if HALF_ENABLE_CPP11_CSTDINT +#include +#endif +#if HALF_ERRHANDLING_ERRNO +#include +#endif +#if HALF_ENABLE_CPP11_CFENV +#include +#endif +#if HALF_ENABLE_CPP11_HASH +#include +#endif + +#ifndef HALF_ENABLE_F16C_INTRINSICS +/// Enable F16C intruction set intrinsics. +/// Defining this to 1 enables the use of [F16C compiler intrinsics](https://en.wikipedia.org/wiki/F16C) for converting between +/// half-precision and single-precision values which may result in improved performance. This will not perform additional checks +/// for support of the F16C instruction set, so an appropriate target platform is required when enabling this feature. +/// +/// Unless predefined it will be enabled automatically when the `__F16C__` symbol is defined, which some compilers do on supporting platforms. +#define HALF_ENABLE_F16C_INTRINSICS __F16C__ +#endif +#if HALF_ENABLE_F16C_INTRINSICS +#include +#endif + +#ifdef HALF_DOXYGEN_ONLY +/// Type for internal floating-point computations. +/// This can be predefined to a built-in floating-point type (`float`, `double` or `long double`) to override the internal +/// half-precision implementation to use this type for computing arithmetic operations and mathematical function (if available). 
+/// This can result in improved performance for arithmetic operators and mathematical functions but might cause results to +/// deviate from the specified half-precision rounding mode and inhibits proper detection of half-precision exceptions. +#define HALF_ARITHMETIC_TYPE ( undefined ) + +/// Enable internal exception flags. +/// Defining this to 1 causes operations on half-precision values to raise internal floating-point exception flags according to +/// the IEEE 754 standard. These can then be cleared and checked with clearexcept(), testexcept(). +#define HALF_ERRHANDLING_FLAGS 0 + +/// Enable exception propagation to `errno`. +/// Defining this to 1 causes operations on half-precision values to propagate floating-point exceptions to +/// [errno](https://en.cppreference.com/w/cpp/error/errno) from ``. Specifically this will propagate domain errors as +/// [EDOM](https://en.cppreference.com/w/cpp/error/errno_macros) and pole, overflow and underflow errors as +/// [ERANGE](https://en.cppreference.com/w/cpp/error/errno_macros). Inexact errors won't be propagated. +#define HALF_ERRHANDLING_ERRNO 0 + +/// Enable exception propagation to built-in floating-point platform. +/// Defining this to 1 causes operations on half-precision values to propagate floating-point exceptions to the built-in +/// single- and double-precision implementation's exception flags using the +/// [C++11 floating-point environment control](https://en.cppreference.com/w/cpp/numeric/fenv) from ``. However, this +/// does not work in reverse and single- or double-precision exceptions will not raise the corresponding half-precision +/// exception flags, nor will explicitly clearing flags clear the corresponding built-in flags. +#define HALF_ERRHANDLING_FENV 0 + +/// Throw C++ exception on domain errors. +/// Defining this to a string literal causes operations on half-precision values to throw a +/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified message on domain errors. +#define HALF_ERRHANDLING_THROW_INVALID ( undefined ) + +/// Throw C++ exception on pole errors. +/// Defining this to a string literal causes operations on half-precision values to throw a +/// [std::domain_error](https://en.cppreference.com/w/cpp/error/domain_error) with the specified message on pole errors. +#define HALF_ERRHANDLING_THROW_DIVBYZERO ( undefined ) + +/// Throw C++ exception on overflow errors. +/// Defining this to a string literal causes operations on half-precision values to throw a +/// [std::overflow_error](https://en.cppreference.com/w/cpp/error/overflow_error) with the specified message on overflows. +#define HALF_ERRHANDLING_THROW_OVERFLOW ( undefined ) + +/// Throw C++ exception on underflow errors. +/// Defining this to a string literal causes operations on half-precision values to throw a +/// [std::underflow_error](https://en.cppreference.com/w/cpp/error/underflow_error) with the specified message on underflows. +#define HALF_ERRHANDLING_THROW_UNDERFLOW ( undefined ) + +/// Throw C++ exception on rounding errors. +/// Defining this to 1 causes operations on half-precision values to throw a +/// [std::range_error](https://en.cppreference.com/w/cpp/error/range_error) with the specified message on general rounding errors. +#define HALF_ERRHANDLING_THROW_INEXACT ( undefined ) +#endif + +#ifndef HALF_ERRHANDLING_OVERFLOW_TO_INEXACT +/// Raise INEXACT exception on overflow. +/// Defining this to 1 (default) causes overflow errors to automatically raise inexact exceptions in addition. 
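The error-handling switches documented in the block above are consumed at preprocessing time, so a consumer has to define them before the first include of half.hpp. A minimal consumer-side sketch, assuming only the macro semantics described in these comments; the message string and the test values are illustrative and not taken from VTFLib:

~~~~{.cpp}
// Illustrative only: request a std::overflow_error on half-precision overflow.
// The macro must be visible before the header is included.
#define HALF_ERRHANDLING_THROW_OVERFLOW "half-precision overflow"
#include "half.hpp"

#include <cstdio>
#include <stdexcept>

int main()
{
    half_float::half big( 60000.0f );    // exactly representable; half's largest finite value is 65504
    try
    {
        half_float::half h = big * big;  // ~3.6e9 exceeds the half range -> FE_OVERFLOW
        std::printf( "%f\n", static_cast<float>( h ) );
    }
    catch ( const std::overflow_error &e )
    {
        std::printf( "caught: %s\n", e.what() );  // thrown because of the macro defined above
    }
    return 0;
}
~~~~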
+/// These will be raised after any possible handling of the underflow exception. +#define HALF_ERRHANDLING_OVERFLOW_TO_INEXACT 1 +#endif + +#ifndef HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT +/// Raise INEXACT exception on underflow. +/// Defining this to 1 (default) causes underflow errors to automatically raise inexact exceptions in addition. +/// These will be raised after any possible handling of the underflow exception. +/// +/// **Note:** This will actually cause underflow (and the accompanying inexact) exceptions to be raised *only* when the result +/// is inexact, while if disabled bare underflow errors will be raised for *any* (possibly exact) subnormal result. +#define HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT 1 +#endif + +/// Default rounding mode. +/// This specifies the rounding mode used for all conversions between [half](\ref half_float::half)s and more precise types +/// (unless using half_cast() and specifying the rounding mode directly) as well as in arithmetic operations and mathematical +/// functions. It can be redefined (before including half.hpp) to one of the standard rounding modes using their respective +/// constants or the equivalent values of +/// [std::float_round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/float_round_style): +/// +/// `std::float_round_style` | value | rounding +/// ---------------------------------|-------|------------------------- +/// `std::round_indeterminate` | -1 | fastest +/// `std::round_toward_zero` | 0 | toward zero +/// `std::round_to_nearest` | 1 | to nearest (default) +/// `std::round_toward_infinity` | 2 | toward positive infinity +/// `std::round_toward_neg_infinity` | 3 | toward negative infinity +/// +/// By default this is set to `1` (`std::round_to_nearest`), which rounds results to the nearest representable value. It can even +/// be set to [std::numeric_limits::round_style](https://en.cppreference.com/w/cpp/types/numeric_limits/round_style) to synchronize +/// the rounding mode with that of the built-in single-precision implementation (which is likely `std::round_to_nearest`, though). +#ifndef HALF_ROUND_STYLE +#define HALF_ROUND_STYLE 1 // = std::round_to_nearest +#endif + +/// Value signaling overflow. +/// In correspondence with `HUGE_VAL[F|L]` from `` this symbol expands to a positive value signaling the overflow of an +/// operation, in particular it just evaluates to positive infinity. +/// +/// **See also:** Documentation for [HUGE_VAL](https://en.cppreference.com/w/cpp/numeric/math/HUGE_VAL) +#define HUGE_VALH std::numeric_limits::infinity() + +/// Fast half-precision fma function. +/// This symbol is defined if the fma() function generally executes as fast as, or faster than, a separate +/// half-precision multiplication followed by an addition, which is always the case. +/// +/// **See also:** Documentation for [FP_FAST_FMA](https://en.cppreference.com/w/cpp/numeric/math/fma) +#define FP_FAST_FMAH 1 + +/// Half rounding mode. +/// In correspondence with `FLT_ROUNDS` from `` this symbol expands to the rounding mode used for +/// half-precision operations. It is an alias for [HALF_ROUND_STYLE](\ref HALF_ROUND_STYLE). 
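HALF_ROUND_STYLE is likewise read at preprocessing time, so changing the default rounding of implicit conversions is a pre-include define. A sketch under that assumption; the half_cast() call mirrors the per-cast override mentioned above, with its template parameter order taken from the upstream documentation:

~~~~{.cpp}
// Illustrative only: truncate on every implicit float <-> half conversion.
#define HALF_ROUND_STYLE 0   // = std::round_toward_zero, see the table above
#include "half.hpp"

#include <limits>

int main()
{
    half_float::half a( 3.3f );          // truncates to 3.298828125
    // half_cast() can still pick a different rounding mode for a single conversion
    half_float::half b =
        half_float::half_cast<half_float::half, std::round_toward_infinity>( 3.3f );  // 3.30078125
    return ( static_cast<float>( a ) < 3.3f && static_cast<float>( b ) > 3.3f ) ? 0 : 1;
}
~~~~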
+/// +/// **See also:** Documentation for [FLT_ROUNDS](https://en.cppreference.com/w/cpp/types/climits/FLT_ROUNDS) +#define HLF_ROUNDS HALF_ROUND_STYLE + +#ifndef FP_ILOGB0 +#define FP_ILOGB0 INT_MIN +#endif +#ifndef FP_ILOGBNAN +#define FP_ILOGBNAN INT_MAX +#endif +#ifndef FP_SUBNORMAL +#define FP_SUBNORMAL 0 +#endif +#ifndef FP_ZERO +#define FP_ZERO 1 +#endif +#ifndef FP_NAN +#define FP_NAN 2 +#endif +#ifndef FP_INFINITE +#define FP_INFINITE 3 +#endif +#ifndef FP_NORMAL +#define FP_NORMAL 4 +#endif + +#if !HALF_ENABLE_CPP11_CFENV && !defined( FE_ALL_EXCEPT ) +#define FE_INVALID 0x10 +#define FE_DIVBYZERO 0x08 +#define FE_OVERFLOW 0x04 +#define FE_UNDERFLOW 0x02 +#define FE_INEXACT 0x01 +#define FE_ALL_EXCEPT ( FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW | FE_INEXACT ) +#endif + +/// Main namespace for half-precision functionality. +/// This namespace contains all the functionality provided by the library. +namespace half_float +{ + class half; + +#if HALF_ENABLE_CPP11_USER_LITERALS + /// Library-defined half-precision literals. + /// Import this namespace to enable half-precision floating-point literals: + /// ~~~~{.cpp} + /// using namespace half_float::literal; + /// half_float::half = 4.2_h; + /// ~~~~ + namespace literal + { + half operator"" _h( long double ); + } +#endif + + /// \internal + /// \brief Implementation details. + namespace detail + { +#if HALF_ENABLE_CPP11_TYPE_TRAITS + /// Conditional type. + template + struct conditional : std::conditional + { + }; + + /// Helper for tag dispatching. + template + struct bool_type : std::integral_constant + { + }; + using std::false_type; + using std::true_type; + + /// Type traits for floating-point types. + template + struct is_float : std::is_floating_point + { + }; +#else + /// Conditional type. + template + struct conditional + { + typedef T type; + }; + template + struct conditional + { + typedef F type; + }; + + /// Helper for tag dispatching. + template + struct bool_type + { + }; + typedef bool_type true_type; + typedef bool_type false_type; + + /// Type traits for floating-point types. + template + struct is_float : false_type + { + }; + template + struct is_float : is_float + { + }; + template + struct is_float : is_float + { + }; + template + struct is_float : is_float + { + }; + template <> + struct is_float : true_type + { + }; + template <> + struct is_float : true_type + { + }; + template <> + struct is_float : true_type + { + }; +#endif + + /// Type traits for floating-point bits. + template + struct bits + { + typedef unsigned char type; + }; + template + struct bits : bits + { + }; + template + struct bits : bits + { + }; + template + struct bits : bits + { + }; + +#if HALF_ENABLE_CPP11_CSTDINT + /// Unsigned integer of (at least) 16 bits width. + typedef std::uint_least16_t uint16; + + /// Fastest unsigned integer of (at least) 32 bits width. + typedef std::uint_fast32_t uint32; + + /// Fastest signed integer of (at least) 32 bits width. + typedef std::int_fast32_t int32; + + /// Unsigned integer of (at least) 32 bits width. + template <> + struct bits + { + typedef std::uint_least32_t type; + }; + + /// Unsigned integer of (at least) 64 bits width. + template <> + struct bits + { + typedef std::uint_least64_t type; + }; +#else + /// Unsigned integer of (at least) 16 bits width. + typedef unsigned short uint16; + + /// Fastest unsigned integer of (at least) 32 bits width. + typedef unsigned long uint32; + + /// Fastest unsigned integer of (at least) 32 bits width. 
+ typedef long int32; + + /// Unsigned integer of (at least) 32 bits width. + template <> + struct bits : conditional::digits >= 32, unsigned int, unsigned long> + { + }; + +#if HALF_ENABLE_CPP11_LONG_LONG + /// Unsigned integer of (at least) 64 bits width. + template <> + struct bits : conditional::digits >= 64, unsigned long, unsigned long long> + { + }; +#else + /// Unsigned integer of (at least) 64 bits width. + template <> + struct bits + { + typedef unsigned long type; + }; +#endif +#endif + +#ifdef HALF_ARITHMETIC_TYPE + /// Type to use for arithmetic computations and mathematic functions internally. + typedef HALF_ARITHMETIC_TYPE internal_t; +#endif + + /// Tag type for binary construction. + struct binary_t + { + }; + + /// Tag for binary construction. + HALF_CONSTEXPR_CONST binary_t binary = binary_t(); + + /// \name Implementation defined classification and arithmetic + /// \{ + + /// Check for infinity. + /// \tparam T argument type (builtin floating-point type) + /// \param arg value to query + /// \retval true if infinity + /// \retval false else + template + bool builtin_isinf( T arg ) + { +#if HALF_ENABLE_CPP11_CMATH + return std::isinf( arg ); +#elif defined( _MSC_VER ) + return !::_finite( static_cast( arg ) ) && !::_isnan( static_cast( arg ) ); +#else + return arg == std::numeric_limits::infinity() || arg == -std::numeric_limits::infinity(); +#endif + } + + /// Check for NaN. + /// \tparam T argument type (builtin floating-point type) + /// \param arg value to query + /// \retval true if not a number + /// \retval false else + template + bool builtin_isnan( T arg ) + { +#if HALF_ENABLE_CPP11_CMATH + return std::isnan( arg ); +#elif defined( _MSC_VER ) + return ::_isnan( static_cast( arg ) ) != 0; +#else + return arg != arg; +#endif + } + + /// Check sign. + /// \tparam T argument type (builtin floating-point type) + /// \param arg value to query + /// \retval true if signbit set + /// \retval false else + template + bool builtin_signbit( T arg ) + { +#if HALF_ENABLE_CPP11_CMATH + return std::signbit( arg ); +#else + return arg < T() || ( arg == T() && T( 1 ) / arg < T() ); +#endif + } + + /// Platform-independent sign mask. + /// \param arg integer value in two's complement + /// \retval -1 if \a arg negative + /// \retval 0 if \a arg positive + inline uint32 sign_mask( uint32 arg ) + { + static const int N = std::numeric_limits::digits - 1; +#if HALF_TWOS_COMPLEMENT_INT + return static_cast( arg ) >> N; +#else + return -( ( arg >> N ) & 1 ); +#endif + } + + /// Platform-independent arithmetic right shift. + /// \param arg integer value in two's complement + /// \param i shift amount (at most 31) + /// \return \a arg right shifted for \a i bits with possible sign extension + inline uint32 arithmetic_shift( uint32 arg, int i ) + { +#if HALF_TWOS_COMPLEMENT_INT + return static_cast( arg ) >> i; +#else + return static_cast( arg ) / ( static_cast( 1 ) << i ) - ( ( arg >> ( std::numeric_limits::digits - 1 ) ) & 1 ); +#endif + } + + /// \} + /// \name Error handling + /// \{ + + /// Internal exception flags. + /// \return reference to global exception flags + inline int &errflags() + { + HALF_THREAD_LOCAL int flags = 0; + return flags; + } + + /// Raise floating-point exception. 
+ /// \param flags exceptions to raise + /// \param cond condition to raise exceptions for + inline void raise( int HALF_UNUSED_NOERR( flags ), bool HALF_UNUSED_NOERR( cond ) = true ) + { +#if HALF_ERRHANDLING + if ( !cond ) + return; +#if HALF_ERRHANDLING_FLAGS + errflags() |= flags; +#endif +#if HALF_ERRHANDLING_ERRNO + if ( flags & FE_INVALID ) + errno = EDOM; + else if ( flags & ( FE_DIVBYZERO | FE_OVERFLOW | FE_UNDERFLOW ) ) + errno = ERANGE; +#endif +#if HALF_ERRHANDLING_FENV && HALF_ENABLE_CPP11_CFENV + std::feraiseexcept( flags ); +#endif +#ifdef HALF_ERRHANDLING_THROW_INVALID + if ( flags & FE_INVALID ) + throw std::domain_error( HALF_ERRHANDLING_THROW_INVALID ); +#endif +#ifdef HALF_ERRHANDLING_THROW_DIVBYZERO + if ( flags & FE_DIVBYZERO ) + throw std::domain_error( HALF_ERRHANDLING_THROW_DIVBYZERO ); +#endif +#ifdef HALF_ERRHANDLING_THROW_OVERFLOW + if ( flags & FE_OVERFLOW ) + throw std::overflow_error( HALF_ERRHANDLING_THROW_OVERFLOW ); +#endif +#ifdef HALF_ERRHANDLING_THROW_UNDERFLOW + if ( flags & FE_UNDERFLOW ) + throw std::underflow_error( HALF_ERRHANDLING_THROW_UNDERFLOW ); +#endif +#ifdef HALF_ERRHANDLING_THROW_INEXACT + if ( flags & FE_INEXACT ) + throw std::range_error( HALF_ERRHANDLING_THROW_INEXACT ); +#endif +#if HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT + if ( ( flags & FE_UNDERFLOW ) && !( flags & FE_INEXACT ) ) + raise( FE_INEXACT ); +#endif +#if HALF_ERRHANDLING_OVERFLOW_TO_INEXACT + if ( ( flags & FE_OVERFLOW ) && !( flags & FE_INEXACT ) ) + raise( FE_INEXACT ); +#endif +#endif + } + + /// Check and signal for any NaN. + /// \param x first half-precision value to check + /// \param y second half-precision value to check + /// \retval true if either \a x or \a y is NaN + /// \retval false else + /// \exception FE_INVALID if \a x or \a y is NaN + inline HALF_CONSTEXPR_NOERR bool compsignal( unsigned int x, unsigned int y ) + { +#if HALF_ERRHANDLING + raise( FE_INVALID, ( x & 0x7FFF ) > 0x7C00 || ( y & 0x7FFF ) > 0x7C00 ); +#endif + return ( x & 0x7FFF ) > 0x7C00 || ( y & 0x7FFF ) > 0x7C00; + } + + /// Signal and silence signaling NaN. + /// \param nan half-precision NaN value + /// \return quiet NaN + /// \exception FE_INVALID if \a nan is signaling NaN + inline HALF_CONSTEXPR_NOERR unsigned int signal( unsigned int nan ) + { +#if HALF_ERRHANDLING + raise( FE_INVALID, !( nan & 0x200 ) ); +#endif + return nan | 0x200; + } + + /// Signal and silence signaling NaNs. + /// \param x first half-precision value to check + /// \param y second half-precision value to check + /// \return quiet NaN + /// \exception FE_INVALID if \a x or \a y is signaling NaN + inline HALF_CONSTEXPR_NOERR unsigned int signal( unsigned int x, unsigned int y ) + { +#if HALF_ERRHANDLING + raise( FE_INVALID, ( ( x & 0x7FFF ) > 0x7C00 && !( x & 0x200 ) ) || ( ( y & 0x7FFF ) > 0x7C00 && !( y & 0x200 ) ) ); +#endif + return ( ( x & 0x7FFF ) > 0x7C00 ) ? ( x | 0x200 ) : ( y | 0x200 ); + } + + /// Signal and silence signaling NaNs. 
+ /// \param x first half-precision value to check + /// \param y second half-precision value to check + /// \param z third half-precision value to check + /// \return quiet NaN + /// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN + inline HALF_CONSTEXPR_NOERR unsigned int signal( unsigned int x, unsigned int y, unsigned int z ) + { +#if HALF_ERRHANDLING + raise( FE_INVALID, ( ( x & 0x7FFF ) > 0x7C00 && !( x & 0x200 ) ) || ( ( y & 0x7FFF ) > 0x7C00 && !( y & 0x200 ) ) || ( ( z & 0x7FFF ) > 0x7C00 && !( z & 0x200 ) ) ); +#endif + return ( ( x & 0x7FFF ) > 0x7C00 ) ? ( x | 0x200 ) : ( ( y & 0x7FFF ) > 0x7C00 ) ? ( y | 0x200 ) : + ( z | 0x200 ); + } + + /// Select value or signaling NaN. + /// \param x preferred half-precision value + /// \param y ignored half-precision value except for signaling NaN + /// \return \a y if signaling NaN, \a x otherwise + /// \exception FE_INVALID if \a y is signaling NaN + inline HALF_CONSTEXPR_NOERR unsigned int select( unsigned int x, unsigned int HALF_UNUSED_NOERR( y ) ) + { +#if HALF_ERRHANDLING + return ( ( ( y & 0x7FFF ) > 0x7C00 ) && !( y & 0x200 ) ) ? signal( y ) : x; +#else + return x; +#endif + } + + /// Raise domain error and return NaN. + /// return quiet NaN + /// \exception FE_INVALID + inline HALF_CONSTEXPR_NOERR unsigned int invalid() + { +#if HALF_ERRHANDLING + raise( FE_INVALID ); +#endif + return 0x7FFF; + } + + /// Raise pole error and return infinity. + /// \param sign half-precision value with sign bit only + /// \return half-precision infinity with sign of \a sign + /// \exception FE_DIVBYZERO + inline HALF_CONSTEXPR_NOERR unsigned int pole( unsigned int sign = 0 ) + { +#if HALF_ERRHANDLING + raise( FE_DIVBYZERO ); +#endif + return sign | 0x7C00; + } + + /// Check value for underflow. + /// \param arg non-zero half-precision value to check + /// \return \a arg + /// \exception FE_UNDERFLOW if arg is subnormal + inline HALF_CONSTEXPR_NOERR unsigned int check_underflow( unsigned int arg ) + { +#if HALF_ERRHANDLING && !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT + raise( FE_UNDERFLOW, !( arg & 0x7C00 ) ); +#endif + return arg; + } + + /// \} + /// \name Conversion and rounding + /// \{ + + /// Half-precision overflow. + /// \tparam R rounding mode to use + /// \param sign half-precision value with sign bit only + /// \return rounded overflowing half-precision value + /// \exception FE_OVERFLOW + template + HALF_CONSTEXPR_NOERR unsigned int overflow( unsigned int sign = 0 ) + { +#if HALF_ERRHANDLING + raise( FE_OVERFLOW ); +#endif + return ( R == std::round_toward_infinity ) ? ( sign + 0x7C00 - ( sign >> 15 ) ) : + ( R == std::round_toward_neg_infinity ) ? ( sign + 0x7BFF + ( sign >> 15 ) ) : + ( R == std::round_toward_zero ) ? ( sign | 0x7BFF ) : + ( sign | 0x7C00 ); + } + + /// Half-precision underflow. + /// \tparam R rounding mode to use + /// \param sign half-precision value with sign bit only + /// \return rounded underflowing half-precision value + /// \exception FE_UNDERFLOW + template + HALF_CONSTEXPR_NOERR unsigned int underflow( unsigned int sign = 0 ) + { +#if HALF_ERRHANDLING + raise( FE_UNDERFLOW ); +#endif + return ( R == std::round_toward_infinity ) ? ( sign + 1 - ( sign >> 15 ) ) : + ( R == std::round_toward_neg_infinity ) ? ( sign + ( sign >> 15 ) ) : + sign; + } + + /// Round half-precision number. 
+ /// \tparam R rounding mode to use + /// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results + /// \param value finite half-precision number to round + /// \param g guard bit (most significant discarded bit) + /// \param s sticky bit (or of all but the most significant discarded bits) + /// \return rounded half-precision value + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if value had to be rounded or \a I is `true` + template + HALF_CONSTEXPR_NOERR unsigned int rounded( unsigned int value, int g, int s ) + { +#if HALF_ERRHANDLING + value += ( R == std::round_to_nearest ) ? ( g & ( s | value ) ) : + ( R == std::round_toward_infinity ) ? ( ~( value >> 15 ) & ( g | s ) ) : + ( R == std::round_toward_neg_infinity ) ? ( ( value >> 15 ) & ( g | s ) ) : + 0; + if ( ( value & 0x7C00 ) == 0x7C00 ) + raise( FE_OVERFLOW ); + else if ( value & 0x7C00 ) + raise( FE_INEXACT, I || ( g | s ) != 0 ); + else + raise( FE_UNDERFLOW, !( HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT ) || I || ( g | s ) != 0 ); + return value; +#else + return ( R == std::round_to_nearest ) ? ( value + ( g & ( s | value ) ) ) : + ( R == std::round_toward_infinity ) ? ( value + ( ~( value >> 15 ) & ( g | s ) ) ) : + ( R == std::round_toward_neg_infinity ) ? ( value + ( ( value >> 15 ) & ( g | s ) ) ) : + value; +#endif + } + + /// Round half-precision number to nearest integer value. + /// \tparam R rounding mode to use + /// \tparam E `true` for round to even, `false` for round away from zero + /// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it + /// \param value half-precision value to round + /// \return half-precision bits for nearest integral value + /// \exception FE_INVALID for signaling NaN + /// \exception FE_INEXACT if value had to be rounded and \a I is `true` + template + unsigned int integral( unsigned int value ) + { + unsigned int abs = value & 0x7FFF; + if ( abs < 0x3C00 ) + { + raise( FE_INEXACT, I ); + return ( ( R == std::round_to_nearest ) ? ( 0x3C00 & -static_cast( abs >= ( 0x3800 + E ) ) ) : + ( R == std::round_toward_infinity ) ? ( 0x3C00 & -( ~( value >> 15 ) & ( abs != 0 ) ) ) : + ( R == std::round_toward_neg_infinity ) ? ( 0x3C00 & -static_cast( value > 0x8000 ) ) : + 0 ) | + ( value & 0x8000 ); + } + if ( abs >= 0x6400 ) + return ( abs > 0x7C00 ) ? signal( value ) : value; + unsigned int exp = 25 - ( abs >> 10 ), mask = ( 1 << exp ) - 1; + raise( FE_INEXACT, I && ( value & mask ) ); + return ( ( ( R == std::round_to_nearest ) ? ( ( 1 << ( exp - 1 ) ) - ( ~( value >> exp ) & E ) ) : + ( R == std::round_toward_infinity ) ? ( mask & ( ( value >> 15 ) - 1 ) ) : + ( R == std::round_toward_neg_infinity ) ? ( mask & -( value >> 15 ) ) : + 0 ) + + value ) & + ~mask; + } + + /// Convert fixed point to half-precision floating-point. 
+ /// \tparam R rounding mode to use + /// \tparam F number of fractional bits in [11,31] + /// \tparam S `true` for signed, `false` for unsigned + /// \tparam N `true` for additional normalization step, `false` if already normalized to 1.F + /// \tparam I `true` to always raise INEXACT exception, `false` to raise only for rounded results + /// \param m mantissa in Q1.F fixed point format + /// \param exp biased exponent - 1 + /// \param sign half-precision value with sign bit only + /// \param s sticky bit (or of all but the most significant already discarded bits) + /// \return value converted to half-precision + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if value had to be rounded or \a I is `true` + template + unsigned int fixed2half( uint32 m, int exp = 14, unsigned int sign = 0, int s = 0 ) + { + if ( S ) + { + uint32 msign = sign_mask( m ); + m = ( m ^ msign ) - msign; + sign = msign & 0x8000; + } + if ( N ) + for ( ; m < ( static_cast( 1 ) << F ) && exp; m <<= 1, --exp ) + ; + else if ( exp < 0 ) + return rounded( sign + ( m >> ( F - 10 - exp ) ), ( m >> ( F - 11 - exp ) ) & 1, s | ( ( m & ( ( static_cast( 1 ) << ( F - 11 - exp ) ) - 1 ) ) != 0 ) ); + return rounded( sign + ( exp << 10 ) + ( m >> ( F - 10 ) ), ( m >> ( F - 11 ) ) & 1, s | ( ( m & ( ( static_cast( 1 ) << ( F - 11 ) ) - 1 ) ) != 0 ) ); + } + + /// Convert IEEE single-precision to half-precision. + /// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). + /// \tparam R rounding mode to use + /// \param value single-precision value to convert + /// \return rounded half-precision value + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if value had to be rounded + template + unsigned int float2half_impl( float value, true_type ) + { +#if HALF_ENABLE_F16C_INTRINSICS + return _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_set_ss( value ), + ( R == std::round_to_nearest ) ? _MM_FROUND_TO_NEAREST_INT : + ( R == std::round_toward_zero ) ? _MM_FROUND_TO_ZERO : + ( R == std::round_toward_infinity ) ? _MM_FROUND_TO_POS_INF : + ( R == std::round_toward_neg_infinity ) ? _MM_FROUND_TO_NEG_INF : + _MM_FROUND_CUR_DIRECTION ) ); +#else + bits::type fbits; + std::memcpy( &fbits, &value, sizeof( float ) ); +#if 1 + unsigned int sign = ( fbits >> 16 ) & 0x8000; + fbits &= 0x7FFFFFFF; + if ( fbits >= 0x7F800000 ) + return sign | 0x7C00 | ( ( fbits > 0x7F800000 ) ? 
( 0x200 | ( ( fbits >> 13 ) & 0x3FF ) ) : 0 ); + if ( fbits >= 0x47800000 ) + return overflow( sign ); + if ( fbits >= 0x38800000 ) + return rounded( sign | ( ( ( fbits >> 23 ) - 112 ) << 10 ) | ( ( fbits >> 13 ) & 0x3FF ), ( fbits >> 12 ) & 1, ( fbits & 0xFFF ) != 0 ); + if ( fbits >= 0x33000000 ) + { + int i = 125 - ( fbits >> 23 ); + fbits = ( fbits & 0x7FFFFF ) | 0x800000; + return rounded( sign | ( fbits >> ( i + 1 ) ), ( fbits >> i ) & 1, ( fbits & ( ( static_cast( 1 ) << i ) - 1 ) ) != 0 ); + } + if ( fbits != 0 ) + return underflow( sign ); + return sign; +#else + static const uint16 base_table[512] = { + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, 0x0100, + 0x0200, 0x0400, 0x0800, 0x0C00, 0x1000, 0x1400, 0x1800, 0x1C00, 0x2000, 0x2400, 0x2800, 0x2C00, 0x3000, 0x3400, 0x3800, 0x3C00, + 0x4000, 0x4400, 0x4800, 0x4C00, 0x5000, 0x5400, 0x5800, 0x5C00, 0x6000, 0x6400, 0x6800, 0x6C00, 0x7000, 0x7400, 0x7800, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, + 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7BFF, 0x7C00, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, + 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8001, 0x8002, 
0x8004, 0x8008, 0x8010, 0x8020, 0x8040, 0x8080, 0x8100, + 0x8200, 0x8400, 0x8800, 0x8C00, 0x9000, 0x9400, 0x9800, 0x9C00, 0xA000, 0xA400, 0xA800, 0xAC00, 0xB000, 0xB400, 0xB800, 0xBC00, + 0xC000, 0xC400, 0xC800, 0xCC00, 0xD000, 0xD400, 0xD800, 0xDC00, 0xE000, 0xE400, 0xE800, 0xEC00, 0xF000, 0xF400, 0xF800, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, + 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFBFF, 0xFC00 }; + static const unsigned char shift_table[256] = { + 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 13 }; + int sexp = fbits >> 23, exp = sexp & 0xFF, i = shift_table[exp]; + fbits &= 0x7FFFFF; + uint32 m = ( fbits | ( ( exp != 0 ) << 23 ) ) & -static_cast( exp != 0xFF ); + return rounded( base_table[sexp] + ( fbits >> i ), ( m >> ( i - 1 ) ) & 1, ( ( ( static_cast( 1 ) << ( i - 1 ) ) - 1 ) & m ) != 0 ); +#endif +#endif + } + + /// Convert IEEE double-precision to half-precision. + /// \tparam R rounding mode to use + /// \param value double-precision value to convert + /// \return rounded half-precision value + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if value had to be rounded + template + unsigned int float2half_impl( double value, true_type ) + { +#if HALF_ENABLE_F16C_INTRINSICS + if ( R == std::round_indeterminate ) + return _mm_cvtsi128_si32( _mm_cvtps_ph( _mm_cvtpd_ps( _mm_set_sd( value ) ), _MM_FROUND_CUR_DIRECTION ) ); +#endif + bits::type dbits; + std::memcpy( &dbits, &value, sizeof( double ) ); + uint32 hi = dbits >> 32, lo = dbits & 0xFFFFFFFF; + unsigned int sign = ( hi >> 16 ) & 0x8000; + hi &= 0x7FFFFFFF; + if ( hi >= 0x7FF00000 ) + return sign | 0x7C00 | ( ( dbits & 0xFFFFFFFFFFFFF ) ? 
( 0x200 | ( ( hi >> 10 ) & 0x3FF ) ) : 0 ); + if ( hi >= 0x40F00000 ) + return overflow( sign ); + if ( hi >= 0x3F100000 ) + return rounded( sign | ( ( ( hi >> 20 ) - 1008 ) << 10 ) | ( ( hi >> 10 ) & 0x3FF ), ( hi >> 9 ) & 1, ( ( hi & 0x1FF ) | lo ) != 0 ); + if ( hi >= 0x3E600000 ) + { + int i = 1018 - ( hi >> 20 ); + hi = ( hi & 0xFFFFF ) | 0x100000; + return rounded( sign | ( hi >> ( i + 1 ) ), ( hi >> i ) & 1, ( ( hi & ( ( static_cast( 1 ) << i ) - 1 ) ) | lo ) != 0 ); + } + if ( ( hi | lo ) != 0 ) + return underflow( sign ); + return sign; + } + + /// Convert non-IEEE floating-point to half-precision. + /// \tparam R rounding mode to use + /// \tparam T source type (builtin floating-point type) + /// \param value floating-point value to convert + /// \return rounded half-precision value + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if value had to be rounded + template + unsigned int float2half_impl( T value, ... ) + { + unsigned int hbits = static_cast( builtin_signbit( value ) ) << 15; + if ( value == T() ) + return hbits; + if ( builtin_isnan( value ) ) + return hbits | 0x7FFF; + if ( builtin_isinf( value ) ) + return hbits | 0x7C00; + int exp; + std::frexp( value, &exp ); + if ( exp > 16 ) + return overflow( hbits ); + if ( exp < -13 ) + value = std::ldexp( value, 25 ); + else + { + value = std::ldexp( value, 12 - exp ); + hbits |= ( ( exp + 13 ) << 10 ); + } + T ival, frac = std::modf( value, &ival ); + int m = std::abs( static_cast( ival ) ); + return rounded( hbits + ( m >> 1 ), m & 1, frac != T() ); + } + + /// Convert floating-point to half-precision. + /// \tparam R rounding mode to use + /// \tparam T source type (builtin floating-point type) + /// \param value floating-point value to convert + /// \return rounded half-precision value + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if value had to be rounded + template + unsigned int float2half( T value ) + { + return float2half_impl( value, bool_type < std::numeric_limits::is_iec559 && sizeof( typename bits::type ) == sizeof( T ) > () ); + } + + /// Convert integer to half-precision floating-point. + /// \tparam R rounding mode to use + /// \tparam T type to convert (builtin integer type) + /// \param value integral value to convert + /// \return rounded half-precision value + /// \exception FE_OVERFLOW on overflows + /// \exception FE_INEXACT if value had to be rounded + template + unsigned int int2half( T value ) + { + unsigned int bits = static_cast( value < 0 ) << 15; + if ( !value ) + return bits; + if ( bits ) + value = -value; + if ( value > 0xFFFF ) + return overflow( bits ); + unsigned int m = static_cast( value ), exp = 24; + for ( ; m < 0x400; m <<= 1, --exp ) + ; + for ( ; m > 0x7FF; m >>= 1, ++exp ) + ; + bits |= ( exp << 10 ) + m; + return ( exp > 24 ) ? rounded( bits, ( value >> ( exp - 25 ) ) & 1, ( ( ( 1 << ( exp - 25 ) ) - 1 ) & value ) != 0 ) : bits; + } + + /// Convert half-precision to IEEE single-precision. + /// Credit for this goes to [Jeroen van der Zijp](ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf). 
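The float2half_impl() branches above are dense, so before the reverse half-to-float conversion below, here is a standalone reading aid for just the round-to-nearest-even path, using the same bit thresholds (0x7F800000, 0x47800000, 0x38800000, 0x33000000). It is illustrative only and omits the other rounding modes and the error-handling hooks the header wires in:

~~~~{.cpp}
#include <cstdint>
#include <cstdio>
#include <cstring>

// Round-to-nearest-even float -> half, mirroring the branch structure above.
static std::uint16_t float_to_half_rne( float value )
{
    std::uint32_t f;
    std::memcpy( &f, &value, sizeof f );
    std::uint32_t sign = ( f >> 16 ) & 0x8000;
    f &= 0x7FFFFFFF;
    if ( f >= 0x7F800000 )                       // Inf or NaN: keep a quiet NaN payload
        return static_cast<std::uint16_t>( sign | 0x7C00 | ( ( f > 0x7F800000 ) ? ( 0x200 | ( ( f >> 13 ) & 0x3FF ) ) : 0 ) );
    if ( f >= 0x47800000 )                       // too large for half: round-to-nearest overflows to Inf
        return static_cast<std::uint16_t>( sign | 0x7C00 );
    std::uint32_t hbits, guard, sticky;
    if ( f >= 0x38800000 )                       // normal half range: rebias exponent by 112
    {
        hbits = sign | ( ( ( f >> 23 ) - 112 ) << 10 ) | ( ( f >> 13 ) & 0x3FF );
        guard = ( f >> 12 ) & 1;                 // first discarded bit
        sticky = ( f & 0xFFF ) != 0;             // OR of the remaining discarded bits
    }
    else if ( f >= 0x33000000 )                  // subnormal half range: shift the hidden bit in
    {
        int i = 125 - static_cast<int>( f >> 23 );
        f = ( f & 0x7FFFFF ) | 0x800000;
        hbits = sign | ( f >> ( i + 1 ) );
        guard = ( f >> i ) & 1;
        sticky = ( f & ( ( std::uint32_t( 1 ) << i ) - 1 ) ) != 0;
    }
    else                                         // underflows to a signed zero
        return static_cast<std::uint16_t>( sign );
    return static_cast<std::uint16_t>( hbits + ( guard & ( sticky | hbits ) ) );  // ties to even
}

int main()
{
    std::printf( "%04X %04X %04X\n",
                 static_cast<unsigned>( float_to_half_rne( 1.0f ) ),       // 3C00
                 static_cast<unsigned>( float_to_half_rne( -2.0f ) ),      // C000
                 static_cast<unsigned>( float_to_half_rne( 65504.0f ) ) ); // 7BFF
    return 0;
}
~~~~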
+ /// \param value half-precision value to convert + /// \return single-precision value + inline float half2float_impl( unsigned int value, float, true_type ) + { +#if HALF_ENABLE_F16C_INTRINSICS + return _mm_cvtss_f32( _mm_cvtph_ps( _mm_cvtsi32_si128( value ) ) ); +#else +#if 0 + bits::type fbits = static_cast::type>(value&0x8000) << 16; + int abs = value & 0x7FFF; + if(abs) + { + fbits |= 0x38000000 << static_cast(abs>=0x7C00); + for(; abs<0x400; abs<<=1,fbits-=0x800000) ; + fbits += static_cast::type>(abs) << 13; + } +#else + static const bits::type mantissa_table[2048] = { + 0x00000000, 0x33800000, 0x34000000, 0x34400000, 0x34800000, 0x34A00000, 0x34C00000, 0x34E00000, 0x35000000, 0x35100000, 0x35200000, 0x35300000, 0x35400000, 0x35500000, 0x35600000, 0x35700000, + 0x35800000, 0x35880000, 0x35900000, 0x35980000, 0x35A00000, 0x35A80000, 0x35B00000, 0x35B80000, 0x35C00000, 0x35C80000, 0x35D00000, 0x35D80000, 0x35E00000, 0x35E80000, 0x35F00000, 0x35F80000, + 0x36000000, 0x36040000, 0x36080000, 0x360C0000, 0x36100000, 0x36140000, 0x36180000, 0x361C0000, 0x36200000, 0x36240000, 0x36280000, 0x362C0000, 0x36300000, 0x36340000, 0x36380000, 0x363C0000, + 0x36400000, 0x36440000, 0x36480000, 0x364C0000, 0x36500000, 0x36540000, 0x36580000, 0x365C0000, 0x36600000, 0x36640000, 0x36680000, 0x366C0000, 0x36700000, 0x36740000, 0x36780000, 0x367C0000, + 0x36800000, 0x36820000, 0x36840000, 0x36860000, 0x36880000, 0x368A0000, 0x368C0000, 0x368E0000, 0x36900000, 0x36920000, 0x36940000, 0x36960000, 0x36980000, 0x369A0000, 0x369C0000, 0x369E0000, + 0x36A00000, 0x36A20000, 0x36A40000, 0x36A60000, 0x36A80000, 0x36AA0000, 0x36AC0000, 0x36AE0000, 0x36B00000, 0x36B20000, 0x36B40000, 0x36B60000, 0x36B80000, 0x36BA0000, 0x36BC0000, 0x36BE0000, + 0x36C00000, 0x36C20000, 0x36C40000, 0x36C60000, 0x36C80000, 0x36CA0000, 0x36CC0000, 0x36CE0000, 0x36D00000, 0x36D20000, 0x36D40000, 0x36D60000, 0x36D80000, 0x36DA0000, 0x36DC0000, 0x36DE0000, + 0x36E00000, 0x36E20000, 0x36E40000, 0x36E60000, 0x36E80000, 0x36EA0000, 0x36EC0000, 0x36EE0000, 0x36F00000, 0x36F20000, 0x36F40000, 0x36F60000, 0x36F80000, 0x36FA0000, 0x36FC0000, 0x36FE0000, + 0x37000000, 0x37010000, 0x37020000, 0x37030000, 0x37040000, 0x37050000, 0x37060000, 0x37070000, 0x37080000, 0x37090000, 0x370A0000, 0x370B0000, 0x370C0000, 0x370D0000, 0x370E0000, 0x370F0000, + 0x37100000, 0x37110000, 0x37120000, 0x37130000, 0x37140000, 0x37150000, 0x37160000, 0x37170000, 0x37180000, 0x37190000, 0x371A0000, 0x371B0000, 0x371C0000, 0x371D0000, 0x371E0000, 0x371F0000, + 0x37200000, 0x37210000, 0x37220000, 0x37230000, 0x37240000, 0x37250000, 0x37260000, 0x37270000, 0x37280000, 0x37290000, 0x372A0000, 0x372B0000, 0x372C0000, 0x372D0000, 0x372E0000, 0x372F0000, + 0x37300000, 0x37310000, 0x37320000, 0x37330000, 0x37340000, 0x37350000, 0x37360000, 0x37370000, 0x37380000, 0x37390000, 0x373A0000, 0x373B0000, 0x373C0000, 0x373D0000, 0x373E0000, 0x373F0000, + 0x37400000, 0x37410000, 0x37420000, 0x37430000, 0x37440000, 0x37450000, 0x37460000, 0x37470000, 0x37480000, 0x37490000, 0x374A0000, 0x374B0000, 0x374C0000, 0x374D0000, 0x374E0000, 0x374F0000, + 0x37500000, 0x37510000, 0x37520000, 0x37530000, 0x37540000, 0x37550000, 0x37560000, 0x37570000, 0x37580000, 0x37590000, 0x375A0000, 0x375B0000, 0x375C0000, 0x375D0000, 0x375E0000, 0x375F0000, + 0x37600000, 0x37610000, 0x37620000, 0x37630000, 0x37640000, 0x37650000, 0x37660000, 0x37670000, 0x37680000, 0x37690000, 0x376A0000, 0x376B0000, 0x376C0000, 0x376D0000, 0x376E0000, 0x376F0000, + 0x37700000, 0x37710000, 0x37720000, 0x37730000, 
0x37740000, 0x37750000, 0x37760000, 0x37770000, 0x37780000, 0x37790000, 0x377A0000, 0x377B0000, 0x377C0000, 0x377D0000, 0x377E0000, 0x377F0000, + 0x37800000, 0x37808000, 0x37810000, 0x37818000, 0x37820000, 0x37828000, 0x37830000, 0x37838000, 0x37840000, 0x37848000, 0x37850000, 0x37858000, 0x37860000, 0x37868000, 0x37870000, 0x37878000, + 0x37880000, 0x37888000, 0x37890000, 0x37898000, 0x378A0000, 0x378A8000, 0x378B0000, 0x378B8000, 0x378C0000, 0x378C8000, 0x378D0000, 0x378D8000, 0x378E0000, 0x378E8000, 0x378F0000, 0x378F8000, + 0x37900000, 0x37908000, 0x37910000, 0x37918000, 0x37920000, 0x37928000, 0x37930000, 0x37938000, 0x37940000, 0x37948000, 0x37950000, 0x37958000, 0x37960000, 0x37968000, 0x37970000, 0x37978000, + 0x37980000, 0x37988000, 0x37990000, 0x37998000, 0x379A0000, 0x379A8000, 0x379B0000, 0x379B8000, 0x379C0000, 0x379C8000, 0x379D0000, 0x379D8000, 0x379E0000, 0x379E8000, 0x379F0000, 0x379F8000, + 0x37A00000, 0x37A08000, 0x37A10000, 0x37A18000, 0x37A20000, 0x37A28000, 0x37A30000, 0x37A38000, 0x37A40000, 0x37A48000, 0x37A50000, 0x37A58000, 0x37A60000, 0x37A68000, 0x37A70000, 0x37A78000, + 0x37A80000, 0x37A88000, 0x37A90000, 0x37A98000, 0x37AA0000, 0x37AA8000, 0x37AB0000, 0x37AB8000, 0x37AC0000, 0x37AC8000, 0x37AD0000, 0x37AD8000, 0x37AE0000, 0x37AE8000, 0x37AF0000, 0x37AF8000, + 0x37B00000, 0x37B08000, 0x37B10000, 0x37B18000, 0x37B20000, 0x37B28000, 0x37B30000, 0x37B38000, 0x37B40000, 0x37B48000, 0x37B50000, 0x37B58000, 0x37B60000, 0x37B68000, 0x37B70000, 0x37B78000, + 0x37B80000, 0x37B88000, 0x37B90000, 0x37B98000, 0x37BA0000, 0x37BA8000, 0x37BB0000, 0x37BB8000, 0x37BC0000, 0x37BC8000, 0x37BD0000, 0x37BD8000, 0x37BE0000, 0x37BE8000, 0x37BF0000, 0x37BF8000, + 0x37C00000, 0x37C08000, 0x37C10000, 0x37C18000, 0x37C20000, 0x37C28000, 0x37C30000, 0x37C38000, 0x37C40000, 0x37C48000, 0x37C50000, 0x37C58000, 0x37C60000, 0x37C68000, 0x37C70000, 0x37C78000, + 0x37C80000, 0x37C88000, 0x37C90000, 0x37C98000, 0x37CA0000, 0x37CA8000, 0x37CB0000, 0x37CB8000, 0x37CC0000, 0x37CC8000, 0x37CD0000, 0x37CD8000, 0x37CE0000, 0x37CE8000, 0x37CF0000, 0x37CF8000, + 0x37D00000, 0x37D08000, 0x37D10000, 0x37D18000, 0x37D20000, 0x37D28000, 0x37D30000, 0x37D38000, 0x37D40000, 0x37D48000, 0x37D50000, 0x37D58000, 0x37D60000, 0x37D68000, 0x37D70000, 0x37D78000, + 0x37D80000, 0x37D88000, 0x37D90000, 0x37D98000, 0x37DA0000, 0x37DA8000, 0x37DB0000, 0x37DB8000, 0x37DC0000, 0x37DC8000, 0x37DD0000, 0x37DD8000, 0x37DE0000, 0x37DE8000, 0x37DF0000, 0x37DF8000, + 0x37E00000, 0x37E08000, 0x37E10000, 0x37E18000, 0x37E20000, 0x37E28000, 0x37E30000, 0x37E38000, 0x37E40000, 0x37E48000, 0x37E50000, 0x37E58000, 0x37E60000, 0x37E68000, 0x37E70000, 0x37E78000, + 0x37E80000, 0x37E88000, 0x37E90000, 0x37E98000, 0x37EA0000, 0x37EA8000, 0x37EB0000, 0x37EB8000, 0x37EC0000, 0x37EC8000, 0x37ED0000, 0x37ED8000, 0x37EE0000, 0x37EE8000, 0x37EF0000, 0x37EF8000, + 0x37F00000, 0x37F08000, 0x37F10000, 0x37F18000, 0x37F20000, 0x37F28000, 0x37F30000, 0x37F38000, 0x37F40000, 0x37F48000, 0x37F50000, 0x37F58000, 0x37F60000, 0x37F68000, 0x37F70000, 0x37F78000, + 0x37F80000, 0x37F88000, 0x37F90000, 0x37F98000, 0x37FA0000, 0x37FA8000, 0x37FB0000, 0x37FB8000, 0x37FC0000, 0x37FC8000, 0x37FD0000, 0x37FD8000, 0x37FE0000, 0x37FE8000, 0x37FF0000, 0x37FF8000, + 0x38000000, 0x38004000, 0x38008000, 0x3800C000, 0x38010000, 0x38014000, 0x38018000, 0x3801C000, 0x38020000, 0x38024000, 0x38028000, 0x3802C000, 0x38030000, 0x38034000, 0x38038000, 0x3803C000, + 0x38040000, 0x38044000, 0x38048000, 0x3804C000, 0x38050000, 0x38054000, 0x38058000, 0x3805C000, 0x38060000, 
0x38064000, 0x38068000, 0x3806C000, 0x38070000, 0x38074000, 0x38078000, 0x3807C000, + 0x38080000, 0x38084000, 0x38088000, 0x3808C000, 0x38090000, 0x38094000, 0x38098000, 0x3809C000, 0x380A0000, 0x380A4000, 0x380A8000, 0x380AC000, 0x380B0000, 0x380B4000, 0x380B8000, 0x380BC000, + 0x380C0000, 0x380C4000, 0x380C8000, 0x380CC000, 0x380D0000, 0x380D4000, 0x380D8000, 0x380DC000, 0x380E0000, 0x380E4000, 0x380E8000, 0x380EC000, 0x380F0000, 0x380F4000, 0x380F8000, 0x380FC000, + 0x38100000, 0x38104000, 0x38108000, 0x3810C000, 0x38110000, 0x38114000, 0x38118000, 0x3811C000, 0x38120000, 0x38124000, 0x38128000, 0x3812C000, 0x38130000, 0x38134000, 0x38138000, 0x3813C000, + 0x38140000, 0x38144000, 0x38148000, 0x3814C000, 0x38150000, 0x38154000, 0x38158000, 0x3815C000, 0x38160000, 0x38164000, 0x38168000, 0x3816C000, 0x38170000, 0x38174000, 0x38178000, 0x3817C000, + 0x38180000, 0x38184000, 0x38188000, 0x3818C000, 0x38190000, 0x38194000, 0x38198000, 0x3819C000, 0x381A0000, 0x381A4000, 0x381A8000, 0x381AC000, 0x381B0000, 0x381B4000, 0x381B8000, 0x381BC000, + 0x381C0000, 0x381C4000, 0x381C8000, 0x381CC000, 0x381D0000, 0x381D4000, 0x381D8000, 0x381DC000, 0x381E0000, 0x381E4000, 0x381E8000, 0x381EC000, 0x381F0000, 0x381F4000, 0x381F8000, 0x381FC000, + 0x38200000, 0x38204000, 0x38208000, 0x3820C000, 0x38210000, 0x38214000, 0x38218000, 0x3821C000, 0x38220000, 0x38224000, 0x38228000, 0x3822C000, 0x38230000, 0x38234000, 0x38238000, 0x3823C000, + 0x38240000, 0x38244000, 0x38248000, 0x3824C000, 0x38250000, 0x38254000, 0x38258000, 0x3825C000, 0x38260000, 0x38264000, 0x38268000, 0x3826C000, 0x38270000, 0x38274000, 0x38278000, 0x3827C000, + 0x38280000, 0x38284000, 0x38288000, 0x3828C000, 0x38290000, 0x38294000, 0x38298000, 0x3829C000, 0x382A0000, 0x382A4000, 0x382A8000, 0x382AC000, 0x382B0000, 0x382B4000, 0x382B8000, 0x382BC000, + 0x382C0000, 0x382C4000, 0x382C8000, 0x382CC000, 0x382D0000, 0x382D4000, 0x382D8000, 0x382DC000, 0x382E0000, 0x382E4000, 0x382E8000, 0x382EC000, 0x382F0000, 0x382F4000, 0x382F8000, 0x382FC000, + 0x38300000, 0x38304000, 0x38308000, 0x3830C000, 0x38310000, 0x38314000, 0x38318000, 0x3831C000, 0x38320000, 0x38324000, 0x38328000, 0x3832C000, 0x38330000, 0x38334000, 0x38338000, 0x3833C000, + 0x38340000, 0x38344000, 0x38348000, 0x3834C000, 0x38350000, 0x38354000, 0x38358000, 0x3835C000, 0x38360000, 0x38364000, 0x38368000, 0x3836C000, 0x38370000, 0x38374000, 0x38378000, 0x3837C000, + 0x38380000, 0x38384000, 0x38388000, 0x3838C000, 0x38390000, 0x38394000, 0x38398000, 0x3839C000, 0x383A0000, 0x383A4000, 0x383A8000, 0x383AC000, 0x383B0000, 0x383B4000, 0x383B8000, 0x383BC000, + 0x383C0000, 0x383C4000, 0x383C8000, 0x383CC000, 0x383D0000, 0x383D4000, 0x383D8000, 0x383DC000, 0x383E0000, 0x383E4000, 0x383E8000, 0x383EC000, 0x383F0000, 0x383F4000, 0x383F8000, 0x383FC000, + 0x38400000, 0x38404000, 0x38408000, 0x3840C000, 0x38410000, 0x38414000, 0x38418000, 0x3841C000, 0x38420000, 0x38424000, 0x38428000, 0x3842C000, 0x38430000, 0x38434000, 0x38438000, 0x3843C000, + 0x38440000, 0x38444000, 0x38448000, 0x3844C000, 0x38450000, 0x38454000, 0x38458000, 0x3845C000, 0x38460000, 0x38464000, 0x38468000, 0x3846C000, 0x38470000, 0x38474000, 0x38478000, 0x3847C000, + 0x38480000, 0x38484000, 0x38488000, 0x3848C000, 0x38490000, 0x38494000, 0x38498000, 0x3849C000, 0x384A0000, 0x384A4000, 0x384A8000, 0x384AC000, 0x384B0000, 0x384B4000, 0x384B8000, 0x384BC000, + 0x384C0000, 0x384C4000, 0x384C8000, 0x384CC000, 0x384D0000, 0x384D4000, 0x384D8000, 0x384DC000, 0x384E0000, 0x384E4000, 0x384E8000, 0x384EC000, 0x384F0000, 0x384F4000, 
0x384F8000, 0x384FC000, + 0x38500000, 0x38504000, 0x38508000, 0x3850C000, 0x38510000, 0x38514000, 0x38518000, 0x3851C000, 0x38520000, 0x38524000, 0x38528000, 0x3852C000, 0x38530000, 0x38534000, 0x38538000, 0x3853C000, + 0x38540000, 0x38544000, 0x38548000, 0x3854C000, 0x38550000, 0x38554000, 0x38558000, 0x3855C000, 0x38560000, 0x38564000, 0x38568000, 0x3856C000, 0x38570000, 0x38574000, 0x38578000, 0x3857C000, + 0x38580000, 0x38584000, 0x38588000, 0x3858C000, 0x38590000, 0x38594000, 0x38598000, 0x3859C000, 0x385A0000, 0x385A4000, 0x385A8000, 0x385AC000, 0x385B0000, 0x385B4000, 0x385B8000, 0x385BC000, + 0x385C0000, 0x385C4000, 0x385C8000, 0x385CC000, 0x385D0000, 0x385D4000, 0x385D8000, 0x385DC000, 0x385E0000, 0x385E4000, 0x385E8000, 0x385EC000, 0x385F0000, 0x385F4000, 0x385F8000, 0x385FC000, + 0x38600000, 0x38604000, 0x38608000, 0x3860C000, 0x38610000, 0x38614000, 0x38618000, 0x3861C000, 0x38620000, 0x38624000, 0x38628000, 0x3862C000, 0x38630000, 0x38634000, 0x38638000, 0x3863C000, + 0x38640000, 0x38644000, 0x38648000, 0x3864C000, 0x38650000, 0x38654000, 0x38658000, 0x3865C000, 0x38660000, 0x38664000, 0x38668000, 0x3866C000, 0x38670000, 0x38674000, 0x38678000, 0x3867C000, + 0x38680000, 0x38684000, 0x38688000, 0x3868C000, 0x38690000, 0x38694000, 0x38698000, 0x3869C000, 0x386A0000, 0x386A4000, 0x386A8000, 0x386AC000, 0x386B0000, 0x386B4000, 0x386B8000, 0x386BC000, + 0x386C0000, 0x386C4000, 0x386C8000, 0x386CC000, 0x386D0000, 0x386D4000, 0x386D8000, 0x386DC000, 0x386E0000, 0x386E4000, 0x386E8000, 0x386EC000, 0x386F0000, 0x386F4000, 0x386F8000, 0x386FC000, + 0x38700000, 0x38704000, 0x38708000, 0x3870C000, 0x38710000, 0x38714000, 0x38718000, 0x3871C000, 0x38720000, 0x38724000, 0x38728000, 0x3872C000, 0x38730000, 0x38734000, 0x38738000, 0x3873C000, + 0x38740000, 0x38744000, 0x38748000, 0x3874C000, 0x38750000, 0x38754000, 0x38758000, 0x3875C000, 0x38760000, 0x38764000, 0x38768000, 0x3876C000, 0x38770000, 0x38774000, 0x38778000, 0x3877C000, + 0x38780000, 0x38784000, 0x38788000, 0x3878C000, 0x38790000, 0x38794000, 0x38798000, 0x3879C000, 0x387A0000, 0x387A4000, 0x387A8000, 0x387AC000, 0x387B0000, 0x387B4000, 0x387B8000, 0x387BC000, + 0x387C0000, 0x387C4000, 0x387C8000, 0x387CC000, 0x387D0000, 0x387D4000, 0x387D8000, 0x387DC000, 0x387E0000, 0x387E4000, 0x387E8000, 0x387EC000, 0x387F0000, 0x387F4000, 0x387F8000, 0x387FC000, + 0x38000000, 0x38002000, 0x38004000, 0x38006000, 0x38008000, 0x3800A000, 0x3800C000, 0x3800E000, 0x38010000, 0x38012000, 0x38014000, 0x38016000, 0x38018000, 0x3801A000, 0x3801C000, 0x3801E000, + 0x38020000, 0x38022000, 0x38024000, 0x38026000, 0x38028000, 0x3802A000, 0x3802C000, 0x3802E000, 0x38030000, 0x38032000, 0x38034000, 0x38036000, 0x38038000, 0x3803A000, 0x3803C000, 0x3803E000, + 0x38040000, 0x38042000, 0x38044000, 0x38046000, 0x38048000, 0x3804A000, 0x3804C000, 0x3804E000, 0x38050000, 0x38052000, 0x38054000, 0x38056000, 0x38058000, 0x3805A000, 0x3805C000, 0x3805E000, + 0x38060000, 0x38062000, 0x38064000, 0x38066000, 0x38068000, 0x3806A000, 0x3806C000, 0x3806E000, 0x38070000, 0x38072000, 0x38074000, 0x38076000, 0x38078000, 0x3807A000, 0x3807C000, 0x3807E000, + 0x38080000, 0x38082000, 0x38084000, 0x38086000, 0x38088000, 0x3808A000, 0x3808C000, 0x3808E000, 0x38090000, 0x38092000, 0x38094000, 0x38096000, 0x38098000, 0x3809A000, 0x3809C000, 0x3809E000, + 0x380A0000, 0x380A2000, 0x380A4000, 0x380A6000, 0x380A8000, 0x380AA000, 0x380AC000, 0x380AE000, 0x380B0000, 0x380B2000, 0x380B4000, 0x380B6000, 0x380B8000, 0x380BA000, 0x380BC000, 0x380BE000, + 0x380C0000, 0x380C2000, 0x380C4000, 
0x380C6000, 0x380C8000, 0x380CA000, 0x380CC000, 0x380CE000, 0x380D0000, 0x380D2000, 0x380D4000, 0x380D6000, 0x380D8000, 0x380DA000, 0x380DC000, 0x380DE000, + 0x380E0000, 0x380E2000, 0x380E4000, 0x380E6000, 0x380E8000, 0x380EA000, 0x380EC000, 0x380EE000, 0x380F0000, 0x380F2000, 0x380F4000, 0x380F6000, 0x380F8000, 0x380FA000, 0x380FC000, 0x380FE000, + 0x38100000, 0x38102000, 0x38104000, 0x38106000, 0x38108000, 0x3810A000, 0x3810C000, 0x3810E000, 0x38110000, 0x38112000, 0x38114000, 0x38116000, 0x38118000, 0x3811A000, 0x3811C000, 0x3811E000, + 0x38120000, 0x38122000, 0x38124000, 0x38126000, 0x38128000, 0x3812A000, 0x3812C000, 0x3812E000, 0x38130000, 0x38132000, 0x38134000, 0x38136000, 0x38138000, 0x3813A000, 0x3813C000, 0x3813E000, + 0x38140000, 0x38142000, 0x38144000, 0x38146000, 0x38148000, 0x3814A000, 0x3814C000, 0x3814E000, 0x38150000, 0x38152000, 0x38154000, 0x38156000, 0x38158000, 0x3815A000, 0x3815C000, 0x3815E000, + 0x38160000, 0x38162000, 0x38164000, 0x38166000, 0x38168000, 0x3816A000, 0x3816C000, 0x3816E000, 0x38170000, 0x38172000, 0x38174000, 0x38176000, 0x38178000, 0x3817A000, 0x3817C000, 0x3817E000, + 0x38180000, 0x38182000, 0x38184000, 0x38186000, 0x38188000, 0x3818A000, 0x3818C000, 0x3818E000, 0x38190000, 0x38192000, 0x38194000, 0x38196000, 0x38198000, 0x3819A000, 0x3819C000, 0x3819E000, + 0x381A0000, 0x381A2000, 0x381A4000, 0x381A6000, 0x381A8000, 0x381AA000, 0x381AC000, 0x381AE000, 0x381B0000, 0x381B2000, 0x381B4000, 0x381B6000, 0x381B8000, 0x381BA000, 0x381BC000, 0x381BE000, + 0x381C0000, 0x381C2000, 0x381C4000, 0x381C6000, 0x381C8000, 0x381CA000, 0x381CC000, 0x381CE000, 0x381D0000, 0x381D2000, 0x381D4000, 0x381D6000, 0x381D8000, 0x381DA000, 0x381DC000, 0x381DE000, + 0x381E0000, 0x381E2000, 0x381E4000, 0x381E6000, 0x381E8000, 0x381EA000, 0x381EC000, 0x381EE000, 0x381F0000, 0x381F2000, 0x381F4000, 0x381F6000, 0x381F8000, 0x381FA000, 0x381FC000, 0x381FE000, + 0x38200000, 0x38202000, 0x38204000, 0x38206000, 0x38208000, 0x3820A000, 0x3820C000, 0x3820E000, 0x38210000, 0x38212000, 0x38214000, 0x38216000, 0x38218000, 0x3821A000, 0x3821C000, 0x3821E000, + 0x38220000, 0x38222000, 0x38224000, 0x38226000, 0x38228000, 0x3822A000, 0x3822C000, 0x3822E000, 0x38230000, 0x38232000, 0x38234000, 0x38236000, 0x38238000, 0x3823A000, 0x3823C000, 0x3823E000, + 0x38240000, 0x38242000, 0x38244000, 0x38246000, 0x38248000, 0x3824A000, 0x3824C000, 0x3824E000, 0x38250000, 0x38252000, 0x38254000, 0x38256000, 0x38258000, 0x3825A000, 0x3825C000, 0x3825E000, + 0x38260000, 0x38262000, 0x38264000, 0x38266000, 0x38268000, 0x3826A000, 0x3826C000, 0x3826E000, 0x38270000, 0x38272000, 0x38274000, 0x38276000, 0x38278000, 0x3827A000, 0x3827C000, 0x3827E000, + 0x38280000, 0x38282000, 0x38284000, 0x38286000, 0x38288000, 0x3828A000, 0x3828C000, 0x3828E000, 0x38290000, 0x38292000, 0x38294000, 0x38296000, 0x38298000, 0x3829A000, 0x3829C000, 0x3829E000, + 0x382A0000, 0x382A2000, 0x382A4000, 0x382A6000, 0x382A8000, 0x382AA000, 0x382AC000, 0x382AE000, 0x382B0000, 0x382B2000, 0x382B4000, 0x382B6000, 0x382B8000, 0x382BA000, 0x382BC000, 0x382BE000, + 0x382C0000, 0x382C2000, 0x382C4000, 0x382C6000, 0x382C8000, 0x382CA000, 0x382CC000, 0x382CE000, 0x382D0000, 0x382D2000, 0x382D4000, 0x382D6000, 0x382D8000, 0x382DA000, 0x382DC000, 0x382DE000, + 0x382E0000, 0x382E2000, 0x382E4000, 0x382E6000, 0x382E8000, 0x382EA000, 0x382EC000, 0x382EE000, 0x382F0000, 0x382F2000, 0x382F4000, 0x382F6000, 0x382F8000, 0x382FA000, 0x382FC000, 0x382FE000, + 0x38300000, 0x38302000, 0x38304000, 0x38306000, 0x38308000, 0x3830A000, 0x3830C000, 0x3830E000, 
0x38310000, 0x38312000, 0x38314000, 0x38316000, 0x38318000, 0x3831A000, 0x3831C000, 0x3831E000, + 0x38320000, 0x38322000, 0x38324000, 0x38326000, 0x38328000, 0x3832A000, 0x3832C000, 0x3832E000, 0x38330000, 0x38332000, 0x38334000, 0x38336000, 0x38338000, 0x3833A000, 0x3833C000, 0x3833E000, + 0x38340000, 0x38342000, 0x38344000, 0x38346000, 0x38348000, 0x3834A000, 0x3834C000, 0x3834E000, 0x38350000, 0x38352000, 0x38354000, 0x38356000, 0x38358000, 0x3835A000, 0x3835C000, 0x3835E000, + 0x38360000, 0x38362000, 0x38364000, 0x38366000, 0x38368000, 0x3836A000, 0x3836C000, 0x3836E000, 0x38370000, 0x38372000, 0x38374000, 0x38376000, 0x38378000, 0x3837A000, 0x3837C000, 0x3837E000, + 0x38380000, 0x38382000, 0x38384000, 0x38386000, 0x38388000, 0x3838A000, 0x3838C000, 0x3838E000, 0x38390000, 0x38392000, 0x38394000, 0x38396000, 0x38398000, 0x3839A000, 0x3839C000, 0x3839E000, + 0x383A0000, 0x383A2000, 0x383A4000, 0x383A6000, 0x383A8000, 0x383AA000, 0x383AC000, 0x383AE000, 0x383B0000, 0x383B2000, 0x383B4000, 0x383B6000, 0x383B8000, 0x383BA000, 0x383BC000, 0x383BE000, + 0x383C0000, 0x383C2000, 0x383C4000, 0x383C6000, 0x383C8000, 0x383CA000, 0x383CC000, 0x383CE000, 0x383D0000, 0x383D2000, 0x383D4000, 0x383D6000, 0x383D8000, 0x383DA000, 0x383DC000, 0x383DE000, + 0x383E0000, 0x383E2000, 0x383E4000, 0x383E6000, 0x383E8000, 0x383EA000, 0x383EC000, 0x383EE000, 0x383F0000, 0x383F2000, 0x383F4000, 0x383F6000, 0x383F8000, 0x383FA000, 0x383FC000, 0x383FE000, + 0x38400000, 0x38402000, 0x38404000, 0x38406000, 0x38408000, 0x3840A000, 0x3840C000, 0x3840E000, 0x38410000, 0x38412000, 0x38414000, 0x38416000, 0x38418000, 0x3841A000, 0x3841C000, 0x3841E000, + 0x38420000, 0x38422000, 0x38424000, 0x38426000, 0x38428000, 0x3842A000, 0x3842C000, 0x3842E000, 0x38430000, 0x38432000, 0x38434000, 0x38436000, 0x38438000, 0x3843A000, 0x3843C000, 0x3843E000, + 0x38440000, 0x38442000, 0x38444000, 0x38446000, 0x38448000, 0x3844A000, 0x3844C000, 0x3844E000, 0x38450000, 0x38452000, 0x38454000, 0x38456000, 0x38458000, 0x3845A000, 0x3845C000, 0x3845E000, + 0x38460000, 0x38462000, 0x38464000, 0x38466000, 0x38468000, 0x3846A000, 0x3846C000, 0x3846E000, 0x38470000, 0x38472000, 0x38474000, 0x38476000, 0x38478000, 0x3847A000, 0x3847C000, 0x3847E000, + 0x38480000, 0x38482000, 0x38484000, 0x38486000, 0x38488000, 0x3848A000, 0x3848C000, 0x3848E000, 0x38490000, 0x38492000, 0x38494000, 0x38496000, 0x38498000, 0x3849A000, 0x3849C000, 0x3849E000, + 0x384A0000, 0x384A2000, 0x384A4000, 0x384A6000, 0x384A8000, 0x384AA000, 0x384AC000, 0x384AE000, 0x384B0000, 0x384B2000, 0x384B4000, 0x384B6000, 0x384B8000, 0x384BA000, 0x384BC000, 0x384BE000, + 0x384C0000, 0x384C2000, 0x384C4000, 0x384C6000, 0x384C8000, 0x384CA000, 0x384CC000, 0x384CE000, 0x384D0000, 0x384D2000, 0x384D4000, 0x384D6000, 0x384D8000, 0x384DA000, 0x384DC000, 0x384DE000, + 0x384E0000, 0x384E2000, 0x384E4000, 0x384E6000, 0x384E8000, 0x384EA000, 0x384EC000, 0x384EE000, 0x384F0000, 0x384F2000, 0x384F4000, 0x384F6000, 0x384F8000, 0x384FA000, 0x384FC000, 0x384FE000, + 0x38500000, 0x38502000, 0x38504000, 0x38506000, 0x38508000, 0x3850A000, 0x3850C000, 0x3850E000, 0x38510000, 0x38512000, 0x38514000, 0x38516000, 0x38518000, 0x3851A000, 0x3851C000, 0x3851E000, + 0x38520000, 0x38522000, 0x38524000, 0x38526000, 0x38528000, 0x3852A000, 0x3852C000, 0x3852E000, 0x38530000, 0x38532000, 0x38534000, 0x38536000, 0x38538000, 0x3853A000, 0x3853C000, 0x3853E000, + 0x38540000, 0x38542000, 0x38544000, 0x38546000, 0x38548000, 0x3854A000, 0x3854C000, 0x3854E000, 0x38550000, 0x38552000, 0x38554000, 0x38556000, 0x38558000, 
0x3855A000, 0x3855C000, 0x3855E000, + 0x38560000, 0x38562000, 0x38564000, 0x38566000, 0x38568000, 0x3856A000, 0x3856C000, 0x3856E000, 0x38570000, 0x38572000, 0x38574000, 0x38576000, 0x38578000, 0x3857A000, 0x3857C000, 0x3857E000, + 0x38580000, 0x38582000, 0x38584000, 0x38586000, 0x38588000, 0x3858A000, 0x3858C000, 0x3858E000, 0x38590000, 0x38592000, 0x38594000, 0x38596000, 0x38598000, 0x3859A000, 0x3859C000, 0x3859E000, + 0x385A0000, 0x385A2000, 0x385A4000, 0x385A6000, 0x385A8000, 0x385AA000, 0x385AC000, 0x385AE000, 0x385B0000, 0x385B2000, 0x385B4000, 0x385B6000, 0x385B8000, 0x385BA000, 0x385BC000, 0x385BE000, + 0x385C0000, 0x385C2000, 0x385C4000, 0x385C6000, 0x385C8000, 0x385CA000, 0x385CC000, 0x385CE000, 0x385D0000, 0x385D2000, 0x385D4000, 0x385D6000, 0x385D8000, 0x385DA000, 0x385DC000, 0x385DE000, + 0x385E0000, 0x385E2000, 0x385E4000, 0x385E6000, 0x385E8000, 0x385EA000, 0x385EC000, 0x385EE000, 0x385F0000, 0x385F2000, 0x385F4000, 0x385F6000, 0x385F8000, 0x385FA000, 0x385FC000, 0x385FE000, + 0x38600000, 0x38602000, 0x38604000, 0x38606000, 0x38608000, 0x3860A000, 0x3860C000, 0x3860E000, 0x38610000, 0x38612000, 0x38614000, 0x38616000, 0x38618000, 0x3861A000, 0x3861C000, 0x3861E000, + 0x38620000, 0x38622000, 0x38624000, 0x38626000, 0x38628000, 0x3862A000, 0x3862C000, 0x3862E000, 0x38630000, 0x38632000, 0x38634000, 0x38636000, 0x38638000, 0x3863A000, 0x3863C000, 0x3863E000, + 0x38640000, 0x38642000, 0x38644000, 0x38646000, 0x38648000, 0x3864A000, 0x3864C000, 0x3864E000, 0x38650000, 0x38652000, 0x38654000, 0x38656000, 0x38658000, 0x3865A000, 0x3865C000, 0x3865E000, + 0x38660000, 0x38662000, 0x38664000, 0x38666000, 0x38668000, 0x3866A000, 0x3866C000, 0x3866E000, 0x38670000, 0x38672000, 0x38674000, 0x38676000, 0x38678000, 0x3867A000, 0x3867C000, 0x3867E000, + 0x38680000, 0x38682000, 0x38684000, 0x38686000, 0x38688000, 0x3868A000, 0x3868C000, 0x3868E000, 0x38690000, 0x38692000, 0x38694000, 0x38696000, 0x38698000, 0x3869A000, 0x3869C000, 0x3869E000, + 0x386A0000, 0x386A2000, 0x386A4000, 0x386A6000, 0x386A8000, 0x386AA000, 0x386AC000, 0x386AE000, 0x386B0000, 0x386B2000, 0x386B4000, 0x386B6000, 0x386B8000, 0x386BA000, 0x386BC000, 0x386BE000, + 0x386C0000, 0x386C2000, 0x386C4000, 0x386C6000, 0x386C8000, 0x386CA000, 0x386CC000, 0x386CE000, 0x386D0000, 0x386D2000, 0x386D4000, 0x386D6000, 0x386D8000, 0x386DA000, 0x386DC000, 0x386DE000, + 0x386E0000, 0x386E2000, 0x386E4000, 0x386E6000, 0x386E8000, 0x386EA000, 0x386EC000, 0x386EE000, 0x386F0000, 0x386F2000, 0x386F4000, 0x386F6000, 0x386F8000, 0x386FA000, 0x386FC000, 0x386FE000, + 0x38700000, 0x38702000, 0x38704000, 0x38706000, 0x38708000, 0x3870A000, 0x3870C000, 0x3870E000, 0x38710000, 0x38712000, 0x38714000, 0x38716000, 0x38718000, 0x3871A000, 0x3871C000, 0x3871E000, + 0x38720000, 0x38722000, 0x38724000, 0x38726000, 0x38728000, 0x3872A000, 0x3872C000, 0x3872E000, 0x38730000, 0x38732000, 0x38734000, 0x38736000, 0x38738000, 0x3873A000, 0x3873C000, 0x3873E000, + 0x38740000, 0x38742000, 0x38744000, 0x38746000, 0x38748000, 0x3874A000, 0x3874C000, 0x3874E000, 0x38750000, 0x38752000, 0x38754000, 0x38756000, 0x38758000, 0x3875A000, 0x3875C000, 0x3875E000, + 0x38760000, 0x38762000, 0x38764000, 0x38766000, 0x38768000, 0x3876A000, 0x3876C000, 0x3876E000, 0x38770000, 0x38772000, 0x38774000, 0x38776000, 0x38778000, 0x3877A000, 0x3877C000, 0x3877E000, + 0x38780000, 0x38782000, 0x38784000, 0x38786000, 0x38788000, 0x3878A000, 0x3878C000, 0x3878E000, 0x38790000, 0x38792000, 0x38794000, 0x38796000, 0x38798000, 0x3879A000, 0x3879C000, 0x3879E000, + 0x387A0000, 0x387A2000, 
0x387A4000, 0x387A6000, 0x387A8000, 0x387AA000, 0x387AC000, 0x387AE000, 0x387B0000, 0x387B2000, 0x387B4000, 0x387B6000, 0x387B8000, 0x387BA000, 0x387BC000, 0x387BE000, + 0x387C0000, 0x387C2000, 0x387C4000, 0x387C6000, 0x387C8000, 0x387CA000, 0x387CC000, 0x387CE000, 0x387D0000, 0x387D2000, 0x387D4000, 0x387D6000, 0x387D8000, 0x387DA000, 0x387DC000, 0x387DE000, + 0x387E0000, 0x387E2000, 0x387E4000, 0x387E6000, 0x387E8000, 0x387EA000, 0x387EC000, 0x387EE000, 0x387F0000, 0x387F2000, 0x387F4000, 0x387F6000, 0x387F8000, 0x387FA000, 0x387FC000, 0x387FE000 }; + static const bits::type exponent_table[64] = { + 0x00000000, 0x00800000, 0x01000000, 0x01800000, 0x02000000, 0x02800000, 0x03000000, 0x03800000, 0x04000000, 0x04800000, 0x05000000, 0x05800000, 0x06000000, 0x06800000, 0x07000000, 0x07800000, + 0x08000000, 0x08800000, 0x09000000, 0x09800000, 0x0A000000, 0x0A800000, 0x0B000000, 0x0B800000, 0x0C000000, 0x0C800000, 0x0D000000, 0x0D800000, 0x0E000000, 0x0E800000, 0x0F000000, 0x47800000, + 0x80000000, 0x80800000, 0x81000000, 0x81800000, 0x82000000, 0x82800000, 0x83000000, 0x83800000, 0x84000000, 0x84800000, 0x85000000, 0x85800000, 0x86000000, 0x86800000, 0x87000000, 0x87800000, + 0x88000000, 0x88800000, 0x89000000, 0x89800000, 0x8A000000, 0x8A800000, 0x8B000000, 0x8B800000, 0x8C000000, 0x8C800000, 0x8D000000, 0x8D800000, 0x8E000000, 0x8E800000, 0x8F000000, 0xC7800000 }; + static const unsigned short offset_table[64] = { + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, + 0, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024 }; + bits::type fbits = mantissa_table[offset_table[value >> 10] + ( value & 0x3FF )] + exponent_table[value >> 10]; +#endif + float out; + std::memcpy( &out, &fbits, sizeof( float ) ); + return out; +#endif + } + + /// Convert half-precision to IEEE double-precision. + /// \param value half-precision value to convert + /// \return double-precision value + inline double half2float_impl( unsigned int value, double, true_type ) + { +#if HALF_ENABLE_F16C_INTRINSICS + return _mm_cvtsd_f64( _mm_cvtps_pd( _mm_cvtph_ps( _mm_cvtsi32_si128( value ) ) ) ); +#else + uint32 hi = static_cast( value & 0x8000 ) << 16; + unsigned int abs = value & 0x7FFF; + if ( abs ) + { + hi |= 0x3F000000 << static_cast( abs >= 0x7C00 ); + for ( ; abs < 0x400; abs <<= 1, hi -= 0x100000 ) + ; + hi += static_cast( abs ) << 10; + } + bits::type dbits = static_cast::type>( hi ) << 32; + double out; + std::memcpy( &out, &dbits, sizeof( double ) ); + return out; +#endif + } + + /// Convert half-precision to non-IEEE floating-point. + /// \tparam T type to convert to (builtin integer type) + /// \param value half-precision value to convert + /// \return floating-point value + template + T half2float_impl( unsigned int value, T, ... ) + { + T out; + unsigned int abs = value & 0x7FFF; + if ( abs > 0x7C00 ) + out = ( std::numeric_limits::has_signaling_NaN && !( abs & 0x200 ) ) ? std::numeric_limits::signaling_NaN() : + std::numeric_limits::has_quiet_NaN ? std::numeric_limits::quiet_NaN() : + T(); + else if ( abs == 0x7C00 ) + out = std::numeric_limits::has_infinity ? 
std::numeric_limits::infinity() : std::numeric_limits::max(); + else if ( abs > 0x3FF ) + out = std::ldexp( static_cast( ( abs & 0x3FF ) | 0x400 ), ( abs >> 10 ) - 25 ); + else + out = std::ldexp( static_cast( abs ), -24 ); + return ( value & 0x8000 ) ? -out : out; + } + + /// Convert half-precision to floating-point. + /// \tparam T type to convert to (builtin integer type) + /// \param value half-precision value to convert + /// \return floating-point value + template + T half2float( unsigned int value ) + { + return half2float_impl( value, T(), bool_type < std::numeric_limits::is_iec559 && sizeof( typename bits::type ) == sizeof( T ) > () ); + } + + /// Convert half-precision floating-point to integer. + /// \tparam R rounding mode to use + /// \tparam E `true` for round to even, `false` for round away from zero + /// \tparam I `true` to raise INEXACT exception (if inexact), `false` to never raise it + /// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits) + /// \param value half-precision value to convert + /// \return rounded integer value + /// \exception FE_INVALID if value is not representable in type \a T + /// \exception FE_INEXACT if value had to be rounded and \a I is `true` + template + T half2int( unsigned int value ) + { + unsigned int abs = value & 0x7FFF; + if ( abs >= 0x7C00 ) + { + raise( FE_INVALID ); + return ( value & 0x8000 ) ? std::numeric_limits::min() : std::numeric_limits::max(); + } + if ( abs < 0x3800 ) + { + raise( FE_INEXACT, I ); + return ( R == std::round_toward_infinity ) ? T( ~( value >> 15 ) & ( abs != 0 ) ) : + ( R == std::round_toward_neg_infinity ) ? -T( value > 0x8000 ) : + T(); + } + int exp = 25 - ( abs >> 10 ); + unsigned int m = ( value & 0x3FF ) | 0x400; + int32 i = static_cast( ( exp <= 0 ) ? ( m << -exp ) : ( ( m + ( ( R == std::round_to_nearest ) ? ( ( 1 << ( exp - 1 ) ) - ( ~( m >> exp ) & E ) ) : ( R == std::round_toward_infinity ) ? ( ( ( 1 << exp ) - 1 ) & ( ( value >> 15 ) - 1 ) ) : + ( R == std::round_toward_neg_infinity ) ? ( ( ( 1 << exp ) - 1 ) & -( value >> 15 ) ) : + 0 ) ) >> + exp ) ); + if ( ( !std::numeric_limits::is_signed && ( value & 0x8000 ) ) || ( std::numeric_limits::digits < 16 && + ( ( value & 0x8000 ) ? ( -i < std::numeric_limits::min() ) : ( i > std::numeric_limits::max() ) ) ) ) + raise( FE_INVALID ); + else if ( I && exp > 0 && ( m & ( ( 1 << exp ) - 1 ) ) ) + raise( FE_INEXACT ); + return static_cast( ( value & 0x8000 ) ? -i : i ); + } + + /// \} + /// \name Mathematics + /// \{ + + /// upper part of 64-bit multiplication. + /// \tparam R rounding mode to use + /// \param x first factor + /// \param y second factor + /// \return upper 32 bit of \a x * \a y + template + uint32 mulhi( uint32 x, uint32 y ) + { + uint32 xy = ( x >> 16 ) * ( y & 0xFFFF ), yx = ( x & 0xFFFF ) * ( y >> 16 ), c = ( xy & 0xFFFF ) + ( yx & 0xFFFF ) + ( ( ( x & 0xFFFF ) * ( y & 0xFFFF ) ) >> 16 ); + return ( x >> 16 ) * ( y >> 16 ) + ( xy >> 16 ) + ( yx >> 16 ) + ( c >> 16 ) + + ( ( R == std::round_to_nearest ) ? ( ( c >> 15 ) & 1 ) : ( R == std::round_toward_infinity ) ? ( ( c & 0xFFFF ) != 0 ) : + 0 ); + } + + /// 64-bit multiplication. 
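As an aside, the 16x16-bit partial-product decomposition used by detail::mulhi just above is not obvious at first glance. The following minimal, self-contained sketch (not part of the patch; mulhi_trunc is a hypothetical stand-alone copy) checks that the decomposition reproduces the upper 32 bits of the full 64-bit product for the truncating case, i.e. with the final rounding term of mulhi set to zero.

#include <cstdint>
#include <cassert>

// Stand-alone copy of the partial-product scheme from detail::mulhi,
// without the rounding term (round-toward-zero behaviour).
static std::uint32_t mulhi_trunc( std::uint32_t x, std::uint32_t y )
{
    std::uint32_t xy = ( x >> 16 ) * ( y & 0xFFFF );                 // cross product xh*yl
    std::uint32_t yx = ( x & 0xFFFF ) * ( y >> 16 );                 // cross product xl*yh
    std::uint32_t c  = ( xy & 0xFFFF ) + ( yx & 0xFFFF ) +
                       ( ( ( x & 0xFFFF ) * ( y & 0xFFFF ) ) >> 16 ); // carries out of bit 31
    return ( x >> 16 ) * ( y >> 16 ) + ( xy >> 16 ) + ( yx >> 16 ) + ( c >> 16 );
}

int main()
{
    const std::uint32_t samples[] = { 0, 1, 0xFFFF, 0x10001, 0xC90FDAA2, 0xFFFFFFFF };
    for ( std::uint32_t x : samples )
        for ( std::uint32_t y : samples )
            assert( mulhi_trunc( x, y ) ==
                    static_cast<std::uint32_t>( ( static_cast<std::uint64_t>( x ) * y ) >> 32 ) );
    return 0;
}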
+ /// \param x first factor + /// \param y second factor + /// \return upper 32 bit of \a x * \a y rounded to nearest + inline uint32 multiply64( uint32 x, uint32 y ) + { +#if HALF_ENABLE_CPP11_LONG_LONG + return static_cast( ( static_cast( x ) * static_cast( y ) + 0x80000000 ) >> 32 ); +#else + return mulhi( x, y ); +#endif + } + + /// 64-bit division. + /// \param x upper 32 bit of dividend + /// \param y divisor + /// \param s variable to store sticky bit for rounding + /// \return (\a x << 32) / \a y + inline uint32 divide64( uint32 x, uint32 y, int &s ) + { +#if HALF_ENABLE_CPP11_LONG_LONG + unsigned long long xx = static_cast( x ) << 32; + return s = ( xx % y != 0 ), static_cast( xx / y ); +#else + y >>= 1; + uint32 rem = x, div = 0; + for ( unsigned int i = 0; i < 32; ++i ) + { + div <<= 1; + if ( rem >= y ) + { + rem -= y; + div |= 1; + } + rem <<= 1; + } + return s = rem > 1, div; +#endif + } + + /// Half precision positive modulus. + /// \tparam Q `true` to compute full quotient, `false` else + /// \tparam R `true` to compute signed remainder, `false` for positive remainder + /// \param x first operand as positive finite half-precision value + /// \param y second operand as positive finite half-precision value + /// \param quo adress to store quotient at, `nullptr` if \a Q `false` + /// \return modulus of \a x / \a y + template + unsigned int mod( unsigned int x, unsigned int y, int *quo = NULL ) + { + unsigned int q = 0; + if ( x > y ) + { + int absx = x, absy = y, expx = 0, expy = 0; + for ( ; absx < 0x400; absx <<= 1, --expx ) + ; + for ( ; absy < 0x400; absy <<= 1, --expy ) + ; + expx += absx >> 10; + expy += absy >> 10; + int mx = ( absx & 0x3FF ) | 0x400, my = ( absy & 0x3FF ) | 0x400; + for ( int d = expx - expy; d; --d ) + { + if ( !Q && mx == my ) + return 0; + if ( mx >= my ) + { + mx -= my; + q += Q; + } + mx <<= 1; + q <<= static_cast( Q ); + } + if ( !Q && mx == my ) + return 0; + if ( mx >= my ) + { + mx -= my; + ++q; + } + if ( Q ) + { + q &= ( 1 << ( std::numeric_limits::digits - 1 ) ) - 1; + if ( !mx ) + return *quo = q, 0; + } + for ( ; mx < 0x400; mx <<= 1, --expy ) + ; + x = ( expy > 0 ) ? ( ( expy << 10 ) | ( mx & 0x3FF ) ) : ( mx >> ( 1 - expy ) ); + } + if ( R ) + { + unsigned int a, b; + if ( y < 0x800 ) + { + a = ( x < 0x400 ) ? ( x << 1 ) : ( x + 0x400 ); + b = y; + } + else + { + a = x; + b = y - 0x400; + } + if ( a > b || ( a == b && ( q & 1 ) ) ) + { + int exp = ( y >> 10 ) + ( y <= 0x3FF ), d = exp - ( x >> 10 ) - ( x <= 0x3FF ); + int m = ( ( ( y & 0x3FF ) | ( ( y > 0x3FF ) << 10 ) ) << 1 ) - ( ( ( x & 0x3FF ) | ( ( x > 0x3FF ) << 10 ) ) << ( 1 - d ) ); + for ( ; m < 0x800 && exp > 1; m <<= 1, --exp ) + ; + x = 0x8000 + ( ( exp - 1 ) << 10 ) + ( m >> 1 ); + q += Q; + } + } + if ( Q ) + *quo = q; + return x; + } + + /// Fixed point square root. + /// \tparam F number of fractional bits + /// \param r radicand in Q1.F fixed point format + /// \param exp exponent + /// \return square root as Q1.F/2 + template + uint32 sqrt( uint32 &r, int &exp ) + { + int i = exp & 1; + r <<= i; + exp = ( exp - i ) / 2; + uint32 m = 0; + for ( uint32 bit = static_cast( 1 ) << F; bit; bit >>= 2 ) + { + if ( r < m + bit ) + m >>= 1; + else + { + r -= m + bit; + m = ( m >> 1 ) + bit; + } + } + return m; + } + + /// Fixed point binary exponential. + /// This uses the BKM algorithm in E-mode. 
+ /// \param m exponent in [0,1) as Q0.31 + /// \param n number of iterations (at most 32) + /// \return 2 ^ \a m as Q1.31 + inline uint32 exp2( uint32 m, unsigned int n = 32 ) + { + static const uint32 logs[] = { + 0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, 0x016FE50B, + 0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, 0x0002E2A3, 0x00017153, + 0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, 0x000005C5, 0x000002E3, 0x00000171, + 0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, 0x0000000C, 0x00000006, 0x00000003, 0x00000001 }; + if ( !m ) + return 0x80000000; + uint32 mx = 0x80000000, my = 0; + for ( unsigned int i = 1; i < n; ++i ) + { + uint32 mz = my + logs[i]; + if ( mz <= m ) + { + my = mz; + mx += mx >> i; + } + } + return mx; + } + + /// Fixed point binary logarithm. + /// This uses the BKM algorithm in L-mode. + /// \param m mantissa in [1,2) as Q1.30 + /// \param n number of iterations (at most 32) + /// \return log2(\a m) as Q0.31 + inline uint32 log2( uint32 m, unsigned int n = 32 ) + { + static const uint32 logs[] = { + 0x80000000, 0x4AE00D1D, 0x2934F098, 0x15C01A3A, 0x0B31FB7D, 0x05AEB4DD, 0x02DCF2D1, 0x016FE50B, + 0x00B84E23, 0x005C3E10, 0x002E24CA, 0x001713D6, 0x000B8A47, 0x0005C53B, 0x0002E2A3, 0x00017153, + 0x0000B8AA, 0x00005C55, 0x00002E2B, 0x00001715, 0x00000B8B, 0x000005C5, 0x000002E3, 0x00000171, + 0x000000B9, 0x0000005C, 0x0000002E, 0x00000017, 0x0000000C, 0x00000006, 0x00000003, 0x00000001 }; + if ( m == 0x40000000 ) + return 0; + uint32 mx = 0x40000000, my = 0; + for ( unsigned int i = 1; i < n; ++i ) + { + uint32 mz = mx + ( mx >> i ); + if ( mz <= m ) + { + mx = mz; + my += logs[i]; + } + } + return my; + } + + /// Fixed point sine and cosine. + /// This uses the CORDIC algorithm in rotation mode. + /// \param mz angle in [-pi/2,pi/2] as Q1.30 + /// \param n number of iterations (at most 31) + /// \return sine and cosine of \a mz as Q1.30 + inline std::pair sincos( uint32 mz, unsigned int n = 31 ) + { + static const uint32 angles[] = { + 0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, 0x007FFF55, + 0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000, + 0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100, 0x00000080, + 0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001 }; + uint32 mx = 0x26DD3B6A, my = 0; + for ( unsigned int i = 0; i < n; ++i ) + { + uint32 sign = sign_mask( mz ); + uint32 tx = mx - ( arithmetic_shift( my, i ) ^ sign ) + sign; + uint32 ty = my + ( arithmetic_shift( mx, i ) ^ sign ) - sign; + mx = tx; + my = ty; + mz -= ( angles[i] ^ sign ) - sign; + } + return std::make_pair( my, mx ); + } + + /// Fixed point arc tangent. + /// This uses the CORDIC algorithm in vectoring mode. 
+ /// \param my y coordinate as Q0.30 + /// \param mx x coordinate as Q0.30 + /// \param n number of iterations (at most 31) + /// \return arc tangent of \a my / \a mx as Q1.30 + inline uint32 atan2( uint32 my, uint32 mx, unsigned int n = 31 ) + { + static const uint32 angles[] = { + 0x3243F6A9, 0x1DAC6705, 0x0FADBAFD, 0x07F56EA7, 0x03FEAB77, 0x01FFD55C, 0x00FFFAAB, 0x007FFF55, + 0x003FFFEB, 0x001FFFFD, 0x00100000, 0x00080000, 0x00040000, 0x00020000, 0x00010000, 0x00008000, + 0x00004000, 0x00002000, 0x00001000, 0x00000800, 0x00000400, 0x00000200, 0x00000100, 0x00000080, + 0x00000040, 0x00000020, 0x00000010, 0x00000008, 0x00000004, 0x00000002, 0x00000001 }; + uint32 mz = 0; + for ( unsigned int i = 0; i < n; ++i ) + { + uint32 sign = sign_mask( my ); + uint32 tx = mx + ( arithmetic_shift( my, i ) ^ sign ) - sign; + uint32 ty = my - ( arithmetic_shift( mx, i ) ^ sign ) + sign; + mx = tx; + my = ty; + mz += ( angles[i] ^ sign ) - sign; + } + return mz; + } + + /// Reduce argument for trigonometric functions. + /// \param abs half-precision floating-point value + /// \param k value to take quarter period + /// \return \a abs reduced to [-pi/4,pi/4] as Q0.30 + inline uint32 angle_arg( unsigned int abs, int &k ) + { + uint32 m = ( abs & 0x3FF ) | ( ( abs > 0x3FF ) << 10 ); + int exp = ( abs >> 10 ) + ( abs <= 0x3FF ) - 15; + if ( abs < 0x3A48 ) + return k = 0, m << ( exp + 20 ); +#if HALF_ENABLE_CPP11_LONG_LONG + unsigned long long y = m * 0xA2F9836E4E442, mask = ( 1ULL << ( 62 - exp ) ) - 1, yi = ( y + ( mask >> 1 ) ) & ~mask, f = y - yi; + uint32 sign = -static_cast( f >> 63 ); + k = static_cast( yi >> ( 62 - exp ) ); + return ( multiply64( static_cast( ( sign ? -f : f ) >> ( 31 - exp ) ), 0xC90FDAA2 ) ^ sign ) - sign; +#else + uint32 yh = m * 0xA2F98 + mulhi( m, 0x36E4E442 ), yl = ( m * 0x36E4E442 ) & 0xFFFFFFFF; + uint32 mask = ( static_cast( 1 ) << ( 30 - exp ) ) - 1, yi = ( yh + ( mask >> 1 ) ) & ~mask, sign = -static_cast( yi > yh ); + k = static_cast( yi >> ( 30 - exp ) ); + uint32 fh = ( yh ^ sign ) + ( yi ^ ~sign ) - ~sign, fl = ( yl ^ sign ) - sign; + return ( multiply64( ( exp > -1 ) ? ( ( ( fh << ( 1 + exp ) ) & 0xFFFFFFFF ) | ( ( fl & 0xFFFFFFFF ) >> ( 31 - exp ) ) ) : fh, 0xC90FDAA2 ) ^ sign ) - sign; +#endif + } + + /// Get arguments for atan2 function. + /// \param abs half-precision floating-point value + /// \return \a abs and sqrt(1 - \a abs^2) as Q0.30 + inline std::pair atan2_args( unsigned int abs ) + { + int exp = -15; + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + exp += abs >> 10; + uint32 my = ( ( abs & 0x3FF ) | 0x400 ) << 5, r = my * my; + int rexp = 2 * exp; + r = 0x40000000 - ( ( rexp > -31 ) ? ( ( r >> -rexp ) | ( ( r & ( ( static_cast( 1 ) << -rexp ) - 1 ) ) != 0 ) ) : 1 ); + for ( rexp = 0; r < 0x40000000; r <<= 1, --rexp ) + ; + uint32 mx = sqrt<30>( r, rexp ); + int d = exp - rexp; + if ( d < 0 ) + return std::make_pair( ( d < -14 ) ? ( ( my >> ( -d - 14 ) ) + ( ( my >> ( -d - 15 ) ) & 1 ) ) : ( my << ( 14 + d ) ), ( mx << 14 ) + ( r << 13 ) / mx ); + if ( d > 0 ) + return std::make_pair( my << 14, ( d > 14 ) ? ( ( mx >> ( d - 14 ) ) + ( ( mx >> ( d - 15 ) ) & 1 ) ) : ( ( d == 14 ) ? 
mx : ( ( mx << ( 14 - d ) ) + ( r << ( 13 - d ) ) / mx ) ) ); + return std::make_pair( my << 13, ( mx << 13 ) + ( r << 12 ) / mx ); + } + + /// Get exponentials for hyperbolic computation + /// \param abs half-precision floating-point value + /// \param exp variable to take unbiased exponent of larger result + /// \param n number of BKM iterations (at most 32) + /// \return exp(abs) and exp(-\a abs) as Q1.31 with same exponent + inline std::pair hyperbolic_args( unsigned int abs, int &exp, unsigned int n = 32 ) + { + uint32 mx = detail::multiply64( static_cast( ( abs & 0x3FF ) + ( ( abs > 0x3FF ) << 10 ) ) << 21, 0xB8AA3B29 ), my; + int e = ( abs >> 10 ) + ( abs <= 0x3FF ); + if ( e < 14 ) + { + exp = 0; + mx >>= 14 - e; + } + else + { + exp = mx >> ( 45 - e ); + mx = ( mx << ( e - 14 ) ) & 0x7FFFFFFF; + } + mx = exp2( mx, n ); + int d = exp << 1, s; + if ( mx > 0x80000000 ) + { + my = divide64( 0x80000000, mx, s ); + my |= s; + ++d; + } + else + my = mx; + return std::make_pair( mx, ( d < 31 ) ? ( ( my >> d ) | ( ( my & ( ( static_cast( 1 ) << d ) - 1 ) ) != 0 ) ) : 1 ); + } + + /// Postprocessing for binary exponential. + /// \tparam R rounding mode to use + /// \param m fractional part of as Q0.31 + /// \param exp absolute value of unbiased exponent + /// \param esign sign of actual exponent + /// \param sign sign bit of result + /// \param n number of BKM iterations (at most 32) + /// \return value converted to half-precision + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if value had to be rounded or \a I is `true` + template + unsigned int exp2_post( uint32 m, int exp, bool esign, unsigned int sign = 0, unsigned int n = 32 ) + { + if ( esign ) + { + exp = -exp - ( m != 0 ); + if ( exp < -25 ) + return underflow( sign ); + else if ( exp == -25 ) + return rounded( sign, 1, m != 0 ); + } + else if ( exp > 15 ) + return overflow( sign ); + if ( !m ) + return sign | ( ( ( exp += 15 ) > 0 ) ? ( exp << 10 ) : check_underflow( 0x200 >> -exp ) ); + m = exp2( m, n ); + int s = 0; + if ( esign ) + m = divide64( 0x80000000, m, s ); + return fixed2half( m, exp + 14, sign, s ); + } + + /// Postprocessing for binary logarithm. + /// \tparam R rounding mode to use + /// \tparam L logarithm for base transformation as Q1.31 + /// \param m fractional part of logarithm as Q0.31 + /// \param ilog signed integer part of logarithm + /// \param exp biased exponent of result + /// \param sign sign bit of result + /// \return value base-transformed and converted to half-precision + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if no other exception occurred + template + unsigned int log2_post( uint32 m, int ilog, int exp, unsigned int sign = 0 ) + { + uint32 msign = sign_mask( ilog ); + m = ( ( ( static_cast( ilog ) << 27 ) + ( m >> 4 ) ) ^ msign ) - msign; + if ( !m ) + return 0; + for ( ; m < 0x80000000; m <<= 1, --exp ) + ; + int i = m >= L, s; + exp += i; + m >>= 1 + i; + sign ^= msign & 0x8000; + if ( exp < -11 ) + return underflow( sign ); + m = divide64( m, L, s ); + return fixed2half( m, exp, sign, 1 ); + } + + /// Hypotenuse square root and postprocessing. 
+ /// \tparam R rounding mode to use + /// \param r mantissa as Q2.30 + /// \param exp biased exponent + /// \return square root converted to half-precision + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if value had to be rounded + template + unsigned int hypot_post( uint32 r, int exp ) + { + int i = r >> 31; + if ( ( exp += i ) > 46 ) + return overflow(); + if ( exp < -34 ) + return underflow(); + r = ( r >> i ) | ( r & i ); + uint32 m = sqrt<30>( r, exp += 15 ); + return fixed2half( m, exp - 1, 0, r != 0 ); + } + + /// Division and postprocessing for tangents. + /// \tparam R rounding mode to use + /// \param my dividend as Q1.31 + /// \param mx divisor as Q1.31 + /// \param exp biased exponent of result + /// \param sign sign bit of result + /// \return quotient converted to half-precision + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if no other exception occurred + template + unsigned int tangent_post( uint32 my, uint32 mx, int exp, unsigned int sign = 0 ) + { + int i = my >= mx, s; + exp += i; + if ( exp > 29 ) + return overflow( sign ); + if ( exp < -11 ) + return underflow( sign ); + uint32 m = divide64( my >> ( i + 1 ), mx, s ); + return fixed2half( m, exp, sign, s ); + } + + /// Area function and postprocessing. + /// This computes the value directly in Q2.30 using the representation `asinh|acosh(x) = log(x+sqrt(x^2+|-1))`. + /// \tparam R rounding mode to use + /// \tparam S `true` for asinh, `false` for acosh + /// \param arg half-precision argument + /// \return asinh|acosh(\a arg) converted to half-precision + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if no other exception occurred + template + unsigned int area( unsigned int arg ) + { + int abs = arg & 0x7FFF, expx = ( abs >> 10 ) + ( abs <= 0x3FF ) - 15, expy = -15, ilog, i; + uint32 mx = static_cast( ( abs & 0x3FF ) | ( ( abs > 0x3FF ) << 10 ) ) << 20, my, r; + for ( ; abs < 0x400; abs <<= 1, --expy ) + ; + expy += abs >> 10; + r = ( ( abs & 0x3FF ) | 0x400 ) << 5; + r *= r; + i = r >> 31; + expy = 2 * expy + i; + r >>= i; + if ( S ) + { + if ( expy < 0 ) + { + r = 0x40000000 + ( ( expy > -30 ) ? ( ( r >> -expy ) | ( ( r & ( ( static_cast( 1 ) << -expy ) - 1 ) ) != 0 ) ) : 1 ); + expy = 0; + } + else + { + r += 0x40000000 >> expy; + i = r >> 31; + r = ( r >> i ) | ( r & i ); + expy += i; + } + } + else + { + r -= 0x40000000 >> expy; + for ( ; r < 0x40000000; r <<= 1, --expy ) + ; + } + my = sqrt<30>( r, expy ); + my = ( my << 15 ) + ( r << 14 ) / my; + if ( S ) + { + mx >>= expy - expx; + ilog = expy; + } + else + { + my >>= expx - expy; + ilog = expx; + } + my += mx; + i = my >> 31; + static const int G = S && ( R == std::round_to_nearest ); + return log2_post( log2( my >> i, 26 + S + G ) + ( G << 3 ), ilog + i, 17, arg & ( static_cast( S ) << 15 ) ); + } + + /// Class for 1.31 unsigned floating-point computation + struct f31 + { + /// Constructor. + /// \param mant mantissa as 1.31 + /// \param e exponent + HALF_CONSTEXPR f31( uint32 mant, int e ) : + m( mant ), exp( e ) {} + + /// Constructor. + /// \param abs unsigned half-precision value + f31( unsigned int abs ) : + exp( -15 ) + { + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + m = static_cast( ( abs & 0x3FF ) | 0x400 ) << 21; + exp += ( abs >> 10 ); + } + + /// Addition operator. 
+ /// \param a first operand + /// \param b second operand + /// \return \a a + \a b + friend f31 operator+( f31 a, f31 b ) + { + if ( b.exp > a.exp ) + std::swap( a, b ); + int d = a.exp - b.exp; + uint32 m = a.m + ( ( d < 32 ) ? ( b.m >> d ) : 0 ); + int i = ( m & 0xFFFFFFFF ) < a.m; + return f31( ( ( m + i ) >> i ) | 0x80000000, a.exp + i ); + } + + /// Subtraction operator. + /// \param a first operand + /// \param b second operand + /// \return \a a - \a b + friend f31 operator-( f31 a, f31 b ) + { + int d = a.exp - b.exp, exp = a.exp; + uint32 m = a.m - ( ( d < 32 ) ? ( b.m >> d ) : 0 ); + if ( !m ) + return f31( 0, -32 ); + for ( ; m < 0x80000000; m <<= 1, --exp ) + ; + return f31( m, exp ); + } + + /// Multiplication operator. + /// \param a first operand + /// \param b second operand + /// \return \a a * \a b + friend f31 operator*( f31 a, f31 b ) + { + uint32 m = multiply64( a.m, b.m ); + int i = m >> 31; + return f31( m << ( 1 - i ), a.exp + b.exp + i ); + } + + /// Division operator. + /// \param a first operand + /// \param b second operand + /// \return \a a / \a b + friend f31 operator/( f31 a, f31 b ) + { + int i = a.m >= b.m, s; + uint32 m = divide64( ( a.m + i ) >> i, b.m, s ); + return f31( m, a.exp - b.exp + i - 1 ); + } + + uint32 m; ///< mantissa as 1.31. + int exp; ///< exponent. + }; + + /// Error function and postprocessing. + /// This computes the value directly in Q1.31 using the approximations given + /// [here](https://en.wikipedia.org/wiki/Error_function#Approximation_with_elementary_functions). + /// \tparam R rounding mode to use + /// \tparam C `true` for comlementary error function, `false` else + /// \param arg half-precision function argument + /// \return approximated value of error function in half-precision + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if no other exception occurred + template + unsigned int erf( unsigned int arg ) + { + unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000; + f31 x( abs ), x2 = x * x * f31( 0xB8AA3B29, 0 ), t = f31( 0x80000000, 0 ) / ( f31( 0x80000000, 0 ) + f31( 0xA7BA054A, -2 ) * x ), t2 = t * t; + f31 e = ( ( f31( 0x87DC2213, 0 ) * t2 + f31( 0xB5F0E2AE, 0 ) ) * t2 + f31( 0x82790637, -2 ) - ( f31( 0xBA00E2B8, 0 ) * t2 + f31( 0x91A98E62, -2 ) ) * t ) * t / + ( ( x2.exp < 0 ) ? f31( exp2( ( x2.exp > -32 ) ? ( x2.m >> -x2.exp ) : 0, 30 ), 0 ) : f31( exp2( ( x2.m << x2.exp ) & 0x7FFFFFFF, 22 ), x2.m >> ( 31 - x2.exp ) ) ); + return ( !C || sign ) ? fixed2half( 0x80000000 - ( e.m >> ( C - e.exp ) ), 14 + C, sign & ( C - 1U ) ) : + ( e.exp < -25 ) ? underflow() : + fixed2half( e.m >> 1, e.exp + 14, 0, e.m & 1 ); + } + + /// Gamma function and postprocessing. + /// This approximates the value of either the gamma function or its logarithm directly in Q1.31. 
+ /// \tparam R rounding mode to use + /// \tparam L `true` for lograithm of gamma function, `false` for gamma function + /// \param arg half-precision floating-point value + /// \return lgamma/tgamma(\a arg) in half-precision + /// \exception FE_OVERFLOW on overflows + /// \exception FE_UNDERFLOW on underflows + /// \exception FE_INEXACT if \a arg is not a positive integer + template + unsigned int gamma( unsigned int arg ) + { + /* static const double p[] ={ 2.50662827563479526904, 225.525584619175212544, -268.295973841304927459, 80.9030806934622512966, -5.00757863970517583837, 0.0114684895434781459556 }; + double t = arg + 4.65, s = p[0]; + for(unsigned int i=0; i<5; ++i) + s += p[i+1] / (arg+i); + return std::log(s) + (arg-0.5)*std::log(t) - t; + */ + static const f31 pi( 0xC90FDAA2, 1 ), lbe( 0xB8AA3B29, 0 ); + unsigned int abs = arg & 0x7FFF, sign = arg & 0x8000; + bool bsign = sign != 0; + f31 z( abs ), x = sign ? ( z + f31( 0x80000000, 0 ) ) : z, t = x + f31( 0x94CCCCCD, 2 ), s = f31( 0xA06C9901, 1 ) + f31( 0xBBE654E2, -7 ) / ( x + f31( 0x80000000, 2 ) ) + f31( 0xA1CE6098, 6 ) / ( x + f31( 0x80000000, 1 ) ) + f31( 0xE1868CB7, 7 ) / x - f31( 0x8625E279, 8 ) / ( x + f31( 0x80000000, 0 ) ) - f31( 0xA03E158F, 2 ) / ( x + f31( 0xC0000000, 1 ) ); + int i = ( s.exp >= 2 ) + ( s.exp >= 4 ) + ( s.exp >= 8 ) + ( s.exp >= 16 ); + s = f31( ( static_cast( s.exp ) << ( 31 - i ) ) + ( log2( s.m >> 1, 28 ) >> i ), i ) / lbe; + if ( x.exp != -1 || x.m != 0x80000000 ) + { + i = ( t.exp >= 2 ) + ( t.exp >= 4 ) + ( t.exp >= 8 ); + f31 l = f31( ( static_cast( t.exp ) << ( 31 - i ) ) + ( log2( t.m >> 1, 30 ) >> i ), i ) / lbe; + s = ( x.exp < -1 ) ? ( s - ( f31( 0x80000000, -1 ) - x ) * l ) : ( s + ( x - f31( 0x80000000, -1 ) ) * l ); + } + s = x.exp ? ( s - t ) : ( t - s ); + if ( bsign ) + { + if ( z.exp >= 0 ) + { + sign &= ( L | ( ( z.m >> ( 31 - z.exp ) ) & 1 ) ) - 1; + for ( z = f31( ( z.m << ( 1 + z.exp ) ) & 0xFFFFFFFF, -1 ); z.m < 0x80000000; z.m <<= 1, --z.exp ) + ; + } + if ( z.exp == -1 ) + z = f31( 0x80000000, 0 ) - z; + if ( z.exp < -1 ) + { + z = z * pi; + z.m = sincos( z.m >> ( 1 - z.exp ), 30 ).first; + for ( z.exp = 1; z.m < 0x80000000; z.m <<= 1, --z.exp ) + ; + } + else + z = f31( 0x80000000, 0 ); + } + if ( L ) + { + if ( bsign ) + { + f31 l( 0x92868247, 0 ); + if ( z.exp < 0 ) + { + uint32 m = log2( ( z.m + 1 ) >> 1, 27 ); + z = f31( -( ( static_cast( z.exp ) << 26 ) + ( m >> 5 ) ), 5 ); + for ( ; z.m < 0x80000000; z.m <<= 1, --z.exp ) + ; + l = l + z / lbe; + } + sign = static_cast( x.exp && ( l.exp < s.exp || ( l.exp == s.exp && l.m < s.m ) ) ) << 15; + s = sign ? ( s - l ) : x.exp ? ( l - s ) : + ( l + s ); + } + else + { + sign = static_cast( x.exp == 0 ) << 15; + if ( s.exp < -24 ) + return underflow( sign ); + if ( s.exp > 15 ) + return overflow( sign ); + } + } + else + { + s = s * lbe; + uint32 m; + if ( s.exp < 0 ) + { + m = s.m >> -s.exp; + s.exp = 0; + } + else + { + m = ( s.m << s.exp ) & 0x7FFFFFFF; + s.exp = ( s.m >> ( 31 - s.exp ) ); + } + s.m = exp2( m, 27 ); + if ( !x.exp ) + s = f31( 0x80000000, 0 ) / s; + if ( bsign ) + { + if ( z.exp < 0 ) + s = s * z; + s = pi / s; + if ( s.exp < -24 ) + return underflow( sign ); + } + else if ( z.exp > 0 && !( z.m & ( ( 1 << ( 31 - z.exp ) ) - 1 ) ) ) + return ( ( s.exp + 14 ) << 10 ) + ( s.m >> 21 ); + if ( s.exp > 15 ) + return overflow( sign ); + } + return fixed2half( s.m, s.exp + 14, sign ); + } + /// \} + + template + struct half_caster; + } // namespace detail + + /// Half-precision floating-point type. 
+ /// This class implements an IEEE-conformant half-precision floating-point type with the usual arithmetic + /// operators and conversions. It is implicitly convertible to single-precision floating-point, which makes artihmetic + /// expressions and functions with mixed-type operands to be of the most precise operand type. + /// + /// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and + /// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which + /// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the + /// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not neccessarily have to be of + /// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most + /// probably not ivolve any additional "magic" or padding beyond the simple binary representation of the underlying 16-bit + /// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if + /// your C++ implementation supports an unsigned integer type of exactly 16 bits width. But this should be the case on + /// nearly any reasonable platform. + /// + /// So if your C++ implementation is not totally exotic or imposes special alignment requirements, it is a reasonable + /// assumption that the data of a half is just comprised of the 2 bytes of the underlying IEEE representation. + class half + { + public: + /// \name Construction and assignment + /// \{ + + /// Default constructor. + /// This initializes the half to 0. Although this does not match the builtin types' default-initialization semantics + /// and may be less efficient than no initialization, it is needed to provide proper value-initialization semantics. + HALF_CONSTEXPR half() HALF_NOEXCEPT : data_() {} + + /// Conversion constructor. + /// \param rhs float to convert + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + explicit half( float rhs ) : + data_( static_cast( detail::float2half( rhs ) ) ) {} + + /// Conversion to single-precision. + /// \return single precision value representing expression value + operator float() const { return detail::half2float( data_ ); } + + /// Assignment operator. + /// \param rhs single-precision value to copy from + /// \return reference to this half + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + half &operator=( float rhs ) + { + data_ = static_cast( detail::float2half( rhs ) ); + return *this; + } + + /// \} + /// \name Arithmetic updates + /// \{ + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to add + /// \return reference to this half + /// \exception FE_... according to operator+(half,half) + half &operator+=( half rhs ) { return *this = *this + rhs; } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to subtract + /// \return reference to this half + /// \exception FE_... according to operator-(half,half) + half &operator-=( half rhs ) { return *this = *this - rhs; } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to multiply with + /// \return reference to this half + /// \exception FE_... 
according to operator*(half,half) + half &operator*=( half rhs ) { return *this = *this * rhs; } + + /// Arithmetic assignment. + /// \tparam T type of concrete half expression + /// \param rhs half expression to divide by + /// \return reference to this half + /// \exception FE_... according to operator/(half,half) + half &operator/=( half rhs ) { return *this = *this / rhs; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to add + /// \return reference to this half + /// \exception FE_... according to operator=() + half &operator+=( float rhs ) { return *this = *this + rhs; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to subtract + /// \return reference to this half + /// \exception FE_... according to operator=() + half &operator-=( float rhs ) { return *this = *this - rhs; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to multiply with + /// \return reference to this half + /// \exception FE_... according to operator=() + half &operator*=( float rhs ) { return *this = *this * rhs; } + + /// Arithmetic assignment. + /// \param rhs single-precision value to divide by + /// \return reference to this half + /// \exception FE_... according to operator=() + half &operator/=( float rhs ) { return *this = *this / rhs; } + + /// \} + /// \name Increment and decrement + /// \{ + + /// Prefix increment. + /// \return incremented half value + /// \exception FE_... according to operator+(half,half) + half &operator++() { return *this = *this + half( detail::binary, 0x3C00 ); } + + /// Prefix decrement. + /// \return decremented half value + /// \exception FE_... according to operator-(half,half) + half &operator--() { return *this = *this + half( detail::binary, 0xBC00 ); } + + /// Postfix increment. + /// \return non-incremented half value + /// \exception FE_... according to operator+(half,half) + half operator++( int ) + { + half out( *this ); + ++*this; + return out; + } + + /// Postfix decrement. + /// \return non-decremented half value + /// \exception FE_... according to operator-(half,half) + half operator--( int ) + { + half out( *this ); + --*this; + return out; + } + /// \} + + detail::uint16 getData() { return data_; } + + private: + /// Rounding mode to use + static const std::float_round_style round_style = ( std::float_round_style )( HALF_ROUND_STYLE ); + + /// Constructor. 
+ /// \param bits binary representation to set half to + HALF_CONSTEXPR half( detail::binary_t, unsigned int bits ) HALF_NOEXCEPT : data_( static_cast( bits ) ) {} + + /// Internal binary representation + detail::uint16 data_; + +#ifndef HALF_DOXYGEN_ONLY + friend HALF_CONSTEXPR_NOERR bool operator==( half, half ); + friend HALF_CONSTEXPR_NOERR bool operator!=( half, half ); + friend HALF_CONSTEXPR_NOERR bool operator<( half, half ); + friend HALF_CONSTEXPR_NOERR bool operator>( half, half ); + friend HALF_CONSTEXPR_NOERR bool operator<=( half, half ); + friend HALF_CONSTEXPR_NOERR bool operator>=( half, half ); + friend HALF_CONSTEXPR half operator-( half ); + friend half operator+( half, half ); + friend half operator-( half, half ); + friend half operator*( half, half ); + friend half operator/( half, half ); + template + friend std::basic_ostream &operator<<( std::basic_ostream &, half ); + template + friend std::basic_istream &operator>>( std::basic_istream &, half & ); + friend HALF_CONSTEXPR half fabs( half ); + friend half fmod( half, half ); + friend half remainder( half, half ); + friend half remquo( half, half, int * ); + friend half fma( half, half, half ); + friend HALF_CONSTEXPR_NOERR half fmax( half, half ); + friend HALF_CONSTEXPR_NOERR half fmin( half, half ); + friend half fdim( half, half ); + friend half nanh( const char * ); + friend half exp( half ); + friend half exp2( half ); + friend half expm1( half ); + friend half log( half ); + friend half log10( half ); + friend half log2( half ); + friend half log1p( half ); + friend half sqrt( half ); + friend half rsqrt( half ); + friend half cbrt( half ); + friend half hypot( half, half ); + friend half hypot( half, half, half ); + friend half pow( half, half ); + friend void sincos( half, half *, half * ); + friend half sin( half ); + friend half cos( half ); + friend half tan( half ); + friend half asin( half ); + friend half acos( half ); + friend half atan( half ); + friend half atan2( half, half ); + friend half sinh( half ); + friend half cosh( half ); + friend half tanh( half ); + friend half asinh( half ); + friend half acosh( half ); + friend half atanh( half ); + friend half erf( half ); + friend half erfc( half ); + friend half lgamma( half ); + friend half tgamma( half ); + friend half ceil( half ); + friend half floor( half ); + friend half trunc( half ); + friend half round( half ); + friend long lround( half ); + friend half rint( half ); + friend long lrint( half ); + friend half nearbyint( half ); +#ifdef HALF_ENABLE_CPP11_LONG_LONG + friend long long llround( half ); + friend long long llrint( half ); +#endif + friend half frexp( half, int * ); + friend half scalbln( half, long ); + friend half modf( half, half * ); + friend int ilogb( half ); + friend half logb( half ); + friend half nextafter( half, half ); + friend half nexttoward( half, long double ); + friend HALF_CONSTEXPR half copysign( half, half ); + friend HALF_CONSTEXPR int fpclassify( half ); + friend HALF_CONSTEXPR bool isfinite( half ); + friend HALF_CONSTEXPR bool isinf( half ); + friend HALF_CONSTEXPR bool isnan( half ); + friend HALF_CONSTEXPR bool isnormal( half ); + friend HALF_CONSTEXPR bool signbit( half ); + friend HALF_CONSTEXPR bool isgreater( half, half ); + friend HALF_CONSTEXPR bool isgreaterequal( half, half ); + friend HALF_CONSTEXPR bool isless( half, half ); + friend HALF_CONSTEXPR bool islessequal( half, half ); + friend HALF_CONSTEXPR bool islessgreater( half, half ); + template + friend struct detail::half_caster; + 
friend class std::numeric_limits; +#if HALF_ENABLE_CPP11_HASH + friend struct std::hash; +#endif +#if HALF_ENABLE_CPP11_USER_LITERALS + friend half literal::operator"" _h( long double ); +#endif +#endif + }; + +#if HALF_ENABLE_CPP11_USER_LITERALS + namespace literal + { + /// Half literal. + /// While this returns a properly rounded half-precision value, half literals can unfortunately not be constant + /// expressions due to rather involved conversions. So don't expect this to be a literal literal without involving + /// conversion operations at runtime. It is a convenience feature, not a performance optimization. + /// \param value literal value + /// \return half with of given value (possibly rounded) + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half operator"" _h( long double value ) + { + return half( detail::binary, detail::float2half( value ) ); + } + } // namespace literal +#endif + + namespace detail + { + /// Helper class for half casts. + /// This class template has to be specialized for all valid cast arguments to define an appropriate static + /// `cast` member function and a corresponding `type` member denoting its return type. + /// \tparam T destination type + /// \tparam U source type + /// \tparam R rounding mode to use + template + struct half_caster + { + }; + template + struct half_caster + { +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert( std::is_arithmetic::value, "half_cast from non-arithmetic type unsupported" ); +#endif + + static half cast( U arg ) + { + return cast_impl( arg, is_float() ); + }; + + private: + static half cast_impl( U arg, true_type ) { return half( binary, float2half( arg ) ); } + static half cast_impl( U arg, false_type ) { return half( binary, int2half( arg ) ); } + }; + template + struct half_caster + { +#if HALF_ENABLE_CPP11_STATIC_ASSERT && HALF_ENABLE_CPP11_TYPE_TRAITS + static_assert( std::is_arithmetic::value, "half_cast to non-arithmetic type unsupported" ); +#endif + + static T cast( half arg ) + { + return cast_impl( arg, is_float() ); + } + + private: + static T cast_impl( half arg, true_type ) { return half2float( arg.data_ ); } + static T cast_impl( half arg, false_type ) { return half2int( arg.data_ ); } + }; + template + struct half_caster + { + static half cast( half arg ) { return arg; } + }; + } // namespace detail +} // namespace half_float + +/// Extensions to the C++ standard library. +namespace std +{ + /// Numeric limits for half-precision floats. + /// **See also:** Documentation for [std::numeric_limits](https://en.cppreference.com/w/cpp/types/numeric_limits) + template <> + class numeric_limits + { + public: + /// Is template specialization. + static HALF_CONSTEXPR_CONST bool is_specialized = true; + + /// Supports signed values. + static HALF_CONSTEXPR_CONST bool is_signed = true; + + /// Is not an integer type. + static HALF_CONSTEXPR_CONST bool is_integer = false; + + /// Is not exact. + static HALF_CONSTEXPR_CONST bool is_exact = false; + + /// Doesn't provide modulo arithmetic. + static HALF_CONSTEXPR_CONST bool is_modulo = false; + + /// Has a finite set of values. + static HALF_CONSTEXPR_CONST bool is_bounded = true; + + /// IEEE conformant. + static HALF_CONSTEXPR_CONST bool is_iec559 = true; + + /// Supports infinity. + static HALF_CONSTEXPR_CONST bool has_infinity = true; + + /// Supports quiet NaNs. + static HALF_CONSTEXPR_CONST bool has_quiet_NaN = true; + + /// Supports signaling NaNs. 
+ static HALF_CONSTEXPR_CONST bool has_signaling_NaN = true; + + /// Supports subnormal values. + static HALF_CONSTEXPR_CONST float_denorm_style has_denorm = denorm_present; + + /// Supports no denormalization detection. + static HALF_CONSTEXPR_CONST bool has_denorm_loss = false; + +#if HALF_ERRHANDLING_THROWS + static HALF_CONSTEXPR_CONST bool traps = true; +#else + /// Traps only if [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID) is acitvated. + static HALF_CONSTEXPR_CONST bool traps = false; +#endif + + /// Does not support no pre-rounding underflow detection. + static HALF_CONSTEXPR_CONST bool tinyness_before = false; + + /// Rounding mode. + static HALF_CONSTEXPR_CONST float_round_style round_style = half_float::half::round_style; + + /// Significant digits. + static HALF_CONSTEXPR_CONST int digits = 11; + + /// Significant decimal digits. + static HALF_CONSTEXPR_CONST int digits10 = 3; + + /// Required decimal digits to represent all possible values. + static HALF_CONSTEXPR_CONST int max_digits10 = 5; + + /// Number base. + static HALF_CONSTEXPR_CONST int radix = 2; + + /// One more than smallest exponent. + static HALF_CONSTEXPR_CONST int min_exponent = -13; + + /// Smallest normalized representable power of 10. + static HALF_CONSTEXPR_CONST int min_exponent10 = -4; + + /// One more than largest exponent + static HALF_CONSTEXPR_CONST int max_exponent = 16; + + /// Largest finitely representable power of 10. + static HALF_CONSTEXPR_CONST int max_exponent10 = 4; + + /// Smallest positive normal value. + static HALF_CONSTEXPR half_float::half min() HALF_NOTHROW { return half_float::half( half_float::detail::binary, 0x0400 ); } + + /// Smallest finite value. + static HALF_CONSTEXPR half_float::half lowest() HALF_NOTHROW { return half_float::half( half_float::detail::binary, 0xFBFF ); } + + /// Largest finite value. + static HALF_CONSTEXPR half_float::half max() HALF_NOTHROW { return half_float::half( half_float::detail::binary, 0x7BFF ); } + + /// Difference between 1 and next representable value. + static HALF_CONSTEXPR half_float::half epsilon() HALF_NOTHROW { return half_float::half( half_float::detail::binary, 0x1400 ); } + + /// Maximum rounding error in ULP (units in the last place). + static HALF_CONSTEXPR half_float::half round_error() HALF_NOTHROW + { + return half_float::half( half_float::detail::binary, ( round_style == std::round_to_nearest ) ? 0x3800 : 0x3C00 ); + } + + /// Positive infinity. + static HALF_CONSTEXPR half_float::half infinity() HALF_NOTHROW { return half_float::half( half_float::detail::binary, 0x7C00 ); } + + /// Quiet NaN. + static HALF_CONSTEXPR half_float::half quiet_NaN() HALF_NOTHROW { return half_float::half( half_float::detail::binary, 0x7FFF ); } + + /// Signaling NaN. + static HALF_CONSTEXPR half_float::half signaling_NaN() HALF_NOTHROW { return half_float::half( half_float::detail::binary, 0x7DFF ); } + + /// Smallest positive subnormal value. + static HALF_CONSTEXPR half_float::half denorm_min() HALF_NOTHROW { return half_float::half( half_float::detail::binary, 0x0001 ); } + }; + +#if HALF_ENABLE_CPP11_HASH + /// Hash function for half-precision floats. + /// This is only defined if C++11 `std::hash` is supported and enabled. + /// + /// **See also:** Documentation for [std::hash](https://en.cppreference.com/w/cpp/utility/hash) + template <> + struct hash + { + /// Type of function argument. + typedef half_float::half argument_type; + + /// Function return type. + typedef size_t result_type; + + /// Compute hash function. 
+ /// \param arg half to hash + /// \return hash value + result_type operator()( argument_type arg ) const { return hash()( arg.data_ & -static_cast( arg.data_ != 0x8000 ) ); } + }; +#endif +} // namespace std + +namespace half_float +{ + /// \anchor compop + /// \name Comparison operators + /// \{ + + /// Comparison for equality. + /// \param x first operand + /// \param y second operand + /// \retval true if operands equal + /// \retval false else + /// \exception FE_INVALID if \a x or \a y is NaN + inline HALF_CONSTEXPR_NOERR bool operator==( half x, half y ) + { + return !detail::compsignal( x.data_, y.data_ ) && ( x.data_ == y.data_ || !( ( x.data_ | y.data_ ) & 0x7FFF ) ); + } + + /// Comparison for inequality. + /// \param x first operand + /// \param y second operand + /// \retval true if operands not equal + /// \retval false else + /// \exception FE_INVALID if \a x or \a y is NaN + inline HALF_CONSTEXPR_NOERR bool operator!=( half x, half y ) + { + return detail::compsignal( x.data_, y.data_ ) || ( x.data_ != y.data_ && ( ( x.data_ | y.data_ ) & 0x7FFF ) ); + } + + /// Comparison for less than. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less than \a y + /// \retval false else + /// \exception FE_INVALID if \a x or \a y is NaN + inline HALF_CONSTEXPR_NOERR bool operator<( half x, half y ) + { + return !detail::compsignal( x.data_, y.data_ ) && + ( ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) + ( x.data_ >> 15 ) ) < ( ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) + ( y.data_ >> 15 ) ); + } + + /// Comparison for greater than. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater than \a y + /// \retval false else + /// \exception FE_INVALID if \a x or \a y is NaN + inline HALF_CONSTEXPR_NOERR bool operator>( half x, half y ) + { + return !detail::compsignal( x.data_, y.data_ ) && + ( ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) + ( x.data_ >> 15 ) ) > ( ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) + ( y.data_ >> 15 ) ); + } + + /// Comparison for less equal. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less equal \a y + /// \retval false else + /// \exception FE_INVALID if \a x or \a y is NaN + inline HALF_CONSTEXPR_NOERR bool operator<=( half x, half y ) + { + return !detail::compsignal( x.data_, y.data_ ) && + ( ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) + ( x.data_ >> 15 ) ) <= ( ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) + ( y.data_ >> 15 ) ); + } + + /// Comparison for greater equal. + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater equal \a y + /// \retval false else + /// \exception FE_INVALID if \a x or \a y is NaN + inline HALF_CONSTEXPR_NOERR bool operator>=( half x, half y ) + { + return !detail::compsignal( x.data_, y.data_ ) && + ( ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) + ( x.data_ >> 15 ) ) >= ( ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) + ( y.data_ >> 15 ) ); + } + + /// \} + /// \anchor arithmetics + /// \name Arithmetic operators + /// \{ + + /// Identity. + /// \param arg operand + /// \return unchanged operand + inline HALF_CONSTEXPR half operator+( half arg ) + { + return arg; + } + + /// Negation. + /// \param arg operand + /// \return negated operand + inline HALF_CONSTEXPR half operator-( half arg ) + { + return half( detail::binary, arg.data_ ^ 0x8000 ); + } + + /// Addition. 
+ /// This operation is exact to rounding for all rounding modes. + /// \param x left operand + /// \param y right operand + /// \return sum of half expressions + /// \exception FE_INVALID if \a x and \a y are infinities with different signs or signaling NaNs + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half operator+( half x, half y ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( detail::half2float( x.data_ ) + detail::half2float( y.data_ ) ) ); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF; + bool sub = ( ( x.data_ ^ y.data_ ) & 0x8000 ) != 0; + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + return half( detail::binary, ( absx > 0x7C00 || absy > 0x7C00 ) ? detail::signal( x.data_, y.data_ ) : ( absy != 0x7C00 ) ? x.data_ : + ( sub && absx == 0x7C00 ) ? detail::invalid() : + y.data_ ); + if ( !absx ) + return absy ? y : half( detail::binary, ( half::round_style == std::round_toward_neg_infinity ) ? ( x.data_ | y.data_ ) : ( x.data_ & y.data_ ) ); + if ( !absy ) + return x; + unsigned int sign = ( ( sub && absy > absx ) ? y.data_ : x.data_ ) & 0x8000; + if ( absy > absx ) + std::swap( absx, absy ); + int exp = ( absx >> 10 ) + ( absx <= 0x3FF ), d = exp - ( absy >> 10 ) - ( absy <= 0x3FF ), mx = ( ( absx & 0x3FF ) | ( ( absx > 0x3FF ) << 10 ) ) << 3, my; + if ( d < 13 ) + { + my = ( ( absy & 0x3FF ) | ( ( absy > 0x3FF ) << 10 ) ) << 3; + my = ( my >> d ) | ( ( my & ( ( 1 << d ) - 1 ) ) != 0 ); + } + else + my = 1; + if ( sub ) + { + if ( !( mx -= my ) ) + return half( detail::binary, static_cast( half::round_style == std::round_toward_neg_infinity ) << 15 ); + for ( ; mx < 0x2000 && exp > 1; mx <<= 1, --exp ) + ; + } + else + { + mx += my; + int i = mx >> 14; + if ( ( exp += i ) > 30 ) + return half( detail::binary, detail::overflow( sign ) ); + mx = ( mx >> i ) | ( mx & i ); + } + return half( detail::binary, detail::rounded( sign + ( ( exp - 1 ) << 10 ) + ( mx >> 3 ), ( mx >> 2 ) & 1, ( mx & 0x3 ) != 0 ) ); +#endif + } + + /// Subtraction. + /// This operation is exact to rounding for all rounding modes. + /// \param x left operand + /// \param y right operand + /// \return difference of half expressions + /// \exception FE_INVALID if \a x and \a y are infinities with equal signs or signaling NaNs + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half operator-( half x, half y ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( detail::half2float( x.data_ ) - detail::half2float( y.data_ ) ) ); +#else + return x + -y; +#endif + } + + /// Multiplication. + /// This operation is exact to rounding for all rounding modes. + /// \param x left operand + /// \param y right operand + /// \return product of half expressions + /// \exception FE_INVALID if multiplying 0 with infinity or if \a x or \a y is signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half operator*( half x, half y ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( detail::half2float( x.data_ ) * detail::half2float( y.data_ ) ) ); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -16; + unsigned int sign = ( x.data_ ^ y.data_ ) & 0x8000; + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + return half( detail::binary, ( absx > 0x7C00 || absy > 0x7C00 ) ? detail::signal( x.data_, y.data_ ) : + ( ( absx == 0x7C00 && !absy ) || ( absy == 0x7C00 && !absx ) ) ? 
detail::invalid() : + ( sign | 0x7C00 ) ); + if ( !absx || !absy ) + return half( detail::binary, sign ); + for ( ; absx < 0x400; absx <<= 1, --exp ) + ; + for ( ; absy < 0x400; absy <<= 1, --exp ) + ; + detail::uint32 m = static_cast( ( absx & 0x3FF ) | 0x400 ) * static_cast( ( absy & 0x3FF ) | 0x400 ); + int i = m >> 21, s = m & i; + exp += ( absx >> 10 ) + ( absy >> 10 ) + i; + if ( exp > 29 ) + return half( detail::binary, detail::overflow( sign ) ); + else if ( exp < -11 ) + return half( detail::binary, detail::underflow( sign ) ); + return half( detail::binary, detail::fixed2half( m >> i, exp, sign, s ) ); +#endif + } + + /// Division. + /// This operation is exact to rounding for all rounding modes. + /// \param x left operand + /// \param y right operand + /// \return quotient of half expressions + /// \exception FE_INVALID if dividing 0s or infinities with each other or if \a x or \a y is signaling NaN + /// \exception FE_DIVBYZERO if dividing finite value by 0 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half operator/( half x, half y ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( detail::half2float( x.data_ ) / detail::half2float( y.data_ ) ) ); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = 14; + unsigned int sign = ( x.data_ ^ y.data_ ) & 0x8000; + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + return half( detail::binary, ( absx > 0x7C00 || absy > 0x7C00 ) ? detail::signal( x.data_, y.data_ ) : + ( absx == absy ) ? detail::invalid() : + ( sign | ( ( absx == 0x7C00 ) ? 0x7C00 : 0 ) ) ); + if ( !absx ) + return half( detail::binary, absy ? sign : detail::invalid() ); + if ( !absy ) + return half( detail::binary, detail::pole( sign ) ); + for ( ; absx < 0x400; absx <<= 1, --exp ) + ; + for ( ; absy < 0x400; absy <<= 1, ++exp ) + ; + detail::uint32 mx = ( absx & 0x3FF ) | 0x400, my = ( absy & 0x3FF ) | 0x400; + int i = mx < my; + exp += ( absx >> 10 ) - ( absy >> 10 ) - i; + if ( exp > 29 ) + return half( detail::binary, detail::overflow( sign ) ); + else if ( exp < -11 ) + return half( detail::binary, detail::underflow( sign ) ); + mx <<= 12 + i; + my <<= 1; + return half( detail::binary, detail::fixed2half( mx / my, exp, sign, mx % my != 0 ) ); +#endif + } + + /// \} + /// \anchor streaming + /// \name Input and output + /// \{ + + /// Output operator. + /// This uses the built-in functionality for streaming out floating-point numbers. + /// \param out output stream to write into + /// \param arg half expression to write + /// \return reference to output stream + template + std::basic_ostream &operator<<( std::basic_ostream &out, half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return out << detail::half2float( arg.data_ ); +#else + return out << detail::half2float( arg.data_ ); +#endif + } + + /// Input operator. + /// This uses the built-in functionality for streaming in floating-point numbers, specifically double precision floating + /// point numbers (unless overridden with [HALF_ARITHMETIC_TYPE](\ref HALF_ARITHMETIC_TYPE)). So the input string is first + /// rounded to double precision using the underlying platform's current floating-point rounding mode before being rounded + /// to half-precision using the library's half-precision rounding mode. 
+ /// \param in input stream to read from + /// \param arg half to read into + /// \return reference to input stream + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + template + std::basic_istream &operator>>( std::basic_istream &in, half &arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t f; +#else + double f; +#endif + if ( in >> f ) + arg.data_ = detail::float2half( f ); + return in; + } + + /// \} + /// \anchor basic + /// \name Basic mathematical operations + /// \{ + + /// Absolute value. + /// **See also:** Documentation for [std::fabs](https://en.cppreference.com/w/cpp/numeric/math/fabs). + /// \param arg operand + /// \return absolute value of \a arg + inline HALF_CONSTEXPR half fabs( half arg ) + { + return half( detail::binary, arg.data_ & 0x7FFF ); + } + + /// Absolute value. + /// **See also:** Documentation for [std::abs](https://en.cppreference.com/w/cpp/numeric/math/fabs). + /// \param arg operand + /// \return absolute value of \a arg + inline HALF_CONSTEXPR half abs( half arg ) + { + return fabs( arg ); + } + + /// Remainder of division. + /// **See also:** Documentation for [std::fmod](https://en.cppreference.com/w/cpp/numeric/math/fmod). + /// \param x first operand + /// \param y second operand + /// \return remainder of floating-point division. + /// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN + inline half fmod( half x, half y ) + { + unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, sign = x.data_ & 0x8000; + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + return half( detail::binary, ( absx > 0x7C00 || absy > 0x7C00 ) ? detail::signal( x.data_, y.data_ ) : + ( absx == 0x7C00 ) ? detail::invalid() : + x.data_ ); + if ( !absy ) + return half( detail::binary, detail::invalid() ); + if ( !absx ) + return x; + if ( absx == absy ) + return half( detail::binary, sign ); + return half( detail::binary, sign | detail::mod( absx, absy ) ); + } + + /// Remainder of division. + /// **See also:** Documentation for [std::remainder](https://en.cppreference.com/w/cpp/numeric/math/remainder). + /// \param x first operand + /// \param y second operand + /// \return remainder of floating-point division. + /// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN + inline half remainder( half x, half y ) + { + unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, sign = x.data_ & 0x8000; + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + return half( detail::binary, ( absx > 0x7C00 || absy > 0x7C00 ) ? detail::signal( x.data_, y.data_ ) : + ( absx == 0x7C00 ) ? detail::invalid() : + x.data_ ); + if ( !absy ) + return half( detail::binary, detail::invalid() ); + if ( absx == absy ) + return half( detail::binary, sign ); + return half( detail::binary, sign ^ detail::mod( absx, absy ) ); + } + + /// Remainder of division. + /// **See also:** Documentation for [std::remquo](https://en.cppreference.com/w/cpp/numeric/math/remquo). + /// \param x first operand + /// \param y second operand + /// \param quo address to store some bits of quotient at + /// \return remainder of floating-point division. + /// \exception FE_INVALID if \a x is infinite or \a y is 0 or if \a x or \a y is signaling NaN + inline half remquo( half x, half y, int *quo ) + { + unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, value = x.data_ & 0x8000; + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + return half( detail::binary, ( absx > 0x7C00 || absy > 0x7C00 ) ? 
detail::signal( x.data_, y.data_ ) : + ( absx == 0x7C00 ) ? detail::invalid() : + ( *quo = 0, x.data_ ) ); + if ( !absy ) + return half( detail::binary, detail::invalid() ); + bool qsign = ( ( value ^ y.data_ ) & 0x8000 ) != 0; + int q = 1; + if ( absx != absy ) + value ^= detail::mod( absx, absy, &q ); + return *quo = qsign ? -q : q, half( detail::binary, value ); + } + + /// Fused multiply add. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::fma](https://en.cppreference.com/w/cpp/numeric/math/fma). + /// \param x first operand + /// \param y second operand + /// \param z third operand + /// \return ( \a x * \a y ) + \a z rounded as one operation. + /// \exception FE_INVALID according to operator*() and operator+() unless any argument is a quiet NaN and no argument is a signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding the final addition + inline half fma( half x, half y, half z ) + { +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t fx = detail::half2float( x.data_ ), fy = detail::half2float( y.data_ ), fz = detail::half2float( z.data_ ); +#if HALF_ENABLE_CPP11_CMATH && FP_FAST_FMA + return half( detail::binary, detail::float2half( std::fma( fx, fy, fz ) ) ); +#else + return half( detail::binary, detail::float2half( fx * fy + fz ) ); +#endif +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, exp = -15; + unsigned int sign = ( x.data_ ^ y.data_ ) & 0x8000; + bool sub = ( ( sign ^ z.data_ ) & 0x8000 ) != 0; + if ( absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00 ) + return ( absx > 0x7C00 || absy > 0x7C00 || absz > 0x7C00 ) ? half( detail::binary, detail::signal( x.data_, y.data_, z.data_ ) ) : + ( absx == 0x7C00 ) ? half( detail::binary, ( !absy || ( sub && absz == 0x7C00 ) ) ? detail::invalid() : ( sign | 0x7C00 ) ) : + ( absy == 0x7C00 ) ? half( detail::binary, ( !absx || ( sub && absz == 0x7C00 ) ) ? detail::invalid() : ( sign | 0x7C00 ) ) : + z; + if ( !absx || !absy ) + return absz ? z : half( detail::binary, ( half::round_style == std::round_toward_neg_infinity ) ? ( z.data_ | sign ) : ( z.data_ & sign ) ); + for ( ; absx < 0x400; absx <<= 1, --exp ) + ; + for ( ; absy < 0x400; absy <<= 1, --exp ) + ; + detail::uint32 m = static_cast( ( absx & 0x3FF ) | 0x400 ) * static_cast( ( absy & 0x3FF ) | 0x400 ); + int i = m >> 21; + exp += ( absx >> 10 ) + ( absy >> 10 ) + i; + m <<= 3 - i; + if ( absz ) + { + int expz = 0; + for ( ; absz < 0x400; absz <<= 1, --expz ) + ; + expz += absz >> 10; + detail::uint32 mz = static_cast( ( absz & 0x3FF ) | 0x400 ) << 13; + if ( expz > exp || ( expz == exp && mz > m ) ) + { + std::swap( m, mz ); + std::swap( exp, expz ); + if ( sub ) + sign = z.data_ & 0x8000; + } + int d = exp - expz; + mz = ( d < 23 ) ? ( ( mz >> d ) | ( ( mz & ( ( static_cast( 1 ) << d ) - 1 ) ) != 0 ) ) : 1; + if ( sub ) + { + m = m - mz; + if ( !m ) + return half( detail::binary, static_cast( half::round_style == std::round_toward_neg_infinity ) << 15 ); + for ( ; m < 0x800000; m <<= 1, --exp ) + ; + } + else + { + m += mz; + i = m >> 24; + m = ( m >> i ) | ( m & i ); + exp += i; + } + } + if ( exp > 30 ) + return half( detail::binary, detail::overflow( sign ) ); + else if ( exp < -10 ) + return half( detail::binary, detail::underflow( sign ) ); + return half( detail::binary, detail::fixed2half( m, exp - 1, sign ) ); +#endif + } + + /// Maximum of half expressions. 
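To make the single-rounding guarantee of fma() above concrete, a tiny sketch (the helper name is illustrative) comparing the fused and unfused forms:

    #include "half.hpp"

    // Returns true when the fused multiply-add differs from the separately rounded form.
    bool fused_differs( half_float::half x, half_float::half y, half_float::half z )
    {
        half_float::half fused   = fma( x, y, z ); // rounds once, after the full product-sum
        half_float::half unfused = x * y + z;      // rounds after * and again after +
        return fused != unfused;
    }
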
+ /// **See also:** Documentation for [std::fmax](https://en.cppreference.com/w/cpp/numeric/math/fmax). + /// \param x first operand + /// \param y second operand + /// \return maximum of operands, ignoring quiet NaNs + /// \exception FE_INVALID if \a x or \a y is signaling NaN + inline HALF_CONSTEXPR_NOERR half fmax( half x, half y ) + { + return half( detail::binary, ( !isnan( y ) && ( isnan( x ) || ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) < + ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) ) ) ? + detail::select( y.data_, x.data_ ) : + detail::select( x.data_, y.data_ ) ); + } + + /// Minimum of half expressions. + /// **See also:** Documentation for [std::fmin](https://en.cppreference.com/w/cpp/numeric/math/fmin). + /// \param x first operand + /// \param y second operand + /// \return minimum of operands, ignoring quiet NaNs + /// \exception FE_INVALID if \a x or \a y is signaling NaN + inline HALF_CONSTEXPR_NOERR half fmin( half x, half y ) + { + return half( detail::binary, ( !isnan( y ) && ( isnan( x ) || ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) > + ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) ) ) ? + detail::select( y.data_, x.data_ ) : + detail::select( x.data_, y.data_ ) ); + } + + /// Positive difference. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::fdim](https://en.cppreference.com/w/cpp/numeric/math/fdim). + /// \param x first operand + /// \param y second operand + /// \return \a x - \a y or 0 if difference negative + /// \exception FE_... according to operator-(half,half) + inline half fdim( half x, half y ) + { + if ( isnan( x ) || isnan( y ) ) + return half( detail::binary, detail::signal( x.data_, y.data_ ) ); + return ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) <= ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) ? half( detail::binary, 0 ) : ( x - y ); + } + + /// Get NaN value. + /// **See also:** Documentation for [std::nan](https://en.cppreference.com/w/cpp/numeric/math/nan). + /// \param arg string code + /// \return quiet NaN + inline half nanh( const char *arg ) + { + unsigned int value = 0x7FFF; + while ( *arg ) + value ^= static_cast( *arg++ ) & 0xFF; + return half( detail::binary, value ); + } + + /// \} + /// \anchor exponential + /// \name Exponential functions + /// \{ + + /// Exponential function. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::exp](https://en.cppreference.com/w/cpp/numeric/math/exp). + /// \param arg function argument + /// \return e raised to \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half exp( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::exp( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, e = ( abs >> 10 ) + ( abs <= 0x3FF ), exp; + if ( !abs ) + return half( detail::binary, 0x3C00 ); + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? ( 0x7C00 & ( ( arg.data_ >> 15 ) - 1U ) ) : detail::signal( arg.data_ ) ); + if ( abs >= 0x4C80 ) + return half( detail::binary, ( arg.data_ & 0x8000 ) ? 
detail::underflow() : detail::overflow() ); + detail::uint32 m = detail::multiply64( static_cast( ( abs & 0x3FF ) + ( ( abs > 0x3FF ) << 10 ) ) << 21, 0xB8AA3B29 ); + if ( e < 14 ) + { + exp = 0; + m >>= 14 - e; + } + else + { + exp = m >> ( 45 - e ); + m = ( m << ( e - 14 ) ) & 0x7FFFFFFF; + } + return half( detail::binary, detail::exp2_post( m, exp, ( arg.data_ & 0x8000 ) != 0, 0, 26 ) ); +#endif + } + + /// Binary exponential. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::exp2](https://en.cppreference.com/w/cpp/numeric/math/exp2). + /// \param arg function argument + /// \return 2 raised to \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half exp2( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::exp2( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, e = ( abs >> 10 ) + ( abs <= 0x3FF ), exp = ( abs & 0x3FF ) + ( ( abs > 0x3FF ) << 10 ); + if ( !abs ) + return half( detail::binary, 0x3C00 ); + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? ( 0x7C00 & ( ( arg.data_ >> 15 ) - 1U ) ) : detail::signal( arg.data_ ) ); + if ( abs >= 0x4E40 ) + return half( detail::binary, ( arg.data_ & 0x8000 ) ? detail::underflow() : detail::overflow() ); + return half( detail::binary, detail::exp2_post( + ( static_cast( exp ) << ( 6 + e ) ) & 0x7FFFFFFF, exp >> ( 25 - e ), ( arg.data_ & 0x8000 ) != 0, 0, 28 ) ); +#endif + } + + /// Exponential minus one. + /// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for `std::round_to_nearest` + /// and in <1% of inputs for any other rounding mode. + /// + /// **See also:** Documentation for [std::expm1](https://en.cppreference.com/w/cpp/numeric/math/expm1). + /// \param arg function argument + /// \return e raised to \a arg and subtracted by 1 + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half expm1( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::expm1( detail::half2float( arg.data_ ) ) ) ); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000, e = ( abs >> 10 ) + ( abs <= 0x3FF ), exp; + if ( !abs ) + return arg; + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? ( 0x7C00 + ( sign >> 1 ) ) : detail::signal( arg.data_ ) ); + if ( abs >= 0x4A00 ) + return half( detail::binary, ( arg.data_ & 0x8000 ) ? detail::rounded( 0xBBFF, 1, 1 ) : detail::overflow() ); + detail::uint32 m = detail::multiply64( static_cast( ( abs & 0x3FF ) + ( ( abs > 0x3FF ) << 10 ) ) << 21, 0xB8AA3B29 ); + if ( e < 14 ) + { + exp = 0; + m >>= 14 - e; + } + else + { + exp = m >> ( 45 - e ); + m = ( m << ( e - 14 ) ) & 0x7FFFFFFF; + } + m = detail::exp2( m ); + if ( sign ) + { + int s = 0; + if ( m > 0x80000000 ) + { + ++exp; + m = detail::divide64( 0x80000000, m, s ); + } + m = 0x80000000 - ( ( m >> exp ) | ( ( m & ( ( static_cast( 1 ) << exp ) - 1 ) ) != 0 ) | s ); + exp = 0; + } + else + m -= ( exp < 31 ) ? 
( 0x80000000 >> exp ) : 1; + for ( exp += 14; m < 0x80000000 && exp; m <<= 1, --exp ) + ; + if ( exp > 29 ) + return half( detail::binary, detail::overflow() ); + return half( detail::binary, detail::rounded( sign + ( exp << 10 ) + ( m >> 21 ), ( m >> 20 ) & 1, ( m & 0xFFFFF ) != 0 ) ); +#endif + } + + /// Natural logarithm. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::log](https://en.cppreference.com/w/cpp/numeric/math/log). + /// \param arg function argument + /// \return logarithm of \a arg to base e + /// \exception FE_INVALID for signaling NaN or negative argument + /// \exception FE_DIVBYZERO for 0 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half log( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::log( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp = -15; + if ( !abs ) + return half( detail::binary, detail::pole( 0x8000 ) ); + if ( arg.data_ & 0x8000 ) + return half( detail::binary, ( arg.data_ <= 0xFC00 ) ? detail::invalid() : detail::signal( arg.data_ ) ); + if ( abs >= 0x7C00 ) + return ( abs == 0x7C00 ) ? arg : half( detail::binary, detail::signal( arg.data_ ) ); + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + exp += abs >> 10; + return half( detail::binary, detail::log2_post( + detail::log2( static_cast( ( abs & 0x3FF ) | 0x400 ) << 20, 27 ) + 8, exp, 17 ) ); +#endif + } + + /// Common logarithm. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::log10](https://en.cppreference.com/w/cpp/numeric/math/log10). + /// \param arg function argument + /// \return logarithm of \a arg to base 10 + /// \exception FE_INVALID for signaling NaN or negative argument + /// \exception FE_DIVBYZERO for 0 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half log10( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::log10( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp = -15; + if ( !abs ) + return half( detail::binary, detail::pole( 0x8000 ) ); + if ( arg.data_ & 0x8000 ) + return half( detail::binary, ( arg.data_ <= 0xFC00 ) ? detail::invalid() : detail::signal( arg.data_ ) ); + if ( abs >= 0x7C00 ) + return ( abs == 0x7C00 ) ? arg : half( detail::binary, detail::signal( arg.data_ ) ); + switch ( abs ) + { + case 0x4900: + return half( detail::binary, 0x3C00 ); + case 0x5640: + return half( detail::binary, 0x4000 ); + case 0x63D0: + return half( detail::binary, 0x4200 ); + case 0x70E2: + return half( detail::binary, 0x4400 ); + } + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + exp += abs >> 10; + return half( detail::binary, detail::log2_post( + detail::log2( static_cast( ( abs & 0x3FF ) | 0x400 ) << 20, 27 ) + 8, exp, 16 ) ); +#endif + } + + /// Binary logarithm. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::log2](https://en.cppreference.com/w/cpp/numeric/math/log2). 
+ /// \param arg function argument + /// \return logarithm of \a arg to base 2 + /// \exception FE_INVALID for signaling NaN or negative argument + /// \exception FE_DIVBYZERO for 0 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half log2( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::log2( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp = -15, s = 0; + if ( !abs ) + return half( detail::binary, detail::pole( 0x8000 ) ); + if ( arg.data_ & 0x8000 ) + return half( detail::binary, ( arg.data_ <= 0xFC00 ) ? detail::invalid() : detail::signal( arg.data_ ) ); + if ( abs >= 0x7C00 ) + return ( abs == 0x7C00 ) ? arg : half( detail::binary, detail::signal( arg.data_ ) ); + if ( abs == 0x3C00 ) + return half( detail::binary, 0 ); + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + exp += ( abs >> 10 ); + if ( !( abs & 0x3FF ) ) + { + unsigned int value = static_cast( exp < 0 ) << 15, m = std::abs( exp ) << 6; + for ( exp = 18; m < 0x400; m <<= 1, --exp ) + ; + return half( detail::binary, value + ( exp << 10 ) + m ); + } + detail::uint32 ilog = exp, sign = detail::sign_mask( ilog ), m = ( ( ( ilog << 27 ) + ( detail::log2( static_cast( ( abs & 0x3FF ) | 0x400 ) << 20, 28 ) >> 4 ) ) ^ sign ) - sign; + if ( !m ) + return half( detail::binary, 0 ); + for ( exp = 14; m < 0x8000000 && exp; m <<= 1, --exp ) + ; + for ( ; m > 0xFFFFFFF; m >>= 1, ++exp ) + s |= m & 1; + return half( detail::binary, detail::fixed2half( m, exp, sign & 0x8000, s ) ); +#endif + } + + /// Natural logarithm plus one. + /// This function may be 1 ULP off the correctly rounded exact result in <0.05% of inputs for `std::round_to_nearest` + /// and in ~1% of inputs for any other rounding mode. + /// + /// **See also:** Documentation for [std::log1p](https://en.cppreference.com/w/cpp/numeric/math/log1p). + /// \param arg function argument + /// \return logarithm of \a arg plus 1 to base e + /// \exception FE_INVALID for signaling NaN or argument <-1 + /// \exception FE_DIVBYZERO for -1 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half log1p( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::log1p( detail::half2float( arg.data_ ) ) ) ); +#else + if ( arg.data_ >= 0xBC00 ) + return half( detail::binary, ( arg.data_ == 0xBC00 ) ? detail::pole( 0x8000 ) : ( arg.data_ <= 0xFC00 ) ? detail::invalid() : + detail::signal( arg.data_ ) ); + int abs = arg.data_ & 0x7FFF, exp = -15; + if ( !abs || abs >= 0x7C00 ) + return ( abs > 0x7C00 ) ? half( detail::binary, detail::signal( arg.data_ ) ) : arg; + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + exp += abs >> 10; + detail::uint32 m = static_cast( ( abs & 0x3FF ) | 0x400 ) << 20; + if ( arg.data_ & 0x8000 ) + { + m = 0x40000000 - ( m >> -exp ); + for ( exp = 0; m < 0x40000000; m <<= 1, --exp ) + ; + } + else + { + if ( exp < 0 ) + { + m = 0x40000000 + ( m >> -exp ); + exp = 0; + } + else + { + m += 0x40000000 >> exp; + int i = m >> 31; + m >>= i; + exp += i; + } + } + return half( detail::binary, detail::log2_post( detail::log2( m ), exp, 17 ) ); +#endif + } + + /// \} + /// \anchor power + /// \name Power functions + /// \{ + + /// Square root. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::sqrt](https://en.cppreference.com/w/cpp/numeric/math/sqrt). 
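Where arguments are close to zero, the log1p() above keeps precision that log(1 + x) would lose to cancellation in half; a minimal sketch with an illustrative helper:

    #include "half.hpp"

    using half_float::half;

    // Example: per-step logarithmic growth for a small rate.
    half log_growth( half rate )
    {
        return log1p( rate ); // more accurate than log( half( 1.0f ) + rate ) for small rates
    }
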
+ /// \param arg function argument + /// \return square root of \a arg + /// \exception FE_INVALID for signaling NaN and negative arguments + /// \exception FE_INEXACT according to rounding + inline half sqrt( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::sqrt( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp = 15; + if ( !abs || arg.data_ >= 0x7C00 ) + return half( detail::binary, ( abs > 0x7C00 ) ? detail::signal( arg.data_ ) : ( arg.data_ > 0x8000 ) ? detail::invalid() : + arg.data_ ); + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + detail::uint32 r = static_cast( ( abs & 0x3FF ) | 0x400 ) << 10, m = detail::sqrt<20>( r, exp += abs >> 10 ); + return half( detail::binary, detail::rounded( ( exp << 10 ) + ( m & 0x3FF ), r > m, r != 0 ) ); +#endif + } + + /// Inverse square root. + /// This function is exact to rounding for all rounding modes and thus generally more accurate than directly computing + /// 1 / sqrt(\a arg) in half-precision, in addition to also being faster. + /// \param arg function argument + /// \return reciprocal of square root of \a arg + /// \exception FE_INVALID for signaling NaN and negative arguments + /// \exception FE_INEXACT according to rounding + inline half rsqrt( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( detail::internal_t( 1 ) / std::sqrt( detail::half2float( arg.data_ ) ) ) ); +#else + unsigned int abs = arg.data_ & 0x7FFF, bias = 0x4000; + if ( !abs || arg.data_ >= 0x7C00 ) + return half( detail::binary, ( abs > 0x7C00 ) ? detail::signal( arg.data_ ) : ( arg.data_ > 0x8000 ) ? detail::invalid() : + !abs ? detail::pole( arg.data_ & 0x8000 ) : + 0 ); + for ( ; abs < 0x400; abs <<= 1, bias -= 0x400 ) + ; + unsigned int frac = ( abs += bias ) & 0x7FF; + if ( frac == 0x400 ) + return half( detail::binary, 0x7A00 - ( abs >> 1 ) ); + if ( ( half::round_style == std::round_to_nearest && ( frac == 0x3FE || frac == 0x76C ) ) || + ( half::round_style != std::round_to_nearest && ( frac == 0x15A || frac == 0x3FC || frac == 0x401 || frac == 0x402 || frac == 0x67B ) ) ) + return pow( arg, half( detail::binary, 0xB800 ) ); + detail::uint32 f = 0x17376 - abs, mx = ( abs & 0x3FF ) | 0x400, my = ( ( f >> 1 ) & 0x3FF ) | 0x400, mz = my * my; + int expy = ( f >> 11 ) - 31, expx = 32 - ( abs >> 10 ), i = mz >> 21; + for ( mz = 0x60000000 - ( ( ( mz >> i ) * mx ) >> ( expx - 2 * expy - i ) ); mz < 0x40000000; mz <<= 1, --expy ) + ; + i = ( my *= mz >> 10 ) >> 31; + expy += i; + my = ( my >> ( 20 + i ) ) + 1; + i = ( mz = my * my ) >> 21; + for ( mz = 0x60000000 - ( ( ( mz >> i ) * mx ) >> ( expx - 2 * expy - i ) ); mz < 0x40000000; mz <<= 1, --expy ) + ; + i = ( my *= ( mz >> 10 ) + 1 ) >> 31; + return half( detail::binary, detail::fixed2half( my >> i, expy + i + 14 ) ); +#endif + } + + /// Cubic root. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::cbrt](https://en.cppreference.com/w/cpp/numeric/math/cbrt). 
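rsqrt() above is documented as exactly rounded and faster than dividing by sqrt() in half precision; a sketch of the kind of use it targets (the helper is illustrative):

    #include "half.hpp"

    using half_float::half;

    // Example: normalise a 3-component vector with the exactly rounded rsqrt() above.
    // Note: the squared terms can overflow for large components; hypot() further below avoids that.
    void normalize( half v[3] )
    {
        half inv = rsqrt( v[0] * v[0] + v[1] * v[1] + v[2] * v[2] );
        v[0] = v[0] * inv;
        v[1] = v[1] * inv;
        v[2] = v[2] * inv;
    }
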
+ /// \param arg function argument + /// \return cubic root of \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_INEXACT according to rounding + inline half cbrt( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::cbrt( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp = -15; + if ( !abs || abs == 0x3C00 || abs >= 0x7C00 ) + return ( abs > 0x7C00 ) ? half( detail::binary, detail::signal( arg.data_ ) ) : arg; + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + detail::uint32 ilog = exp + ( abs >> 10 ), sign = detail::sign_mask( ilog ), f, m = ( ( ( ilog << 27 ) + ( detail::log2( static_cast( ( abs & 0x3FF ) | 0x400 ) << 20, 24 ) >> 4 ) ) ^ sign ) - sign; + for ( exp = 2; m < 0x80000000; m <<= 1, --exp ) + ; + m = detail::multiply64( m, 0xAAAAAAAB ); + int i = m >> 31, s; + exp += i; + m <<= 1 - i; + if ( exp < 0 ) + { + f = m >> -exp; + exp = 0; + } + else + { + f = ( m << exp ) & 0x7FFFFFFF; + exp = m >> ( 31 - exp ); + } + m = detail::exp2( f, ( half::round_style == std::round_to_nearest ) ? 29 : 26 ); + if ( sign ) + { + if ( m > 0x80000000 ) + { + m = detail::divide64( 0x80000000, m, s ); + ++exp; + } + exp = -exp; + } + return half( detail::binary, ( half::round_style == std::round_to_nearest ) ? + detail::fixed2half( m, exp + 14, arg.data_ & 0x8000 ) : + detail::fixed2half( ( m + 0x80 ) >> 8, exp + 14, arg.data_ & 0x8000 ) ); +#endif + } + + /// Hypotenuse function. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot). + /// \param x first argument + /// \param y second argument + /// \return square root of sum of squares without internal over- or underflows + /// \exception FE_INVALID if \a x or \a y is signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root + inline half hypot( half x, half y ) + { +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t fx = detail::half2float( x.data_ ), fy = detail::half2float( y.data_ ); +#if HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::hypot( fx, fy ) ) ); +#else + return half( detail::binary, detail::float2half( std::sqrt( fx * fx + fy * fy ) ) ); +#endif +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, expx = 0, expy = 0; + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + return half( detail::binary, ( absx == 0x7C00 ) ? detail::select( 0x7C00, y.data_ ) : + ( absy == 0x7C00 ) ? detail::select( 0x7C00, x.data_ ) : + detail::signal( x.data_, y.data_ ) ); + if ( !absx ) + return half( detail::binary, absy ? detail::check_underflow( absy ) : 0 ); + if ( !absy ) + return half( detail::binary, detail::check_underflow( absx ) ); + if ( absy > absx ) + std::swap( absx, absy ); + for ( ; absx < 0x400; absx <<= 1, --expx ) + ; + for ( ; absy < 0x400; absy <<= 1, --expy ) + ; + detail::uint32 mx = ( absx & 0x3FF ) | 0x400, my = ( absy & 0x3FF ) | 0x400; + mx *= mx; + my *= my; + int ix = mx >> 21, iy = my >> 21; + expx = 2 * ( expx + ( absx >> 10 ) ) - 15 + ix; + expy = 2 * ( expy + ( absy >> 10 ) ) - 15 + iy; + mx <<= 10 - ix; + my <<= 10 - iy; + int d = expx - expy; + my = ( d < 30 ) ? ( ( my >> d ) | ( ( my & ( ( static_cast( 1 ) << d ) - 1 ) ) != 0 ) ) : 1; + return half( detail::binary, detail::hypot_post( mx + my, expx ) ); +#endif + } + + /// Hypotenuse function. 
+ /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::hypot](https://en.cppreference.com/w/cpp/numeric/math/hypot). + /// \param x first argument + /// \param y second argument + /// \param z third argument + /// \return square root of sum of squares without internal over- or underflows + /// \exception FE_INVALID if \a x, \a y or \a z is signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding of the final square root + inline half hypot( half x, half y, half z ) + { +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t fx = detail::half2float( x.data_ ), fy = detail::half2float( y.data_ ), fz = detail::half2float( z.data_ ); + return half( detail::binary, detail::float2half( std::sqrt( fx * fx + fy * fy + fz * fz ) ) ); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, absz = z.data_ & 0x7FFF, expx = 0, expy = 0, expz = 0; + if ( !absx ) + return hypot( y, z ); + if ( !absy ) + return hypot( x, z ); + if ( !absz ) + return hypot( x, y ); + if ( absx >= 0x7C00 || absy >= 0x7C00 || absz >= 0x7C00 ) + return half( detail::binary, ( absx == 0x7C00 ) ? detail::select( 0x7C00, detail::select( y.data_, z.data_ ) ) : + ( absy == 0x7C00 ) ? detail::select( 0x7C00, detail::select( x.data_, z.data_ ) ) : + ( absz == 0x7C00 ) ? detail::select( 0x7C00, detail::select( x.data_, y.data_ ) ) : + detail::signal( x.data_, y.data_, z.data_ ) ); + if ( absz > absy ) + std::swap( absy, absz ); + if ( absy > absx ) + std::swap( absx, absy ); + if ( absz > absy ) + std::swap( absy, absz ); + for ( ; absx < 0x400; absx <<= 1, --expx ) + ; + for ( ; absy < 0x400; absy <<= 1, --expy ) + ; + for ( ; absz < 0x400; absz <<= 1, --expz ) + ; + detail::uint32 mx = ( absx & 0x3FF ) | 0x400, my = ( absy & 0x3FF ) | 0x400, mz = ( absz & 0x3FF ) | 0x400; + mx *= mx; + my *= my; + mz *= mz; + int ix = mx >> 21, iy = my >> 21, iz = mz >> 21; + expx = 2 * ( expx + ( absx >> 10 ) ) - 15 + ix; + expy = 2 * ( expy + ( absy >> 10 ) ) - 15 + iy; + expz = 2 * ( expz + ( absz >> 10 ) ) - 15 + iz; + mx <<= 10 - ix; + my <<= 10 - iy; + mz <<= 10 - iz; + int d = expy - expz; + mz = ( d < 30 ) ? ( ( mz >> d ) | ( ( mz & ( ( static_cast( 1 ) << d ) - 1 ) ) != 0 ) ) : 1; + my += mz; + if ( my & 0x80000000 ) + { + my = ( my >> 1 ) | ( my & 1 ); + if ( ++expy > expx ) + { + std::swap( mx, my ); + std::swap( expx, expy ); + } + } + d = expx - expy; + my = ( d < 30 ) ? ( ( my >> d ) | ( ( my & ( ( static_cast( 1 ) << d ) - 1 ) ) != 0 ) ) : 1; + return half( detail::binary, detail::hypot_post( mx + my, expx ) ); +#endif + } + + /// Power function. + /// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in ~0.00025% of inputs. + /// + /// **See also:** Documentation for [std::pow](https://en.cppreference.com/w/cpp/numeric/math/pow). 
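The "without internal over- or underflows" guarantee of the hypot() overloads above matters in half precision, where squares already overflow for magnitudes above roughly 255; an illustrative sketch:

    #include "half.hpp"

    using half_float::half;

    half length_2d()
    {
        half x( 300.0f ), y( 400.0f );
        // x * x alone already overflows half (the largest finite value is 65504),
        // yet hypot() reaches the exact 500.0 without internal overflow.
        return hypot( x, y );
    }
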
+ /// \param x base + /// \param y exponent + /// \return \a x raised to \a y + /// \exception FE_INVALID if \a x or \a y is signaling NaN or if \a x is finite an negative and \a y is finite and not integral + /// \exception FE_DIVBYZERO if \a x is 0 and \a y is negative + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half pow( half x, half y ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::pow( detail::half2float( x.data_ ), detail::half2float( y.data_ ) ) ) ); +#else + int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, exp = -15; + if ( !absy || x.data_ == 0x3C00 ) + return half( detail::binary, detail::select( 0x3C00, ( x.data_ == 0x3C00 ) ? y.data_ : x.data_ ) ); + bool is_int = absy >= 0x6400 || ( absy >= 0x3C00 && !( absy & ( ( 1 << ( 25 - ( absy >> 10 ) ) ) - 1 ) ) ); + unsigned int sign = x.data_ & ( static_cast( ( absy < 0x6800 ) && is_int && ( ( absy >> ( 25 - ( absy >> 10 ) ) ) & 1 ) ) << 15 ); + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + return half( detail::binary, ( absx > 0x7C00 || absy > 0x7C00 ) ? detail::signal( x.data_, y.data_ ) : + ( absy == 0x7C00 ) ? ( ( absx == 0x3C00 ) ? 0x3C00 : ( !absx && y.data_ == 0xFC00 ) ? detail::pole() : + ( 0x7C00 & -( ( y.data_ >> 15 ) ^ ( absx > 0x3C00 ) ) ) ) : + ( sign | ( 0x7C00 & ( ( y.data_ >> 15 ) - 1U ) ) ) ); + if ( !absx ) + return half( detail::binary, ( y.data_ & 0x8000 ) ? detail::pole( sign ) : sign ); + if ( ( x.data_ & 0x8000 ) && !is_int ) + return half( detail::binary, detail::invalid() ); + if ( x.data_ == 0xBC00 ) + return half( detail::binary, sign | 0x3C00 ); + switch ( y.data_ ) + { + case 0x3800: + return sqrt( x ); + case 0x3C00: + return half( detail::binary, detail::check_underflow( x.data_ ) ); + case 0x4000: + return x * x; + case 0xBC00: + return half( detail::binary, 0x3C00 ) / x; + } + for ( ; absx < 0x400; absx <<= 1, --exp ) + ; + detail::uint32 ilog = exp + ( absx >> 10 ), msign = detail::sign_mask( ilog ), f, m = ( ( ( ilog << 27 ) + ( ( detail::log2( static_cast( ( absx & 0x3FF ) | 0x400 ) << 20 ) + 8 ) >> 4 ) ) ^ msign ) - msign; + for ( exp = -11; m < 0x80000000; m <<= 1, --exp ) + ; + for ( ; absy < 0x400; absy <<= 1, --exp ) + ; + m = detail::multiply64( m, static_cast( ( absy & 0x3FF ) | 0x400 ) << 21 ); + int i = m >> 31; + exp += ( absy >> 10 ) + i; + m <<= 1 - i; + if ( exp < 0 ) + { + f = m >> -exp; + exp = 0; + } + else + { + f = ( m << exp ) & 0x7FFFFFFF; + exp = m >> ( 31 - exp ); + } + return half( detail::binary, detail::exp2_post( f, exp, ( ( msign & 1 ) ^ ( y.data_ >> 15 ) ) != 0, sign ) ); +#endif + } + + /// \} + /// \anchor trigonometric + /// \name Trigonometric functions + /// \{ + + /// Compute sine and cosine simultaneously. + /// This returns the same results as sin() and cos() but is faster than calling each function individually. + /// + /// This function is exact to rounding for all rounding modes. 
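A typical use of the half-precision pow() above; the gamma-decode constant is only an example:

    #include "half.hpp"

    using half_float::half;

    // Example: decode a gamma-encoded channel value with the half-precision pow() above.
    half linearize( half encoded )
    {
        return pow( encoded, half( 2.2f ) );
    }
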
+ /// \param arg function argument + /// \param sin variable to take sine of \a arg + /// \param cos variable to take cosine of \a arg + /// \exception FE_INVALID for signaling NaN or infinity + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline void sincos( half arg, half *sin, half *cos ) + { +#ifdef HALF_ARITHMETIC_TYPE + detail::internal_t f = detail::half2float( arg.data_ ); + *sin = half( detail::binary, detail::float2half( std::sin( f ) ) ); + *cos = half( detail::binary, detail::float2half( std::cos( f ) ) ); +#else + int abs = arg.data_ & 0x7FFF, sign = arg.data_ >> 15, k; + if ( abs >= 0x7C00 ) + *sin = *cos = half( detail::binary, ( abs == 0x7C00 ) ? detail::invalid() : detail::signal( arg.data_ ) ); + else if ( !abs ) + { + *sin = arg; + *cos = half( detail::binary, 0x3C00 ); + } + else if ( abs < 0x2500 ) + { + *sin = half( detail::binary, detail::rounded( arg.data_ - 1, 1, 1 ) ); + *cos = half( detail::binary, detail::rounded( 0x3BFF, 1, 1 ) ); + } + else + { + if ( half::round_style != std::round_to_nearest ) + { + switch ( abs ) + { + case 0x48B7: + *sin = half( detail::binary, detail::rounded( ( ~arg.data_ & 0x8000 ) | 0x1D07, 1, 1 ) ); + *cos = half( detail::binary, detail::rounded( 0xBBFF, 1, 1 ) ); + return; + case 0x598C: + *sin = half( detail::binary, detail::rounded( ( arg.data_ & 0x8000 ) | 0x3BFF, 1, 1 ) ); + *cos = half( detail::binary, detail::rounded( 0x80FC, 1, 1 ) ); + return; + case 0x6A64: + *sin = half( detail::binary, detail::rounded( ( ~arg.data_ & 0x8000 ) | 0x3BFE, 1, 1 ) ); + *cos = half( detail::binary, detail::rounded( 0x27FF, 1, 1 ) ); + return; + case 0x6D8C: + *sin = half( detail::binary, detail::rounded( ( arg.data_ & 0x8000 ) | 0x0FE6, 1, 1 ) ); + *cos = half( detail::binary, detail::rounded( 0x3BFF, 1, 1 ) ); + return; + } + } + std::pair sc = detail::sincos( detail::angle_arg( abs, k ), 28 ); + switch ( k & 3 ) + { + case 1: + sc = std::make_pair( sc.second, -sc.first ); + break; + case 2: + sc = std::make_pair( -sc.first, -sc.second ); + break; + case 3: + sc = std::make_pair( -sc.second, sc.first ); + break; + } + *sin = half( detail::binary, detail::fixed2half( ( sc.first ^ -static_cast( sign ) ) + sign ) ); + *cos = half( detail::binary, detail::fixed2half( sc.second ) ); + } +#endif + } + + /// Sine function. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::sin](https://en.cppreference.com/w/cpp/numeric/math/sin). + /// \param arg function argument + /// \return sine value of \a arg + /// \exception FE_INVALID for signaling NaN or infinity + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half sin( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::sin( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, k; + if ( !abs ) + return arg; + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? 
detail::invalid() : detail::signal( arg.data_ ) ); + if ( abs < 0x2900 ) + return half( detail::binary, detail::rounded( arg.data_ - 1, 1, 1 ) ); + if ( half::round_style != std::round_to_nearest ) + switch ( abs ) + { + case 0x48B7: + return half( detail::binary, detail::rounded( ( ~arg.data_ & 0x8000 ) | 0x1D07, 1, 1 ) ); + case 0x6A64: + return half( detail::binary, detail::rounded( ( ~arg.data_ & 0x8000 ) | 0x3BFE, 1, 1 ) ); + case 0x6D8C: + return half( detail::binary, detail::rounded( ( arg.data_ & 0x8000 ) | 0x0FE6, 1, 1 ) ); + } + std::pair sc = detail::sincos( detail::angle_arg( abs, k ), 28 ); + detail::uint32 sign = -static_cast( ( ( k >> 1 ) & 1 ) ^ ( arg.data_ >> 15 ) ); + return half( detail::binary, detail::fixed2half( ( ( ( k & 1 ) ? sc.second : sc.first ) ^ sign ) - sign ) ); +#endif + } + + /// Cosine function. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::cos](https://en.cppreference.com/w/cpp/numeric/math/cos). + /// \param arg function argument + /// \return cosine value of \a arg + /// \exception FE_INVALID for signaling NaN or infinity + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half cos( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::cos( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, k; + if ( !abs ) + return half( detail::binary, 0x3C00 ); + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? detail::invalid() : detail::signal( arg.data_ ) ); + if ( abs < 0x2500 ) + return half( detail::binary, detail::rounded( 0x3BFF, 1, 1 ) ); + if ( half::round_style != std::round_to_nearest && abs == 0x598C ) + return half( detail::binary, detail::rounded( 0x80FC, 1, 1 ) ); + std::pair sc = detail::sincos( detail::angle_arg( abs, k ), 28 ); + detail::uint32 sign = -static_cast( ( ( k >> 1 ) ^ k ) & 1 ); + return half( detail::binary, detail::fixed2half( ( ( ( k & 1 ) ? sc.first : sc.second ) ^ sign ) - sign ) ); +#endif + } + + /// Tangent function. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::tan](https://en.cppreference.com/w/cpp/numeric/math/tan). + /// \param arg function argument + /// \return tangent value of \a arg + /// \exception FE_INVALID for signaling NaN or infinity + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half tan( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::tan( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp = 13, k; + if ( !abs ) + return arg; + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? 
detail::invalid() : detail::signal( arg.data_ ) ); + if ( abs < 0x2700 ) + return half( detail::binary, detail::rounded( arg.data_, 0, 1 ) ); + if ( half::round_style != std::round_to_nearest ) + switch ( abs ) + { + case 0x658C: + return half( detail::binary, detail::rounded( ( arg.data_ & 0x8000 ) | 0x07E6, 1, 1 ) ); + case 0x7330: + return half( detail::binary, detail::rounded( ( ~arg.data_ & 0x8000 ) | 0x4B62, 1, 1 ) ); + } + std::pair sc = detail::sincos( detail::angle_arg( abs, k ), 30 ); + if ( k & 1 ) + sc = std::make_pair( -sc.second, sc.first ); + detail::uint32 signy = detail::sign_mask( sc.first ), signx = detail::sign_mask( sc.second ); + detail::uint32 my = ( sc.first ^ signy ) - signy, mx = ( sc.second ^ signx ) - signx; + for ( ; my < 0x80000000; my <<= 1, --exp ) + ; + for ( ; mx < 0x80000000; mx <<= 1, ++exp ) + ; + return half( detail::binary, detail::tangent_post( my, mx, exp, ( signy ^ signx ^ arg.data_ ) & 0x8000 ) ); +#endif + } + + /// Arc sine. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::asin](https://en.cppreference.com/w/cpp/numeric/math/asin). + /// \param arg function argument + /// \return arc sine value of \a arg + /// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half asin( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::asin( detail::half2float( arg.data_ ) ) ) ); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if ( !abs ) + return arg; + if ( abs >= 0x3C00 ) + return half( detail::binary, ( abs > 0x7C00 ) ? detail::signal( arg.data_ ) : ( abs > 0x3C00 ) ? detail::invalid() : + detail::rounded( sign | 0x3E48, 0, 1 ) ); + if ( abs < 0x2900 ) + return half( detail::binary, detail::rounded( arg.data_, 0, 1 ) ); + if ( half::round_style != std::round_to_nearest && ( abs == 0x2B44 || abs == 0x2DC3 ) ) + return half( detail::binary, detail::rounded( arg.data_ + 1, 1, 1 ) ); + std::pair sc = detail::atan2_args( abs ); + detail::uint32 m = detail::atan2( sc.first, sc.second, ( half::round_style == std::round_to_nearest ) ? 27 : 26 ); + return half( detail::binary, detail::fixed2half( m, 14, sign ) ); +#endif + } + + /// Arc cosine function. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::acos](https://en.cppreference.com/w/cpp/numeric/math/acos). + /// \param arg function argument + /// \return arc cosine value of \a arg + /// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half acos( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::acos( detail::half2float( arg.data_ ) ) ) ); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ >> 15; + if ( !abs ) + return half( detail::binary, detail::rounded( 0x3E48, 0, 1 ) ); + if ( abs >= 0x3C00 ) + return half( detail::binary, ( abs > 0x7C00 ) ? detail::signal( arg.data_ ) : ( abs > 0x3C00 ) ? detail::invalid() : + sign ? detail::rounded( 0x4248, 0, 1 ) : + 0 ); + std::pair cs = detail::atan2_args( abs ); + detail::uint32 m = detail::atan2( cs.second, cs.first, 28 ); + return half( detail::binary, detail::fixed2half( sign ? ( 0xC90FDAA2 - m ) : m, 15, 0, sign ) ); +#endif + } + + /// Arc tangent function. 
+ /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::atan](https://en.cppreference.com/w/cpp/numeric/math/atan). + /// \param arg function argument + /// \return arc tangent value of \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half atan( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::atan( detail::half2float( arg.data_ ) ) ) ); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if ( !abs ) + return arg; + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? detail::rounded( sign | 0x3E48, 0, 1 ) : detail::signal( arg.data_ ) ); + if ( abs <= 0x2700 ) + return half( detail::binary, detail::rounded( arg.data_ - 1, 1, 1 ) ); + int exp = ( abs >> 10 ) + ( abs <= 0x3FF ); + detail::uint32 my = ( abs & 0x3FF ) | ( ( abs > 0x3FF ) << 10 ); + detail::uint32 m = ( exp > 15 ) ? detail::atan2( my << 19, 0x20000000 >> ( exp - 15 ), ( half::round_style == std::round_to_nearest ) ? 26 : 24 ) : + detail::atan2( my << ( exp + 4 ), 0x20000000, ( half::round_style == std::round_to_nearest ) ? 30 : 28 ); + return half( detail::binary, detail::fixed2half( m, 14, sign ) ); +#endif + } + + /// Arc tangent function. + /// This function may be 1 ULP off the correctly rounded exact result in ~0.005% of inputs for `std::round_to_nearest`, + /// in ~0.1% of inputs for `std::round_toward_zero` and in ~0.02% of inputs for any other rounding mode. + /// + /// **See also:** Documentation for [std::atan2](https://en.cppreference.com/w/cpp/numeric/math/atan2). + /// \param y numerator + /// \param x denominator + /// \return arc tangent value + /// \exception FE_INVALID if \a x or \a y is signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half atan2( half y, half x ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::atan2( detail::half2float( y.data_ ), detail::half2float( x.data_ ) ) ) ); +#else + unsigned int absx = x.data_ & 0x7FFF, absy = y.data_ & 0x7FFF, signx = x.data_ >> 15, signy = y.data_ & 0x8000; + if ( absx >= 0x7C00 || absy >= 0x7C00 ) + { + if ( absx > 0x7C00 || absy > 0x7C00 ) + return half( detail::binary, detail::signal( x.data_, y.data_ ) ); + if ( absy == 0x7C00 ) + return half( detail::binary, ( absx < 0x7C00 ) ? detail::rounded( signy | 0x3E48, 0, 1 ) : + signx ? detail::rounded( signy | 0x40B6, 0, 1 ) : + detail::rounded( signy | 0x3A48, 0, 1 ) ); + return ( x.data_ == 0x7C00 ) ? half( detail::binary, signy ) : half( detail::binary, detail::rounded( signy | 0x4248, 0, 1 ) ); + } + if ( !absy ) + return signx ? half( detail::binary, detail::rounded( signy | 0x4248, 0, 1 ) ) : y; + if ( !absx ) + return half( detail::binary, detail::rounded( signy | 0x3E48, 0, 1 ) ); + int d = ( absy >> 10 ) + ( absy <= 0x3FF ) - ( absx >> 10 ) - ( absx <= 0x3FF ); + if ( d > ( signx ? 18 : 12 ) ) + return half( detail::binary, detail::rounded( signy | 0x3E48, 0, 1 ) ); + if ( signx && d < -11 ) + return half( detail::binary, detail::rounded( signy | 0x4248, 0, 1 ) ); + if ( !signx && d < ( ( half::round_style == std::round_toward_zero ) ? 
-15 : -9 ) ) + { + for ( ; absy < 0x400; absy <<= 1, --d ) + ; + detail::uint32 mx = ( ( absx << 1 ) & 0x7FF ) | 0x800, my = ( ( absy << 1 ) & 0x7FF ) | 0x800; + int i = my < mx; + d -= i; + if ( d < -25 ) + return half( detail::binary, detail::underflow( signy ) ); + my <<= 11 + i; + return half( detail::binary, detail::fixed2half( my / mx, d + 14, signy, my % mx != 0 ) ); + } + detail::uint32 m = detail::atan2( ( ( absy & 0x3FF ) | ( ( absy > 0x3FF ) << 10 ) ) << ( 19 + ( ( d < 0 ) ? d : ( d > 0 ) ? 0 : + -1 ) ), + ( ( absx & 0x3FF ) | ( ( absx > 0x3FF ) << 10 ) ) << ( 19 - ( ( d > 0 ) ? d : ( d < 0 ) ? 0 : + 1 ) ) ); + return half( detail::binary, detail::fixed2half( signx ? ( 0xC90FDAA2 - m ) : m, 15, signy, signx ) ); +#endif + } + + /// \} + /// \anchor hyperbolic + /// \name Hyperbolic functions + /// \{ + + /// Hyperbolic sine. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::sinh](https://en.cppreference.com/w/cpp/numeric/math/sinh). + /// \param arg function argument + /// \return hyperbolic sine value of \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half sinh( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::sinh( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp; + if ( !abs || abs >= 0x7C00 ) + return ( abs > 0x7C00 ) ? half( detail::binary, detail::signal( arg.data_ ) ) : arg; + if ( abs <= 0x2900 ) + return half( detail::binary, detail::rounded( arg.data_, 0, 1 ) ); + std::pair mm = detail::hyperbolic_args( abs, exp, ( half::round_style == std::round_to_nearest ) ? 29 : 27 ); + detail::uint32 m = mm.first - mm.second; + for ( exp += 13; m < 0x80000000 && exp; m <<= 1, --exp ) + ; + unsigned int sign = arg.data_ & 0x8000; + if ( exp > 29 ) + return half( detail::binary, detail::overflow( sign ) ); + return half( detail::binary, detail::fixed2half( m, exp, sign ) ); +#endif + } + + /// Hyperbolic cosine. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::cosh](https://en.cppreference.com/w/cpp/numeric/math/cosh). + /// \param arg function argument + /// \return hyperbolic cosine value of \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half cosh( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::cosh( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp; + if ( !abs ) + return half( detail::binary, 0x3C00 ); + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs > 0x7C00 ) ? detail::signal( arg.data_ ) : 0x7C00 ); + std::pair mm = detail::hyperbolic_args( abs, exp, ( half::round_style == std::round_to_nearest ) ? 23 : 26 ); + detail::uint32 m = mm.first + mm.second, i = ( ~m & 0xFFFFFFFF ) >> 31; + m = ( m >> i ) | ( m & i ) | 0x80000000; + if ( ( exp += 13 + i ) > 29 ) + return half( detail::binary, detail::overflow() ); + return half( detail::binary, detail::fixed2half( m, exp ) ); +#endif + } + + /// Hyperbolic tangent. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::tanh](https://en.cppreference.com/w/cpp/numeric/math/tanh). 
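Unlike forming atan( y / x ), the atan2() completed above resolves the quadrant from both signs and never has to build a possibly overflowing quotient; a minimal sketch:

    #include "half.hpp"

    using half_float::half;

    // Example: quadrant-correct angle of a direction vector; plain atan( dy / dx )
    // would lose the quadrant and can overflow or divide by zero when forming dy / dx.
    half heading( half dy, half dx )
    {
        return atan2( dy, dx );
    }
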
+ /// \param arg function argument + /// \return hyperbolic tangent value of \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half tanh( half arg ) + { +#ifdef HALF_ARITHMETIC_TYPE + return half( detail::binary, detail::float2half( std::tanh( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp; + if ( !abs ) + return arg; + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs > 0x7C00 ) ? detail::signal( arg.data_ ) : ( arg.data_ - 0x4000 ) ); + if ( abs >= 0x4500 ) + return half( detail::binary, detail::rounded( ( arg.data_ & 0x8000 ) | 0x3BFF, 1, 1 ) ); + if ( abs < 0x2700 ) + return half( detail::binary, detail::rounded( arg.data_ - 1, 1, 1 ) ); + if ( half::round_style != std::round_to_nearest && abs == 0x2D3F ) + return half( detail::binary, detail::rounded( arg.data_ - 3, 0, 1 ) ); + std::pair mm = detail::hyperbolic_args( abs, exp, 27 ); + detail::uint32 my = mm.first - mm.second - ( half::round_style != std::round_to_nearest ), mx = mm.first + mm.second, i = ( ~mx & 0xFFFFFFFF ) >> 31; + for ( exp = 13; my < 0x80000000; my <<= 1, --exp ) + ; + mx = ( mx >> i ) | 0x80000000; + return half( detail::binary, detail::tangent_post( my, mx, exp - i, arg.data_ & 0x8000 ) ); +#endif + } + + /// Hyperbolic area sine. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::asinh](https://en.cppreference.com/w/cpp/numeric/math/asinh). + /// \param arg function argument + /// \return area sine value of \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half asinh( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::asinh( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF; + if ( !abs || abs >= 0x7C00 ) + return ( abs > 0x7C00 ) ? half( detail::binary, detail::signal( arg.data_ ) ) : arg; + if ( abs <= 0x2900 ) + return half( detail::binary, detail::rounded( arg.data_ - 1, 1, 1 ) ); + if ( half::round_style != std::round_to_nearest ) + switch ( abs ) + { + case 0x32D4: + return half( detail::binary, detail::rounded( arg.data_ - 13, 1, 1 ) ); + case 0x3B5B: + return half( detail::binary, detail::rounded( arg.data_ - 197, 1, 1 ) ); + } + return half( detail::binary, detail::area( arg.data_ ) ); +#endif + } + + /// Hyperbolic area cosine. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::acosh](https://en.cppreference.com/w/cpp/numeric/math/acosh). + /// \param arg function argument + /// \return area cosine value of \a arg + /// \exception FE_INVALID for signaling NaN or arguments <1 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half acosh( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::acosh( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF; + if ( ( arg.data_ & 0x8000 ) || abs < 0x3C00 ) + return half( detail::binary, ( abs <= 0x7C00 ) ? detail::invalid() : detail::signal( arg.data_ ) ); + if ( abs == 0x3C00 ) + return half( detail::binary, 0 ); + if ( arg.data_ >= 0x7C00 ) + return ( abs > 0x7C00 ) ? 
half( detail::binary, detail::signal( arg.data_ ) ) : arg; + return half( detail::binary, detail::area( arg.data_ ) ); +#endif + } + + /// Hyperbolic area tangent. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::atanh](https://en.cppreference.com/w/cpp/numeric/math/atanh). + /// \param arg function argument + /// \return area tangent value of \a arg + /// \exception FE_INVALID for signaling NaN or if abs(\a arg) > 1 + /// \exception FE_DIVBYZERO for +/-1 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half atanh( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::atanh( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF, exp = 0; + if ( !abs ) + return arg; + if ( abs >= 0x3C00 ) + return half( detail::binary, ( abs == 0x3C00 ) ? detail::pole( arg.data_ & 0x8000 ) : ( abs <= 0x7C00 ) ? detail::invalid() : + detail::signal( arg.data_ ) ); + if ( abs < 0x2700 ) + return half( detail::binary, detail::rounded( arg.data_, 0, 1 ) ); + detail::uint32 m = static_cast( ( abs & 0x3FF ) | ( ( abs > 0x3FF ) << 10 ) ) << ( ( abs >> 10 ) + ( abs <= 0x3FF ) + 6 ), my = 0x80000000 + m, mx = 0x80000000 - m; + for ( ; mx < 0x80000000; mx <<= 1, ++exp ) + ; + int i = my >= mx, s; + return half( detail::binary, detail::log2_post( detail::log2( + ( detail::divide64( my >> i, mx, s ) + 1 ) >> 1, 27 ) + + 0x10, + exp + i - 1, 16, arg.data_ & 0x8000 ) ); +#endif + } + + /// \} + /// \anchor special + /// \name Error and gamma functions + /// \{ + + /// Error function. + /// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% of inputs. + /// + /// **See also:** Documentation for [std::erf](https://en.cppreference.com/w/cpp/numeric/math/erf). + /// \param arg function argument + /// \return error function value of \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half erf( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::erf( detail::half2float( arg.data_ ) ) ) ); +#else + unsigned int abs = arg.data_ & 0x7FFF; + if ( !abs || abs >= 0x7C00 ) + return ( abs >= 0x7C00 ) ? half( detail::binary, ( abs == 0x7C00 ) ? ( arg.data_ - 0x4000 ) : detail::signal( arg.data_ ) ) : arg; + if ( abs >= 0x4200 ) + return half( detail::binary, detail::rounded( ( arg.data_ & 0x8000 ) | 0x3BFF, 1, 1 ) ); + return half( detail::binary, detail::erf( arg.data_ ) ); +#endif + } + + /// Complementary error function. + /// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.5% of inputs. + /// + /// **See also:** Documentation for [std::erfc](https://en.cppreference.com/w/cpp/numeric/math/erfc). + /// \param arg function argument + /// \return 1 minus error function value of \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half erfc( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::erfc( detail::half2float( arg.data_ ) ) ) ); +#else + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if ( abs >= 0x7C00 ) + return ( abs >= 0x7C00 ) ? half( detail::binary, ( abs == 0x7C00 ) ? 
( sign >> 1 ) : detail::signal( arg.data_ ) ) : arg; + if ( !abs ) + return half( detail::binary, 0x3C00 ); + if ( abs >= 0x4400 ) + return half( detail::binary, detail::rounded( ( sign >> 1 ) - ( sign >> 15 ), sign >> 15, 1 ) ); + return half( detail::binary, detail::erf( arg.data_ ) ); +#endif + } + + /// Natural logarithm of gamma function. + /// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in ~0.025% of inputs. + /// + /// **See also:** Documentation for [std::lgamma](https://en.cppreference.com/w/cpp/numeric/math/lgamma). + /// \param arg function argument + /// \return natural logarith of gamma function for \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_DIVBYZERO for 0 or negative integer arguments + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half lgamma( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::lgamma( detail::half2float( arg.data_ ) ) ) ); +#else + int abs = arg.data_ & 0x7FFF; + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? 0x7C00 : detail::signal( arg.data_ ) ); + if ( !abs || arg.data_ >= 0xE400 || ( arg.data_ >= 0xBC00 && !( abs & ( ( 1 << ( 25 - ( abs >> 10 ) ) ) - 1 ) ) ) ) + return half( detail::binary, detail::pole() ); + if ( arg.data_ == 0x3C00 || arg.data_ == 0x4000 ) + return half( detail::binary, 0 ); + return half( detail::binary, detail::gamma( arg.data_ ) ); +#endif + } + + /// Gamma function. + /// This function may be 1 ULP off the correctly rounded exact result for any rounding mode in <0.25% of inputs. + /// + /// **See also:** Documentation for [std::tgamma](https://en.cppreference.com/w/cpp/numeric/math/tgamma). + /// \param arg function argument + /// \return gamma function value of \a arg + /// \exception FE_INVALID for signaling NaN, negative infinity or negative integer arguments + /// \exception FE_DIVBYZERO for 0 + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half tgamma( half arg ) + { +#if defined( HALF_ARITHMETIC_TYPE ) && HALF_ENABLE_CPP11_CMATH + return half( detail::binary, detail::float2half( std::tgamma( detail::half2float( arg.data_ ) ) ) ); +#else + unsigned int abs = arg.data_ & 0x7FFF; + if ( !abs ) + return half( detail::binary, detail::pole( arg.data_ ) ); + if ( abs >= 0x7C00 ) + return ( arg.data_ == 0x7C00 ) ? arg : half( detail::binary, detail::signal( arg.data_ ) ); + if ( arg.data_ >= 0xE400 || ( arg.data_ >= 0xBC00 && !( abs & ( ( 1 << ( 25 - ( abs >> 10 ) ) ) - 1 ) ) ) ) + return half( detail::binary, detail::invalid() ); + if ( arg.data_ >= 0xCA80 ) + return half( detail::binary, detail::underflow( ( 1 - ( ( abs >> ( 25 - ( abs >> 10 ) ) ) & 1 ) ) << 15 ) ); + if ( arg.data_ <= 0x100 || ( arg.data_ >= 0x4900 && arg.data_ < 0x8000 ) ) + return half( detail::binary, detail::overflow() ); + if ( arg.data_ == 0x3C00 ) + return arg; + return half( detail::binary, detail::gamma( arg.data_ ) ); +#endif + } + + /// \} + /// \anchor rounding + /// \name Rounding + /// \{ + + /// Nearest integer not less than half value. + /// **See also:** Documentation for [std::ceil](https://en.cppreference.com/w/cpp/numeric/math/ceil). 
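Because half tops out at 65504, tgamma() above overflows for arguments of roughly 10 and up, so lgamma() is the practical choice for larger inputs; a sketch with an illustrative helper:

    #include "half.hpp"

    using half_float::half;

    // Example: log(n!) via lgamma(); n! itself overflows half for n >= 9 (9! = 362880 > 65504).
    half log_factorial( int n )
    {
        return lgamma( half( static_cast<float>( n + 1 ) ) );
    }
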
+ /// \param arg half to round + /// \return nearest integer not less than \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_INEXACT if value had to be rounded + inline half ceil( half arg ) + { + return half( detail::binary, detail::integral( arg.data_ ) ); + } + + /// Nearest integer not greater than half value. + /// **See also:** Documentation for [std::floor](https://en.cppreference.com/w/cpp/numeric/math/floor). + /// \param arg half to round + /// \return nearest integer not greater than \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_INEXACT if value had to be rounded + inline half floor( half arg ) + { + return half( detail::binary, detail::integral( arg.data_ ) ); + } + + /// Nearest integer not greater in magnitude than half value. + /// **See also:** Documentation for [std::trunc](https://en.cppreference.com/w/cpp/numeric/math/trunc). + /// \param arg half to round + /// \return nearest integer not greater in magnitude than \a arg + /// \exception FE_INVALID for signaling NaN + /// \exception FE_INEXACT if value had to be rounded + inline half trunc( half arg ) + { + return half( detail::binary, detail::integral( arg.data_ ) ); + } + + /// Nearest integer. + /// **See also:** Documentation for [std::round](https://en.cppreference.com/w/cpp/numeric/math/round). + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases + /// \exception FE_INVALID for signaling NaN + /// \exception FE_INEXACT if value had to be rounded + inline half round( half arg ) + { + return half( detail::binary, detail::integral( arg.data_ ) ); + } + + /// Nearest integer. + /// **See also:** Documentation for [std::lround](https://en.cppreference.com/w/cpp/numeric/math/round). + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases + /// \exception FE_INVALID if value is not representable as `long` + inline long lround( half arg ) + { + return detail::half2int( arg.data_ ); + } + + /// Nearest integer using half's internal rounding mode. + /// **See also:** Documentation for [std::rint](https://en.cppreference.com/w/cpp/numeric/math/rint). + /// \param arg half expression to round + /// \return nearest integer using default rounding mode + /// \exception FE_INVALID for signaling NaN + /// \exception FE_INEXACT if value had to be rounded + inline half rint( half arg ) + { + return half( detail::binary, detail::integral( arg.data_ ) ); + } + + /// Nearest integer using half's internal rounding mode. + /// **See also:** Documentation for [std::lrint](https://en.cppreference.com/w/cpp/numeric/math/rint). + /// \param arg half expression to round + /// \return nearest integer using default rounding mode + /// \exception FE_INVALID if value is not representable as `long` + /// \exception FE_INEXACT if value had to be rounded + inline long lrint( half arg ) + { + return detail::half2int( arg.data_ ); + } + + /// Nearest integer using half's internal rounding mode. + /// **See also:** Documentation for [std::nearbyint](https://en.cppreference.com/w/cpp/numeric/math/nearbyint). + /// \param arg half expression to round + /// \return nearest integer using default rounding mode + /// \exception FE_INVALID for signaling NaN + inline half nearbyint( half arg ) + { + return half( detail::binary, detail::integral( arg.data_ ) ); + } +#if HALF_ENABLE_CPP11_LONG_LONG + /// Nearest integer. 
+ /// **See also:** Documentation for [std::llround](https://en.cppreference.com/w/cpp/numeric/math/round). + /// \param arg half to round + /// \return nearest integer, rounded away from zero in half-way cases + /// \exception FE_INVALID if value is not representable as `long long` + inline long long llround( half arg ) + { + return detail::half2int( arg.data_ ); + } + + /// Nearest integer using half's internal rounding mode. + /// **See also:** Documentation for [std::llrint](https://en.cppreference.com/w/cpp/numeric/math/rint). + /// \param arg half expression to round + /// \return nearest integer using default rounding mode + /// \exception FE_INVALID if value is not representable as `long long` + /// \exception FE_INEXACT if value had to be rounded + inline long long llrint( half arg ) + { + return detail::half2int( arg.data_ ); + } +#endif + + /// \} + /// \anchor float + /// \name Floating point manipulation + /// \{ + + /// Decompress floating-point number. + /// **See also:** Documentation for [std::frexp](https://en.cppreference.com/w/cpp/numeric/math/frexp). + /// \param arg number to decompress + /// \param exp address to store exponent at + /// \return significant in range [0.5, 1) + /// \exception FE_INVALID for signaling NaN + inline half frexp( half arg, int *exp ) + { + *exp = 0; + unsigned int abs = arg.data_ & 0x7FFF; + if ( abs >= 0x7C00 || !abs ) + return ( abs > 0x7C00 ) ? half( detail::binary, detail::signal( arg.data_ ) ) : arg; + for ( ; abs < 0x400; abs <<= 1, --*exp ) + ; + *exp += ( abs >> 10 ) - 14; + return half( detail::binary, ( arg.data_ & 0x8000 ) | 0x3800 | ( abs & 0x3FF ) ); + } + + /// Multiply by power of two. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::scalbln](https://en.cppreference.com/w/cpp/numeric/math/scalbn). + /// \param arg number to modify + /// \param exp power of two to multiply with + /// \return \a arg multplied by 2 raised to \a exp + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half scalbln( half arg, long exp ) + { + unsigned int abs = arg.data_ & 0x7FFF, sign = arg.data_ & 0x8000; + if ( abs >= 0x7C00 || !abs ) + return ( abs > 0x7C00 ) ? half( detail::binary, detail::signal( arg.data_ ) ) : arg; + for ( ; abs < 0x400; abs <<= 1, --exp ) + ; + exp += abs >> 10; + if ( exp > 30 ) + return half( detail::binary, detail::overflow( sign ) ); + else if ( exp < -10 ) + return half( detail::binary, detail::underflow( sign ) ); + else if ( exp > 0 ) + return half( detail::binary, sign | ( exp << 10 ) | ( abs & 0x3FF ) ); + unsigned int m = ( abs & 0x3FF ) | 0x400; + return half( detail::binary, detail::rounded( sign | ( m >> ( 1 - exp ) ), ( m >> -exp ) & 1, ( m & ( ( 1 << -exp ) - 1 ) ) != 0 ) ); + } + + /// Multiply by power of two. + /// This function is exact to rounding for all rounding modes. + /// + /// **See also:** Documentation for [std::scalbn](https://en.cppreference.com/w/cpp/numeric/math/scalbn). + /// \param arg number to modify + /// \param exp power of two to multiply with + /// \return \a arg multplied by 2 raised to \a exp + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half scalbn( half arg, int exp ) + { + return scalbln( arg, exp ); + } + + /// Multiply by power of two. + /// This function is exact to rounding for all rounding modes. 
+ /// + /// **See also:** Documentation for [std::ldexp](https://en.cppreference.com/w/cpp/numeric/math/ldexp). + /// \param arg number to modify + /// \param exp power of two to multiply with + /// \return \a arg multiplied by 2 raised to \a exp + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + inline half ldexp( half arg, int exp ) + { + return scalbln( arg, exp ); + } + + /// Extract integer and fractional parts. + /// **See also:** Documentation for [std::modf](https://en.cppreference.com/w/cpp/numeric/math/modf). + /// \param arg number to decompress + /// \param iptr address to store integer part at + /// \return fractional part + /// \exception FE_INVALID for signaling NaN + inline half modf( half arg, half *iptr ) + { + unsigned int abs = arg.data_ & 0x7FFF; + if ( abs > 0x7C00 ) + { + arg = half( detail::binary, detail::signal( arg.data_ ) ); + return *iptr = arg, arg; + } + if ( abs >= 0x6400 ) + return *iptr = arg, half( detail::binary, arg.data_ & 0x8000 ); + if ( abs < 0x3C00 ) + return iptr->data_ = arg.data_ & 0x8000, arg; + unsigned int exp = abs >> 10, mask = ( 1 << ( 25 - exp ) ) - 1, m = arg.data_ & mask; + iptr->data_ = arg.data_ & ~mask; + if ( !m ) + return half( detail::binary, arg.data_ & 0x8000 ); + for ( ; m < 0x400; m <<= 1, --exp ) + ; + return half( detail::binary, ( arg.data_ & 0x8000 ) | ( exp << 10 ) | ( m & 0x3FF ) ); + } + + /// Extract exponent. + /// **See also:** Documentation for [std::ilogb](https://en.cppreference.com/w/cpp/numeric/math/ilogb). + /// \param arg number to query + /// \return floating-point exponent + /// \retval FP_ILOGB0 for zero + /// \retval FP_ILOGBNAN for NaN + /// \retval INT_MAX for infinity + /// \exception FE_INVALID for 0 or infinite values + inline int ilogb( half arg ) + { + int abs = arg.data_ & 0x7FFF, exp; + if ( !abs || abs >= 0x7C00 ) + { + detail::raise( FE_INVALID ); + return !abs ? FP_ILOGB0 : ( abs == 0x7C00 ) ? INT_MAX : + FP_ILOGBNAN; + } + for ( exp = ( abs >> 10 ) - 15; abs < 0x200; abs <<= 1, --exp ) + ; + return exp; + } + + /// Extract exponent. + /// **See also:** Documentation for [std::logb](https://en.cppreference.com/w/cpp/numeric/math/logb). + /// \param arg number to query + /// \return floating-point exponent + /// \exception FE_INVALID for signaling NaN + /// \exception FE_DIVBYZERO for 0 + inline half logb( half arg ) + { + int abs = arg.data_ & 0x7FFF, exp; + if ( !abs ) + return half( detail::binary, detail::pole( 0x8000 ) ); + if ( abs >= 0x7C00 ) + return half( detail::binary, ( abs == 0x7C00 ) ? 0x7C00 : detail::signal( arg.data_ ) ); + for ( exp = ( abs >> 10 ) - 15; abs < 0x200; abs <<= 1, --exp ) + ; + unsigned int value = static_cast<unsigned>( exp < 0 ) << 15; + if ( exp ) + { + unsigned int m = std::abs( exp ) << 6; + for ( exp = 18; m < 0x400; m <<= 1, --exp ) + ; + value |= ( exp << 10 ) + m; + } + return half( detail::binary, value ); + } + + /// Next representable value. + /// **See also:** Documentation for [std::nextafter](https://en.cppreference.com/w/cpp/numeric/math/nextafter). 
+ /// \param from value to compute next representable value for + /// \param to direction towards which to compute next value + /// \return next representable value after \a from in direction towards \a to + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW for infinite result from finite argument + /// \exception FE_UNDERFLOW for subnormal result + inline half nextafter( half from, half to ) + { + int fabs = from.data_ & 0x7FFF, tabs = to.data_ & 0x7FFF; + if ( fabs > 0x7C00 || tabs > 0x7C00 ) + return half( detail::binary, detail::signal( from.data_, to.data_ ) ); + if ( from.data_ == to.data_ || !( fabs | tabs ) ) + return to; + if ( !fabs ) + { + detail::raise( FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT ); + return half( detail::binary, ( to.data_ & 0x8000 ) + 1 ); + } + unsigned int out = from.data_ + ( ( ( from.data_ >> 15 ) ^ static_cast<unsigned>( ( from.data_ ^ ( 0x8000 | ( 0x8000 - ( from.data_ >> 15 ) ) ) ) < ( to.data_ ^ ( 0x8000 | ( 0x8000 - ( to.data_ >> 15 ) ) ) ) ) ) << 1 ) - 1; + detail::raise( FE_OVERFLOW, fabs < 0x7C00 && ( out & 0x7C00 ) == 0x7C00 ); + detail::raise( FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && ( out & 0x7C00 ) < 0x400 ); + return half( detail::binary, out ); + } + + /// Next representable value. + /// **See also:** Documentation for [std::nexttoward](https://en.cppreference.com/w/cpp/numeric/math/nexttoward). + /// \param from value to compute next representable value for + /// \param to direction towards which to compute next value + /// \return next representable value after \a from in direction towards \a to + /// \exception FE_INVALID for signaling NaN + /// \exception FE_OVERFLOW for infinite result from finite argument + /// \exception FE_UNDERFLOW for subnormal result + inline half nexttoward( half from, long double to ) + { + int fabs = from.data_ & 0x7FFF; + if ( fabs > 0x7C00 ) + return half( detail::binary, detail::signal( from.data_ ) ); + long double lfrom = static_cast<long double>( from ); + if ( detail::builtin_isnan( to ) || lfrom == to ) + return half( static_cast<float>( to ) ); + if ( !fabs ) + { + detail::raise( FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT ); + return half( detail::binary, ( static_cast<unsigned>( detail::builtin_signbit( to ) ) << 15 ) + 1 ); + } + unsigned int out = from.data_ + ( ( ( from.data_ >> 15 ) ^ static_cast<unsigned>( lfrom < to ) ) << 1 ) - 1; + detail::raise( FE_OVERFLOW, ( out & 0x7FFF ) == 0x7C00 ); + detail::raise( FE_UNDERFLOW, !HALF_ERRHANDLING_UNDERFLOW_TO_INEXACT && ( out & 0x7FFF ) < 0x400 ); + return half( detail::binary, out ); + } + + /// Take sign. + /// **See also:** Documentation for [std::copysign](https://en.cppreference.com/w/cpp/numeric/math/copysign). + /// \param x value to change sign for + /// \param y value to take sign from + /// \return value equal to \a x in magnitude and to \a y in sign + inline HALF_CONSTEXPR half copysign( half x, half y ) + { + return half( detail::binary, x.data_ ^ ( ( x.data_ ^ y.data_ ) & 0x8000 ) ); + } + + /// \} + /// \anchor classification + /// \name Floating point classification + /// \{ + + /// Classify floating-point value. + /// **See also:** Documentation for [std::fpclassify](https://en.cppreference.com/w/cpp/numeric/math/fpclassify). 
+ /// \param arg number to classify + /// \retval FP_ZERO for positive and negative zero + /// \retval FP_SUBNORMAL for subnormal numbers + /// \retval FP_INFINITY for positive and negative infinity + /// \retval FP_NAN for NaNs + /// \retval FP_NORMAL for all other (normal) values + inline HALF_CONSTEXPR int fpclassify( half arg ) + { + return !( arg.data_ & 0x7FFF ) ? FP_ZERO : + ( ( arg.data_ & 0x7FFF ) < 0x400 ) ? FP_SUBNORMAL : + ( ( arg.data_ & 0x7FFF ) < 0x7C00 ) ? FP_NORMAL : + ( ( arg.data_ & 0x7FFF ) == 0x7C00 ) ? FP_INFINITE : + FP_NAN; + } + + /// Check if finite number. + /// **See also:** Documentation for [std::isfinite](https://en.cppreference.com/w/cpp/numeric/math/isfinite). + /// \param arg number to check + /// \retval true if neither infinity nor NaN + /// \retval false else + inline HALF_CONSTEXPR bool isfinite( half arg ) + { + return ( arg.data_ & 0x7C00 ) != 0x7C00; + } + + /// Check for infinity. + /// **See also:** Documentation for [std::isinf](https://en.cppreference.com/w/cpp/numeric/math/isinf). + /// \param arg number to check + /// \retval true for positive or negative infinity + /// \retval false else + inline HALF_CONSTEXPR bool isinf( half arg ) + { + return ( arg.data_ & 0x7FFF ) == 0x7C00; + } + + /// Check for NaN. + /// **See also:** Documentation for [std::isnan](https://en.cppreference.com/w/cpp/numeric/math/isnan). + /// \param arg number to check + /// \retval true for NaNs + /// \retval false else + inline HALF_CONSTEXPR bool isnan( half arg ) + { + return ( arg.data_ & 0x7FFF ) > 0x7C00; + } + + /// Check if normal number. + /// **See also:** Documentation for [std::isnormal](https://en.cppreference.com/w/cpp/numeric/math/isnormal). + /// \param arg number to check + /// \retval true if normal number + /// \retval false if either subnormal, zero, infinity or NaN + inline HALF_CONSTEXPR bool isnormal( half arg ) + { + return ( ( arg.data_ & 0x7C00 ) != 0 ) & ( ( arg.data_ & 0x7C00 ) != 0x7C00 ); + } + + /// Check sign. + /// **See also:** Documentation for [std::signbit](https://en.cppreference.com/w/cpp/numeric/math/signbit). + /// \param arg number to check + /// \retval true for negative number + /// \retval false for positive number + inline HALF_CONSTEXPR bool signbit( half arg ) + { + return ( arg.data_ & 0x8000 ) != 0; + } + + /// \} + /// \anchor compfunc + /// \name Comparison + /// \{ + + /// Quiet comparison for greater than. + /// **See also:** Documentation for [std::isgreater](https://en.cppreference.com/w/cpp/numeric/math/isgreater). + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater than \a y + /// \retval false else + inline HALF_CONSTEXPR bool isgreater( half x, half y ) + { + return ( ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) + ( x.data_ >> 15 ) ) > ( ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) + ( y.data_ >> 15 ) ) && !isnan( x ) && !isnan( y ); + } + + /// Quiet comparison for greater equal. + /// **See also:** Documentation for [std::isgreaterequal](https://en.cppreference.com/w/cpp/numeric/math/isgreaterequal). + /// \param x first operand + /// \param y second operand + /// \retval true if \a x greater equal \a y + /// \retval false else + inline HALF_CONSTEXPR bool isgreaterequal( half x, half y ) + { + return ( ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) + ( x.data_ >> 15 ) ) >= ( ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) + ( y.data_ >> 15 ) ) && !isnan( x ) && !isnan( y ); + } + + /// Quiet comparison for less than. 
+ /// **See also:** Documentation for [std::isless](https://en.cppreference.com/w/cpp/numeric/math/isless). + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less than \a y + /// \retval false else + inline HALF_CONSTEXPR bool isless( half x, half y ) + { + return ( ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) + ( x.data_ >> 15 ) ) < ( ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) + ( y.data_ >> 15 ) ) && !isnan( x ) && !isnan( y ); + } + + /// Quiet comparison for less equal. + /// **See also:** Documentation for [std::islessequal](https://en.cppreference.com/w/cpp/numeric/math/islessequal). + /// \param x first operand + /// \param y second operand + /// \retval true if \a x less equal \a y + /// \retval false else + inline HALF_CONSTEXPR bool islessequal( half x, half y ) + { + return ( ( x.data_ ^ ( 0x8000 | ( 0x8000 - ( x.data_ >> 15 ) ) ) ) + ( x.data_ >> 15 ) ) <= ( ( y.data_ ^ ( 0x8000 | ( 0x8000 - ( y.data_ >> 15 ) ) ) ) + ( y.data_ >> 15 ) ) && !isnan( x ) && !isnan( y ); + } + + /// Quiet comparison for less or greater. + /// **See also:** Documentation for [std::islessgreater](https://en.cppreference.com/w/cpp/numeric/math/islessgreater). + /// \param x first operand + /// \param y second operand + /// \retval true if either less or greater + /// \retval false else + inline HALF_CONSTEXPR bool islessgreater( half x, half y ) + { + return x.data_ != y.data_ && ( ( x.data_ | y.data_ ) & 0x7FFF ) && !isnan( x ) && !isnan( y ); + } + + /// Quiet check if unordered. + /// **See also:** Documentation for [std::isunordered](https://en.cppreference.com/w/cpp/numeric/math/isunordered). + /// \param x first operand + /// \param y second operand + /// \retval true if unordered (one or two NaN operands) + /// \retval false else + inline HALF_CONSTEXPR bool isunordered( half x, half y ) + { + return isnan( x ) || isnan( y ); + } + + /// \} + /// \anchor casting + /// \name Casting + /// \{ + + /// Cast to or from half-precision floating-point number. + /// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted + /// directly using the default rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. + /// + /// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types + /// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler + /// error and casting between [half](\ref half_float::half)s returns the argument unmodified. + /// \tparam T destination type (half or built-in arithmetic type) + /// \tparam U source type (half or built-in arithmetic type) + /// \param arg value to cast + /// \return \a arg converted to destination type + /// \exception FE_INVALID if \a T is integer type and result is not representable as \a T + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + template<typename T, typename U> + T half_cast( U arg ) + { + return detail::half_caster<T, U>::cast( arg ); + } + + /// Cast to or from half-precision floating-point number. + /// This casts between [half](\ref half_float::half) and any built-in arithmetic type. The values are converted + /// directly using the specified rounding mode, without any roundtrip over `float` that a `static_cast` would otherwise do. 
+ /// + /// Using this cast with neither of the two types being a [half](\ref half_float::half) or with any of the two types + /// not being a built-in arithmetic type (apart from [half](\ref half_float::half), of course) results in a compiler + /// error and casting between [half](\ref half_float::half)s returns the argument unmodified. + /// \tparam T destination type (half or built-in arithmetic type) + /// \tparam R rounding mode to use. + /// \tparam U source type (half or built-in arithmetic type) + /// \param arg value to cast + /// \return \a arg converted to destination type + /// \exception FE_INVALID if \a T is integer type and result is not representable as \a T + /// \exception FE_OVERFLOW, ...UNDERFLOW, ...INEXACT according to rounding + template<typename T, std::float_round_style R, typename U> + T half_cast( U arg ) + { + return detail::half_caster<T, U, R>::cast( arg ); + } + /// \} + + /// \} + /// \anchor errors + /// \name Error handling + /// \{ + + /// Clear exception flags. + /// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, + /// but in that case manual flag management is the only way to raise flags. + /// + /// **See also:** Documentation for [std::feclearexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feclearexcept). + /// \param excepts OR of exceptions to clear + /// \retval 0 all selected flags cleared successfully + inline int feclearexcept( int excepts ) + { + detail::errflags() &= ~excepts; + return 0; + } + + /// Test exception flags. + /// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, + /// but in that case manual flag management is the only way to raise flags. + /// + /// **See also:** Documentation for [std::fetestexcept](https://en.cppreference.com/w/cpp/numeric/fenv/fetestexcept). + /// \param excepts OR of exceptions to test + /// \return OR of selected exceptions if raised + inline int fetestexcept( int excepts ) + { + return detail::errflags() & excepts; + } + + /// Raise exception flags. + /// This raises the specified floating point exceptions and also invokes any additional automatic exception handling as + /// configured with the [HALF_ERRHANDLING_...](\ref HALF_ERRHANDLING_ERRNO) preprocessor symbols. + /// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, + /// but in that case manual flag management is the only way to raise flags. + /// + /// **See also:** Documentation for [std::feraiseexcept](https://en.cppreference.com/w/cpp/numeric/fenv/feraiseexcept). + /// \param excepts OR of exceptions to raise + /// \retval 0 all selected exceptions raised successfully + inline int feraiseexcept( int excepts ) + { + detail::errflags() |= excepts; + detail::raise( excepts ); + return 0; + } + + /// Save exception flags. + /// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, + /// but in that case manual flag management is the only way to raise flags. + /// + /// **See also:** Documentation for [std::fegetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag). + /// \param flagp address to store flag state at + /// \param excepts OR of flags to save + /// \retval 0 for success + inline int fegetexceptflag( int *flagp, int excepts ) + { + *flagp = detail::errflags() & excepts; + return 0; + } + + /// Restore exception flags. + /// This only copies the specified exception state (including unset flags) without incurring any additional exception handling. 
+ /// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, + /// but in that case manual flag management is the only way to raise flags. + /// + /// **See also:** Documentation for [std::fesetexceptflag](https://en.cppreference.com/w/cpp/numeric/fenv/feexceptflag). + /// \param flagp adress to take flag state from + /// \param excepts OR of flags to restore + /// \retval 0 for success + inline int fesetexceptflag( const int *flagp, int excepts ) + { + detail::errflags() = ( detail::errflags() | ( *flagp & excepts ) ) & ( *flagp | ~excepts ); + return 0; + } + + /// Throw C++ exceptions based on set exception flags. + /// This function manually throws a corresponding C++ exception if one of the specified flags is set, + /// no matter if automatic throwing (via [HALF_ERRHANDLING_THROW_...](\ref HALF_ERRHANDLING_THROW_INVALID)) is enabled or not. + /// This function works even if [automatic exception flag handling](\ref HALF_ERRHANDLING_FLAGS) is disabled, + /// but in that case manual flag management is the only way to raise flags. + /// \param excepts OR of exceptions to test + /// \param msg error message to use for exception description + /// \throw std::domain_error if `FE_INVALID` or `FE_DIVBYZERO` is selected and set + /// \throw std::overflow_error if `FE_OVERFLOW` is selected and set + /// \throw std::underflow_error if `FE_UNDERFLOW` is selected and set + /// \throw std::range_error if `FE_INEXACT` is selected and set + inline void fethrowexcept( int excepts, const char *msg = "" ) + { + excepts &= detail::errflags(); + if ( excepts & ( FE_INVALID | FE_DIVBYZERO ) ) + throw std::domain_error( msg ); + if ( excepts & FE_OVERFLOW ) + throw std::overflow_error( msg ); + if ( excepts & FE_UNDERFLOW ) + throw std::underflow_error( msg ); + if ( excepts & FE_INEXACT ) + throw std::range_error( msg ); + } + /// \} +} // namespace half_float + +#undef HALF_UNUSED_NOERR +#undef HALF_CONSTEXPR +#undef HALF_CONSTEXPR_CONST +#undef HALF_CONSTEXPR_NOERR +#undef HALF_NOEXCEPT +#undef HALF_NOTHROW +#undef HALF_THREAD_LOCAL +#undef HALF_TWOS_COMPLEMENT_INT +#ifdef HALF_POP_WARNINGS +#pragma warning( pop ) +#undef HALF_POP_WARNINGS +#endif + +#endif diff --git a/thirdparty/include/stb_image.h b/thirdparty/include/stb_image.h new file mode 100644 index 0000000..5e807a0 --- /dev/null +++ b/thirdparty/include/stb_image.h @@ -0,0 +1,7987 @@ +/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb + no warranty implied; use at your own risk + + Do this: + #define STB_IMAGE_IMPLEMENTATION + before you include this file in *one* C or C++ file to create the implementation. + + // i.e. it should look like this: + #include ... + #include ... + #include ... + #define STB_IMAGE_IMPLEMENTATION + #include "stb_image.h" + + You can #define STBI_ASSERT(x) before the #include to avoid using assert.h. 
+ And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free + + + QUICK NOTES: + Primarily of interest to game developers and other people who can + avoid problematic images and only need the trivial interface + + JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib) + PNG 1/2/4/8/16-bit-per-channel + + TGA (not sure what subset, if a subset) + BMP non-1bpp, non-RLE + PSD (composited view only, no extra channels, 8/16 bit-per-channel) + + GIF (*comp always reports as 4-channel) + HDR (radiance rgbE format) + PIC (Softimage PIC) + PNM (PPM and PGM binary only) + + Animated GIF still needs a proper API, but here's one way to do it: + http://gist.github.com/urraka/685d9a6340b26b830d49 + + - decode from memory or through FILE (define STBI_NO_STDIO to remove code) + - decode from arbitrary I/O callbacks + - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON) + + Full documentation under "DOCUMENTATION" below. + + +LICENSE + + See end of file for license information. + +RECENT REVISION HISTORY: + + 2.28 (2023-01-29) many error fixes, security errors, just tons of stuff + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes + 2.26 (2020-07-13) many minor fixes + 2.25 (2020-02-02) fix warnings + 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically + 2.23 (2019-08-11) fix clang static analysis warning + 2.22 (2019-03-04) gif fixes, fix warnings + 2.21 (2019-02-25) fix typo in comment + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings + 2.16 (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes + 2.15 (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-12-04) experimental 16-bit API, only for PNG so far; fixes + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 + RGB-format JPEG; remove white matting in PSD; + allocate large structures on the stack; + correct channel count for PNG & BMP + 2.10 (2016-01-22) avoid warning introduced in 2.09 + 2.09 (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED + + See end of file for full revision history. 
+ + + ============================ Contributors ========================= + + Image formats Extensions, features + Sean Barrett (jpeg, png, bmp) Jetro Lauha (stbi_info) + Nicolas Schulz (hdr, psd) Martin "SpartanJ" Golini (stbi_info) + Jonathan Dummer (tga) James "moose2000" Brown (iPhone PNG) + Jean-Marc Lienher (gif) Ben "Disch" Wenger (io callbacks) + Tom Seddon (pic) Omar Cornut (1/2/4-bit PNG) + Thatcher Ulrich (psd) Nicolas Guillemot (vertical flip) + Ken Miller (pgm, ppm) Richard Mitton (16-bit PSD) + github:urraka (animated gif) Junggon Kim (PNM comments) + Christopher Forseth (animated gif) Daniel Gibson (16-bit TGA) + socks-the-fox (16-bit PNG) + Jeremy Sawicki (handle all ImageNet JPGs) + Optimizations & bugfixes Mikhail Morozov (1-bit BMP) + Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) + Arseny Kapoulkine Simon Breuss (16-bit PNM) + John-Mark Allen + Carmelo J Fdez-Aguera + + Bug & warning fixes + Marc LeBlanc David Woo Guillaume George Martins Mozeiko + Christpher Lloyd Jerry Jansson Joseph Thomson Blazej Dariusz Roszkowski + Phil Jordan Dave Moore Roy Eltham + Hayaki Saito Nathan Reed Won Chun + Luke Graham Johan Duparc Nick Verigakis the Horde3D community + Thomas Ruf Ronny Chevalier github:rlyeh + Janez Zemva John Bartholomew Michal Cichon github:romigrou + Jonathan Blow Ken Hamada Tero Hanninen github:svdijk + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar + Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex + Cass Everitt Ryamond Barbiero github:grim210 + Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw + Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus + Josh Tobin Neil Bickford Matthew Gregan github:poppolopoppo + Julian Raschke Gregory Mullen Christian Floisand github:darealshinji + Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 + Brad Weinberger Matvey Cherevko github:mosra + Luca Sas Alexander Veselov Zack Middleton [reserved] + Ryan C. Gordon [reserved] [reserved] + DO NOT ADD YOUR NAME HERE + + Jacko Dirks + + To add your name to the credits, pick a random blank space in the middle and fill it. + 80% of merge conflicts on stb PRs are due to people adding their name at the end + of the credits. +*/ + +#ifndef STBI_INCLUDE_STB_IMAGE_H +#define STBI_INCLUDE_STB_IMAGE_H + +// DOCUMENTATION +// +// Limitations: +// - no 12-bit-per-channel JPEG +// - no JPEGs with arithmetic coding +// - GIF always returns *comp=4 +// +// Basic usage (see HDR discussion below for HDR usage): +// int x,y,n; +// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); +// // ... process data if not NULL ... +// // ... x = width, y = height, n = # 8-bit components per pixel ... +// // ... replace '0' with '1'..'4' to force that many components per pixel +// // ... but 'n' will always be the number that it would have been if you said 0 +// stbi_image_free(data); +// +// Standard parameters: +// int *x -- outputs image width in pixels +// int *y -- outputs image height in pixels +// int *channels_in_file -- outputs # of image components in image file +// int desired_channels -- if non-zero, # of image components requested in result +// +// The return value from an image loader is an 'unsigned char *' which points +// to the pixel data, or NULL on an allocation failure or if the image is +// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels, +// with each pixel consisting of N interleaved 8-bit components; the first +// pixel pointed to is top-left-most in the image. 
There is no padding between +// image scanlines or between pixels, regardless of format. The number of +// components N is 'desired_channels' if desired_channels is non-zero, or +// *channels_in_file otherwise. If desired_channels is non-zero, +// *channels_in_file has the number of components that _would_ have been +// output otherwise. E.g. if you set desired_channels to 4, you will always +// get RGBA output, but you can check *channels_in_file to see if it's trivially +// opaque because e.g. there were only 3 channels in the source image. +// +// An output image with N components has the following components interleaved +// in this order in each pixel: +// +// N=#comp components +// 1 grey +// 2 grey, alpha +// 3 red, green, blue +// 4 red, green, blue, alpha +// +// If image loading fails for any reason, the return value will be NULL, +// and *x, *y, *channels_in_file will be unchanged. The function +// stbi_failure_reason() can be queried for an extremely brief, end-user +// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS +// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly +// more user-friendly ones. +// +// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. +// +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. +// +// =========================================================================== +// +// UNICODE: +// +// If compiling for Windows and you wish to use Unicode filenames, compile +// with +// #define STBI_WINDOWS_UTF8 +// and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert +// Windows wchar_t filenames to utf8. +// +// =========================================================================== +// +// Philosophy +// +// stb libraries are designed with the following priorities: +// +// 1. easy to use +// 2. easy to maintain +// 3. good performance +// +// Sometimes I let "good performance" creep up in priority over "easy to maintain", +// and for best performance I may provide less-easy-to-use APIs that give higher +// performance, in addition to the easy-to-use ones. 
Nevertheless, it's important +// to keep in mind that from the standpoint of you, a client of this library, +// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all. +// +// Some secondary priorities arise directly from the first two, some of which +// provide more explicit reasons why performance can't be emphasized. +// +// - Portable ("ease of use") +// - Small source code footprint ("easy to maintain") +// - No dependencies ("ease of use") +// +// =========================================================================== +// +// I/O callbacks +// +// I/O callbacks allow you to read from arbitrary sources, like packaged +// files or some other source. Data read from callbacks are processed +// through a small internal buffer (currently 128 bytes) to try to reduce +// overhead. +// +// The three functions you must define are "read" (reads some bytes of data), +// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end). +// +// =========================================================================== +// +// SIMD support +// +// The JPEG decoder will try to automatically use SIMD kernels on x86 when +// supported by the compiler. For ARM Neon support, you must explicitly +// request it. +// +// (The old do-it-yourself SIMD API is no longer supported in the current +// code.) +// +// On x86, SSE2 will automatically be used when available based on a run-time +// test; if not, the generic C versions are used as a fall-back. On ARM targets, +// the typical path is to have separate builds for NEON and non-NEON devices +// (at least this is true for iOS and Android). Therefore, the NEON support is +// toggled by a build flag: define STBI_NEON to get NEON loops. +// +// If for some reason you do not want to use any of SIMD code, or if +// you have issues compiling it, you can disable it entirely by +// defining STBI_NO_SIMD. +// +// =========================================================================== +// +// HDR image support (disable by defining STBI_NO_HDR) +// +// stb_image supports loading HDR images in general, and currently the Radiance +// .HDR file format specifically. You can still load any file through the existing +// interface; if you attempt to load an HDR file, it will be automatically remapped +// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1; +// both of these constants can be reconfigured through this interface: +// +// stbi_hdr_to_ldr_gamma(2.2f); +// stbi_hdr_to_ldr_scale(1.0f); +// +// (note, do not use _inverse_ constants; stbi_image will invert them +// appropriately). 
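A minimal sketch of that HDR-to-LDR path, assuming stb_image.h is compiled elsewhere in the project; the gamma and scale values below simply restate the documented defaults, and load_hdr_as_ldr is a hypothetical helper name, not part of the library:

    #include "stb_image.h"

    // Remap a Radiance .HDR file to 8-bit output using the constants described above.
    static unsigned char *load_hdr_as_ldr(const char *path, int *w, int *h)
    {
        stbi_hdr_to_ldr_gamma(2.2f); // documented default gamma
        stbi_hdr_to_ldr_scale(1.0f); // documented default scale factor
        int n = 0;
        return stbi_load(path, w, h, &n, 3); // force RGB; free with stbi_image_free()
    }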
+// +// Additionally, there is a new, parallel interface for loading files as +// (linear) floats to preserve the full dynamic range: +// +// float *data = stbi_loadf(filename, &x, &y, &n, 0); +// +// If you load LDR images through this interface, those images will +// be promoted to floating point values, run through the inverse of +// constants corresponding to the above: +// +// stbi_ldr_to_hdr_scale(1.0f); +// stbi_ldr_to_hdr_gamma(2.2f); +// +// Finally, given a filename (or an open file or memory block--see header +// file for details) containing image data, you can query for the "most +// appropriate" interface to use (that is, whether the image is HDR or +// not), using: +// +// stbi_is_hdr(char *filename); +// +// =========================================================================== +// +// iPhone PNG support: +// +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). +// +// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per +// pixel to remove any premultiplied alpha *only* if the image file explicitly +// says there's premultiplied data (currently only happens in iPhone images, +// and only if iPhone convert-to-rgb processing is on). +// +// =========================================================================== +// +// ADDITIONAL CONFIGURATION +// +// - You can suppress implementation of any of the decoders to reduce +// your code footprint by #defining one or more of the following +// symbols before creating the implementation. +// +// STBI_NO_JPEG +// STBI_NO_PNG +// STBI_NO_BMP +// STBI_NO_PSD +// STBI_NO_TGA +// STBI_NO_GIF +// STBI_NO_HDR +// STBI_NO_PIC +// STBI_NO_PNM (.ppm and .pgm) +// +// - You can request *only* certain decoders and suppress all other ones +// (this will be more forward-compatible, as addition of new decoders +// doesn't require you to disable them explicitly): +// +// STBI_ONLY_JPEG +// STBI_ONLY_PNG +// STBI_ONLY_BMP +// STBI_ONLY_PSD +// STBI_ONLY_TGA +// STBI_ONLY_GIF +// STBI_ONLY_HDR +// STBI_ONLY_PIC +// STBI_ONLY_PNM (.ppm and .pgm) +// +// - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still +// want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB +// +// - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater +// than that size (in either width or height) without further processing. +// This is to let programs in the wild set an upper bound to prevent +// denial-of-service attacks on untrusted data, as one could generate a +// valid image of gigantic dimensions and force stb_image to allocate a +// huge block of memory and spend disproportionate time decoding it. By +// default this is set to (1 << 24), which is 16777216, but that's still +// very big. 
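A minimal sketch of the basic loading path documented above, assuming stb_image.h is on the include path; load_rgba is a hypothetical helper name:

    #include <stdio.h>
    #include "stb_image.h"

    // Probe the header first, then decode, forcing 4 components (RGBA).
    static unsigned char *load_rgba(const char *path, int *w, int *h)
    {
        int n = 0;
        if (!stbi_info(path, w, h, &n)) // cheap check: dimensions/channels only
            return NULL;
        unsigned char *pixels = stbi_load(path, w, h, &n, 4); // n still reports the file's own channel count
        if (!pixels)
            printf("stb_image: %s\n", stbi_failure_reason());
        return pixels; // release with stbi_image_free(pixels)
    }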
+ +#ifndef STBI_NO_STDIO +#include <stdio.h> +#endif // STBI_NO_STDIO + +#define STBI_VERSION 1 + +enum +{ + STBI_default = 0, // only used for desired_channels + + STBI_grey = 1, + STBI_grey_alpha = 2, + STBI_rgb = 3, + STBI_rgb_alpha = 4 +}; + +#include <stdlib.h> +typedef unsigned char stbi_uc; +typedef unsigned short stbi_us; + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef STBIDEF +#ifdef STB_IMAGE_STATIC +#define STBIDEF static +#else +#define STBIDEF extern +#endif +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// PRIMARY API - works on images of any type +// + +// +// load image by filename, open file, or memory buffer +// + +typedef struct +{ + int (*read) (void *user,char *data,int size); // fill 'data' with 'size' bytes. return number of bytes actually read + void (*skip) (void *user,int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof) (void *user); // returns nonzero if we are at end of file/data +} stbi_io_callbacks; + +//////////////////////////////////// +// +// 8-bits-per-channel interface +// + +STBIDEF stbi_uc *stbi_load_from_memory (stbi_uc const *buffer, int len , int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk , void *user, int *x, int *y, int *channels_in_file, int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_uc *stbi_load (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_uc *stbi_load_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); +// for stbi_load_from_file, file pointer is left pointing immediately after image +#endif + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +#endif + +#ifdef STBI_WINDOWS_UTF8 +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input); +#endif + +//////////////////////////////////// +// +// 16-bits-per-channel interface +// + +STBIDEF stbi_us *stbi_load_16_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_us *stbi_load_16 (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); +#endif + +//////////////////////////////////// +// +// float-per-channel interface +// +#ifndef STBI_NO_LINEAR + STBIDEF float *stbi_loadf_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels); + STBIDEF float *stbi_loadf_from_callbacks (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels); + + #ifndef STBI_NO_STDIO + STBIDEF float *stbi_loadf (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); + STBIDEF float *stbi_loadf_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); + #endif +#endif + +#ifndef STBI_NO_HDR + STBIDEF void stbi_hdr_to_ldr_gamma(float gamma); + STBIDEF void stbi_hdr_to_ldr_scale(float scale); +#endif // STBI_NO_HDR + +#ifndef STBI_NO_LINEAR + STBIDEF void stbi_ldr_to_hdr_gamma(float gamma); + STBIDEF void 
stbi_ldr_to_hdr_scale(float scale); +#endif // STBI_NO_LINEAR + +// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user); +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename); +STBIDEF int stbi_is_hdr_from_file(FILE *f); +#endif // STBI_NO_STDIO + + +// get a VERY brief reason for failure +// on most compilers (and ALL modern mainstream compilers) this is threadsafe +STBIDEF const char *stbi_failure_reason (void); + +// free the loaded image -- this is just free() +STBIDEF void stbi_image_free (void *retval_from_stbi_load); + +// get image dimensions & components without fully decoding +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user); + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info (char const *filename, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_file (FILE *f, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit (char const *filename); +STBIDEF int stbi_is_16_bit_from_file(FILE *f); +#endif + + + +// for image formats that explicitly notate that they have premultiplied alpha, +// we just return the colors as stored in the file. set this flag to force +// unpremultiplication. results are undefined if the unpremultiply overflow. +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); + +// indicate whether we should process iphone images back to canonical format, +// or just pass them through "as-is" +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); + +// flip the image vertically, so the first pixel in the output array is the bottom left +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); + +// as above, but only applies to images loaded on the thread that calls the function +// this function is only available if your compiler supports thread-local variables; +// calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); + +// ZLIB client - used by PNG, available for other purposes + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header); +STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + +STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + + +#ifdef __cplusplus +} +#endif + +// +// +//// end header file ///////////////////////////////////////////////////// +#endif // STBI_INCLUDE_STB_IMAGE_H + +#ifdef STB_IMAGE_IMPLEMENTATION + +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || 
defined(STBI_ONLY_BMP) \ + || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \ + || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \ + || defined(STBI_ONLY_ZLIB) + #ifndef STBI_ONLY_JPEG + #define STBI_NO_JPEG + #endif + #ifndef STBI_ONLY_PNG + #define STBI_NO_PNG + #endif + #ifndef STBI_ONLY_BMP + #define STBI_NO_BMP + #endif + #ifndef STBI_ONLY_PSD + #define STBI_NO_PSD + #endif + #ifndef STBI_ONLY_TGA + #define STBI_NO_TGA + #endif + #ifndef STBI_ONLY_GIF + #define STBI_NO_GIF + #endif + #ifndef STBI_ONLY_HDR + #define STBI_NO_HDR + #endif + #ifndef STBI_ONLY_PIC + #define STBI_NO_PIC + #endif + #ifndef STBI_ONLY_PNM + #define STBI_NO_PNM + #endif +#endif + +#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB) +#define STBI_NO_ZLIB +#endif + + +#include <stdarg.h> +#include <stddef.h> // ptrdiff_t on osx +#include <stdlib.h> +#include <string.h> +#include <limits.h> + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +#include <math.h> // ldexp, pow +#endif + +#ifndef STBI_NO_STDIO +#include <stdio.h> +#endif + +#ifndef STBI_ASSERT +#include <assert.h> +#define STBI_ASSERT(x) assert(x) +#endif + +#ifdef __cplusplus +#define STBI_EXTERN extern "C" +#else +#define STBI_EXTERN extern +#endif + + +#ifndef _MSC_VER + #ifdef __cplusplus + #define stbi_inline inline + #else + #define stbi_inline + #endif +#else + #define stbi_inline __forceinline +#endif + +#ifndef STBI_NO_THREAD_LOCALS + #if defined(__cplusplus) && __cplusplus >= 201103L + #define STBI_THREAD_LOCAL thread_local + #elif defined(__GNUC__) && __GNUC__ < 5 + #define STBI_THREAD_LOCAL __thread + #elif defined(_MSC_VER) + #define STBI_THREAD_LOCAL __declspec(thread) + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) + #define STBI_THREAD_LOCAL _Thread_local + #endif + + #ifndef STBI_THREAD_LOCAL + #if defined(__GNUC__) + #define STBI_THREAD_LOCAL __thread + #endif + #endif +#endif + +#if defined(_MSC_VER) || defined(__SYMBIAN32__) +typedef unsigned short stbi__uint16; +typedef signed short stbi__int16; +typedef unsigned int stbi__uint32; +typedef signed int stbi__int32; +#else +#include <stdint.h> +typedef uint16_t stbi__uint16; +typedef int16_t stbi__int16; +typedef uint32_t stbi__uint32; +typedef int32_t stbi__int32; +#endif + +// should produce compiler error if size is wrong +typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1]; + +#ifdef _MSC_VER +#define STBI_NOTUSED(v) (void)(v) +#else +#define STBI_NOTUSED(v) (void)sizeof(v) +#endif + +#ifdef _MSC_VER +#define STBI_HAS_LROTL +#endif + +#ifdef STBI_HAS_LROTL + #define stbi_lrot(x,y) _lrotl(x,y) +#else + #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (-(y) & 31))) +#endif + +#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED)) +// ok +#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED) +// ok +#else +#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)." 
+#endif + +#ifndef STBI_MALLOC +#define STBI_MALLOC(sz) malloc(sz) +#define STBI_REALLOC(p,newsz) realloc(p,newsz) +#define STBI_FREE(p) free(p) +#endif + +#ifndef STBI_REALLOC_SIZED +#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz) +#endif + +// x86/x64 detection +#if defined(__x86_64__) || defined(_M_X64) +#define STBI__X64_TARGET +#elif defined(__i386) || defined(_M_IX86) +#define STBI__X86_TARGET +#endif + +#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD) +// gcc doesn't support sse2 intrinsics unless you compile with -msse2, +// which in turn means it gets to use SSE2 everywhere. This is unfortunate, +// but previous attempts to provide the SSE2 functions with runtime +// detection caused numerous issues. The way architecture extensions are +// exposed in GCC/Clang is, sadly, not really suited for one-file libs. +// New behavior: if compiled with -msse2, we use SSE2 without any +// detection; if not, we don't use it at all. +#define STBI_NO_SIMD +#endif + +#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD) +// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET +// +// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the +// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant. +// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not +// simultaneously enabling "-mstackrealign". +// +// See https://github.com/nothings/stb/issues/81 for more information. +// +// So default to no SSE2 on 32-bit MinGW. If you've read this far and added +// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2. +#define STBI_NO_SIMD +#endif + +#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET)) +#define STBI_SSE2 +#include <emmintrin.h> + +#ifdef _MSC_VER + +#if _MSC_VER >= 1400 // not VC6 +#include <intrin.h> // __cpuid +static int stbi__cpuid3(void) +{ + int info[4]; + __cpuid(info,1); + return info[3]; +} +#else +static int stbi__cpuid3(void) +{ + int res; + __asm { + mov eax,1 + cpuid + mov res,edx + } + return res; +} +#endif + +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name + +#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) +static int stbi__sse2_available(void) +{ + int info3 = stbi__cpuid3(); + return ((info3 >> 26) & 1) != 0; +} +#endif + +#else // assume GCC-style if not VC++ +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) + +#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2) +static int stbi__sse2_available(void) +{ + // If we're even attempting to compile this on GCC/Clang, that means + // -msse2 is on, which means the compiler is allowed to use SSE2 + // instructions at will, and so are we. 
+ return 1; +} +#endif + +#endif +#endif + +// ARM NEON +#if defined(STBI_NO_SIMD) && defined(STBI_NEON) +#undef STBI_NEON +#endif + +#ifdef STBI_NEON +#include <arm_neon.h> +#ifdef _MSC_VER +#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name +#else +#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16))) +#endif +#endif + +#ifndef STBI_SIMD_ALIGN +#define STBI_SIMD_ALIGN(type, name) type name +#endif + +#ifndef STBI_MAX_DIMENSIONS +#define STBI_MAX_DIMENSIONS (1 << 24) +#endif + +/////////////////////////////////////////////// +// +// stbi__context struct and start_xxx functions + +// stbi__context structure is our basic context used by all images, so it +// contains all the IO context, plus some basic image information +typedef struct +{ + stbi__uint32 img_x, img_y; + int img_n, img_out_n; + + stbi_io_callbacks io; + void *io_user_data; + + int read_from_callbacks; + int buflen; + stbi_uc buffer_start[128]; + int callback_already_read; + + stbi_uc *img_buffer, *img_buffer_end; + stbi_uc *img_buffer_original, *img_buffer_original_end; +} stbi__context; + + +static void stbi__refill_buffer(stbi__context *s); + +// initialize a memory-decode context +static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len) +{ + s->io.read = NULL; + s->read_from_callbacks = 0; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer; + s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len; +} + +// initialize a callback-based context +static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user) +{ + s->io = *c; + s->io_user_data = user; + s->buflen = sizeof(s->buffer_start); + s->read_from_callbacks = 1; + s->callback_already_read = 0; + s->img_buffer = s->img_buffer_original = s->buffer_start; + stbi__refill_buffer(s); + s->img_buffer_original_end = s->img_buffer_end; +} + +#ifndef STBI_NO_STDIO + +static int stbi__stdio_read(void *user, char *data, int size) +{ + return (int) fread(data,1,size,(FILE*) user); +} + +static void stbi__stdio_skip(void *user, int n) +{ + int ch; + fseek((FILE*) user, n, SEEK_CUR); + ch = fgetc((FILE*) user); /* have to read a byte to reset feof()'s flag */ + if (ch != EOF) { + ungetc(ch, (FILE *) user); /* push byte back onto stream if valid. 
*/ + } +} + +static int stbi__stdio_eof(void *user) +{ + return feof((FILE*) user) || ferror((FILE *) user); +} + +static stbi_io_callbacks stbi__stdio_callbacks = +{ + stbi__stdio_read, + stbi__stdio_skip, + stbi__stdio_eof, +}; + +static void stbi__start_file(stbi__context *s, FILE *f) +{ + stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f); +} + +//static void stop_file(stbi__context *s) { } + +#endif // !STBI_NO_STDIO + +static void stbi__rewind(stbi__context *s) +{ + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; +} + +enum +{ + STBI_ORDER_RGB, + STBI_ORDER_BGR +}; + +typedef struct +{ + int bits_per_channel; + int num_channels; + int channel_order; +} stbi__result_info; + +#ifndef STBI_NO_JPEG +static int stbi__jpeg_test(stbi__context *s); +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNG +static int stbi__png_test(stbi__context *s); +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__png_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_BMP +static int stbi__bmp_test(stbi__context *s); +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_TGA +static int stbi__tga_test(stbi__context *s); +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s); +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc); +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__psd_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_HDR +static int stbi__hdr_test(stbi__context *s); +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_test(stbi__context *s); +static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_GIF +static int stbi__gif_test(stbi__context *s); +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNM +static int stbi__pnm_test(stbi__context *s); +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); +static int 
stbi__pnm_is16(stbi__context *s); +#endif + +static +#ifdef STBI_THREAD_LOCAL +STBI_THREAD_LOCAL +#endif +const char *stbi__g_failure_reason; + +STBIDEF const char *stbi_failure_reason(void) +{ + return stbi__g_failure_reason; +} + +#ifndef STBI_NO_FAILURE_STRINGS +static int stbi__err(const char *str) +{ + stbi__g_failure_reason = str; + return 0; +} +#endif + +static void *stbi__malloc(size_t size) +{ + return STBI_MALLOC(size); +} + +// stb_image uses ints pervasively, including for offset calculations. +// therefore the largest decoded image size we can support with the +// current code, even on 64-bit targets, is INT_MAX. this is not a +// significant limitation for the intended use case. +// +// we do, however, need to make sure our size calculations don't +// overflow. hence a few helper functions for size calculations that +// multiply integers together, making sure that they're non-negative +// and no overflow occurs. + +// return 1 if the sum is valid, 0 on overflow. +// negative terms are considered invalid. +static int stbi__addsizes_valid(int a, int b) +{ + if (b < 0) return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; +} + +// returns 1 if the product is valid, 0 on overflow. +// negative factors are considered invalid. +static int stbi__mul2sizes_valid(int a, int b) +{ + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX/b; +} + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow +static int stbi__mad2sizes_valid(int a, int b, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add); +} +#endif + +// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow +static int stbi__mad3sizes_valid(int a, int b, int c, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__addsizes_valid(a*b*c, add); +} + +// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add); +} +#endif + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// mallocs with size overflow checking +static void *stbi__malloc_mad2(int a, int b, int add) +{ + if (!stbi__mad2sizes_valid(a, b, add)) return NULL; + return stbi__malloc(a*b + add); +} +#endif + +static void *stbi__malloc_mad3(int a, int b, int c, int add) +{ + if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; + return stbi__malloc(a*b*c + add); +} + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) || !defined(STBI_NO_PNM) +static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) +{ + if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; + return stbi__malloc(a*b*c*d + add); +} +#endif + +// returns 1 if the sum of two signed ints is valid (between -2^31 and 2^31-1 inclusive), 0 on overflow. 
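+// A couple of worked examples of the check below (input values chosen purely
+// for illustration): stbi__addints_valid(INT_MAX, 1) is 0, since INT_MAX is not
+// <= INT_MAX - 1, while stbi__addints_valid(-5, 3) is 1 because operands of
+// opposite sign can never overflow an int.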
+static int stbi__addints_valid(int a, int b) +{ + if ((a >= 0) != (b >= 0)) return 1; // a and b have different signs, so no overflow + if (a < 0 && b < 0) return a >= INT_MIN - b; // same as a + b >= INT_MIN; INT_MIN - b cannot overflow since b < 0. + return a <= INT_MAX - b; +} + +// returns 1 if the product of two signed shorts is valid, 0 on overflow. +static int stbi__mul2shorts_valid(short a, short b) +{ + if (b == 0 || b == -1) return 1; // multiplication by 0 is always 0; check for -1 so SHRT_MIN/b doesn't overflow + if ((a >= 0) == (b >= 0)) return a <= SHRT_MAX/b; // product is positive, so similar to mul2sizes_valid + if (b < 0) return a <= SHRT_MIN / b; // same as a * b >= SHRT_MIN + return a >= SHRT_MIN / b; +} + +// stbi__err - error +// stbi__errpf - error returning pointer to float +// stbi__errpuc - error returning pointer to unsigned char + +#ifdef STBI_NO_FAILURE_STRINGS + #define stbi__err(x,y) 0 +#elif defined(STBI_FAILURE_USERMSG) + #define stbi__err(x,y) stbi__err(y) +#else + #define stbi__err(x,y) stbi__err(x) +#endif + +#define stbi__errpf(x,y) ((float *)(size_t) (stbi__err(x,y)?NULL:NULL)) +#define stbi__errpuc(x,y) ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL)) + +STBIDEF void stbi_image_free(void *retval_from_stbi_load) +{ + STBI_FREE(retval_from_stbi_load); +} + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); +#endif + +#ifndef STBI_NO_HDR +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); +#endif + +static int stbi__vertically_flip_on_load_global = 0; + +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#else +static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; + +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; +} + +#define stbi__vertically_flip_on_load (stbi__vertically_flip_on_load_set \ + ? 
stbi__vertically_flip_on_load_local \ + : stbi__vertically_flip_on_load_global) +#endif // STBI_THREAD_LOCAL + +static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + + // test the formats with a very explicit header first (at least a FOURCC + // or distinctive magic number first) + #ifndef STBI_NO_PNG + if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_BMP + if (stbi__bmp_test(s)) return stbi__bmp_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_GIF + if (stbi__gif_test(s)) return stbi__gif_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PSD + if (stbi__psd_test(s)) return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); + #else + STBI_NOTUSED(bpc); + #endif + #ifndef STBI_NO_PIC + if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); + #endif + + // then the formats that can end up attempting to load with just 1 or 2 + // bytes matching expectations; these are prone to false positives, so + // try them later + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PNM + if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri); + return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } + #endif + + #ifndef STBI_NO_TGA + // test tga last because it's a crappy test! + if (stbi__tga_test(s)) + return stbi__tga_load(s,x,y,comp,req_comp, ri); + #endif + + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); +} + +static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi_uc *reduced; + + reduced = (stbi_uc *) stbi__malloc(img_len); + if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + STBI_FREE(orig); + return reduced; +} + +static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi__uint16 *enlarged; + + enlarged = (stbi__uint16 *) stbi__malloc(img_len*2); + if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + + STBI_FREE(orig); + return enlarged; +} + +static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) +{ + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc *bytes = (stbi_uc *)image; + + for (row = 0; row < (h>>1); row++) { + stbi_uc *row0 = bytes + row*bytes_per_row; + stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? 
bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } +} + +#ifndef STBI_NO_GIF +static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel) +{ + int slice; + int slice_size = w * h * bytes_per_pixel; + + stbi_uc *bytes = (stbi_uc *)image; + for (slice = 0; slice < z; ++slice) { + stbi__vertical_flip(bytes, w, h, bytes_per_pixel); + bytes += slice_size; + } +} +#endif + +static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move stbi__convert_format to here + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); + } + + return (unsigned char *) result; +} + +static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 16) { + result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 16; + } + + // @TODO: move stbi__convert_format16 to here + // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); + } + + return (stbi__uint16 *) result; +} + +#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) +static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp) +{ + if (stbi__vertically_flip_on_load && result != NULL) { + int channels = req_comp ? 
req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); + } +} +#endif + +#ifndef STBI_NO_STDIO + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); +STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); +#endif + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) +{ + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); +} +#endif + +static FILE *stbi__fopen(char const *filename, char const *mode) +{ + FILE *f; +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) + wchar_t wMode[64]; + wchar_t wFilename[1024]; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) + return 0; + + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) + return 0; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; +#else + f = _wfopen(wFilename, wMode); +#endif + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != fopen_s(&f, filename, mode)) + f=0; +#else + f = fopen(filename, mode); +#endif + return f; +} + + +STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + unsigned char *result; + if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__uint16 *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + stbi__uint16 *result; + if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file_16(f,x,y,comp,req_comp); + fclose(f); + return result; +} + + +#endif //!STBI_NO_STDIO + +STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return 
stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_mem(&s,buffer,len); + + result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { + stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); + } + + return result; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *data; + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + stbi__result_info ri; + float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri); + if (hdr_data) + stbi__float_postprocess(hdr_data,x,y,comp,req_comp); + return hdr_data; + } + #endif + data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); + if (data) + return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); + return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); +} + +STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_STDIO +STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + float *result; + FILE *f = stbi__fopen(filename, "rb"); + if (!f) return stbi__errpf("can't fopen", "Unable to open file"); + result = stbi_loadf_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_file(&s,f); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} +#endif // !STBI_NO_STDIO + +#endif // !STBI_NO_LINEAR + +// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is +// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always +// reports false! 
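+// A typical caller pattern, for illustration only (the file name and variable
+// names are hypothetical and not part of this header):
+//
+//    int w, h, n;
+//    if (stbi_is_hdr("sky.hdr")) {
+//       float *pixels = stbi_loadf("sky.hdr", &w, &h, &n, 0); // linear float data
+//       /* ... use pixels ... */
+//       stbi_image_free(pixels);
+//    } else {
+//       unsigned char *pixels = stbi_load("sky.hdr", &w, &h, &n, 0); // 8-bit LDR data
+//       /* ... use pixels ... */
+//       stbi_image_free(pixels);
+//    }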
+ +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; + #endif +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result=0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; +} + +STBIDEF int stbi_is_hdr_from_file(FILE *f) +{ + #ifndef STBI_NO_HDR + long pos = ftell(f); + int res; + stbi__context s; + stbi__start_file(&s,f); + res = stbi__hdr_test(&s); + fseek(f, pos, SEEK_SET); + return res; + #else + STBI_NOTUSED(f); + return 0; + #endif +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(clbk); + STBI_NOTUSED(user); + return 0; + #endif +} + +#ifndef STBI_NO_LINEAR +static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; + +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +#endif + +static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; + +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } + + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum +{ + STBI__SCAN_load=0, + STBI__SCAN_type, + STBI__SCAN_header +}; + +static void stbi__refill_buffer(stbi__context *s) +{ + int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); + s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start+1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} + +stbi_inline static stbi_uc stbi__get8(stbi__context *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; +} + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +stbi_inline static int stbi__at_eof(stbi__context *s) +{ + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +// nothing +#else +static void stbi__skip(stbi__context *s, int n) +{ + if (n == 0) return; // already there! 
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) +// nothing +#else +static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) +{ + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen); + res = (count == (n-blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer+n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static int stbi__get16be(stbi__context *s) +{ + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static stbi__uint32 stbi__get32be(stbi__context *s) +{ + stbi__uint32 z = stbi__get16be(s); + return (z << 16) + stbi__get16be(s); +} +#endif + +#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) +// nothing +#else +static int stbi__get16le(stbi__context *s) +{ + int z = stbi__get8(s); + return z + (stbi__get8(s) << 8); +} +#endif + +#ifndef STBI_NO_BMP +static stbi__uint32 stbi__get32le(stbi__context *s) +{ + stbi__uint32 z = stbi__get16le(s); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; +} +#endif + +#define STBI__BYTECAST(x) ((stbi_uc) ((x) & 255)) // truncate int to byte without warnings + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +////////////////////////////////////////////////////////////////////////////// +// +// generic converter from built-in img_n to req_comp +// individual types do this automatically as much as possible (e.g. jpeg +// does all cases internally since it needs to colorspace convert anyway, +// and it never has alpha, so very few cases ). 
png can automatically +// interleave an alpha=255 channel, but falls back to this for other cases +// +// assume data buffer is malloced, so malloc a new one and free that one +// only failure mode is malloc failing + +static stbi_uc stbi__compute_y(int r, int g, int b) +{ + return (stbi_uc) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + unsigned char *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + unsigned char *src = data + j * x * img_n ; + unsigned char *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 stbi__compute_y_16(int r, int g, int b) +{ + return (stbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + stbi__uint16 *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + stbi__uint16 *src = data + j * x * img_n ; + stbi__uint16 *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, 
so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) +{ + int i,k,n; + float *output; + if (!data) return NULL; + output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + } + } + if (n < comp) { + for (i=0; i < x*y; ++i) { + output[i*comp + n] = data[i*comp + n]/255.0f; + } + } + STBI_FREE(data); + return output; +} +#endif + +#ifndef STBI_NO_HDR +#define stbi__float2int(x) ((int) (x)) +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) +{ + int i,k,n; + stbi_uc *output; + if (!data) return NULL; + output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + if (k < comp) { + float z = data[i*comp+k] * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + } + STBI_FREE(data); + return output; +} +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder +// +// simple implementation +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 
'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - some SIMD kernels for common paths on targets with SSE2/NEON +// - uses a lot of intermediate memory, could cache poorly + +#ifndef STBI_NO_JPEG + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct +{ + stbi_uc fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + stbi__uint16 code[256]; + stbi_uc values[256]; + stbi_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} stbi__huffman; + +typedef struct +{ + stbi__context *s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + stbi__uint16 dequant[4][64]; + stbi__int16 fast_ac[4][1 << FAST_BITS]; + +// sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + +// definition of jpeg image component + struct + { + int id; + int h,v; + int tq; + int hd,ha; + int dc_pred; + + int x,y,w2,h2; + stbi_uc *data; + void *raw_data, *raw_coeff; + stbi_uc *linebuf; + short *coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; + + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; + + int scan_n, order[4]; + int restart_interval, todo; + +// kernels + void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step); + stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs); +} stbi__jpeg; + +static int stbi__build_huffman(stbi__huffman *h, int *count) +{ + int i,j,k=0; + unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i=0; i < 16; ++i) { + for (j=0; j < count[i]; ++j) { + h->size[k++] = (stbi_uc) (i+1); + if(k >= 257) return stbi__err("bad size list","Corrupt JPEG"); + } + } + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for(j=1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (stbi__uint16) (code++); + if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16-j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i=0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS-s); + int m = 1 << (FAST_BITS-s); + for (j=0; j < m; ++j) { + h->fast[c+j] = (stbi_uc) i; + } + } + } + return 1; +} + +// build a table that decodes both magnitude and value of small ACs in +// one go. 
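+// For illustration (example values only): each nonzero fast_ac entry packs
+//    value * 256 + run * 16 + (code_len + magnitude_bits)
+// into a 16-bit int. A run of 2 zeros followed by the value -3, decoded from a
+// 5-bit Huffman code plus 2 magnitude bits, is stored as
+//    (-3 * 256) + (2 * 16) + (5 + 2) = -729,
+// and the decode loop later recovers run = (r >> 4) & 15, combined length
+// = r & 15, and value = r >> 8.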
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) +{ + int i; + for (i=0; i < (1 << FAST_BITS); ++i) { + stbi_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; + + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits)); + } + } + } +} + +static void stbi__grow_buffer_unsafe(stbi__jpeg *j) +{ + do { + unsigned int b = j->nomore ? 0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char) c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} + +// (1 << n) - 1 +static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; + +// decode a jpeg huffman value from the bitstream +stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) +{ + unsigned int temp; + int c,k; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k=FAST_BITS+1 ; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! code not found + j->code_bits -= 16; + return -1; + } + + if (k > j->code_bits) + return -1; + + // convert the huffman code to the symbol id + c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k]; + if(c < 0 || c >= 256) // symbol id out of bounds! 
+         return -1;
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// bias[n] = (-1<<n) + 1
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k + (stbi__jbias[n] & (sgn - 1));
+}
+
+// get some unsigned bits
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < n) return 0; // ran out of bits from stream, return 0s instead of continuing
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k;
+}
+
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int k;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   if (j->code_bits < 1) return 0; // ran out of bits from stream, return 0s instead of continuing
+   k = j->code_buffer;
+   j->code_buffer <<= 1;
+   --j->code_bits;
+   return k & 0x80000000;
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major?
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block--
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ?
stbi__extend_receive(j, t) : 0; + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta","Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, dequant[0])) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * dequant[zig]); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]); + } + } + } while (k < 64); + return 1; +} + +static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b) +{ + int diff,dc; + int t; + if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data,0,64*sizeof(data[0])); // 0 all the ac values now + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? 
stbi__extend_receive(j, t) : 0; + + if (!stbi__addints_valid(j->img_comp[b].dc_pred, diff)) return stbi__err("bad delta", "Corrupt JPEG"); + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + if (!stbi__mul2shorts_valid(dc, 1 << j->succ_low)) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + data[0] = (short) (dc * (1 << j->succ_low)); + } else { + // refinement scan for DC coefficient + if (stbi__jpeg_get_bit(j)) + data[0] += (short) (1 << j->succ_low); + } + return 1; +} + +// @OPTIMIZE: store non-zigzagged during the decode passes, +// and only de-zigzag when dequantizing +static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac) +{ + int k; + if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + if (s > j->code_bits) return stbi__err("bad huffman code", "Combined length longer than code bits available"); + j->code_buffer <<= s; + j->code_bits -= s; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * (1 << shift)); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } else { + k += r; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); + } + } + } while (k <= j->spec_end); + } else { + // refinement scan for these AC coefficients + + short bit = (short) (1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short *p = &data[stbi__jpeg_dezigzag[k]]; + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } else { + k = j->spec_start; + do { + int r,s; + int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + r = 64; // force end of block + } else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } else { + if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (stbi__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short *p = &data[stbi__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } else { + if (r == 0) { + *p = (short) s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; +} + +// take a -128..127 value and stbi__clamp it and convert to 0..255 +stbi_inline static stbi_uc stbi__clamp(int x) +{ + // trick to use a single test to catch both cases + if ((unsigned int) x > 255) { 
+ if (x < 0) return 0; + if (x > 255) return 255; + } + return (stbi_uc) x; +} + +#define stbi__f2f(x) ((int) (((x) * 4096 + 0.5))) +#define stbi__fsh(x) ((x) * 4096) + +// derived from jidctint -- DCT_ISLOW +#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * stbi__f2f(0.5411961f); \ + t2 = p1 + p3*stbi__f2f(-1.847759065f); \ + t3 = p1 + p2*stbi__f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = stbi__fsh(p2+p3); \ + t1 = stbi__fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*stbi__f2f( 1.175875602f); \ + t0 = t0*stbi__f2f( 0.298631336f); \ + t1 = t1*stbi__f2f( 2.053119869f); \ + t2 = t2*stbi__f2f( 3.072711026f); \ + t3 = t3*stbi__f2f( 1.501321110f); \ + p1 = p5 + p1*stbi__f2f(-0.899976223f); \ + p2 = p5 + p2*stbi__f2f(-2.562915447f); \ + p3 = p3*stbi__f2f(-1.961570560f); \ + p4 = p4*stbi__f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + +static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) +{ + int i,val[64],*v=val; + stbi_uc *o; + short *d = data; + + // columns + for (i=0; i < 8; ++i,++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 + && d[40]==0 && d[48]==0 && d[56]==0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0]*4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 += 512; x2 += 512; x3 += 512; + v[ 0] = (x0+t3) >> 10; + v[56] = (x0-t3) >> 10; + v[ 8] = (x1+t2) >> 10; + v[48] = (x1-t2) >> 10; + v[16] = (x2+t1) >> 10; + v[40] = (x2-t1) >> 10; + v[24] = (x3+t0) >> 10; + v[32] = (x3-t0) >> 10; + } + } + + for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { + // no fast case since the first 1D IDCT spread components out + STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128<<17); + x1 += 65536 + (128<<17); + x2 += 65536 + (128<<17); + x3 += 65536 + (128<<17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = stbi__clamp((x0+t3) >> 17); + o[7] = stbi__clamp((x0-t3) >> 17); + o[1] = stbi__clamp((x1+t2) >> 17); + o[6] = stbi__clamp((x1-t2) >> 17); + o[2] = stbi__clamp((x2+t1) >> 17); + o[5] = stbi__clamp((x2-t1) >> 17); + o[3] = stbi__clamp((x3+t0) >> 17); + o[4] = stbi__clamp((x3-t0) >> 17); + } +} + +#ifdef STBI_SSE2 +// sse2 integer IDCT. not the fastest possible implementation but it +// produces bit-identical results to the generic C version so it's +// fully "transparent". 
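+// The dct_const() rotation constants built below reuse the same 12-bit
+// fixed-point encoding as the C path: stbi__f2f(x) is (int)(x * 4096 + 0.5), so,
+// as a worked example, stbi__f2f(0.5411961f) evaluates to 2217
+// (0.5411961 * 4096 is about 2216.7).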
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + // This is constructed to match our regular (generic) integer IDCT exactly. + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + // dot product constant: even elems=x, odd elems=y + #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + + // out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) + // out(1) = c1[even]*x + c1[odd]*y + #define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + // out = in << 12 (in 16-bit, out 32-bit) + #define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + // wide add + #define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + // wide sub + #define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + // butterfly a/b, add bias, then shift by "s" and pack + #define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + // 8-bit interleave step (for transposes) + #define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + // 16-bit interleave step (for transposes) + #define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + + #define dct_pass(bias,shift) \ + { \ + /* even part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), 
stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); + + // rounding biases in column/row passes, see stbi__idct_block for explanation. + __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); + + // load + row0 = _mm_load_si128((const __m128i *) (data + 0*8)); + row1 = _mm_load_si128((const __m128i *) (data + 1*8)); + row2 = _mm_load_si128((const __m128i *) (data + 2*8)); + row3 = _mm_load_si128((const __m128i *) (data + 3*8)); + row4 = _mm_load_si128((const __m128i *) (data + 4*8)); + row5 = _mm_load_si128((const __m128i *) (data + 5*8)); + row6 = _mm_load_si128((const __m128i *) (data + 6*8)); + row7 = _mm_load_si128((const __m128i *) (data + 7*8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... + + // store + _mm_storel_epi64((__m128i *) out, p0); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p2); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p1); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p3); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif // STBI_SSE2 + +#ifdef STBI_NEON + +// NEON integer IDCT. should produce bit-identical +// results to the generic C version. 
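+// Illustrative note: the dct_widen macro below computes in << 12 into 32-bit
+// lanes via vshll_n_s16(in, 12); the SSE2 path above reaches the same scaling by
+// interleaving with zero and then shifting right by 4, i.e. (in << 16) >> 4.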
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + +// wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0*8); + row1 = vld1q_s16(data + 1*8); + row2 = vld1q_s16(data + 2*8); + row3 = vld1q_s16(data + 3*8); + row4 = vld1q_s16(data + 4*8); + row5 = 
vld1q_s16(data + 5*8); + row6 = vld1q_s16(data + 6*8); + row7 = vld1q_s16(data + 7*8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { +// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. +// whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. + dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! + + // 8x8 8-bit transpose pass 1 + dct_trn8_8(p0, p1); + dct_trn8_8(p2, p3); + dct_trn8_8(p4, p5); + dct_trn8_8(p6, p7); + + // pass 2 + dct_trn8_16(p0, p2); + dct_trn8_16(p1, p3); + dct_trn8_16(p4, p6); + dct_trn8_16(p5, p7); + + // pass 3 + dct_trn8_32(p0, p4); + dct_trn8_32(p1, p5); + dct_trn8_32(p2, p6); + dct_trn8_32(p3, p7); + + // store + vst1_u8(out, p0); out += out_stride; + vst1_u8(out, p1); out += out_stride; + vst1_u8(out, p2); out += out_stride; + vst1_u8(out, p3); out += out_stride; + vst1_u8(out, p4); out += out_stride; + vst1_u8(out, p5); out += out_stride; + vst1_u8(out, p6); out += out_stride; + vst1_u8(out, p7); + +#undef dct_trn8_8 +#undef dct_trn8_16 +#undef dct_trn8_32 + } + +#undef dct_long_mul +#undef dct_long_mac +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_pass +} + +#endif // STBI_NEON + +#define STBI__MARKER_none 0xff +// if there's a pending marker from the entropy stream, return that +// otherwise, fetch from the stream and get a marker. 
if there's no +// marker, return 0xff, which is never a valid marker value +static stbi_uc stbi__get_marker(stbi__jpeg *j) +{ + stbi_uc x; + if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; } + x = stbi__get8(j->s); + if (x != 0xff) return STBI__MARKER_none; + while (x == 0xff) + x = stbi__get8(j->s); // consume repeated 0xff fill bytes + return x; +} + +// in each scan, we'll have scan_n components, and the order +// of the components is specified by order[] +#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7) + +// after a restart interval, stbi__jpeg_reset the entropy decoder and +// the dc prediction +static void stbi__jpeg_reset(stbi__jpeg *j) +{ + j->code_bits = 0; + j->code_buffer = 0; + j->nomore = 0; + j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0; + j->marker = STBI__MARKER_none; + j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff; + j->eob_run = 0; + // no more than 1<<31 MCUs if no restart_interal? that's plenty safe, + // since we don't even allow 1<<30 pixels +} + +static int stbi__parse_entropy_coded_data(stbi__jpeg *z) +{ + stbi__jpeg_reset(z); + if (!z->progressive) { + if (z->scan_n == 1) { + int i,j; + STBI_SIMD_ALIGN(short, data[64]); + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + // if it's NOT a restart, then just bail, so we get corrupt data + // rather than no data + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + STBI_SIMD_ALIGN(short, data[64]); + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x)*8; + int y2 = (j*z->img_comp[n].v + y)*8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } else { + if (z->scan_n == 1) { + int i,j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x); + int y2 = (j*z->img_comp[n].v + y); + short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } +} + +static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) +{ + int i; + for (i=0; i < 64; ++i) + data[i] *= dequant[i]; +} + +static void stbi__jpeg_finish(stbi__jpeg *z) +{ + if (z->progressive) { + // dequantize and idct the data + int i,j,n; + for (n=0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + } + } + } + } +} + +static int stbi__process_marker(stbi__jpeg *z, int m) +{ + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker","Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s)-2; + while (L > 0) { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15,i; + if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG"); + if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); + + for (i=0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 129 : 65); + } + return L==0; + + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s)-2; + while (L > 0) { + stbi_uc *v; + int sizes[16],i,n=0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); + for (i=0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; + } + if(n > 256) return stbi__err("bad DHT header","Corrupt JPEG"); // Loop over i < n would write past end of values! 
+ L -= 17; + if (tc == 0) { + if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; + v = z->huff_dc[th].values; + } else { + if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i=0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L==0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = stbi__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return stbi__err("bad COM len","Corrupt JPEG"); + else + return stbi__err("bad APP len","Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = {'J','F','I','F','\0'}; + int ok = 1; + int i; + for (i=0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = {'A','d','o','b','e','\0'}; + int ok = 1; + int i; + for (i=0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform + L -= 6; + } + } + + stbi__skip(z->s, L); + return 1; + } + + return stbi__err("unknown marker","Corrupt JPEG"); +} + +// after we see SOS +static int stbi__process_scan_header(stbi__jpeg *z) +{ + int i; + int Ls = stbi__get16be(z->s); + z->scan_n = stbi__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG"); + if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG"); + for (i=0; i < z->scan_n; ++i) { + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) return 0; // no match + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; +} + +static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) +{ + int i; + for (i=0; i < ncomp; ++i) { + if (z->img_comp[i].raw_data) { + STBI_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + STBI_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + STBI_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; +} + +static int stbi__process_frame_header(stbi__jpeg *z, int scan) +{ + stbi__context *s = z->s; + int Lf,p,i,q, h_max=1,v_max=1,c; + Lf = stbi__get16be(s); if (Lf < 11) 
return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG + p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + c = stbi__get8(s); + if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG"); + s->img_n = c; + for (i=0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG"); + + z->rgb = 0; + for (i=0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = { 'R', 'G', 'B' }; + z->img_comp[i].id = stbi__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = stbi__get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG"); + z->img_comp[i].tq = stbi__get8(s); if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG"); + } + + if (scan != STBI__SCAN_load) return 1; + + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); + + for (i=0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // check that plane subsampling factors are integer ratios; our resamplers can't deal with fractional ratios + // and I've never seen a non-corrupted JPEG file actually use them + for (i=0; i < s->img_n; ++i) { + if (h_max % z->img_comp[i].h != 0) return stbi__err("bad H","Corrupt JPEG"); + if (v_max % z->img_comp[i].v != 0) return stbi__err("bad V","Corrupt JPEG"); + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; + + for (i=0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; +} + +// use comparisons since in some cases we handle more than one case (e.g. SOF) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) + +#define stbi__SOF_progressive(x) ((x) == 0xc2) + +static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) +{ + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); + if (scan == STBI__SCAN_type) return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z,m)) return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) return 0; + return 1; +} + +static int stbi__skip_jpeg_junk_at_end(stbi__jpeg *j) +{ + // some JPEGs have junk at end, skip over it but if we find what looks + // like a valid marker, resume there + while (!stbi__at_eof(j->s)) { + int x = stbi__get8(j->s); + while (x == 255) { // might be a marker + if (stbi__at_eof(j->s)) return STBI__MARKER_none; + x = stbi__get8(j->s); + if (x != 0x00 && x != 0xff) { + // not a stuffed zero or lead-in to another marker, looks + // like an actual marker, return it + return x; + } + // stuffed zero has x=0 now which ends the loop, meaning we go + // back to regular scan loop. + // repeated 0xff keeps trying to read the next byte of the marker. 
+ } + } + return STBI__MARKER_none; +} + +// decode image to YCbCr format +static int stbi__decode_jpeg_image(stbi__jpeg *j) +{ + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) return 0; + if (!stbi__parse_entropy_coded_data(j)) return 0; + if (j->marker == STBI__MARKER_none ) { + j->marker = stbi__skip_jpeg_junk_at_end(j); + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } + m = stbi__get_marker(j); + if (STBI__RESTART(m)) + m = stbi__get_marker(j); + } else if (stbi__DNL(m)) { + int Ld = stbi__get16be(j->s); + stbi__uint32 NL = stbi__get16be(j->s); + if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); + m = stbi__get_marker(j); + } else { + if (!stbi__process_marker(j, m)) return 1; + m = stbi__get_marker(j); + } + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, + int w, int hs); + +#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) + +static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i=0; i < w; ++i) + out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); + return out; +} + +static stbi_uc* stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples horizontally for every one in input + int i; + stbi_uc *input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = stbi__div4(input[0]*3 + input[1] + 2); + for (i=1; i < w-1; ++i) { + int n = 3*input[i]+2; + out[i*2+0] = stbi__div4(n+input[i-1]); + out[i*2+1] = stbi__div4(n+input[i+1]); + } + out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2); + out[i*2+1] = input[w-1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define stbi__div16(x) ((stbi_uc) ((x) >> 4)) + +static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i,t0,t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + out[0] = stbi__div4(t1+2); + for (i=1; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i=0,t0,t1; + + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 
2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w-1) & ~7); i += 8) { +#if defined(STBI_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *) (out + i*2), outv); +#elif defined(STBI_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. 
+ int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i*2, o); +#endif + + // "previous" value for next iter + t1 = 3*in_near[i+7] + in_far[i+7]; + } + + t0 = t1; + t1 = 3*in_near[i] + in_far[i]; + out[i*2] = stbi__div16(3*t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} +#endif + +static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // resample with nearest-neighbor + int i,j; + STBI_NOTUSED(in_far); + for (i=0; i < w; ++i) + for (j=0; j < hs; ++j) + out[i*hs+j] = in_near[i]; + return out; +} + +// this is a reduced-precision calculation of YCbCr-to-RGB introduced +// to make sure the code produces the same results in both SIMD and scalar +#define stbi__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) +{ + int i = 0; + +#ifdef STBI_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); + __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); + __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); + __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); + __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i+7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i *) (out + 0), o0); + _mm_storeu_si128((__m128i *) (out + 16), o1); + out += 32; + } + } +#endif + +#ifdef STBI_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
+ uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); + int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); + int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); + int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); + + for (; i+7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8*4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} +#endif + +// set up the kernels +static void stbi__setup_jpeg(stbi__jpeg *j) +{ + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; + +#ifdef STBI_SSE2 + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } +#endif + +#ifdef STBI_NEON + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif +} + +// clean up the temporary component buffers +static void stbi__cleanup_jpeg(stbi__jpeg *j) +{ + stbi__free_jpeg_components(j, j->s->img_n, 0); +} + +typedef struct +{ + resample_row_func resample; + stbi_uc *line0,*line1; + int hs,vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi__resample; + +// fast 0..255 * 0..255 => 0..255 rounded multiplication +static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) +{ + unsigned int t = x*y + 128; + return (stbi_uc) ((t + (t >>8)) >> 8); +} + +static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) +{ + int n, decode_n, is_rgb; + z->s->img_n = 0; // make 
stbi__cleanup_jpeg safe + + // validate req_comp + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + + // load a jpeg image from whichever source, but leave in YCbCr format + if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; } + + // determine actual number of components to generate + n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1; + + is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif)); + + if (z->s->img_n == 3 && n < 3 && !is_rgb) + decode_n = 1; + else + decode_n = z->s->img_n; + + // nothing to do if no components requested; check this now to avoid + // accessing uninitialized coutput[0] later + if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; } + + // resample and color-convert + { + int k; + unsigned int i,j; + stbi_uc *output; + stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL }; + + stbi__resample res_comp[4]; + + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + + // allocate line buffer big enough for upsampling off the edges + // with upsample factor of 4 + z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3); + if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + r->hs = z->img_h_max / z->img_comp[k].h; + r->vs = z->img_v_max / z->img_comp[k].v; + r->ystep = r->vs >> 1; + r->w_lores = (z->s->img_x + r->hs-1) / r->hs; + r->ypos = 0; + r->line0 = r->line1 = z->img_comp[k].data; + + if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1; + else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2; + else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2; + else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel; + else r->resample = stbi__resample_row_generic; + } + + // can't error after this so, this is safe + output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1); + if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); } + + // now go ahead and resample + for (j=0; j < z->s->img_y; ++j) { + stbi_uc *out = output + n * z->s->img_x * j; + for (k=0; k < decode_n; ++k) { + stbi__resample *r = &res_comp[k]; + int y_bot = r->ystep >= (r->vs >> 1); + coutput[k] = r->resample(z->img_comp[k].linebuf, + y_bot ? r->line1 : r->line0, + y_bot ? 
r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + stbi_uc *y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(coutput[0][i], m); + out[1] = stbi__blinn_8x8(coutput[1][i], m); + out[2] = stbi__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(255 - out[0], m); + out[1] = stbi__blinn_8x8(255 - out[1], m); + out[2] = stbi__blinn_8x8(255 - out[2], m); + out += n; + } + } else { // YCbCr + alpha? Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else + for (i=0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } else { + if (is_rgb) { + if (n == 1) + for (i=0; i < z->s->img_x; ++i) + *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i=0; i < z->s->img_x; ++i, out += 2) { + out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); + stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); + stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); + out[0] = stbi__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } else { + stbi_uc *y = coutput[0]; + if (n == 1) + for (i=0; i < z->s->img_x; ++i) out[i] = y[i]; + else + for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + } + } + } + stbi__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output + return output; + } +} + +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + unsigned char* result; + stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__errpuc("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + STBI_NOTUSED(ri); + j->s = s; + stbi__setup_jpeg(j); + result = load_jpeg_image(j, x,y,comp,req_comp); + STBI_FREE(j); + return result; +} + +static int stbi__jpeg_test(stbi__context *s) +{ + int r; + stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg)); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + stbi__setup_jpeg(j); + r = stbi__decode_jpeg_header(j, STBI__SCAN_type); + stbi__rewind(s); + STBI_FREE(j); + return r; +} + +static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp) +{ + if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) { + stbi__rewind( j->s ); + return 0; + } + if (x) *x = j->s->img_x; + if (y) *y = j->s->img_y; + if (comp) *comp = j->s->img_n >= 3 ? 3 : 1; + return 1; +} + +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp) +{ + int result; + stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg))); + if (!j) return stbi__err("outofmem", "Out of memory"); + memset(j, 0, sizeof(stbi__jpeg)); + j->s = s; + result = stbi__jpeg_info_raw(j, x, y, comp); + STBI_FREE(j); + return result; +} +#endif + +// public domain zlib decode v0.2 Sean Barrett 2006-11-18 +// simple implementation +// - all input must be provided in an upfront buffer +// - all output is written to a single output buffer (can malloc/realloc) +// performance +// - fast huffman + +#ifndef STBI_NO_ZLIB + +// fast-way is faster to check than jpeg huffman, but slow way is slower +#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables +#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1) +#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet + +// zlib-style huffman encoding +// (jpegs packs from left, zlib from right, so can't share code) +typedef struct +{ + stbi__uint16 fast[1 << STBI__ZFAST_BITS]; + stbi__uint16 firstcode[16]; + int maxcode[17]; + stbi__uint16 firstsymbol[16]; + stbi_uc size[STBI__ZNSYMS]; + stbi__uint16 value[STBI__ZNSYMS]; +} stbi__zhuffman; + +stbi_inline static int stbi__bitreverse16(int n) +{ + n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1); + n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2); + n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4); + n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8); + return n; +} + +stbi_inline static int stbi__bit_reverse(int v, int bits) +{ + STBI_ASSERT(bits <= 16); + // to bit reverse n bits, reverse 16 and shift + // e.g. 
11 bits, bit reverse and shift away 5 + return stbi__bitreverse16(v) >> (16-bits); +} + +static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num) +{ + int i,k=0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i=0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i=1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return stbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i=1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (stbi__uint16) code; + z->firstsymbol[i] = (stbi__uint16) k; + code = (code + sizes[i]); + if (sizes[i]) + if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); + z->maxcode[i] = code << (16-i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i=0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); + z->size [c] = (stbi_uc ) s; + z->value[c] = (stbi__uint16) i; + if (s <= STBI__ZFAST_BITS) { + int j = stbi__bit_reverse(next_code[s],s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +// zlib-from-memory implementation for PNG reading +// because PNG allows splitting the zlib stream arbitrarily, +// and it's annoying structurally to have PNG call ZLIB call PNG, +// we require PNG read all the IDATs and combine them into a single +// memory buffer + +typedef struct +{ + stbi_uc *zbuffer, *zbuffer_end; + int num_bits; + stbi__uint32 code_buffer; + + char *zout; + char *zout_start; + char *zout_end; + int z_expandable; + + stbi__zhuffman z_length, z_distance; +} stbi__zbuf; + +stbi_inline static int stbi__zeof(stbi__zbuf *z) +{ + return (z->zbuffer >= z->zbuffer_end); +} + +stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) +{ + return stbi__zeof(z) ? 0 : *z->zbuffer++; +} + +static void stbi__fill_bits(stbi__zbuf *z) +{ + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) +{ + unsigned int k; + if (z->num_bits < n) stbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s,k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = stbi__bit_reverse(a->code_buffer, 16); + for (s=STBI__ZFAST_BITS+1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) return -1; // invalid code! + // code size is s, so: + b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! + if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s; + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + return -1; /* report error for unexpected end of data. 
*/ + } + stbi__fill_bits(a); + } + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return stbi__zhuffman_decode_slowpath(a, z); +} + +static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes +{ + char *q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG"); + cur = (unsigned int) (z->zout - z->zout_start); + limit = old_limit = (unsigned) (z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) return stbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static const int stbi__zlength_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + +static const int stbi__zlength_extra[31]= +{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + +static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, +257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + +static const int stbi__zdist_extra[32] = +{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +static int stbi__parse_huffman_block(stbi__zbuf *a) +{ + char *zout = a->zout; + for(;;) { + int z = stbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!stbi__zexpand(a, zout, 1)) return 0; + zout = a->zout; + } + *zout++ = (char) z; + } else { + stbi_uc *p; + int len,dist; + if (z == 256) { + a->zout = zout; + return 1; + } + if (z >= 286) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, length codes 286 and 287 must not appear in compressed data + z -= 257; + len = stbi__zlength_base[z]; + if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); + z = stbi__zhuffman_decode(a, &a->z_distance); + if (z < 0 || z >= 30) return stbi__err("bad huffman code","Corrupt PNG"); // per DEFLATE, distance codes 30 and 31 must not appear in compressed data + dist = stbi__zdist_base[z]; + if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); + if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!stbi__zexpand(a, zout, len)) return 0; + zout = a->zout; + } + p = (stbi_uc *) (zout - dist); + if (dist == 1) { // run of one byte; common in images. 
+ stbi_uc v = *p; + if (len) { do *zout++ = v; while (--len); } + } else { + if (len) { do *zout++ = *p++; while (--len); } + } + } + } +} + +static int stbi__compute_huffman_codes(stbi__zbuf *a) +{ + static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + stbi__zhuffman z_codelength; + stbi_uc lencodes[286+32+137];//padding for maximum single op + stbi_uc codelength_sizes[19]; + int i,n; + + int hlit = stbi__zreceive(a,5) + 257; + int hdist = stbi__zreceive(a,5) + 1; + int hclen = stbi__zreceive(a,4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i=0; i < hclen; ++i) { + int s = stbi__zreceive(a,3); + codelength_sizes[length_dezigzag[i]] = (stbi_uc) s; + } + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < ntot) { + int c = stbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (stbi_uc) c; + else { + stbi_uc fill = 0; + if (c == 16) { + c = stbi__zreceive(a,2)+3; + if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n-1]; + } else if (c == 17) { + c = stbi__zreceive(a,3)+3; + } else if (c == 18) { + c = stbi__zreceive(a,7)+11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes+n, fill, c); + n += c; + } + } + if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG"); + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; + return 1; +} + +static int stbi__parse_uncompressed_block(stbi__zbuf *a) +{ + stbi_uc header[4]; + int len,nlen,k; + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = stbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; +} + +static int stbi__parse_zlib_header(stbi__zbuf *a) +{ + int cmf = stbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = stbi__zget8(a); + if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... 
but who cares, we fully buffer output + return 1; +} + +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = +{ + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 +}; +static const stbi_uc stbi__zdefault_distance[32] = +{ + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 +}; +/* +Init algorithm: +{ + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) stbi__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) stbi__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) stbi__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) stbi__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) stbi__zdefault_distance[i] = 5; +} +*/ + +static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) +{ + int final, type; + if (parse_header) + if (!stbi__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = stbi__zreceive(a,1); + type = stbi__zreceive(a,2); + if (type == 0) { + if (!stbi__parse_uncompressed_block(a)) return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; + } else { + if (!stbi__compute_huffman_codes(a)) return 0; + } + if (!stbi__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; +} + +static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header) +{ + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return stbi__parse_zlib(a, parse_header); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) +{ + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + 
ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int) (a.zout - a.zout_start); + else + return -1; +} + +STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(16384); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer+len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int) (a.zout - a.zout_start); + else + return -1; +} +#endif + +// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 +// simple implementation +// - only 8-bit samples +// - no CRC checking +// - allocates lots of intermediate memory +// - avoids problem of streaming data between subsystems +// - avoids explicit window management +// performance +// - uses stb_zlib, a PD zlib implementation with fast huffman decoding + +#ifndef STBI_NO_PNG +typedef struct +{ + stbi__uint32 length; + stbi__uint32 type; +} stbi__pngchunk; + +static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) +{ + stbi__pngchunk c; + c.length = stbi__get32be(s); + c.type = stbi__get32be(s); + return c; +} + +static int stbi__check_png_header(stbi__context *s) +{ + static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i=0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG"); + return 1; +} + +typedef struct +{ + stbi__context *s; + stbi_uc *idata, *expanded, *out; + int depth; +} stbi__png; + + +enum { + STBI__F_none=0, + STBI__F_sub=1, + STBI__F_up=2, + STBI__F_avg=3, + STBI__F_paeth=4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static stbi_uc first_row_filter[5] = +{ + STBI__F_none, + STBI__F_sub, + STBI__F_none, + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static int stbi__paeth(int a, int b, int c) +{ + int p = a + b - c; + int pa = abs(p-a); + int pb = abs(p-b); + int pc = abs(p-c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; +} + +static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + +// create the png data from post-deflated data +static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) +{ + int bytes = (depth == 16? 
2 : 1); + stbi__context *s = a->s; + stbi__uint32 i,j,stride = x*out_n*bytes; + stbi__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n*bytes; + int filter_bytes = img_n*bytes; + int width = x; + + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); + a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return stbi__err("outofmem", "Out of memory"); + + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + + for (j=0; j < y; ++j) { + stbi_uc *cur = a->out + stride*j; + stbi_uc *prior; + int filter = *raw++; + + if (filter > 4) + return stbi__err("invalid filter","Corrupt PNG"); + + if (depth < 8) { + if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG"); + cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k=0; k < filter_bytes; ++k) { + switch (filter) { + case STBI__F_none : cur[k] = raw[k]; break; + case STBI__F_sub : cur[k] = raw[k]; break; + case STBI__F_up : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; + case STBI__F_avg : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break; + case STBI__F_paeth : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break; + case STBI__F_avg_first : cur[k] = raw[k]; break; + case STBI__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes+1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1)*filter_bytes; + #define STBI__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
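/* [editor's illustration -- not part of the patch] Each STBI__CASE(f) { body } break;
   line below expands to an ordinary case label with a per-byte loop, e.g.

       case STBI__F_sub:
          for (k=0; k < nk; ++k)
             { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); }
          break;

   so the filter switch runs once per scanline rather than once per byte. */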
+ case STBI__F_none: memcpy(cur, raw, nk); break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break; + STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break; + } + #undef STBI__CASE + raw += nk; + } else { + STBI_ASSERT(img_n+1 == out_n); + #define STBI__CASE(f) \ + case f: \ + for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \ + for (k=0; k < filter_bytes; ++k) + switch (filter) { + STBI__CASE(STBI__F_none) { cur[k] = raw[k]; } break; + STBI__CASE(STBI__F_sub) { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break; + STBI__CASE(STBI__F_up) { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break; + STBI__CASE(STBI__F_avg) { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break; + STBI__CASE(STBI__F_paeth) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break; + STBI__CASE(STBI__F_avg_first) { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break; + STBI__CASE(STBI__F_paeth_first) { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break; + } + #undef STBI__CASE + + // the loop above sets the high byte of the pixels' alpha, but for + // 16 bit png files we also need the low byte set. we'll do that here. + if (depth == 16) { + cur = a->out + stride*j; // start at the beginning of the row again + for (i=0; i < x; ++i,cur+=output_bytes) { + cur[filter_bytes+1] = 255; + } + } + } + } + + // we make a separate pass to expand bits to pixels; for performance, + // this could run two scanlines behind the above code, so it won't + // intefere with filtering but will still be in the cache. + if (depth < 8) { + for (j=0; j < y; ++j) { + stbi_uc *cur = a->out + stride*j; + stbi_uc *in = a->out + stride*j + x*out_n - img_width_bytes; + // unpack 1/2/4-bit into a 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit + // png guarante byte alignment, if width is not multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop + stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range + + // note that the final byte might overshoot and write more data than desired. + // we can allocate enough data that this never writes out of memory, but it + // could also overwrite the next scanline. can it overwrite non-empty data + // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel. 
+ // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k=x*img_n; k >= 2; k-=2, ++in) { + *cur++ = scale * ((*in >> 4) ); + *cur++ = scale * ((*in ) & 0x0f); + } + if (k > 0) *cur++ = scale * ((*in >> 4) ); + } else if (depth == 2) { + for (k=x*img_n; k >= 4; k-=4, ++in) { + *cur++ = scale * ((*in >> 6) ); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in ) & 0x03); + } + if (k > 0) *cur++ = scale * ((*in >> 6) ); + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); + } else if (depth == 1) { + for (k=x*img_n; k >= 8; k-=8, ++in) { + *cur++ = scale * ((*in >> 7) ); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in ) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7) ); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride*j; + if (img_n == 1) { + for (q=x-1; q >= 0; --q) { + cur[q*2+1] = 255; + cur[q*2+0] = cur[q]; + } + } else { + STBI_ASSERT(img_n == 3); + for (q=x-1; q >= 0; --q) { + cur[q*4+3] = 255; + cur[q*4+2] = cur[q*3+2]; + cur[q*4+1] = cur[q*3+1]; + cur[q*4+0] = cur[q*3+0]; + } + } + } + } + } else if (depth == 16) { + // force the image data from big-endian to platform-native. + // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + stbi_uc *cur = a->out; + stbi__uint16 *cur16 = (stbi__uint16*)cur; + + for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; +} + +static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) +{ + int bytes = (depth == 16 ? 
2 : 1); + int out_bytes = out_n * bytes; + stbi_uc *final; + int p; + if (!interlaced) + return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) return stbi__err("outofmem", "Out of memory"); + for (p=0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i,j,x,y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; + if (x && y) { + stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; + } + for (j=0; j < y; ++j) { + for (i=0; i < x; ++i) { + int out_y = j*yspc[p]+yorig[p]; + int out_x = i*xspc[p]+xorig[p]; + memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes, + a->out + (j*x+i)*out_bytes, out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; +} + +static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i=0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } else { + for (i=0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi__uint16 *p = (stbi__uint16*) z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 65535); + p += 2; + } + } else { + for (i = 0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n) +{ + stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y; + stbi_uc *p, *temp_out, *orig = a->out; + + p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0); + if (p == NULL) return stbi__err("outofmem", "Out of memory"); + + // between here and free(out) below, exitting would leak + temp_out = p; + + if (pal_img_n == 3) { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p += 3; + } + } else { + for (i=0; i < pixel_count; ++i) { + int n = orig[i]*4; + p[0] = palette[n ]; + p[1] = palette[n+1]; + p[2] = palette[n+2]; + p[3] = palette[n+3]; + p += 4; + } + } + STBI_FREE(a->out); + a->out = temp_out; + + STBI_NOTUSED(len); + + return 1; +} + +static int stbi__unpremultiply_on_load_global = 0; +static int stbi__de_iphone_flag_global = 0; + +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_global = flag_true_if_should_convert; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__unpremultiply_on_load stbi__unpremultiply_on_load_global +#define stbi__de_iphone_flag stbi__de_iphone_flag_global +#else +static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set; +static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set; + +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply) +{ + stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply; + stbi__unpremultiply_on_load_set = 1; +} + +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert) +{ + stbi__de_iphone_flag_local = flag_true_if_should_convert; + stbi__de_iphone_flag_set = 1; +} + +#define stbi__unpremultiply_on_load (stbi__unpremultiply_on_load_set \ + ? stbi__unpremultiply_on_load_local \ + : stbi__unpremultiply_on_load_global) +#define stbi__de_iphone_flag (stbi__de_iphone_flag_set \ + ? 
stbi__de_iphone_flag_local \ + : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + +static void stbi__de_iphone(stbi__png *z) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } else { + STBI_ASSERT(s->img_out_n == 4); + if (stbi__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i=0; i < pixel_count; ++i) { + stbi_uc a = p[3]; + stbi_uc t = p[0]; + if (a) { + stbi_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = ( t * 255 + half) / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } +} + +#define STBI__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + +static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) +{ + stbi_uc palette[1024], pal_img_n=0; + stbi_uc has_trans=0, tc[3]={0}; + stbi__uint16 tc16[3]; + stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; + int first=1,k,interlace=0, color=0, is_iphone=0; + stbi__context *s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!stbi__check_png_header(s)) return 0; + + if (scan == STBI__SCAN_type) return 1; + + for (;;) { + stbi__pngchunk c = stbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C','g','B','I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I','H','D','R'): { + int comp,filter; + if (!first) return stbi__err("multiple IHDR","Corrupt PNG"); + first = 0; + if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG"); + s->img_x = stbi__get32be(s); + s->img_y = stbi__get32be(s); + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + z->depth = stbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); if (color > 6) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3 && z->depth == 16) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG"); + comp = stbi__get8(s); if (comp) return stbi__err("bad comp method","Corrupt PNG"); + filter= stbi__get8(s); if (filter) return stbi__err("bad filter method","Corrupt PNG"); + interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG"); + if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); + } else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
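/* [editor's note -- not part of the patch] For paletted images the inflate/filter
   stages below therefore see a 1-component image of palette indices; only after
   IEND does stbi__expand_png_palette() map each index i to palette[i*4 .. i*4+3],
   and s->img_n is then rewritten to pal_img_n (3 or 4) to record the actual colors. */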
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); + } + // even with SCAN_header, have to scan to see if we have a tRNS + break; + } + + case STBI__PNG_TYPE('P','L','T','E'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG"); + for (i=0; i < pal_len; ++i) { + palette[i*4+0] = stbi__get8(s); + palette[i*4+1] = stbi__get8(s); + palette[i*4+2] = stbi__get8(s); + palette[i*4+3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t','R','N','S'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG"); + if (pal_img_n) { + if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG"); + if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG"); + pal_img_n = 4; + for (i=0; i < c.length; ++i) + palette[i*4+3] = stbi__get8(s); + } else { + if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); + if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); + has_trans = 1; + // non-paletted with tRNS = constant alpha. if header-scanning, we can stop now. + if (scan == STBI__SCAN_header) { ++s->img_n; return 1; } + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + } else { + for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case STBI__PNG_TYPE('I','D','A','T'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG"); + if (scan == STBI__SCAN_header) { + // header scan definitely stops at first IDAT + if (pal_img_n) + s->img_n = pal_img_n; + return 1; + } + if (c.length > (1u << 30)) return stbi__err("IDAT size limit", "IDAT section larger than 2^30 bytes"); + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc *p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG"); + ioff += c.length; + break; + } + + case STBI__PNG_TYPE('I','E','N','D'): { + stbi__uint32 raw_len, bpl; + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) return 1; + if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone); + if (z->expanded == NULL) return 0; // zlib should set error + STBI_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n+1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } else { + if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } else if (has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { + #ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); + #endif + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + } +} + +static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri) +{ + void *result=NULL; + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = stbi__convert_format16((stbi__uint16 
*) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + p->s->img_out_n = req_comp; + if (result == NULL) return result; + } + *x = p->s->img_x; + *y = p->s->img_y; + if (n) *n = p->s->img_n; + } + STBI_FREE(p->out); p->out = NULL; + STBI_FREE(p->expanded); p->expanded = NULL; + STBI_FREE(p->idata); p->idata = NULL; + + return result; +} + +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi__png p; + p.s = s; + return stbi__do_png(&p, x,y,comp,req_comp, ri); +} + +static int stbi__png_test(stbi__context *s) +{ + int r; + r = stbi__check_png_header(s); + stbi__rewind(s); + return r; +} + +static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp) +{ + if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) { + stbi__rewind( p->s ); + return 0; + } + if (x) *x = p->s->img_x; + if (y) *y = p->s->img_y; + if (comp) *comp = p->s->img_n; + return 1; +} + +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__png p; + p.s = s; + return stbi__png_info_raw(&p, x, y, comp); +} + +static int stbi__png_is16(stbi__context *s) +{ + stbi__png p; + p.s = s; + if (!stbi__png_info_raw(&p, NULL, NULL, NULL)) + return 0; + if (p.depth != 16) { + stbi__rewind(p.s); + return 0; + } + return 1; +} +#endif + +// Microsoft/Windows BMP image + +#ifndef STBI_NO_BMP +static int stbi__bmp_test_raw(stbi__context *s) +{ + int r; + int sz; + if (stbi__get8(s) != 'B') return 0; + if (stbi__get8(s) != 'M') return 0; + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + stbi__get32le(s); // discard data offset + sz = stbi__get32le(s); + r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124); + return r; +} + +static int stbi__bmp_test(stbi__context *s) +{ + int r = stbi__bmp_test_raw(s); + stbi__rewind(s); + return r; +} + + +// returns 0..31 for the highest set bit +static int stbi__high_bit(unsigned int z) +{ + int n=0; + if (z == 0) return -1; + if (z >= 0x10000) { n += 16; z >>= 16; } + if (z >= 0x00100) { n += 8; z >>= 8; } + if (z >= 0x00010) { n += 4; z >>= 4; } + if (z >= 0x00004) { n += 2; z >>= 2; } + if (z >= 0x00002) { n += 1;/* >>= 1;*/ } + return n; +} + +static int stbi__bitcount(unsigned int a) +{ + a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2 + a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4 + a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits + a = (a + (a >> 8)); // max 16 per 8 bits + a = (a + (a >> 16)); // max 32 per 8 bits + return a & 0xff; +} + +// extract an arbitrarily-aligned N-bit value (N=bits) +// from v, and then make it 8-bits long and fractionally +// extend it to full full range. 
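/* [editor's worked example -- not part of the patch] For a 5-bit BMP red mask
   (mr = 31u << 10) the caller passes shift = stbi__high_bit(mr)-7 = 7 and bits = 5,
   so the table pair mul_table[5] = 0x21, shift_table[5] = 2 maps a 5-bit value x to
   (x*0x21) >> 2, which replicates the high bits into the low bits:
   0x1F -> 255, 0x10 -> 132 (0b10000 -> 0b10000100), i.e. roughly x*255/31 with no divide. */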
+static int stbi__shiftsigned(unsigned int v, int shift, int bits) +{ + static unsigned int mul_table[9] = { + 0, + 0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/, + 0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/, + }; + static unsigned int shift_table[9] = { + 0, 0,0,1,0,2,4,6,0, + }; + if (shift < 0) + v <<= -shift; + else + v >>= shift; + STBI_ASSERT(v < 256); + v >>= (8-bits); + STBI_ASSERT(bits >= 0 && bits <= 8); + return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits]; +} + +typedef struct +{ + int bpp, offset, hsz; + unsigned int mr,mg,mb,ma, all_a; + int extra_read; +} stbi__bmp_data; + +static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) +{ + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + +static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) +{ + int hsz; + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + info->offset = stbi__get32le(s); + info->hsz = hsz = stbi__get32le(s); + info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; + + if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP"); + + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); + if (hsz == 12) { + s->img_x = stbi__get16le(s); + s->img_y = stbi__get16le(s); + } else { + s->img_x = stbi__get32le(s); + s->img_y = stbi__get32le(s); + } + if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); + info->bpp = stbi__get16le(s); + if (hsz != 12) { + int compress = stbi__get32le(s); + if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + } + if (info->bpp == 16 || info->bpp == 32) { + if (compress == 0) { + stbi__bmp_set_mask_defaults(info, compress); + } else if (compress == 3) { + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->extra_read += 12; + // not documented, but generated by photoshop and handled by mspaint + if (info->mr == info->mg && info->mg == info->mb) { + // ?!?!? 
+ return stbi__errpuc("bad BMP", "bad BMP"); + } + } else + return stbi__errpuc("bad BMP", "bad BMP"); + } + } else { + // V4/V5 header + int i; + if (hsz != 108 && hsz != 124) + return stbi__errpuc("bad BMP", "bad BMP"); + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); + stbi__get32le(s); // discard color space + for (i=0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved + } + } + } + return (void *) 1; +} + + +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + unsigned int mr=0,mg=0,mb=0,ma=0, all_a; + stbi_uc pal[256][4]; + int psize=0,i,j,width; + int flip_vertically, pad, target; + stbi__bmp_data info; + STBI_NOTUSED(ri); + + info.all_a = 255; + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set + + flip_vertically = ((int) s->img_y) > 0; + s->img_y = abs((int) s->img_y); + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + mr = info.mr; + mg = info.mg; + mb = info.mb; + ma = info.ma; + all_a = info.all_a; + + if (info.hsz == 12) { + if (info.bpp < 24) + psize = (info.offset - info.extra_read - 24) / 3; + } else { + if (info.bpp < 16) + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + // accept some number of extra bytes after the header, but if the offset points either to before + // the header ends or implies a large amount of extra data, reject the file as malformed + int bytes_read_so_far = s->callback_already_read + (int)(s->img_buffer - s->img_buffer_original); + int header_limit = 1024; // max we actually read is below 256 bytes currently. + int extra_data_limit = 256*4; // what ordinarily goes here is a palette; 256 entries*4 bytes is its max size. + if (bytes_read_so_far <= 0 || bytes_read_so_far > header_limit) { + return stbi__errpuc("bad header", "Corrupt BMP"); + } + // we established that bytes_read_so_far is positive and sensible. + // the first half of this test rejects offsets that are either too small positives, or + // negative, and guarantees that info.offset >= bytes_read_so_far > 0. this in turn + // ensures the number computed in the second half of the test can't overflow. + if (info.offset < bytes_read_so_far || info.offset - bytes_read_so_far > extra_data_limit) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } else { + stbi__skip(s, info.offset - bytes_read_so_far); + } + } + + if (info.bpp == 24 && ma == 0xff000000) + s->img_n = 3; + else + s->img_n = ma ? 
4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert + + // sanity-check size + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + return stbi__errpuc("too large", "Corrupt BMP"); + + out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (info.bpp < 16) { + int z=0; + if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } + for (i=0; i < psize; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + if (info.hsz != 12) stbi__get8(s); + pal[i][3] = 255; + } + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 3 : 4)); + if (info.bpp == 1) width = (s->img_x + 7) >> 3; + else if (info.bpp == 4) width = (s->img_x + 1) >> 1; + else if (info.bpp == 8) width = s->img_x; + else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); } + pad = (-width)&3; + if (info.bpp == 1) { + for (j=0; j < (int) s->img_y; ++j) { + int bit_offset = 7, v = stbi__get8(s); + for (i=0; i < (int) s->img_x; ++i) { + int color = (v>>bit_offset)&0x1; + out[z++] = pal[color][0]; + out[z++] = pal[color][1]; + out[z++] = pal[color][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + if((--bit_offset) < 0) { + bit_offset = 7; + v = stbi__get8(s); + } + } + stbi__skip(s, pad); + } + } else { + for (j=0; j < (int) s->img_y; ++j) { + for (i=0; i < (int) s->img_x; i += 2) { + int v=stbi__get8(s),v2=0; + if (info.bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + v = (info.bpp == 8) ? stbi__get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + } + stbi__skip(s, pad); + } + } + } else { + int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; + int z = 0; + int easy=0; + stbi__skip(s, info.offset - info.extra_read - info.hsz); + if (info.bpp == 24) width = 3 * s->img_x; + else if (info.bpp == 16) width = 2*s->img_x; + else /* bpp = 32 and pad = 0 */ width=0; + pad = (-width) & 3; + if (info.bpp == 24) { + easy = 1; + } else if (info.bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + // right shift amt to put high bit in position #7 + rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr); + gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg); + bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb); + ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + } + for (j=0; j < (int) s->img_y; ++j) { + if (easy) { + for (i=0; i < (int) s->img_x; ++i) { + unsigned char a; + out[z+2] = stbi__get8(s); + out[z+1] = stbi__get8(s); + out[z+0] = stbi__get8(s); + z += 3; + a = (easy == 2 ? stbi__get8(s) : 255); + all_a |= a; + if (target == 4) out[z++] = a; + } + } else { + int bpp = info.bpp; + for (i=0; i < (int) s->img_x; ++i) { + stbi__uint32 v = (bpp == 16 ? 
(stbi__uint32) stbi__get16le(s) : stbi__get32le(s)); + unsigned int a; + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); + a = (ma ? stbi__shiftsigned(v & ma, ashift, acount) : 255); + all_a |= a; + if (target == 4) out[z++] = STBI__BYTECAST(a); + } + } + stbi__skip(s, pad); + } + } + + // if alpha channel is all 0s, replace with all 255s + if (target == 4 && all_a == 0) + for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4) + out[i] = 255; + + if (flip_vertically) { + stbi_uc t; + for (j=0; j < (int) s->img_y>>1; ++j) { + stbi_uc *p1 = out + j *s->img_x*target; + stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target; + for (i=0; i < (int) s->img_x*target; ++i) { + t = p1[i]; p1[i] = p2[i]; p2[i] = t; + } + } + } + + if (req_comp && req_comp != target) { + out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + return out; +} +#endif + +// Targa Truevision - TGA +// by Jonathan Dummer +#ifndef STBI_NO_TGA +// returns STBI_rgb or whatever, 0 on error +static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) +{ + // only RGB or RGBA (incl. 16bit) or grey allowed + if (is_rgb16) *is_rgb16 = 0; + switch(bits_per_pixel) { + case 8: return STBI_grey; + case 16: if(is_grey) return STBI_grey_alpha; + // fallthrough + case 15: if(is_rgb16) *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: return bits_per_pixel/8; + default: return 0; + } +} + +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) +{ + int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; + int sz, tga_colormap_type; + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if( tga_colormap_type > 1 ) { + stbi__rewind(s); + return 0; // only RGB or indexed allowed + } + tga_image_type = stbi__get8(s); // image type + if ( tga_colormap_type == 1 ) { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip image x and y origin + tga_colormap_bpp = sz; + } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) { + stbi__rewind(s); + return 0; // only RGB or grey allowed, +/- RLE + } + stbi__skip(s,9); // skip colormap specification and image x/y origin + tga_colormap_bpp = 0; + } + tga_w = stbi__get16le(s); + if( tga_w < 1 ) { + stbi__rewind(s); + return 0; // test width + } + tga_h = stbi__get16le(s); + if( tga_h < 1 ) { + stbi__rewind(s); + return 0; // test height + } + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { + if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { + // when using a colormap, tga_bits_per_pixel is the size of the indexes + // I don't think anything but 8 or 16bit indexes makes sense + stbi__rewind(s); + return 0; + } + tga_comp = 
stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); + } else { + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); + } + if(!tga_comp) { + stbi__rewind(s); + return 0; + } + if (x) *x = tga_w; + if (y) *y = tga_h; + if (comp) *comp = tga_comp; + return 1; // seems to have passed everything +} + +static int stbi__tga_test(stbi__context *s) +{ + int res = 0; + int sz, tga_color_type; + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if ( tga_color_type > 1 ) goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if ( tga_color_type == 1 ) { // colormapped (paletted) image + if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + stbi__skip(s,4); // skip image x and y origin + } else { // "normal" image w/o colormap + if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s,9); // skip colormap specification and image x/y origin + } + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test width + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + +errorEnd: + stbi__rewind(s); + return res; +} + +// read 16bit value and convert to 24bit RGB +static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out) +{ + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); + stbi__uint16 fiveBitMask = 31; + // we have 3 channels with 5bits each + int r = (px >> 10) & fiveBitMask; + int g = (px >> 5) & fiveBitMask; + int b = px & fiveBitMask; + // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later + out[0] = (stbi_uc)((r * 255)/31); + out[1] = (stbi_uc)((g * 255)/31); + out[2] = (stbi_uc)((b * 255)/31); + + // some people claim that the most significant bit might be used for alpha + // (possibly if an alpha-bit is set in the "image descriptor byte") + // but that only made 16bit test images completely translucent.. + // so let's treat all 15 and 16bit TGAs as RGB with no alpha. +} + +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + // read in the TGA header stuff + int tga_offset = stbi__get8(s); + int tga_indexed = stbi__get8(s); + int tga_image_type = stbi__get8(s); + int tga_is_RLE = 0; + int tga_palette_start = stbi__get16le(s); + int tga_palette_len = stbi__get16le(s); + int tga_palette_bits = stbi__get8(s); + int tga_x_origin = stbi__get16le(s); + int tga_y_origin = stbi__get16le(s); + int tga_width = stbi__get16le(s); + int tga_height = stbi__get16le(s); + int tga_bits_per_pixel = stbi__get8(s); + int tga_comp, tga_rgb16=0; + int tga_inverted = stbi__get8(s); + // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) 
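/* [editor's note -- not part of the patch] The reads above cover the fixed 18-byte
   TGA header in order: id-field length (tga_offset), colormap type (tga_indexed),
   image type, colormap first-entry index / length / entry size
   (tga_palette_start / _len / _bits), x/y origin, width, height, bits per pixel,
   and the image-descriptor byte (tga_inverted, whose bit 5 selects top-to-bottom order). */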
+ // image data + unsigned char *tga_data; + unsigned char *tga_palette = NULL; + int i, j; + unsigned char raw_data[4] = {0}; + int RLE_count = 0; + int RLE_repeating = 0; + int read_next_pixel = 1; + STBI_NOTUSED(ri); + STBI_NOTUSED(tga_x_origin); // @TODO + STBI_NOTUSED(tga_y_origin); // @TODO + + if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // do a tiny bit of precessing + if ( tga_image_type >= 8 ) + { + tga_image_type -= 8; + tga_is_RLE = 1; + } + tga_inverted = 1 - ((tga_inverted >> 5) & 1); + + // If I'm paletted, then I'll use the number of bits from the palette + if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16); + else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16); + + if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency + return stbi__errpuc("bad format", "Can't find out TGA pixelformat"); + + // tga info + *x = tga_width; + *y = tga_height; + if (comp) *comp = tga_comp; + + if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0)) + return stbi__errpuc("too large", "Corrupt TGA"); + + tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0); + if (!tga_data) return stbi__errpuc("outofmem", "Out of memory"); + + // skip to the data's starting position (offset usually = 0) + stbi__skip(s, tga_offset ); + + if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) { + for (i=0; i < tga_height; ++i) { + int row = tga_inverted ? tga_height -i - 1 : i; + stbi_uc *tga_row = tga_data + row*tga_width*tga_comp; + stbi__getn(s, tga_row, tga_width * tga_comp); + } + } else { + // do I need to load a palette? + if ( tga_indexed) + { + if (tga_palette_len == 0) { /* you have to have at least one entry! */ + STBI_FREE(tga_data); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + + // any data to skip? (offset usually = 0) + stbi__skip(s, tga_palette_start ); + // load the palette + tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0); + if (!tga_palette) { + STBI_FREE(tga_data); + return stbi__errpuc("outofmem", "Out of memory"); + } + if (tga_rgb16) { + stbi_uc *pal_entry = tga_palette; + STBI_ASSERT(tga_comp == STBI_rgb); + for (i=0; i < tga_palette_len; ++i) { + stbi__tga_read_rgb16(s, pal_entry); + pal_entry += tga_comp; + } + } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) { + STBI_FREE(tga_data); + STBI_FREE(tga_palette); + return stbi__errpuc("bad palette", "Corrupt TGA"); + } + } + // load the data + for (i=0; i < tga_width * tga_height; ++i) + { + // if I'm in RLE mode, do I need to get a RLE stbi__pngchunk? + if ( tga_is_RLE ) + { + if ( RLE_count == 0 ) + { + // yep, get the next byte as a RLE command + int RLE_cmd = stbi__get8(s); + RLE_count = 1 + (RLE_cmd & 127); + RLE_repeating = RLE_cmd >> 7; + read_next_pixel = 1; + } else if ( !RLE_repeating ) + { + read_next_pixel = 1; + } + } else + { + read_next_pixel = 1; + } + // OK, if I need to read a pixel, do it now + if ( read_next_pixel ) + { + // load however much data we did have + if ( tga_indexed ) + { + // read in index, then perform the lookup + int pal_idx = (tga_bits_per_pixel == 8) ? 
stbi__get8(s) : stbi__get16le(s); + if ( pal_idx >= tga_palette_len ) { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_comp; + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = tga_palette[pal_idx+j]; + } + } else if(tga_rgb16) { + STBI_ASSERT(tga_comp == STBI_rgb); + stbi__tga_read_rgb16(s, raw_data); + } else { + // read in the data raw + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = stbi__get8(s); + } + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel + + // copy data + for (j = 0; j < tga_comp; ++j) + tga_data[i*tga_comp+j] = raw_data[j]; + + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? + if ( tga_inverted ) + { + for (j = 0; j*2 < tga_height; ++j) + { + int index1 = j * tga_width * tga_comp; + int index2 = (tga_height - 1 - j) * tga_width * tga_comp; + for (i = tga_width * tga_comp; i > 0; --i) + { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } + } + } + // clear my palette, if I had one + if ( tga_palette != NULL ) + { + STBI_FREE( tga_palette ); + } + } + + // swap RGB - if the source data was RGB16, it already is in the right order + if (tga_comp >= 3 && !tga_rgb16) + { + unsigned char* tga_pixel = tga_data; + for (i=0; i < tga_width * tga_height; ++i) + { + unsigned char temp = tga_pixel[0]; + tga_pixel[0] = tga_pixel[2]; + tga_pixel[2] = temp; + tga_pixel += tga_comp; + } + } + + // convert to target component count + if (req_comp && req_comp != tga_comp) + tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); + + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... [8^( + tga_palette_start = tga_palette_len = tga_palette_bits = + tga_x_origin = tga_y_origin = 0; + STBI_NOTUSED(tga_palette_start); + // OK, done + return tga_data; +} +#endif + +// ************************************************************************************************* +// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s) +{ + int r = (stbi__get32be(s) == 0x38425053); + stbi__rewind(s); + return r; +} + +static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) +{ + int count, nleft, len; + + count = 0; + while ((nleft = pixelCount - count) > 0) { + len = stbi__get8(s); + if (len == 128) { + // No-op. + } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + if (len > nleft) return 0; // corrupt data + count += len; + while (len) { + *p = stbi__get8(s); + p += 4; + len--; + } + } else if (len > 128) { + stbi_uc val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len = 257 - len; + if (len > nleft) return 0; // corrupt data + val = stbi__get8(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } + + return 1; +} + +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + int pixelCount; + int channelCount, compression; + int channel, i; + int bitdepth; + int w,h; + stbi_uc *out; + STBI_NOTUSED(ri); + + // Check identifier + if (stbi__get32be(s) != 0x38425053) // "8BPS" + return stbi__errpuc("not PSD", "Corrupt PSD image"); + + // Check file type version. 
+ if (stbi__get16be(s) != 1) + return stbi__errpuc("wrong version", "Unsupported version of PSD image"); + + // Skip 6 reserved bytes. + stbi__skip(s, 6 ); + + // Read the number of channels (R, G, B, A, etc). + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) + return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); + + // Read the rows and columns of the image. + h = stbi__get32be(s); + w = stbi__get32be(s); + + if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // Make sure the depth is 8 bits. + bitdepth = stbi__get16be(s); + if (bitdepth != 8 && bitdepth != 16) + return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); + + // Make sure the color mode is RGB. + // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (stbi__get16be(s) != 3) + return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); + + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + stbi__skip(s,stbi__get32be(s) ); + + // Skip the image resources. (resolution, pen tool paths, etc) + stbi__skip(s, stbi__get32be(s) ); + + // Skip the reserved data. + stbi__skip(s, stbi__get32be(s) ); + + // Find out if the data is compressed. + // Known values: + // 0: no compression + // 1: RLE compressed + compression = stbi__get16be(s); + if (compression > 1) + return stbi__errpuc("bad compression", "PSD has an unknown compression format"); + + // Check size + if (!stbi__mad3sizes_valid(4, w, h, 0)) + return stbi__errpuc("too large", "Corrupt PSD"); + + // Create the destination image. + + if (!compression && bitdepth == 16 && bpc == 16) { + out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0); + ri->bits_per_channel = 16; + } else + out = (stbi_uc *) stbi__malloc(4 * w*h); + + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + pixelCount = w*h; + + // Initialize the data to zero. + //memset( out, 0, pixelCount * 4 ); + + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop + + // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, + // which we're going to just skip. + stbi__skip(s, h * channelCount * 2 ); + + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + stbi_uc *p; + + p = out+channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++, p += 4) + *p = (channel == 3 ? 255 : 0); + } else { + // Read the RLE data. + if (!stbi__psd_decode_rle(s, p, pixelCount)) { + STBI_FREE(out); + return stbi__errpuc("corrupt", "bad RLE data"); + } + } + } + + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. + + // Read the data by channel. 
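/* [editor's note -- not part of the patch] Illustrative: for a 2x1 RGB PSD the
   uncompressed section is planar -- R0 R1 G0 G1 B0 B1 -- and the loop below
   interleaves it into the RGBA output by starting each channel at out+channel and
   stepping the write pointer by 4 samples, filling a missing alpha plane with
   255 (or 65535 for 16-bit output). */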
+ for (channel = 0; channel < 4; channel++) { + if (channel >= channelCount) { + // Fill this channel with default data. + if (bitdepth == 16 && bpc == 16) { + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + stbi__uint16 val = channel == 3 ? 65535 : 0; + for (i = 0; i < pixelCount; i++, q += 4) + *q = val; + } else { + stbi_uc *p = out+channel; + stbi_uc val = channel == 3 ? 255 : 0; + for (i = 0; i < pixelCount; i++, p += 4) + *p = val; + } + } else { + if (ri->bits_per_channel == 16) { // output bpc + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16) stbi__get16be(s); + } else { + stbi_uc *p = out+channel; + if (bitdepth == 16) { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) + *p = (stbi_uc) (stbi__get16be(s) >> 8); + } else { + for (i = 0; i < pixelCount; i++, p += 4) + *p = stbi__get8(s); + } + } + } + } + } + + // remove weird white matte from PSD + if (channelCount >= 4) { + if (ri->bits_per_channel == 16) { + for (i=0; i < w*h; ++i) { + stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i; + if (pixel[3] != 0 && pixel[3] != 65535) { + float a = pixel[3] / 65535.0f; + float ra = 1.0f / a; + float inv_a = 65535.0f * (1 - ra); + pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a); + pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a); + pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a); + } + } + } else { + for (i=0; i < w*h; ++i) { + unsigned char *pixel = out + 4*i; + if (pixel[3] != 0 && pixel[3] != 255) { + float a = pixel[3] / 255.0f; + float ra = 1.0f / a; + float inv_a = 255.0f * (1 - ra); + pixel[0] = (unsigned char) (pixel[0]*ra + inv_a); + pixel[1] = (unsigned char) (pixel[1]*ra + inv_a); + pixel[2] = (unsigned char) (pixel[2]*ra + inv_a); + } + } + } + } + + // convert to desired output format + if (req_comp && req_comp != 4) { + if (ri->bits_per_channel == 16) + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h); + else + out = stbi__convert_format(out, 4, req_comp, w, h); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + if (comp) *comp = 4; + *y = h; + *x = w; + + return out; +} +#endif + +// ************************************************************************************************* +// Softimage PIC loader +// by Tom Seddon +// +// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format +// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ + +#ifndef STBI_NO_PIC +static int stbi__pic_is4(stbi__context *s,const char *str) +{ + int i; + for (i=0; i<4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) + return 0; + + return 1; +} + +static int stbi__pic_test_core(stbi__context *s) +{ + int i; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) + return 0; + + for(i=0;i<84;++i) + stbi__get8(s); + + if (!stbi__pic_is4(s,"PICT")) + return 0; + + return 1; +} + +typedef struct +{ + stbi_uc size,type,channel; +} stbi__pic_packet; + +static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) +{ + int mask=0x80, i; + + for (i=0; i<4; ++i, mask>>=1) { + if (channel & mask) { + if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short"); + dest[i]=stbi__get8(s); + } + } + + return dest; +} + +static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src) +{ + int mask=0x80,i; + + for (i=0;i<4; ++i, mask>>=1) + if (channel&mask) + dest[i]=src[i]; +} + +static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result) +{ + int 
act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
+
+   // this will (should...) cater for even some bizarre stuff like having data
+   // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s))    return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+               {
+                  int left=width, i;
+
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     if (count > left)
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128)
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
+
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      result=0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
+
+   return result;
+}
+
+static int stbi__pic_test(stbi__context *s)
+{
+   int r = stbi__pic_test_core(s);
+   stbi__rewind(s);
+   return r;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by 
Jean-Marc Lienher -- simplified/shrunk by stb + +#ifndef STBI_NO_GIF +typedef struct +{ + stbi__int16 prefix; + stbi_uc first; + stbi_uc suffix; +} stbi__gif_lzw; + +typedef struct +{ + int w,h; + stbi_uc *out; // output buffer (always 4 components) + stbi_uc *background; // The current "background" as far as a gif is concerned + stbi_uc *history; + int flags, bgindex, ratio, transparent, eflags; + stbi_uc pal[256][4]; + stbi_uc lpal[256][4]; + stbi__gif_lzw codes[8192]; + stbi_uc *color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; + int delay; +} stbi__gif; + +static int stbi__gif_test_raw(stbi__context *s) +{ + int sz; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; + sz = stbi__get8(s); + if (sz != '9' && sz != '7') return 0; + if (stbi__get8(s) != 'a') return 0; + return 1; +} + +static int stbi__gif_test(stbi__context *s) +{ + int r = stbi__gif_test_raw(s); + stbi__rewind(s); + return r; +} + +static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp) +{ + int i; + for (i=0; i < num_entries; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + pal[i][3] = transp == i ? 0 : 255; + } +} + +static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info) +{ + stbi_uc version; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return stbi__err("not GIF", "Corrupt GIF"); + + version = stbi__get8(s); + if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); + if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); + + stbi__g_failure_reason = ""; + g->w = stbi__get16le(s); + g->h = stbi__get16le(s); + g->flags = stbi__get8(s); + g->bgindex = stbi__get8(s); + g->ratio = stbi__get8(s); + g->transparent = -1; + + if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + + if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + + if (is_info) return 1; + + if (g->flags & 0x80) + stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1); + + return 1; +} + +static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); + if (!g) return stbi__err("outofmem", "Out of memory"); + if (!stbi__gif_header(s, g, comp, 1)) { + STBI_FREE(g); + stbi__rewind( s ); + return 0; + } + if (x) *x = g->w; + if (y) *y = g->h; + STBI_FREE(g); + return 1; +} + +static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) +{ + stbi_uc *p, *c; + int idx; + + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi__out_gif_code(g, g->codes[code].prefix); + + if (g->cur_y >= g->max_y) return; + + idx = g->cur_x + g->cur_y; + p = &g->out[idx]; + g->history[idx / 4] = 1; + + c = &g->color_table[g->codes[code].suffix * 4]; + if (c[3] > 128) { // don't render transparent pixels; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; + + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; + + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << 
g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } +} + +static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) +{ + stbi_uc lzw_cs; + stbi__int32 len, init_code; + stbi__uint32 first; + stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi__gif_lzw *p; + + lzw_cs = stbi__get8(s); + if (lzw_cs > 12) return NULL; + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (init_code = 0; init_code < clear; init_code++) { + g->codes[init_code].prefix = -1; + g->codes[init_code].first = (stbi_uc) init_code; + g->codes[init_code].suffix = (stbi_uc) init_code; + } + + // support no starting clear code + avail = clear+2; + oldcode = -1; + + len = 0; + for(;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = stbi__get8(s); // start new block + if (len == 0) + return g->out; + } + --len; + bits |= (stbi__int32) stbi__get8(s) << valid_bits; + valid_bits += 8; + } else { + stbi__int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? + if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + stbi__skip(s, len); + while ((len = stbi__get8(s)) > 0) + stbi__skip(s,len); + return g->out; + } else if (code <= avail) { + if (first) { + return stbi__errpuc("no clear code", "Corrupt GIF"); + } + + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 8192) { + return stbi__errpuc("too many codes", "Corrupt GIF"); + } + + p->prefix = (stbi__int16) oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? p->first : g->codes[code].first; + } else if (code == avail) + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + + stbi__out_gif_code(g, (stbi__uint16) code); + + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + } + } + } +} + +// this function is designed to support animated gifs, although stb_image doesn't support it +// two back is the image from two frames ago, used for a very specific disposal format +static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back) +{ + int dispose; + int first_frame; + int pi; + int pcount; + STBI_NOTUSED(req_comp); + + // on first frame, any non-written pixels get the background colour (non-transparent) + first_frame = 0; + if (g->out == 0) { + if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header + if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) + return stbi__errpuc("too large", "GIF image is too large"); + pcount = g->w * g->h; + g->out = (stbi_uc *) stbi__malloc(4 * pcount); + g->background = (stbi_uc *) stbi__malloc(4 * pcount); + g->history = (stbi_uc *) stbi__malloc(pcount); + if (!g->out || !g->background || !g->history) + return stbi__errpuc("outofmem", "Out of memory"); + + // image is treated as "transparent" at the start - ie, nothing overwrites the current background; + // background colour is only used for pixels that are not rendered first frame, after that "background" + // color refers to the color that was there the previous frame. 
+ memset(g->out, 0x00, 4 * pcount); + memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent) + memset(g->history, 0x00, pcount); // pixels that were affected previous frame + first_frame = 1; + } else { + // second frame - how do we dispose of the previous one? + dispose = (g->eflags & 0x1C) >> 2; + pcount = g->w * g->h; + + if ((dispose == 3) && (two_back == 0)) { + dispose = 2; // if I don't have an image to revert back to, default to the old background + } + + if (dispose == 3) { // use previous graphic + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); + } + } + } else if (dispose == 2) { + // restore what was changed last frame to background before that frame; + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); + } + } + } else { + // This is a non-disposal case eithe way, so just + // leave the pixels as is, and they will become the new background + // 1: do not dispose + // 0: not specified. + } + + // background is what out is after the undoing of the previou frame; + memcpy( g->background, g->out, 4 * g->w * g->h ); + } + + // clear my history; + memset( g->history, 0x00, g->w * g->h ); // pixels that were affected previous frame + + for (;;) { + int tag = stbi__get8(s); + switch (tag) { + case 0x2C: /* Image Descriptor */ + { + stbi__int32 x, y, w, h; + stbi_uc *o; + + x = stbi__get16le(s); + y = stbi__get16le(s); + w = stbi__get16le(s); + h = stbi__get16le(s); + if (((x + w) > (g->w)) || ((y + h) > (g->h))) + return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); + + g->line_size = g->w * 4; + g->start_x = x * 4; + g->start_y = y * g->line_size; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; + + // if the width of the specified rectangle is 0, that means + // we may not see *any* pixels or the image is malformed; + // to make sure this is caught, move the current y down to + // max_y (which is what out_gif_code checks). + if (w == 0) + g->cur_y = g->max_y; + + g->lflags = stbi__get8(s); + + if (g->lflags & 0x40) { + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; + } else { + g->step = g->line_size; + g->parse = 0; + } + + if (g->lflags & 0x80) { + stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1); + g->color_table = (stbi_uc *) g->lpal; + } else if (g->flags & 0x80) { + g->color_table = (stbi_uc *) g->pal; + } else + return stbi__errpuc("missing color table", "Corrupt GIF"); + + o = stbi__process_gif_raster(s, g); + if (!o) return NULL; + + // if this was the first frame, + pcount = g->w * g->h; + if (first_frame && (g->bgindex > 0)) { + // if first frame, any pixel not drawn to gets the background color + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi] == 0) { + g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; + memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); + } + } + } + + return o; + } + + case 0x21: // Comment Extension. + { + int len; + int ext = stbi__get8(s); + if (ext == 0xF9) { // Graphic Control Extension. + len = stbi__get8(s); + if (len == 4) { + g->eflags = stbi__get8(s); + g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. 
+ + // unset old transparent + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 0; + } + } else { + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; + } + } else { + stbi__skip(s, len); + break; + } + } + while ((len = stbi__get8(s)) != 0) { + stbi__skip(s, len); + } + break; + } + + case 0x3B: // gif stream termination code + return (stbi_uc *) s; // using '1' causes warning on some compilers + + default: + return stbi__errpuc("unknown code", "Corrupt GIF"); + } + } +} + +static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) +{ + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); + + if (out) STBI_FREE(out); + if (delays && *delays) STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); +} + +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + if (stbi__gif_test(s)) { + int layers = 0; + stbi_uc *u = 0; + stbi_uc *out = 0; + stbi_uc *two_back = 0; + stbi__gif g; + int stride; + int out_size = 0; + int delays_size = 0; + + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); + + memset(&g, 0, sizeof(g)); + if (delays) { + *delays = 0; + } + + do { + u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + + if (u) { + *x = g.w; + *y = g.h; + ++layers; + stride = g.w * g.h * 4; + + if (out) { + void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride ); + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); + else { + out = (stbi_uc*) tmp; + out_size = layers * stride; + } + + if (delays) { + int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers ); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; + delays_size = layers * sizeof(int); + } + } else { + out = (stbi_uc*)stbi__malloc( layers * stride ); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); + out_size = layers * stride; + if (delays) { + *delays = (int*) stbi__malloc( layers * sizeof(int) ); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + delays_size = layers * sizeof(int); + } + } + memcpy( out + ((layers - 1) * stride), u, stride ); + if (layers >= 2) { + two_back = out - 2 * stride; + } + + if (delays) { + (*delays)[layers - 1U] = g.delay; + } + } + } while (u != 0); + + // free temp buffer; + STBI_FREE(g.out); + STBI_FREE(g.history); + STBI_FREE(g.background); + + // do the final conversion after loading everything; + if (req_comp && req_comp != 4) + out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); + + *z = layers; + return out; + } else { + return stbi__errpuc("not GIF", "Image was not as a gif type."); + } +} + +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *u = 0; + stbi__gif g; + memset(&g, 0, sizeof(g)); + STBI_NOTUSED(ri); + + u = stbi__gif_load_next(s, &g, comp, req_comp, 0); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; + + // moved conversion to after successful load so that the same + // can be done for multiple frames. 
+ if (req_comp && req_comp != 4) + u = stbi__convert_format(u, 4, req_comp, g.w, g.h); + } else if (g.out) { + // if there was an error and we allocated an image buffer, free it! + STBI_FREE(g.out); + } + + // free buffers needed for multiple frame loading; + STBI_FREE(g.history); + STBI_FREE(g.background); + + return u; +} + +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) +{ + return stbi__gif_info_raw(s,x,y,comp); +} +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR loader +// originally by Nicolas Schulz +#ifndef STBI_NO_HDR +static int stbi__hdr_test_core(stbi__context *s, const char *signature) +{ + int i; + for (i=0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) + return 0; + stbi__rewind(s); + return 1; +} + +static int stbi__hdr_test(stbi__context* s) +{ + int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); + stbi__rewind(s); + if(!r) { + r = stbi__hdr_test_core(s, "#?RGBE\n"); + stbi__rewind(s); + } + return r; +} + +#define STBI__HDR_BUFLEN 1024 +static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) +{ + int len=0; + char c = '\0'; + + c = (char) stbi__get8(z); + + while (!stbi__at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == STBI__HDR_BUFLEN-1) { + // flush to end of line + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') + ; + break; + } + c = (char) stbi__get8(z); + } + + buffer[len] = 0; + return buffer; +} + +static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) +{ + if ( input[3] != 0 ) { + float f1; + // Exponent + f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) output[1] = 1; + if (req_comp == 4) output[3] = 1; + } else { + switch (req_comp) { + case 4: output[3] = 1; /* fallthrough */ + case 3: output[0] = output[1] = output[2] = 0; + break; + case 2: output[1] = 1; /* fallthrough */ + case 1: output[0] = 0; + break; + } + } +} + +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int width, height; + stbi_uc *scanline; + float *hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1,c2, z; + const char *headerToken; + STBI_NOTUSED(ri); + + // Check identifier + headerToken = stbi__hdr_gettoken(s,buffer); + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + return stbi__errpf("not HDR", "Corrupt HDR image"); + + // Parse header + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); + + // Parse width and height + // can't use sscanf() if we're not using stdio! 
+ token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + height = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) return stbi__errpf("unsupported data layout", "Unsupported HDR format"); + token += 3; + width = (int) strtol(token, NULL, 10); + + if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)"); + + *x = width; + *y = height; + + if (comp) *comp = 3; + if (req_comp == 0) req_comp = 3; + + if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0)) + return stbi__errpf("too large", "HDR image is too large"); + + // Read data + hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0); + if (!hdr_data) + return stbi__errpf("outofmem", "Out of memory"); + + // Load image data + // image data is stored as some number of sca + if ( width < 8 || width >= 32768) { + // Read flat data + for (j=0; j < height; ++j) { + for (i=0; i < width; ++i) { + stbi_uc rgbe[4]; + main_decode_loop: + stbi__getn(s, rgbe, 4); + stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp); + } + } + } else { + // Read RLE-encoded data + scanline = NULL; + + for (j = 0; j < height; ++j) { + c1 = stbi__get8(s); + c2 = stbi__get8(s); + len = stbi__get8(s); + if (c1 != 2 || c2 != 2 || (len & 0x80)) { + // not run-length encoded, so we have to actually use THIS data as a decoded + // pixel (note this can't be a valid pixel--one of RGB must be >= 128) + stbi_uc rgbe[4]; + rgbe[0] = (stbi_uc) c1; + rgbe[1] = (stbi_uc) c2; + rgbe[2] = (stbi_uc) len; + rgbe[3] = (stbi_uc) stbi__get8(s); + stbi__hdr_convert(hdr_data, rgbe, req_comp); + i = 1; + j = 0; + STBI_FREE(scanline); + goto main_decode_loop; // yes, this makes no sense + } + len <<= 8; + len |= stbi__get8(s); + if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); } + if (scanline == NULL) { + scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0); + if (!scanline) { + STBI_FREE(hdr_data); + return stbi__errpf("outofmem", "Out of memory"); + } + } + + for (k = 0; k < 4; ++k) { + int nleft; + i = 0; + while ((nleft = width - i) > 0) { + count = stbi__get8(s); + if (count > 128) { + // Run + value = stbi__get8(s); + count -= 128; + if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = value; + } else { + // Dump + if ((count == 0) || (count > nleft)) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); } + for (z = 0; z < count; ++z) + scanline[i++ * 4 + k] = stbi__get8(s); + } + } + } + for (i=0; i < width; ++i) + stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp); + } + if (scanline) + STBI_FREE(scanline); + } + + return hdr_data; +} + +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int dummy; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if (stbi__hdr_test(s) == 0) { + stbi__rewind( s ); + return 0; + } + + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, 
"FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) { + stbi__rewind( s ); + return 0; + } + token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *y = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *x = (int) strtol(token, NULL, 10); + *comp = 3; + return 1; +} +#endif // STBI_NO_HDR + +#ifndef STBI_NO_BMP +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) +{ + void *p; + stbi__bmp_data info; + + info.all_a = 255; + p = stbi__bmp_parse_header(s, &info); + if (p == NULL) { + stbi__rewind( s ); + return 0; + } + if (x) *x = s->img_x; + if (y) *y = s->img_y; + if (comp) { + if (info.bpp == 24 && info.ma == 0xff000000) + *comp = 3; + else + *comp = info.ma ? 4 : 3; + } + return 1; +} +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) +{ + int channelCount, dummy, depth; + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + *y = stbi__get32be(s); + *x = stbi__get32be(s); + depth = stbi__get16be(s); + if (depth != 8 && depth != 16) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 3) { + stbi__rewind( s ); + return 0; + } + *comp = 4; + return 1; +} + +static int stbi__psd_is16(stbi__context *s) +{ + int channelCount, depth; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); + depth = stbi__get16be(s); + if (depth != 16) { + stbi__rewind( s ); + return 0; + } + return 1; +} +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) +{ + int act_comp=0,num_packets=0,chained,dummy; + stbi__pic_packet packets[10]; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) { + stbi__rewind(s); + return 0; + } + + stbi__skip(s, 88); + + *x = stbi__get16be(s); + *y = stbi__get16be(s); + if (stbi__at_eof(s)) { + stbi__rewind( s); + return 0; + } + if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) { + stbi__rewind( s ); + return 0; + } + + stbi__skip(s, 8); + + do { + stbi__pic_packet *packet; + + if (num_packets==sizeof(packets)/sizeof(packets[0])) + return 0; + + packet = &packets[num_packets++]; + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + act_comp |= packet->channel; + + if (stbi__at_eof(s)) { + stbi__rewind( s ); + return 0; + } + if (packet->size != 8) { + stbi__rewind( s ); + return 0; + } + } while (chained); + + *comp = (act_comp & 0x10 ? 
4 : 3); + + return 1; +} +#endif + +// ************************************************************************************************* +// Portable Gray Map and Portable Pixel Map loader +// by Ken Miller +// +// PGM: http://netpbm.sourceforge.net/doc/pgm.html +// PPM: http://netpbm.sourceforge.net/doc/ppm.html +// +// Known limitations: +// Does not support comments in the header section +// Does not support ASCII image data (formats P2 and P3) + +#ifndef STBI_NO_PNM + +static int stbi__pnm_test(stbi__context *s) +{ + char p, t; + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind( s ); + return 0; + } + return 1; +} + +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + STBI_NOTUSED(ri); + + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) + return 0; + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) + return stbi__errpuc("too large", "PNM too large"); + + out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (!stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8))) { + STBI_FREE(out); + return stbi__errpuc("bad PNM", "PNM file truncated"); + } + + if (req_comp && req_comp != s->img_n) { + if (ri->bits_per_channel == 16) { + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, s->img_n, req_comp, s->img_x, s->img_y); + } else { + out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + } + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + return out; +} + +static int stbi__pnm_isspace(char c) +{ + return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; +} + +static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) +{ + for (;;) { + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char) stbi__get8(s); + + if (stbi__at_eof(s) || *c != '#') + break; + + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' ) + *c = (char) stbi__get8(s); + } +} + +static int stbi__pnm_isdigit(char c) +{ + return c >= '0' && c <= '9'; +} + +static int stbi__pnm_getinteger(stbi__context *s, char *c) +{ + int value = 0; + + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { + value = value*10 + (*c - '0'); + *c = (char) stbi__get8(s); + if((value > 214748364) || (value == 214748364 && *c > '7')) + return stbi__err("integer parse overflow", "Parsing an integer in the PPM header overflowed a 32-bit int"); + } + + return value; +} + +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) +{ + int maxv, dummy; + char c, p, t; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + stbi__rewind(s); + + // Get identifier + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + + *comp = (t == '6') ? 
3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + + c = (char) stbi__get8(s); + stbi__pnm_skip_whitespace(s, &c); + + *x = stbi__pnm_getinteger(s, &c); // read width + if(*x == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + *y = stbi__pnm_getinteger(s, &c); // read height + if (*y == 0) + return stbi__err("invalid width", "PPM image header had zero or overflowing width"); + stbi__pnm_skip_whitespace(s, &c); + + maxv = stbi__pnm_getinteger(s, &c); // read max value + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; + else + return 8; +} + +static int stbi__pnm_is16(stbi__context *s) +{ + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; +} +#endif + +static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) +{ + #ifndef STBI_NO_JPEG + if (stbi__jpeg_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNG + if (stbi__png_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_GIF + if (stbi__gif_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_BMP + if (stbi__bmp_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PIC + if (stbi__pic_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_info(s, x, y, comp)) return 1; + #endif + + // test tga last because it's a crappy test! + #ifndef STBI_NO_TGA + if (stbi__tga_info(s, x, y, comp)) + return 1; + #endif + return stbi__err("unknown image type", "Image not of any known type, or corrupt"); +} + +static int stbi__is_16_main(stbi__context *s) +{ + #ifndef STBI_NO_PNG + if (stbi__png_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) return 1; + #endif + return 0; +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_info_from_file(f, x, y, comp); + fclose(f); + return result; +} + +STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__info_main(&s,x,y,comp); + fseek(f,pos,SEEK_SET); + return r; +} + +STBIDEF int stbi_is_16_bit(char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_is_16_bit_from_file(f); + fclose(f); + return result; +} + +STBIDEF int stbi_is_16_bit_from_file(FILE *f) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__is_16_main(&s); + fseek(f,pos,SEEK_SET); + return r; +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const 
*buffer, int len) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__is_16_main(&s); +} + +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__is_16_main(&s); +} + +#endif // STB_IMAGE_IMPLEMENTATION + +/* + revision history: + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug + 1-bit BMP + *_is_16_bit api + avoid warnings + 2.16 (2017-07-23) all functions have 16-bit variants; + STBI_NO_STDIO works again; + compilation fixes; + fix rounding in unpremultiply; + optimize vertical flip; + disable raw_len validation; + documentation fixes + 2.15 (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode; + warning fixes; disable run-time SSE detection on gcc; + uniform handling of optional "return" values; + thread-safe initialization of zlib tables + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-11-29) add 16-bit API, only supported for PNG right now + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) allocate large structures on the stack + remove white matting for transparent PSD + fix reported channel count for PNG & BMP + re-enable SSE2 in non-gcc 64-bit + support RGB-formatted JPEG + read 16-bit PNGs (only as 8-bit) + 2.10 (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED + 2.09 (2016-01-16) allow comments in PNM files + 16-bit-per-pixel TGA (not bit-per-component) + info() for TGA could break due to .hdr handling + info() for BMP to shares code instead of sloppy parse + can use STBI_REALLOC_SIZED if allocator doesn't support realloc + code cleanup + 2.08 (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA + 2.07 (2015-09-13) fix compiler warnings + partial animated GIF support + limited 16-bpc PSD support + #ifdef unused functions + bug with < 92 byte PIC,PNM,HDR,TGA + 2.06 (2015-04-19) fix bug where PSD returns wrong '*comp' value + 2.05 (2015-04-19) fix bug in progressive JPEG handling, fix warning + 2.04 (2015-04-15) try to re-enable SIMD on MinGW 64-bit + 2.03 (2015-04-12) extra corruption checking (mmozeiko) + stbi_set_flip_vertically_on_load (nguillemot) + fix NEON support; fix mingw support + 2.02 (2015-01-19) fix incorrect assert, fix warning + 2.01 (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2 + 2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG + 2.00 (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg) + progressive JPEG (stb) + PGM/PPM support (Ken Miller) + STBI_MALLOC,STBI_REALLOC,STBI_FREE + GIF bugfix -- seemingly never worked + STBI_NO_*, STBI_ONLY_* + 1.48 (2014-12-14) fix incorrectly-named assert() + 1.47 (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb) + optimize PNG (ryg) + fix bug in interlaced PNG with user-specified channel count (stb) + 1.46 (2014-08-26) + fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG + 1.45 (2014-08-16) + fix MSVC-ARM internal compiler error by wrapping malloc + 1.44 (2014-08-07) + various warning fixes from Ronny Chevalier + 1.43 (2014-07-15) + fix MSVC-only compiler problem in code changed in 1.42 + 1.42 (2014-07-09) + don't define _CRT_SECURE_NO_WARNINGS (affects user code) + fixes to stbi__cleanup_jpeg path + added STBI_ASSERT 
to avoid requiring assert.h + 1.41 (2014-06-25) + fix search&replace from 1.36 that messed up comments/error messages + 1.40 (2014-06-22) + fix gcc struct-initialization warning + 1.39 (2014-06-15) + fix to TGA optimization when req_comp != number of components in TGA; + fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite) + add support for BMP version 5 (more ignored fields) + 1.38 (2014-06-06) + suppress MSVC warnings on integer casts truncating values + fix accidental rename of 'skip' field of I/O + 1.37 (2014-06-04) + remove duplicate typedef + 1.36 (2014-06-03) + convert to header file single-file library + if de-iphone isn't set, load iphone images color-swapped instead of returning NULL + 1.35 (2014-05-27) + various warnings + fix broken STBI_SIMD path + fix bug where stbi_load_from_file no longer left file pointer in correct place + fix broken non-easy path for 32-bit BMP (possibly never used) + TGA optimization by Arseny Kapoulkine + 1.34 (unknown) + use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case + 1.33 (2011-07-14) + make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements + 1.32 (2011-07-13) + support for "info" function for all supported filetypes (SpartanJ) + 1.31 (2011-06-20) + a few more leak fixes, bug in PNG handling (SpartanJ) + 1.30 (2011-06-11) + added ability to load files via callbacks to accomidate custom input streams (Ben Wenger) + removed deprecated format-specific test/load functions + removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway + error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha) + fix inefficiency in decoding 32-bit BMP (David Woo) + 1.29 (2010-08-16) + various warning fixes from Aurelien Pocheville + 1.28 (2010-08-01) + fix bug in GIF palette transparency (SpartanJ) + 1.27 (2010-08-01) + cast-to-stbi_uc to fix warnings + 1.26 (2010-07-24) + fix bug in file buffering for PNG reported by SpartanJ + 1.25 (2010-07-17) + refix trans_data warning (Won Chun) + 1.24 (2010-07-12) + perf improvements reading from files on platforms with lock-heavy fgetc() + minor perf improvements for jpeg + deprecated type-specific functions so we'll get feedback if they're needed + attempt to fix trans_data warning (Won Chun) + 1.23 fixed bug in iPhone support + 1.22 (2010-07-10) + removed image *writing* support + stbi_info support from Jetro Lauha + GIF support from Jean-Marc Lienher + iPhone PNG-extensions from James Brown + warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez (U+017D)emva) + 1.21 fix use of 'stbi_uc' in header (reported by jon blow) + 1.20 added support for Softimage PIC, by Tom Seddon + 1.19 bug in interlaced PNG corruption check (found by ryg) + 1.18 (2008-08-02) + fix a threading bug (local mutable static) + 1.17 support interlaced PNG + 1.16 major bugfix - stbi__convert_format converted one too many pixels + 1.15 initialize some fields for thread safety + 1.14 fix threadsafe conversion bug + header-file-only version (#define STBI_HEADER_FILE_ONLY before including) + 1.13 threadsafe + 1.12 const qualifiers in the API + 1.11 Support installable IDCT, colorspace conversion routines + 1.10 Fixes for 64-bit (don't use "unsigned long") + optimized upsampling by Fabian "ryg" Giesen + 1.09 Fix format-conversion for PSD code (bad global variables!) 
+ 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz + 1.07 attempt to fix C++ warning/errors again + 1.06 attempt to fix C++ warning/errors again + 1.05 fix TGA loading to return correct *comp and use good luminance calc + 1.04 default float alpha is 1, not 255; use 'void *' for stbi_image_free + 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR + 1.02 support for (subset of) HDR files, float interface for preferred access to them + 1.01 fix bug: possible bug in handling right-side up bmps... not sure + fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all + 1.00 interface to zlib that skips zlib header + 0.99 correct handling of alpha in palette + 0.98 TGA loader by lonesock; dynamically add loaders (untested) + 0.97 jpeg errors on too large a file; also catch another malloc failure + 0.96 fix detection of invalid v value - particleman@mollyrocket forum + 0.95 during header scan, seek to markers in case of padding + 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same + 0.93 handle jpegtran output; verbose errors + 0.92 read 4,8,16,24,32-bit BMP files of several formats + 0.91 output 24-bit Windows 3.0 BMP files + 0.90 fix a few more warnings; bump version number to approach 1.0 + 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd + 0.60 fix compiling as c++ + 0.59 fix warnings: merge Dave Moore's -Wall fixes + 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian + 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available + 0.56 fix bug: zlib uncompressed mode len vs. nlen + 0.55 fix bug: restart_interval not initialized to 0 + 0.54 allow NULL for 'int *comp' + 0.53 fix bug in png 3->4; speedup png decoding + 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments + 0.51 obey req_comp requests, 1-component jpegs return as 1-component, + on 'test' only check type, not whether we support this variant + 0.50 (2006-11-19) + first released version +*/ + + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. 
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this +software, either in source code form or as a compiled binary, for any purpose, +commercial or non-commercial, and by any means. +In jurisdictions that recognize copyright laws, the author or authors of this +software dedicate any and all copyright interest in the software to the public +domain. We make this dedication for the benefit of the public at large and to +the detriment of our heirs and successors. We intend this dedication to be an +overt act of relinquishment in perpetuity of all present and future rights to +this software under copyright law. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +------------------------------------------------------------------------------ +*/