diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..c9eee03f
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,11 @@
+---
+Language: Cpp
+BasedOnStyle: Google
+ColumnLimit: 80
+IncludeBlocks: Preserve
+DerivePointerAlignment: false
+PointerAlignment: Left
+---
+Language: Proto
+BasedOnStyle: Google
+...
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 00000000..920555f1
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,2 @@
+# Don't copy build files into a docker image
+*/build*
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..e37fefa1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,47 @@
+**/build
+
+
+# Prerequisites
+*.d
+
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+*.smod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+# IDE stuff
+*.vscode
+
+# Build files
+*build
+
+# Python
+*.pyc
+
+# Reconstructions
+*.ply
diff --git a/Dockerfile.build b/Dockerfile.build
new file mode 100644
index 00000000..93de20df
--- /dev/null
+++ b/Dockerfile.build
@@ -0,0 +1,10 @@
+FROM nvblox_deps
+
+SHELL ["/bin/bash", "-c"]
+
+# Copy over the nvblox source
+COPY . nvblox
+
+# Build the standalone library
+RUN cd nvblox/nvblox && mkdir build && cd build && \
+    cmake .. && make -j8
diff --git a/Dockerfile.deps b/Dockerfile.deps
new file mode 100644
index 00000000..8f73c372
--- /dev/null
+++ b/Dockerfile.deps
@@ -0,0 +1,20 @@
+FROM nvcr.io/nvidia/cuda:11.2.0-devel-ubuntu20.04
+
+# TZData goes first.
+RUN apt-get update
+ENV TZ Europe/Berlin
+ENV DEBIAN_FRONTEND noninteractive
+RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
+RUN apt-get install -y tzdata
+
+# Install basics.
+RUN apt-get update
+RUN apt-get install -y ssh git jq gnupg apt-utils software-properties-common build-essential cmake
+
+# Install dependencies.
+RUN apt-get install -y libgoogle-glog-dev libgtest-dev curl
+
+# Build and install gtest (the Ubuntu package ships only the sources).
+RUN cd /usr/src/googletest && cmake . && cmake --build . --target install
+
+ENV DEBIAN_FRONTEND teletype
diff --git a/Dockerfile.test b/Dockerfile.test
new file mode 100644
index 00000000..4e51411d
--- /dev/null
+++ b/Dockerfile.test
@@ -0,0 +1,5 @@
+FROM nvblox
+
+# Run the standalone library's tests
+CMD cd nvblox/nvblox/build/tests && ctest
diff --git a/Jenkinsfile b/Jenkinsfile
new file mode 100644
index 00000000..3539f66a
--- /dev/null
+++ b/Jenkinsfile
@@ -0,0 +1,65 @@
+pipeline {
+  agent {
+    dockerfile {
+      label 'isaac-gpu'
+      reuseNode true
+      filename 'Dockerfile.deps'
+      args '-u root --gpus all -v /var/run/docker.sock:/var/run/docker.sock:rw'
+    }
+  }
+  triggers {
+    gitlab(triggerOnMergeRequest: true, branchFilterType: 'All')
+  }
+  stages {
+    stage('Compile') {
+      steps {
+        sh '''nvidia-smi'''
+        sh '''mkdir -p nvblox/build'''
+        sh '''cd nvblox/build && cmake .. && make -j8'''
+      }
+    }
+    stage('Test') {
+      steps {
+        sh '''cd nvblox/build/tests && ctest -T test --no-compress-output'''
+      }
+    }
+  }
+  post {
+    always {
+      // Archive the CTest xml output
+      archiveArtifacts (
+        artifacts: 'nvblox/build/tests/Testing/**/*.xml',
+        fingerprint: true
+      )
+
+      // Process the CTest xml output with the xUnit plugin
+      xunit (
+        testTimeMargin: '3000',
+        thresholdMode: 1,
+        thresholds: [
+          skipped(failureThreshold: '0'),
+          failed(failureThreshold: '0')
+        ],
+        tools: [CTest(
+          pattern: 'nvblox/build/tests/Testing/**/*.xml',
+          deleteOutputFiles: true,
+          failIfNotNew: false,
+          skipNoTestFiles: true,
+          stopProcessingIfError: true
+        )]
+      )
+
+      // Clear the source and build dirs before next run
+      cleanWs()
+    }
+    failure {
+      updateGitlabCommitStatus name: 'build', state: 'failed'
+    }
+    success {
+      updateGitlabCommitStatus name: 'build', state: 'success'
+    }
+  }
+  options {
+    gitLabConnection('gitlab-master')
+  }
+}
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..00f4f26c
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,237 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2022 NVIDIA CORPORATION + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +============================================================================== +voxblox license for some of the source files. +============================================================================== +Copyright (c) 2016, ETHZ ASL +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of voxblox nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. 
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+
+
diff --git a/README.md b/README.md
new file mode 100644
index 00000000..cc166333
--- /dev/null
+++ b/README.md
@@ -0,0 +1,154 @@
+# nvblox
+Signed Distance Functions (SDFs) on NVIDIA GPUs.
+
+An SDF library which offers
+* Support for storage of various voxel types
+* GPU-accelerated algorithms such as:
+    * TSDF construction
+    * ESDF construction
+* ROS2 interface (see [isaac_ros_nvblox](https://github.com/NVIDIA-ISAAC-ROS/isaac_ros_nvblox))
+* ~~Python bindings~~ (coming soon)
+
+Do we need another SDF library? That depends on your use case. If you're interested in:
+* **Path planning**: We provide GPU-accelerated, incremental algorithms for calculating the Euclidean Signed Distance Field (ESDF), which is useful for collision checking, a critical operation in robotic path planning (see the sketch after this list). In contrast, existing GPU-accelerated libraries target reconstruction only and are generally not useful in a robotics context.
+* **GPU acceleration**: Our previous works [voxblox](https://github.com/ethz-asl/voxblox) and [voxgraph](https://github.com/ethz-asl/voxgraph) are used for path planning, but are CPU-only, which limits their speed (and therefore the resolution of the maps they can build in real time).
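+
+As a taste of what an ESDF gives a planner, here is a minimal sketch of querying distances at arbitrary 3D points. It is based on the Python API exercised by the in-tree example `nvblox/examples/python/3dmatch_example.py`; the standalone Python bindings are still marked "coming soon" above, and `tsdf_layer` is assumed to have been built beforehand (e.g. as in that example), so treat this as illustrative rather than a stable API:
+```
+import numpy as np
+import nvblox
+
+# Build an ESDF layer from an existing TSDF reconstruction (tsdf_layer,
+# assumed built as in examples/python/3dmatch_example.py).
+esdf_layer = nvblox.EsdfLayer(tsdf_layer.block_size,
+                              memory_type=nvblox.MemoryType.Unified)
+esdf_integrator = nvblox.EsdfIntegrator()
+esdf_integrator.integrate_layer(tsdf_layer, esdf_layer)
+
+# Query the distance field at candidate points (Nx3, float32).
+query_points = np.array([[0.0, 0.0, 0.0],
+                         [1.0, 0.5, 0.2]], dtype=np.float32)
+distances, valid = nvblox.interpolate_on_cpu(query_points, esdf_layer)
+# A query point is collision-free (up to the map resolution) if its distance
+# exceeds the robot radius; `valid` flags queries that fell in observed space.
+```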
+Here we show slices through a distance function generated from *nvblox* using data from the [3DMatch dataset](https://3dmatch.cs.princeton.edu/).
+![slice](docs/images/nvblox_slice.gif)
+
+# Note from the authors
+This package is under active development. Feel free to open an issue for bugs or feature requests, and we always welcome pull requests!
+
+# ROS2 Interface
+This repo contains the core library which can be linked into users' projects. If you want to use nvblox on a robot out-of-the-box, please see our [ROS2 interface](https://github.com/NVIDIA-ISAAC-ROS/isaac_ros_nvblox), which downloads and builds the core library during installation.
+
+# Native Installation
+If you want to build natively, please follow these instructions. Instructions for docker are [further below](#docker).
+
+## Install dependencies
+We depend on:
+- gtest
+- glog
+- gflags (to run experiments)
+- CUDA 10.2 - 11.5 (others might work but are untested)
+- Eigen (no need to install explicitly; a recent version is downloaded and built as part of the library build)
+
+Please run
+```
+sudo apt-get install -y libgoogle-glog-dev libgtest-dev libgflags-dev python3-dev
+cd /usr/src/googletest && sudo cmake . && sudo cmake --build . --target install
+```
+
+## Build and run tests
+```
+cd nvblox/nvblox
+mkdir build
+cd build
+cmake .. && make && ctest
+```
+
+## Run an example
+In this example we fuse data from the [3DMatch dataset](https://3dmatch.cs.princeton.edu/). First let's grab the dataset. Here I'm downloading it to my dataset folder `~/datasets/3dmatch`.
+```
+wget http://vision.princeton.edu/projects/2016/3DMatch/downloads/rgbd-datasets/sun3d-mit_76_studyroom-76-1studyroom2.zip -P ~/datasets/3dmatch
+unzip ~/datasets/3dmatch/sun3d-mit_76_studyroom-76-1studyroom2.zip -d ~/datasets/3dmatch
+```
+Navigate to and run the `fuse_3dmatch` binary. From the nvblox base folder run
+```
+cd nvblox/build/experiments
+./fuse_3dmatch ~/datasets/3dmatch/sun3d-mit_76_studyroom-76-1studyroom2/ --esdf_frame_subsampling 3000 --mesh_output_path mesh.ply
+```
+Once it's done we can view the output mesh using the Open3D viewer.
+```
+pip3 install open3d
+python3 ../../visualization/visualize_mesh.py mesh.ply
+```
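+If you'd prefer not to use the bundled script, a minimal Open3D snippet does the same job (this is a sketch using plain Open3D calls, not the script itself):
+```
+import open3d as o3d
+
+# Load the mesh written by fuse_3dmatch and display it.
+mesh = o3d.io.read_triangle_mesh("mesh.ply")
+mesh.compute_vertex_normals()  # needed for shaded rendering
+o3d.visualization.draw_geometries([mesh])
+```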
+You should see a mesh of a room:
+![slice](docs/images/reconstruction_in_docker_trim.png)
+
+# Docker
+
+We have several dockerfiles, each layering on top of the previous one, for the following purposes:
+
+* **Dockerfile.deps**
+    * Installs our dependencies.
+    * This is used in our CI, where the later steps (building and testing) are taken care of by Jenkins (not docker).
+* **Dockerfile.build**
+    * Layers on top of Dockerfile.deps.
+    * Builds our package.
+    * This is where you get off the layer train if you wanna run stuff (and don't care if it's tested).
+* **Dockerfile.test**
+    * Layers on top of Dockerfile.build.
+    * Runs our tests.
+    * Useful for checking if things are likely to pass the tests in CI.
+
+We rely on nvidia-docker. Install the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) following the instructions on that website.
+
+We use the GPU during build, not only at runtime. In docker's default configuration the GPU is only made available at runtime, so we must set the default runtime. Add `"default-runtime": "nvidia"` to `/etc/docker/daemon.json` such that it looks like:
+```
+{
+    "runtimes": {
+        "nvidia": {
+            "path": "/usr/bin/nvidia-container-runtime",
+            "runtimeArgs": []
+        }
+    },
+    "default-runtime": "nvidia"
+}
+```
+Restart docker
+```
+sudo systemctl restart docker
+```
+Now let's build the Dockerfile.deps image. This image contains our dependencies.
+```
+docker build -t nvblox_deps -f Dockerfile.deps .
+```
+Now let's build Dockerfile.build. This image layers on the last, and actually builds the nvblox library.
+```
+docker build -t nvblox -f Dockerfile.build .
+```
+Now let's run the 3DMatch example inside the docker. Note there's some additional complexity in the `docker run` command such that we can forward X11 to the host (we're going to view a reconstruction in a GUI). Run the container using:
+```
+xhost local:docker
+docker run -it --net=host --env="DISPLAY" -v $HOME/.Xauthority:/root/.Xauthority:rw -v /tmp/.X11-unix:/tmp/.X11-unix:rw nvblox
+```
+Let's download a dataset and run the example (this is largely a repeat of "Run an example" above).
+```
+apt-get update
+apt-get install unzip
+wget http://vision.princeton.edu/projects/2016/3DMatch/downloads/rgbd-datasets/sun3d-mit_76_studyroom-76-1studyroom2.zip -P ~/datasets/3dmatch
+unzip ~/datasets/3dmatch/sun3d-mit_76_studyroom-76-1studyroom2.zip -d ~/datasets/3dmatch
+cd nvblox/nvblox/build/experiments/
+./fuse_3dmatch ~/datasets/3dmatch/sun3d-mit_76_studyroom-76-1studyroom2/ --esdf_frame_subsampling 3000 --mesh_output_path mesh.ply
+```
+Now let's visualize. From the same experiments folder run:
+```
+apt-get install python3-pip libgl1-mesa-glx
+pip3 install open3d
+python3 ../../visualization/visualize_mesh.py mesh.ply
+```
+
+# Additional instructions for Jetson Xavier
+These instructions are for a native build on the Jetson Xavier. A Docker based build is coming soon.
+
+The instructions for the native build above work, with one exception:
+
+We build using CMake's modern CUDA integration and therefore require a more modern version of CMake than (currently) ships with JetPack. Luckily, the CMake developers provide a means of obtaining recent versions through apt.
+
+1. Obtain a copy of the signing key:
+```
+wget -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc |
+    sudo apt-key add -
+```
+2. Add the repository to your sources list and update.
+```
+sudo apt-add-repository 'deb https://apt.kitware.com/ubuntu/ bionic main'
+sudo apt-get update
+```
+3. Install the latest CMake.
+```
+sudo apt-get install cmake
+```
+
+# License
+This code is under an [open-source license](LICENSE) (Apache 2.0). :)
\ No newline at end of file
diff --git a/docs/images/nvblox_slice.gif b/docs/images/nvblox_slice.gif
new file mode 100644
index 00000000..a0bf4c1b
Binary files /dev/null and b/docs/images/nvblox_slice.gif differ
diff --git a/docs/images/reconstruction_in_docker_trim.png b/docs/images/reconstruction_in_docker_trim.png
new file mode 100644
index 00000000..8c7bda97
Binary files /dev/null and b/docs/images/reconstruction_in_docker_trim.png differ
diff --git a/nvblox/AMENT_IGNORE b/nvblox/AMENT_IGNORE
new file mode 100644
index 00000000..e69de29b
diff --git a/nvblox/CMakeLists.txt b/nvblox/CMakeLists.txt
new file mode 100644
index 00000000..3babb997
--- /dev/null
+++ b/nvblox/CMakeLists.txt
@@ -0,0 +1,228 @@
+cmake_minimum_required(VERSION 3.16)
+
+# set the project name and version
+project(nvblox VERSION 0.0.1 LANGUAGES CXX CUDA)
+
+# specify the C++ standard
+set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+set(CMAKE_BUILD_TYPE RelWithDebInfo)
+
+# Build options
+option(BUILD_EXPERIMENTS "Build performance experimentation binaries" ON)
+option(BUILD_TESTS "Build the C++ tests of the nvblox library" ON)
+
+# Suppress spammy Eigen CUDA warnings.
+# "expt-relaxed-constexpr" allows sharing constexpr between host and device
+# code.
+# "generate-line-info" embeds source line information for profiling/debugging.
+# "display_error_number" shows a warning number with all warnings, and the
+# rest is just suppressing specific warnings from Eigen. Note that the numbers
+# keep changing with every CUDA release so this list is a bit arbitrary.
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --generate-line-info -Xcudafe --display_error_number -Xcudafe --diag_suppress=2977 -Xcudafe --diag_suppress=3057 -Xcudafe --diag_suppress=3059 ")
+set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --compiler-options -fPIC")
+
+# New warning numbers above CUDA 11.2.
+if (CUDA_VERSION_MAJOR EQUAL 11 AND CUDA_VERSION_MINOR GREATER_EQUAL 2)
+  set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe --diag_suppress=20012 -Xcudafe --diag_suppress=20011 -Xcudafe --diag_suppress=20014")
+endif()
+
+# Download thirdparty deps
+message(STATUS "Downloading 3rdparty dependencies")
+message(STATUS "Downloading Eigen")
+include(thirdparty/eigen/eigen.cmake)
+
+# Build stdgpu as part of this
+message(STATUS "Downloading STDGPU")
+include(thirdparty/stdgpu/stdgpu.cmake)
+
+# Include the dependencies
+list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
+find_package(CUDA REQUIRED)
+# In the case of ROS builds, glog will likely be found at a higher level.
+# We want to link against that version in that case.
+if(NOT Glog_FOUND)
+  find_package(Glog REQUIRED)
+endif()
+# TODO(alexmillane): check the ROS builds here.
+find_package(gflags REQUIRED)
+
+# Include dirs
+include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+include_directories(${EIGEN_INCLUDE_DIRS})
+include_directories(${STDGPU_INCLUDE_DIRS})
+include_directories(include)
+
+#############
+# LIBRARIES #
+#############
+
+add_library(nvblox_cuda_check SHARED
+  src/core/cuda/error_check.cu
+)
+
+add_library(nvblox_gpu_hash SHARED
+  src/gpu_hash/cuda/gpu_layer_view.cu
+  src/gpu_hash/cuda/gpu_set.cu
+  src/utils/timing.cpp
+  src/utils/nvtx_ranges.cpp
+)
+add_dependencies(nvblox_gpu_hash ext_eigen stdgpu)
+target_link_libraries(nvblox_gpu_hash PUBLIC
+  stdgpu
+  nvblox_cuda_check
+  ${CUDA_nvToolsExt_LIBRARY}
+)
+
+add_library(nvblox_lib SHARED
+  src/core/bounding_boxes.cpp
+  src/core/camera.cpp
+  src/core/color.cpp
+  src/core/cuda/blox.cu
+  src/core/cuda/image_cuda.cu
+  src/core/cuda/warmup.cu
+  src/core/image.cpp
+  src/core/interpolation_3d.cpp
+  src/core/mapper.cpp
+  src/datasets/image_loader.cpp
+  src/datasets/parse_3dmatch.cpp
+  src/integrators/cuda/frustum.cu
+  src/integrators/cuda/projective_tsdf_integrator.cu
+  src/integrators/cuda/projective_color_integrator.cu
+  src/integrators/cuda/esdf_integrator.cu
+  src/integrators/esdf_integrator.cpp
+  src/integrators/frustum.cpp
+  src/integrators/projective_integrator_base.cpp
+  src/ray_tracing/cuda/sphere_tracer.cu
+  src/io/csv.cpp
+  src/io/mesh_io.cpp
+  src/io/ply_writer.cpp
+  src/mesh/marching_cubes.cu
+  src/mesh/mesh_block.cu
+  src/mesh/mesh_integrator_color.cu
+  src/mesh/mesh_integrator.cpp
+  src/mesh/mesh_integrator.cu
+  src/mesh/mesh.cpp
+  src/primitives/primitives.cpp
+  src/primitives/scene.cpp
+  src/utils/nvtx_ranges.cpp
+  src/utils/timing.cpp
+)
+add_dependencies(nvblox_lib ext_eigen)
+target_link_libraries(nvblox_lib PUBLIC
+  ${GLOG_LIBRARIES}
+  gflags
+  ${CUDA_LIBRARIES}
+  ${CUDA_nvToolsExt_LIBRARY}
+  nvblox_gpu_hash
+  nvblox_cuda_check
+)
+target_link_libraries(nvblox_lib INTERFACE
+  stdgpu
+)
+target_include_directories(nvblox_lib PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
+  $<INSTALL_INTERFACE:include>
+  $<BUILD_INTERFACE:${EIGEN_INCLUDE_DIRS}>
+  $<INSTALL_INTERFACE:include/eigen3>
+  ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+)
+set_target_properties(nvblox_lib PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+############
+# BINARIES #
+############
+add_executable(sphere_benchmark src/benchmarks/sphere_benchmark.cpp)
+target_link_libraries(sphere_benchmark
+  nvblox_lib
+)
+set_target_properties(sphere_benchmark PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+#########
+# TESTS #
+#########
+
+if (BUILD_TESTS)
+  add_subdirectory(tests)
+endif()
+
+###############
+# EXPERIMENTS #
+###############
+
+if (BUILD_EXPERIMENTS)
+  add_subdirectory(experiments)
+endif()
+
+#############################
+# INTERFACE LIBRARY FOR ROS #
+#############################
+# TODO: delete
+add_library(nvblox_interface INTERFACE)
+target_link_libraries(nvblox_interface INTERFACE
+  nvblox_lib
+  nvblox_gpu_hash
+  nvblox_cuda_check
+  ${GLOG_LIBRARIES}
+  ${CUDA_LIBRARIES}
+  stdgpu
+  Eigen3::Eigen
+)
+target_include_directories(nvblox_interface INTERFACE include ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+add_dependencies(nvblox_interface ext_eigen ext_stdgpu)
+
+##########
+# EXPORT #
+##########
+include(GNUInstallDirs)
+
+install(
+  TARGETS nvblox_lib nvblox_gpu_hash nvblox_cuda_check stdgpu
+  EXPORT nvblox
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}/${PROJECT_NAME}
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}/${PROJECT_NAME}
+  INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
+  PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}
+)
+install(
+  DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  FILES_MATCHING PATTERN "*.h*"
+)
+
+
+include(CMakePackageConfigHelpers)
+# generate the config file that includes the exports
+configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/Config.cmake.in
+  "${CMAKE_CURRENT_BINARY_DIR}/nvbloxConfig.cmake"
+  INSTALL_DESTINATION "lib/cmake"
+  NO_SET_AND_CHECK_MACRO
+  NO_CHECK_REQUIRED_COMPONENTS_MACRO
+)
+# generate the version file for the config file
+write_basic_package_version_file(
+  "${CMAKE_CURRENT_BINARY_DIR}/nvbloxConfigVersion.cmake"
+  VERSION "${nvblox_VERSION_MAJOR}.${nvblox_VERSION_MINOR}"
+  COMPATIBILITY AnyNewerVersion
+)
+
+# install the configuration file
+install(FILES
+  ${CMAKE_CURRENT_BINARY_DIR}/nvbloxConfig.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/nvbloxConfigVersion.cmake
+  ${CMAKE_CURRENT_BINARY_DIR}/nvbloxTargets.cmake
+  DESTINATION share/nvblox/cmake)
+
+# generate the export targets for the build tree
+# needs to be after the install(TARGETS ) command
+export(EXPORT nvblox
+  NAMESPACE nvblox::
+  FILE "${CMAKE_CURRENT_BINARY_DIR}/nvbloxTargets.cmake"
+)
+
+install(
+  EXPORT nvblox
+  FILE nvbloxTargets.cmake
+  NAMESPACE nvblox::
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
diff --git a/nvblox/cmake/Config.cmake.in b/nvblox/cmake/Config.cmake.in
new file mode 100644
index 00000000..38c57549
--- /dev/null
+++ b/nvblox/cmake/Config.cmake.in
@@ -0,0 +1,4 @@
+
+@PACKAGE_INIT@
+
+include ( "${CMAKE_CURRENT_LIST_DIR}/nvbloxTargets.cmake" )
diff --git a/nvblox/cmake/FindGlog.cmake b/nvblox/cmake/FindGlog.cmake
new file mode 100644
index 00000000..1a7b6c09
--- /dev/null
+++ b/nvblox/cmake/FindGlog.cmake
@@ -0,0 +1,379 @@
+# Ceres Solver - A fast non-linear least squares minimizer
+# Copyright 2015 Google Inc. All rights reserved.
+# http://ceres-solver.org/
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice,
+#   this list of conditions and the following disclaimer.
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# * Neither the name of Google Inc. nor the names of its contributors may be
+#   used to endorse or promote products derived from this software without
+#   specific prior written permission.
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# Author: alexs.mac@gmail.com (Alex Stewart) +# + +# FindGlog.cmake - Find Google glog logging library. +# +# This module defines the following variables: +# +# GLOG_FOUND: TRUE iff glog is found. +# GLOG_INCLUDE_DIRS: Include directories for glog. +# GLOG_LIBRARIES: Libraries required to link glog. +# FOUND_INSTALLED_GLOG_CMAKE_CONFIGURATION: True iff the version of glog found +# was built & installed / exported +# as a CMake package. +# +# The following variables control the behaviour of this module: +# +# GLOG_PREFER_EXPORTED_GLOG_CMAKE_CONFIGURATION: TRUE/FALSE, iff TRUE then +# then prefer using an exported CMake configuration +# generated by glog > 0.3.4 over searching for the +# glog components manually. Otherwise (FALSE) +# ignore any exported glog CMake configurations and +# always perform a manual search for the components. +# Default: TRUE iff user does not define this variable +# before we are called, and does NOT specify either +# GLOG_INCLUDE_DIR_HINTS or GLOG_LIBRARY_DIR_HINTS +# otherwise FALSE. +# GLOG_INCLUDE_DIR_HINTS: List of additional directories in which to +# search for glog includes, e.g: /timbuktu/include. +# GLOG_LIBRARY_DIR_HINTS: List of additional directories in which to +# search for glog libraries, e.g: /timbuktu/lib. +# +# The following variables are also defined by this module, but in line with +# CMake recommended FindPackage() module style should NOT be referenced directly +# by callers (use the plural variables detailed above instead). These variables +# do however affect the behaviour of the module via FIND_[PATH/LIBRARY]() which +# are NOT re-called (i.e. search for library is not repeated) if these variables +# are set with valid values _in the CMake cache_. This means that if these +# variables are set directly in the cache, either by the user in the CMake GUI, +# or by the user passing -DVAR=VALUE directives to CMake when called (which +# explicitly defines a cache variable), then they will be used verbatim, +# bypassing the HINTS variables and other hard-coded search locations. +# +# GLOG_INCLUDE_DIR: Include directory for glog, not including the +# include directory of any dependencies. +# GLOG_LIBRARY: glog library, not including the libraries of any +# dependencies. + +# Reset CALLERS_CMAKE_FIND_LIBRARY_PREFIXES to its value when +# FindGlog was invoked. 
+macro(GLOG_RESET_FIND_LIBRARY_PREFIX) + if (MSVC AND CALLERS_CMAKE_FIND_LIBRARY_PREFIXES) + set(CMAKE_FIND_LIBRARY_PREFIXES "${CALLERS_CMAKE_FIND_LIBRARY_PREFIXES}") + endif() +endmacro(GLOG_RESET_FIND_LIBRARY_PREFIX) + +# Called if we failed to find glog or any of it's required dependencies, +# unsets all public (designed to be used externally) variables and reports +# error message at priority depending upon [REQUIRED/QUIET/] argument. +macro(GLOG_REPORT_NOT_FOUND REASON_MSG) + unset(GLOG_FOUND) + unset(GLOG_INCLUDE_DIRS) + unset(GLOG_LIBRARIES) + # Make results of search visible in the CMake GUI if glog has not + # been found so that user does not have to toggle to advanced view. + mark_as_advanced(CLEAR GLOG_INCLUDE_DIR + GLOG_LIBRARY) + + glog_reset_find_library_prefix() + + # Note _FIND_[REQUIRED/QUIETLY] variables defined by FindPackage() + # use the camelcase library name, not uppercase. + if (Glog_FIND_QUIETLY) + message(STATUS "Failed to find glog - " ${REASON_MSG} ${ARGN}) + elseif (Glog_FIND_REQUIRED) + message(FATAL_ERROR "Failed to find glog - " ${REASON_MSG} ${ARGN}) + else() + # Neither QUIETLY nor REQUIRED, use no priority which emits a message + # but continues configuration and allows generation. + message("-- Failed to find glog - " ${REASON_MSG} ${ARGN}) + endif () + return() +endmacro(GLOG_REPORT_NOT_FOUND) + +# glog_message([mode] "message text") +# +# Wraps the standard cmake 'message' command, but suppresses output +# if the QUIET flag was passed to the find_package(Glog ...) call. +function(GLOG_MESSAGE) + if (NOT Glog_FIND_QUIETLY) + message(${ARGN}) + endif() +endfunction() + +# Protect against any alternative find_package scripts for this library having +# been called previously (in a client project) which set GLOG_FOUND, but not +# the other variables we require / set here which could cause the search logic +# here to fail. +unset(GLOG_FOUND) + +# ----------------------------------------------------------------- +# By default, if the user has expressed no preference for using an exported +# glog CMake configuration over performing a search for the installed +# components, and has not specified any hints for the search locations, then +# prefer a glog exported configuration if available. +if (NOT DEFINED GLOG_PREFER_EXPORTED_GLOG_CMAKE_CONFIGURATION + AND NOT GLOG_INCLUDE_DIR_HINTS + AND NOT GLOG_LIBRARY_DIR_HINTS) + glog_message(STATUS "No preference for use of exported glog CMake " + "configuration set, and no hints for include/library directories provided. " + "Defaulting to preferring an installed/exported glog CMake configuration " + "if available.") + set(GLOG_PREFER_EXPORTED_GLOG_CMAKE_CONFIGURATION TRUE) +endif() + +# On macOS, add the Homebrew prefix (with appropriate suffixes) to the +# respective HINTS directories (after any user-specified locations). This +# handles Homebrew installations into non-standard locations (not /usr/local). +# We do not use CMAKE_PREFIX_PATH for this as given the search ordering of +# find_xxx(), doing so would override any user-specified HINTS locations with +# the Homebrew version if it exists. +if (CMAKE_SYSTEM_NAME MATCHES "Darwin") + find_program(HOMEBREW_EXECUTABLE brew) + mark_as_advanced(FORCE HOMEBREW_EXECUTABLE) + if (HOMEBREW_EXECUTABLE) + # Detected a Homebrew install, query for its install prefix. 
+ execute_process(COMMAND ${HOMEBREW_EXECUTABLE} --prefix + OUTPUT_VARIABLE HOMEBREW_INSTALL_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE) + glog_message(STATUS "Detected Homebrew with install prefix: " + "${HOMEBREW_INSTALL_PREFIX}, adding to CMake search paths.") + list(APPEND GLOG_INCLUDE_DIR_HINTS "${HOMEBREW_INSTALL_PREFIX}/include") + list(APPEND GLOG_LIBRARY_DIR_HINTS "${HOMEBREW_INSTALL_PREFIX}/lib") + endif() +endif() + +if (GLOG_PREFER_EXPORTED_GLOG_CMAKE_CONFIGURATION) + # Try to find an exported CMake configuration for glog, as generated by + # glog versions > 0.3.4 + # + # We search twice, s/t we can invert the ordering of precedence used by + # find_package() for exported package build directories, and installed + # packages (found via CMAKE_SYSTEM_PREFIX_PATH), listed as items 6) and 7) + # respectively in [1]. + # + # By default, exported build directories are (in theory) detected first, and + # this is usually the case on Windows. However, on OS X & Linux, the install + # path (/usr/local) is typically present in the PATH environment variable + # which is checked in item 4) in [1] (i.e. before both of the above, unless + # NO_SYSTEM_ENVIRONMENT_PATH is passed). As such on those OSs installed + # packages are usually detected in preference to exported package build + # directories. + # + # To ensure a more consistent response across all OSs, and as users usually + # want to prefer an installed version of a package over a locally built one + # where both exist (esp. as the exported build directory might be removed + # after installation), we first search with NO_CMAKE_PACKAGE_REGISTRY which + # means any build directories exported by the user are ignored, and thus + # installed directories are preferred. If this fails to find the package + # we then research again, but without NO_CMAKE_PACKAGE_REGISTRY, so any + # exported build directories will now be detected. + # + # To prevent confusion on Windows, we also pass NO_CMAKE_BUILDS_PATH (which + # is item 5) in [1]), to not preferentially use projects that were built + # recently with the CMake GUI to ensure that we always prefer an installed + # version if available. + # + # NOTE: We use the NAMES option as glog erroneously uses 'google-glog' as its + # project name when built with CMake, but exports itself as just 'glog'. + # On Linux/OS X this does not break detection as the project name is + # not used as part of the install path for the CMake package files, + # e.g. /usr/local/lib/cmake/glog, where the suffix is hardcoded + # in glog's CMakeLists. However, on Windows the project name *is* + # part of the install prefix: C:/Program Files/google-glog/[include,lib]. + # However, by default CMake checks: + # C:/Program Files/ which does not + # exist and thus detection fails. Thus we use the NAMES to force the + # search to use both google-glog & glog. + # + # [1] http://www.cmake.org/cmake/help/v2.8.11/cmake.html#command:find_package + find_package(glog QUIET + NAMES google-glog glog + HINTS ${glog_DIR} ${HOMEBREW_INSTALL_PREFIX} + NO_MODULE + NO_CMAKE_PACKAGE_REGISTRY + NO_CMAKE_BUILDS_PATH) + if (glog_FOUND) + glog_message(STATUS "Found installed version of glog: ${glog_DIR}") + else() + # Failed to find an installed version of glog, repeat search allowing + # exported build directories. 
+ glog_message(STATUS "Failed to find installed glog CMake configuration, " + "searching for glog build directories exported with CMake.") + # Again pass NO_CMAKE_BUILDS_PATH, as we know that glog is exported and + # do not want to treat projects built with the CMake GUI preferentially. + find_package(glog QUIET + NAMES google-glog glog + NO_MODULE + NO_CMAKE_BUILDS_PATH) + if (glog_FOUND) + glog_message(STATUS "Found exported glog build directory: ${glog_DIR}") + endif(glog_FOUND) + endif(glog_FOUND) + + set(FOUND_INSTALLED_GLOG_CMAKE_CONFIGURATION ${glog_FOUND}) + + if (FOUND_INSTALLED_GLOG_CMAKE_CONFIGURATION) + glog_message(STATUS "Detected glog version: ${glog_VERSION}") + set(GLOG_FOUND ${glog_FOUND}) + # glog wraps the include directories into the exported glog::glog target. + set(GLOG_INCLUDE_DIR "") + set(GLOG_LIBRARY glog::glog) + else (FOUND_INSTALLED_GLOG_CMAKE_CONFIGURATION) + glog_message(STATUS "Failed to find an installed/exported CMake " + "configuration for glog, will perform search for installed glog " + "components.") + endif (FOUND_INSTALLED_GLOG_CMAKE_CONFIGURATION) +endif(GLOG_PREFER_EXPORTED_GLOG_CMAKE_CONFIGURATION) + +if (NOT GLOG_FOUND) + # Either failed to find an exported glog CMake configuration, or user + # told us not to use one. Perform a manual search for all glog components. + + # Handle possible presence of lib prefix for libraries on MSVC, see + # also GLOG_RESET_FIND_LIBRARY_PREFIX(). + if (MSVC) + # Preserve the caller's original values for CMAKE_FIND_LIBRARY_PREFIXES + # s/t we can set it back before returning. + set(CALLERS_CMAKE_FIND_LIBRARY_PREFIXES "${CMAKE_FIND_LIBRARY_PREFIXES}") + # The empty string in this list is important, it represents the case when + # the libraries have no prefix (shared libraries / DLLs). + set(CMAKE_FIND_LIBRARY_PREFIXES "lib" "" "${CMAKE_FIND_LIBRARY_PREFIXES}") + endif (MSVC) + + # Search user-installed locations first, so that we prefer user installs + # to system installs where both exist. + list(APPEND GLOG_CHECK_INCLUDE_DIRS + /usr/local/include + /usr/local/homebrew/include # Mac OS X + /opt/local/var/macports/software # Mac OS X. + /opt/local/include + /usr/include) + # Windows (for C:/Program Files prefix). + list(APPEND GLOG_CHECK_PATH_SUFFIXES + glog/include + glog/Include + Glog/include + Glog/Include + google-glog/include # CMake installs with project name prefix. + google-glog/Include) + + list(APPEND GLOG_CHECK_LIBRARY_DIRS + /usr/local/lib + /usr/local/homebrew/lib # Mac OS X. + /opt/local/lib + /usr/lib) + # Windows (for C:/Program Files prefix). + list(APPEND GLOG_CHECK_LIBRARY_SUFFIXES + glog/lib + glog/Lib + Glog/lib + Glog/Lib + google-glog/lib # CMake installs with project name prefix. + google-glog/Lib) + + # Search supplied hint directories first if supplied. 
+ find_path(GLOG_INCLUDE_DIR + NAMES glog/logging.h + HINTS ${GLOG_INCLUDE_DIR_HINTS} + PATHS ${GLOG_CHECK_INCLUDE_DIRS} + PATH_SUFFIXES ${GLOG_CHECK_PATH_SUFFIXES}) + if (NOT GLOG_INCLUDE_DIR OR + NOT EXISTS ${GLOG_INCLUDE_DIR}) + glog_report_not_found( + "Could not find glog include directory, set GLOG_INCLUDE_DIR " + "to directory containing glog/logging.h") + endif (NOT GLOG_INCLUDE_DIR OR + NOT EXISTS ${GLOG_INCLUDE_DIR}) + + find_library(GLOG_LIBRARY NAMES glog + HINTS ${GLOG_LIBRARY_DIR_HINTS} + PATHS ${GLOG_CHECK_LIBRARY_DIRS} + PATH_SUFFIXES ${GLOG_CHECK_LIBRARY_SUFFIXES}) + if (NOT GLOG_LIBRARY OR + NOT EXISTS ${GLOG_LIBRARY}) + glog_report_not_found( + "Could not find glog library, set GLOG_LIBRARY " + "to full path to libglog.") + endif (NOT GLOG_LIBRARY OR + NOT EXISTS ${GLOG_LIBRARY}) + + # Mark internally as found, then verify. GLOG_REPORT_NOT_FOUND() unsets + # if called. + set(GLOG_FOUND TRUE) + + # Glog does not seem to provide any record of the version in its + # source tree, thus cannot extract version. + + # Catch case when caller has set GLOG_INCLUDE_DIR in the cache / GUI and + # thus FIND_[PATH/LIBRARY] are not called, but specified locations are + # invalid, otherwise we would report the library as found. + if (GLOG_INCLUDE_DIR AND + NOT EXISTS ${GLOG_INCLUDE_DIR}/glog/logging.h) + glog_report_not_found( + "Caller defined GLOG_INCLUDE_DIR:" + " ${GLOG_INCLUDE_DIR} does not contain glog/logging.h header.") + endif (GLOG_INCLUDE_DIR AND + NOT EXISTS ${GLOG_INCLUDE_DIR}/glog/logging.h) + # TODO: This regex for glog library is pretty primitive, we use lowercase + # for comparison to handle Windows using CamelCase library names, could + # this check be better? + string(TOLOWER "${GLOG_LIBRARY}" LOWERCASE_GLOG_LIBRARY) + if (GLOG_LIBRARY AND + NOT "${LOWERCASE_GLOG_LIBRARY}" MATCHES ".*glog[^/]*") + glog_report_not_found( + "Caller defined GLOG_LIBRARY: " + "${GLOG_LIBRARY} does not match glog.") + endif (GLOG_LIBRARY AND + NOT "${LOWERCASE_GLOG_LIBRARY}" MATCHES ".*glog[^/]*") + + glog_reset_find_library_prefix() + +endif(NOT GLOG_FOUND) + +# Set standard CMake FindPackage variables if found. +if (GLOG_FOUND) + set(GLOG_INCLUDE_DIRS ${GLOG_INCLUDE_DIR}) + set(GLOG_LIBRARIES ${GLOG_LIBRARY}) +endif (GLOG_FOUND) + +# If we are using an exported CMake glog target, the include directories are +# wrapped into the target itself, and do not have to be (and are not) +# separately specified. In which case, we should not add GLOG_INCLUDE_DIRS +# to the list of required variables in order that glog be reported as found. +if (FOUND_INSTALLED_GLOG_CMAKE_CONFIGURATION) + set(GLOG_REQUIRED_VARIABLES GLOG_LIBRARIES) +else() + set(GLOG_REQUIRED_VARIABLES GLOG_INCLUDE_DIRS GLOG_LIBRARIES) +endif() + +# Handle REQUIRED / QUIET optional arguments. +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Glog DEFAULT_MSG + ${GLOG_REQUIRED_VARIABLES}) + +# Only mark internal variables as advanced if we found glog, otherwise +# leave them visible in the standard GUI for the user to set manually. 
+if (GLOG_FOUND)
+  mark_as_advanced(FORCE GLOG_INCLUDE_DIR
+                         GLOG_LIBRARY
+                         glog_DIR) # Autogenerated by find_package(glog)
+endif (GLOG_FOUND)
diff --git a/nvblox/examples/python/3dmatch_example.py b/nvblox/examples/python/3dmatch_example.py
new file mode 100755
index 00000000..57a74291
--- /dev/null
+++ b/nvblox/examples/python/3dmatch_example.py
@@ -0,0 +1,135 @@
+#!/usr/bin/python3
+
+import os
+import wget
+from zipfile import ZipFile
+import argparse
+
+import numpy as np
+from matplotlib import pyplot as plt
+import open3d as o3d
+
+from nvblox.datasets.threedmatch.loader import ThreeDMatchLoader
+from nvblox.conversions import open3d_conversions
+import nvblox
+
+
+def get_colored_cubes_mesh(values, XYZ, cube_size):
+    # Normalizing/Clipping the values
+    percentile_lim_upper = 90
+    percentile_lim_lower = 10
+    max_val = np.percentile(values, percentile_lim_upper)
+    min_val = np.percentile(values, percentile_lim_lower)
+    values_normalized = (values - min_val) / (max_val - min_val)
+    values_normalized = values_normalized.clip(min=0.0, max=1.0)
+    # Create the values mesh
+    slice_mesh = o3d.geometry.TriangleMesh()
+    for dist, xyz in zip(values_normalized, XYZ):
+        box = o3d.geometry.TriangleMesh.create_box(width=cube_size,
+                                                   height=cube_size,
+                                                   depth=cube_size)
+        color = plt.cm.viridis(dist)
+        box.compute_vertex_normals()
+        box.paint_uniform_color(color[0:3])
+        box.translate(np.array([xyz[0], xyz[1], xyz[2]]))
+        slice_mesh += box
+    return slice_mesh
+
+
+def main(num_frames: int):
+
+    # Check for data (and download it if required)
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    data_root = os.path.join(script_dir, "data")
+    dataset_name = 'sun3d-mit_76_studyroom-76-1studyroom2'
+    dataset_dir = os.path.join(data_root, dataset_name)
+    if not os.path.isdir(dataset_dir):
+        url = 'http://vision.princeton.edu/projects/2016/3DMatch/downloads/rgbd-datasets/sun3d-mit_76_studyroom-76-1studyroom2.zip'
+        print('Downloading dataset from: ' + url)
+        wget.download(url, out=data_root)
+        print("Extracting dataset")
+        ZipFile(dataset_dir + '.zip').extractall(path=data_root)
+
+    # Paths to the data
+    sequence_root_path = os.path.join(dataset_dir, "seq-01")
+    camera_intrinsics_path = os.path.join(
+        dataset_dir, "camera-intrinsics.txt")
+    loader = ThreeDMatchLoader(sequence_root_path, camera_intrinsics_path)
+
+    # Create a tsdf_layer (our map/reconstruction)
+    block_size = nvblox.utils.block_size(voxel_size=0.05)
+    tsdf_layer = nvblox.TsdfLayer(
+        block_size, memory_type=nvblox.MemoryType.Device)
+    mesh_layer = nvblox.MeshLayer(
+        block_size, memory_type=nvblox.MemoryType.Unified)
+
+    # TSDF Integrator
+    integrator = nvblox.ProjectiveTsdfIntegrator()
+
+    # Integrating all the frames
+    if num_frames is None:
+        num_frames = loader.get_number_of_frames()
+    for frame_idx in range(num_frames):
+        print(f"Integrating frame: {frame_idx} / {num_frames}")
+        (depth_frame, T_L_C_mat, camera) = loader.get_frame_data(frame_idx)
+        # Skip frames with invalid (NaN) poses or depth.
+        if np.isnan(T_L_C_mat).any() or np.isnan(depth_frame).any():
+            continue
+        integrator.integrateFrame(depth_frame, T_L_C_mat, camera, tsdf_layer)
+
+    # Extract a mesh from the tsdf_layer
+    print("Extracting mesh from TSDF")
+    mesh_integrator = nvblox.mesh.MeshIntegrator()
+    mesh_integrator.integrateMeshFromDistanceField(tsdf_layer, mesh_layer)
+
+    # Generate an ESDF
+    esdf_layer = nvblox.EsdfLayer(
+        tsdf_layer.block_size, memory_type=nvblox.MemoryType.Unified)
+    esdf_integrator = nvblox.EsdfIntegrator()
+    esdf_integrator.integrate_layer(tsdf_layer, esdf_layer)
+
+    # Get the bounding box of the layer
+    aabb = esdf_layer.get_aabb_of_observed_voxels()
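+    # The block below samples the ESDF on a regular horizontal grid spanning
+    # the observed bounding box, then interpolates the distance field at those
+    # points on the CPU; the returned flags mark which query points landed in
+    # observed space (only those get visualized).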
+    # Get the 3D locations of a horizontal slice
+    # Note: 3DMatch is Y-axis up, so a horizontal slice sits at a fixed y value
+    resolution_m = 0.1
+    slice_point_ratio = 0.35
+    slice_point_m = slice_point_ratio * aabb.sizes()[1] + aabb.min[1]
+    X, Z = np.mgrid[aabb.min[0]:aabb.max[0]:resolution_m,
+                    aabb.min[2]:aabb.max[2]:resolution_m]
+    X, Y, Z = np.broadcast_arrays(X, slice_point_m, Z)
+    XYZ = np.stack((X.flatten(), Y.flatten(), Z.flatten())).transpose().astype(np.float32)
+
+    # Interpolate the ESDF at slice locations
+    slice, flags = nvblox.interpolate_on_cpu(XYZ, esdf_layer)
+
+    # Convert the mesh to Open3D and visualize
+    mesh = open3d_conversions.mesh_layer_to_open3d(mesh_layer)
+    mesh.compute_vertex_normals()
+    slice_mesh = get_colored_cubes_mesh(
+        slice[flags], XYZ[flags], cube_size=3.0 * resolution_m / 4.0)
+    vis = o3d.visualization.Visualizer()
+    vis.create_window()
+    vis.add_geometry(mesh)
+    vis.add_geometry(open3d_conversions.aabb_to_open3d(aabb))
+    vis.add_geometry(slice_mesh)
+    vis.run()
+    vis.destroy_window()
+
+    # Write the mesh to file
+    mesh_path = os.path.join(script_dir, "output", dataset_name + ".ply")
+    print("Writing the mesh to: " + mesh_path)
+    nvblox.io.output_mesh_layer_to_ply(mesh_layer, mesh_path)
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description="Run an experiment.")
+    parser.add_argument("--number_of_frames", metavar="number_of_frames", type=int,
+                        help="Number of frames to process. Omitting this arg means all frames are processed.")
+    args = parser.parse_args()
+    main(args.number_of_frames)
diff --git a/nvblox/examples/python/data/.gitignore b/nvblox/examples/python/data/.gitignore
new file mode 100644
index 00000000..86d0cb27
--- /dev/null
+++ b/nvblox/examples/python/data/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
\ No newline at end of file
diff --git a/nvblox/examples/python/output/.gitignore b/nvblox/examples/python/output/.gitignore
new file mode 100644
index 00000000..86d0cb27
--- /dev/null
+++ b/nvblox/examples/python/output/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
\ No newline at end of file
diff --git a/nvblox/experiments/.gitignore b/nvblox/experiments/.gitignore
new file mode 100644
index 00000000..f32081e2
--- /dev/null
+++ b/nvblox/experiments/.gitignore
@@ -0,0 +1 @@
+*output*
\ No newline at end of file
diff --git a/nvblox/experiments/CMakeLists.txt b/nvblox/experiments/CMakeLists.txt
new file mode 100644
index 00000000..0e9e2f18
--- /dev/null
+++ b/nvblox/experiments/CMakeLists.txt
@@ -0,0 +1,47 @@
+# Common experiments library
+add_library(nvblox_experiments_common SHARED
+  src/common/fuse_3dmatch.cpp
+  src/integrators/cuda/depth_frame_texture.cu
+  src/integrators/cuda/experimental_integrator_input_frames.cu
+  src/integrators/cuda/experimental_projective_tsdf_integrators.cu
+)
+target_include_directories(nvblox_experiments_common PUBLIC
+  include
+)
+target_link_libraries(nvblox_experiments_common nvblox_lib)
+set_target_properties(nvblox_experiments_common PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+# Macro for symlinking python scripts into the buildspace
+macro(makeLink src dest target)
+  add_custom_command(TARGET ${target} PRE_BUILD
+    COMMAND ln -sf ${src} ${dest}
+    COMMENT "mklink ${src} -> ${dest}")
+endmacro()
+
+# 3DMatch executable
+add_executable(fuse_3dmatch src/fuse_3dmatch.cpp)
+target_link_libraries(fuse_3dmatch
+  nvblox_experiments_common
+)
+set_target_properties(fuse_3dmatch PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
+# Python module for experiments
+set(PYTHON_EXPERIMENTS_MODULE_SRC_DIR "${CMAKE_CURRENT_SOURCE_DIR}/python")
+set(PYTHON_EXPERIMENTS_MODULE_DST_DIR "${CMAKE_CURRENT_BINARY_DIR}/python")
+file(MAKE_DIRECTORY ${PYTHON_EXPERIMENTS_MODULE_DST_DIR})
+file(COPY ${PYTHON_EXPERIMENTS_MODULE_SRC_DIR}/
+     DESTINATION ${PYTHON_EXPERIMENTS_MODULE_DST_DIR})
+
+# Script for comparing branches
+add_subdirectory(experiments/compare_branches)
+
+# Experiments
+# NOTE(alexmillane): Experiments disabled with "EXCLUDE_FROM_ALL" are no longer
+# maintained/built but are kept around for posterity
+add_subdirectory(experiments/texture_vs_global_memory_interpolation EXCLUDE_FROM_ALL)
+add_subdirectory(experiments/unified_vs_device_memory)
+add_subdirectory(experiments/threaded_image_loading)
+add_subdirectory(experiments/realistic_timings)
+add_subdirectory(experiments/vector_copies)
+add_subdirectory(experiments/isolate_tsdf_block_update)
+add_subdirectory(experiments/stream_compaction)
+add_subdirectory(experiments/layer_cake_interface)
diff --git a/nvblox/experiments/experiments/compare_branches/CMakeLists.txt b/nvblox/experiments/experiments/compare_branches/CMakeLists.txt
new file mode 100644
index 00000000..1de54cce
--- /dev/null
+++ b/nvblox/experiments/experiments/compare_branches/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_custom_target( compare_branches )
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/compare_branches.py" "${CMAKE_CURRENT_BINARY_DIR}/" compare_branches)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/plot_comparison.py" "${CMAKE_CURRENT_BINARY_DIR}/" compare_branches)
diff --git a/nvblox/experiments/experiments/compare_branches/__init__.py b/nvblox/experiments/experiments/compare_branches/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/nvblox/experiments/experiments/compare_branches/compare_branches.py b/nvblox/experiments/experiments/compare_branches/compare_branches.py
new file mode 100755
index 00000000..eba5de7c
--- /dev/null
+++ b/nvblox/experiments/experiments/compare_branches/compare_branches.py
@@ -0,0 +1,87 @@
+#!/usr/bin/python3
+
+import os
+import argparse
+import subprocess
+from datetime import datetime
+
+import git
+
+import nvblox_experiments.threedmatch as threedmatch
+
+from plot_comparison import plot_timings
+
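+
+# Example invocation (paths and branch names are illustrative):
+#   ./compare_branches.py ~/datasets/3dmatch/sun3d-mit_76_studyroom-76-1studyroom2 other-branch --num_runs 5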
+
+def generate_timings(dataset_path: str, other_branch_or_hash: str, num_runs: int) -> None:
+
+    # Get this repo
+    # Note(alexmillane): We assume that this script is being executed within the repo under test.
+    this_directory = os.getcwd()
+    repo = git.Repo(this_directory, search_parent_directories=True)
+    git_root_dir = repo.git.rev_parse("--show-toplevel")
+    assert not repo.bare
+
+    # Branches to time
+    current_branch_name = repo.active_branch.name
+    branch_names = [current_branch_name, other_branch_or_hash]
+
+    # Where to place timings
+    datetime_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
+    output_root = os.path.join(
+        this_directory, 'output', datetime_str)
+
+    # Timing each branch
+    build_dir = os.path.join(git_root_dir, 'nvblox/build')
+    for branch_name in branch_names:
+
+        # Checkout
+        print('Checking out: ' + branch_name)
+        try:
+            repo.git.checkout(branch_name)
+        except git.GitCommandError:
+            print("Could not checkout branch: " + branch_name +
+                  ". Maybe you have uncommitted changes?")
+            return
+
+        # Build
+        if not os.path.exists(build_dir):
+            print("Please create a build space at: " + build_dir)
+            return
+        build_command = f"cd {build_dir} && cmake .. && make -j16"
+        subprocess.call(build_command, shell=True)
+
+        # Benchmark
+        branch_str = branch_name.replace('/', '_')
+        output_dir = os.path.join(output_root, branch_str)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir, exist_ok=True)
+        threedmatch_binary_path = os.path.join(
+            build_dir, 'experiments/fuse_3dmatch')
+        threedmatch.run_multiple(num_runs, threedmatch_binary_path, dataset_path,
+                                 output_dir)
+
+    # Reset to the original branch
+    print('Checking out: ' + current_branch_name)
+    try:
+        repo.git.checkout(current_branch_name)
+    except git.GitCommandError:
+        pass
+
+    # Plot this stuff
+    plot_timings(output_root)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description="Generate timing results comparing the current and another branch/hash.")
+    parser.add_argument("dataset_base_path", metavar="dataset_base_path", type=str,
+                        help="Path to the 3DMatch dataset root directory.")
+    parser.add_argument("other_branch_or_hash", metavar="other_branch_or_hash", type=str,
+                        help="The branch name or commit hash to compare against.")
+    parser.add_argument("--num_runs", metavar="num_runs", type=int, default=5,
+                        help="The number of experiments over which to average the timings.")
+
+    args = parser.parse_args()
+    generate_timings(args.dataset_base_path,
+                     args.other_branch_or_hash, args.num_runs)
diff --git a/nvblox/experiments/experiments/compare_branches/plot_comparison.py b/nvblox/experiments/experiments/compare_branches/plot_comparison.py
new file mode 100644
index 00000000..02a38731
--- /dev/null
+++ b/nvblox/experiments/experiments/compare_branches/plot_comparison.py
@@ -0,0 +1,119 @@
+#!/usr/bin/python3
+
+import os
+import argparse
+import glob
+import pandas as pd
+from typing import Tuple
+
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+
+from nvblox_experiments.timing import get_timings_as_dataframe
+
+
+rcParams.update({'figure.autolayout': True})
+
+
+# Get the timings for a single run
+def get_total_times(filepath: str) -> pd.Series:
+    timings = get_timings_as_dataframe(filepath)
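+    # Filter out the dataset-loading timers (3dmatch*, file_loading*, *stbi)
+    # as well as the esdf timers; the remaining rows are what get compared
+    # across branches.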
+    this_directory = os.getcwd()
+    repo = git.Repo(this_directory, search_parent_directories=True)
+    git_root_dir = repo.git.rev_parse("--show-toplevel")
+    assert not repo.bare
+
+    # Branches to time
+    current_branch_name = repo.active_branch.name
+    branch_names = [current_branch_name, other_branch_or_hash]
+
+    # Where to place timings
+    datetime_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
+    output_root = os.path.join(
+        this_directory, 'output', datetime_str)
+
+    # Timing each branch
+    build_dir = os.path.join(git_root_dir, 'nvblox/build')
+    for branch_name in branch_names:
+
+        # Checkout
+        print('Checking out: ' + branch_name)
+        try:
+            repo.git.checkout(branch_name)
+        except git.GitCommandError:
+            print("Could not checkout branch: " + branch_name +
+                  ". Maybe you have uncommitted changes?")
+            return
+
+        # Build
+        if not os.path.exists(build_dir):
+            print("Please create a build space at: " + build_dir)
+            return
+        build_command = f"cd {build_dir} && cmake .. && make -j16"
+        subprocess.call(build_command, shell=True)
+
+        # Benchmark
+        branch_str = branch_name.replace('/', '_')
+        output_dir = os.path.join(output_root, branch_str)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir, exist_ok=True)
+        threedmatch_binary_path = os.path.join(
+            build_dir, 'experiments/fuse_3dmatch')
+        threedmatch.run_multiple(num_runs, threedmatch_binary_path, dataset_path,
+                                 output_dir)
+
+    # Reset to the original branch
+    print('Checking out: ' + current_branch_name)
+    try:
+        repo.git.checkout(current_branch_name)
+    except git.GitCommandError:
+        print("Could not check out the original branch: " + current_branch_name)
+
+    # Plot the results
+    plot_timings(output_root)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description="Generate timing results comparing the current and another branch/hash.")
+    parser.add_argument("dataset_base_path", metavar="dataset_base_path", type=str,
+                        help="Path to the 3DMatch dataset root directory.")
+    parser.add_argument("other_branch_or_hash", metavar="other_branch_or_hash", type=str,
+                        help="The branch name or commit hash to compare against.")
+    parser.add_argument("--num_runs", metavar="num_runs", type=int, default=5,
+                        help="The number of experiments over which to average the timings.")
+
+    args = parser.parse_args()
+    generate_timings(args.dataset_base_path,
+                     args.other_branch_or_hash, args.num_runs) diff --git a/nvblox/experiments/experiments/compare_branches/plot_comparison.py b/nvblox/experiments/experiments/compare_branches/plot_comparison.py new file mode 100644 index 00000000..02a38731 --- /dev/null +++ b/nvblox/experiments/experiments/compare_branches/plot_comparison.py @@ -0,0 +1,119 @@
+#!/usr/bin/python3
+
+import os
+import argparse
+import glob
+import pandas as pd
+from typing import Tuple
+
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+
+from nvblox_experiments.timing import get_timings_as_dataframe
+
+
+rcParams.update({'figure.autolayout': True})
+
+
+# Get the timings for a single run
+def get_total_times(filepath: str) -> pd.Series:
+    timings = get_timings_as_dataframe(filepath)
+    timings = timings[np.logical_not(timings.index.str.startswith('3dmatch'))]
+    timings = timings[np.logical_not(
+        timings.index.str.startswith('file_loading'))]
+    timings = timings[np.logical_not(
+        timings.index.str.endswith('stbi'))]
+    timings = timings[np.logical_not(timings.index.str.startswith('esdf'))]
+    return timings["total_time"]
+
+
+# Get timings for multiple runs and average them
+def get_branch_mean_total_times(timing_dir: str) -> Tuple[pd.Series,
np.ndarray]:
+    series = []
+    results_files = glob.glob(timing_dir + "/run_*.txt")
+    for f in results_files:
+        series.append(get_total_times(f))
+    total_times = pd.concat(series, axis=1)
+    total_times_mean = total_times.mean(axis=1)
+    total_times_max = total_times.max(axis=1)
+    total_times_min = total_times.min(axis=1)
+    total_times_max_min_np = np.vstack((total_times_max, total_times_min))
+    total_times_err = np.abs(total_times_max_min_np -
+                             total_times_mean.to_numpy())
+    return total_times_mean, total_times_err
+
+
+def plot_timings(timing_root_dir: str, save_fig: bool = True) -> None:
+    print("Looking for results in: " + timing_root_dir)
+
+    # Check that the branch results are in the dir
+    result_dirs = [name for name in os.listdir(timing_root_dir)
+                   if os.path.isdir(os.path.join(timing_root_dir, name))]
+    assert len(
+        result_dirs) == 2, "We expect a comparison between exactly two branches"
+
+    # Branch 1 results
+    branch_1_name = result_dirs[0]
+    branch_1_results_folder = os.path.join(timing_root_dir, branch_1_name)
+    branch_1_mean, branch_1_err = get_branch_mean_total_times(
+        branch_1_results_folder)
+
+    # Branch 2 results
+    branch_2_name = result_dirs[1]
+    branch_2_results_folder = os.path.join(timing_root_dir, branch_2_name)
+    branch_2_mean, branch_2_err = get_branch_mean_total_times(
+        branch_2_results_folder)
+
+    # Select only the intersection of the two series.
+    intersection_columns = branch_1_mean.index.intersection(
+        branch_2_mean.index)
+
+    branch_1_inds = [branch_1_mean.index.get_loc(
+        c) for c in intersection_columns]
+    branch_2_inds = [branch_2_mean.index.get_loc(
+        c) for c in intersection_columns]
+
+    branch_1_mean_filtered = branch_1_mean[intersection_columns]
+    branch_2_mean_filtered = branch_2_mean[intersection_columns]
+    branch_1_err_filtered = branch_1_err[:, branch_1_inds]
+    branch_2_err_filtered = branch_2_err[:, branch_2_inds]
+
+    assert len(branch_2_mean_filtered) == len(branch_1_mean_filtered)
+
+    # Plot
+    fig, ax = plt.subplots()
+    bar_width = 0.35
+    x = np.arange(len(branch_1_mean_filtered))
+    x_device = x - bar_width / 2.0
+    x_unified = x + bar_width / 2.0
+    bar1 = ax.bar(x_device, branch_1_mean_filtered, bar_width,
+                  yerr=branch_1_err_filtered, label=branch_1_name)
+    bar2 = ax.bar(x_unified, branch_2_mean_filtered, bar_width,
+                  yerr=branch_2_err_filtered, label=branch_2_name)
+    try:
+        ax.bar_label(bar1, fmt='%.2f')
+        ax.bar_label(bar2, fmt='%.2f')
+    except AttributeError:
+        # Axes.bar_label is only available in matplotlib >= 3.4.
+        pass
+    ax.set_xticks(x)
+    ax.set_xticklabels(intersection_columns, rotation='vertical')
+    ax.legend()
+
+    # Save the plot in the root folder of the timings.
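+    # (Note: the figure is saved before plt.show() below; show() blocks until
+    # the window is closed, and saving afterwards may write an empty canvas.)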
+ if save_fig: + image_path = os.path.join(timing_root_dir, 'timings.png') + print("Saving figure to disk as: " + image_path) + plt.savefig(image_path) + + plt.show() + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description="Plot branch timing comparisons.") + parser.add_argument("timing_root_dir", metavar="timing_root_dir", type=str, + help="The directory containing the timing results.") + args = parser.parse_args() + plot_timings(args.timing_root_dir) diff --git a/nvblox/experiments/experiments/isolate_tsdf_block_update/CMakeLists.txt b/nvblox/experiments/experiments/isolate_tsdf_block_update/CMakeLists.txt new file mode 100644 index 00000000..cd564351 --- /dev/null +++ b/nvblox/experiments/experiments/isolate_tsdf_block_update/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(isolate_tsdf_block_update main.cpp) +target_link_libraries(isolate_tsdf_block_update nvblox_lib) diff --git a/nvblox/experiments/experiments/isolate_tsdf_block_update/main.cpp b/nvblox/experiments/experiments/isolate_tsdf_block_update/main.cpp new file mode 100644 index 00000000..4903c3a8 --- /dev/null +++ b/nvblox/experiments/experiments/isolate_tsdf_block_update/main.cpp @@ -0,0 +1,118 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#include <iostream>
+#include <string>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "nvblox/core/camera.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/core/types.h"
+#include "nvblox/datasets/parse_3dmatch.h"
+#include "nvblox/integrators/integrators_common.h"
+#include "nvblox/integrators/projective_tsdf_integrator.h"
+#include "nvblox/utils/timing.h"
+
+DECLARE_bool(alsologtostderr);
+
+using namespace nvblox;
+
+// Just a class so we can access integrator internals
+class ProjectiveTsdfIntegratorExperiment : public ProjectiveTsdfIntegrator {
+ public:
+  ProjectiveTsdfIntegratorExperiment() : ProjectiveTsdfIntegrator() {}
+  virtual ~ProjectiveTsdfIntegratorExperiment(){};
+
+  // Expose this publicly
+  void updateBlocks(const std::vector<Index3D>& block_indices,
+                    const DepthImage& depth_frame, const Transform& T_L_C,
+                    const Camera& camera, const float truncation_distance_m,
+                    TsdfLayer* layer) {
+    ProjectiveTsdfIntegrator::updateBlocks(block_indices, depth_frame, T_L_C,
+                                           camera, truncation_distance_m,
+                                           layer);
+  }
+};
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+
+  const std::string dataset_base_path = "../../../tests/data/3dmatch";
+  constexpr int kSeqNum = 1;
+
+  constexpr float kVoxelSize = 0.05;
+  TsdfLayer tsdf_layer(kVoxelSize, MemoryType::kDevice);
+
+  ProjectiveTsdfIntegratorExperiment tsdf_integrator;
+
+  const unsigned int frustum_raycast_subsampling_rate = 4;
+  tsdf_integrator.frustum_calculator().raycast_subsampling_factor(
+      frustum_raycast_subsampling_rate);
+
+  // Update identified blocks (many times)
+  constexpr int kNumIntegrations = 1000;
+  for (int i = 0; i < kNumIntegrations; i++) {
+    // Load images
+    auto image_loader_ptr = datasets::threedmatch::createDepthImageLoader(
+        dataset_base_path, kSeqNum);
+
+    DepthImage depth_frame;
+    CHECK(image_loader_ptr->getNextImage(&depth_frame));
+
+    Eigen::Matrix3f camera_intrinsics;
+    CHECK(datasets::threedmatch::parseCameraFromFile(
+        datasets::threedmatch::getPathForCameraIntrinsics(dataset_base_path),
+        &camera_intrinsics));
+    const auto camera = Camera::fromIntrinsicsMatrix(
+        camera_intrinsics, depth_frame.width(), depth_frame.height());
+
+    Transform T_L_C;
+    CHECK(datasets::threedmatch::parsePoseFromFile(
+        datasets::threedmatch::getPathForFramePose(dataset_base_path, kSeqNum,
+                                                   0),
+        &T_L_C));
+
+    // Identify blocks we can (potentially) see (CPU)
+    timing::Timer blocks_in_view_timer("tsdf/integrate/get_blocks_in_view");
+    const std::vector<Index3D> block_indices =
+        tsdf_integrator.getBlocksInViewUsingRaycasting(
+            depth_frame, T_L_C, camera, tsdf_layer.block_size());
+    blocks_in_view_timer.Stop();
+
+    // Allocate blocks (CPU)
+    timing::Timer allocate_blocks_timer("tsdf/integrate/allocate_blocks");
+    allocateBlocksWhereRequired(block_indices, &tsdf_layer);
+    allocate_blocks_timer.Stop();
+
+    timing::Timer update_blocks_timer("tsdf/integrate/update_blocks");
+    const float truncation_distance_m =
+        tsdf_integrator.truncation_distance_vox() * kVoxelSize;
+    tsdf_integrator.updateBlocks(block_indices, depth_frame, T_L_C, camera,
+                                 truncation_distance_m, &tsdf_layer);
+    update_blocks_timer.Stop();
+
+    // Reset the layer such that we do TsdfBlock allocation.
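+    // (With clear() left commented out below, blocks are only allocated on
+    // the first pass, so the update_blocks timer isolates the update itself;
+    // uncomment it to also re-measure allocation on every iteration.)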
+    // tsdf_layer.clear();
+  }
+
+  std::cout << timing::Timing::Print() << std::endl;
+
+  return 0;
+} \ No newline at end of file diff --git a/nvblox/experiments/experiments/layer_cake_interface/CMakeLists.txt b/nvblox/experiments/experiments/layer_cake_interface/CMakeLists.txt new file mode 100644 index 00000000..b51552ba --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/CMakeLists.txt @@ -0,0 +1,25 @@
+add_executable(layer_cake_interface_dynamic
+    main_dynamic.cpp
+    src/cuda/user_defined_block.cu
+)
+target_include_directories(layer_cake_interface_dynamic PUBLIC
+    include
+)
+target_link_libraries(layer_cake_interface_dynamic nvblox_lib)
+
+add_executable(layer_cake_interface_static
+    main_static.cpp
+)
+target_include_directories(layer_cake_interface_static PUBLIC
+    include
+)
+target_link_libraries(layer_cake_interface_static nvblox_lib)
+
+add_executable(layer_cake_interface_type_erasure
+    main_type_erasure.cpp
+)
+target_include_directories(layer_cake_interface_type_erasure PUBLIC
+    include
+)
+target_link_libraries(layer_cake_interface_type_erasure nvblox_lib)
+ diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_common.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_common.h new file mode 100644 index 00000000..bc16d20a --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_common.h @@ -0,0 +1,32 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+namespace experiments {
+
+// Unique types
+template <typename... Ts>
+struct unique_types;
+
+// Count occurrences
+template <typename TargetType, typename... Ts>
+struct count_type_occurrence;
+
+}  // namespace experiments
+}  // namespace nvblox
+
+#include "nvblox/experiments/impl/cake_common_impl.h" diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_dynamic.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_dynamic.h new file mode 100644 index 00000000..d9a941ed --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_dynamic.h @@ -0,0 +1,61 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <memory>
+#include <typeindex>
+#include <unordered_map>
+
+#include <glog/logging.h>
+
+#include "nvblox/core/layer.h"
+
+#include "nvblox/experiments/cake_common.h"
+
+namespace nvblox {
+namespace experiments {
+
+class LayerCakeDynamic {
+ public:
+  LayerCakeDynamic(float voxel_size, MemoryType memory_type)
+      : voxel_size_(voxel_size), memory_type_(memory_type) {}
+
+  template <typename LayerType>
+  LayerType* add();
+
+  template <typename LayerType>
+  LayerType* getPtr();
+
+  template <typename LayerType>
+  bool exists() const;
+
+  // Factory accepting a list of LayerTypes
+  template <typename... LayerTypes>
+  static LayerCakeDynamic create(float voxel_size, MemoryType memory_type);
+
+ private:
+  // Params
+  const float voxel_size_;
+  const MemoryType memory_type_;
+
+  // NOTE(alexmillane): Could move to multimap if we want more than one layer
+  // with the same type
+  std::unordered_map<std::type_index, std::unique_ptr<BaseLayer>> layers_;
+};
+
+}  // namespace experiments
+}  // namespace nvblox
+
+#include "nvblox/experiments/impl/cake_dynamic_impl.h" diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_static.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_static.h new file mode 100644 index 00000000..5b2e9105 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_static.h @@ -0,0 +1,47 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+namespace experiments {
+
+template <typename... LayerTypes>
+class LayerCakeStatic {
+ public:
+  LayerCakeStatic(float voxel_size, MemoryType memory_type)
+      : layers_(std::move(
+            LayerTypes(sizeArgumentFromVoxelSize<LayerTypes>(voxel_size),
+                       memory_type))...) {}
+
+  // Access by LayerType.
+  // This requires that the list of types contained in this LayerCakeStatic are
+  // unique with respect to one another.
+  template <typename LayerType>
+  LayerType* getPtr();
+  template <typename LayerType>
+  const LayerType& get() const;
+
+  template <typename LayerType>
+  int count() const;
+
+ private:
+  std::tuple<LayerTypes...> layers_;
+};
+
+}  // namespace experiments
+}  // namespace nvblox
+
+#include "nvblox/experiments/impl/cake_static_impl.h" diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_type_erasure.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_type_erasure.h new file mode 100644 index 00000000..550eb062 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/cake_type_erasure.h @@ -0,0 +1,95 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+
+namespace nvblox {
+namespace experiments {
+
+// Note: A good resource for type erasure can be found here:
+// https://github.com/alecjacobson/better-code-runtime-polymorphism/blob/master/library.hpp
+
+template <typename LayerType>
+bool loadLayer(const std::string& filename, LayerType* layer_ptr) {
+  LOG(INFO) << "Loading Layer of type: " << typeid(*layer_ptr).name();
+  return true;
+}
+
+class LayerInterface {
+ public:
+  template <typename LayerType>
+  LayerInterface(LayerType&& layer)
+      : layer_(std::make_unique<LayerModel<LayerType>>(std::move(layer))) {}
+
+  bool load(const std::string& filename) { return layer_->load(filename); }
+
+ private:
+  struct LayerConcept {
+    virtual ~LayerConcept() = default;
+    virtual bool load(const std::string& filename) = 0;
+  };
+
+  template <typename LayerType>
+  struct LayerModel : LayerConcept {
+    LayerModel(LayerType&& layer) : layer_(std::move(layer)) {}
+
+    bool load(const std::string& filename) override {
+      return loadLayer(filename, &layer_);
+    }
+
+    // Where the layer is actually stored
+    LayerType layer_;
+  };
+
+  std::unique_ptr<LayerConcept> layer_;
+};
+
+class LayerCakeTypeErasure {
+ public:
+  LayerCakeTypeErasure(float voxel_size, MemoryType memory_type)
+      : voxel_size_(voxel_size), memory_type_(memory_type){};
+
+  template <typename LayerType>
+  void add() {
+    layers_.push_back(LayerType(
+        sizeArgumentFromVoxelSize<LayerType>(voxel_size_), memory_type_));
+  }
+
+  bool load(const std::string& filename) {
+    bool success = true;
+    for (auto& layer : layers_) {
+      success &= layer.load(filename);
+    }
+    return success;
+  }
+
+ private:
+  // Params
+  const float voxel_size_;
+  const MemoryType memory_type_;
+
+  // Data
+  std::vector<LayerInterface> layers_;
+};
+
+}  // namespace experiments
+}  // namespace nvblox \ No newline at end of file diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_common_impl.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_common_impl.h new file mode 100644 index 00000000..9c81fba8 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_common_impl.h @@ -0,0 +1,63 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+namespace experiments {
+
+// Unique type
+template <typename T1, typename T2, typename... Ts>
+struct unique_types<T1, T2, Ts...> {
+  static constexpr bool value = unique_types<T1, T2>::value &&
+                                unique_types<T1, Ts...>::value &&
+                                unique_types<T2, Ts...>::value;
+};
+
+template <typename T1, typename T2>
+struct unique_types<T1, T2> {
+  static constexpr bool value = !std::is_same<T1, T2>::value;
+};
+
+template <typename T1>
+struct unique_types<T1> {
+  static constexpr bool value = true;
+};
+
+template <>
+struct unique_types<> {
+  static constexpr bool value = true;
+};
+
+// Count occurrences
+template <typename TargetType, typename T1, typename... Ts>
+struct count_type_occurrence<TargetType, T1, Ts...> {
+  static constexpr int value = count_type_occurrence<TargetType, T1>::value +
+                               count_type_occurrence<TargetType, Ts...>::value;
+};
+
+template <typename TargetType, typename T1>
+struct count_type_occurrence<TargetType, T1> {
+  static constexpr int value =
+      static_cast<int>(std::is_same<TargetType, T1>::value);
+};
+
+template <typename TargetType>
+struct count_type_occurrence<TargetType> {
+  static constexpr int value = 0;
+};
+
+}  // namespace experiments
+}  // namespace nvblox \ No newline at end of file diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_dynamic_impl.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_dynamic_impl.h new file mode 100644 index 00000000..c2be15ae --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_dynamic_impl.h @@ -0,0 +1,73 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+namespace experiments {
+
+template <typename LayerType>
+LayerType* LayerCakeDynamic::add() {
+  if (layers_.count(typeid(LayerType)) == 0) {
+    // Allocate
+    auto layer_ptr = std::make_unique<LayerType>(
+        sizeArgumentFromVoxelSize<LayerType>(voxel_size_), memory_type_);
+    LayerType* return_ptr = layer_ptr.get();
+    // Store (as BaseLayer ptr)
+    layers_.emplace(std::type_index(typeid(LayerType)), std::move(layer_ptr));
+    LOG(INFO) << "Adding Layer with type: " << typeid(LayerType).name()
+              << " to LayerCake.";
+    return return_ptr;
+  } else {
+    LOG(WARNING) << "Request to add a LayerType that's already in the cake. "
+                    "Currently we only support single layers of each "
+                    "LayerType.
So we did nothing.";
+    return nullptr;
+  }
+}
+
+template <typename LayerType>
+LayerType* LayerCakeDynamic::getPtr() {
+  const auto it = layers_.find(std::type_index(typeid(LayerType)));
+  if (it != layers_.end()) {
+    BaseLayer* base_ptr = it->second.get();
+    LayerType* ptr = dynamic_cast<LayerType*>(base_ptr);
+    CHECK_NOTNULL(ptr);
+    return ptr;
+  } else {
+    LOG(WARNING) << "Request for a LayerType which is not in the cake.";
+    return nullptr;
+  }
+}
+
+template <typename LayerType>
+bool LayerCakeDynamic::exists() const {
+  const auto it = layers_.find(std::type_index(typeid(LayerType)));
+  return (it != layers_.end());
+}
+
+template <typename... LayerTypes>
+LayerCakeDynamic LayerCakeDynamic::create(float voxel_size,
+                                          MemoryType memory_type) {
+  static_assert(unique_types<LayerTypes...>::value,
+                "At the moment we only support LayerCakes containing unique "
+                "LayerTypes.");
+  LayerCakeDynamic cake(voxel_size, memory_type);
+  BaseLayer* ignored[] = {cake.add<LayerTypes>()...};
+  return cake;
+}
+
+}  // namespace experiments
+}  // namespace nvblox diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_static_impl.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_static_impl.h new file mode 100644 index 00000000..55ef5aa7 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/cake_static_impl.h @@ -0,0 +1,52 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include "nvblox/experiments/cake_common.h"
+
+namespace nvblox {
+namespace experiments {
+
+template <typename... LayerTypes>
+template <typename LayerType>
+LayerType* LayerCakeStatic<LayerTypes...>::getPtr() {
+  static_assert(count_type_occurrence<LayerType, LayerTypes...>::value > 0,
+                "LayerCake does not contain requested layer");
+  static_assert(count_type_occurrence<LayerType, LayerTypes...>::value < 2,
+                "To get a Layer by type, the LayerCake must contain "
+                "only a single matching LayerType");
+  return &std::get<LayerType>(layers_);
+}
+
+template <typename... LayerTypes>
+template <typename LayerType>
+const LayerType& LayerCakeStatic<LayerTypes...>::get() const {
+  static_assert(count_type_occurrence<LayerType, LayerTypes...>::value > 0,
+                "LayerCake does not contain requested layer");
+  static_assert(count_type_occurrence<LayerType, LayerTypes...>::value < 2,
+                "To get a Layer by type, the LayerCake must contain "
+                "only a single matching LayerType");
+  return std::get<LayerType>(layers_);
+}
+
+template <typename... LayerTypes>
+template <typename LayerType>
+int LayerCakeStatic<LayerTypes...>::count() const {
+  return count_type_occurrence<LayerType, LayerTypes...>::value;
+}
+
+}  // namespace experiments
+}  // namespace nvblox diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/load_impl.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/load_impl.h new file mode 100644 index 00000000..8e071e15 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/impl/load_impl.h @@ -0,0 +1,60 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+namespace experiments {
+namespace io {
+
+template <typename LayerType>
+bool loadLayer(const std::string& filename, LayerType* layer) {
+  LOG(INFO) << "Loading layer of type: " << typeid(LayerType).name();
+  // TODO: Actually load something.
+  return true;
+}
+
+template <typename... LayerTypes>
+struct Loader;
+
+template <typename LayerType>
+struct Loader<LayerType> {
+  static bool load(const std::string& filename, LayerCakeDynamic* cake) {
+    if (cake->exists<LayerType>()) {
+      LOG(WARNING) << "Not loading layer. A layer of type: "
+                   << typeid(LayerType).name()
+                   << " already exists in the cake.";
+      return false;
+    }
+    return loadLayer(filename, cake->add<LayerType>());
+  }
+};
+
+template <typename LayerType, typename... LayerTypes>
+struct Loader<LayerType, LayerTypes...> {
+  static bool load(const std::string& filename, LayerCakeDynamic* cake) {
+    return Loader<LayerType>::load(filename, cake) &&
+           Loader<LayerTypes...>::load(filename, cake);
+  }
+};
+
+template <typename... LayerTypes>
+bool load(const std::string& filename, LayerCakeDynamic* cake) {
+  return Loader<LayerTypes...>::load(filename, cake);
+}
+
+}  // namespace io
+}  // namespace experiments
+}  // namespace nvblox diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/load.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/load.h new file mode 100644 index 00000000..c127d692 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/load.h @@ -0,0 +1,36 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <string>
+
+#include "nvblox/experiments/cake_dynamic.h"
+
+namespace nvblox {
+namespace experiments {
+namespace io {
+
+template <typename... LayerTypes>
+bool load(const std::string& filename, LayerCakeDynamic* cake);
+
+template <typename LayerType>
+bool loadLayer(const std::string& filename, LayerType* layer);
+
+}  // namespace io
+}  // namespace experiments
+}  // namespace nvblox
+
+#include "nvblox/experiments/impl/load_impl.h" diff --git a/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/user_defined_block.h b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/user_defined_block.h new file mode 100644 index 00000000..402f06e5 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/include/nvblox/experiments/user_defined_block.h @@ -0,0 +1,49 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/unified_ptr.h"
+
+#include "nvblox/experiments/load.h"
+
+namespace nvblox {
+namespace experiments {
+
+struct UserDefinedBlock {
+  typedef nvblox::unified_ptr<UserDefinedBlock> Ptr;
+  typedef nvblox::unified_ptr<const UserDefinedBlock> ConstPtr;
+
+  static Ptr allocate(MemoryType memory_type) {
+    return make_unified<UserDefinedBlock>(memory_type);
+  }
+
+  float data = 0.0f;
+};
+
+using UserDefinedLayer = BlockLayer<UserDefinedBlock>;
+
+// Custom load function for UserDefinedBlock
+namespace io {
+
+template <>
+bool loadLayer(
+    const std::string& filename, experiments::UserDefinedLayer* layer);
+
+}  // namespace io
+}  // namespace experiments
+}  // namespace nvblox \ No newline at end of file diff --git a/nvblox/experiments/experiments/layer_cake_interface/main_dynamic.cpp b/nvblox/experiments/experiments/layer_cake_interface/main_dynamic.cpp new file mode 100644 index 00000000..61aaf182 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/main_dynamic.cpp @@ -0,0 +1,95 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <iostream>
+
+#include <glog/logging.h>
+
+#include "nvblox/core/common_names.h"
+
+#include "nvblox/experiments/cake_dynamic.h"
+#include "nvblox/experiments/load.h"
+#include "nvblox/experiments/user_defined_block.h"
+
+using namespace nvblox;
+
+// Globals for convenience
+const float voxel_size = 0.1f;
+const MemoryType memory_type = MemoryType::kDevice;
+
+void addRetrieve() {
+  std::cout << "addRetrieve()" << std::endl;
+
+  experiments::LayerCakeDynamic cake(voxel_size, memory_type);
+
+  cake.add<TsdfLayer>();
+  TsdfLayer* tsdf_layer = cake.getPtr<TsdfLayer>();
+
+  std::cout << "\n\n" << std::endl;
+}
+
+void loadLayers() {
+  std::cout << "loadLayers()." << std::endl;
+
+  experiments::LayerCakeDynamic cake(voxel_size, memory_type);
+
+  std::string filename = "test.cake";
+  experiments::io::load(filename, &cake);
+
+  std::cout << "\n\n" << std::endl;
+}
+
+void loadCustomLayer() {
+  std::cout << "loadCustomLayer()." << std::endl;
+
+  const float block_size = 1.0f;
+  experiments::UserDefinedLayer user_defined_layer(block_size, memory_type);
+
+  experiments::LayerCakeDynamic cake(voxel_size, memory_type);
+
+  std::string filename = "test.cake";
+  experiments::io::load(filename, &cake);
+
+  std::cout << "\n\n" << std::endl;
+}
+
+void buildCake() {
+  std::cout << "buildCake()" << std::endl;
+
+  experiments::LayerCakeDynamic cake =
+      experiments::LayerCakeDynamic::create(voxel_size, memory_type);
+
+  // Static assert
+  // NOTE: This creates a static assertion failure due to duplicated LayerTypes.
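+  // (The guard is the static_assert on unique_types<LayerTypes...> inside
+  // create(), which rejects duplicate LayerTypes at compile time, e.g.:)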
+  // LayerCakeDynamic cake_error =
+  //     LayerCakeDynamic::create<TsdfLayer, TsdfLayer>(voxel_size,
+  //                                                    memory_type);
+
+  std::cout << "\n\n" << std::endl;
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+
+  addRetrieve();
+  loadLayers();
+  loadCustomLayer();
+  buildCake();
+
+  return 0;
+} diff --git a/nvblox/experiments/experiments/layer_cake_interface/main_static.cpp b/nvblox/experiments/experiments/layer_cake_interface/main_static.cpp new file mode 100644 index 00000000..82cdd268 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/main_static.cpp @@ -0,0 +1,62 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <iostream>
+
+#include <glog/logging.h>
+
+#include "nvblox/core/common_names.h"
+
+#include "nvblox/experiments/cake_static.h"
+
+using namespace nvblox;
+
+// Globals for convenience
+const float voxel_size = 0.1f;
+const MemoryType memory_type = MemoryType::kDevice;
+
+void addRetrieve() {
+  std::cout << "addRetrieve()" << std::endl;
+
+  experiments::LayerCakeStatic<TsdfLayer> cake_1(voxel_size, memory_type);
+
+  TsdfLayer* tsdf_layer = cake_1.getPtr<TsdfLayer>();
+  // EsdfLayer* esdf_layer = cake_1.getPtr<EsdfLayer>();  // Compile error (missing
+  //                                                      // layer)
+
+  experiments::LayerCakeStatic<TsdfLayer, EsdfLayer>
+      cake_2(voxel_size, memory_type);
+
+  tsdf_layer = cake_2.getPtr<TsdfLayer>();
+  EsdfLayer* esdf_layer = cake_2.getPtr<EsdfLayer>();
+
+  experiments::LayerCakeStatic<TsdfLayer, TsdfLayer, TsdfLayer>
+      cake_3(voxel_size, memory_type);
+  CHECK_EQ(cake_3.count<TsdfLayer>(), 3);
+  // tsdf_layer = cake_3.getPtr<TsdfLayer>();  // Compile error (more than one
+  //                                           // TsdfLayer)
+
+  std::cout << "\n\n" << std::endl;
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+
+  addRetrieve();
+
+  return 0;
+} diff --git a/nvblox/experiments/experiments/layer_cake_interface/main_type_erasure.cpp b/nvblox/experiments/experiments/layer_cake_interface/main_type_erasure.cpp new file mode 100644 index 00000000..95e630a4 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/main_type_erasure.cpp @@ -0,0 +1,43 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/layer.h"
+
+#include "nvblox/experiments/cake_type_erasure.h"
+
+using namespace nvblox;
+
+void load() {
+  const float voxel_size = 0.1f;
+  const MemoryType memory_type = MemoryType::kDevice;
+  experiments::LayerCakeTypeErasure cake(voxel_size, memory_type);
+
+  // Add layers
+  cake.add<TsdfLayer>();
+  cake.add<ColorLayer>();
+  cake.add<EsdfLayer>();
+  cake.add<MeshLayer>();
+
+  // Populate layer(s) from file
+  const std::string filename = "test.cake";
+  cake.load(filename);
+}
+
+int main() {
+  load();
+
+  return 0;
+} \ No newline at end of file diff --git a/nvblox/experiments/experiments/layer_cake_interface/src/cuda/user_defined_block.cu b/nvblox/experiments/experiments/layer_cake_interface/src/cuda/user_defined_block.cu new file mode 100644 index 00000000..e1b57f59 --- /dev/null +++ b/nvblox/experiments/experiments/layer_cake_interface/src/cuda/user_defined_block.cu @@ -0,0 +1,34 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/experiments/user_defined_block.h"
+
+#include "nvblox/gpu_hash/cuda/impl/gpu_layer_view_impl.cuh"
+
+namespace nvblox {
+
+template class GPULayerView<UserDefinedBlock>;
+
+namespace experiments {
+namespace io {
+template <>
+bool loadLayer(
+    const std::string& filename, experiments::UserDefinedLayer* layer) {
+  LOG(INFO) << "In the user defined load function for: UserDefinedLayer";
+  return true;
+}
+}  // namespace io
+}  // namespace experiments
+}  // namespace nvblox \ No newline at end of file diff --git a/nvblox/experiments/experiments/realistic_timings/CMakeLists.txt b/nvblox/experiments/experiments/realistic_timings/CMakeLists.txt new file mode 100644 index 00000000..6a590ecb --- /dev/null +++ b/nvblox/experiments/experiments/realistic_timings/CMakeLists.txt @@ -0,0 +1,4 @@
+add_custom_target( realistic_timings )
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/gather_timings.py" "${CMAKE_CURRENT_BINARY_DIR}/" realistic_timings)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/plot_timings.py" "${CMAKE_CURRENT_BINARY_DIR}/" realistic_timings)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/compare_platforms.py" "${CMAKE_CURRENT_BINARY_DIR}/" realistic_timings) diff --git a/nvblox/experiments/experiments/realistic_timings/compare_platforms.py b/nvblox/experiments/experiments/realistic_timings/compare_platforms.py new file mode 100644 index 00000000..bac01c4f --- /dev/null +++ b/nvblox/experiments/experiments/realistic_timings/compare_platforms.py @@ -0,0 +1,72 @@
+#!/usr/bin/python3
+
+from nvblox.experiments.timing import get_timings_as_dataframe
+import os
+import argparse
+import glob
+import pandas as pd
+from typing import Tuple
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+rcParams.update({'figure.autolayout': True})
+
+
+frame_rate = 30
+
+
+def get_total_times(filepath: str) -> pd.Series:
+    timings = get_timings_as_dataframe(filepath)
+    timings = timings[timings.index.str.startswith('3dmatch')]
+    def mapper(x): return x.replace("3dmatch/", "").replace("_", " ")
+    timings = timings.rename(mapper, axis='index')
+
total_frames = timings["num_calls"].max() + total_seconds = total_frames / frame_rate + return timings["total_time"].divide(total_seconds) + + +def plot_timings(timing_root_dir: str, save_fig: bool = True) -> None: + print("Looking for results in: " + timing_root_dir) + + # Check that the profile results are in the dir + result_files = [name for name in os.listdir(timing_root_dir) + if os.path.isfile(os.path.join(timing_root_dir, name))] + + # Load every text file in the results dir. + df = pd.DataFrame() + for result_name in result_files: + results_path = os.path.join(timing_root_dir, result_name) + if os.path.splitext(result_name)[1] != ".txt": + continue + timings = get_total_times(results_path) + this_df = timings.to_frame(name=os.path.splitext(result_name)[0]) + if df.empty: + df = this_df + else: + df = pd.merge(df, this_df, left_index=True, right_index=True) + + # Plot + ax = df.plot.bar(rot=0) + ax.set_ylabel('Processing Time per Second of Data (s)') + for con in ax.containers: + ax.bar_label(con, fmt='%.1f', rotation=45) + ax.tick_params(axis='x', labelrotation=45) + plt.title("Platform Comparison for 3D Match") + + # Save the plot in the root folder of the timings. + if save_fig: + image_path = os.path.join(timing_root_dir, 'timings.png') + print("Saving figure to disk as: " + image_path) + plt.savefig(image_path) + + plt.show() + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description="Plot 3d match timing comparisons across platforms.") + parser.add_argument("timing_root_dir", metavar="timing_root_dir", type=str, + help="The directory containing the timing results.") + args = parser.parse_args() + plot_timings(args.timing_root_dir) diff --git a/nvblox/experiments/experiments/realistic_timings/gather_timings.py b/nvblox/experiments/experiments/realistic_timings/gather_timings.py new file mode 100644 index 00000000..cd08fe51 --- /dev/null +++ b/nvblox/experiments/experiments/realistic_timings/gather_timings.py @@ -0,0 +1,68 @@ +#!/usr/bin/python3 + +import os +import time +import argparse +import subprocess +from datetime import datetime + +import git + +import nvblox.experiments.threedmatch as threedmatch + +from plot_timings import plot_timings + +profiles = {"min": {"tsdf_frame_subsampling": 3, "color_frame_subsampling": 30, + "esdf_frame_subsampling": 10, "mesh_frame_subsampling": 30}, + "goal": {"tsdf_frame_subsampling": 1, "color_frame_subsampling": 10, + "esdf_frame_subsampling": 3, "mesh_frame_subsampling": 10}} +resolutions = [0.05, 0.10, 0.20] + + +def gather_timings(dataset_path: str) -> None: + # Params + num_runs = 1 + + # Get the repo base & all needed paths. 
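+    # (Repo discovery mirrors compare_branches.py: GitPython searches upward
+    # from the current working directory for the enclosing repository.)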
+    this_directory = os.getcwd()
+    repo = git.Repo(this_directory, search_parent_directories=True)
+    git_root_dir = repo.git.rev_parse("--show-toplevel")
+    assert not repo.bare
+    build_dir = os.path.join(git_root_dir, 'nvblox/build')
+    threedmatch_binary_path = os.path.join(
+        build_dir, 'experiments/fuse_3dmatch')
+
+    # Where to place timings
+    datetime_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
+    output_root = os.path.join(
+        this_directory, 'output', datetime_str)
+
+    # Time each profile
+    for profile_name, profile_val in profiles.items():
+        folder_name = profile_name
+
+        # Time each resolution within each profile
+        # TODO
+        # Benchmark
+        output_dir = os.path.join(output_root, folder_name)
+        if not os.path.exists(output_dir):
+            os.makedirs(output_dir, exist_ok=True)
+
+        threedmatch.run_multiple(
+            num_runs, threedmatch_binary_path, dataset_path, output_dir,
+            warmup_run=False, flags=profile_val)
+
+    # Plot the results
+    plot_timings(output_root)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description="Gather 3DMatch timing results for a set of parameter profiles.")
+    parser.add_argument(
+        "dataset_base_path", metavar="dataset_base_path", type=str,
+        help="Path to the 3DMatch dataset root directory.")
+
+    args = parser.parse_args()
+    gather_timings(args.dataset_base_path) diff --git a/nvblox/experiments/experiments/realistic_timings/plot_timings.py b/nvblox/experiments/experiments/realistic_timings/plot_timings.py new file mode 100644 index 00000000..f3dcb9dc --- /dev/null +++ b/nvblox/experiments/experiments/realistic_timings/plot_timings.py @@ -0,0 +1,116 @@
+#!/usr/bin/python3
+
+from nvblox.experiments.timing import get_timings_as_dataframe
+import os
+import argparse
+import glob
+import pandas as pd
+from typing import Tuple
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+rcParams.update({'figure.autolayout': True})
+
+# Get the timings for a single run - ONLY the 3dmatch timings.
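+# Timings are normalized to "processing time per second of data": the
+# 3dmatch per-frame timer's num_calls gives the number of frames, and
+# dividing by the 30 Hz sensor rate gives the dataset duration in seconds.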
+
+frame_rate = 30
+
+def get_total_times(filepath: str) -> pd.Series:
+    timings = get_timings_as_dataframe(filepath)
+    timings = timings[timings.index.str.startswith('3dmatch')]
+    total_frames = timings["num_calls"].max()
+    total_seconds = total_frames / frame_rate
+    return timings["total_time"].divide(total_seconds)
+
+# Get timings for multiple runs and average them
+
+
+def get_profile_mean_total_times(timing_dir: str) -> Tuple[pd.Series, np.ndarray]:
+    series = []
+    results_files = glob.glob(timing_dir + "/run_*.txt")
+    for f in results_files:
+        series.append(get_total_times(f))
+    total_times = pd.concat(series, axis=1)
+    total_times_mean = total_times.mean(axis=1)
+    total_times_max = total_times.max(axis=1)
+    total_times_min = total_times.min(axis=1)
+    total_times_max_min_np = np.vstack((total_times_max, total_times_min))
+    total_times_err = np.abs(total_times_max_min_np -
+                             total_times_mean.to_numpy())
+    return total_times_mean, total_times_err
+
+
+def plot_timings(timing_root_dir: str, save_fig: bool = True) -> None:
+    print("Looking for results in: " + timing_root_dir)
+
+    # Check that the profile results are in the dir
+    result_dirs = [name for name in os.listdir(timing_root_dir)
+                   if os.path.isdir(os.path.join(timing_root_dir, name))]
+    assert len(
+        result_dirs) == 2, "We expect a comparison between exactly two profiles (out of laziness)"
+
+    # Profile 1 results
+    profile_1_name = result_dirs[0]
+    profile_1_results_folder = os.path.join(timing_root_dir, profile_1_name)
+    profile_1_mean, profile_1_err = get_profile_mean_total_times(
+        profile_1_results_folder)
+
+    # Profile 2 results
+    profile_2_name = result_dirs[1]
+    profile_2_results_folder = os.path.join(timing_root_dir, profile_2_name)
+    profile_2_mean, profile_2_err = get_profile_mean_total_times(
+        profile_2_results_folder)
+
+    # Select only the intersection of the two series.
+    intersection_columns = profile_1_mean.index.intersection(
+        profile_2_mean.index)
+
+    profile_1_inds = [profile_1_mean.index.get_loc(
+        c) for c in intersection_columns]
+    profile_2_inds = [profile_2_mean.index.get_loc(
+        c) for c in intersection_columns]
+
+    profile_1_mean_filtered = profile_1_mean[intersection_columns]
+    profile_2_mean_filtered = profile_2_mean[intersection_columns]
+    profile_1_err_filtered = profile_1_err[:, profile_1_inds]
+    profile_2_err_filtered = profile_2_err[:, profile_2_inds]
+
+    assert len(profile_2_mean_filtered) == len(profile_1_mean_filtered)
+
+    # Plot
+    fig, ax = plt.subplots()
+    bar_width = 0.35
+    x = np.arange(len(profile_1_mean_filtered))
+    x_device = x - bar_width / 2.0
+    x_unified = x + bar_width / 2.0
+    bar1 = ax.bar(x_device, profile_1_mean_filtered, bar_width,
+                  yerr=profile_1_err_filtered, label=profile_1_name)
+    bar2 = ax.bar(x_unified, profile_2_mean_filtered, bar_width,
+                  yerr=profile_2_err_filtered, label=profile_2_name)
+    ax.set_xticks(x)
+    ax.set_xticklabels(intersection_columns, rotation='vertical')
+    try:
+        ax.bar_label(bar1, fmt='%.2f')
+        ax.bar_label(bar2, fmt='%.2f')
+    except AttributeError:
+        # Axes.bar_label is only available in matplotlib >= 3.4.
+        pass
+    ax.legend()
+    ax.set_ylabel('Processing Time per Second of Data (s)')
+
+    # Save the plot in the root folder of the timings.
+    if save_fig:
+        image_path = os.path.join(timing_root_dir, 'timings.png')
+        print("Saving figure to disk as: " + image_path)
+        plt.savefig(image_path)
+
+    plt.show()
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description="Plot profile timing comparisons.")
+    parser.add_argument("timing_root_dir", metavar="timing_root_dir", type=str,
+                        help="The directory containing the timing results.")
+    args = parser.parse_args()
+    plot_timings(args.timing_root_dir) diff --git a/nvblox/experiments/experiments/stream_compaction/CMakeLists.txt b/nvblox/experiments/experiments/stream_compaction/CMakeLists.txt new file mode 100644 index 00000000..b31e0eb5 --- /dev/null +++ b/nvblox/experiments/experiments/stream_compaction/CMakeLists.txt @@ -0,0 +1,11 @@
+add_executable(stream_compaction
+    main.cpp
+    lib/stream_compaction.cu
+    lib/cached_allocator.cu
+)
+target_include_directories(stream_compaction PUBLIC
+    include
+)
+target_link_libraries(stream_compaction nvblox_experiments_common)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/run.py" "${CMAKE_CURRENT_BINARY_DIR}/" stream_compaction)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/plot.py" "${CMAKE_CURRENT_BINARY_DIR}/" stream_compaction) diff --git a/nvblox/experiments/experiments/stream_compaction/include/nvblox/experiments/cached_allocator.h b/nvblox/experiments/experiments/stream_compaction/include/nvblox/experiments/cached_allocator.h new file mode 100644 index 00000000..83bedbe4 --- /dev/null +++ b/nvblox/experiments/experiments/stream_compaction/include/nvblox/experiments/cached_allocator.h @@ -0,0 +1,40 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <cstddef>
+
+namespace nvblox {
+namespace experiments {
+
+struct CachedAllocator {
+  typedef char value_type;
+
+  CachedAllocator() {}
+  ~CachedAllocator();
+
+  char* allocate(std::ptrdiff_t num_bytes);
+
+  void deallocate(char* ptr, size_t n);
+  void free();
+
+  std::ptrdiff_t num_bytes_allocated_ = 0;
+  char* memory_ = nullptr;
+  bool in_use_ = false;
+};
+
+}  // namespace experiments
+}  // namespace nvblox diff --git a/nvblox/experiments/experiments/stream_compaction/include/nvblox/experiments/stream_compaction.h b/nvblox/experiments/experiments/stream_compaction/include/nvblox/experiments/stream_compaction.h new file mode 100644 index 00000000..ede94c1a --- /dev/null +++ b/nvblox/experiments/experiments/stream_compaction/include/nvblox/experiments/stream_compaction.h @@ -0,0 +1,37 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/core/unified_vector.h"
+
+#include "nvblox/experiments/cached_allocator.h"
+
+namespace nvblox {
+namespace experiments {
+
+class StreamCompactor {
+ public:
+  StreamCompactor() {}
+
+  int streamCompactionOnGPU(const device_vector<int>& data,
+                            const device_vector<bool>& stencil,
+                            device_vector<int>* compact_data_ptr,
+                            bool use_cached_allocator = true);
+
+ private:
+  CachedAllocator cached_allocator_;
+};
+
+}  // namespace experiments
+}  // namespace nvblox diff --git a/nvblox/experiments/experiments/stream_compaction/lib/cached_allocator.cu b/nvblox/experiments/experiments/stream_compaction/lib/cached_allocator.cu new file mode 100644 index 00000000..993d30d4 --- /dev/null +++ b/nvblox/experiments/experiments/stream_compaction/lib/cached_allocator.cu @@ -0,0 +1,64 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/experiments/cached_allocator.h"
+
+#include <glog/logging.h>
+
+#include <thrust/system/cuda/memory.h>
+
+namespace nvblox {
+namespace experiments {
+
+CachedAllocator::~CachedAllocator() { free(); }
+
+char* CachedAllocator::allocate(std::ptrdiff_t num_bytes) {
+  CHECK(!in_use_) << "This allocator is only for single consumers.";
+
+  // If there's enough memory,
+  if (num_bytes <= num_bytes_allocated_) {
+    // don't allocate anything and return the block we have.
+  } else {
+    try {
+      // Free old block
+      if (memory_ != nullptr) {
+        thrust::cuda::free(thrust::cuda::pointer<char>(memory_));
+      }
+
+      // allocate memory and convert cuda::pointer to raw pointer
+      memory_ = thrust::cuda::malloc<char>(num_bytes).get();
+      num_bytes_allocated_ = num_bytes;
+    } catch (std::runtime_error& e) {
+      throw;
+    }
+  }
+
+  in_use_ = true;
+  return memory_;
+}
+
+void CachedAllocator::deallocate(char* ptr, size_t n) {
+  in_use_ = false;
+}
+
+void CachedAllocator::free() {
+  thrust::cuda::free(thrust::cuda::pointer<char>(memory_));
+  num_bytes_allocated_ = 0;
+  in_use_ = false;
+  memory_ = nullptr;
+}
+
+}  // namespace experiments
+}  // namespace nvblox diff --git a/nvblox/experiments/experiments/stream_compaction/lib/stream_compaction.cu b/nvblox/experiments/experiments/stream_compaction/lib/stream_compaction.cu new file mode 100644 index 00000000..2b46edf7 --- /dev/null +++ b/nvblox/experiments/experiments/stream_compaction/lib/stream_compaction.cu @@ -0,0 +1,53 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/experiments/stream_compaction.h"
+
+#include <thrust/copy.h>
+
+#include "nvblox/utils/timing.h"
+
+namespace nvblox {
+namespace experiments {
+
+int StreamCompactor::streamCompactionOnGPU(const device_vector<int>& data,
+                                           const device_vector<bool>& stencil,
+                                           device_vector<int>* compact_data_ptr,
+                                           bool use_cached_allocator) {
+  int* output_end_it;
+  if (!use_cached_allocator) {
+    output_end_it = thrust::copy_if(thrust::device,             // NOLINT
+                                    data.data(),                // NOLINT
+                                    data.data() + data.size(),  // NOLINT
+                                    stencil.data(),             // NOLINT
+                                    compact_data_ptr->data(),   // NOLINT
+                                    thrust::identity<bool>());
+
+  } else {
+    output_end_it =
+        thrust::copy_if(thrust::cuda::par(cached_allocator_),  // NOLINT
+                        data.data(),                           // NOLINT
+                        data.data() + data.size(),             // NOLINT
+                        stencil.data(),                        // NOLINT
+                        compact_data_ptr->data(),              // NOLINT
+                        thrust::identity<bool>());
+  }
+
+  // The returned iterator points one past the last compacted element, so the
+  // distance is measured from the start of the *output* buffer.
+  const int num_true = output_end_it - compact_data_ptr->data();
+  return num_true;
+}
+
+}  // namespace experiments
+}  // namespace nvblox diff --git a/nvblox/experiments/experiments/stream_compaction/main.cpp b/nvblox/experiments/experiments/stream_compaction/main.cpp new file mode 100644 index 00000000..a10ab728 --- /dev/null +++ b/nvblox/experiments/experiments/stream_compaction/main.cpp @@ -0,0 +1,106 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/nvblox/experiments/experiments/stream_compaction/main.cpp b/nvblox/experiments/experiments/stream_compaction/main.cpp
new file mode 100644
index 00000000..a10ab728
--- /dev/null
+++ b/nvblox/experiments/experiments/stream_compaction/main.cpp
@@ -0,0 +1,106 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+#include <random>
+#include <vector>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/core/unified_vector.h"
+#include "nvblox/utils/timing.h"
+
+#include "nvblox/experiments/stream_compaction.h"
+
+using namespace nvblox;
+
+DEFINE_int32(num_compactions, 100,
+             "How many stream compactions to run and average timings over.");
+DEFINE_int32(num_bytes, 1e7, "How many bytes in the data to compact.");
+DEFINE_string(timing_output_path, "./timings.txt",
+              "File in which to save the timing results.");
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+
+  // Params
+  const int num_trials = FLAGS_num_compactions;
+  const int num_bytes = FLAGS_num_bytes;
+
+  // Warmup
+  warmupCuda();
+
+  // Data
+  const int kNumElements = num_bytes / sizeof(int);
+  std::vector<int> data(kNumElements);
+  std::iota(data.begin(), data.end(), 0);
+  device_vector<int> data_device(data);
+  host_vector<int> data_host(kNumElements);
+  device_vector<int> data_compact_device(kNumElements);
+  host_vector<int> data_compact_host(kNumElements);
+
+  // Stencil
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_int_distribution<> distribution(0, 1);
+  host_vector<bool> stencil(kNumElements);
+  std::generate(stencil.begin(), stencil.end(), [&]() -> bool {
+    return static_cast<bool>(distribution(gen));
+  });
+  const device_vector<bool> stencil_device = stencil;
+  host_vector<bool> stencil_host(kNumElements);
+
+  // Experiment #1
+  // - transfer data and stencil device -> host
+  // - stream compaction on host
+  // - transfer compact stream host -> device
+  timing::Timer host_timer("host_compaction");
+  for (int trial_idx = 0; trial_idx < num_trials; trial_idx++) {
+    // Stencil + data, device -> host
+    data_host = data_device;
+    stencil_host = stencil_device;
+    // Stream compaction on host
+    data_compact_host.resize(0);
+    for (size_t i = 0; i < stencil_host.size(); i++) {
+      if (stencil_host[i]) {
+        data_compact_host.push_back(data_host[i]);
+      }
+    }
+    // Data, host -> device
+    data_compact_device = data_compact_host;
+  }
+  host_timer.Stop();
+
+  // Experiment #2
+  // - perform compaction on the device
+  timing::Timer device_timer("device_compaction");
+  experiments::StreamCompactor stream_compactor;
+  for (int trial_idx = 0; trial_idx < num_trials; trial_idx++) {
+    stream_compactor.streamCompactionOnGPU(data_device, stencil_device,
+                                           &data_compact_device);
+  }
+  device_timer.Stop();
+
+  std::cout << timing::Timing::Print() << std::endl;
+  std::ofstream timing_file(FLAGS_timing_output_path, std::ofstream::out);
+  timing_file << timing::Timing::Print();
+  timing_file.close();
+}
\ No newline at end of file
diff --git a/nvblox/experiments/experiments/stream_compaction/plot.py b/nvblox/experiments/experiments/stream_compaction/plot.py
new file mode 100644
index 00000000..3eeaf624
--- /dev/null
+++ b/nvblox/experiments/experiments/stream_compaction/plot.py
@@ -0,0 +1,72 @@
+
+from nvblox.experiments.timing import get_timings_as_dataframe
+import os
+import argparse
+import glob
+import numpy as np
+import pandas as pd
+import re
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+rcParams.update({'figure.autolayout': True})
+
+
+# Get the timings for a single run
+def get_total_times(filepath: str) -> pd.Series:
+    timings = get_timings_as_dataframe(filepath)
+    return timings["total_time"]
+
+
+def get_platform_timings(timings_dir: str, platform_name: str = None) -> pd.DataFrame:
+    results_files = glob.glob(timings_dir + "/timings_*.txt")
+    results_files.sort()
+    df = pd.DataFrame()
+    for f in results_files:
+
+        # Extract datasize
+        data_size = int(re.search(r'timings_(.+?).txt', f).group(1))
+
+        # Total times -> time/byte
+        timings = get_total_times(f)
+        timings /= data_size
+
+        # Add to timings for each datasize
+        this_df = timings.to_frame(name=str(data_size) + " bytes")
+        if df.empty:
+            df = this_df
+        else:
+            df = pd.merge(df, this_df, left_index=True, right_index=True)
+    if platform_name is not None:
+        df = df.rename(index={'host_compaction': platform_name + ' host'})
+        df = df.rename(index={'device_compaction': platform_name + ' device'})
+    print(df)
+    return df
+
+
+def plot_timings(timings_dir: str):
+
+    df = pd.DataFrame()
+    platform_timings = []
+    for platform in os.listdir(timings_dir):
+        print("collecting timings for platform: " + platform)
+
+        platform_timings.append(get_platform_timings(
+            os.path.join(timings_dir, platform), platform))
+    df = pd.concat(platform_timings)
+
+    # Plotting
+    df.plot.bar()
+    plt.yscale('log')
+    plt.ylabel('Time per byte (s)')
+    plt.xlabel('Platform/method')
+    plt.show()
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description="Plot stream compaction timing comparisons.")
+    parser.add_argument("timings_dir", metavar="timings_dir", type=str,
+                        help="The directory containing the timing results.")
+    args = parser.parse_args()
+    plot_timings(args.timings_dir)
diff --git a/nvblox/experiments/experiments/stream_compaction/run.py b/nvblox/experiments/experiments/stream_compaction/run.py
new file mode 100644
index 00000000..12a219bf
--- /dev/null
+++ b/nvblox/experiments/experiments/stream_compaction/run.py
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+
+import os
+import git
+import subprocess
+
+
+# Experiment params
+# The list of sizes over which to run the experiments
+data_sizes = [1e3, 1e4, 1e5, 1e6, 1e7]
+
+
+def run() -> None:
+
+    # Create the output path
+    # datetime_str = datetime.now().strftime("%m_%d_%Y_%H_%M_%S")
+    this_directory = os.getcwd()
+    output_root = os.path.join(this_directory, 'output', 'platform')
+    os.makedirs(output_root, exist_ok=True)
+
+    # Binary path
+    repo = git.Repo(this_directory, search_parent_directories=True)
+    git_root_dir = repo.git.rev_parse("--show-toplevel")
+    assert not repo.bare
+    build_dir = os.path.join(git_root_dir, 'nvblox/build')
+    binary_path = os.path.join(
+        build_dir, 'experiments/experiments/stream_compaction/stream_compaction')
+
+    # Running the experiment for each size
+    for data_size in data_sizes:
+
+        output_path = os.path.join(
+            output_root, "timings_" + str(int(data_size)) + ".txt")
+
+        args_string = "--timing_output_path " + output_path
+        args_string += " --num_bytes " + str(int(data_size))
+        run_string = binary_path + ' ' + args_string
+        print("Running experiment as:\n " + run_string)
+        subprocess.call(run_string, shell=True)
+
+
+if __name__ == '__main__':
+    run()
diff --git a/nvblox/experiments/experiments/texture_vs_global_memory_interpolation/CMakeLists.txt b/nvblox/experiments/experiments/texture_vs_global_memory_interpolation/CMakeLists.txt
new file mode 100644
index 00000000..89df593b
--- /dev/null
+++ b/nvblox/experiments/experiments/texture_vs_global_memory_interpolation/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_executable(texture_vs_global_memory_interpolation main.cpp)
+target_link_libraries(texture_vs_global_memory_interpolation nvblox_experiments_common)
diff --git a/nvblox/experiments/experiments/texture_vs_global_memory_interpolation/main.cpp b/nvblox/experiments/experiments/texture_vs_global_memory_interpolation/main.cpp
new file mode 100644
index 00000000..3fda8bb8
--- /dev/null
+++ b/nvblox/experiments/experiments/texture_vs_global_memory_interpolation/main.cpp
@@ -0,0 +1,57 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <glog/logging.h>
+
+#include "nvblox/utils/timing.h"
+
+#include "nvblox/experiments/common/fuse_3dmatch.h"
+#include "nvblox/experiments/integrators/experimental_projective_tsdf_integrators.h"
+
+DECLARE_bool(alsologtostderr);
+
+int main(int argc, char* argv[]) {
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+
+  auto fuser =
+      nvblox::experiments::Fuse3DMatch::createFromCommandLineArgs(argc, argv);
+
+  // Run texture-based interpolation experiment
+  fuser.mapper().tsdf_integrator_ptr() = std::make_shared<
+      nvblox::experiments::ProjectiveTsdfIntegratorExperimentsTexture>();
+  int res_flag = fuser.run();
+  if (res_flag != 0) {
+    return res_flag;
+  }
+  const std::string texture_based_timing = nvblox::timing::Timing::Print();
+  nvblox::timing::Timing::Reset();
+
+  // Run global-memory-based interpolation experiment
+  fuser.mapper().tsdf_integrator_ptr() = std::make_shared<
+      nvblox::experiments::ProjectiveTsdfIntegratorExperimentsGlobal>();
+  res_flag = fuser.run();
+  const std::string global_based_timing = nvblox::timing::Timing::Print();
+
+  // Printing the results
+  std::cout << "\n\n\nTexture-based interpolation timings" << std::endl;
+  std::cout << "-----------" << std::endl;
+  std::cout << texture_based_timing << std::endl;
+  std::cout << "\n\n\nGlobal-based interpolation timings" << std::endl;
+  std::cout << "-----------" << std::endl;
+  std::cout << global_based_timing << std::endl;
+
+  return res_flag;
+}
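Not part of the patch: a minimal sketch of the two interpolation paths the experiment above compares. The kernel names here are hypothetical; the real implementations live in the experimental integrators further down. The texture path gets bilinear filtering from the texture unit in a single fetch (filter mode cudaFilterModeLinear); the global-memory path issues four loads and blends in software:

    __global__ void interpolateTexture(cudaTextureObject_t depth_tex, float u,
                                       float v, float* out) {
      // Hardware bilinear filtering: one fetch does the whole job.
      *out = tex2D<float>(depth_tex, u + 0.5f, v + 0.5f);
    }

    __global__ void interpolateGlobal(const float* depth, int cols, float u,
                                      float v, float* out) {
      // Software bilinear filtering from global memory: four loads + blend.
      const int x0 = static_cast<int>(u);
      const int y0 = static_cast<int>(v);
      const float ax = u - x0;
      const float ay = v - y0;
      const float v00 = depth[y0 * cols + x0];
      const float v01 = depth[y0 * cols + x0 + 1];
      const float v10 = depth[(y0 + 1) * cols + x0];
      const float v11 = depth[(y0 + 1) * cols + x0 + 1];
      *out = (1.f - ay) * ((1.f - ax) * v00 + ax * v01) +
             ay * ((1.f - ax) * v10 + ax * v11);
    }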
diff --git a/nvblox/experiments/experiments/threaded_image_loading/CMakeLists.txt b/nvblox/experiments/experiments/threaded_image_loading/CMakeLists.txt
new file mode 100644
index 00000000..8db16e0c
--- /dev/null
+++ b/nvblox/experiments/experiments/threaded_image_loading/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(threaded_image_loading main.cpp)
+target_link_libraries(threaded_image_loading nvblox_experiments_common)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/run.py" "${CMAKE_CURRENT_BINARY_DIR}/" threaded_image_loading)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/plot_comparison.py" "${CMAKE_CURRENT_BINARY_DIR}/" threaded_image_loading)
diff --git a/nvblox/experiments/experiments/threaded_image_loading/__init__.py b/nvblox/experiments/experiments/threaded_image_loading/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/nvblox/experiments/experiments/threaded_image_loading/main.cpp b/nvblox/experiments/experiments/threaded_image_loading/main.cpp
new file mode 100644
index 00000000..ecc2e89d
--- /dev/null
+++ b/nvblox/experiments/experiments/threaded_image_loading/main.cpp
@@ -0,0 +1,89 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <string>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/datasets/parse_3dmatch.h"
+
+#include "nvblox/experiments/common/fuse_3dmatch.h"
+
+DEFINE_bool(single_thread, false, "Load images using a single thread.");
+DEFINE_bool(multi_thread, false, "Load images using multiple threads.");
+DEFINE_int32(num_threads, -1,
+             "Number of threads to load images with in multithreaded mode.");
+DEFINE_int32(num_images, -1, "Number of images to process in this experiment.");
+DEFINE_string(timing_output_path, "",
+              "File in which to save the timing results.");
+
+DECLARE_bool(alsologtostderr);
+
+using namespace nvblox;
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+
+  std::string base_path;
+  if (argc < 2) {
+    LOG(FATAL) << "Must pass the path to the 3DMatch dataset root directory.";
+    return 0;
+  } else {
+    LOG(INFO) << "Loading data from: " << argv[1];
+    base_path = argv[1];
+  }
+
+  CHECK(FLAGS_single_thread || FLAGS_multi_thread)
+      << "Must select either single or multi-threaded.";
+  if (FLAGS_multi_thread) {
+    CHECK_GT(FLAGS_num_threads, 0)
+        << "Please specify the number of threads to run with.";
+  }
+
+  std::string esdf_output_path = "./3dmatch_esdf_pointcloud.ply";
+  std::string mesh_output_path = "./3dmatch_mesh.ply";
+  experiments::Fuse3DMatch fuser(base_path, FLAGS_timing_output_path,
+                                 mesh_output_path, esdf_output_path);
+  if (FLAGS_num_images > 0) {
+    fuser.num_frames_to_integrate_ = FLAGS_num_images;
+  }
+
+  // Replacing the image loader with the one we are asked to test.
+ if (FLAGS_single_thread) { + fuser.depth_image_loader_ = datasets::threedmatch::createDepthImageLoader( + base_path, fuser.sequence_num_); + fuser.color_image_loader_ = datasets::threedmatch::createColorImageLoader( + base_path, fuser.sequence_num_); + } else { + fuser.depth_image_loader_ = + datasets::threedmatch::createMultithreadedDepthImageLoader( + base_path, fuser.sequence_num_, FLAGS_num_threads); + fuser.color_image_loader_ = + datasets::threedmatch::createMultithreadedColorImageLoader( + base_path, fuser.sequence_num_, FLAGS_num_threads); + } + + warmupCuda(); + fuser.run(); + + return 0; +} diff --git a/nvblox/experiments/experiments/threaded_image_loading/plot_comparison.py b/nvblox/experiments/experiments/threaded_image_loading/plot_comparison.py new file mode 100644 index 00000000..8b632d83 --- /dev/null +++ b/nvblox/experiments/experiments/threaded_image_loading/plot_comparison.py @@ -0,0 +1,51 @@ +from nvblox.experiments.timing import get_timings_as_dataframe +import os +import argparse +import glob + +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import rcParams +rcParams.update({'figure.autolayout': True}) + + +def plot_timings(timing_root_dir: str) -> None: + + print("Loading result files at: " + timing_root_dir) + files = glob.glob(os.path.join(timing_root_dir, '*timings.txt')) + files.sort() + files.insert(0, files.pop()) + + total_times = [] + labels = [] + for filepath in files: + print("Loading a file at: " + filepath) + timing_dataframe = get_timings_as_dataframe(filepath) + total_time_series = timing_dataframe['total_time'] + total_time = total_time_series.at['3dmatch/file_loading'] + total_times.append(total_time) + labels.append(filepath.split('/')[-1].split('.')[0]) + + # Plot + fig, ax = plt.subplots() + x = np.arange(len(total_times)) + ax.bar(x, total_times) + ax.set_ylabel("Load time (s)") + ax.set_xlabel("Test") + ax.set_xticks(x) + ax.set_xticklabels(labels, rotation='vertical') + + fig_path = os.path.join(timing_root_dir, 'timings.png') + plt.savefig(fig_path) + + plt.show() + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description="Plot branch timing comparisons.") + parser.add_argument("timing_root_dir", metavar="timing_root_dir", type=str, + help="The directory containing the timing results.") + args = parser.parse_args() + plot_timings(args.timing_root_dir) diff --git a/nvblox/experiments/experiments/threaded_image_loading/run.py b/nvblox/experiments/experiments/threaded_image_loading/run.py new file mode 100644 index 00000000..96340de0 --- /dev/null +++ b/nvblox/experiments/experiments/threaded_image_loading/run.py @@ -0,0 +1,68 @@ +import os +import subprocess +import argparse +from enum import Enum + +from plot_comparison import plot_timings + + +class LoadType(Enum): + kSingle = 1 + kMulti = 2 + + +def run_single(dataset_base_path: str, timing_output_path: str, load_type: LoadType, num_threads: int = -1, num_images: int = -1): + # Launch the 3DMatch reconstruction + experiment_executable_name = 'threaded_image_loading' + if load_type == LoadType.kSingle: + memory_flag = '--single_thread' + else: + memory_flag = '--multi_thread' + args_string = dataset_base_path + args_string += ' ' + memory_flag + args_string += ' --timing_output_path ' + timing_output_path + if load_type == LoadType.kMulti: + args_string += ' --num_threads ' + str(num_threads) + print(f"num_images: {num_images}") + if num_images > 0: + args_string += ' --num_images ' + str(num_images) + print("args_string: " + args_string) + 
subprocess.call('./' + experiment_executable_name +
+                    ' ' + args_string, shell=True)
+
+
+def run_experiment(dataset_base_path: str, number_of_images: int):
+
+    print(f"number_of_images: {number_of_images}")
+
+    # Create the timings output directory
+    this_directory = os.getcwd()
+    output_dir = os.path.join(this_directory, 'output')
+    if not os.path.isdir(output_dir):
+        os.mkdir(output_dir)
+
+    output_path = os.path.join(output_dir, "single_threaded_timings.txt")
+    run_single(dataset_base_path, output_path,
+               LoadType.kSingle, num_images=number_of_images)
+
+    thread_nums = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
+    for num_threads in thread_nums:
+        output_path = os.path.join(
+            output_dir, f"multi_threaded_{num_threads:02}_timings.txt")
+        run_single(dataset_base_path, output_path,
+                   LoadType.kMulti, num_threads=num_threads,
+                   num_images=number_of_images)
+
+    plot_timings(output_dir)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description="Run an experiment.")
+    parser.add_argument("dataset_base_path", metavar="dataset_base_path", type=str,
+                        help="Path to the 3DMatch dataset root directory.")
+    parser.add_argument("--number_of_images", metavar="number_of_images", type=int, default=200,
+                        help="Number of images to load.")
+    args = parser.parse_args()
+    if args.dataset_base_path:
+        run_experiment(args.dataset_base_path,
+                       args.number_of_images)
diff --git a/nvblox/experiments/experiments/unified_vs_device_memory/CMakeLists.txt b/nvblox/experiments/experiments/unified_vs_device_memory/CMakeLists.txt
new file mode 100644
index 00000000..95b03945
--- /dev/null
+++ b/nvblox/experiments/experiments/unified_vs_device_memory/CMakeLists.txt
@@ -0,0 +1,4 @@
+add_executable(unified_vs_device_memory main.cpp)
+target_link_libraries(unified_vs_device_memory nvblox_experiments_common)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/run.py" "${CMAKE_CURRENT_BINARY_DIR}/" unified_vs_device_memory)
+makeLink("${CMAKE_CURRENT_SOURCE_DIR}/analyze_results.py" "${CMAKE_CURRENT_BINARY_DIR}/" unified_vs_device_memory)
diff --git a/nvblox/experiments/experiments/unified_vs_device_memory/analyze_results.py b/nvblox/experiments/experiments/unified_vs_device_memory/analyze_results.py
new file mode 100644
index 00000000..6bbcee11
--- /dev/null
+++ b/nvblox/experiments/experiments/unified_vs_device_memory/analyze_results.py
@@ -0,0 +1,55 @@
+import glob
+
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+rcParams.update({'figure.autolayout': True})
+
+from nvblox.experiments.timing import get_timings_as_dataframe
+
+
+def get_relevant_timings(filepath: str):
+    timings = get_timings_as_dataframe(filepath)
+    timings = timings.drop(labels="3dmatch/time_per_frame")
+    timings = timings.drop(labels="3dmatch/file_loading")
+    return timings["total_time"]
+
+
+def main():
+    # Load the data
+    input_root = "./output"
+
+    # Averaging the experiment runs
+    series = []
+    device_timing_files = glob.glob(input_root + "/device_*.txt")
+    for f in device_timing_files:
+        series.append(get_relevant_timings(f))
+    device_total_times = pd.concat(series, axis=1)
+    device_total_times_mean = device_total_times.mean(axis=1)
+    device_total_times_std = device_total_times.std(axis=1)
+
+    series = []
+    unified_timing_files = glob.glob(input_root + "/unified_*.txt")
+    for f in unified_timing_files:
+        series.append(get_relevant_timings(f))
+    unified_total_times = pd.concat(series, axis=1)
+    unified_total_times_mean = unified_total_times.mean(axis=1)
+    unified_total_times_std = unified_total_times.std(axis=1)
+
+    # Plot
+    fig, ax = plt.subplots()
+    bar_width = 0.35
+    x = np.arange(len(device_total_times_mean))
+    x_device = x - bar_width / 2.0
+    x_unified = x + bar_width / 2.0
+    ax.bar(x_device, device_total_times_mean, bar_width,
+           yerr=device_total_times_std, label='device')
+    ax.bar(x_unified, unified_total_times_mean, bar_width,
+           yerr=unified_total_times_std, label='unified')
+    ax.set_xticks(x)
+    ax.set_xticklabels(device_total_times_mean.index, rotation='vertical')
+    ax.legend()
+    plt.show()
+
+
+if __name__ == '__main__':
+    main()
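The experiment defined next compares device-only against unified (managed) memory. For context, not from the patch: the two allocation kinds differ only in how the buffer is allocated and who can touch it, as sketched here (kBytes is an assumed illustrative size):

    #include <cuda_runtime.h>

    void allocateBothKinds() {
      constexpr size_t kBytes = 1 << 20;  // Assumed size, for illustration.
      float* device_only = nullptr;
      cudaMalloc(&device_only, kBytes);  // GPU-only; host access needs cudaMemcpy.
      float* unified = nullptr;
      cudaMallocManaged(&unified, kBytes);  // One pointer; pages migrate on demand.
      unified[0] = 1.0f;  // Host writes directly; the driver migrates the page.
      cudaFree(device_only);
      cudaFree(unified);
    }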
diff --git a/nvblox/experiments/experiments/unified_vs_device_memory/main.cpp b/nvblox/experiments/experiments/unified_vs_device_memory/main.cpp
new file mode 100644
index 00000000..e8aed8c5
--- /dev/null
+++ b/nvblox/experiments/experiments/unified_vs_device_memory/main.cpp
@@ -0,0 +1,80 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <iostream>
+#include <string>
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/core/layer.h"
+
+#include "nvblox/experiments/common/fuse_3dmatch.h"
+
+DEFINE_bool(unified_memory, false, "Run the test using unified memory.");
+DEFINE_bool(device_memory, false, "Run the test using device memory.");
+DEFINE_string(timing_output_path, "./3dmatch_timings.txt",
+              "File in which to save the timing results.");
+
+DECLARE_bool(alsologtostderr);
+
+using namespace nvblox;
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+
+  if ((!FLAGS_unified_memory && !FLAGS_device_memory) ||
+      (FLAGS_unified_memory && FLAGS_device_memory)) {
+    std::cout << "Must select either --unified_memory or --device_memory\n";
+    return 0;
+  }
+
+  std::string dataset_base_path;
+  if (argc < 2) {
+    // Try out running on the test datasets.
+    dataset_base_path = "../tests/data/3dmatch";
+    std::cout << "No 3DMatch path specified; defaulting to: "
+              << dataset_base_path << "\n";
+  } else {
+    dataset_base_path = argv[1];
+    std::cout << "Loading 3DMatch files from " << dataset_base_path << ".\n";
+  }
+
+  std::string esdf_output_path = "./3dmatch_esdf_pointcloud.ply";
+  std::string mesh_output_path = "./3dmatch_mesh.ply";
+  experiments::Fuse3DMatch fuser(dataset_base_path, FLAGS_timing_output_path,
+                                 mesh_output_path, esdf_output_path);
+
+  // Replacing the TSDF and color layers with layers stored in the appropriate
+  // memory type.
+  if (FLAGS_device_memory) {
+    fuser.mapper().layers() =
+        LayerCake::create<TsdfLayer, ColorLayer, EsdfLayer, MeshLayer>(
+            fuser.voxel_size_m_, MemoryType::kDevice);
+  } else {
+    fuser.mapper().layers() =
+        LayerCake::create<TsdfLayer, ColorLayer, EsdfLayer, MeshLayer>(
+            fuser.voxel_size_m_, MemoryType::kUnified, MemoryType::kUnified,
+            MemoryType::kDevice, MemoryType::kDevice);
+  }
+
+  warmupCuda();
+  fuser.run();
+
+  return 0;
+}
\ No newline at end of file
diff --git a/nvblox/experiments/experiments/unified_vs_device_memory/run.py b/nvblox/experiments/experiments/unified_vs_device_memory/run.py
new file mode 100755
index 00000000..3e928894
--- /dev/null
+++ b/nvblox/experiments/experiments/unified_vs_device_memory/run.py
@@ -0,0 +1,56 @@
+import os
+import subprocess
+import argparse
+from enum import Enum
+
+
+class MemoryType(Enum):
+    kDevice = 1
+    kUnified = 2
+
+
+def run_single(dataset_base_path: str, timing_output_path: str, memory_type: MemoryType):
+    # Launch the 3DMatch reconstruction
+    experiment_executable_name = 'unified_vs_device_memory'
+    if memory_type == MemoryType.kUnified:
+        memory_flag = '--unified_memory'
+    else:
+        memory_flag = '--device_memory'
+    args_string = memory_flag + " --timing_output_path " + \
+        timing_output_path + ' ' + dataset_base_path
+    print("args_string: " + args_string)
+    subprocess.call('./' + experiment_executable_name +
+                    ' ' + args_string, shell=True)
+
+
+def run_experiment(dataset_base_path: str, number_of_runs: int, warm_up_run: bool):
+
+    # Create the timings output directory
+    if not os.path.isdir("output"):
+        os.mkdir("output")
+
+    # Run a bunch of times
+    run_indices = list(range(number_of_runs))
+    if warm_up_run:
+        run_indices.insert(0, 0)
+    for run_idx in run_indices:
+        print("Run: " + str(run_idx) + ", Memory type: kDevice")
+        timing_output_path = "./output/device_" + str(run_idx) + ".txt"
+        run_single(dataset_base_path, timing_output_path, MemoryType.kDevice)
+        print("Run: " + str(run_idx) + ", Memory type: kUnified")
+        timing_output_path = "./output/unified_" + str(run_idx) + ".txt"
+        run_single(dataset_base_path, timing_output_path, MemoryType.kUnified)
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(description="Run an experiment.")
+    parser.add_argument("dataset_base_path", metavar="dataset_base_path", type=str,
+                        help="Path to the 3DMatch dataset root directory.")
+    parser.add_argument("--number_of_runs", metavar="number_of_runs", type=int, default=10,
+                        help="Number of runs to do in this experiment.")
+    args = parser.parse_args()
+    warm_up_run = True
+    if args.dataset_base_path:
+        run_experiment(args.dataset_base_path,
+                       args.number_of_runs, warm_up_run)
diff --git a/nvblox/experiments/experiments/vector_copies/CMakeLists.txt b/nvblox/experiments/experiments/vector_copies/CMakeLists.txt
new file mode 100644
index 00000000..49e5ef46
--- /dev/null
+++ b/nvblox/experiments/experiments/vector_copies/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_library(vector_copies_cuda vector_copies.cu)
+target_include_directories(vector_copies_cuda PUBLIC "../../../tests/include")
"../../../tests/include") +target_link_libraries(vector_copies_cuda nvblox_experiments_common nvblox_test_utils) + +add_executable(vector_copies main.cpp) +target_link_libraries(vector_copies vector_copies_cuda nvblox_experiments_common) + +makeLink("${CMAKE_CURRENT_SOURCE_DIR}/plot_timings.py" "${CMAKE_CURRENT_BINARY_DIR}/" vector_copies) diff --git a/nvblox/experiments/experiments/vector_copies/main.cpp b/nvblox/experiments/experiments/vector_copies/main.cpp new file mode 100644 index 00000000..e0f11f7c --- /dev/null +++ b/nvblox/experiments/experiments/vector_copies/main.cpp @@ -0,0 +1,55 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#include + +#include +#include + +#include "nvblox/core/cuda/warmup.h" +#include "nvblox/utils/timing.h" + +#include "vector_copies.h" + +DECLARE_bool(alsologtostderr); + +using namespace nvblox; + +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + FLAGS_alsologtostderr = true; + google::InstallFailureSignalHandler(); + + warmupCuda(); + // Do a warmup run and then reset the times. + runVectorCopyExperiments(1); + nvblox::timing::Timing::Reset(); + + runVectorCopyExperiments(1000); + + std::cout << nvblox::timing::Timing::Print(); + + // Optionally output to file as well. + if (argc >= 2) { + std::ofstream outfile(argv[1]); + if (outfile.is_open()) { + outfile << nvblox::timing::Timing::Print(); + outfile.close(); + } + } + + return 0; +} \ No newline at end of file diff --git a/nvblox/experiments/experiments/vector_copies/plot_timings.py b/nvblox/experiments/experiments/vector_copies/plot_timings.py new file mode 100644 index 00000000..6f0297b2 --- /dev/null +++ b/nvblox/experiments/experiments/vector_copies/plot_timings.py @@ -0,0 +1,75 @@ +#!/usr/bin/python3 + +from nvblox.experiments.timing import get_timings_as_dataframe +import os +import argparse +import glob +import pandas as pd +from typing import Tuple +import numpy as np +import matplotlib.pyplot as plt +from matplotlib import rcParams +rcParams.update({'figure.autolayout': True}) + + +def get_total_times(filepath: str) -> pd.Series: + timings = get_timings_as_dataframe(filepath) + return timings["total_time"] + + +def plot_timings(timing_root_dir: str, save_fig: bool = True) -> None: + print("Looking for results in: " + timing_root_dir) + + # Check that the profile results are in the dir + result_files = [name for name in os.listdir(timing_root_dir) + if os.path.isfile(os.path.join(timing_root_dir, name))] + + # Load every text file in the results dir. 
diff --git a/nvblox/experiments/experiments/vector_copies/plot_timings.py b/nvblox/experiments/experiments/vector_copies/plot_timings.py
new file mode 100644
index 00000000..6f0297b2
--- /dev/null
+++ b/nvblox/experiments/experiments/vector_copies/plot_timings.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python3
+
+from nvblox.experiments.timing import get_timings_as_dataframe
+import os
+import argparse
+import glob
+import pandas as pd
+from typing import Tuple
+import numpy as np
+import matplotlib.pyplot as plt
+from matplotlib import rcParams
+rcParams.update({'figure.autolayout': True})
+
+
+def get_total_times(filepath: str) -> pd.Series:
+    timings = get_timings_as_dataframe(filepath)
+    return timings["total_time"]
+
+
+def plot_timings(timing_root_dir: str, save_fig: bool = True) -> None:
+    print("Looking for results in: " + timing_root_dir)
+
+    # Check that the profile results are in the dir
+    result_files = [name for name in os.listdir(timing_root_dir)
+                    if os.path.isfile(os.path.join(timing_root_dir, name))]
+
+    # Load every text file in the results dir.
+    df = pd.DataFrame()
+    for result_name in result_files:
+        results_path = os.path.join(timing_root_dir, result_name)
+        if os.path.splitext(result_name)[1] != ".txt":
+            continue
+        timings = get_total_times(results_path)
+        this_df = timings.to_frame(name=os.path.splitext(result_name)[0])
+        if df.empty:
+            df = this_df
+        else:
+            df = pd.merge(df, this_df, left_index=True, right_index=True)
+
+    print(df)
+
+    print(df.loc["exp/initialize"])
+
+    # Organize meaningful columns
+    row_names = ["host to device", "device to device",
+                 "device to host", "kernel"]
+    df.loc[row_names[0]] = df.loc["exp2a/cuda/host_to_device"]
+    df.loc[row_names[1]] = df.loc["exp2b/cuda/device_to_device"]
+    df.loc[row_names[2]] = df.loc["exp2c/cuda/device_to_host"]
+    df.loc[row_names[3]] = df.loc["exp2a/kernel"]
+
+    # Plot
+    ax = df.loc[row_names].plot.bar(rot=0)
+    ax.set_ylabel('Total Time (s)')
+    for con in ax.containers:
+        ax.bar_label(con, fmt='%.2f')
+    plt.title("Vector Copy Times Across Platforms")
+
+    # Save the plot in the root folder of the timings.
+    if save_fig:
+        image_path = os.path.join(timing_root_dir, 'timings.png')
+        print("Saving figure to disk as: " + image_path)
+        plt.savefig(image_path)
+
+    plt.show()
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser(
+        description="Plot vector copy timing comparisons.")
+    parser.add_argument("timing_root_dir", metavar="timing_root_dir", type=str,
+                        help="The directory containing the timing results.")
+    args = parser.parse_args()
+    plot_timings(args.timing_root_dir)
diff --git a/nvblox/experiments/experiments/vector_copies/vector_copies.cu b/nvblox/experiments/experiments/vector_copies/vector_copies.cu
new file mode 100644
index 00000000..060c48f7
--- /dev/null
+++ b/nvblox/experiments/experiments/vector_copies/vector_copies.cu
@@ -0,0 +1,155 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <cstring>
+#include <vector>
+
+#include <thrust/copy.h>
+#include <thrust/device_vector.h>
+
+#include "nvblox/core/unified_vector.h"
+#include "nvblox/tests/increment_on_gpu.h"
+#include "nvblox/utils/timing.h"
+
+#include "vector_copies.h"
+
+using namespace nvblox;
+
+void runVectorCopyExperiments(int num_runs) {
+  constexpr size_t kVectorSize = 1000000;
+
+  for (int i = 0; i < num_runs; i++) {
+    // Experiment 1a: copy vector host-to-device with thrust.
+    timing::Timer exp_initialize_timer("exp/initialize");
+    std::vector<int> start_vec(kVectorSize, 5);
+    exp_initialize_timer.Stop();
+
+    {
+      timing::Timer exp1a_timer("exp1a/thrust/host_to_device");
+      thrust::device_vector<int> vec_1a(kVectorSize);
+      thrust::copy(start_vec.begin(), start_vec.end(), vec_1a.begin());
+      exp1a_timer.Stop();
+
+      timing::Timer exp1a_kernel_timer("exp1a/kernel");
+      test_utils::incrementOnGPU(kVectorSize,
+                                 thrust::raw_pointer_cast(vec_1a.data()));
+      exp1a_kernel_timer.Stop();
+
+      // Experiment 1b: copy vector device-to-device with thrust.
+      timing::Timer exp1b_timer("exp1b/thrust/device_to_device");
+      thrust::device_vector<int> vec_1b(kVectorSize);
+      thrust::copy(vec_1a.begin(), vec_1a.end(), vec_1b.begin());
+      exp1b_timer.Stop();
+
+      timing::Timer exp1b_kernel_timer("exp1b/kernel");
+      test_utils::incrementOnGPU(kVectorSize,
+                                 thrust::raw_pointer_cast(vec_1b.data()));
+      exp1b_kernel_timer.Stop();
+
+      // Experiment 1c: copy vector device-to-host with thrust.
+      timing::Timer exp1c_timer("exp1c/thrust/device_to_host");
+      std::vector<int> vec_1c(kVectorSize);
+      thrust::copy(vec_1a.begin(), vec_1a.end(), vec_1c.begin());
+      exp1c_timer.Stop();
+    }
+
+    {
+      // Experiment 2a: copy vector host-to-device with cuda memcpy.
+      timing::Timer exp2a_timer("exp2a/cuda/host_to_device");
+      thrust::device_vector<int> vec_2a(kVectorSize);
+      cudaMemcpy(thrust::raw_pointer_cast(vec_2a.data()), start_vec.data(),
+                 kVectorSize * sizeof(int), cudaMemcpyHostToDevice);
+      exp2a_timer.Stop();
+
+      timing::Timer exp2a_kernel_timer("exp2a/kernel");
+      test_utils::incrementOnGPU(kVectorSize,
+                                 thrust::raw_pointer_cast(vec_2a.data()));
+      exp2a_kernel_timer.Stop();
+
+      // Experiment 2b: copy vector device-to-device with cuda memcpy.
+      timing::Timer exp2b_timer("exp2b/cuda/device_to_device");
+      thrust::device_vector<int> vec_2b(kVectorSize);
+      cudaMemcpy(thrust::raw_pointer_cast(vec_2b.data()),
+                 thrust::raw_pointer_cast(vec_2a.data()),
+                 kVectorSize * sizeof(int), cudaMemcpyDeviceToDevice);
+      exp2b_timer.Stop();
+
+      timing::Timer exp2b_kernel_timer("exp2b/kernel");
+      test_utils::incrementOnGPU(kVectorSize,
+                                 thrust::raw_pointer_cast(vec_2b.data()));
+      exp2b_kernel_timer.Stop();
+
+      // Experiment 2c: copy vector device-to-host with cuda memcpy.
+      timing::Timer exp2c_timer("exp2c/cuda/device_to_host");
+      std::vector<int> vec_2c(kVectorSize);
+      cudaMemcpy(vec_2c.data(), thrust::raw_pointer_cast(vec_2b.data()),
+                 kVectorSize * sizeof(int), cudaMemcpyDeviceToHost);
+      exp2c_timer.Stop();
+    }
+
+    {
+      // Experiment 3: create a unified vector and use it in a kernel.
+      timing::Timer exp3_initialize_timer("exp3/unified/initialize");
+      unified_vector<int> unified_vec(kVectorSize, 5);
+      exp3_initialize_timer.Stop();
+
+      timing::Timer exp3_kernel_timer("exp3/unified/kernel");
+      test_utils::incrementOnGPU(kVectorSize, unified_vec.data());
+      exp3_kernel_timer.Stop();
+
+      timing::Timer exp3_increment_timer("exp3/unified/cpu_increment");
+      for (size_t i = 0; i < kVectorSize; i++) {
+        unified_vec[i]++;
+      }
+      exp3_increment_timer.Stop();
+
+      timing::Timer exp3_host_increment_timer("exp3/host/cpu_increment");
+      for (size_t i = 0; i < kVectorSize; i++) {
+        start_vec[i]++;
+      }
+      exp3_host_increment_timer.Stop();
+    }
+    // Experiment 4: is it faster to create an array on host, copy its value to
+    // GPU, delete it or use unified memory?
+    // 4a: baseline: device copy.
+    {
+      timing::Timer exp4a("exp4a/device_mem");
+      std::vector<int> exp4a_vec(kVectorSize, 1);
+      int* exp4a_dev;
+      cudaMalloc(&exp4a_dev, kVectorSize * sizeof(int));
+      cudaMemcpy(exp4a_dev, exp4a_vec.data(), kVectorSize * sizeof(int),
+                 cudaMemcpyHostToDevice);
+      test_utils::incrementOnGPU(kVectorSize, exp4a_dev);
+      cudaFree(exp4a_dev);
+      exp4a.Stop();
+    }
+
+    // 4b: unified without anything special, using memset to initialize.
+    {
+      timing::Timer exp4b("exp4b/unified_mem");
+      unified_vector<int> unified_vec(kVectorSize);
+      memset(unified_vec.data(), 1, kVectorSize * sizeof(int));
+      test_utils::incrementOnGPU(kVectorSize, unified_vec.data());
+      exp4b.Stop();
+    }
+
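+    // Note that memset works on bytes: the call above writes 0x01 into every
+    // byte, so each int becomes 0x01010101 (16843009) rather than 1. The
+    // timing is unaffected, but a value-identical initialization would be:
+    //   for (size_t j = 0; j < kVectorSize; j++) { unified_vec[j] = 1; }
+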
+    // 4c: unified vector using the unified vector code (on CPU) to initialize.
+    {
+      timing::Timer exp4c("exp4c/unified_cpu_set");
+      unified_vector<int> unified_vec(kVectorSize, 1);
+      // unified_vec.toGPU();
+      test_utils::incrementOnGPU(kVectorSize, unified_vec.data());
+      exp4c.Stop();
+    }
+  }
+}
\ No newline at end of file
diff --git a/nvblox/experiments/experiments/vector_copies/vector_copies.h b/nvblox/experiments/experiments/vector_copies/vector_copies.h
new file mode 100644
index 00000000..dc425b37
--- /dev/null
+++ b/nvblox/experiments/experiments/vector_copies/vector_copies.h
@@ -0,0 +1,18 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+void runVectorCopyExperiments(int num_runs);
\ No newline at end of file
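Not from the patch: the property that experiments 3 and 4b/4c lean on is that `unified_vector` memory is addressable from both host and device, so the same pointer feeds CPU loops and kernels with no explicit transfer. A minimal sketch using only names that appear in the experiment above:

    #include "nvblox/core/unified_vector.h"
    #include "nvblox/tests/increment_on_gpu.h"

    using namespace nvblox;

    void hostAndDeviceTouchTheSameMemory() {
      unified_vector<int> vec(16, 0);
      vec[0] = 41;  // Host write through operator[].
      // The same pointer is valid in kernels; no cudaMemcpy required.
      test_utils::incrementOnGPU(vec.size(), vec.data());
      // After the kernel completes, vec[0] == 42 on the host.
    }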
diff --git a/nvblox/experiments/include/nvblox/experiments/common/fuse_3dmatch.h b/nvblox/experiments/include/nvblox/experiments/common/fuse_3dmatch.h
new file mode 100644
index 00000000..a5b313be
--- /dev/null
+++ b/nvblox/experiments/include/nvblox/experiments/common/fuse_3dmatch.h
@@ -0,0 +1,113 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include <limits>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/voxels.h"
+#include "nvblox/core/layer_cake.h"
+#include "nvblox/core/mapper.h"
+#include "nvblox/datasets/image_loader.h"
+#include "nvblox/gpu_hash/gpu_layer_view.h"
+#include "nvblox/integrators/esdf_integrator.h"
+#include "nvblox/integrators/projective_color_integrator.h"
+#include "nvblox/integrators/projective_tsdf_integrator.h"
+#include "nvblox/mesh/mesh_block.h"
+#include "nvblox/mesh/mesh_integrator.h"
+#include "nvblox/ray_tracing/sphere_tracer.h"
+
+namespace nvblox {
+namespace experiments {
+
+class Fuse3DMatch {
+ public:
+  Fuse3DMatch() = default;
+  Fuse3DMatch(const std::string& base_path,
+              const std::string& timing_output_path = std::string(),
+              const std::string& mesh_output_path = std::string(),
+              const std::string& esdf_output_path = std::string());
+
+  // Runs an experiment
+  int run();
+
+  // Set the base path pointing to the dataset.
+  void setBasePath(const std::string& base_path);
+  // Set the sequence number within the 3DMatch dataset.
+  void setSequenceNumber(int sequence_num);
+
+  // Set various settings.
+  void setVoxelSize(float voxel_size);
+  void setTsdfFrameSubsampling(int subsample);
+  void setColorFrameSubsampling(int subsample);
+  void setMeshFrameSubsampling(int subsample);
+  void setEsdfFrameSubsampling(int subsample);
+
+  // Integrate frames.
+  bool integrateFrame(const int frame_number);
+  bool integrateFrames();
+
+  // Initialize the image loaders based on the current base_path and
+  // sequence_num.
+  void initializeImageLoaders(bool multithreaded = false);
+
+  // Output a pointcloud ESDF as PLY file.
+  bool outputPointcloudPly();
+  // Output a file with the mesh.
+  bool outputMeshPly();
+  // Output timings to a file.
+  bool outputTimingsToFile();
+
+  // Factory: From command line args.
+  static Fuse3DMatch createFromCommandLineArgs(int argc, char* argv[]);
+
+  // Get the mapper (useful for experiments where we modify mapper settings).
+  RgbdMapper& mapper() { return mapper_; }
+
+  // Dataset settings.
+  std::string base_path_;
+  int sequence_num_ = 1;
+  int num_frames_to_integrate_ = std::numeric_limits<int>::max();
+  std::unique_ptr<datasets::ImageLoader<DepthImage>> depth_image_loader_;
+  std::unique_ptr<datasets::ImageLoader<ColorImage>> color_image_loader_;
+
+  // Params
+  float voxel_size_m_ = 0.05;
+  int tsdf_frame_subsampling_ = 1;
+  int color_frame_subsampling_ = 1;
+  int mesh_frame_subsampling_ = 1;
+  int esdf_frame_subsampling_ = 1;
+
+  // ESDF slice params
+  float z_min_ = 0.5f;
+  float z_max_ = 1.0f;
+  float z_slice_ = 0.75f;
+
+  // Mapper - Contains map layers and integrators
+  RgbdMapper mapper_;
+
+  // Output paths
+  std::string timing_output_path_;
+  std::string esdf_output_path_;
+  std::string mesh_output_path_;
+};
+
+}  // namespace experiments
+}  // namespace nvblox
diff --git a/nvblox/experiments/include/nvblox/experiments/integrators/cuda/depth_frame_texture.cuh b/nvblox/experiments/include/nvblox/experiments/integrators/cuda/depth_frame_texture.cuh
new file mode 100644
index 00000000..97384d58
--- /dev/null
+++ b/nvblox/experiments/include/nvblox/experiments/integrators/cuda/depth_frame_texture.cuh
@@ -0,0 +1,38 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include "nvblox/core/image.h"
+
+namespace nvblox {
+namespace experiments {
+
+class DepthImageTexture {
+ public:
+  DepthImageTexture(const DepthImage& depth_frame,
+                    cudaStream_t transfer_stream = 0);
+  ~DepthImageTexture();
+
+  cudaTextureObject_t texture_object() const { return depth_texture_; }
+
+ private:
+  cudaArray_t depth_array_;
+  cudaTextureObject_t depth_texture_;
+};
+
+}  // namespace experiments
+}  // namespace nvblox
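Not from the patch: the typical call shape for the wrapper above. The constructor uploads the depth image into a cudaArray on `transfer_stream` and builds a texture object over it; the handle is then passed into kernels by value. The kernel here is hypothetical:

    #include "nvblox/experiments/integrators/cuda/depth_frame_texture.cuh"

    __global__ void readPixel(cudaTextureObject_t depth_tex, float u, float v,
                              float* out) {
      *out = tex2D<float>(depth_tex, u, v);
    }

    void launchOnDepthImage(const nvblox::DepthImage& depth_frame,
                            cudaStream_t stream, float* out_device) {
      nvblox::experiments::DepthImageTexture texture(depth_frame, stream);
      // The texture object is a plain handle; pass it to kernels by value.
      readPixel<<<1, 1, 0, stream>>>(texture.texture_object(), 0.5f, 0.5f,
                                     out_device);
      cudaStreamSynchronize(stream);  // The texture must outlive the kernel.
    }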
diff --git a/nvblox/experiments/include/nvblox/experiments/integrators/cuda/experimental_integrator_input_frames.cuh b/nvblox/experiments/include/nvblox/experiments/integrators/cuda/experimental_integrator_input_frames.cuh
new file mode 100644
index 00000000..edad8d7a
--- /dev/null
+++ b/nvblox/experiments/include/nvblox/experiments/integrators/cuda/experimental_integrator_input_frames.cuh
@@ -0,0 +1,91 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <vector>
+
+#include "nvblox/core/camera.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/image.h"
+#include "nvblox/core/types.h"
+
+#include "nvblox/experiments/integrators/cuda/depth_frame_texture.cuh"
+
+namespace nvblox {
+namespace experiments {
+
+class IntegratorInputFrameExperimentsBase {
+ public:
+  IntegratorInputFrameExperimentsBase(
+      const std::vector<Index3D>& block_indices, const DepthImage& depth_frame,
+      const Transform& T_L_C, const Camera& camera,
+      const float truncation_distance_m, const float max_weight,
+      TsdfLayer* layer_ptr, cudaStream_t stream);
+  virtual ~IntegratorInputFrameExperimentsBase();
+
+  // Params for this call
+  const float block_size;
+  const float truncation_distance_m;
+  const float max_weight;
+  const int num_blocks;
+
+  // Device-side inputs
+  Index3D* block_indices_device_ptr;
+  Camera* camera_device_ptr;
+  Eigen::Matrix3f* R_C_L_device_ptr;
+  Eigen::Vector3f* t_C_L_device_ptr;
+  VoxelBlock<TsdfVoxel>** block_device_ptrs;
+
+  // Host-side inputs
+  const Eigen::Matrix3f R_C_L;
+  const Eigen::Vector3f t_C_L;
+  std::vector<VoxelBlock<TsdfVoxel>*> block_ptrs;
+
+  // The stream on which host->device transfers are processed
+  cudaStream_t transfer_stream;
+};
+
+class IntegratorInputFrameExperimentsTexture
+    : public IntegratorInputFrameExperimentsBase {
+ public:
+  IntegratorInputFrameExperimentsTexture(
+      const std::vector<Index3D>& block_indices, const DepthImage& depth_frame,
+      const Transform& T_L_C, const Camera& camera,
+      const float truncation_distance_m, const float max_weight,
+      TsdfLayer* layer_ptr, cudaStream_t stream);
+  virtual ~IntegratorInputFrameExperimentsTexture() {}
+
+  // Texture
+  DepthImageTexture depth_texture;
+};
+
+class IntegratorInputFrameExperimentsGlobal
+    : public IntegratorInputFrameExperimentsBase {
+ public:
+  IntegratorInputFrameExperimentsGlobal(
+      const std::vector<Index3D>& block_indices, const DepthImage& depth_frame,
+      const Transform& T_L_C, const Camera& camera,
+      const float truncation_distance_m, const float max_weight,
+      TsdfLayer* layer_ptr, cudaStream_t stream);
+  virtual ~IntegratorInputFrameExperimentsGlobal() {}
+
+  // Depth frame
+  // Stored as a unified ptr to row-major float memory.
+  const float* depth_frame_unified_ptr;
+  const int depth_frame_rows;
+  const int depth_frame_cols;
+};
+
+}  // namespace experiments
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/experiments/include/nvblox/experiments/integrators/experimental_projective_tsdf_integrators.h b/nvblox/experiments/include/nvblox/experiments/integrators/experimental_projective_tsdf_integrators.h
new file mode 100644
index 00000000..d59078be
--- /dev/null
+++ b/nvblox/experiments/include/nvblox/experiments/integrators/experimental_projective_tsdf_integrators.h
@@ -0,0 +1,63 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/integrators/projective_tsdf_integrator.h" + +namespace nvblox { +namespace experiments { + +class ProjectiveTsdfIntegratorExperimentsBase + : public ProjectiveTsdfIntegrator { + public: + ProjectiveTsdfIntegratorExperimentsBase(); + virtual ~ProjectiveTsdfIntegratorExperimentsBase(); + + void finish() const; + + protected: +}; + +class ProjectiveTsdfIntegratorExperimentsTexture + : public ProjectiveTsdfIntegratorExperimentsBase { + public: + ProjectiveTsdfIntegratorExperimentsTexture() + : ProjectiveTsdfIntegratorExperimentsBase(){}; + virtual ~ProjectiveTsdfIntegratorExperimentsTexture(){}; + + protected: + void updateBlocks(const std::vector& block_indices, + const DepthImage& depth_frame, const Transform& T_L_C, + const Camera& camera, const float truncation_distance_m, + TsdfLayer* layer) override; +}; + +class ProjectiveTsdfIntegratorExperimentsGlobal + : public ProjectiveTsdfIntegratorExperimentsBase { + public: + ProjectiveTsdfIntegratorExperimentsGlobal() + : ProjectiveTsdfIntegratorExperimentsBase(){}; + virtual ~ProjectiveTsdfIntegratorExperimentsGlobal(){}; + + protected: + void updateBlocks(const std::vector& block_indices, + const DepthImage& depth_frame, const Transform& T_L_C, + const Camera& camera, const float truncation_distance_m, + TsdfLayer* layer) override; +}; + +} // namespace experiments +} // namespace nvblox diff --git a/nvblox/experiments/python/MANIFEST.in b/nvblox/experiments/python/MANIFEST.in new file mode 100644 index 00000000..853ce576 --- /dev/null +++ b/nvblox/experiments/python/MANIFEST.in @@ -0,0 +1,14 @@ +# Main library, including compiled binaries +graft nvblox_experiments + +# Misc +include README.md +include requirements.txt +include setup.py + +# Patterns to exclude from any directory +global-exclude *~ +global-exclude *.pyc +global-exclude *.pyo +global-exclude .git +global-exclude *__pycache__ \ No newline at end of file diff --git a/nvblox/experiments/python/README.md b/nvblox/experiments/python/README.md new file mode 100644 index 00000000..e0933a8a --- /dev/null +++ b/nvblox/experiments/python/README.md @@ -0,0 +1,2 @@ +## nvblox experiments python package +Small python package for helping run and interpret the results of nvblox experiments. 
diff --git a/nvblox/experiments/python/nvblox_experiments/__init__.py b/nvblox/experiments/python/nvblox_experiments/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nvblox/experiments/python/nvblox_experiments/threedmatch.py b/nvblox/experiments/python/nvblox_experiments/threedmatch.py new file mode 100644 index 00000000..acb79b21 --- /dev/null +++ b/nvblox/experiments/python/nvblox_experiments/threedmatch.py @@ -0,0 +1,37 @@ +import os +import subprocess +from typing import Dict + + +def run_single(executable_path: str, dataset_base_path: str, + timing_output_path: str = None, num_frames: int = None, + flags: Dict = None): + # Launch the 3DMatch reconstruction + args_string = dataset_base_path + if timing_output_path: + args_string += " --timing_output_path " + timing_output_path + if num_frames: + args_string += " --num_frames " + str(num_frames) + if flags: + for key, value in flags.items(): + args_string += " --" + key + " " + str(value) + + run_string = executable_path + ' ' + args_string + print("Running 3DMatch as:\n " + run_string) + subprocess.call(run_string, shell=True) + + +def run_multiple(num_runs: int, executable_path: str, dataset_base_path: str, + timing_output_dir: str = None, num_frames: int = None, + warmup_run: bool = True, flags: Dict = None): + run_indices = list(range(num_runs)) + if warmup_run: + run_indices.insert(0, 0) + for run_idx in run_indices: + print(f"Run: {run_idx}") + + timing_output_name = f"run_{run_idx}.txt" + timing_output_path = os.path.join(timing_output_dir, timing_output_name) + + run_single(executable_path, dataset_base_path, + timing_output_path, num_frames, flags) diff --git a/nvblox/experiments/python/nvblox_experiments/timing.py b/nvblox/experiments/python/nvblox_experiments/timing.py new file mode 100644 index 00000000..27555264 --- /dev/null +++ b/nvblox/experiments/python/nvblox_experiments/timing.py @@ -0,0 +1,29 @@ +import re +import pandas as pd + +def get_timings_as_dataframe(filepath: str): + names = [] + num_calls = [] + total_times = [] + means = [] + stds = [] + mins = [] + maxes = [] + with open(filepath, 'r') as f: + lines = f.readlines() + for line in lines[2:]: + entries = re.split('\s+|,', line) + entries = [entry.strip('()[]') for entry in entries] + + names.append(entries[0]) + num_calls.append(int(entries[1])) + total_times.append(float(entries[2])) + means.append(float(entries[3])) + stds.append(float(entries[5])) + mins.append(float(entries[6])) + maxes.append(float(entries[7])) + + d = {"num_calls": num_calls, "total_time": total_times, + "mean": means, "std": stds, "min": mins, "max": maxes} + timings = pd.DataFrame(d, index=names) + return timings diff --git a/nvblox/experiments/python/requirements.txt b/nvblox/experiments/python/requirements.txt new file mode 100644 index 00000000..1411a4a0 --- /dev/null +++ b/nvblox/experiments/python/requirements.txt @@ -0,0 +1 @@ +pandas \ No newline at end of file diff --git a/nvblox/experiments/python/setup.py b/nvblox/experiments/python/setup.py new file mode 100644 index 00000000..17623ac4 --- /dev/null +++ b/nvblox/experiments/python/setup.py @@ -0,0 +1,17 @@ +import setuptools + +with open('requirements.txt', 'r') as f: + install_requires = [line.strip() for line in f.readlines() if line] + +setuptools.setup( + name="nvblox_experiments", + version="0.0.0", + author="Helen Oleynikova + Alexander Millane", + author_email="holeynikova@nvidia.com + amilllane@nvidia.com", + description="nvblox_experiments: Helper module for nvblox experiments", + 
long_description=open('README.md').read(),
+    long_description_content_type='text/markdown',
+    install_requires=install_requires,
+    include_package_data=True,
+    packages=setuptools.find_packages()
+)
diff --git a/nvblox/experiments/src/common/fuse_3dmatch.cpp b/nvblox/experiments/src/common/fuse_3dmatch.cpp
new file mode 100644
index 00000000..0c61508e
--- /dev/null
+++ b/nvblox/experiments/src/common/fuse_3dmatch.cpp
@@ -0,0 +1,280 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/experiments/common/fuse_3dmatch.h"
+
+#include <fstream>
+#include <thread>
+
+#include <glog/logging.h>
+
+#include "nvblox/datasets/parse_3dmatch.h"
+#include "nvblox/io/mesh_io.h"
+#include "nvblox/io/ply_writer.h"
+#include "nvblox/io/pointcloud_io.h"
+#include "nvblox/utils/timing.h"
+
+namespace nvblox {
+namespace experiments {
+
+Fuse3DMatch::Fuse3DMatch(const std::string& base_path,
+                         const std::string& timing_output_path,
+                         const std::string& mesh_output_path,
+                         const std::string& esdf_output_path)
+    : base_path_(base_path),
+      timing_output_path_(timing_output_path),
+      mesh_output_path_(mesh_output_path),
+      esdf_output_path_(esdf_output_path),
+      mapper_(voxel_size_m_) {
+  initializeImageLoaders();
+  // Params
+  mapper_.mesh_integrator().min_weight() = 2.0f;
+  mapper_.color_integrator().max_integration_distance_m(5.0f);
+  mapper_.tsdf_integrator().max_integration_distance_m(5.0f);
+  mapper_.tsdf_integrator().frustum_calculator().raycast_subsampling_factor(4);
+  mapper_.esdf_integrator().max_distance_m() = 4.0f;
+  mapper_.esdf_integrator().min_weight() = 2.0f;
+}
+
+int Fuse3DMatch::run() {
+  LOG(INFO) << "Trying to integrate the first frame: ";
+  if (!integrateFrames()) {
+    LOG(FATAL)
+        << "Failed to integrate first frame.
Please check the file path."; + return 1; + } + + if (!mesh_output_path_.empty()) { + LOG(INFO) << "Outputting mesh ply file to " << mesh_output_path_; + outputMeshPly(); + } + + if (!esdf_output_path_.empty()) { + LOG(INFO) << "Outputting ESDF pointcloud ply file to " << esdf_output_path_; + outputPointcloudPly(); + } + + LOG(INFO) << nvblox::timing::Timing::Print(); + + LOG(INFO) << "Writing timings to file."; + outputTimingsToFile(); + + return 0; +} +void Fuse3DMatch::setVoxelSize(float voxel_size) { voxel_size_m_ = voxel_size; } +void Fuse3DMatch::setTsdfFrameSubsampling(int subsample) { + tsdf_frame_subsampling_ = subsample; +} +void Fuse3DMatch::setColorFrameSubsampling(int subsample) { + color_frame_subsampling_ = subsample; +} +void Fuse3DMatch::setMeshFrameSubsampling(int subsample) { + mesh_frame_subsampling_ = subsample; +} +void Fuse3DMatch::setEsdfFrameSubsampling(int subsample) { + esdf_frame_subsampling_ = subsample; +} + +void Fuse3DMatch::setBasePath(const std::string& base_path) { + base_path_ = base_path; + initializeImageLoaders(); +} + +void Fuse3DMatch::setSequenceNumber(int sequence_num) { + sequence_num_ = sequence_num; + initializeImageLoaders(); +} + +void Fuse3DMatch::initializeImageLoaders(bool multithreaded) { + // NOTE(alexmillane): On my desktop the performance of threaded image loading + // seems to saturate at around 6 threads. On machines with less cores (I have + // 12 physical) presumably the saturation point will be less. + if (multithreaded) { + constexpr unsigned int kMaxLoadingThreads = 6; + unsigned int num_loading_threads = + std::min(kMaxLoadingThreads, std::thread::hardware_concurrency() / 2); + CHECK_GT(num_loading_threads, 0) + << "std::thread::hardware_concurrency() returned 0"; + LOG(INFO) << "Using " << num_loading_threads + << " threads for loading images."; + depth_image_loader_ = + datasets::threedmatch::createMultithreadedDepthImageLoader( + base_path_, sequence_num_, num_loading_threads, + MemoryType::kDevice); + color_image_loader_ = + datasets::threedmatch::createMultithreadedColorImageLoader( + base_path_, sequence_num_, num_loading_threads, + MemoryType::kDevice); + } else { + depth_image_loader_ = datasets::threedmatch::createDepthImageLoader( + base_path_, sequence_num_, MemoryType::kDevice); + color_image_loader_ = datasets::threedmatch::createColorImageLoader( + base_path_, sequence_num_, MemoryType::kDevice); + } +} + +bool Fuse3DMatch::integrateFrame(const int frame_number) { + timing::Timer timer_file("3dmatch/file_loading"); + + // Get the camera for this frame. + timing::Timer timer_file_intrinsics("file_loading/intrinsics"); + Eigen::Matrix3f camera_intrinsics; + if (!datasets::threedmatch::parseCameraFromFile( + datasets::threedmatch::getPathForCameraIntrinsics(base_path_), + &camera_intrinsics)) { + return false; + } + timer_file_intrinsics.Stop(); + + // Load the image into a Depth Frame. + CHECK(depth_image_loader_); + timing::Timer timer_file_depth("file_loading/depth_image"); + DepthImage depth_frame; + if (!depth_image_loader_->getNextImage(&depth_frame)) { + return false; + } + timer_file_depth.Stop(); + + // Load the color image into a ColorImage + CHECK(color_image_loader_); + timing::Timer timer_file_color("file_loading/color_image"); + ColorImage color_frame; + if (!color_image_loader_->getNextImage(&color_frame)) { + return false; + } + timer_file_color.Stop(); + + // Get the transform. 
+  timing::Timer timer_file_camera("file_loading/camera");
+  Transform T_L_C;
+  if (!datasets::threedmatch::parsePoseFromFile(
+          datasets::threedmatch::getPathForFramePose(base_path_, sequence_num_,
+                                                     frame_number),
+          &T_L_C)) {
+    return false;
+  }
+
+  // Rotate the world frame since Y is up in the normal 3DMatch datasets.
+  Eigen::Quaternionf q_L_O =
+      Eigen::Quaternionf::FromTwoVectors(Vector3f(0, 1, 0), Vector3f(0, 0, 1));
+  T_L_C = q_L_O * T_L_C;
+
+  // Create a camera object.
+  int image_width = depth_frame.cols();
+  int image_height = depth_frame.rows();
+  const auto camera = Camera::fromIntrinsicsMatrix(camera_intrinsics,
+                                                   image_width, image_height);
+
+  // Check that the loaded data doesn't contain NaNs or a faulty rotation
+  // matrix. This does occur. If we find one, skip that frame and move to the
+  // next.
+  constexpr float kRotationMatrixDetEpsilon = 1e-4;
+  if (!T_L_C.matrix().allFinite() || !camera_intrinsics.allFinite() ||
+      std::abs(T_L_C.matrix().block<3, 3>(0, 0).determinant() - 1.0f) >
+          kRotationMatrixDetEpsilon) {
+    LOG(WARNING) << "Bad pose or intrinsics data in frame " << frame_number
+                 << "; skipping it.";
+    return true;  // Bad data, but keep going.
+  }
+
+  timer_file_camera.Stop();
+  timer_file.Stop();
+
+  timing::Timer per_frame_timer("3dmatch/time_per_frame");
+  if ((frame_number + 1) % tsdf_frame_subsampling_ == 0) {
+    timing::Timer timer_integrate("3dmatch/integrate_tsdf");
+    mapper_.integrateDepth(depth_frame, T_L_C, camera);
+    timer_integrate.Stop();
+  }
+
+  if ((frame_number + 1) % color_frame_subsampling_ == 0) {
+    timing::Timer timer_integrate_color("3dmatch/integrate_color");
+    mapper_.integrateColor(color_frame, T_L_C, camera);
+    timer_integrate_color.Stop();
+  }
+
+  if ((frame_number + 1) % mesh_frame_subsampling_ == 0) {
+    timing::Timer timer_mesh("3dmatch/mesh");
+    mapper_.updateMesh();
+  }
+
+  if ((frame_number + 1) % esdf_frame_subsampling_ == 0) {
+    timing::Timer timer_integrate_esdf("3dmatch/integrate_esdf");
+    mapper_.updateEsdfSlice(z_min_, z_max_, z_slice_);
+    timer_integrate_esdf.Stop();
+  }
+
+  per_frame_timer.Stop();
+
+  return true;
+}
+
+bool Fuse3DMatch::integrateFrames() {
+  int frame_number = 0;
+  while (frame_number < num_frames_to_integrate_ &&
+         integrateFrame(frame_number++)) {
+    timing::mark("Frame " + std::to_string(frame_number - 1), Color::Red());
+    LOG(INFO) << "Integrating frame " << frame_number - 1;
+  }
+  return true;
+}
+
+bool Fuse3DMatch::outputPointcloudPly() {
+  timing::Timer timer_write("3dmatch/esdf/write");
+  return io::outputVoxelLayerToPly(mapper_.esdf_layer(), esdf_output_path_);
+}
+
+bool Fuse3DMatch::outputMeshPly() {
+  timing::Timer timer_write("3dmatch/mesh/write");
+  return io::outputMeshLayerToPly(mapper_.mesh_layer(), mesh_output_path_);
+}
+bool Fuse3DMatch::outputTimingsToFile() {
+  LOG(INFO) << "Writing timing to: " << timing_output_path_;
+  std::ofstream timing_file(timing_output_path_);
+  timing_file << nvblox::timing::Timing::Print();
+  timing_file.close();
+  return true;
+}
+
+Fuse3DMatch Fuse3DMatch::createFromCommandLineArgs(int argc, char* argv[]) {
+  std::string base_path = "";
+  std::string timing_output_path = "./3dmatch_timings.txt";
+  std::string esdf_output_path = "";
+  std::string mesh_output_path = "";
+  if (argc < 2) {
+    // Try out running on the test datasets.
+    base_path = "../tests/data/3dmatch";
+    LOG(WARNING) << "No 3DMatch file path specified; defaulting to the test "
+                    "directory.";
+  } else {
+    base_path = argv[1];
+    LOG(INFO) << "Loading 3DMatch files from " << base_path;
+  }
+  // Optionally overwrite the output paths.
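+  // Positional layout (everything after the base path is optional):
+  //   fuse_3dmatch <3dmatch_base_path> [timing_output_path]
+  //                [esdf_output_path] [mesh_output_path]
+  // e.g. (paths illustrative only):
+  //   ./fuse_3dmatch ~/data/3dmatch timings.txt esdf.ply mesh.ply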
+  if (argc >= 3) {
+    timing_output_path = argv[2];
+  }
+  if (argc >= 4) {
+    esdf_output_path = argv[3];
+  }
+  if (argc >= 5) {
+    mesh_output_path = argv[4];
+  }
+
+  nvblox::experiments::Fuse3DMatch fuser(base_path, timing_output_path,
+                                         mesh_output_path, esdf_output_path);
+
+  return fuser;
+}
+
+}  // namespace experiments
+}  // namespace nvblox
diff --git a/nvblox/experiments/src/fuse_3dmatch.cpp b/nvblox/experiments/src/fuse_3dmatch.cpp
new file mode 100644
index 00000000..074b2db6
--- /dev/null
+++ b/nvblox/experiments/src/fuse_3dmatch.cpp
@@ -0,0 +1,81 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include "nvblox/experiments/common/fuse_3dmatch.h"
+
+DECLARE_bool(alsologtostderr);
+
+DEFINE_int32(num_frames, -1,
+             "Number of frames to process. A negative value (the default) "
+             "means process all frames.");
+
+DEFINE_string(timing_output_path, "",
+              "File in which to save the timing results.");
+DEFINE_string(esdf_output_path, "",
+              "File in which to save the ESDF pointcloud.");
+DEFINE_string(mesh_output_path, "", "File in which to save the surface mesh.");
+
+DEFINE_double(voxel_size, 0.0f, "Voxel resolution in meters.");
+DEFINE_int32(tsdf_frame_subsampling, 0,
+             "By what amount to subsample the TSDF frames. A subsample of 3 "
+             "means only every 3rd frame is taken.");
+DEFINE_int32(color_frame_subsampling, 0,
+             "How much to subsample the color integration by.");
+DEFINE_int32(mesh_frame_subsampling, 0,
+             "How much to subsample the meshing by.");
+DEFINE_int32(esdf_frame_subsampling, 0,
+             "How much to subsample the ESDF integration by.");
+
+int main(int argc, char* argv[]) {
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+
+  nvblox::experiments::Fuse3DMatch fuser =
+      nvblox::experiments::Fuse3DMatch::createFromCommandLineArgs(argc, argv);
+  if (FLAGS_num_frames > 0) {
+    fuser.num_frames_to_integrate_ = FLAGS_num_frames;
+  }
+  if (!FLAGS_timing_output_path.empty()) {
+    fuser.timing_output_path_ = FLAGS_timing_output_path;
+  }
+  if (!FLAGS_esdf_output_path.empty()) {
+    fuser.esdf_output_path_ = FLAGS_esdf_output_path;
+  }
+  if (!FLAGS_mesh_output_path.empty()) {
+    fuser.mesh_output_path_ = FLAGS_mesh_output_path;
+  }
+  if (FLAGS_voxel_size > 0.0f) {
+    fuser.setVoxelSize(static_cast<float>(FLAGS_voxel_size));
+  }
+  if (FLAGS_tsdf_frame_subsampling > 0) {
+    fuser.setTsdfFrameSubsampling(FLAGS_tsdf_frame_subsampling);
+  }
+  if (FLAGS_color_frame_subsampling > 0) {
+    fuser.setColorFrameSubsampling(FLAGS_color_frame_subsampling);
+  }
+  if (FLAGS_mesh_frame_subsampling > 0) {
+    fuser.setMeshFrameSubsampling(FLAGS_mesh_frame_subsampling);
+  }
+  if (FLAGS_esdf_frame_subsampling > 0) {
+    fuser.setEsdfFrameSubsampling(FLAGS_esdf_frame_subsampling);
+  }
+
+  // All configuration above must happen before run() so the layers are
+  // created at the correct resolution.
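+  // Example invocation combining the positional arguments with the flags
+  // above (all values illustrative only):
+  //   ./fuse_3dmatch ~/data/3dmatch \
+  //       --num_frames=100 --voxel_size=0.05 \
+  //       --tsdf_frame_subsampling=1 --color_frame_subsampling=4 \
+  //       --mesh_output_path=mesh.ply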
+ return fuser.run(); +} diff --git a/nvblox/experiments/src/integrators/cuda/depth_frame_texture.cu b/nvblox/experiments/src/integrators/cuda/depth_frame_texture.cu new file mode 100644 index 00000000..7d465c90 --- /dev/null +++ b/nvblox/experiments/src/integrators/cuda/depth_frame_texture.cu @@ -0,0 +1,70 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/experiments/integrators/cuda/depth_frame_texture.cuh" + +#include "nvblox/core/cuda/error_check.cuh" + +namespace nvblox { +namespace experiments { + +DepthImageTexture::DepthImageTexture(const DepthImage& depth_frame, + cudaStream_t transfer_stream) { + // Note(alexmillane): Taken from texture memory example + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#texture-memory + + // Allocate CUDA array in device memory + // Each channel is a 32bit float in the first (x) dimension + const cudaChannelFormatDesc channelDesc = + cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat); + checkCudaErrors(cudaMallocArray(&depth_array_, &channelDesc, + depth_frame.width(), depth_frame.height())); + + // Set pitch of the source (the width in memory in bytes of the 2D array + // pointed to by src, including padding), we dont have any padding + const size_t spitch = depth_frame.width() * sizeof(float); + // Copy data located at address h_data in host memory to device memory + checkCudaErrors(cudaMemcpy2DToArrayAsync( + depth_array_, 0, 0, depth_frame.dataConstPtr(), spitch, + depth_frame.width() * sizeof(float), depth_frame.height(), + cudaMemcpyDefault, transfer_stream)); + + // Specify texture + struct cudaResourceDesc resource_description; + memset(&resource_description, 0, sizeof(resource_description)); + resource_description.resType = cudaResourceTypeArray; + resource_description.res.array.array = depth_array_; + + // Specify texture object parameters + struct cudaTextureDesc texture_description; + memset(&texture_description, 0, sizeof(texture_description)); + texture_description.addressMode[0] = cudaAddressModeClamp; + texture_description.addressMode[1] = cudaAddressModeClamp; + texture_description.filterMode = cudaFilterModeLinear; + texture_description.readMode = cudaReadModeElementType; + texture_description.normalizedCoords = 0; + + // Create texture object + checkCudaErrors(cudaCreateTextureObject( + &depth_texture_, &resource_description, &texture_description, NULL)); +} + +DepthImageTexture::~DepthImageTexture() { + cudaDestroyTextureObject(depth_texture_); + cudaFreeArray(depth_array_); +} + +} // namespace experiments +} // namespace nvblox diff --git a/nvblox/experiments/src/integrators/cuda/experimental_integrator_input_frames.cu b/nvblox/experiments/src/integrators/cuda/experimental_integrator_input_frames.cu new file mode 100644 index 00000000..aa29dc45 --- /dev/null +++ b/nvblox/experiments/src/integrators/cuda/experimental_integrator_input_frames.cu @@ -0,0 +1,102 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the 
"License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/experiments/integrators/cuda/experimental_integrator_input_frames.cuh" + +#include "nvblox/integrators/integrators_common.h" + +namespace nvblox { +namespace experiments { + +IntegratorInputFrameExperimentsBase::IntegratorInputFrameExperimentsBase( + const std::vector& block_indices, const DepthImage& depth_frame, + const Transform& T_L_C, const Camera& camera, + const float _truncation_distance_m, const float _max_weight, + TsdfLayer* layer_ptr, cudaStream_t stream) + : block_size(layer_ptr->block_size()), + truncation_distance_m(_truncation_distance_m), + max_weight(_max_weight), + num_blocks(block_indices.size()), + R_C_L(T_L_C.inverse().rotation()), + t_C_L(T_L_C.inverse().translation()), + block_ptrs(getBlockPtrsFromIndices(block_indices, layer_ptr)), + transfer_stream(stream) { + // Allocate GPU memory + checkCudaErrors(cudaMalloc(&R_C_L_device_ptr, sizeof(Eigen::Matrix3f))); + checkCudaErrors(cudaMalloc(&t_C_L_device_ptr, sizeof(Eigen::Vector3f))); + checkCudaErrors(cudaMalloc(&camera_device_ptr, sizeof(Camera))); + checkCudaErrors(cudaMalloc(&block_indices_device_ptr, + block_indices.size() * sizeof(Index3D))); + checkCudaErrors( + cudaMalloc(&block_device_ptrs, + block_indices.size() * sizeof(VoxelBlock*))); + + // Host -> GPU transfers + checkCudaErrors(cudaMemcpyAsync(block_indices_device_ptr, + block_indices.data(), + block_indices.size() * sizeof(Index3D), + cudaMemcpyHostToDevice, transfer_stream)); + checkCudaErrors(cudaMemcpyAsync(R_C_L_device_ptr, R_C_L.data(), + sizeof(Eigen::Matrix3f), + cudaMemcpyHostToDevice, transfer_stream)); + checkCudaErrors(cudaMemcpyAsync(t_C_L_device_ptr, t_C_L.data(), + sizeof(Eigen::Vector3f), + cudaMemcpyHostToDevice, transfer_stream)); + checkCudaErrors(cudaMemcpyAsync(camera_device_ptr, &camera, sizeof(Camera), + cudaMemcpyHostToDevice, transfer_stream)); + checkCudaErrors( + cudaMemcpyAsync(block_device_ptrs, block_ptrs.data(), + block_ptrs.size() * sizeof(VoxelBlock*), + cudaMemcpyHostToDevice, transfer_stream)); +} + +IntegratorInputFrameExperimentsBase::~IntegratorInputFrameExperimentsBase() { + cudaStreamSynchronize(transfer_stream); + checkCudaErrors(cudaFree(block_indices_device_ptr)); + checkCudaErrors(cudaFree(camera_device_ptr)); + checkCudaErrors(cudaFree(R_C_L_device_ptr)); + checkCudaErrors(cudaFree(t_C_L_device_ptr)); + checkCudaErrors(cudaFree(block_device_ptrs)); +} + +IntegratorInputFrameExperimentsTexture::IntegratorInputFrameExperimentsTexture( + const std::vector& block_indices, const DepthImage& depth_frame, + const Transform& T_L_C, const Camera& camera, + const float truncation_distance_m, const float max_weight, + TsdfLayer* layer_ptr, cudaStream_t stream) + : IntegratorInputFrameExperimentsBase(block_indices, depth_frame, T_L_C, + camera, truncation_distance_m, + max_weight, layer_ptr, stream), + depth_texture(depth_frame, stream){ + // The base class handles all the other transfers. 
+};
+
+IntegratorInputFrameExperimentsGlobal::IntegratorInputFrameExperimentsGlobal(
+    const std::vector<Index3D>& block_indices, const DepthImage& depth_frame,
+    const Transform& T_L_C, const Camera& camera,
+    const float truncation_distance_m, const float max_weight,
+    TsdfLayer* layer_ptr, cudaStream_t stream)
+    : IntegratorInputFrameExperimentsBase(block_indices, depth_frame, T_L_C,
+                                          camera, truncation_distance_m,
+                                          max_weight, layer_ptr, stream),
+      depth_frame_unified_ptr(depth_frame.dataConstPtr()),
+      depth_frame_rows(depth_frame.rows()),
+      depth_frame_cols(depth_frame.cols()) {
+  // Send the depth-frame to GPU
+  depth_frame.toGPU();
+};
+
+}  // namespace experiments
+}  // namespace nvblox
diff --git a/nvblox/experiments/src/integrators/cuda/experimental_projective_tsdf_integrators.cu b/nvblox/experiments/src/integrators/cuda/experimental_projective_tsdf_integrators.cu
new file mode 100644
index 00000000..c1b965bd
--- /dev/null
+++ b/nvblox/experiments/src/integrators/cuda/experimental_projective_tsdf_integrators.cu
@@ -0,0 +1,344 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/experiments/integrators/experimental_projective_tsdf_integrators.h"
+
+#include "nvblox/experiments/integrators/cuda/experimental_integrator_input_frames.cuh"
+
+namespace nvblox {
+namespace experiments {
+
+__device__ inline float interpolateDepthTexture(
+    cudaTextureObject_t depth_texture, const Eigen::Vector2f& u_px) {
+  return tex2D<float>(depth_texture, u_px.x() + 0.5, u_px.y() + 0.5);
+}
+
+__device__ inline bool interpolateDepthImage(const float* image, int rows,
+                                             int cols, Eigen::Vector2f u_px,
+                                             float* value_ptr) {
+  // If the projected point does not lie on the image plane, fail. (Here "on
+  // the image plane" means having pixel centers surrounding the query point,
+  // i.e. no extrapolation.)
+  if ((u_px.x() < 0.0f) || (u_px.y() < 0.0f) ||
+      (u_px.x() > static_cast<float>(cols) - 1.0f) ||
+      (u_px.y() > static_cast<float>(rows) - 1.0f)) {
+    return false;
+  }
+  // Interpolation on a grid with 1 pixel spacing.
+  // https://en.wikipedia.org/wiki/Bilinear_interpolation#On_the_unit_square
+  // Get the pixel coordinates of the pixel on the low side
+  const Index2D u_low_side_px = u_px.cast<int>();
+  // Get the 4-neighbours values and put them in a matrix
+  // clang-format off
+  const Eigen::Matrix2f value_matrix =
+      (Eigen::Matrix2f() <<
+       image::access(u_low_side_px.y(),     u_low_side_px.x(),     cols, image),
+       image::access(u_low_side_px.y() + 1, u_low_side_px.x(),     cols, image),
+       image::access(u_low_side_px.y(),     u_low_side_px.x() + 1, cols, image),
+       image::access(u_low_side_px.y() + 1, u_low_side_px.x() + 1, cols, image))
+      .finished();
+  // clang-format on
+  // Offset of the requested point to the low side center.
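+  // With a = u_offset.x() and b = u_offset.y() below, the expression
+  // x_vec^T * value_matrix * y_vec expands to the familiar bilinear blend:
+  //   (1-a)(1-b)*f(y0,x0) + (1-a)b*f(y1,x0) + a(1-b)*f(y0,x1) + a*b*f(y1,x1)
+  // where (x0,y0) is u_low_side_px and f(row,col) indexes the depth image.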
+  const Eigen::Vector2f u_offset = u_px - u_low_side_px.cast<float>();
+  const Eigen::Vector2f x_vec(1.0f - u_offset.x(), u_offset.x());
+  const Eigen::Vector2f y_vec(1.0f - u_offset.y(), u_offset.y());
+  *value_ptr = x_vec.transpose() * value_matrix * y_vec;
+  return true;
+}
+
+__global__ void integrateBlocksTextureBasedInterpolation(
+    const Index3D* block_indices_device_ptr, const Camera* camera_device_ptr,
+    cudaTextureObject_t depth_texture, const Eigen::Matrix3f* R_C_L_device_ptr,
+    const Eigen::Vector3f* t_C_L_device_ptr, const float block_size,
+    const float truncation_distance_m, const float max_weight,
+    VoxelBlock<TsdfVoxel>** block_device_ptrs) {
+  // Linear index of thread within block
+  const int thread_index_linear =
+      threadIdx.x + blockDim.x * (threadIdx.y + (blockDim.y * threadIdx.z));
+
+  // Get the data which is common between all threads in a block into shared
+  // memory
+  // TODO(alexmillane): We could also get the camera into shared memory. But
+  // maybe let's profile things first and see what is actually affecting the
+  // performance.
+  __shared__ Eigen::Matrix3f R_C_L;
+  if (thread_index_linear < 9) {
+    R_C_L.data()[thread_index_linear] =
+        R_C_L_device_ptr->data()[thread_index_linear];
+  }
+  __shared__ Eigen::Vector3f t_C_L;
+  if (thread_index_linear >= 9 && thread_index_linear < 12) {
+    t_C_L.data()[thread_index_linear - 9] =
+        t_C_L_device_ptr->data()[thread_index_linear - 9];
+  }
+  __syncthreads();
+
+  // The indices of the voxel this thread will work on
+  // blockIdx.x       - The index of the block we're working on (blockIdx.y/z
+  //                    should be zero)
+  // threadIdx.x/y/z  - The indices of the voxel within the block (we
+  //                    expect the threadBlockDims == voxelBlockDims)
+  const Index3D block_idx = block_indices_device_ptr[blockIdx.x];
+  const Index3D voxel_idx(threadIdx.z, threadIdx.y, threadIdx.x);
+
+  // Get the Voxel we'll update in this thread
+  // NOTE(alexmillane): Note that we've reversed the voxel indexing order such
+  // that adjacent threads (x-major) access adjacent memory locations in the
+  // block (z-major).
+  TsdfVoxel* voxel_ptr =
+      &(block_device_ptrs[blockIdx.x]
+            ->voxels[threadIdx.z][threadIdx.y][threadIdx.x]);
+
+  // Voxel center point
+  const Vector3f p_voxel_center_L = getCenterPostionFromBlockIndexAndVoxelIndex(
+      block_size, block_idx, voxel_idx);
+  // To camera frame
+  const Vector3f p_voxel_center_C = R_C_L * p_voxel_center_L + t_C_L;
+
+  // Project to image plane
+  Eigen::Vector2f u_px;
+  if (!camera_device_ptr->project(p_voxel_center_C, &u_px)) {
+    return;
+  }
+
+  // If the projected point does not lie on the image plane, fail. (Here "on
+  // the image plane" means having pixel centers surrounding the query point,
+  // i.e. no extrapolation.)
+  if ((u_px.x() < 0.0f) || (u_px.y() < 0.0f) ||
+      (u_px.x() > static_cast<float>(camera_device_ptr->width()) - 1.0f) ||
+      (u_px.y() > static_cast<float>(camera_device_ptr->height()) - 1.0f)) {
+    return;
+  }
+
+  // Get the MEASURED depth of the SURFACE, by interpolating the depth image
+  const float surface_depth_measured =
+      interpolateDepthTexture(depth_texture, u_px);
+
+  // Get the MEASURED depth of the VOXEL
+  const float voxel_distance_measured =
+      surface_depth_measured - p_voxel_center_C.z();
+
+  // If we're behind the negative truncation distance, just continue.
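+  // (Sign convention: voxel_distance_measured is positive for voxels between
+  // the camera and the measured surface and negative behind it, so a value
+  // below -truncation_distance_m means the voxel is occluded well behind the
+  // surface and this measurement tells us nothing about it.)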
+  if (voxel_distance_measured < -truncation_distance_m) {
+    return;
+  }
+
+  // Read CURRENT voxel values (from global GPU memory)
+  const float voxel_distance_current = voxel_ptr->distance;
+  const float voxel_weight_current = voxel_ptr->weight;
+
+  // NOTE(alexmillane): We could try to use CUDA math functions to speed up
+  // below
+  // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE
+
+  // Fuse
+  constexpr float measurement_weight = 1.0f;
+  const float fused_distance = (voxel_distance_measured * measurement_weight +
+                                voxel_distance_current * voxel_weight_current) /
+                               (measurement_weight + voxel_weight_current);
+
+  // Write back to voxel (to global GPU memory)
+  voxel_ptr->distance = fused_distance > 0.0f
+                            ? fmin(truncation_distance_m, fused_distance)
+                            : fmax(-truncation_distance_m, fused_distance);
+  voxel_ptr->weight =
+      fmin(measurement_weight + voxel_weight_current, max_weight);
+}
+
+__global__ void integrateBlocksGlobalBasedInterpolation(
+    const Index3D* block_indices_device_ptr, const Camera* camera_device_ptr,
+    const float* image, int rows, int cols,
+    const Eigen::Matrix3f* R_C_L_device_ptr,
+    const Eigen::Vector3f* t_C_L_device_ptr, const float block_size,
+    const float truncation_distance_m, const float max_weight,
+    VoxelBlock<TsdfVoxel>** block_device_ptrs) {
+  // Linear index of thread within block
+  const int thread_index_linear =
+      threadIdx.x + blockDim.x * (threadIdx.y + (blockDim.y * threadIdx.z));
+
+  // Get the data which is common between all threads in a block into shared
+  // memory
+  // TODO(alexmillane): We could also get the camera into shared memory. But
+  // maybe let's profile things first and see what is actually affecting the
+  // performance.
+  __shared__ Eigen::Matrix3f R_C_L;
+  if (thread_index_linear < 9) {
+    R_C_L.data()[thread_index_linear] =
+        R_C_L_device_ptr->data()[thread_index_linear];
+  }
+  __shared__ Eigen::Vector3f t_C_L;
+  if (thread_index_linear >= 9 && thread_index_linear < 12) {
+    t_C_L.data()[thread_index_linear - 9] =
+        t_C_L_device_ptr->data()[thread_index_linear - 9];
+  }
+  __syncthreads();
+
+  // The indices of the voxel this thread will work on
+  // blockIdx.x       - The index of the block we're working on (blockIdx.y/z
+  //                    should be zero)
+  // threadIdx.x/y/z  - The indices of the voxel within the block (we
+  //                    expect the threadBlockDims == voxelBlockDims)
+  const Index3D block_idx = block_indices_device_ptr[blockIdx.x];
+  const Index3D voxel_idx(threadIdx.z, threadIdx.y, threadIdx.x);
+
+  // Get the Voxel we'll update in this thread
+  // NOTE(alexmillane): Note that we've reversed the voxel indexing order such
+  // that adjacent threads (x-major) access adjacent memory locations in the
+  // block (z-major).
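+  // Concretely: voxels[z][y][x] is laid out with x fastest in memory, and
+  // threadIdx.x is the fastest-varying thread index, so threads whose ids
+  // differ only in threadIdx.x touch consecutive TsdfVoxel structs and the
+  // resulting global-memory accesses coalesce into contiguous segments.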
+  TsdfVoxel* voxel_ptr =
+      &(block_device_ptrs[blockIdx.x]
+            ->voxels[threadIdx.z][threadIdx.y][threadIdx.x]);
+
+  // Voxel center point
+  const Vector3f p_voxel_center_L = getCenterPostionFromBlockIndexAndVoxelIndex(
+      block_size, block_idx, voxel_idx);
+  // To camera frame
+  const Vector3f p_voxel_center_C = R_C_L * p_voxel_center_L + t_C_L;
+
+  // Project to image plane
+  Eigen::Vector2f u_px;
+  if (!camera_device_ptr->project(p_voxel_center_C, &u_px)) {
+    return;
+  }
+
+  // Get the MEASURED depth of the SURFACE, by interpolating the depth image
+  float surface_depth_measured;
+  if (!interpolateDepthImage(image, rows, cols, u_px,
+                             &surface_depth_measured)) {
+    return;
+  }
+
+  // Get the MEASURED depth of the VOXEL
+  const float voxel_distance_measured =
+      surface_depth_measured - p_voxel_center_C.z();
+
+  // If we're behind the negative truncation distance, just continue.
+  if (voxel_distance_measured < -truncation_distance_m) {
+    return;
+  }
+
+  // Read CURRENT voxel values (from global GPU memory)
+  const float voxel_distance_current = voxel_ptr->distance;
+  const float voxel_weight_current = voxel_ptr->weight;
+
+  // NOTE(alexmillane): We could try to use CUDA math functions to speed up
+  // below
+  // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE
+
+  // Fuse
+  constexpr float measurement_weight = 1.0f;
+  const float fused_distance = (voxel_distance_measured * measurement_weight +
+                                voxel_distance_current * voxel_weight_current) /
+                               (measurement_weight + voxel_weight_current);
+
+  // Write back to voxel (to global GPU memory)
+  voxel_ptr->distance = fused_distance > 0.0f
+                            ? fmin(truncation_distance_m, fused_distance)
+                            : fmax(-truncation_distance_m, fused_distance);
+  voxel_ptr->weight =
+      fmin(measurement_weight + voxel_weight_current, max_weight);
+}
+
+ProjectiveTsdfIntegratorExperimentsBase::
+    ProjectiveTsdfIntegratorExperimentsBase()
+    : ProjectiveTsdfIntegrator() {
+  checkCudaErrors(cudaStreamCreate(&integration_stream_));
+}
+
+ProjectiveTsdfIntegratorExperimentsBase::
+    ~ProjectiveTsdfIntegratorExperimentsBase() {
+  finish();
+  checkCudaErrors(cudaStreamDestroy(integration_stream_));
+}
+
+void ProjectiveTsdfIntegratorExperimentsBase::finish() const {
+  cudaStreamSynchronize(integration_stream_);
+}
+
+void ProjectiveTsdfIntegratorExperimentsTexture::updateBlocks(
+    const std::vector<Index3D>& block_indices, const DepthImage& depth_frame,
+    const Transform& T_L_C, const Camera& camera,
+    const float truncation_distance_m, TsdfLayer* layer_ptr) {
+  CHECK_NOTNULL(layer_ptr);
+
+  // Create an integrator frame
+  // Internally this object starts (asynchronous) transfers of its inputs to
+  // device memory. Kernels launched on the passed stream can therefore
+  // utilize the input frame's device-side members.
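+  // The flow below is: (1) constructing the input frame enqueues the
+  // host-to-device copies on integration_stream_, (2) the kernel is launched
+  // on that same stream, so stream ordering guarantees the copies complete
+  // first, and (3) finish() blocks the host until the stream drains.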
+  const IntegratorInputFrameExperimentsTexture input(
+      block_indices, depth_frame, T_L_C, camera, truncation_distance_m,
+      max_weight_, layer_ptr, integration_stream_);
+
+  // Kernel call - One ThreadBlock launched per VoxelBlock
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide);
+  const int num_blocks = input.num_blocks;
+  // clang-format off
+  integrateBlocksTextureBasedInterpolation<<<num_blocks, kThreadsPerBlock, 0, integration_stream_>>>(
+      input.block_indices_device_ptr,
+      input.camera_device_ptr,
+      input.depth_texture.texture_object(),
+      input.R_C_L_device_ptr,
+      input.t_C_L_device_ptr,
+      input.block_size,
+      input.truncation_distance_m,
+      input.max_weight,
+      input.block_device_ptrs);
+  // clang-format on
+  checkCudaErrors(cudaPeekAtLastError());
+
+  // Finish processing of the frame before returning control
+  finish();
+}
+
+void ProjectiveTsdfIntegratorExperimentsGlobal::updateBlocks(
+    const std::vector<Index3D>& block_indices, const DepthImage& depth_frame,
+    const Transform& T_L_C, const Camera& camera,
+    const float truncation_distance_m, TsdfLayer* layer_ptr) {
+  CHECK_NOTNULL(layer_ptr);
+
+  // Create an integrator frame
+  // Internally this object starts (asynchronous) transfers of its inputs to
+  // device memory. Kernels launched on the passed stream can therefore
+  // utilize the input frame's device-side members.
+  const IntegratorInputFrameExperimentsGlobal input(
+      block_indices, depth_frame, T_L_C, camera, truncation_distance_m,
+      max_weight_, layer_ptr, integration_stream_);
+
+  // Kernel call - One ThreadBlock launched per VoxelBlock
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide);
+  const int num_blocks = input.num_blocks;
+  // clang-format off
+  integrateBlocksGlobalBasedInterpolation<<<num_blocks, kThreadsPerBlock, 0, integration_stream_>>>(
+      input.block_indices_device_ptr,
+      input.camera_device_ptr,
+      input.depth_frame_unified_ptr,
+      input.depth_frame_rows,
+      input.depth_frame_cols,
+      input.R_C_L_device_ptr,
+      input.t_C_L_device_ptr,
+      input.block_size,
+      input.truncation_distance_m,
+      input.max_weight,
+      input.block_device_ptrs);
+  // clang-format on
+  checkCudaErrors(cudaPeekAtLastError());
+
+  // Finish processing of the frame before returning control
+  finish();
+}
+
+}  // namespace experiments
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/accessors.h b/nvblox/include/nvblox/core/accessors.h
new file mode 100644
index 00000000..0c3046a0
--- /dev/null
+++ b/nvblox/include/nvblox/core/accessors.h
@@ -0,0 +1,99 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#pragma once + +#include + +#include "nvblox/core/blox.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" +#include "nvblox/core/voxels.h" + +namespace nvblox { + +// Get Block +template +bool getBlockAtPosition(const BlockLayer& layer, + const Eigen::Vector3f& position, + const BlockType** data); + +// Get Voxel +template +bool getVoxelAtPosition(const BlockLayer>& layer, + const Eigen::Vector3f& position, + const VoxelType** data); + +// Accessors for voxelized types +template +const VoxelType* getVoxelAtBlockAndVoxelIndex( + const BlockLayer>& layer, const Index3D& block_index, + const Index3D& voxel_index); + +// Accessors for calling a function on all voxels in a layer +template +using ConstVoxelCallbackFunction = + std::function; + +template +using VoxelCallbackFunction = std::function; + +template +void callFunctionOnAllVoxels(const BlockLayer>& layer, + ConstVoxelCallbackFunction callback); + +template +void callFunctionOnAllVoxels(BlockLayer>* layer, + VoxelCallbackFunction callback); + +// Accessors for calling a function on all voxels in a block +template +using ConstBlockCallbackFunction = + std::function; + +template +using BlockCallbackFunction = + std::function; + +template +void callFunctionOnAllVoxels(const VoxelBlock& block, + ConstBlockCallbackFunction callback); + +template +void callFunctionOnAllVoxels(VoxelBlock* block, + BlockCallbackFunction callback); + +// Accessors for calling a function on all blocks in a layer +template +using ConstBlockCallbackFunction = + std::function; + +template +using BlockCallbackFunction = + std::function; + +template +void callFunctionOnAllBlocks(const BlockLayer& layer, + ConstBlockCallbackFunction callback); + +template +void callFunctionOnAllBlocks(BlockLayer* layer, + BlockCallbackFunction callback); + +} // namespace nvblox + +#include "nvblox/core/impl/accessors_impl.h" diff --git a/nvblox/include/nvblox/core/blox.h b/nvblox/include/nvblox/core/blox.h new file mode 100644 index 00000000..d3fad0a2 --- /dev/null +++ b/nvblox/include/nvblox/core/blox.h @@ -0,0 +1,55 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#pragma once + +#include + +#include "nvblox/core/unified_ptr.h" +#include "nvblox/core/voxels.h" + +namespace nvblox { + +template +struct VoxelBlock { + typedef unified_ptr Ptr; + typedef unified_ptr ConstPtr; + + static constexpr size_t kVoxelsPerSide = 8; + VoxelType voxels[kVoxelsPerSide][kVoxelsPerSide][kVoxelsPerSide]; + + static Ptr allocate(MemoryType memory_type); + static void initOnGPU(VoxelBlock* block_ptr); +}; + +struct FreespaceBlock { + typedef unified_ptr Ptr; + typedef unified_ptr ConstPtr; + + bool free = true; +}; + +// Multires Block + +// Reference Block + +// Initialization Utility Functions +template +void setBlockBytesZeroOnGPU(BlockType* block_device_ptr); +void setColorBlockGrayOnGPU(VoxelBlock* block_device_ptr); + +} // namespace nvblox + +#include "nvblox/core/impl/blox_impl.h" diff --git a/nvblox/include/nvblox/core/bounding_boxes.h b/nvblox/include/nvblox/core/bounding_boxes.h new file mode 100644 index 00000000..6f1e9cd4 --- /dev/null +++ b/nvblox/include/nvblox/core/bounding_boxes.h @@ -0,0 +1,46 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/common_names.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" + +namespace nvblox { + +inline std::vector getBlockIndicesTouchedByBoundingBox( + const float block_size, const AxisAlignedBoundingBox& aabb); + +inline AxisAlignedBoundingBox getAABBOfBlock(const float block_size, + const Index3D& block_index); + +template +AxisAlignedBoundingBox getAABBOfAllocatedBlocks( + const BlockLayer& layer); + +template +std::vector getAllocatedBlocksWithinAABB( + const BlockLayer& layer, const AxisAlignedBoundingBox& aabb); + +AxisAlignedBoundingBox getAABBOfObservedVoxels(const EsdfLayer& layer); +AxisAlignedBoundingBox getAABBOfObservedVoxels(const TsdfLayer& layer, + const float min_weight = 1e-4); +AxisAlignedBoundingBox getAABBOfObservedVoxels(const ColorLayer& layer, + const float min_weight = 1e-4); + +} // namespace nvblox + +#include "nvblox/core/impl/bounding_boxes_impl.h" diff --git a/nvblox/include/nvblox/core/camera.h b/nvblox/include/nvblox/core/camera.h new file mode 100644 index 00000000..39722901 --- /dev/null +++ b/nvblox/include/nvblox/core/camera.h @@ -0,0 +1,130 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/types.h" + +namespace nvblox { + +class Frustum; +class BoundingPlane; + +/// Class that describes the parameters and FoV of a camera. 
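+/// For a pinhole model like this one, project() maps a camera-frame point
+/// (x, y, z) with z > 0 to image-plane coordinates, roughly:
+///   u = fu * x / z + cu,  v = fv * y / z + cv.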
+class Camera { + public: + __host__ __device__ inline Camera(float fu, float fv, float cu, float cv, + int width, int height); + __host__ __device__ inline Camera(float fu, float fv, int width, int height); + + __host__ __device__ inline bool project(const Vector3f& p_C, + Vector2f* u_C) const; + + /// Get the axis aligned bounding box of the view in the LAYER coordinate + /// frame. + __host__ AxisAlignedBoundingBox getViewAABB(const Transform& T_L_C, + const float min_depth, + const float max_depth) const; + + __host__ Frustum getViewFrustum(const Transform& T_L_C, const float min_depth, + const float max_depth) const; + + /// Gets the view corners in the CAMERA coordinate frame. + __host__ Eigen::Matrix getViewCorners( + const float min_depth, const float max_depth) const; + + // Returns an unnormalized ray direction in the camera frame corresponding to + // the passed pixel. + // Two functions, one for (floating point) image-plane coordinates, another + // function for pixel indices. + __host__ __device__ inline Vector3f rayFromImagePlaneCoordinates( + const Vector2f& u_C) const; + __host__ __device__ inline Vector3f rayFromPixelIndices( + const Index2D& u_C) const; + + // Accessors + __host__ __device__ inline float fu() const { return fu_; } + __host__ __device__ inline float fv() const { return fv_; } + __host__ __device__ inline float cu() const { return cu_; } + __host__ __device__ inline float cv() const { return cv_; } + __host__ __device__ inline int width() const { return width_; } + __host__ __device__ inline int height() const { return height_; } + __host__ __device__ inline int cols() const { return width_; } + __host__ __device__ inline int rows() const { return height_; } + + // Factories + inline static Camera fromIntrinsicsMatrix(const Eigen::Matrix3f& mat, + int width, int height); + + private: + const float fu_; + const float fv_; + const float cu_; + const float cv_; + + const int width_; + const int height_; +}; + +/// A bounding plane which has one "inside" direction and the other direction is +/// "outside." Quick tests for which side of the plane you are on. +class BoundingPlane { + public: + BoundingPlane() : normal_(Vector3f::Identity()), distance_(0.0f) {} + + void setFromPoints(const Vector3f& p1, const Vector3f& p2, + const Vector3f& p3); + void setFromDistanceNormal(const Vector3f& normal, float distance); + + /// Is the point on the correct side of the bounding plane? + bool isPointInside(const Vector3f& point) const; + + const Vector3f& normal() const { return normal_; } + float distance() const { return distance_; } + + private: + Vector3f normal_; + float distance_; +}; + +/// Class that allows checking for whether objects are within the field of view +/// of a camera or not. +class Frustum { + public: + // Frustum must be initialized with a camera and min and max depth and pose. + Frustum(const Camera& camera, const Transform& T_L_C, float min_depth, + float max_depth); + + AxisAlignedBoundingBox getAABB() const { return aabb_; } + + bool isPointInView(const Vector3f& point) const; + bool isAABBInView(const AxisAlignedBoundingBox& aabb) const; + + private: + // Helper functions to do the actual computations. + void computeBoundingPlanes(const Eigen::Matrix& corners_C, + const Transform& T_L_C); + + /// Bounding planes containing around the frustum. Expressed in the layer + /// coordinate frame. 
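+  /// (One plane per frustum face: near, far, left, right, top and bottom. A
+  /// point lies inside the frustum exactly when isPointInside() holds for all
+  /// of them.)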
+ std::array bounding_planes_L_; + + /// Cached AABB of the + AxisAlignedBoundingBox aabb_; +}; + +} // namespace nvblox + +#include "nvblox/core/impl/camera_impl.h" diff --git a/nvblox/include/nvblox/core/color.h b/nvblox/include/nvblox/core/color.h new file mode 100644 index 00000000..b5d84d7d --- /dev/null +++ b/nvblox/include/nvblox/core/color.h @@ -0,0 +1,55 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include +#include +#include + +namespace nvblox { + +struct Color { + __host__ __device__ Color() : r(0), g(0), b(0) {} + __host__ __device__ Color(uint8_t _r, uint8_t _g, uint8_t _b) + : r(_r), g(_g), b(_b) {} + + uint8_t r; + uint8_t g; + uint8_t b; + + bool operator==(const Color& other) const { + return (r == other.r) && (g == other.g) && (b == other.b); + } + + // Static functions for working with colors + static Color blendTwoColors(const Color& first_color, float first_weight, + const Color& second_color, float second_weight); + + // Now a bunch of static colors to use! :) + static const Color White() { return Color(255, 255, 255); } + static const Color Black() { return Color(0, 0, 0); } + static const Color Gray() { return Color(127, 127, 127); } + static const Color Red() { return Color(255, 0, 0); } + static const Color Green() { return Color(0, 255, 0); } + static const Color Blue() { return Color(0, 0, 255); } + static const Color Yellow() { return Color(255, 255, 0); } + static const Color Orange() { return Color(255, 127, 0); } + static const Color Purple() { return Color(127, 0, 255); } + static const Color Teal() { return Color(0, 255, 255); } + static const Color Pink() { return Color(255, 0, 127); } +}; + +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/common_names.h b/nvblox/include/nvblox/core/common_names.h new file mode 100644 index 00000000..646d85ff --- /dev/null +++ b/nvblox/include/nvblox/core/common_names.h @@ -0,0 +1,34 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#pragma once
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/voxels.h"
+#include "nvblox/mesh/mesh_block.h"
+
+namespace nvblox {
+
+using TsdfBlock = VoxelBlock<TsdfVoxel>;
+using TsdfLayer = VoxelBlockLayer<TsdfVoxel>;
+using EsdfBlock = VoxelBlock<EsdfVoxel>;
+using EsdfLayer = VoxelBlockLayer<EsdfVoxel>;
+using ColorBlock = VoxelBlock<ColorVoxel>;
+using ColorLayer = VoxelBlockLayer<ColorVoxel>;
+using MeshLayer = BlockLayer<MeshBlock>;
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/cuda/error_check.cuh b/nvblox/include/nvblox/core/cuda/error_check.cuh
new file mode 100644
index 00000000..208f7c15
--- /dev/null
+++ b/nvblox/include/nvblox/core/cuda/error_check.cuh
@@ -0,0 +1,25 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+
+void check_cuda(cudaError_t result, char const* const func,
+                const char* const file, int const line);
+
+#define checkCudaErrors(val) check_cuda((val), #val, __FILE__, __LINE__)
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/cuda/image_cuda.h b/nvblox/include/nvblox/core/cuda/image_cuda.h
new file mode 100644
index 00000000..d43cea03
--- /dev/null
+++ b/nvblox/include/nvblox/core/cuda/image_cuda.h
@@ -0,0 +1,37 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <utility>
+
+namespace nvblox {
+namespace cuda {
+
+template <typename T>
+void copy(const int rows, const int cols, const T* from, T* to);
+
+template <typename T>
+void toGPU(const T* data, const int rows, const int cols);
+
+float max(const int rows, const int cols, const float* image);
+float min(const int rows, const int cols, const float* image);
+std::pair<float, float> minmax(const int rows, const int cols,
+                               const float* image);
+
+}  // namespace cuda
+}  // namespace nvblox
+
+#include "nvblox/core/cuda/impl/image_cuda_impl.cuh"
diff --git a/nvblox/include/nvblox/core/cuda/impl/image_cuda_impl.cuh b/nvblox/include/nvblox/core/cuda/impl/image_cuda_impl.cuh
new file mode 100644
index 00000000..673e6f5b
--- /dev/null
+++ b/nvblox/include/nvblox/core/cuda/impl/image_cuda_impl.cuh
@@ -0,0 +1,37 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/cuda/error_check.cuh" + +namespace nvblox { +namespace cuda { + +template +void copy(const int rows, const int cols, const T* from, T* to) { + checkCudaErrors( + cudaMemcpy(to, from, rows * cols * sizeof(T), cudaMemcpyDefault)); +} + +template +void toGPU(const T* image, const int rows, const int cols) { + int device; + checkCudaErrors(cudaGetDevice(&device)); + checkCudaErrors(cudaMemPrefetchAsync(image, rows * cols * sizeof(T), device)); +} + +} // namespace cuda +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/cuda/warmup.h b/nvblox/include/nvblox/core/cuda/warmup.h new file mode 100644 index 00000000..34e50acb --- /dev/null +++ b/nvblox/include/nvblox/core/cuda/warmup.h @@ -0,0 +1,23 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +namespace nvblox { + +// Little function that warms up the CUDA drivers. +void warmupCuda(); + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/core/hash.h b/nvblox/include/nvblox/core/hash.h new file mode 100644 index 00000000..00228141 --- /dev/null +++ b/nvblox/include/nvblox/core/hash.h @@ -0,0 +1,68 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include +#include +#include +#include + +#include "nvblox/core/types.h" + +namespace nvblox { + +/** + * Performs deco hashing on block indexes. Based on recommendations of + * "Investigating the impact of Suboptimal Hashing Functions" by L. Buckley et + * al. 
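+ * The hash treats the index as a base-sl number with an arbitrarily chosen
+ * large multiplier sl, i.e. roughly h(i) = i.x + i.y * sl + i.z * sl^2
+ * (see operator() below).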
+ */
+struct Index3DHash {
+  /// The number was arbitrarily chosen with no good justification.
+  static constexpr size_t sl = 17191;
+  static constexpr size_t sl2 = sl * sl;
+
+  __host__ __device__ std::size_t operator()(const Index3D& index) const {
+    return static_cast<std::size_t>(index.x() + index.y() * sl +
+                                    index.z() * sl2);
+  }
+};
+
+template <typename T>
+struct VectorCompare {
+  __host__ __device__ inline bool operator()(const T& p_1,
+                                             const T& p_2) const {
+    if (p_1.x() != p_2.x()) {
+      return p_1.x() < p_2.x();
+    }
+    if (p_1.y() != p_2.y()) {
+      return p_1.y() < p_2.y();
+    }
+    return p_1.z() < p_2.z();
+  };
+};
+
+template <typename ValueType>
+struct Index3DHashMapType {
+  typedef std::unordered_map<
+      Index3D, ValueType, Index3DHash, std::equal_to<Index3D>,
+      Eigen::aligned_allocator<std::pair<const Index3D, ValueType>>>
+      type;
+};
+
+typedef std::unordered_set<Index3D, Index3DHash, std::equal_to<Index3D>,
+                           Eigen::aligned_allocator<Index3D>>
+    Index3DSet;
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/image.h b/nvblox/include/nvblox/core/image.h
new file mode 100644
index 00000000..09597167
--- /dev/null
+++ b/nvblox/include/nvblox/core/image.h
@@ -0,0 +1,152 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include
+
+#include "nvblox/core/color.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/unified_ptr.h"
+
+namespace nvblox {
+
+// Row-major access to images (on host and device)
+namespace image {
+
+template <typename ElementType>
+__host__ __device__ inline ElementType access(int row_idx, int col_idx,
+                                              int cols,
+                                              const ElementType* data) {
+  return data[row_idx * cols + col_idx];
+}
+template <typename ElementType>
+__host__ __device__ inline ElementType& access(int row_idx, int col_idx,
+                                               int cols, ElementType* data) {
+  return data[row_idx * cols + col_idx];
+}
+template <typename ElementType>
+__host__ __device__ inline ElementType access(int linear_idx,
+                                              const ElementType* data) {
+  return data[linear_idx];
+}
+template <typename ElementType>
+__host__ __device__ inline ElementType& access(int linear_idx,
+                                               ElementType* data) {
+  return data[linear_idx];
+}
+
+}  // namespace image
+
+// Row-major image.
+// - Note that in a row-major image, rows follow one another in linear memory,
+//   which means the column index varies between subsequent elements.
+// - Images use corner based indexing such that the pixel with index (0,0) is
+//   centered at (0.5px,0.5px) and spans (0.0px,0.0px) to (1.0px,1.0px).
+// - Points on the image plane are defined as: u_px = (u_px.x(), u_px.y()) =
+//   (col_idx, row_idx) in pixels.
+
+constexpr MemoryType kDefaultImageMemoryType = MemoryType::kDevice;
+
+template <typename _ElementType>
+class Image {
+ public:
+  // "Save" the element type so it's queryable as Image::ElementType.
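+  // This lets generic code name the pixel type, e.g. (illustrative only):
+  //   template <typename ImageType>
+  //   typename ImageType::ElementType firstPixel(const ImageType& image) {
+  //     return image(0, 0);
+  //   }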
+ typedef _ElementType ElementType; + + Image(int rows, int cols, MemoryType memory_type = kDefaultImageMemoryType) + : rows_(rows), + cols_(cols), + memory_type_(memory_type), + data_(make_unified(static_cast(rows * cols), + memory_type_)) {} + Image(MemoryType memory_type = kDefaultImageMemoryType) + : rows_(0), cols_(0), memory_type_(memory_type) {} + Image(Image&& other); + Image& operator=(Image&& other); + + // Deep copy constructor (second can be used to transition memory type) + Image(const Image& other); + Image(const Image& other, MemoryType memory_type); + Image& operator=(const Image& other); + + // Prefetch the data to the gpu (only makes sense for kUnified images) + void toGPU() const; + + // Attributes + inline int cols() const { return cols_; } + inline int rows() const { return rows_; } + inline int numel() const { return cols_ * rows_; } + inline int width() const { return cols_; } + inline int height() const { return rows_; } + inline MemoryType memory_type() const { return memory_type_; } + + // Access + // NOTE(alexmillane): The guard-rails are off here. If you declare a kDevice + // Image and try to access its data, you will get undefined behaviour. + inline ElementType operator()(const int row_idx, const int col_idx) const { + return image::access(row_idx, col_idx, cols_, data_.get()); + } + inline ElementType& operator()(const int row_idx, const int col_idx) { + return image::access(row_idx, col_idx, cols_, data_.get()); + } + inline ElementType operator()(const int linear_idx) const { + return image::access(linear_idx, data_.get()); + } + inline ElementType& operator()(const int linear_idx) { + return image::access(linear_idx, data_.get()); + } + + // Raw pointer access + inline ElementType* dataPtr() { return data_.get(); } + inline const ElementType* dataConstPtr() const { return data_.get(); } + + // Reset the contents of the image. Reallocate if the image got larger. + void populateFromBuffer(int rows, int cols, const ElementType* buffer, + MemoryType memory_type = kDefaultImageMemoryType); + + // Factories + static inline Image fromBuffer( + int rows, int cols, const ElementType* buffer, + MemoryType memory_type = kDefaultImageMemoryType); + + protected: + int rows_; + int cols_; + MemoryType memory_type_; + unified_ptr data_; +}; + +using DepthImage = Image; +using ColorImage = Image; + +// Image Reductions +namespace image { + +float max(const DepthImage& image); +float min(const DepthImage& image); +std::pair minmax(const DepthImage& image); +float maxGPU(const DepthImage& image); +float minGPU(const DepthImage& image); +std::pair minmaxGPU(const DepthImage& image); +float maxCPU(const DepthImage& image); +float minCPU(const DepthImage& image); +std::pair minmaxCPU(const DepthImage& image); + +} // namespace image + +} // namespace nvblox + +#include "nvblox/core/impl/image_impl.h" diff --git a/nvblox/include/nvblox/core/impl/accessors_impl.h b/nvblox/include/nvblox/core/impl/accessors_impl.h new file mode 100644 index 00000000..79f852c3 --- /dev/null +++ b/nvblox/include/nvblox/core/impl/accessors_impl.h @@ -0,0 +1,201 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/indexing.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" +#include "nvblox/core/voxels.h" + +namespace nvblox { + +template +bool getBlockAtPosition(const BlockLayer& layer, + const Eigen::Vector3f& position, + const BlockType** data) { + typename BlockType::ConstPtr block_ptr = layer.getBlockAtPosition(position); + if (block_ptr != nullptr) { + *data = block_ptr.get(); + return true; + } + return false; +} + +template +bool getVoxelAtPosition(const BlockLayer>& layer, + const Eigen::Vector3f& position, + const VoxelType** data) { + Index3D block_idx; + Index3D voxel_idx; + getBlockAndVoxelIndexFromPositionInLayer(layer.block_size(), position, + &block_idx, &voxel_idx); + const typename VoxelBlock::ConstPtr block_ptr = + layer.getBlockAtIndex(block_idx); + if (block_ptr) { + *data = &block_ptr->voxels[voxel_idx.x()][voxel_idx.y()][voxel_idx.z()]; + return true; + } + return false; +} + +template +void callFunctionOnAllVoxels(const BlockLayer>& layer, + ConstVoxelCallbackFunction callback) { + std::vector block_indices = layer.getAllBlockIndices(); + + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + + bool clone = (layer.memory_type() == MemoryType::kDevice); + + // Iterate over all the blocks: + for (const Index3D& block_index : block_indices) { + Index3D voxel_index; + typename VoxelBlock::ConstPtr block; + if (clone) { + block = layer.getBlockAtIndex(block_index).clone(MemoryType::kHost); + } else { + block = layer.getBlockAtIndex(block_index); + } + if (!block) { + continue; + } + // Iterate over all the voxels: + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + // Get the voxel and call the callback on it: + const VoxelType* voxel = + &block->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + callback(block_index, voxel_index, voxel); + } + } + } + } +} + +template +void callFunctionOnAllVoxels(BlockLayer>* layer, + VoxelCallbackFunction callback) { + std::vector block_indices = layer->getAllBlockIndices(); + + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + + // Iterate over all the blocks: + for (const Index3D& block_index : block_indices) { + Index3D voxel_index; + typename VoxelBlock::Ptr block = + layer->getBlockAtIndex(block_index); + if (!block) { + continue; + } + // Iterate over all the voxels: + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + // Get the voxel and call the callback on it: + VoxelType* voxel = + &block->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + callback(block_index, voxel_index, voxel); + } + } + } + } +} + +template +const VoxelType* getVoxelAtBlockAndVoxelIndex( + const BlockLayer>& layer, const Index3D& block_index, + const Index3D& voxel_index) 
{ + typename VoxelBlock::ConstPtr block = + layer.getBlockAtIndex(block_index); + if (!block) { + return nullptr; + } + // TODO: add DCHECKS + return &block->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; +} + +template +void callFunctionOnAllVoxels(const VoxelBlock& block, + ConstBlockCallbackFunction callback) { + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + // Iterate over all the voxels: + Index3D voxel_index; + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + // Get the voxel and call the callback on it: + const VoxelType* voxel = + &block.voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + callback(voxel_index, voxel); + } + } + } +} + +template +void callFunctionOnAllVoxels(VoxelBlock* block, + BlockCallbackFunction callback) { + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + // Iterate over all the voxels: + Index3D voxel_index; + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + // Get the voxel and call the callback on it: + VoxelType* voxel = + &block->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + callback(voxel_index, voxel); + } + } + } +} + +template +void callFunctionOnAllBlocks(const BlockLayer& layer, + ConstBlockCallbackFunction callback) { + const std::vector block_indices = layer.getAllBlockIndices(); + for (const Index3D& block_index : block_indices) { + typename BlockType::ConstPtr block = layer.getBlockAtIndex(block_index); + CHECK(block); + callback(block_index, block.get()); + } +} + +template +void callFunctionOnAllBlocks(BlockLayer* layer, + BlockCallbackFunction callback) { + std::vector block_indices = layer->getAllBlockIndices(); + for (const Index3D& block_index : block_indices) { + typename BlockType::Ptr block = layer->getBlockAtIndex(block_index); + CHECK(block); + callback(block_index, block.get()); + } +} + +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/blox_impl.h b/nvblox/include/nvblox/core/impl/blox_impl.h new file mode 100644 index 00000000..6bc4532d --- /dev/null +++ b/nvblox/include/nvblox/core/impl/blox_impl.h @@ -0,0 +1,50 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#pragma once + +namespace nvblox { + +template +typename VoxelBlock::Ptr VoxelBlock::allocate( + MemoryType memory_type) { + Ptr voxel_block_ptr = make_unified(memory_type); + if (memory_type == MemoryType::kDevice) { + initOnGPU(voxel_block_ptr.get()); + } + return voxel_block_ptr; +} + +template +void VoxelBlock::initOnGPU(VoxelBlock* block_ptr) { + setBlockBytesZeroOnGPU(block_ptr); +} + +// Initialization specialization for ColorVoxel which is initialized to gray +// with zero weight +template <> +inline void VoxelBlock::initOnGPU( + VoxelBlock* block_ptr) { + setColorBlockGrayOnGPU(block_ptr); +} + +template +void setBlockBytesZeroOnGPU(BlockType* block_device_ptr) { + // TODO(alexmillane): This is a cuda call in a public header... Is this + // causing issues? + cudaMemset(block_device_ptr, 0, sizeof(BlockType)); +} + +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/bounding_boxes_impl.h b/nvblox/include/nvblox/core/impl/bounding_boxes_impl.h new file mode 100644 index 00000000..de5e7998 --- /dev/null +++ b/nvblox/include/nvblox/core/impl/bounding_boxes_impl.h @@ -0,0 +1,78 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/accessors.h" + +namespace nvblox { + +inline std::vector getBlockIndicesTouchedByBoundingBox( + const float block_size, const AxisAlignedBoundingBox& aabb_L) { + // Get bounds (in voxels) + const Index3D aabb_min_vox_L = + getBlockIndexFromPositionInLayer(block_size, aabb_L.min()); + const Index3D aabb_max_vox_L = + getBlockIndexFromPositionInLayer(block_size, aabb_L.max()); + const Index3D num_blocks = aabb_max_vox_L - aabb_min_vox_L; + // Fill up the vector + std::vector block_indices; + block_indices.reserve(num_blocks.x() * num_blocks.y() * num_blocks.z()); + Index3D current_idx = aabb_min_vox_L; + for (int x_ind = 0; x_ind <= num_blocks.x(); x_ind++) { + for (int y_ind = 0; y_ind <= num_blocks.y(); y_ind++) { + for (int z_ind = 0; z_ind <= num_blocks.z(); z_ind++) { + block_indices.push_back(current_idx); + current_idx.z()++; + } + current_idx.z() -= (num_blocks.z() + 1); // reset z + current_idx.y()++; + } + current_idx.y() -= (num_blocks.y() + 1); // reset y + current_idx.x()++; + } + return block_indices; +} + +inline AxisAlignedBoundingBox getAABBOfBlock(const float block_size, + const Index3D& block_index) { + return AxisAlignedBoundingBox( + block_index.cast() * block_size, + (block_index.cast() + Vector3f(1.0f, 1.0f, 1.0f)) * block_size); +} + +template +AxisAlignedBoundingBox getAABBOfAllocatedBlocks( + const BlockLayer& layer) { + AxisAlignedBoundingBox aabb; + for (const Index3D& idx : layer.getAllBlockIndices()) { + aabb = aabb.merged(getAABBOfBlock(layer.block_size(), idx)); + } + return aabb; +} + +template +std::vector getAllocatedBlocksWithinAABB( + const BlockLayer& layer, const AxisAlignedBoundingBox& aabb) { + std::vector allocated_blocks; + for (const Index3D& idx : layer.getAllBlockIndices()) { + if (aabb.intersects(getAABBOfBlock(layer.block_size(), idx))) { 
+ allocated_blocks.push_back(idx); + } + } + return allocated_blocks; +} + +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/camera_impl.h b/nvblox/include/nvblox/core/impl/camera_impl.h new file mode 100644 index 00000000..496d226e --- /dev/null +++ b/nvblox/include/nvblox/core/impl/camera_impl.h @@ -0,0 +1,72 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +namespace nvblox { + +Camera::Camera(float fu, float fv, float cu, float cv, int width, int height) + : fu_(fu), fv_(fv), cu_(cu), cv_(cv), width_(width), height_(height) {} +Camera::Camera(float fu, float fv, int width, int height) + : Camera(fu, fv, width / 2.0, height / 2.0, width, height) {} + +bool Camera::project(const Eigen::Vector3f& p_C, Eigen::Vector2f* u_C) const { + // Point is behind the camera. + if (p_C.z() <= 0.0f) { + return false; + } + const float inv_z = 1.0f / p_C.z(); + *u_C = inv_z * Eigen::Vector2f(p_C.x(), p_C.y()); + + u_C->x() = u_C->x() * fu_ + cu_; + u_C->y() = u_C->y() * fv_ + cv_; + + if (u_C->x() > width_ || u_C->y() > height_ || u_C->x() < 0 || u_C->y() < 0) { + return false; + } + return true; +} + +Vector3f Camera::rayFromImagePlaneCoordinates(const Vector2f& u_C) const { + // NOTE(alexmillane): We allow u_C values up to the outer edges of pixels, + // hence the GE and LE checks. + DCHECK_GE(u_C[0], 0.0f); + DCHECK_LE(u_C[0], width_); + DCHECK_GE(u_C[1], 0.0f); + DCHECK_LE(u_C[1], height_); + return Vector3f((u_C[0] - cu_) / fu_, // NOLINT + (u_C[1] - cv_) / fv_, // NOLINT + 1.0f); +} + +Vector3f Camera::rayFromPixelIndices(const Index2D& u_C) const { + // NOTE(alexmillane): The +0.5 here takes us from image plane indices, which + // are equal to the coordinates of the lower pixel corner, to the pixel + // center. + return rayFromImagePlaneCoordinates(u_C.cast() + Vector2f(0.5, 0.5)); +} + +Camera Camera::fromIntrinsicsMatrix(const Eigen::Matrix3f& mat, int width, + int height) { + const float fu = mat(0, 0); + const float fv = mat(1, 1); + const float cu = mat(0, 2); + const float cv = mat(1, 2); + return Camera(fu, fv, cu, cv, width, height); +} + +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/image_impl.h b/nvblox/include/nvblox/core/impl/image_impl.h new file mode 100644 index 00000000..a3acb44d --- /dev/null +++ b/nvblox/include/nvblox/core/impl/image_impl.h @@ -0,0 +1,112 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#pragma once + +#include + +#include "nvblox/core/cuda/image_cuda.h" +#include "nvblox/core/interpolation_2d.h" +#include "nvblox/io/csv.h" + +namespace nvblox { + +template +Image::Image(const Image& other) + : Image(other, other.memory_type_){}; + +template +Image::Image(const Image& other, + MemoryType memory_type) + : rows_(other.rows()), + cols_(other.cols()), + memory_type_(memory_type), + data_(make_unified(static_cast(rows_ * cols_), + memory_type)) { + LOG(WARNING) << "Deep copy of Image."; + cuda::copy(rows_, cols_, other.data_.get(), data_.get()); +} + +template +Image::Image(Image&& other) + : rows_(other.rows()), + cols_(other.cols()), + memory_type_(other.memory_type()), + data_(std::move(other.data_)) {} + +template +Image& Image::operator=(Image&& other) { + rows_ = other.rows_; + cols_ = other.cols_; + memory_type_ = other.memory_type_; + data_ = std::move(other.data_); + return *this; +} + +template +Image& Image::operator=( + const Image& other) { + LOG(WARNING) << "Deep copy of Image."; + rows_ = other.rows_; + cols_ = other.cols_; + memory_type_ = other.memory_type_; + data_ = make_unified(static_cast(rows_ * cols_), + memory_type_); + cuda::copy(rows_, cols_, other.data_.get(), data_.get()); + return *this; +} + +template +void Image::toGPU() const { + CHECK(memory_type_ == MemoryType::kUnified) + << "Called kUnified function on kDevice image."; + cuda::toGPU(data_.get(), rows_, cols_); +} + +template +ImageType fromBufferTemplate(int rows, int cols, + const typename ImageType::ElementType* buffer, + MemoryType memory_type) { + ImageType image(rows, cols, memory_type); + cuda::copy(rows, cols, buffer, + image.dataPtr()); + return image; +} + +template +Image Image::fromBuffer(int rows, int cols, + const ElementType* buffer, + MemoryType memory_type) { + return fromBufferTemplate>(rows, cols, buffer, + memory_type); +} + +template +void Image::populateFromBuffer(int rows, int cols, + const ElementType* buffer, + MemoryType memory_type) { + if (!data_ || numel() < rows * cols || memory_type != memory_type_) { + // We need to reallocate. + data_ = make_unified(static_cast(rows * cols), + memory_type); + } + rows_ = rows; + cols_ = cols; + memory_type_ = memory_type; + cudaMemcpy(data_.get(), buffer, rows * cols * sizeof(ElementType), + cudaMemcpyDefault); +} + +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/indexing_impl.h b/nvblox/include/nvblox/core/impl/indexing_impl.h new file mode 100644 index 00000000..3c50414b --- /dev/null +++ b/nvblox/include/nvblox/core/impl/indexing_impl.h @@ -0,0 +1,94 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include "nvblox/core/blox.h" + +namespace nvblox { + +// TODO: this is badly done now for non-Eigen types. +/// Input position: metric (world) units with respect to the layer origin. +/// Output index: voxel index relative to the block origin. 
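+// Worked example (hypothetical numbers): with 8 voxels per block side and
+// block_size = 0.4 m, voxel_size = 0.05 m. A point at x = 0.93 m lies in
+// global voxel floor(0.93 / 0.05) = 18, and 18 = 2 * 8 + 2, so it falls in
+// voxel 2 of block 2 along x.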
+Index3D getVoxelIndexFromPositionInLayer(const float block_size,
+                                         const Vector3f& position) {
+  constexpr int kVoxelsPerSide = VoxelBlock<bool>::kVoxelsPerSide;
+  constexpr float kVoxelsPerSideInv = 1.0f / kVoxelsPerSide;
+  const float voxel_size = block_size * kVoxelsPerSideInv;
+
+  Index3D global_voxel_index =
+      (position / voxel_size).array().floor().cast<int>();
+  Index3D voxel_index = global_voxel_index.unaryExpr([&](int x) {
+    // Wrap into [0, kVoxelsPerSide); plain % is negative for negative x.
+    const int m = x % kVoxelsPerSide;
+    return (m < 0) ? m + kVoxelsPerSide : m;
+  });
+
+  // Double-check that we get reasonable indices out.
+  DCHECK((voxel_index.array() >= 0).all() &&
+         (voxel_index.array() < kVoxelsPerSide).all());
+
+  return voxel_index;
+}
+
+Index3D getBlockIndexFromPositionInLayer(const float block_size,
+                                         const Vector3f& position) {
+  Eigen::Vector3i index = (position / block_size).array().floor().cast<int>();
+  return Index3D(index.x(), index.y(), index.z());
+}
+
+void getBlockAndVoxelIndexFromPositionInLayer(const float block_size,
+                                              const Vector3f& position,
+                                              Index3D* block_idx,
+                                              Index3D* voxel_idx) {
+  constexpr int kVoxelsPerSideMinusOne = VoxelBlock<bool>::kVoxelsPerSide - 1;
+  constexpr float kVoxelsPerSideInv = 1.0f / VoxelBlock<bool>::kVoxelsPerSide;
+  const float voxel_size_inv = 1.0f / (block_size * kVoxelsPerSideInv);
+  *block_idx = (position / block_size).array().floor().cast<int>();
+  *voxel_idx =
+      ((position - block_size * block_idx->cast<float>()) * voxel_size_inv)
+          .array()
+          .cast<int>()
+          .min(kVoxelsPerSideMinusOne);
+}
+
+Vector3f getPositionFromBlockIndexAndVoxelIndex(const float block_size,
+                                                const Index3D& block_index,
+                                                const Index3D& voxel_index) {
+  constexpr float kVoxelsPerSideInv = 1.0f / VoxelBlock<bool>::kVoxelsPerSide;
+  const float voxel_size = block_size * kVoxelsPerSideInv;
+
+  return Vector3f(block_size * block_index.cast<float>() +
+                  voxel_size * voxel_index.cast<float>());
+}
+
+Vector3f getPositionFromBlockIndex(const float block_size,
+                                   const Index3D& block_index) {
+  // This is pretty trivial, huh.
+  return Vector3f(block_size * block_index.cast<float>());
+}
+
+Vector3f getCenterPostionFromBlockIndexAndVoxelIndex(
+    const float block_size, const Index3D& block_index,
+    const Index3D& voxel_index) {
+  constexpr float kHalfVoxelsPerSideInv =
+      0.5f / VoxelBlock<bool>::kVoxelsPerSide;
+  const float half_voxel_size = block_size * kHalfVoxelsPerSideInv;
+
+  return getPositionFromBlockIndexAndVoxelIndex(block_size, block_index,
+                                                voxel_index) +
+         Vector3f(half_voxel_size, half_voxel_size, half_voxel_size);
+}
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/impl/interpolation_2d_impl.h b/nvblox/include/nvblox/core/impl/interpolation_2d_impl.h
new file mode 100644
index 00000000..f426dfba
--- /dev/null
+++ b/nvblox/include/nvblox/core/impl/interpolation_2d_impl.h
@@ -0,0 +1,131 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+namespace interpolation {
+namespace internal {
+
+__host__ __device__ inline float interpolatePixels(const Vector2f& xy_px,
+                                                   const float& f00,
+                                                   const float& f01,
+                                                   const float& f10,
+                                                   const float& f11) {
+  // Interpolation on a grid with 1 pixel spacing.
+  // https://en.wikipedia.org/wiki/Bilinear_interpolation#On_the_unit_square
+  const Eigen::Matrix2f value_matrix =
+      (Eigen::Matrix2f() << f00, f01, f10, f11).finished();
+  const Eigen::Vector2f x_vec(1.0f - xy_px.x(), xy_px.x());
+  const Eigen::Vector2f y_vec(1.0f - xy_px.y(), xy_px.y());
+  return x_vec.transpose() * value_matrix * y_vec;
+}
+
+__host__ __device__ inline Color interpolatePixels(const Vector2f& xy_px,
+                                                   const Color& c00,
+                                                   const Color& c01,
+                                                   const Color& c10,
+                                                   const Color& c11) {
+  // We define interpolating two colors as independently interpolating their
+  // channels.
+  return Color(
+      static_cast<uint8_t>(
+          std::round(interpolatePixels(xy_px, c00.r, c01.r, c10.r, c11.r))),
+      static_cast<uint8_t>(
+          std::round(interpolatePixels(xy_px, c00.g, c01.g, c10.g, c11.g))),
+      static_cast<uint8_t>(
+          std::round(interpolatePixels(xy_px, c00.b, c01.b, c10.b, c11.b))));
+}
+
+}  // namespace internal
+
+template <typename ElementType>
+bool interpolate2D(const Image<ElementType>& frame, const Vector2f& u_px,
+                   ElementType* value_interpolated_ptr,
+                   const InterpolationType type) {
+  if (type == InterpolationType::kNearestNeighbor) {
+    return interpolate2DClosest(frame, u_px, value_interpolated_ptr);
+  }
+  if (type == InterpolationType::kLinear) {
+    return interpolate2DLinear(frame, u_px, value_interpolated_ptr);
+  } else {
+    CHECK(false) << "Requested interpolation method is not implemented.";
+  }
+  return false;
+}
+
+template <typename ElementType>
+bool interpolate2DClosest(const Image<ElementType>& frame, const Vector2f& u_px,
+                          ElementType* value_interpolated_ptr) {
+  return interpolate2DClosest(frame.dataConstPtr(), u_px, frame.rows(),
+                              frame.cols(), value_interpolated_ptr);
+}
+
+template <typename ElementType>
+bool interpolate2DClosest(const ElementType* frame, const Vector2f& u_px,
+                          const int rows, const int cols,
+                          ElementType* value_interpolated_ptr) {
+  // Closest pixel
+  const Index2D u_M_rounded = u_px.array().round().cast<int>();
+  // Check bounds:
+  if (u_M_rounded.x() < 0 || u_M_rounded.y() < 0 || u_M_rounded.x() >= cols ||
+      u_M_rounded.y() >= rows) {
+    return false;
+  }
+  // "Interpolate"
+  *value_interpolated_ptr =
+      image::access(u_M_rounded.y(), u_M_rounded.x(), cols, frame);
+  return true;
+}
+
+template <typename ElementType>
+bool interpolate2DLinear(const Image<ElementType>& frame, const Vector2f& u_px,
+                         ElementType* value_interpolated_ptr) {
+  return interpolate2DLinear(frame.dataConstPtr(), u_px, frame.rows(),
+                             frame.cols(), value_interpolated_ptr);
+}
+
+template <typename ElementType>
+bool interpolate2DLinear(const ElementType* frame, const Vector2f& u_px,
+                         const int rows, const int cols,
+                         ElementType* value_interpolated_ptr) {
+  // Subtraction of Vector2f(0.5, 0.5) takes our coordinates from
+  // corner-referenced to center-referenced.
+  const Vector2f u_center_referenced_px = u_px - Vector2f(0.5, 0.5);
+  // Get the pixel index of the pixel on the low side (which is also the image
+  // plane location of the pixel center).
+  const Index2D u_low_side_px = u_center_referenced_px.cast<int>();
+  // If we're going to access out of bounds, fail.
+  if ((u_low_side_px.array() < 0).any() ||
+      ((u_low_side_px.x() + 1) > (cols - 1)) ||
+      ((u_low_side_px.y() + 1) > (rows - 1))) {
+    return false;
+  }
+  // Offset of the requested point to the low side center.
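+  // Worked example (hypothetical numbers): u_px = (2.3, 4.7) gives
+  // u_center_referenced_px = (1.8, 4.2), so u_low_side_px = (1, 4) and
+  // u_offset = (0.8, 0.2); the result below then mixes the four neighbours
+  // with bilinear weights, e.g. (1 - 0.8) * (1 - 0.2) for the low corner.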
+  const Eigen::Vector2f u_offset =
+      (u_center_referenced_px - u_low_side_px.cast<float>());
+  // clang-format off
+  *value_interpolated_ptr = internal::interpolatePixels(
+      u_offset,
+      image::access(u_low_side_px.y(),     u_low_side_px.x(),     cols, frame),
+      image::access(u_low_side_px.y() + 1, u_low_side_px.x(),     cols, frame),
+      image::access(u_low_side_px.y(),     u_low_side_px.x() + 1, cols, frame),
+      image::access(u_low_side_px.y() + 1, u_low_side_px.x() + 1, cols, frame));
+  // clang-format on
+  return true;
+}
+
+}  // namespace interpolation
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/impl/interpolation_3d_impl.h b/nvblox/include/nvblox/core/impl/interpolation_3d_impl.h
new file mode 100644
index 00000000..71ef90af
--- /dev/null
+++ b/nvblox/include/nvblox/core/impl/interpolation_3d_impl.h
@@ -0,0 +1,190 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+namespace nvblox {
+namespace interpolation {
+
+template <typename VoxelType>
+void interpolateOnCPU(const std::vector<Vector3f>& points_L,
+                      const VoxelBlockLayer<VoxelType>& layer,
+                      std::vector<float>* distances_ptr,
+                      std::vector<bool>* success_flags_ptr) {
+  CHECK_NOTNULL(distances_ptr);
+  CHECK_NOTNULL(success_flags_ptr);
+  CHECK(layer.memory_type() == MemoryType::kUnified)
+      << "For CPU-based interpolation, the layer must be CPU accessible (ie "
+         "MemoryType::kUnified).";
+  distances_ptr->reserve(points_L.size());
+  success_flags_ptr->reserve(points_L.size());
+  for (const Vector3f& p_L : points_L) {
+    float distance;
+    success_flags_ptr->push_back(interpolateOnCPU(p_L, layer, &distance));
+    distances_ptr->push_back(distance);
+  }
+}
+
+namespace internal {
+
+// Interpolate a layer's voxel's members using a member-access function
+template <typename VoxelType>
+using GetMemberFunctionType = std::function<float(const VoxelType& voxel)>;
+template <typename VoxelType>
+using VoxelValidFunctionType = std::function<bool(const VoxelType& voxel)>;
+
+template <typename VoxelType>
+bool interpolateMemberOnCPU(const Vector3f& p_L,
+                            const VoxelBlockLayer<VoxelType>& layer,
+                            GetMemberFunctionType<VoxelType> get_member,
+                            VoxelValidFunctionType<VoxelType> is_valid,
+                            float* result_ptr);
+
+template <typename VoxelType>
+bool getSurroundingVoxels3D(const Eigen::Vector3f p_L,
+                            const VoxelBlockLayer<VoxelType>& layer,
+                            VoxelValidFunctionType<VoxelType> is_valid,
+                            std::array<VoxelType, 8>* voxels_ptr,
+                            Vector3f* p_offset_in_voxels_L_ptr = nullptr) {
+  CHECK_NOTNULL(voxels_ptr);
+  // Get the low-side voxel: ie the voxel whose midpoint lies on the low side
+  // of the query point.
+  // NOTE(alexmillane): We implement this by finding the block and voxel
+  // coordinates of a point half a voxel-width lower than the query point.
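+  // Worked example in 1-D (hypothetical numbers): with voxel_size = 0.1 m,
+  // voxel centers lie at 0.05 m, 0.15 m, ... For a query at x = 0.12 m, the
+  // shifted point 0.12 - 0.05 = 0.07 m falls in voxel 0, whose center
+  // (0.05 m) is indeed the nearest voxel center on the low side of the query.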
+  const float voxel_size = layer.voxel_size();
+  const float half_voxel_size = voxel_size * 0.5f;
+  Index3D low_block_idx;
+  Index3D low_voxel_idx;
+  getBlockAndVoxelIndexFromPositionInLayer(layer.block_size(),
+                                           p_L.array() - half_voxel_size,
+                                           &low_block_idx, &low_voxel_idx);
+
+  // Get offset between bottom-left-corner voxel and the point
+  if (p_offset_in_voxels_L_ptr != nullptr) {
+    const Vector3f p_corner_L =
+        getPositionFromBlockIndexAndVoxelIndex(layer.block_size(),
+                                               low_block_idx, low_voxel_idx) +
+        half_voxel_size * Vector3f::Ones();
+    *p_offset_in_voxels_L_ptr = (p_L - p_corner_L) / voxel_size;
+  }
+
+  // Get the 8 voxels surrounding this point
+  int linear_idx = 0;
+  for (int x = 0; x < 2; x++) {
+    for (int y = 0; y < 2; y++) {
+      for (int z = 0; z < 2; z++) {
+        Index3D voxel_idx = low_voxel_idx + Index3D(x, y, z);
+        Index3D block_idx = low_block_idx;
+
+        // Move into a neighbouring block(s) if required
+        if ((voxel_idx.array() ==
+             VoxelBlock<VoxelType>::kVoxelsPerSide).any()) {
+          // Increment block index in dimensions which have hit the limits
+          const Eigen::Array<bool, 3, 1> limits_hit_flags =
+              (voxel_idx.array() == VoxelBlock<VoxelType>::kVoxelsPerSide);
+          block_idx += limits_hit_flags.matrix().cast<int>();
+          // Reset voxel index to zero in dimensions in which the block idx was
+          // incremented
+          voxel_idx = limits_hit_flags.select(0, voxel_idx);
+        }
+
+        // Get the voxel
+        const typename VoxelBlock<VoxelType>::ConstPtr block_ptr =
+            layer.getBlockAtIndex(block_idx);
+        if (block_ptr == nullptr) {
+          return false;
+        }
+        const VoxelType& voxel =
+            block_ptr->voxels[voxel_idx.x()][voxel_idx.y()][voxel_idx.z()];
+
+        // Check voxel valid
+        if (!is_valid(voxel)) {
+          return false;
+        }
+        // Write
+        (*voxels_ptr)[linear_idx] = voxel;
+        ++linear_idx;
+      }
+    }
+  }
+  return true;
+}
+
+Eigen::Matrix<float, 1, 8> getQVector3D(const Vector3f& p_offset_in_voxels_L) {
+  // Units are in voxels
+  CHECK((p_offset_in_voxels_L.array() >= 0.0f).all());
+  CHECK((p_offset_in_voxels_L.array() <= 1.0f).all());
+
+  constexpr float kEps = 1e-4;
+  constexpr float kOnePlusEps = 1.0f + kEps;
+  CHECK((p_offset_in_voxels_L.array() <= kOnePlusEps).all());
+  // FROM PAPER (http://spie.org/samples/PM159.pdf)
+  // clang-format off
+  Eigen::Matrix<float, 1, 8> q_vector;
+  q_vector <<
+      1,
+      p_offset_in_voxels_L[0],
+      p_offset_in_voxels_L[1],
+      p_offset_in_voxels_L[2],
+      p_offset_in_voxels_L[0] * p_offset_in_voxels_L[1],
+      p_offset_in_voxels_L[1] * p_offset_in_voxels_L[2],
+      p_offset_in_voxels_L[2] * p_offset_in_voxels_L[0],
+      p_offset_in_voxels_L[0] * p_offset_in_voxels_L[1] * p_offset_in_voxels_L[2];
+  // clang-format on
+  return q_vector;
+}
+
+template <typename VoxelType>
+bool interpolateMemberOnCPU(const Vector3f& p_L,
+                            const VoxelBlockLayer<VoxelType>& layer,
+                            GetMemberFunctionType<VoxelType> get_member,
+                            VoxelValidFunctionType<VoxelType> is_valid,
+                            float* result_ptr) {
+  std::array<VoxelType, 8> voxels;
+
+  Eigen::Vector3f p_offset_L;
+  if (!internal::getSurroundingVoxels3D(p_L, layer, is_valid, &voxels,
+                                        &p_offset_L)) {
+    return false;
+  }
+
+  // Express interpolation as matrix multiplication from paper:
+  // (http://spie.org/samples/PM159.pdf)
+  Eigen::Matrix<float, 8, 1> member_vector;
+  for (int i = 0; i < 8; i++) {
+    member_vector[i] = get_member(voxels[i]);
+  }
+  const Eigen::Matrix<float, 1, 8> q_vector =
+      internal::getQVector3D(p_offset_L);
+  // clang-format off
+  static const Eigen::Matrix<float, 8, 8> interpolation_table =
+      (Eigen::Matrix<float, 8, 8>() <<
+        1,  0,  0,  0,  0,  0,  0,  0,
+       -1,  0,  0,  0,  1,  0,  0,  0,
+       -1,  0,  1,  0,  0,  0,  0,  0,
+       -1,  1,  0,  0,  0,  0,  0,  0,
+        1,  0, -1,  0, -1,  0,  1,  0,
+        1, -1, -1,  1,  0,  0,  0,  0,
+        1, -1,  0,  0, -1,  1,  0,  0,
+       -1,  1,  1, -1,  1, -1, -1,  1
+      )
.finished(); + // clang-format on + *result_ptr = q_vector * (interpolation_table * member_vector); + return true; +} + +} // namespace internal +} // namespace interpolation +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/layer_cake_impl.h b/nvblox/include/nvblox/core/impl/layer_cake_impl.h new file mode 100644 index 00000000..34e57801 --- /dev/null +++ b/nvblox/include/nvblox/core/impl/layer_cake_impl.h @@ -0,0 +1,108 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/variadic_template_tools.h" + +namespace nvblox { + +template +LayerType* LayerCake::add(MemoryType memory_type) { + if (layers_.count(typeid(LayerType)) == 0) { + // Allocate + auto layer_ptr = std::make_unique( + sizeArgumentFromVoxelSize(voxel_size_), memory_type); + LayerType* return_ptr = layer_ptr.get(); + // Store (as BaseLayer ptr) + layers_.emplace(std::type_index(typeid(LayerType)), std::move(layer_ptr)); + LOG(INFO) << "Adding Layer with type: " << typeid(LayerType).name() + << " , and memory_type: " << toString(memory_type) + << " to LayerCake."; + return return_ptr; + } else { + LOG(WARNING) << "Request to add a LayerType that's already in the cake. " + "Currently we only support single layers of each " + "LayerType. So we did nothing."; + return nullptr; + } +} + +template +const LayerType* LayerCake::getConstPtr() const { + auto it = layers_.find(std::type_index(typeid(LayerType))); + if (it != layers_.end()) { + const BaseLayer* base_ptr = it->second.get(); + const LayerType* ptr = dynamic_cast(base_ptr); + CHECK_NOTNULL(ptr); + return ptr; + } else { + LOG(WARNING) << "Request for a LayerType: " << typeid(LayerType).name() + << " which is not in the cake."; + return nullptr; + } +} + +template +LayerType* LayerCake::getPtr() { + auto it = layers_.find(std::type_index(typeid(LayerType))); + if (it != layers_.end()) { + BaseLayer* base_ptr = it->second.get(); + LayerType* ptr = dynamic_cast(base_ptr); + CHECK_NOTNULL(ptr); + return ptr; + } else { + LOG(WARNING) << "Request for a LayerType: " << typeid(LayerType).name() + << " which is not in the cake."; + return nullptr; + } +} + +template +const LayerType& LayerCake::get() const { + const LayerType* ptr = getConstPtr(); + CHECK_NOTNULL(ptr); + return *ptr; +} + +template +bool LayerCake::exists() const { + const auto it = layers_.find(std::type_index(typeid(LayerType))); + return (it != layers_.end()); +} + +template +LayerCake LayerCake::create(float voxel_size, MemoryType memory_type) { + static_assert(unique_types::value, + "At the moment we only support LayerCakes containing unique " + "LayerTypes."); + LayerCake cake(voxel_size); + BaseLayer* ignored[] = {cake.add(memory_type)...}; + return cake; +} + +template +LayerCake LayerCake::create(float voxel_size, + MemoryTypes... 
memory_types) { + static_assert(unique_types::value, + "At the moment we only support LayerCakes containing unique " + "LayerTypes."); + LayerCake cake(voxel_size); + BaseLayer* ignored[] = {cake.add(memory_types)...}; + return cake; +} + + +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/layer_impl.h b/nvblox/include/nvblox/core/impl/layer_impl.h new file mode 100644 index 00000000..8be099ef --- /dev/null +++ b/nvblox/include/nvblox/core/impl/layer_impl.h @@ -0,0 +1,185 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include "nvblox/core/accessors.h" +#include "nvblox/core/indexing.h" +#include "nvblox/core/types.h" + +namespace nvblox { + +// Block accessors by index. +template +typename BlockType::Ptr BlockLayer::getBlockAtIndex( + const Index3D& index) { + // Look up the block in the hash? + // And return it. + auto it = blocks_.find(index); + if (it != blocks_.end()) { + return it->second; + } else { + return typename BlockType::Ptr(); + } +} + +template +typename BlockType::ConstPtr BlockLayer::getBlockAtIndex( + const Index3D& index) const { + const auto it = blocks_.find(index); + if (it != blocks_.end()) { + return (it->second); + } else { + return typename BlockType::ConstPtr(); + } +} + +template +typename BlockType::Ptr BlockLayer::allocateBlockAtIndex( + const Index3D& index) { + auto it = blocks_.find(index); + if (it != blocks_.end()) { + return it->second; + } else { + // Invalidate the GPU hash + gpu_layer_view_up_to_date_ = false; + // Blocks define their own method for allocation. + auto insert_status = + blocks_.emplace(index, BlockType::allocate(memory_type_)); + return insert_status.first->second; + } +} + +// Block accessors by position. 
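+// A usage sketch (hypothetical; voxel size and positions are made up):
+//
+//   TsdfLayer layer(0.05f /* voxel_size [m] */, MemoryType::kUnified);
+//   auto block = layer.allocateBlockAtPosition(Vector3f(1.0f, 2.0f, 3.0f));
+//   auto same = layer.getBlockAtPosition(Vector3f(1.0f, 2.0f, 3.0f));
+//   CHECK(block == same);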
+template +typename BlockType::Ptr BlockLayer::getBlockAtPosition( + const Eigen::Vector3f& position) { + return getBlockAtIndex( + getBlockIndexFromPositionInLayer(block_size_, position)); +} + +template +typename BlockType::ConstPtr BlockLayer::getBlockAtPosition( + const Eigen::Vector3f& position) const { + return getBlockAtIndex( + getBlockIndexFromPositionInLayer(block_size_, position)); +} + +template +typename BlockType::Ptr BlockLayer::allocateBlockAtPosition( + const Eigen::Vector3f& position) { + return allocateBlockAtIndex( + getBlockIndexFromPositionInLayer(block_size_, position)); +} + +template +std::vector BlockLayer::getAllBlockIndices() const { + std::vector indices; + indices.reserve(blocks_.size()); + + for (const auto& kv : blocks_) { + indices.push_back(kv.first); + } + + return indices; +} + +template +bool BlockLayer::isBlockAllocated(const Index3D& index) const { + const auto it = blocks_.find(index); + return (it != blocks_.end()); +} + +template +typename BlockLayer::GPULayerViewType +BlockLayer::getGpuLayerView() const { + if (!gpu_layer_view_) { + gpu_layer_view_ = std::make_unique(); + } + if (!gpu_layer_view_up_to_date_) { + (*gpu_layer_view_).reset(const_cast*>(this)); + gpu_layer_view_up_to_date_ = true; + } + return *gpu_layer_view_; +} + +// VoxelBlockLayer + +template +void VoxelBlockLayer::getVoxels( + const std::vector& positions_L, + std::vector* voxels_ptr, + std::vector* success_flags_ptr) const { + CHECK_NOTNULL(voxels_ptr); + CHECK_NOTNULL(success_flags_ptr); + + cudaStream_t transfer_stream; + checkCudaErrors(cudaStreamCreate(&transfer_stream)); + + voxels_ptr->resize(positions_L.size()); + success_flags_ptr->resize(positions_L.size()); + + for (int i = 0; i < positions_L.size(); i++) { + const Vector3f& p_L = positions_L[i]; + // Get the block address + Index3D block_idx; + Index3D voxel_idx; + getBlockAndVoxelIndexFromPositionInLayer(this->block_size_, p_L, &block_idx, + &voxel_idx); + const typename VoxelBlock::ConstPtr block_ptr = + this->getBlockAtIndex(block_idx); + if (!block_ptr) { + (*success_flags_ptr)[i] = false; + continue; + } + (*success_flags_ptr)[i] = true; + // Get the voxel address + const auto block_raw_ptr = block_ptr.get(); + const VoxelType* voxel_ptr = + &block_raw_ptr->voxels[voxel_idx.x()][voxel_idx.y()][voxel_idx.z()]; + // Copy the Voxel to the CPU + cudaMemcpyAsync(&(*voxels_ptr)[i], voxel_ptr, sizeof(VoxelType), + cudaMemcpyDefault, transfer_stream); + } + cudaStreamSynchronize(transfer_stream); + checkCudaErrors(cudaStreamDestroy(transfer_stream)); +} + +namespace internal { + +template +inline float sizeArgumentFromVoxelSize(float voxel_size); + +template <> +inline float sizeArgumentFromVoxelSize(float voxel_size) { + return voxel_size; +} + +template <> +inline float sizeArgumentFromVoxelSize(float voxel_size) { + return voxel_size * VoxelBlock::kVoxelsPerSide; +} + +} // namespace internal + +template +constexpr float sizeArgumentFromVoxelSize(float voxel_size) { + return internal::sizeArgumentFromVoxelSize< + typename traits::is_voxel_layer::type>(voxel_size); +} + +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/unified_ptr_impl.h b/nvblox/include/nvblox/core/impl/unified_ptr_impl.h new file mode 100644 index 00000000..ce842038 --- /dev/null +++ b/nvblox/include/nvblox/core/impl/unified_ptr_impl.h @@ -0,0 +1,353 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include +#include + +#include "nvblox/core/cuda/error_check.cuh" + +namespace nvblox { + +// Single object +template +typename _Unified_if::_Single_object make_unified(Args&&... args) { + return make_unified(unified_ptr::kDefaultMemoryType, + std::forward(args)...); +} + +template +typename _Unified_if::_Single_object make_unified(MemoryType memory_type, + Args&&... args) { + T* cuda_ptr = nullptr; + if (memory_type == MemoryType::kDevice) { + // No constructor (or destructor, hence the check) + CHECK(std::is_trivially_destructible::value); + checkCudaErrors(cudaMalloc(&cuda_ptr, sizeof(T))); + return unified_ptr(cuda_ptr, memory_type); + } else if (memory_type == MemoryType::kUnified) { + // Constructor called + checkCudaErrors( + cudaMallocManaged(&cuda_ptr, sizeof(T), cudaMemAttachGlobal)); + return unified_ptr(new (cuda_ptr) T(args...), memory_type); + } else { + // Constructor called + checkCudaErrors(cudaMallocHost(&cuda_ptr, sizeof(T))); + return unified_ptr(new (cuda_ptr) T(args...), memory_type); + } +} + +// Array +template +typename _Unified_if::_Unknown_bound make_unified(std::size_t size) { + return make_unified(size, unified_ptr::kDefaultMemoryType); +} + +template +typename _Unified_if::_Unknown_bound make_unified(std::size_t size, + MemoryType memory_type) { + typedef typename std::remove_extent::type TNonArray; + TNonArray* cuda_ptr = nullptr; + if (memory_type == MemoryType::kDevice) { + // No constructor + checkCudaErrors(cudaMalloc(&cuda_ptr, sizeof(TNonArray) * size)); + return unified_ptr(cuda_ptr, memory_type, size); + } else if (memory_type == MemoryType::kUnified) { + // Default constructor + checkCudaErrors(cudaMallocManaged(&cuda_ptr, sizeof(TNonArray) * size, + cudaMemAttachGlobal)); + return unified_ptr(new (cuda_ptr) TNonArray[size], memory_type, size); + } else { + // Default constructor + checkCudaErrors(cudaMallocHost(&cuda_ptr, sizeof(TNonArray) * size)); + return unified_ptr(new (cuda_ptr) TNonArray[size], memory_type, size); + } +} + +// Default constructor. +template +unified_ptr::unified_ptr() + : memory_type_(kDefaultMemoryType), ptr_(nullptr), ref_counter_(nullptr) {} + +// Useful constructor. +template +unified_ptr::unified_ptr(T_noextent* ptr, MemoryType memory_type, + size_t size) + : memory_type_(memory_type), + size_(size), + ptr_(ptr), + ref_counter_(new std::atomic(1)) {} + +// Copy constructor. +template +unified_ptr::unified_ptr(const unified_ptr& other) + : memory_type_(other.memory_type_), + size_(other.size_), + ptr_(other.ptr_), + ref_counter_(other.ref_counter_) { + if (ptr_ != nullptr) { + (*ref_counter_)++; + } +} + +// Move constructor. 
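+// (Note that, unlike std::unique_ptr, this handle is reference counted: the
+// copy constructor above and the move constructor below both share ownership
+// by incrementing the count.) A usage sketch (hypothetical):
+//
+//   auto a = make_unified<float>(MemoryType::kUnified, 1.0f);
+//   auto b = a;                           // shares the same managed float
+//   auto c = a.clone(MemoryType::kHost);  // deep copy, separate allocation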
+template +unified_ptr::unified_ptr(unified_ptr&& other) + : memory_type_(other.memory_type_), + size_(other.size_), + ptr_(other.ptr_), + ref_counter_(other.ref_counter_) { + if (ptr_ != nullptr) { + (*ref_counter_)++; + } +} + +template +struct Deleter { + static void destroy(T* ptr, MemoryType memory_type) { + if (memory_type == MemoryType::kUnified) { + ptr->~T(); + checkCudaErrors( + cudaFree(const_cast(reinterpret_cast(ptr)))); + } else if (memory_type == MemoryType::kDevice) { + checkCudaErrors( + cudaFree(const_cast(reinterpret_cast(ptr)))); + } else { + ptr->~T(); + checkCudaErrors( + cudaFreeHost(const_cast(reinterpret_cast(ptr)))); + } + } +}; + +template +struct Deleter { + static void destroy(T* ptr, MemoryType memory_type) { + // Do nothing at run-time + static_assert( + std::is_trivially_destructible::value, + "Objects stored in unified_ptr must be trivially destructible."); + if (memory_type == MemoryType::kDevice || + memory_type == MemoryType::kUnified) { + checkCudaErrors( + cudaFree(const_cast(reinterpret_cast(ptr)))); + } else { + checkCudaErrors( + cudaFreeHost(const_cast(reinterpret_cast(ptr)))); + } + } +}; + +// Destructor. +template +unified_ptr::~unified_ptr() { + if (ptr_ != nullptr) { + (*ref_counter_)--; + if (*ref_counter_ <= 0) { + Deleter::destroy(ptr_, memory_type_); + delete ref_counter_; + ptr_ = nullptr; + ref_counter_ = nullptr; + } + } +} + +// Operators +template +unified_ptr& unified_ptr::operator=(const unified_ptr& other) { + reset(); + ptr_ = other.ptr_; + ref_counter_ = other.ref_counter_; + memory_type_ = other.memory_type_; + if (ptr_ != nullptr) { + (*ref_counter_)++; + } + return *this; +} + +template +unified_ptr& unified_ptr::operator=(unified_ptr&& other) { + reset(); + ptr_ = other.ptr_; + ref_counter_ = other.ref_counter_; + memory_type_ = other.memory_type_; + if (ptr_ != nullptr) { + (*ref_counter_)++; + } + return *this; +} + +template +template ::value, bool>> +unified_ptr::operator unified_ptr() const { + unified_ptr const_ptr; + if (ptr_ != nullptr) { + const_ptr.ptr_ = ptr_; + const_ptr.ref_counter_ = ref_counter_; + const_ptr.memory_type_ = memory_type_; + (*const_ptr.ref_counter_)++; + } + return const_ptr; +} + +// Operator bool +template +unified_ptr::operator bool() const { + return ptr_ != nullptr; +} + +// Operator comparison +template +bool unified_ptr::operator==(const unified_ptr& other) const { + return ptr_ == other.ptr_; +} + +template +bool unified_ptr::operator!=(const unified_ptr& other) const { + return ptr_ != other.ptr_; +} + +template +typename unified_ptr::T_noextent* unified_ptr::operator->() { + CHECK(memory_type_ != MemoryType::kDevice); + return ptr_; +} + +template +T& unified_ptr::operator*() { + CHECK(memory_type_ != MemoryType::kDevice); + return *ptr_; +} + +template +const typename unified_ptr::T_noextent* unified_ptr::operator->() const { + CHECK(memory_type_ != MemoryType::kDevice); + return ptr_; +} + +template +const T& unified_ptr::operator*() const { + CHECK(memory_type_ != MemoryType::kDevice); + return *ptr_; +} + +// Getters +template +typename unified_ptr::T_noextent* unified_ptr::get() { + return ptr_; +} + +template +const typename unified_ptr::T_noextent* unified_ptr::get() const { + return ptr_; +} + +// Reset +template +void unified_ptr::reset() { + if (ptr_ != nullptr) { + (*ref_counter_)--; + if (*ref_counter_ <= 0) { + Deleter::destroy(ptr_, memory_type_); + delete ref_counter_; + } + ptr_ = nullptr; + ref_counter_ = nullptr; + } +} + +// Cloner +template +struct Cloner { + typedef 
typename std::remove_cv::type T_nonconst; + static unified_ptr clone(const unified_ptr& original, + MemoryType memory_type, size_t) { + auto other = make_unified(memory_type); + cudaMemcpy(other.get(), original.get(), sizeof(T), cudaMemcpyDefault); + return other; + } +}; + +template +struct Cloner { + typedef typename std::remove_extent::type T_noextent; + typedef typename std::remove_cv::type T_nonconst; + static unified_ptr clone(const unified_ptr& original, + MemoryType memory_type, size_t size) { + auto other = make_unified(size, memory_type); + cudaMemcpy(other.get(), original.get(), sizeof(T_noextent) * size, + cudaMemcpyDefault); + return other; + } +}; + +template +unified_ptr::type> unified_ptr::clone() const { + return Cloner::clone(*this, memory_type_, size_); +} + +template +unified_ptr::type> unified_ptr::clone( + MemoryType memory_type) const { + return Cloner::clone(*this, memory_type, size_); +} + +// Hint to move to GPU or CPU. +template +void unified_ptr::toGPU() { + CHECK(memory_type_ == MemoryType::kUnified); + if (ptr_ == nullptr) { + return; + } + int device; + cudaGetDevice(&device); + checkCudaErrors(cudaMemPrefetchAsync(ptr_, sizeof(T), device)); +} + +template +void unified_ptr::toGPU(cudaStream_t cuda_stream) { + CHECK(memory_type_ == MemoryType::kUnified); + if (ptr_ == nullptr) { + return; + } + int device; + cudaGetDevice(&device); + checkCudaErrors(cudaMemPrefetchAsync(ptr_, sizeof(T), device, cuda_stream)); +} + +template +void unified_ptr::toCPU() { + CHECK(memory_type_ == MemoryType::kUnified); + if (ptr_ == nullptr) { + return; + } + checkCudaErrors(cudaMemPrefetchAsync(ptr_, sizeof(T), cudaCpuDeviceId)); +} + +template +void unified_ptr::preferGPU() { + CHECK(memory_type_ == MemoryType::kUnified); + int device; + cudaGetDevice(&device); + checkCudaErrors(cudaMemAdvise(ptr_, sizeof(T), + cudaMemAdviseSetPreferredLocation, device)); +} + +template +void unified_ptr::setZero() { + checkCudaErrors(cudaMemset(ptr_, 0, sizeof(T))); +} +} // namespace nvblox diff --git a/nvblox/include/nvblox/core/impl/unified_vector_impl.h b/nvblox/include/nvblox/core/impl/unified_vector_impl.h new file mode 100644 index 00000000..bc203415 --- /dev/null +++ b/nvblox/include/nvblox/core/impl/unified_vector_impl.h @@ -0,0 +1,298 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include +#include +#include +#include +#include + +#include "nvblox/core/cuda/error_check.cuh" + +namespace nvblox { + +// Constructors and destructors. 
+template +unified_vector::unified_vector(MemoryType memory_type) + : memory_type_(memory_type), + buffer_(nullptr), + buffer_size_(0), + buffer_capacity_(0) { + static_assert(std::is_default_constructible::value, + "Need to have a default constructor to use unified_vector."); + static_assert(std::is_trivially_destructible::value, + "Need to have a trivial destructor to use unified_vector."); +} + +template +unified_vector::unified_vector(size_t size, MemoryType memory_type) + : unified_vector(memory_type) { + resize(size); +} + +template +unified_vector::unified_vector(size_t size, const T& initial, + MemoryType memory_type) + : unified_vector(memory_type) { + CHECK_NE(static_cast(memory_type_), + static_cast(MemoryType::kDevice)) + << "Can't set initial values for device memory. Use copy instead."; + resize(size); + for (size_t i = 0; i < size; i++) { + buffer_[i] = initial; + } +} + +// Copy constructor. +template +unified_vector::unified_vector(const unified_vector& other, + MemoryType memory_type) + : unified_vector(memory_type) { + resize(other.size()); + + // Do a cuda memcpy on this, which will hopefully figure out the 2 data types + // correctly. + checkCudaErrors(cudaMemcpy(buffer_, other.data(), sizeof(T) * other.size(), + cudaMemcpyDefault)); +} + +template +unified_vector::unified_vector(const std::vector& other, + MemoryType memory_type) + : unified_vector(memory_type) { + resize(other.size()); + + checkCudaErrors(cudaMemcpy(buffer_, other.data(), sizeof(T) * other.size(), + cudaMemcpyDefault)); +} + +// Move constructor. +template +unified_vector::unified_vector(unified_vector&& other) + : memory_type_(other.memory_type_), + buffer_(other.buffer_), + buffer_size_(other.buffer_size_), + buffer_capacity_(other.buffer_capacity_) { + other.buffer_ = nullptr; + other.buffer_size_ = 0; + other.buffer_capacity_ = 0; +} + +template +unified_vector::~unified_vector() { + clear(); +} + +// Operators +template +T& unified_vector::operator[](size_t index) { + return buffer_[index]; +} + +template +const T& unified_vector::operator[](size_t index) const { + return buffer_[index]; +} + +// Copy assignment +template +unified_vector& unified_vector::operator=( + const unified_vector& other) { + resize(other.size()); + checkCudaErrors(cudaMemcpy(buffer_, other.data(), sizeof(T) * other.size(), + cudaMemcpyDefault)); + return *this; +} + +// Move assignment +template +unified_vector& unified_vector::operator=(unified_vector&& other) { + clear(); + buffer_ = other.buffer_; + buffer_size_ = other.buffer_size_; + buffer_capacity_ = other.buffer_capacity_; + other.buffer_ = nullptr; + other.buffer_size_ = 0; + other.buffer_capacity_ = 0; + return *this; +} + +template +unified_vector& unified_vector::operator=(const std::vector& other) { + resize(other.size()); + checkCudaErrors(cudaMemcpy(buffer_, other.data(), sizeof(T) * other.size(), + cudaMemcpyDefault)); + return *this; +} + +template +std::vector unified_vector::toVector() const { + std::vector vect(buffer_size_); + checkCudaErrors(cudaMemcpy(vect.data(), buffer_, sizeof(T) * buffer_size_, + cudaMemcpyDefault)); + return vect; +} + +// Get those raw pointers. +template +T* unified_vector::data() { + return buffer_; +} + +template +const T* unified_vector::data() const { + return buffer_; +} + +// Hint to move the memory to the GPU. 
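+// A usage sketch (hypothetical): fill on the host, then prefetch before
+// kernel launches; push_back grows capacity geometrically like std::vector.
+//
+//   unified_vector<float> values(MemoryType::kUnified);
+//   for (int i = 0; i < 1000; i++) {
+//     values.push_back(static_cast<float>(i));
+//   }
+//   values.toGPU();  // prefetch hint only; access remains valid either way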
+template +void unified_vector::toGPU() { + if (buffer_ == nullptr || buffer_capacity_ == 0) { + return; + } + int device = 0; + checkCudaErrors(cudaGetDevice(&device)); + checkCudaErrors( + cudaMemPrefetchAsync(buffer_, buffer_capacity_ * sizeof(T), device)); +} + +template +void unified_vector::toCPU() { + if (buffer_ == nullptr || buffer_capacity_ == 0) { + return; + } + checkCudaErrors(cudaMemPrefetchAsync(buffer_, buffer_capacity_ * sizeof(T), + cudaCpuDeviceId)); +} + +// Accessors +template +size_t unified_vector::capacity() const { + return buffer_capacity_; +} + +template +size_t unified_vector::size() const { + return buffer_size_; +} + +template +bool unified_vector::empty() const { + return buffer_size_ == 0; +} + +// Changing the size. +template +void unified_vector::reserve(size_t capacity) { + if (buffer_capacity_ < capacity) { + // Create a new buffer. + T* new_buffer = nullptr; + + if (memory_type_ == MemoryType::kUnified) { + checkCudaErrors(cudaMallocManaged(&new_buffer, sizeof(T) * capacity, + cudaMemAttachGlobal)); + } else if (memory_type_ == MemoryType::kDevice) { + checkCudaErrors(cudaMalloc(&new_buffer, sizeof(T) * capacity)); + } else { + checkCudaErrors(cudaMallocHost(&new_buffer, sizeof(T) * capacity)); + } + + if (buffer_ != nullptr) { + // Copy the old values to the new buffer. + checkCudaErrors(cudaMemcpy(new_buffer, buffer_, sizeof(T) * buffer_size_, + cudaMemcpyDefault)); + + // Delete the old buffer. + if (memory_type_ == MemoryType::kHost) { + checkCudaErrors(cudaFreeHost(reinterpret_cast(buffer_))); + } else { + checkCudaErrors(cudaFree(reinterpret_cast(buffer_))); + } + } + buffer_ = new_buffer; + buffer_capacity_ = capacity; + } +} + +template +void unified_vector::resize(size_t size) { + // Absolute no-op. + if (buffer_size_ == size) { + return; + } else { + reserve(size); + buffer_size_ = size; + } +} + +template +void unified_vector::clear() { + if (buffer_ != nullptr) { + if (memory_type_ == MemoryType::kHost) { + checkCudaErrors(cudaFreeHost(reinterpret_cast(buffer_))); + } else { + checkCudaErrors(cudaFree(reinterpret_cast(buffer_))); + } + } + buffer_ = nullptr; + buffer_size_ = 0; + buffer_capacity_ = 0; +} + +template +void unified_vector::push_back(const T& value) { + CHECK_NE(static_cast(memory_type_), + static_cast(MemoryType::kDevice)) + << "Can't use push_back on device memory."; + + if (buffer_capacity_ < buffer_size_ + 1) { + // Do the vector thing and reserve double. + reserve(std::max(buffer_size_ * 2, buffer_size_ + 1)); + } + // Add the new value to the end. + buffer_[buffer_size_] = value; + buffer_size_++; +} + +template +typename unified_vector::iterator unified_vector::begin() { + return iterator(&buffer_[0]); +} + +template +typename unified_vector::iterator unified_vector::end() { + return iterator(&buffer_[buffer_size_]); +} + +template +typename unified_vector::const_iterator unified_vector::cbegin() const { + return const_iterator(&buffer_[0]); +} + +template +typename unified_vector::const_iterator unified_vector::cend() const { + return const_iterator(&buffer_[buffer_size_]); +} + +template +void unified_vector::setZero() { + // It is safe to use cudaMemset since the memory is ALWAYS allocated with + // cudaMalloc. 
+  checkCudaErrors(cudaMemset(buffer_, 0, buffer_size_ * sizeof(T)));
+}
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/impl/variadic_template_tools_impl.h b/nvblox/include/nvblox/core/impl/variadic_template_tools_impl.h
new file mode 100644
index 00000000..b8d2322b
--- /dev/null
+++ b/nvblox/include/nvblox/core/impl/variadic_template_tools_impl.h
@@ -0,0 +1,43 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+
+// Unique types: true iff no two types in the pack are the same.
+template <typename... Ts>
+struct unique_types;
+
+template <typename T1, typename T2, typename... Ts>
+struct unique_types<T1, T2, Ts...> {
+  static constexpr bool value = unique_types<T1, T2>::value &&
+                                unique_types<T1, Ts...>::value &&
+                                unique_types<T2, Ts...>::value;
+};
+
+template <typename T1, typename T2>
+struct unique_types<T1, T2> {
+  static constexpr bool value = !std::is_same<T1, T2>::value;
+};
+
+template <typename T1>
+struct unique_types<T1> {
+  static constexpr bool value = true;
+};
+
+template <>
+struct unique_types<> {
+  static constexpr bool value = true;
+};
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/indexing.h b/nvblox/include/nvblox/core/indexing.h
new file mode 100644
index 00000000..2a8d8834
--- /dev/null
+++ b/nvblox/include/nvblox/core/indexing.h
@@ -0,0 +1,50 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include "nvblox/core/types.h"
+
+namespace nvblox {
+
+// Assuming a fixed-size voxel block, get the voxel index of a voxel at that
+// position within a block.
+__host__ __device__ inline Index3D getVoxelIndexFromPositionInLayer(
+    const float block_size, const Vector3f& position);
+
+__host__ __device__ inline Index3D getBlockIndexFromPositionInLayer(
+    const float block_size, const Vector3f& position);
+
+__host__ __device__ inline void getBlockAndVoxelIndexFromPositionInLayer(
+    const float block_size, const Vector3f& position, Index3D* block_idx,
+    Index3D* voxel_idx);
+
+/// Gets the position of the minimum corner (i.e., the corner closest to
+/// negative infinity along each axis).
+__host__ __device__ inline Vector3f getPositionFromBlockIndexAndVoxelIndex(
+    const float block_size, const Index3D& block_index,
+    const Index3D& voxel_index);
+
+__host__ __device__ inline Vector3f getPositionFromBlockIndex(
+    const float block_size, const Index3D& block_index);
+
+/// Gets the CENTER of the voxel.
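+/// Worked example (hypothetical numbers): with block_size = 0.4 m, i.e.
+/// voxel_size = 0.05 m, block (1, 0, 0) and voxel (2, 0, 0) have their
+/// minimum corner at x = 0.4 + 2 * 0.05 = 0.5 m, and this function returns
+/// the center at x = 0.5 + 0.025 = 0.525 m.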
+__host__ __device__ inline Vector3f getCenterPostionFromBlockIndexAndVoxelIndex( + const float block_size, const Index3D& block_index, + const Index3D& voxel_index); + +} // namespace nvblox + +#include "nvblox/core/impl/indexing_impl.h" diff --git a/nvblox/include/nvblox/core/interpolation_2d.h b/nvblox/include/nvblox/core/interpolation_2d.h new file mode 100644 index 00000000..3fdfdb17 --- /dev/null +++ b/nvblox/include/nvblox/core/interpolation_2d.h @@ -0,0 +1,52 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/image.h" +#include "nvblox/core/types.h" + +namespace nvblox { +namespace interpolation { + +// CPU interfaces +template +__host__ inline bool interpolate2DClosest(const Image& frame, + const Vector2f& u_px, + ElementType* value_interpolated_ptr); +template +__host__ inline bool interpolate2DLinear(const Image& frame, + const Vector2f& u_px, + ElementType* value_interpolated_ptr); +template +__host__ bool interpolate2D( + const Image& frame, const Vector2f& u_px, + ElementType* value_interpolated_ptr, + const InterpolationType type = InterpolationType::kLinear); + +// GPU interfaces +template +__host__ __device__ inline bool interpolate2DClosest( + const ElementType* frame, const Vector2f& u_px, const int rows, + const int cols, ElementType* value_interpolated_ptr); +template +__host__ __device__ inline bool interpolate2DLinear( + const ElementType* frame, const Vector2f& u_px, const int rows, + const int cols, ElementType* value_interpolated_ptr); + +} // namespace interpolation +} // namespace nvblox + +#include "nvblox/core/impl/interpolation_2d_impl.h" diff --git a/nvblox/include/nvblox/core/interpolation_3d.h b/nvblox/include/nvblox/core/interpolation_3d.h new file mode 100644 index 00000000..05e491e7 --- /dev/null +++ b/nvblox/include/nvblox/core/interpolation_3d.h @@ -0,0 +1,43 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/nvblox/include/nvblox/core/interpolation_3d.h b/nvblox/include/nvblox/core/interpolation_3d.h
new file mode 100644
index 00000000..05e491e7
--- /dev/null
+++ b/nvblox/include/nvblox/core/interpolation_3d.h
@@ -0,0 +1,43 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <vector>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/voxels.h"
+
+namespace nvblox {
+namespace interpolation {
+
+// Single points
+bool interpolateOnCPU(const Vector3f& p_L, const TsdfLayer& layer,
+                      float* distance);
+bool interpolateOnCPU(const Vector3f& p_L, const EsdfLayer& layer,
+                      float* distance);
+
+// Vectors of points
+template <typename VoxelType>
+void interpolateOnCPU(const std::vector<Vector3f>& points_L,
+                      const VoxelBlockLayer<VoxelType>& layer,
+                      std::vector<float>* distances_ptr,
+                      std::vector<bool>* success_flags_ptr);
+
+}  // namespace interpolation
+}  // namespace nvblox
+
+#include "nvblox/core/impl/interpolation_3d_impl.h"
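+
+// Usage sketch (illustrative, not part of the original header; assumes a
+// TsdfLayer named tsdf_layer that has already had data integrated into it):
+//
+//   const Vector3f p_L(1.0f, 2.0f, 0.5f);
+//   float distance;
+//   if (interpolation::interpolateOnCPU(p_L, tsdf_layer, &distance)) {
+//     // distance is interpolated from the voxels surrounding p_L.
+//   }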
diff --git a/nvblox/include/nvblox/core/iterator.h b/nvblox/include/nvblox/core/iterator.h
new file mode 100644
index 00000000..83af41c2
--- /dev/null
+++ b/nvblox/include/nvblox/core/iterator.h
@@ -0,0 +1,97 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <iterator>
+
+namespace nvblox {
+
+template <typename T>
+struct RawIterator {
+  using iterator_category = std::forward_iterator_tag;
+  using difference_type = std::ptrdiff_t;
+  using value_type = T;
+  using pointer = T*;
+  using reference = T&;
+
+  __host__ __device__ RawIterator(pointer ptr) : ptr_(ptr) {}
+
+  __host__ __device__ reference operator*() const { return *ptr_; }
+  __host__ __device__ pointer operator->() { return ptr_; }
+  __host__ __device__ RawIterator& operator++() {
+    ptr_++;
+    return *this;
+  }
+  __host__ __device__ RawIterator& operator--() {
+    ptr_--;
+    return *this;
+  }
+  __host__ __device__ RawIterator operator++(int) {
+    RawIterator tmp = *this;
+    ++(*this);
+    return tmp;
+  }
+  __host__ __device__ RawIterator& operator+=(
+      const difference_type& movement) {
+    ptr_ += movement;
+    return *this;
+  }
+
+  // Assignment through the iterator; CUB requires this to use RawIterator as
+  // an output iterator.
+  __host__ __device__ RawIterator& operator=(const T& value) {
+    *ptr_ = value;
+    return *this;
+  }
+
+  __host__ __device__ RawIterator
+  operator+(const difference_type& movement) const {
+    pointer old_ptr = ptr_;
+    old_ptr += movement;
+    return RawIterator(old_ptr);
+  }
+  __host__ __device__ RawIterator
+  operator-(const difference_type& movement) const {
+    pointer old_ptr = ptr_;
+    old_ptr -= movement;
+    return RawIterator(old_ptr);
+  }
+
+  __host__ __device__ RawIterator
+  operator[](const difference_type& movement) const {
+    pointer old_ptr = ptr_;
+    old_ptr += movement;
+    return RawIterator(old_ptr);
+  }
+
+  // These two are needed for thrust:
+  __host__ __device__ operator T*() const { return ptr_; }
+  __host__ __device__ operator T() const { return *ptr_; }
+
+  __host__ __device__ friend bool operator==(const RawIterator& a,
+                                             const RawIterator& b) {
+    return a.ptr_ == b.ptr_;
+  };
+  __host__ __device__ friend bool operator!=(const RawIterator& a,
+                                             const RawIterator& b) {
+    return a.ptr_ != b.ptr_;
+  };
+
+ private:
+  pointer ptr_;
+};
+
+}  // namespace nvblox
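+
+// Usage sketch (illustrative, not part of the original header). RawIterator
+// simply walks a raw buffer, which lets iterator-based algorithms consume
+// device or unified memory:
+//
+//   float buffer[4] = {1.f, 2.f, 3.f, 4.f};
+//   RawIterator<float> it(buffer);
+//   RawIterator<float> end(buffer + 4);
+//   float sum = 0.f;
+//   for (; it != end; ++it) {
+//     sum += *it;  // sum == 10.f afterwards
+//   }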
diff --git a/nvblox/include/nvblox/core/layer.h b/nvblox/include/nvblox/core/layer.h
new file mode 100644
index 00000000..04d81460
--- /dev/null
+++ b/nvblox/include/nvblox/core/layer.h
@@ -0,0 +1,192 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/hash.h"
+#include "nvblox/core/traits.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/unified_ptr.h"
+#include "nvblox/gpu_hash/gpu_layer_view.h"
+
+namespace nvblox {
+
+class BaseLayer {
+ public:
+  typedef std::shared_ptr<BaseLayer> Ptr;
+  typedef std::shared_ptr<const BaseLayer> ConstPtr;
+
+  virtual ~BaseLayer() = default;
+
+  // Just an interface class
+};
+
+template <typename _BlockType>
+class BlockLayer : public BaseLayer {
+ public:
+  typedef std::shared_ptr<BlockLayer> Ptr;
+  typedef std::shared_ptr<const BlockLayer> ConstPtr;
+
+  // Check that custom block types implement allocate
+  static_assert(traits::has_allocate<_BlockType>::value,
+                "BlockType must implement an allocate() function.");
+
+  // Allows inspection of the contained BlockType through LayerType::BlockType
+  typedef _BlockType BlockType;
+  typedef BlockLayer<_BlockType> LayerType;
+  typedef GPULayerView<BlockType> GPULayerViewType;
+
+  typedef typename Index3DHashMapType<typename BlockType::Ptr>::type BlockHash;
+
+  BlockLayer() = delete;
+  BlockLayer(float block_size, MemoryType memory_type)
+      : block_size_(block_size),
+        memory_type_(memory_type),
+        gpu_layer_view_up_to_date_(false) {}
+  virtual ~BlockLayer() {}
+
+  // (Deep) Copy disabled
+  // NOTE(alexmillane): We could write these if needed in the future
+  BlockLayer(const BlockLayer& other) = delete;
+  BlockLayer& operator=(const BlockLayer& other) = delete;
+
+  // Move operations
+  BlockLayer(BlockLayer&& other) = default;
+  BlockLayer& operator=(BlockLayer&& other) = default;
+
+  // Block accessors by index.
+  typename BlockType::Ptr getBlockAtIndex(const Index3D& index);
+  typename BlockType::ConstPtr getBlockAtIndex(const Index3D& index) const;
+  typename BlockType::Ptr allocateBlockAtIndex(const Index3D& index);
+
+  // Block accessors by position.
+  typename BlockType::Ptr getBlockAtPosition(const Vector3f& position);
+  typename BlockType::ConstPtr getBlockAtPosition(
+      const Vector3f& position) const;
+  typename BlockType::Ptr allocateBlockAtPosition(const Vector3f& position);
+
+  // Get all block indices.
+  std::vector<Index3D> getAllBlockIndices() const;
+
+  // Check if allocated
+  bool isBlockAllocated(const Index3D& index) const;
+
+  __host__ __device__ float block_size() const { return block_size_; }
+  int numAllocatedBlocks() const { return blocks_.size(); }
+
+  // Clear the layer of all data
+  void clear() { blocks_.clear(); }
+
+  MemoryType memory_type() const { return memory_type_; }
+
+  // GPU Hash
+  // Note(alexmillane): The hash returned here is invalidated by calls to
+  // allocateBlock
+  GPULayerViewType getGpuLayerView() const;
+
+ protected:
+  const float block_size_;
+  const MemoryType memory_type_;
+
+  // CPU Hash (Index3D -> BlockType::Ptr)
+  BlockHash blocks_;
+
+  /// GPU Hash
+  /// NOTE(alexmillane):
+  /// - This hash is subservient to the CPU version. The layer has to copy the
+  ///   hash to GPU when it is requested.
+  /// - Cached such that if no blocks are allocated between requests, the
+  ///   GPULayerView is not recopied.
+  /// - Lazily allocated (space allocated on the GPU at first request)
+  /// - The "mutable" here is to enable caching in const member functions.
+  mutable bool gpu_layer_view_up_to_date_;
+  mutable std::unique_ptr<GPULayerViewType> gpu_layer_view_;
+};
+
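+// Usage sketch (illustrative, not part of the original header; uses the
+// VoxelBlockLayer specialization defined just below):
+//
+//   VoxelBlockLayer<TsdfVoxel> layer(0.05f, MemoryType::kUnified);
+//   const Index3D block_idx(0, 0, 0);
+//   auto block = layer.allocateBlockAtIndex(block_idx);
+//   CHECK(layer.isBlockAllocated(block_idx));
+//   std::vector<Index3D> indices = layer.getAllBlockIndices();
+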
+template <typename VoxelType>
+class VoxelBlockLayer : public BlockLayer<VoxelBlock<VoxelType>> {
+ public:
+  typedef std::shared_ptr<VoxelBlockLayer> Ptr;
+  typedef std::shared_ptr<const VoxelBlockLayer> ConstPtr;
+
+  using Base = BlockLayer<VoxelBlock<VoxelType>>;
+
+  using VoxelBlockType = VoxelBlock<VoxelType>;
+
+  /// Constructor
+  /// @param voxel_size The size of each voxel
+  /// @param memory_type In which type of memory the blocks in this layer
+  /// should be stored.
+  VoxelBlockLayer(float voxel_size, MemoryType memory_type)
+      : BlockLayer<VoxelBlockType>(VoxelBlockType::kVoxelsPerSide * voxel_size,
+                                   memory_type),
+        voxel_size_(voxel_size) {}
+  VoxelBlockLayer() = delete;
+  virtual ~VoxelBlockLayer() {}
+
+  // (Deep) Copy disabled
+  // NOTE(alexmillane): We could write these if needed in the future
+  VoxelBlockLayer(const VoxelBlockLayer& other) = delete;
+  VoxelBlockLayer& operator=(const VoxelBlockLayer& other) = delete;
+
+  // Move operations
+  VoxelBlockLayer(VoxelBlockLayer&& other) = default;
+  VoxelBlockLayer& operator=(VoxelBlockLayer&& other) = default;
+
+  /// Gets voxels by copy from a list of positions.
+  /// The positions are given with respect to the layer frame (L). The function
+  /// returns the closest voxels to the passed points.
+  /// If memory_type_ == kDevice, the function retrieves voxel data from the
+  /// GPU and transfers it to the CPU. Modifications to the returned voxel data
+  /// do not affect the layer (they're copies).
+  /// Note that this function performs a cudaMemcpy per voxel, so it will
+  /// likely be relatively slow.
+  /// @param positions_L query positions in layer frame
+  /// @param voxels_ptr a pointer to a vector of voxels where we'll store the
+  /// output
+  /// @param success_flags_ptr a pointer to a vector of flags indicating if we
+  /// were able to retrieve each voxel.
+  void getVoxels(const std::vector<Vector3f>& positions_L,
+                 std::vector<VoxelType>* voxels_ptr,
+                 std::vector<bool>* success_flags_ptr) const;
+
+  /// Returns the size of the voxels in this layer.
+  float voxel_size() const { return voxel_size_; }
+
+ private:
+  const float voxel_size_;
+};
+
+namespace traits {
+
+template <typename LayerType>
+struct is_voxel_layer : public std::false_type {};
+
+template <typename VoxelType>
+struct is_voxel_layer<VoxelBlockLayer<VoxelType>> : public std::true_type {};
+
+}  // namespace traits
+
+// Returns voxel size or block size based on the layer type (at compile time)
+template <typename LayerType>
+constexpr float sizeArgumentFromVoxelSize(float voxel_size);
+
+}  // namespace nvblox
+
+#include "nvblox/core/impl/layer_impl.h"
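+
+// Usage sketch (illustrative, not part of the original header; TsdfLayer is
+// assumed to be the VoxelBlockLayer<TsdfVoxel> alias from common_names.h):
+//
+//   std::vector<Vector3f> positions_L = {{0.1f, 0.1f, 0.1f}};
+//   std::vector<TsdfVoxel> voxels;
+//   std::vector<bool> success_flags;
+//   tsdf_layer.getVoxels(positions_L, &voxels, &success_flags);
+//   if (success_flags[0]) {
+//     // voxels[0].distance and voxels[0].weight are host-side copies.
+//   }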
diff --git a/nvblox/include/nvblox/core/layer_cake.h b/nvblox/include/nvblox/core/layer_cake.h
new file mode 100644
index 00000000..9b04c236
--- /dev/null
+++ b/nvblox/include/nvblox/core/layer_cake.h
@@ -0,0 +1,78 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <memory>
+#include <typeindex>
+#include <unordered_map>
+
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+
+namespace nvblox {
+
+class LayerCake {
+ public:
+  LayerCake() = default;
+  LayerCake(float voxel_size) : voxel_size_(voxel_size) {}
+
+  // Deep Copy (disallowed for now)
+  LayerCake(const LayerCake& other) = delete;
+  LayerCake& operator=(const LayerCake& other) = delete;
+
+  // Move
+  LayerCake(LayerCake&& other) = default;
+  LayerCake& operator=(LayerCake&& other) = default;
+
+  template <typename LayerType>
+  LayerType* add(MemoryType memory_type);
+
+  // Retrieve layers (as pointers)
+  template <typename LayerType>
+  LayerType* getPtr();
+  template <typename LayerType>
+  const LayerType* getConstPtr() const;
+
+  // Retrieve layers (as a reference; will fail if the layer doesn't exist)
+  template <typename LayerType>
+  const LayerType& get() const;
+
+  template <typename LayerType>
+  bool exists() const;
+
+  bool empty() const { return layers_.empty(); }
+
+  void clear() { layers_.clear(); }
+
+  // Factory accepting a list of LayerTypes (and MemoryTypes)
+  template <typename... LayerTypes>
+  static LayerCake create(float voxel_size, MemoryType memory_type);
+  template <typename... LayerTypes, typename... MemoryTypes>
+  static LayerCake create(float voxel_size, MemoryTypes... memory_type);
+
+ private:
+  // Params
+  float voxel_size_;
+
+  // Stored layers
+  // Note(alexmillane): Currently we restrict the cake to storing a single
+  // layer of each type.
+  std::unordered_map<std::type_index, std::unique_ptr<BaseLayer>> layers_;
+};
+
+}  // namespace nvblox
+
+#include "nvblox/core/impl/layer_cake_impl.h"
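+
+// Usage sketch (illustrative, not part of the original header):
+//
+//   LayerCake cake =
+//       LayerCake::create<TsdfLayer, ColorLayer>(0.05f, MemoryType::kDevice);
+//   if (cake.exists<TsdfLayer>()) {
+//     TsdfLayer* tsdf_layer = cake.getPtr<TsdfLayer>();
+//   }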
diff --git a/nvblox/include/nvblox/core/mapper.h b/nvblox/include/nvblox/core/mapper.h
new file mode 100644
index 00000000..c828c9ff
--- /dev/null
+++ b/nvblox/include/nvblox/core/mapper.h
@@ -0,0 +1,120 @@
+#pragma once
+
+#include <vector>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/hash.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/layer_cake.h"
+#include "nvblox/core/voxels.h"
+#include "nvblox/integrators/esdf_integrator.h"
+#include "nvblox/integrators/projective_color_integrator.h"
+#include "nvblox/integrators/projective_tsdf_integrator.h"
+#include "nvblox/mesh/mesh_integrator.h"
+
+namespace nvblox {
+
+// The mapper classes wrap layers and integrators together.
+// In the base class we only specify that a mapper should contain map layers
+// and leave it up to sub-classes to add functionality.
+class MapperBase {
+ public:
+  MapperBase() = default;
+
+  // Move
+  MapperBase(MapperBase&& other) = default;
+  MapperBase& operator=(MapperBase&& other) = default;
+
+ protected:
+  // Map layers
+  LayerCake layers_;
+};
+
+// The RgbdMapper class is what we consider the default (but extensible)
+// mapping behaviour in nvblox.
+// Contains:
+// - TsdfLayer, ColorLayer, EsdfLayer, MeshLayer
+// - Functions for integrating depth and color frames
+// - Functions for generating Meshes, ESDF, and ESDF-slices
+class RgbdMapper : public MapperBase {
+ public:
+  RgbdMapper() = delete;
+  RgbdMapper(float voxel_size_m, MemoryType memory_type = MemoryType::kDevice);
+  virtual ~RgbdMapper() {}
+
+  // Move
+  RgbdMapper(RgbdMapper&& other) = default;
+  RgbdMapper& operator=(RgbdMapper&& other) = default;
+
+  void integrateDepth(const DepthImage& depth_frame, const Transform& T_L_C,
+                      const Camera& camera);
+  void integrateColor(const ColorImage& color_frame, const Transform& T_L_C,
+                      const Camera& camera);
+
+  /// Updates the mesh blocks which require an update
+  /// @return The indices of the blocks that were updated in this call.
+  std::vector<Index3D> updateMesh();
+
+  /// Updates the Esdf blocks which require an update
+  /// @return The indices of the blocks that were updated in this call.
+  std::vector<Index3D> updateEsdf();
+  std::vector<Index3D> updateEsdfSlice(float slice_input_z_min,
+                                       float slice_input_z_max,
+                                       float slice_output_z);
+
+  // Const access to layers
+  const LayerCake& layers() const { return layers_; }
+  const TsdfLayer& tsdf_layer() const { return layers_.get<TsdfLayer>(); }
+  const ColorLayer& color_layer() const { return layers_.get<ColorLayer>(); }
+  const EsdfLayer& esdf_layer() const { return layers_.get<EsdfLayer>(); }
+  const MeshLayer& mesh_layer() const { return layers_.get<MeshLayer>(); }
+
+  // Non-const access to layers
+  LayerCake& layers() { return layers_; }
+  TsdfLayer& tsdf_layer() { return *layers_.getPtr<TsdfLayer>(); }
+  ColorLayer& color_layer() { return *layers_.getPtr<ColorLayer>(); }
+  EsdfLayer& esdf_layer() { return *layers_.getPtr<EsdfLayer>(); }
+  MeshLayer& mesh_layer() { return *layers_.getPtr<MeshLayer>(); }
+
+  // Const access to integrators
+  const ProjectiveTsdfIntegrator& tsdf_integrator() const {
+    return tsdf_integrator_;
+  }
+  const ProjectiveColorIntegrator& color_integrator() const {
+    return color_integrator_;
+  }
+  const MeshIntegrator& mesh_integrator() const { return mesh_integrator_; }
+  const EsdfIntegrator& esdf_integrator() const { return esdf_integrator_; }
+
+  // Non-const access to integrators
+  ProjectiveTsdfIntegrator& tsdf_integrator() { return tsdf_integrator_; }
+  ProjectiveColorIntegrator& color_integrator() { return color_integrator_; }
+  MeshIntegrator& mesh_integrator() { return mesh_integrator_; }
+  EsdfIntegrator& esdf_integrator() { return esdf_integrator_; }
+
+ protected:
+  // Params
+  float voxel_size_m_;
+
+  // Behaviour params
+  enum class EsdfMode { k3D, k2D, kUnset };
+  EsdfMode esdf_mode_ = EsdfMode::kUnset;
+
+  // Map layers (defined in base)
+  // For the RgbdMapper class: TSDF, Color, ESDF, Mesh
+  // LayerCake layers_;
+
+  // Integrators
+  ProjectiveTsdfIntegrator tsdf_integrator_;
+  ProjectiveColorIntegrator color_integrator_;
+  MeshIntegrator mesh_integrator_;
+  EsdfIntegrator esdf_integrator_;
+
+  // These queues keep track of the blocks which need to be updated on the
+  // next calls to updateMesh() and updateEsdf() respectively.
+  Index3DSet mesh_blocks_to_update_;
+  Index3DSet esdf_blocks_to_update_;
+};
+
+}  // namespace nvblox
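+
+// Usage sketch (illustrative, not part of the original header; depth_frame,
+// color_frame, T_L_C and camera are assumed to come from the sensor pipeline):
+//
+//   RgbdMapper mapper(0.05f);
+//   mapper.integrateDepth(depth_frame, T_L_C, camera);
+//   mapper.integrateColor(color_frame, T_L_C, camera);
+//   const std::vector<Index3D> mesh_blocks = mapper.updateMesh();
+//   const std::vector<Index3D> esdf_blocks = mapper.updateEsdf();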
diff --git a/nvblox/include/nvblox/core/traits.h b/nvblox/include/nvblox/core/traits.h
new file mode 100644
index 00000000..17bb32f7
--- /dev/null
+++ b/nvblox/include/nvblox/core/traits.h
@@ -0,0 +1,45 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <type_traits>
+#include <utility>
+
+#include "nvblox/core/types.h"
+
+namespace nvblox {
+namespace traits {
+
+// Trait which is true if the BlockType defines an allocate function
+// NOTE(alexmillane): This type trait is not totally complete as it doesn't
+// check the return type of the allocate function... it will do for now. The
+// intent is just to give people implementing new blocks a useful message when
+// they forget to implement this function.
+template <typename... Ts>
+struct make_void {
+  typedef void type;
+};
+template <typename... Ts>
+using void_t = typename make_void<Ts...>::type;
+
+template <typename BlockType, typename = void>
+struct has_allocate : std::false_type {};
+
+template <typename BlockType>
+struct has_allocate<
+    BlockType,
+    void_t<decltype(std::declval<BlockType>().allocate(MemoryType::kUnified))>>
+    : std::true_type {};
+
+}  // namespace traits
+}  // namespace nvblox
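+
+// Usage sketch (illustrative, not part of the original header; MyBlock is a
+// hypothetical custom block type):
+//
+//   struct MyBlock {
+//     typedef unified_ptr<MyBlock> Ptr;
+//     static Ptr allocate(MemoryType memory_type);
+//   };
+//   static_assert(traits::has_allocate<MyBlock>::value,
+//                 "BlockType must implement an allocate() function.");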
diff --git a/nvblox/include/nvblox/core/types.h b/nvblox/include/nvblox/core/types.h
new file mode 100644
index 00000000..e9609da8
--- /dev/null
+++ b/nvblox/include/nvblox/core/types.h
@@ -0,0 +1,59 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+
+#include <string>
+#include <vector>
+
+namespace nvblox {
+
+// Whether the storage or processing is happening on CPU, GPU, or any future
+// amazing hardware-accelerated platform.
+enum class DeviceType { kCPU, kGPU };
+
+// How GPU data is stored: in device-only, unified (host and device), or host
+// memory.
+// NOTE(alexmillane): tag: c++17, switch to constexpr when we move to c++17.
+enum class MemoryType { kDevice, kUnified, kHost };
+inline std::string toString(MemoryType memory_type) {
+  switch (memory_type) {
+    case MemoryType::kDevice:
+      return "kDevice";
+    case MemoryType::kUnified:
+      return "kUnified";
+    default:
+      return "kHost";
+  }
+}
+
+typedef Eigen::Vector3i Index3D;
+typedef Eigen::Vector2i Index2D;
+
+typedef Eigen::Vector3f Vector3f;
+typedef Eigen::Vector2f Vector2f;
+
+typedef Eigen::AlignedBox3f AxisAlignedBoundingBox;
+
+typedef Eigen::Isometry3f Transform;
+
+// Aligned Eigen containers
+template <typename Type>
+using AlignedVector = std::vector<Type, Eigen::aligned_allocator<Type>>;
+
+enum class InterpolationType { kNearestNeighbor, kLinear, kMax, kMin };
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/core/unified_ptr.h b/nvblox/include/nvblox/core/unified_ptr.h
new file mode 100644
index 00000000..c8adeae8
--- /dev/null
+++ b/nvblox/include/nvblox/core/unified_ptr.h
@@ -0,0 +1,188 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <cuda_runtime.h>
+#include <glog/logging.h>
+
+#include <atomic>
+#include <type_traits>
+
+#include "nvblox/core/types.h"
+
+namespace nvblox {
+
+// shared_ptr for device, unified memory, and pinned host memory
+// Things to be aware of
+// - Single objects
+//   - Constructor and Destructor are not called when memory_type==kDevice
+//     (these are CPU functions). Therefore unified_ptr generates an error if
+//     used in device mode with non-trivially destructible types.
+//   - Both Constructor and Destructor are called when memory_type==kUnified ||
+//     kHost.
+// - Arrays
+//   - Default Constructor called when storing arrays in kUnified || kHost.
+//   - No Constructor called in kDevice mode.
+//   - No destructor called for ANY memory setting so we make a static check
+//     that objects are trivially destructible.
+template <typename T>
+class unified_ptr {
+ public:
+  typedef typename std::remove_extent<T>::type T_noextent;
+  typedef typename std::remove_cv<T>::type T_nonconst;
+
+  static constexpr MemoryType kDefaultMemoryType = MemoryType::kUnified;
+
+  unified_ptr();
+  explicit unified_ptr(T_noextent* ptr, MemoryType memory_type,
+                       size_t size = 1);
+  unified_ptr(const unified_ptr& other);
+  unified_ptr(unified_ptr&& other);
+  ~unified_ptr();
+
+  // Operator =
+  unified_ptr& operator=(const unified_ptr& other);
+  unified_ptr& operator=(unified_ptr&& other);
+
+  // Operator bool
+  operator bool() const;
+  // Operator comparison
+  bool operator==(const unified_ptr& other) const;
+  bool operator!=(const unified_ptr& other) const;
+
+  // Operator dereference
+  T_noextent* operator->();
+  const T_noextent* operator->() const;
+  T& operator*();
+  const T& operator*() const;
+
+  // Operator array access
+  // Only enabled if the underlying type is an array
+  template <typename T1 = T,
+            std::enable_if_t<std::is_array<T1>::value, bool> = true>
+  T_noextent& operator[](size_t i) {
+    CHECK(memory_type_ != MemoryType::kDevice);
+    return ptr_[i];
+  }
+  template <typename T1 = T,
+            std::enable_if_t<std::is_array<T1>::value, bool> = true>
+  const T_noextent& operator[](size_t i) const {
+    CHECK(memory_type_ != MemoryType::kDevice);
+    return ptr_[i];
+  }
+
+  // Operator convert
+  // Only enabled if the base type is NOT const, otherwise adds a second
+  // trivial converter.
+  template <typename T2 = T,
+            std::enable_if_t<!std::is_const<T2>::value, bool> = true>
+  operator unified_ptr<const T2>() const;
+
+  // Get the raw pointer.
+  T_noextent* get();
+  const T_noextent* get() const;
+
+  // Reset the pointer to point to nothing.
+  void reset();
+
+  // Copy the underlying object (potentially to another memory location)
+  // NOTE: This is implemented as a memcpy at the pointed to location.
+  unified_ptr clone() const;
+  unified_ptr clone(MemoryType memory_type) const;
+
+  // Hint to move the memory to the GPU or CPU.
+  void toGPU();
+  void toGPU(cudaStream_t cuda_stream);
+  void toCPU();
+  // Advise CUDA to prefer the GPU for this memory. Should reduce paging fails.
+  void preferGPU();
+
+  MemoryType memory_type() const { return memory_type_; }
+
+  // Helper function to memset all the memory to 0.
+  void setZero();
+
+  friend class unified_ptr<T_nonconst>;
+  friend class unified_ptr<const T_nonconst>;
+
+ private:
+  MemoryType memory_type_;
+  size_t size_;
+
+  T_noextent* ptr_;
+
+  mutable std::atomic<int>* ref_counter_;
+};
+
+// Comparison with nullptr
+template <typename T>
+bool operator==(const unified_ptr<T>& lhs, std::nullptr_t) noexcept {
+  return !lhs;
+}
+template <typename T>
+bool operator==(std::nullptr_t, const unified_ptr<T>& rhs) noexcept {
+  return !rhs;
+}
+template <typename T>
+bool operator!=(const unified_ptr<T>& lhs, std::nullptr_t) noexcept {
+  return lhs;
+}
+template <typename T>
+bool operator!=(std::nullptr_t, const unified_ptr<T>& rhs) noexcept {
+  return rhs;
+}
+
+// These are used to specialize make_unified on the return type, which is
+// needed to generate class templates for non-array and array types. See gcc
+// here:
+// https://github.com/gcc-mirror/gcc/blob/master/libstdc%2B%2B-v3/include/bits/unique_ptr.h#L943
+template <typename T>
+struct _Unified_if {
+  typedef unified_ptr<T> _Single_object;
+};
+
+template <typename T>
+struct _Unified_if<T[]> {
+  typedef unified_ptr<T[]> _Unknown_bound;
+};
+
+template <typename T, size_t Bound>
+struct _Unified_if<T[Bound]> {
+  typedef void _Known_bound;
+};
+
+// Single object, default storage location
+template <typename T, typename... Args>
+typename _Unified_if<T>::_Single_object make_unified(Args&&... args);
+
+// Specialization, Single object, specified storage location
+template <typename T, typename... Args>
+typename _Unified_if<T>::_Single_object make_unified(MemoryType memory_type,
+                                                     Args&&... args);
+
+// Specialization for arrays, default storage location
+template <typename T>
+typename _Unified_if<T>::_Unknown_bound make_unified(std::size_t size);
+
+// Specialization for arrays, specified storage location
+template <typename T>
+typename _Unified_if<T>::_Unknown_bound make_unified(std::size_t size,
+                                                     MemoryType memory_type);
+
+template <typename T, typename... Args>
+typename _Unified_if<T>::_Known_bound make_unified(Args&&... args) = delete;
+
+}  // namespace nvblox
+
+#include "nvblox/core/impl/unified_ptr_impl.h"
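+
+// Usage sketch (illustrative, not part of the original header):
+//
+//   // Single object in unified memory, constructed with a value:
+//   auto value = make_unified<int>(MemoryType::kUnified, 42);
+//   CHECK_EQ(*value, 42);  // accessible from the host
+//
+//   // Array of 100 floats in device-only memory, then zeroed:
+//   auto array = make_unified<float[]>(100, MemoryType::kDevice);
+//   array.setZero();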
diff --git a/nvblox/include/nvblox/core/unified_vector.h b/nvblox/include/nvblox/core/unified_vector.h
new file mode 100644
index 00000000..4cab8e24
--- /dev/null
+++ b/nvblox/include/nvblox/core/unified_vector.h
@@ -0,0 +1,177 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <cuda_runtime.h>
+
+#include <type_traits>
+#include <vector>
+
+#include "nvblox/core/iterator.h"
+#include "nvblox/core/types.h"
+
+namespace nvblox {
+
+/// Unified-memory CUDA vector that should only be used on trivial types
+/// as the constructors and destructors *are NOT called*.
+template <typename T>
+class unified_vector {
+ public:
+  typedef RawIterator<T> iterator;
+  typedef RawIterator<const T> const_iterator;
+
+  static constexpr MemoryType kDefaultMemoryType = MemoryType::kUnified;
+
+  // Static asserts on the type.
+  static_assert(
+      std::is_default_constructible<T>::value,
+      "Objects stored in unified vector should be default constructible.");
+
+  unified_vector(MemoryType memory_type = kDefaultMemoryType);
+  unified_vector(size_t size, MemoryType memory_type = kDefaultMemoryType);
+  unified_vector(size_t size, const T& initial,
+                 MemoryType memory_type = kDefaultMemoryType);
+  // Copy constructor.
+  unified_vector(const unified_vector<T>& other,
+                 MemoryType memory_type = kDefaultMemoryType);
+  unified_vector(const std::vector<T>& other,
+                 MemoryType memory_type = kDefaultMemoryType);
+
+  // Move constructor.
+  unified_vector(unified_vector<T>&& other);
+
+  // Destructor
+  virtual ~unified_vector();
+
+  // Operators.
+  T& operator[](size_t index);
+  const T& operator[](size_t index) const;
+  unified_vector& operator=(const unified_vector<T>& other);
+  unified_vector& operator=(unified_vector<T>&& other);
+  unified_vector& operator=(const std::vector<T>& other);
+
+  // Convert to an std::vector. Creates a copy.
+  std::vector<T> toVector() const;
+
+  // Get raw pointers. This also works for GPU pointers.
+  T* data();
+  const T* data() const;
+
+  // Hint to move the memory to the GPU or CPU.
+  void toGPU();
+  void toCPU();
+
+  // Access information.
+  size_t capacity() const;
+  size_t size() const;
+  bool empty() const;
+
+  // Changing the size.
+  void reserve(size_t capacity);
+  void resize(size_t size);
+  void clear();
+
+  // Adding elements.
+  void push_back(const T& value);
+
+  // Iterator access.
+  iterator begin();
+  iterator end();
+  const_iterator begin() const { return cbegin(); }
+  const_iterator end() const { return cend(); }
+  const_iterator cbegin() const;
+  const_iterator cend() const;
+
+  // Get the memory type.
+  MemoryType memory_type() const { return memory_type_; }
+
+  // Set the entire *memory* of the vector to zero.
+  void setZero();
+
+ private:
+  MemoryType memory_type_;
+
+  T* buffer_;
+  size_t buffer_size_;
+  size_t buffer_capacity_;
+};
+
+template <typename T>
+class device_vector : public unified_vector<T> {
+ public:
+  device_vector() : unified_vector<T>(MemoryType::kDevice) {}
+  device_vector(MemoryType memory_type) = delete;
+  device_vector(size_t size) : unified_vector<T>(size, MemoryType::kDevice) {}
+  device_vector(size_t size, const T& initial)
+      : unified_vector<T>(size, initial, MemoryType::kDevice) {}
+
+  // Conversions from base
+  device_vector(const unified_vector<T>& other)
+      : unified_vector<T>(other, MemoryType::kDevice) {}
+  device_vector(unified_vector<T>&& other)
+      : unified_vector<T>(std::move(other), MemoryType::kDevice) {}
+  device_vector& operator=(const unified_vector<T>& other) {
+    unified_vector<T>::operator=(other);
+    return *this;
+  }
+  device_vector& operator=(unified_vector<T>&& other) {
+    unified_vector<T>::operator=(std::move(other));
+    return *this;
+  }
+
+  // From std::vector
+  device_vector(const std::vector<T>& other)
+      : unified_vector<T>(other, MemoryType::kDevice) {}
+  device_vector& operator=(const std::vector<T>& other) {
+    unified_vector<T>::operator=(other);
+    return *this;
+  }
+};
+
+template <typename T>
+class host_vector : public unified_vector<T> {
+ public:
+  host_vector() : unified_vector<T>(MemoryType::kHost) {}
+  host_vector(MemoryType memory_type) = delete;
+  host_vector(size_t size) : unified_vector<T>(size, MemoryType::kHost) {}
+  host_vector(size_t size, const T& initial)
+      : unified_vector<T>(size, initial, MemoryType::kHost) {}
+
+  // Conversions from base
+  host_vector(const unified_vector<T>& other)
+      : unified_vector<T>(other, MemoryType::kHost) {}
+  host_vector(unified_vector<T>&& other)
+      : unified_vector<T>(std::move(other), MemoryType::kHost) {}
+  host_vector& operator=(const unified_vector<T>& other) {
+    unified_vector<T>::operator=(other);
+    return *this;
+  }
+  host_vector& operator=(unified_vector<T>&& other) {
+    unified_vector<T>::operator=(std::move(other));
+    return *this;
+  }
+
+  // From std::vector
+  host_vector(const std::vector<T>& other)
+      : unified_vector<T>(other, MemoryType::kHost) {}
+  host_vector& operator=(const std::vector<T>& other) {
+    unified_vector<T>::operator=(other);
+    return *this;
+  }
+};
+
+}  // namespace nvblox
+
+#include "nvblox/core/impl/unified_vector_impl.h"
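+
+// Usage sketch (illustrative, not part of the original header):
+//
+//   unified_vector<int> vec(MemoryType::kUnified);
+//   for (int i = 0; i < 10; i++) {
+//     vec.push_back(i);
+//   }
+//   // vec.data() can be passed directly to a CUDA kernel, e.g.:
+//   // someKernel<<<1, 32>>>(vec.data(), vec.size());
+//
+//   // Deep copy into device-only memory:
+//   device_vector<int> vec_device(vec);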
diff --git a/nvblox/include/nvblox/core/variadic_template_tools.h b/nvblox/include/nvblox/core/variadic_template_tools.h
new file mode 100644
index 00000000..8fe5d870
--- /dev/null
+++ b/nvblox/include/nvblox/core/variadic_template_tools.h
@@ -0,0 +1,26 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+
+// Unique types
+template <typename... Types>
+struct unique_types;
+
+}  // namespace nvblox
+
+#include "nvblox/core/impl/variadic_template_tools_impl.h"
diff --git a/nvblox/include/nvblox/core/voxels.h b/nvblox/include/nvblox/core/voxels.h
new file mode 100644
index 00000000..9a98bf1e
--- /dev/null
+++ b/nvblox/include/nvblox/core/voxels.h
@@ -0,0 +1,56 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <Eigen/Core>
+
+#include "nvblox/core/color.h"
+
+namespace nvblox {
+
+struct TsdfVoxel {
+  // Signed projective distance of the voxel from a surface.
+  float distance = 0.0f;
+  // How many observations/how confident we are in this observation.
+  float weight = 0.0f;
+};
+
+struct EsdfVoxel {
+  // TODO(helen): optimize the memory layout here.
+  // Cached squared distance towards the parent.
+  float squared_distance_vox = 0.0f;
+  // Direction towards the parent, *in units of voxels*.
+  Eigen::Vector3i parent_direction = Eigen::Vector3i::Zero();
+  // Whether this voxel is inside the surface or not.
+  bool is_inside = false;
+  // Whether this voxel has been observed.
+  bool observed = false;
+  // Whether this voxel is a "site": i.e., near the zero-crossing and is
+  // eligible to be considered a parent.
+  bool is_site = false;
+};
+
+struct ColorVoxel {
+  Color color = Color::Gray();
+  // How many observations/how confident we are in this observation.
+  float weight = 0.0f;
+};
+
+struct FreespaceVoxel {
+  bool free = true;
+};
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/datasets/external/stb_image.h b/nvblox/include/nvblox/datasets/external/stb_image.h
new file mode 100644
index 00000000..a1f18190
--- /dev/null
+++ b/nvblox/include/nvblox/datasets/external/stb_image.h
@@ -0,0 +1,7890 @@
+/* stb_image - v2.27 - public domain image loader - http://nothings.org/stb
+   no warranty implied; use at your own risk
+
+   Do this:
+      #define STB_IMAGE_IMPLEMENTATION
+   before you include this file in *one* C or C++ file to create the
+   implementation.
+
+   // i.e. it should look like this:
+   #include ...
+   #include ...
+   #include ...
+   #define STB_IMAGE_IMPLEMENTATION
+   #include "stb_image.h"
+
+   You can #define STBI_ASSERT(x) before the #include to avoid using assert.h.
+ And #define STBI_MALLOC, STBI_REALLOC, and STBI_FREE to avoid using malloc,realloc,free + + + QUICK NOTES: + Primarily of interest to game developers and other people who can + avoid problematic images and only need the trivial interface + + JPEG baseline & progressive (12 bpc/arithmetic not supported, same as stock IJG lib) + PNG 1/2/4/8/16-bit-per-channel + + TGA (not sure what subset, if a subset) + BMP non-1bpp, non-RLE + PSD (composited view only, no extra channels, 8/16 bit-per-channel) + + GIF (*comp always reports as 4-channel) + HDR (radiance rgbE format) + PIC (Softimage PIC) + PNM (PPM and PGM binary only) + + Animated GIF still needs a proper API, but here's one way to do it: + http://gist.github.com/urraka/685d9a6340b26b830d49 + + - decode from memory or through FILE (define STBI_NO_STDIO to remove code) + - decode from arbitrary I/O callbacks + - SIMD acceleration on x86/x64 (SSE2) and ARM (NEON) + + Full documentation under "DOCUMENTATION" below. + + +LICENSE + + See end of file for license information. + +RECENT REVISION HISTORY: + + 2.27 (2021-07-11) document stbi_info better, 16-bit PNM support, bug fixes + 2.26 (2020-07-13) many minor fixes + 2.25 (2020-02-02) fix warnings + 2.24 (2020-02-02) fix warnings; thread-local failure_reason and flip_vertically + 2.23 (2019-08-11) fix clang static analysis warning + 2.22 (2019-03-04) gif fixes, fix warnings + 2.21 (2019-02-25) fix typo in comment + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) bugfix, 1-bit BMP, 16-bitness query, fix warnings + 2.16 (2017-07-23) all functions have 16-bit variants; optimizations; bugfixes + 2.15 (2017-03-18) fix png-1,2,4; all Imagenet JPGs; no runtime SSE detection on GCC + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-12-04) experimental 16-bit API, only for PNG so far; fixes + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) 16-bit PNGS; enable SSE2 in non-gcc x64 + RGB-format JPEG; remove white matting in PSD; + allocate large structures on the stack; + correct channel count for PNG & BMP + 2.10 (2016-01-22) avoid warning introduced in 2.09 + 2.09 (2016-01-16) 16-bit TGA; comments in PNM files; STBI_REALLOC_SIZED + + See end of file for full revision history. 
+ + + ============================ Contributors ========================= + + Image formats Extensions, features + Sean Barrett (jpeg, png, bmp) Jetro Lauha (stbi_info) + Nicolas Schulz (hdr, psd) Martin "SpartanJ" Golini (stbi_info) + Jonathan Dummer (tga) James "moose2000" Brown (iPhone PNG) + Jean-Marc Lienher (gif) Ben "Disch" Wenger (io callbacks) + Tom Seddon (pic) Omar Cornut (1/2/4-bit PNG) + Thatcher Ulrich (psd) Nicolas Guillemot (vertical flip) + Ken Miller (pgm, ppm) Richard Mitton (16-bit PSD) + github:urraka (animated gif) Junggon Kim (PNM comments) + Christopher Forseth (animated gif) Daniel Gibson (16-bit TGA) + socks-the-fox (16-bit PNG) + Jeremy Sawicki (handle all ImageNet JPGs) + Optimizations & bugfixes Mikhail Morozov (1-bit BMP) + Fabian "ryg" Giesen Anael Seghezzi (is-16-bit query) + Arseny Kapoulkine Simon Breuss (16-bit PNM) + John-Mark Allen + Carmelo J Fdez-Aguera + + Bug & warning fixes + Marc LeBlanc David Woo Guillaume George Martins Mozeiko + Christpher Lloyd Jerry Jansson Joseph Thomson Blazej Dariusz Roszkowski + Phil Jordan Dave Moore Roy Eltham + Hayaki Saito Nathan Reed Won Chun + Luke Graham Johan Duparc Nick Verigakis the Horde3D community + Thomas Ruf Ronny Chevalier github:rlyeh + Janez Zemva John Bartholomew Michal Cichon github:romigrou + Jonathan Blow Ken Hamada Tero Hanninen github:svdijk + Eugene Golushkov Laurent Gomila Cort Stratton github:snagar + Aruelien Pocheville Sergio Gonzalez Thibault Reuille github:Zelex + Cass Everitt Ryamond Barbiero github:grim210 + Paul Du Bois Engin Manap Aldo Culquicondor github:sammyhw + Philipp Wiesemann Dale Weiler Oriol Ferrer Mesia github:phprus + Josh Tobin Matthew Gregan github:poppolopoppo + Julian Raschke Gregory Mullen Christian Floisand github:darealshinji + Baldur Karlsson Kevin Schmidt JR Smith github:Michaelangel007 + Brad Weinberger Matvey Cherevko github:mosra + Luca Sas Alexander Veselov Zack Middleton [reserved] + Ryan C. Gordon [reserved] [reserved] + DO NOT ADD YOUR NAME HERE + + Jacko Dirks + + To add your name to the credits, pick a random blank space in the middle and fill it. + 80% of merge conflicts on stb PRs are due to people adding their name at the end + of the credits. +*/ + +#ifndef STBI_INCLUDE_STB_IMAGE_H +#define STBI_INCLUDE_STB_IMAGE_H + +// DOCUMENTATION +// +// Limitations: +// - no 12-bit-per-channel JPEG +// - no JPEGs with arithmetic coding +// - GIF always returns *comp=4 +// +// Basic usage (see HDR discussion below for HDR usage): +// int x,y,n; +// unsigned char *data = stbi_load(filename, &x, &y, &n, 0); +// // ... process data if not NULL ... +// // ... x = width, y = height, n = # 8-bit components per pixel ... +// // ... replace '0' with '1'..'4' to force that many components per pixel +// // ... but 'n' will always be the number that it would have been if you said 0 +// stbi_image_free(data) +// +// Standard parameters: +// int *x -- outputs image width in pixels +// int *y -- outputs image height in pixels +// int *channels_in_file -- outputs # of image components in image file +// int desired_channels -- if non-zero, # of image components requested in result +// +// The return value from an image loader is an 'unsigned char *' which points +// to the pixel data, or NULL on an allocation failure or if the image is +// corrupt or invalid. The pixel data consists of *y scanlines of *x pixels, +// with each pixel consisting of N interleaved 8-bit components; the first +// pixel pointed to is top-left-most in the image. 
There is no padding between +// image scanlines or between pixels, regardless of format. The number of +// components N is 'desired_channels' if desired_channels is non-zero, or +// *channels_in_file otherwise. If desired_channels is non-zero, +// *channels_in_file has the number of components that _would_ have been +// output otherwise. E.g. if you set desired_channels to 4, you will always +// get RGBA output, but you can check *channels_in_file to see if it's trivially +// opaque because e.g. there were only 3 channels in the source image. +// +// An output image with N components has the following components interleaved +// in this order in each pixel: +// +// N=#comp components +// 1 grey +// 2 grey, alpha +// 3 red, green, blue +// 4 red, green, blue, alpha +// +// If image loading fails for any reason, the return value will be NULL, +// and *x, *y, *channels_in_file will be unchanged. The function +// stbi_failure_reason() can be queried for an extremely brief, end-user +// unfriendly explanation of why the load failed. Define STBI_NO_FAILURE_STRINGS +// to avoid compiling these strings at all, and STBI_FAILURE_USERMSG to get slightly +// more user-friendly ones. +// +// Paletted PNG, BMP, GIF, and PIC images are automatically depalettized. +// +// To query the width, height and component count of an image without having to +// decode the full file, you can use the stbi_info family of functions: +// +// int x,y,n,ok; +// ok = stbi_info(filename, &x, &y, &n); +// // returns ok=1 and sets x, y, n if image is a supported format, +// // 0 otherwise. +// +// Note that stb_image pervasively uses ints in its public API for sizes, +// including sizes of memory buffers. This is now part of the API and thus +// hard to change without causing breakage. As a result, the various image +// loaders all have certain limits on image size; these differ somewhat +// by format but generally boil down to either just under 2GB or just under +// 1GB. When the decoded image would be larger than this, stb_image decoding +// will fail. +// +// Additionally, stb_image will reject image files that have any of their +// dimensions set to a larger value than the configurable STBI_MAX_DIMENSIONS, +// which defaults to 2**24 = 16777216 pixels. Due to the above memory limit, +// the only way to have an image with such dimensions load correctly +// is for it to have a rather extreme aspect ratio. Either way, the +// assumption here is that such larger images are likely to be malformed +// or malicious. If you do need to load an image with individual dimensions +// larger than that, and it still fits in the overall size limit, you can +// #define STBI_MAX_DIMENSIONS on your own to be something larger. +// +// =========================================================================== +// +// UNICODE: +// +// If compiling for Windows and you wish to use Unicode filenames, compile +// with +// #define STBI_WINDOWS_UTF8 +// and pass utf8-encoded filenames. Call stbi_convert_wchar_to_utf8 to convert +// Windows wchar_t filenames to utf8. +// +// =========================================================================== +// +// Philosophy +// +// stb libraries are designed with the following priorities: +// +// 1. easy to use +// 2. easy to maintain +// 3. good performance +// +// Sometimes I let "good performance" creep up in priority over "easy to maintain", +// and for best performance I may provide less-easy-to-use APIs that give higher +// performance, in addition to the easy-to-use ones. 
Nevertheless, it's important +// to keep in mind that from the standpoint of you, a client of this library, +// all you care about is #1 and #3, and stb libraries DO NOT emphasize #3 above all. +// +// Some secondary priorities arise directly from the first two, some of which +// provide more explicit reasons why performance can't be emphasized. +// +// - Portable ("ease of use") +// - Small source code footprint ("easy to maintain") +// - No dependencies ("ease of use") +// +// =========================================================================== +// +// I/O callbacks +// +// I/O callbacks allow you to read from arbitrary sources, like packaged +// files or some other source. Data read from callbacks are processed +// through a small internal buffer (currently 128 bytes) to try to reduce +// overhead. +// +// The three functions you must define are "read" (reads some bytes of data), +// "skip" (skips some bytes of data), "eof" (reports if the stream is at the end). +// +// =========================================================================== +// +// SIMD support +// +// The JPEG decoder will try to automatically use SIMD kernels on x86 when +// supported by the compiler. For ARM Neon support, you must explicitly +// request it. +// +// (The old do-it-yourself SIMD API is no longer supported in the current +// code.) +// +// On x86, SSE2 will automatically be used when available based on a run-time +// test; if not, the generic C versions are used as a fall-back. On ARM targets, +// the typical path is to have separate builds for NEON and non-NEON devices +// (at least this is true for iOS and Android). Therefore, the NEON support is +// toggled by a build flag: define STBI_NEON to get NEON loops. +// +// If for some reason you do not want to use any of SIMD code, or if +// you have issues compiling it, you can disable it entirely by +// defining STBI_NO_SIMD. +// +// =========================================================================== +// +// HDR image support (disable by defining STBI_NO_HDR) +// +// stb_image supports loading HDR images in general, and currently the Radiance +// .HDR file format specifically. You can still load any file through the existing +// interface; if you attempt to load an HDR file, it will be automatically remapped +// to LDR, assuming gamma 2.2 and an arbitrary scale factor defaulting to 1; +// both of these constants can be reconfigured through this interface: +// +// stbi_hdr_to_ldr_gamma(2.2f); +// stbi_hdr_to_ldr_scale(1.0f); +// +// (note, do not use _inverse_ constants; stbi_image will invert them +// appropriately). 
+// +// Additionally, there is a new, parallel interface for loading files as +// (linear) floats to preserve the full dynamic range: +// +// float *data = stbi_loadf(filename, &x, &y, &n, 0); +// +// If you load LDR images through this interface, those images will +// be promoted to floating point values, run through the inverse of +// constants corresponding to the above: +// +// stbi_ldr_to_hdr_scale(1.0f); +// stbi_ldr_to_hdr_gamma(2.2f); +// +// Finally, given a filename (or an open file or memory block--see header +// file for details) containing image data, you can query for the "most +// appropriate" interface to use (that is, whether the image is HDR or +// not), using: +// +// stbi_is_hdr(char *filename); +// +// =========================================================================== +// +// iPhone PNG support: +// +// We optionally support converting iPhone-formatted PNGs (which store +// premultiplied BGRA) back to RGB, even though they're internally encoded +// differently. To enable this conversion, call +// stbi_convert_iphone_png_to_rgb(1). +// +// Call stbi_set_unpremultiply_on_load(1) as well to force a divide per +// pixel to remove any premultiplied alpha *only* if the image file explicitly +// says there's premultiplied data (currently only happens in iPhone images, +// and only if iPhone convert-to-rgb processing is on). +// +// =========================================================================== +// +// ADDITIONAL CONFIGURATION +// +// - You can suppress implementation of any of the decoders to reduce +// your code footprint by #defining one or more of the following +// symbols before creating the implementation. +// +// STBI_NO_JPEG +// STBI_NO_PNG +// STBI_NO_BMP +// STBI_NO_PSD +// STBI_NO_TGA +// STBI_NO_GIF +// STBI_NO_HDR +// STBI_NO_PIC +// STBI_NO_PNM (.ppm and .pgm) +// +// - You can request *only* certain decoders and suppress all other ones +// (this will be more forward-compatible, as addition of new decoders +// doesn't require you to disable them explicitly): +// +// STBI_ONLY_JPEG +// STBI_ONLY_PNG +// STBI_ONLY_BMP +// STBI_ONLY_PSD +// STBI_ONLY_TGA +// STBI_ONLY_GIF +// STBI_ONLY_HDR +// STBI_ONLY_PIC +// STBI_ONLY_PNM (.ppm and .pgm) +// +// - If you use STBI_NO_PNG (or _ONLY_ without PNG), and you still +// want the zlib decoder to be available, #define STBI_SUPPORT_ZLIB +// +// - If you define STBI_MAX_DIMENSIONS, stb_image will reject images greater +// than that size (in either width or height) without further processing. +// This is to let programs in the wild set an upper bound to prevent +// denial-of-service attacks on untrusted data, as one could generate a +// valid image of gigantic dimensions and force stb_image to allocate a +// huge block of memory and spend disproportionate time decoding it. By +// default this is set to (1 << 24), which is 16777216, but that's still +// very big. 
+ +#ifndef STBI_NO_STDIO +#include +#endif // STBI_NO_STDIO + +#define STBI_VERSION 1 + +enum +{ + STBI_default = 0, // only used for desired_channels + + STBI_grey = 1, + STBI_grey_alpha = 2, + STBI_rgb = 3, + STBI_rgb_alpha = 4 +}; + +#include +typedef unsigned char stbi_uc; +typedef unsigned short stbi_us; + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef STBIDEF +#ifdef STB_IMAGE_STATIC +#define STBIDEF static +#else +#define STBIDEF extern +#endif +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// PRIMARY API - works on images of any type +// + +// +// load image by filename, open file, or memory buffer +// + +typedef struct +{ + int (*read) (void *user,char *data,int size); // fill 'data' with 'size' bytes. return number of bytes actually read + void (*skip) (void *user,int n); // skip the next 'n' bytes, or 'unget' the last -n bytes if negative + int (*eof) (void *user); // returns nonzero if we are at end of file/data +} stbi_io_callbacks; + +//////////////////////////////////// +// +// 8-bits-per-channel interface +// + +STBIDEF stbi_uc *stbi_load_from_memory (stbi_uc const *buffer, int len , int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk , void *user, int *x, int *y, int *channels_in_file, int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_uc *stbi_load (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_uc *stbi_load_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); +// for stbi_load_from_file, file pointer is left pointing immediately after image +#endif + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +#endif + +#ifdef STBI_WINDOWS_UTF8 +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input); +#endif + +//////////////////////////////////// +// +// 16-bits-per-channel interface +// + +STBIDEF stbi_us *stbi_load_16_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels); + +#ifndef STBI_NO_STDIO +STBIDEF stbi_us *stbi_load_16 (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); +STBIDEF stbi_us *stbi_load_from_file_16(FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); +#endif + +//////////////////////////////////// +// +// float-per-channel interface +// +#ifndef STBI_NO_LINEAR + STBIDEF float *stbi_loadf_from_memory (stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels); + STBIDEF float *stbi_loadf_from_callbacks (stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels); + + #ifndef STBI_NO_STDIO + STBIDEF float *stbi_loadf (char const *filename, int *x, int *y, int *channels_in_file, int desired_channels); + STBIDEF float *stbi_loadf_from_file (FILE *f, int *x, int *y, int *channels_in_file, int desired_channels); + #endif +#endif + +#ifndef STBI_NO_HDR + STBIDEF void stbi_hdr_to_ldr_gamma(float gamma); + STBIDEF void stbi_hdr_to_ldr_scale(float scale); +#endif // STBI_NO_HDR + +#ifndef STBI_NO_LINEAR + STBIDEF void stbi_ldr_to_hdr_gamma(float gamma); + STBIDEF void 
stbi_ldr_to_hdr_scale(float scale); +#endif // STBI_NO_LINEAR + +// stbi_is_hdr is always defined, but always returns false if STBI_NO_HDR +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user); +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len); +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename); +STBIDEF int stbi_is_hdr_from_file(FILE *f); +#endif // STBI_NO_STDIO + + +// get a VERY brief reason for failure +// on most compilers (and ALL modern mainstream compilers) this is threadsafe +STBIDEF const char *stbi_failure_reason (void); + +// free the loaded image -- this is just free() +STBIDEF void stbi_image_free (void *retval_from_stbi_load); + +// get image dimensions & components without fully decoding +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len); +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *clbk, void *user); + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info (char const *filename, int *x, int *y, int *comp); +STBIDEF int stbi_info_from_file (FILE *f, int *x, int *y, int *comp); +STBIDEF int stbi_is_16_bit (char const *filename); +STBIDEF int stbi_is_16_bit_from_file(FILE *f); +#endif + + + +// for image formats that explicitly notate that they have premultiplied alpha, +// we just return the colors as stored in the file. set this flag to force +// unpremultiplication. results are undefined if the unpremultiply overflow. +STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply); + +// indicate whether we should process iphone images back to canonical format, +// or just pass them through "as-is" +STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert); + +// flip the image vertically, so the first pixel in the output array is the bottom left +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip); + +// as above, but only applies to images loaded on the thread that calls the function +// this function is only available if your compiler supports thread-local variables; +// calling it will fail to link if your compiler doesn't +STBIDEF void stbi_set_unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply); +STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert); +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip); + +// ZLIB client - used by PNG, available for other purposes + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen); +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header); +STBIDEF char *stbi_zlib_decode_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + +STBIDEF char *stbi_zlib_decode_noheader_malloc(const char *buffer, int len, int *outlen); +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen); + + +#ifdef __cplusplus +} +#endif + +// +// +//// end header file ///////////////////////////////////////////////////// +#endif // STBI_INCLUDE_STB_IMAGE_H + +#ifdef STB_IMAGE_IMPLEMENTATION + +#if defined(STBI_ONLY_JPEG) || defined(STBI_ONLY_PNG) || 
defined(STBI_ONLY_BMP) \ + || defined(STBI_ONLY_TGA) || defined(STBI_ONLY_GIF) || defined(STBI_ONLY_PSD) \ + || defined(STBI_ONLY_HDR) || defined(STBI_ONLY_PIC) || defined(STBI_ONLY_PNM) \ + || defined(STBI_ONLY_ZLIB) + #ifndef STBI_ONLY_JPEG + #define STBI_NO_JPEG + #endif + #ifndef STBI_ONLY_PNG + #define STBI_NO_PNG + #endif + #ifndef STBI_ONLY_BMP + #define STBI_NO_BMP + #endif + #ifndef STBI_ONLY_PSD + #define STBI_NO_PSD + #endif + #ifndef STBI_ONLY_TGA + #define STBI_NO_TGA + #endif + #ifndef STBI_ONLY_GIF + #define STBI_NO_GIF + #endif + #ifndef STBI_ONLY_HDR + #define STBI_NO_HDR + #endif + #ifndef STBI_ONLY_PIC + #define STBI_NO_PIC + #endif + #ifndef STBI_ONLY_PNM + #define STBI_NO_PNM + #endif +#endif + +#if defined(STBI_NO_PNG) && !defined(STBI_SUPPORT_ZLIB) && !defined(STBI_NO_ZLIB) +#define STBI_NO_ZLIB +#endif + + +#include +#include // ptrdiff_t on osx +#include +#include +#include + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +#include // ldexp, pow +#endif + +#ifndef STBI_NO_STDIO +#include +#endif + +#ifndef STBI_ASSERT +#include +#define STBI_ASSERT(x) assert(x) +#endif + +#ifdef __cplusplus +#define STBI_EXTERN extern "C" +#else +#define STBI_EXTERN extern +#endif + + +#ifndef _MSC_VER + #ifdef __cplusplus + #define stbi_inline inline + #else + #define stbi_inline + #endif +#else + #define stbi_inline __forceinline +#endif + +#ifndef STBI_NO_THREAD_LOCALS + #if defined(__cplusplus) && __cplusplus >= 201103L + #define STBI_THREAD_LOCAL thread_local + #elif defined(__GNUC__) && __GNUC__ < 5 + #define STBI_THREAD_LOCAL __thread + #elif defined(_MSC_VER) + #define STBI_THREAD_LOCAL __declspec(thread) + #elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__) + #define STBI_THREAD_LOCAL _Thread_local + #endif + + #ifndef STBI_THREAD_LOCAL + #if defined(__GNUC__) + #define STBI_THREAD_LOCAL __thread + #endif + #endif +#endif + +#ifdef _MSC_VER +typedef unsigned short stbi__uint16; +typedef signed short stbi__int16; +typedef unsigned int stbi__uint32; +typedef signed int stbi__int32; +#else +#include +typedef uint16_t stbi__uint16; +typedef int16_t stbi__int16; +typedef uint32_t stbi__uint32; +typedef int32_t stbi__int32; +#endif + +// should produce compiler error if size is wrong +typedef unsigned char validate_uint32[sizeof(stbi__uint32)==4 ? 1 : -1]; + +#ifdef _MSC_VER +#define STBI_NOTUSED(v) (void)(v) +#else +#define STBI_NOTUSED(v) (void)sizeof(v) +#endif + +#ifdef _MSC_VER +#define STBI_HAS_LROTL +#endif + +#ifdef STBI_HAS_LROTL + #define stbi_lrot(x,y) _lrotl(x,y) +#else + #define stbi_lrot(x,y) (((x) << (y)) | ((x) >> (-(y) & 31))) +#endif + +#if defined(STBI_MALLOC) && defined(STBI_FREE) && (defined(STBI_REALLOC) || defined(STBI_REALLOC_SIZED)) +// ok +#elif !defined(STBI_MALLOC) && !defined(STBI_FREE) && !defined(STBI_REALLOC) && !defined(STBI_REALLOC_SIZED) +// ok +#else +#error "Must define all or none of STBI_MALLOC, STBI_FREE, and STBI_REALLOC (or STBI_REALLOC_SIZED)." 
+#endif
+
+#ifndef STBI_MALLOC
+#define STBI_MALLOC(sz)           malloc(sz)
+#define STBI_REALLOC(p,newsz)     realloc(p,newsz)
+#define STBI_FREE(p)              free(p)
+#endif
+
+#ifndef STBI_REALLOC_SIZED
+#define STBI_REALLOC_SIZED(p,oldsz,newsz) STBI_REALLOC(p,newsz)
+#endif
+
+// x86/x64 detection
+#if defined(__x86_64__) || defined(_M_X64)
+#define STBI__X64_TARGET
+#elif defined(__i386) || defined(_M_IX86)
+#define STBI__X86_TARGET
+#endif
+
+#if defined(__GNUC__) && defined(STBI__X86_TARGET) && !defined(__SSE2__) && !defined(STBI_NO_SIMD)
+// gcc doesn't support sse2 intrinsics unless you compile with -msse2,
+// which in turn means it gets to use SSE2 everywhere. This is unfortunate,
+// but previous attempts to provide the SSE2 functions with runtime
+// detection caused numerous issues. The way architecture extensions are
+// exposed in GCC/Clang is, sadly, not really suited for one-file libs.
+// New behavior: if compiled with -msse2, we use SSE2 without any
+// detection; if not, we don't use it at all.
+#define STBI_NO_SIMD
+#endif
+
+#if defined(__MINGW32__) && defined(STBI__X86_TARGET) && !defined(STBI_MINGW_ENABLE_SSE2) && !defined(STBI_NO_SIMD)
+// Note that __MINGW32__ doesn't actually mean 32-bit, so we have to avoid STBI__X64_TARGET
+//
+// 32-bit MinGW wants ESP to be 16-byte aligned, but this is not in the
+// Windows ABI and VC++ as well as Windows DLLs don't maintain that invariant.
+// As a result, enabling SSE2 on 32-bit MinGW is dangerous when not
+// simultaneously enabling "-mstackrealign".
+//
+// See https://github.com/nothings/stb/issues/81 for more information.
+//
+// So default to no SSE2 on 32-bit MinGW. If you've read this far and added
+// -mstackrealign to your build settings, feel free to #define STBI_MINGW_ENABLE_SSE2.
+#define STBI_NO_SIMD
+#endif
+
+#if !defined(STBI_NO_SIMD) && (defined(STBI__X86_TARGET) || defined(STBI__X64_TARGET))
+#define STBI_SSE2
+#include <emmintrin.h>
+
+#ifdef _MSC_VER
+
+#if _MSC_VER >= 1400  // not VC6
+#include <intrin.h> // __cpuid
+static int stbi__cpuid3(void)
+{
+   int info[4];
+   __cpuid(info,1);
+   return info[3];
+}
+#else
+static int stbi__cpuid3(void)
+{
+   int res;
+   __asm {
+      mov  eax,1
+      cpuid
+      mov  res,edx
+   }
+   return res;
+}
+#endif
+
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
+{
+   int info3 = stbi__cpuid3();
+   return ((info3 >> 26) & 1) != 0;
+}
+#endif
+
+#else // assume GCC-style if not VC++
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+
+#if !defined(STBI_NO_JPEG) && defined(STBI_SSE2)
+static int stbi__sse2_available(void)
+{
+   // If we're even attempting to compile this on GCC/Clang, that means
+   // -msse2 is on, which means the compiler is allowed to use SSE2
+   // instructions at will, and so are we.
+   return 1;
+}
+#endif
+
+#endif
+#endif
+
+// ARM NEON
+#if defined(STBI_NO_SIMD) && defined(STBI_NEON)
+#undef STBI_NEON
+#endif
+
+#ifdef STBI_NEON
+#include <arm_neon.h>
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+#endif
+
+#ifndef STBI_SIMD_ALIGN
+#define STBI_SIMD_ALIGN(type, name) type name
+#endif
+
+#ifndef STBI_MAX_DIMENSIONS
+#define STBI_MAX_DIMENSIONS (1 << 24)
+#endif
+
+///////////////////////////////////////////////
+//
+//  stbi__context struct and start_xxx functions
+
+// stbi__context structure is our basic context used by all images, so it
+// contains all the IO context, plus some basic image information
+typedef struct
+{
+   stbi__uint32 img_x, img_y;
+   int img_n, img_out_n;
+
+   stbi_io_callbacks io;
+   void *io_user_data;
+
+   int read_from_callbacks;
+   int buflen;
+   stbi_uc buffer_start[128];
+   int callback_already_read;
+
+   stbi_uc *img_buffer, *img_buffer_end;
+   stbi_uc *img_buffer_original, *img_buffer_original_end;
+} stbi__context;
+
+
+static void stbi__refill_buffer(stbi__context *s);
+
+// initialize a memory-decode context
+static void stbi__start_mem(stbi__context *s, stbi_uc const *buffer, int len)
+{
+   s->io.read = NULL;
+   s->read_from_callbacks = 0;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = (stbi_uc *) buffer;
+   s->img_buffer_end = s->img_buffer_original_end = (stbi_uc *) buffer+len;
+}
+
+// initialize a callback-based context
+static void stbi__start_callbacks(stbi__context *s, stbi_io_callbacks *c, void *user)
+{
+   s->io = *c;
+   s->io_user_data = user;
+   s->buflen = sizeof(s->buffer_start);
+   s->read_from_callbacks = 1;
+   s->callback_already_read = 0;
+   s->img_buffer = s->img_buffer_original = s->buffer_start;
+   stbi__refill_buffer(s);
+   s->img_buffer_original_end = s->img_buffer_end;
+}
+
+#ifndef STBI_NO_STDIO
+
+static int stbi__stdio_read(void *user, char *data, int size)
+{
+   return (int) fread(data,1,size,(FILE*) user);
+}
+
+static void stbi__stdio_skip(void *user, int n)
+{
+   int ch;
+   fseek((FILE*) user, n, SEEK_CUR);
+   ch = fgetc((FILE*) user);  /* have to read a byte to reset feof()'s flag */
+   if (ch != EOF) {
+      ungetc(ch, (FILE *) user);  /* push byte back onto stream if valid.
*/ + } +} + +static int stbi__stdio_eof(void *user) +{ + return feof((FILE*) user) || ferror((FILE *) user); +} + +static stbi_io_callbacks stbi__stdio_callbacks = +{ + stbi__stdio_read, + stbi__stdio_skip, + stbi__stdio_eof, +}; + +static void stbi__start_file(stbi__context *s, FILE *f) +{ + stbi__start_callbacks(s, &stbi__stdio_callbacks, (void *) f); +} + +//static void stop_file(stbi__context *s) { } + +#endif // !STBI_NO_STDIO + +static void stbi__rewind(stbi__context *s) +{ + // conceptually rewind SHOULD rewind to the beginning of the stream, + // but we just rewind to the beginning of the initial buffer, because + // we only use it after doing 'test', which only ever looks at at most 92 bytes + s->img_buffer = s->img_buffer_original; + s->img_buffer_end = s->img_buffer_original_end; +} + +enum +{ + STBI_ORDER_RGB, + STBI_ORDER_BGR +}; + +typedef struct +{ + int bits_per_channel; + int num_channels; + int channel_order; +} stbi__result_info; + +#ifndef STBI_NO_JPEG +static int stbi__jpeg_test(stbi__context *s); +static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNG +static int stbi__png_test(stbi__context *s); +static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__png_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_BMP +static int stbi__bmp_test(stbi__context *s); +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_TGA +static int stbi__tga_test(stbi__context *s); +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s); +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc); +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp); +static int stbi__psd_is16(stbi__context *s); +#endif + +#ifndef STBI_NO_HDR +static int stbi__hdr_test(stbi__context *s); +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_test(stbi__context *s); +static void *stbi__pic_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_GIF +static int stbi__gif_test(stbi__context *s); +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp); +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp); +#endif + +#ifndef STBI_NO_PNM +static int stbi__pnm_test(stbi__context *s); +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri); +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp); +static int 
stbi__pnm_is16(stbi__context *s); +#endif + +static +#ifdef STBI_THREAD_LOCAL +STBI_THREAD_LOCAL +#endif +const char *stbi__g_failure_reason; + +STBIDEF const char *stbi_failure_reason(void) +{ + return stbi__g_failure_reason; +} + +#ifndef STBI_NO_FAILURE_STRINGS +static int stbi__err(const char *str) +{ + stbi__g_failure_reason = str; + return 0; +} +#endif + +static void *stbi__malloc(size_t size) +{ + return STBI_MALLOC(size); +} + +// stb_image uses ints pervasively, including for offset calculations. +// therefore the largest decoded image size we can support with the +// current code, even on 64-bit targets, is INT_MAX. this is not a +// significant limitation for the intended use case. +// +// we do, however, need to make sure our size calculations don't +// overflow. hence a few helper functions for size calculations that +// multiply integers together, making sure that they're non-negative +// and no overflow occurs. + +// return 1 if the sum is valid, 0 on overflow. +// negative terms are considered invalid. +static int stbi__addsizes_valid(int a, int b) +{ + if (b < 0) return 0; + // now 0 <= b <= INT_MAX, hence also + // 0 <= INT_MAX - b <= INTMAX. + // And "a + b <= INT_MAX" (which might overflow) is the + // same as a <= INT_MAX - b (no overflow) + return a <= INT_MAX - b; +} + +// returns 1 if the product is valid, 0 on overflow. +// negative factors are considered invalid. +static int stbi__mul2sizes_valid(int a, int b) +{ + if (a < 0 || b < 0) return 0; + if (b == 0) return 1; // mul-by-0 is always safe + // portable way to check for no overflows in a*b + return a <= INT_MAX/b; +} + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// returns 1 if "a*b + add" has no negative terms/factors and doesn't overflow +static int stbi__mad2sizes_valid(int a, int b, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__addsizes_valid(a*b, add); +} +#endif + +// returns 1 if "a*b*c + add" has no negative terms/factors and doesn't overflow +static int stbi__mad3sizes_valid(int a, int b, int c, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__addsizes_valid(a*b*c, add); +} + +// returns 1 if "a*b*c*d + add" has no negative terms/factors and doesn't overflow +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +static int stbi__mad4sizes_valid(int a, int b, int c, int d, int add) +{ + return stbi__mul2sizes_valid(a, b) && stbi__mul2sizes_valid(a*b, c) && + stbi__mul2sizes_valid(a*b*c, d) && stbi__addsizes_valid(a*b*c*d, add); +} +#endif + +#if !defined(STBI_NO_JPEG) || !defined(STBI_NO_PNG) || !defined(STBI_NO_TGA) || !defined(STBI_NO_HDR) +// mallocs with size overflow checking +static void *stbi__malloc_mad2(int a, int b, int add) +{ + if (!stbi__mad2sizes_valid(a, b, add)) return NULL; + return stbi__malloc(a*b + add); +} +#endif + +static void *stbi__malloc_mad3(int a, int b, int c, int add) +{ + if (!stbi__mad3sizes_valid(a, b, c, add)) return NULL; + return stbi__malloc(a*b*c + add); +} + +#if !defined(STBI_NO_LINEAR) || !defined(STBI_NO_HDR) +static void *stbi__malloc_mad4(int a, int b, int c, int d, int add) +{ + if (!stbi__mad4sizes_valid(a, b, c, d, add)) return NULL; + return stbi__malloc(a*b*c*d + add); +} +#endif + +// stbi__err - error +// stbi__errpf - error returning pointer to float +// stbi__errpuc - error returning pointer to unsigned char + +#ifdef STBI_NO_FAILURE_STRINGS + #define stbi__err(x,y) 0 +#elif defined(STBI_FAILURE_USERMSG) + #define stbi__err(x,y) 
stbi__err(y) +#else + #define stbi__err(x,y) stbi__err(x) +#endif + +#define stbi__errpf(x,y) ((float *)(size_t) (stbi__err(x,y)?NULL:NULL)) +#define stbi__errpuc(x,y) ((unsigned char *)(size_t) (stbi__err(x,y)?NULL:NULL)) + +STBIDEF void stbi_image_free(void *retval_from_stbi_load) +{ + STBI_FREE(retval_from_stbi_load); +} + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp); +#endif + +#ifndef STBI_NO_HDR +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp); +#endif + +static int stbi__vertically_flip_on_load_global = 0; + +STBIDEF void stbi_set_flip_vertically_on_load(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_global = flag_true_if_should_flip; +} + +#ifndef STBI_THREAD_LOCAL +#define stbi__vertically_flip_on_load stbi__vertically_flip_on_load_global +#else +static STBI_THREAD_LOCAL int stbi__vertically_flip_on_load_local, stbi__vertically_flip_on_load_set; + +STBIDEF void stbi_set_flip_vertically_on_load_thread(int flag_true_if_should_flip) +{ + stbi__vertically_flip_on_load_local = flag_true_if_should_flip; + stbi__vertically_flip_on_load_set = 1; +} + +#define stbi__vertically_flip_on_load (stbi__vertically_flip_on_load_set \ + ? stbi__vertically_flip_on_load_local \ + : stbi__vertically_flip_on_load_global) +#endif // STBI_THREAD_LOCAL + +static void *stbi__load_main(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + memset(ri, 0, sizeof(*ri)); // make sure it's initialized if we add new fields + ri->bits_per_channel = 8; // default is 8 so most paths don't have to be changed + ri->channel_order = STBI_ORDER_RGB; // all current input & output are this, but this is here so we can add BGR order + ri->num_channels = 0; + + // test the formats with a very explicit header first (at least a FOURCC + // or distinctive magic number first) + #ifndef STBI_NO_PNG + if (stbi__png_test(s)) return stbi__png_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_BMP + if (stbi__bmp_test(s)) return stbi__bmp_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_GIF + if (stbi__gif_test(s)) return stbi__gif_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PSD + if (stbi__psd_test(s)) return stbi__psd_load(s,x,y,comp,req_comp, ri, bpc); + #else + STBI_NOTUSED(bpc); + #endif + #ifndef STBI_NO_PIC + if (stbi__pic_test(s)) return stbi__pic_load(s,x,y,comp,req_comp, ri); + #endif + + // then the formats that can end up attempting to load with just 1 or 2 + // bytes matching expectations; these are prone to false positives, so + // try them later + #ifndef STBI_NO_JPEG + if (stbi__jpeg_test(s)) return stbi__jpeg_load(s,x,y,comp,req_comp, ri); + #endif + #ifndef STBI_NO_PNM + if (stbi__pnm_test(s)) return stbi__pnm_load(s,x,y,comp,req_comp, ri); + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + float *hdr = stbi__hdr_load(s, x,y,comp,req_comp, ri); + return stbi__hdr_to_ldr(hdr, *x, *y, req_comp ? req_comp : *comp); + } + #endif + + #ifndef STBI_NO_TGA + // test tga last because it's a crappy test! 
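+   // (TGA files carry no leading magic number, so stbi__tga_test can only
+   // sanity-check a few header fields and may pass on non-TGA data -- which
+   // is why it runs after every other probe.)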
+ if (stbi__tga_test(s)) + return stbi__tga_load(s,x,y,comp,req_comp, ri); + #endif + + return stbi__errpuc("unknown image type", "Image not of any known type, or corrupt"); +} + +static stbi_uc *stbi__convert_16_to_8(stbi__uint16 *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi_uc *reduced; + + reduced = (stbi_uc *) stbi__malloc(img_len); + if (reduced == NULL) return stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + reduced[i] = (stbi_uc)((orig[i] >> 8) & 0xFF); // top half of each byte is sufficient approx of 16->8 bit scaling + + STBI_FREE(orig); + return reduced; +} + +static stbi__uint16 *stbi__convert_8_to_16(stbi_uc *orig, int w, int h, int channels) +{ + int i; + int img_len = w * h * channels; + stbi__uint16 *enlarged; + + enlarged = (stbi__uint16 *) stbi__malloc(img_len*2); + if (enlarged == NULL) return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + + for (i = 0; i < img_len; ++i) + enlarged[i] = (stbi__uint16)((orig[i] << 8) + orig[i]); // replicate to high and low byte, maps 0->0, 255->0xffff + + STBI_FREE(orig); + return enlarged; +} + +static void stbi__vertical_flip(void *image, int w, int h, int bytes_per_pixel) +{ + int row; + size_t bytes_per_row = (size_t)w * bytes_per_pixel; + stbi_uc temp[2048]; + stbi_uc *bytes = (stbi_uc *)image; + + for (row = 0; row < (h>>1); row++) { + stbi_uc *row0 = bytes + row*bytes_per_row; + stbi_uc *row1 = bytes + (h - row - 1)*bytes_per_row; + // swap row0 with row1 + size_t bytes_left = bytes_per_row; + while (bytes_left) { + size_t bytes_copy = (bytes_left < sizeof(temp)) ? bytes_left : sizeof(temp); + memcpy(temp, row0, bytes_copy); + memcpy(row0, row1, bytes_copy); + memcpy(row1, temp, bytes_copy); + row0 += bytes_copy; + row1 += bytes_copy; + bytes_left -= bytes_copy; + } + } +} + +#ifndef STBI_NO_GIF +static void stbi__vertical_flip_slices(void *image, int w, int h, int z, int bytes_per_pixel) +{ + int slice; + int slice_size = w * h * bytes_per_pixel; + + stbi_uc *bytes = (stbi_uc *)image; + for (slice = 0; slice < z; ++slice) { + stbi__vertical_flip(bytes, w, h, bytes_per_pixel); + bytes += slice_size; + } +} +#endif + +static unsigned char *stbi__load_and_postprocess_8bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 8); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 8) { + result = stbi__convert_16_to_8((stbi__uint16 *) result, *x, *y, req_comp == 0 ? *comp : req_comp); + ri.bits_per_channel = 8; + } + + // @TODO: move stbi__convert_format to here + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi_uc)); + } + + return (unsigned char *) result; +} + +static stbi__uint16 *stbi__load_and_postprocess_16bit(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + stbi__result_info ri; + void *result = stbi__load_main(s, x, y, comp, req_comp, &ri, 16); + + if (result == NULL) + return NULL; + + // it is the responsibility of the loaders to make sure we get either 8 or 16 bit. + STBI_ASSERT(ri.bits_per_channel == 8 || ri.bits_per_channel == 16); + + if (ri.bits_per_channel != 16) { + result = stbi__convert_8_to_16((stbi_uc *) result, *x, *y, req_comp == 0 ? 
*comp : req_comp); + ri.bits_per_channel = 16; + } + + // @TODO: move stbi__convert_format16 to here + // @TODO: special case RGB-to-Y (and RGBA-to-YA) for 8-bit-to-16-bit case to keep more precision + + if (stbi__vertically_flip_on_load) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(stbi__uint16)); + } + + return (stbi__uint16 *) result; +} + +#if !defined(STBI_NO_HDR) && !defined(STBI_NO_LINEAR) +static void stbi__float_postprocess(float *result, int *x, int *y, int *comp, int req_comp) +{ + if (stbi__vertically_flip_on_load && result != NULL) { + int channels = req_comp ? req_comp : *comp; + stbi__vertical_flip(result, *x, *y, channels * sizeof(float)); + } +} +#endif + +#ifndef STBI_NO_STDIO + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBI_EXTERN __declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide); +STBI_EXTERN __declspec(dllimport) int __stdcall WideCharToMultiByte(unsigned int cp, unsigned long flags, const wchar_t *widestr, int cchwide, char *str, int cbmb, const char *defchar, int *used_default); +#endif + +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) +STBIDEF int stbi_convert_wchar_to_utf8(char *buffer, size_t bufferlen, const wchar_t* input) +{ + return WideCharToMultiByte(65001 /* UTF8 */, 0, input, -1, buffer, (int) bufferlen, NULL, NULL); +} +#endif + +static FILE *stbi__fopen(char const *filename, char const *mode) +{ + FILE *f; +#if defined(_WIN32) && defined(STBI_WINDOWS_UTF8) + wchar_t wMode[64]; + wchar_t wFilename[1024]; + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, filename, -1, wFilename, sizeof(wFilename)/sizeof(*wFilename))) + return 0; + + if (0 == MultiByteToWideChar(65001 /* UTF8 */, 0, mode, -1, wMode, sizeof(wMode)/sizeof(*wMode))) + return 0; + +#if defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != _wfopen_s(&f, wFilename, wMode)) + f = 0; +#else + f = _wfopen(wFilename, wMode); +#endif + +#elif defined(_MSC_VER) && _MSC_VER >= 1400 + if (0 != fopen_s(&f, filename, mode)) + f=0; +#else + f = fopen(filename, mode); +#endif + return f; +} + + +STBIDEF stbi_uc *stbi_load(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + unsigned char *result; + if (!f) return stbi__errpuc("can't fopen", "Unable to open file"); + result = stbi_load_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF stbi_uc *stbi_load_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi__uint16 *stbi_load_from_file_16(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__uint16 *result; + stbi__context s; + stbi__start_file(&s,f); + result = stbi__load_and_postprocess_16bit(&s,x,y,comp,req_comp); + if (result) { + // need to 'unget' all the characters in the IO buffer + fseek(f, - (int) (s.img_buffer_end - s.img_buffer), SEEK_CUR); + } + return result; +} + +STBIDEF stbi_us *stbi_load_16(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + stbi__uint16 *result; + if (!f) return (stbi_us *) stbi__errpuc("can't fopen", "Unable to open file"); + result = 
stbi_load_from_file_16(f,x,y,comp,req_comp); + fclose(f); + return result; +} + + +#endif //!STBI_NO_STDIO + +STBIDEF stbi_us *stbi_load_16_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_us *stbi_load_16_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *channels_in_file, int desired_channels) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *)clbk, user); + return stbi__load_and_postprocess_16bit(&s,x,y,channels_in_file,desired_channels); +} + +STBIDEF stbi_uc *stbi_load_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +STBIDEF stbi_uc *stbi_load_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__load_and_postprocess_8bit(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_GIF +STBIDEF stbi_uc *stbi_load_gif_from_memory(stbi_uc const *buffer, int len, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + unsigned char *result; + stbi__context s; + stbi__start_mem(&s,buffer,len); + + result = (unsigned char*) stbi__load_gif_main(&s, delays, x, y, z, comp, req_comp); + if (stbi__vertically_flip_on_load) { + stbi__vertical_flip_slices( result, *x, *y, *z, *comp ); + } + + return result; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__loadf_main(stbi__context *s, int *x, int *y, int *comp, int req_comp) +{ + unsigned char *data; + #ifndef STBI_NO_HDR + if (stbi__hdr_test(s)) { + stbi__result_info ri; + float *hdr_data = stbi__hdr_load(s,x,y,comp,req_comp, &ri); + if (hdr_data) + stbi__float_postprocess(hdr_data,x,y,comp,req_comp); + return hdr_data; + } + #endif + data = stbi__load_and_postprocess_8bit(s, x, y, comp, req_comp); + if (data) + return stbi__ldr_to_hdr(data, *x, *y, req_comp ? req_comp : *comp); + return stbi__errpf("unknown image type", "Image not of any known type, or corrupt"); +} + +STBIDEF float *stbi_loadf_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +STBIDEF float *stbi_loadf_from_callbacks(stbi_io_callbacks const *clbk, void *user, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} + +#ifndef STBI_NO_STDIO +STBIDEF float *stbi_loadf(char const *filename, int *x, int *y, int *comp, int req_comp) +{ + float *result; + FILE *f = stbi__fopen(filename, "rb"); + if (!f) return stbi__errpf("can't fopen", "Unable to open file"); + result = stbi_loadf_from_file(f,x,y,comp,req_comp); + fclose(f); + return result; +} + +STBIDEF float *stbi_loadf_from_file(FILE *f, int *x, int *y, int *comp, int req_comp) +{ + stbi__context s; + stbi__start_file(&s,f); + return stbi__loadf_main(&s,x,y,comp,req_comp); +} +#endif // !STBI_NO_STDIO + +#endif // !STBI_NO_LINEAR + +// these is-hdr-or-not is defined independent of whether STBI_NO_LINEAR is +// defined, for API simplicity; if STBI_NO_LINEAR is defined, it always +// reports false! 
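+//
+// a typical caller, sketched (w/h/n and "filename" are illustrative):
+//    if (stbi_is_hdr(filename)) {
+//       float *pix = stbi_loadf(filename, &w, &h, &n, 0);        // linear float path
+//       /* ... use pix ... */ stbi_image_free(pix);
+//    } else {
+//       unsigned char *pix = stbi_load(filename, &w, &h, &n, 0); // 8-bit LDR path
+//       /* ... use pix ... */ stbi_image_free(pix);
+//    }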
+ +STBIDEF int stbi_is_hdr_from_memory(stbi_uc const *buffer, int len) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(buffer); + STBI_NOTUSED(len); + return 0; + #endif +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_is_hdr (char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result=0; + if (f) { + result = stbi_is_hdr_from_file(f); + fclose(f); + } + return result; +} + +STBIDEF int stbi_is_hdr_from_file(FILE *f) +{ + #ifndef STBI_NO_HDR + long pos = ftell(f); + int res; + stbi__context s; + stbi__start_file(&s,f); + res = stbi__hdr_test(&s); + fseek(f, pos, SEEK_SET); + return res; + #else + STBI_NOTUSED(f); + return 0; + #endif +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_is_hdr_from_callbacks(stbi_io_callbacks const *clbk, void *user) +{ + #ifndef STBI_NO_HDR + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) clbk, user); + return stbi__hdr_test(&s); + #else + STBI_NOTUSED(clbk); + STBI_NOTUSED(user); + return 0; + #endif +} + +#ifndef STBI_NO_LINEAR +static float stbi__l2h_gamma=2.2f, stbi__l2h_scale=1.0f; + +STBIDEF void stbi_ldr_to_hdr_gamma(float gamma) { stbi__l2h_gamma = gamma; } +STBIDEF void stbi_ldr_to_hdr_scale(float scale) { stbi__l2h_scale = scale; } +#endif + +static float stbi__h2l_gamma_i=1.0f/2.2f, stbi__h2l_scale_i=1.0f; + +STBIDEF void stbi_hdr_to_ldr_gamma(float gamma) { stbi__h2l_gamma_i = 1/gamma; } +STBIDEF void stbi_hdr_to_ldr_scale(float scale) { stbi__h2l_scale_i = 1/scale; } + + +////////////////////////////////////////////////////////////////////////////// +// +// Common code used by all image loaders +// + +enum +{ + STBI__SCAN_load=0, + STBI__SCAN_type, + STBI__SCAN_header +}; + +static void stbi__refill_buffer(stbi__context *s) +{ + int n = (s->io.read)(s->io_user_data,(char*)s->buffer_start,s->buflen); + s->callback_already_read += (int) (s->img_buffer - s->img_buffer_original); + if (n == 0) { + // at end of file, treat same as if from memory, but need to handle case + // where s->img_buffer isn't pointing to safe memory, e.g. 0-byte file + s->read_from_callbacks = 0; + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start+1; + *s->img_buffer = 0; + } else { + s->img_buffer = s->buffer_start; + s->img_buffer_end = s->buffer_start + n; + } +} + +stbi_inline static stbi_uc stbi__get8(stbi__context *s) +{ + if (s->img_buffer < s->img_buffer_end) + return *s->img_buffer++; + if (s->read_from_callbacks) { + stbi__refill_buffer(s); + return *s->img_buffer++; + } + return 0; +} + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_HDR) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +stbi_inline static int stbi__at_eof(stbi__context *s) +{ + if (s->io.read) { + if (!(s->io.eof)(s->io_user_data)) return 0; + // if feof() is true, check if buffer = end + // special case: we've only got the special 0 character at the end + if (s->read_from_callbacks == 0) return 1; + } + + return s->img_buffer >= s->img_buffer_end; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) +// nothing +#else +static void stbi__skip(stbi__context *s, int n) +{ + if (n == 0) return; // already there! 
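+   // (a negative n only arises from corrupt or overflowed size fields; the
+   // branch below pins the buffer to its end so later reads see EOF instead
+   // of seeking backwards)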
+ if (n < 0) { + s->img_buffer = s->img_buffer_end; + return; + } + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + s->img_buffer = s->img_buffer_end; + (s->io.skip)(s->io_user_data, n - blen); + return; + } + } + s->img_buffer += n; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_TGA) && defined(STBI_NO_HDR) && defined(STBI_NO_PNM) +// nothing +#else +static int stbi__getn(stbi__context *s, stbi_uc *buffer, int n) +{ + if (s->io.read) { + int blen = (int) (s->img_buffer_end - s->img_buffer); + if (blen < n) { + int res, count; + + memcpy(buffer, s->img_buffer, blen); + + count = (s->io.read)(s->io_user_data, (char*) buffer + blen, n - blen); + res = (count == (n-blen)); + s->img_buffer = s->img_buffer_end; + return res; + } + } + + if (s->img_buffer+n <= s->img_buffer_end) { + memcpy(buffer, s->img_buffer, n); + s->img_buffer += n; + return 1; + } else + return 0; +} +#endif + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static int stbi__get16be(stbi__context *s) +{ + int z = stbi__get8(s); + return (z << 8) + stbi__get8(s); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) && defined(STBI_NO_PIC) +// nothing +#else +static stbi__uint32 stbi__get32be(stbi__context *s) +{ + stbi__uint32 z = stbi__get16be(s); + return (z << 16) + stbi__get16be(s); +} +#endif + +#if defined(STBI_NO_BMP) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) +// nothing +#else +static int stbi__get16le(stbi__context *s) +{ + int z = stbi__get8(s); + return z + (stbi__get8(s) << 8); +} +#endif + +#ifndef STBI_NO_BMP +static stbi__uint32 stbi__get32le(stbi__context *s) +{ + stbi__uint32 z = stbi__get16le(s); + z += (stbi__uint32)stbi__get16le(s) << 16; + return z; +} +#endif + +#define STBI__BYTECAST(x) ((stbi_uc) ((x) & 255)) // truncate int to byte without warnings + +#if defined(STBI_NO_JPEG) && defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +////////////////////////////////////////////////////////////////////////////// +// +// generic converter from built-in img_n to req_comp +// individual types do this automatically as much as possible (e.g. jpeg +// does all cases internally since it needs to colorspace convert anyway, +// and it never has alpha, so very few cases ). 
png can automatically +// interleave an alpha=255 channel, but falls back to this for other cases +// +// assume data buffer is malloced, so malloc a new one and free that one +// only failure mode is malloc failing + +static stbi_uc stbi__compute_y(int r, int g, int b) +{ + return (stbi_uc) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_BMP) && defined(STBI_NO_PSD) && defined(STBI_NO_TGA) && defined(STBI_NO_GIF) && defined(STBI_NO_PIC) && defined(STBI_NO_PNM) +// nothing +#else +static unsigned char *stbi__convert_format(unsigned char *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + unsigned char *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (unsigned char *) stbi__malloc_mad3(req_comp, x, y, 0); + if (good == NULL) { + STBI_FREE(data); + return stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + unsigned char *src = data + j * x * img_n ; + unsigned char *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=255; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=255; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=255; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = 255; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 stbi__compute_y_16(int r, int g, int b) +{ + return (stbi__uint16) (((r*77) + (g*150) + (29*b)) >> 8); +} +#endif + +#if defined(STBI_NO_PNG) && defined(STBI_NO_PSD) +// nothing +#else +static stbi__uint16 *stbi__convert_format16(stbi__uint16 *data, int img_n, int req_comp, unsigned int x, unsigned int y) +{ + int i,j; + stbi__uint16 *good; + + if (req_comp == img_n) return data; + STBI_ASSERT(req_comp >= 1 && req_comp <= 4); + + good = (stbi__uint16 *) stbi__malloc(req_comp * x * y * 2); + if (good == NULL) { + STBI_FREE(data); + return (stbi__uint16 *) stbi__errpuc("outofmem", "Out of memory"); + } + + for (j=0; j < (int) y; ++j) { + stbi__uint16 *src = data + j * x * img_n ; + stbi__uint16 *dest = good + j * x * req_comp; + + #define STBI__COMBO(a,b) ((a)*8+(b)) + #define STBI__CASE(a,b) case STBI__COMBO(a,b): for(i=x-1; i >= 0; --i, src += a, dest += b) + // convert source image with img_n components to one with req_comp components; + // avoid switch per pixel, 
so use switch per scanline and massive macros + switch (STBI__COMBO(img_n, req_comp)) { + STBI__CASE(1,2) { dest[0]=src[0]; dest[1]=0xffff; } break; + STBI__CASE(1,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(1,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=0xffff; } break; + STBI__CASE(2,1) { dest[0]=src[0]; } break; + STBI__CASE(2,3) { dest[0]=dest[1]=dest[2]=src[0]; } break; + STBI__CASE(2,4) { dest[0]=dest[1]=dest[2]=src[0]; dest[3]=src[1]; } break; + STBI__CASE(3,4) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2];dest[3]=0xffff; } break; + STBI__CASE(3,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(3,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = 0xffff; } break; + STBI__CASE(4,1) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); } break; + STBI__CASE(4,2) { dest[0]=stbi__compute_y_16(src[0],src[1],src[2]); dest[1] = src[3]; } break; + STBI__CASE(4,3) { dest[0]=src[0];dest[1]=src[1];dest[2]=src[2]; } break; + default: STBI_ASSERT(0); STBI_FREE(data); STBI_FREE(good); return (stbi__uint16*) stbi__errpuc("unsupported", "Unsupported format conversion"); + } + #undef STBI__CASE + } + + STBI_FREE(data); + return good; +} +#endif + +#ifndef STBI_NO_LINEAR +static float *stbi__ldr_to_hdr(stbi_uc *data, int x, int y, int comp) +{ + int i,k,n; + float *output; + if (!data) return NULL; + output = (float *) stbi__malloc_mad4(x, y, comp, sizeof(float), 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpf("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + output[i*comp + k] = (float) (pow(data[i*comp+k]/255.0f, stbi__l2h_gamma) * stbi__l2h_scale); + } + } + if (n < comp) { + for (i=0; i < x*y; ++i) { + output[i*comp + n] = data[i*comp + n]/255.0f; + } + } + STBI_FREE(data); + return output; +} +#endif + +#ifndef STBI_NO_HDR +#define stbi__float2int(x) ((int) (x)) +static stbi_uc *stbi__hdr_to_ldr(float *data, int x, int y, int comp) +{ + int i,k,n; + stbi_uc *output; + if (!data) return NULL; + output = (stbi_uc *) stbi__malloc_mad3(x, y, comp, 0); + if (output == NULL) { STBI_FREE(data); return stbi__errpuc("outofmem", "Out of memory"); } + // compute number of non-alpha components + if (comp & 1) n = comp; else n = comp-1; + for (i=0; i < x*y; ++i) { + for (k=0; k < n; ++k) { + float z = (float) pow(data[i*comp+k]*stbi__h2l_scale_i, stbi__h2l_gamma_i) * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + if (k < comp) { + float z = data[i*comp+k] * 255 + 0.5f; + if (z < 0) z = 0; + if (z > 255) z = 255; + output[i*comp + k] = (stbi_uc) stbi__float2int(z); + } + } + STBI_FREE(data); + return output; +} +#endif + +////////////////////////////////////////////////////////////////////////////// +// +// "baseline" JPEG/JFIF decoder +// +// simple implementation +// - doesn't support delayed output of y-dimension +// - simple interface (only one output format: 8-bit interleaved RGB) +// - doesn't try to recover corrupt jpegs +// - doesn't allow partial loading, loading multiple at once +// - still fast on x86 (copying globals into locals doesn't help x86) +// - allocates lots of intermediate memory (full size of all components) +// - non-interleaved case requires this anyway +// - allows good upsampling (see next) +// high-quality +// - upsampled channels are bilinearly interpolated, even across blocks +// - quality integer IDCT derived from IJG's 
'slow' +// performance +// - fast huffman; reasonable integer IDCT +// - some SIMD kernels for common paths on targets with SSE2/NEON +// - uses a lot of intermediate memory, could cache poorly + +#ifndef STBI_NO_JPEG + +// huffman decoding acceleration +#define FAST_BITS 9 // larger handles more cases; smaller stomps less cache + +typedef struct +{ + stbi_uc fast[1 << FAST_BITS]; + // weirdly, repacking this into AoS is a 10% speed loss, instead of a win + stbi__uint16 code[256]; + stbi_uc values[256]; + stbi_uc size[257]; + unsigned int maxcode[18]; + int delta[17]; // old 'firstsymbol' - old 'firstcode' +} stbi__huffman; + +typedef struct +{ + stbi__context *s; + stbi__huffman huff_dc[4]; + stbi__huffman huff_ac[4]; + stbi__uint16 dequant[4][64]; + stbi__int16 fast_ac[4][1 << FAST_BITS]; + +// sizes for components, interleaved MCUs + int img_h_max, img_v_max; + int img_mcu_x, img_mcu_y; + int img_mcu_w, img_mcu_h; + +// definition of jpeg image component + struct + { + int id; + int h,v; + int tq; + int hd,ha; + int dc_pred; + + int x,y,w2,h2; + stbi_uc *data; + void *raw_data, *raw_coeff; + stbi_uc *linebuf; + short *coeff; // progressive only + int coeff_w, coeff_h; // number of 8x8 coefficient blocks + } img_comp[4]; + + stbi__uint32 code_buffer; // jpeg entropy-coded buffer + int code_bits; // number of valid bits + unsigned char marker; // marker seen while filling entropy buffer + int nomore; // flag if we saw a marker so must stop + + int progressive; + int spec_start; + int spec_end; + int succ_high; + int succ_low; + int eob_run; + int jfif; + int app14_color_transform; // Adobe APP14 tag + int rgb; + + int scan_n, order[4]; + int restart_interval, todo; + +// kernels + void (*idct_block_kernel)(stbi_uc *out, int out_stride, short data[64]); + void (*YCbCr_to_RGB_kernel)(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step); + stbi_uc *(*resample_row_hv_2_kernel)(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs); +} stbi__jpeg; + +static int stbi__build_huffman(stbi__huffman *h, int *count) +{ + int i,j,k=0; + unsigned int code; + // build size list for each symbol (from JPEG spec) + for (i=0; i < 16; ++i) + for (j=0; j < count[i]; ++j) + h->size[k++] = (stbi_uc) (i+1); + h->size[k] = 0; + + // compute actual symbols (from jpeg spec) + code = 0; + k = 0; + for(j=1; j <= 16; ++j) { + // compute delta to add to code to compute symbol id + h->delta[j] = k - code; + if (h->size[k] == j) { + while (h->size[k] == j) + h->code[k++] = (stbi__uint16) (code++); + if (code-1 >= (1u << j)) return stbi__err("bad code lengths","Corrupt JPEG"); + } + // compute largest code + 1 for this size, preshifted as needed later + h->maxcode[j] = code << (16-j); + code <<= 1; + } + h->maxcode[j] = 0xffffffff; + + // build non-spec acceleration table; 255 is flag for not-accelerated + memset(h->fast, 255, 1 << FAST_BITS); + for (i=0; i < k; ++i) { + int s = h->size[i]; + if (s <= FAST_BITS) { + int c = h->code[i] << (FAST_BITS-s); + int m = 1 << (FAST_BITS-s); + for (j=0; j < m; ++j) { + h->fast[c+j] = (stbi_uc) i; + } + } + } + return 1; +} + +// build a table that decodes both magnitude and value of small ACs in +// one go. 
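+// each table entry packs value*256 + run*16 + (code length + magnitude bits),
+// so a consumer splits an entry r apart again with shifts and masks, e.g.:
+//    run = (r >> 4) & 15;  combined_bits = r & 15;  value = r >> 8;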
+static void stbi__build_fast_ac(stbi__int16 *fast_ac, stbi__huffman *h) +{ + int i; + for (i=0; i < (1 << FAST_BITS); ++i) { + stbi_uc fast = h->fast[i]; + fast_ac[i] = 0; + if (fast < 255) { + int rs = h->values[fast]; + int run = (rs >> 4) & 15; + int magbits = rs & 15; + int len = h->size[fast]; + + if (magbits && len + magbits <= FAST_BITS) { + // magnitude code followed by receive_extend code + int k = ((i << len) & ((1 << FAST_BITS) - 1)) >> (FAST_BITS - magbits); + int m = 1 << (magbits - 1); + if (k < m) k += (~0U << magbits) + 1; + // if the result is small enough, we can fit it in fast_ac table + if (k >= -128 && k <= 127) + fast_ac[i] = (stbi__int16) ((k * 256) + (run * 16) + (len + magbits)); + } + } + } +} + +static void stbi__grow_buffer_unsafe(stbi__jpeg *j) +{ + do { + unsigned int b = j->nomore ? 0 : stbi__get8(j->s); + if (b == 0xff) { + int c = stbi__get8(j->s); + while (c == 0xff) c = stbi__get8(j->s); // consume fill bytes + if (c != 0) { + j->marker = (unsigned char) c; + j->nomore = 1; + return; + } + } + j->code_buffer |= b << (24 - j->code_bits); + j->code_bits += 8; + } while (j->code_bits <= 24); +} + +// (1 << n) - 1 +static const stbi__uint32 stbi__bmask[17]={0,1,3,7,15,31,63,127,255,511,1023,2047,4095,8191,16383,32767,65535}; + +// decode a jpeg huffman value from the bitstream +stbi_inline static int stbi__jpeg_huff_decode(stbi__jpeg *j, stbi__huffman *h) +{ + unsigned int temp; + int c,k; + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + // look at the top FAST_BITS and determine what symbol ID it is, + // if the code is <= FAST_BITS + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + k = h->fast[c]; + if (k < 255) { + int s = h->size[k]; + if (s > j->code_bits) + return -1; + j->code_buffer <<= s; + j->code_bits -= s; + return h->values[k]; + } + + // naive test is to shift the code_buffer down so k bits are + // valid, then test against maxcode. To speed this up, we've + // preshifted maxcode left so that it has (16-k) 0s at the + // end; in other words, regardless of the number of bits, it + // wants to be compared against something shifted to have 16; + // that way we don't need to shift inside the loop. + temp = j->code_buffer >> 16; + for (k=FAST_BITS+1 ; ; ++k) + if (temp < h->maxcode[k]) + break; + if (k == 17) { + // error! 
code not found
+      j->code_bits -= 16;
+      return -1;
+   }
+
+   if (k > j->code_bits)
+      return -1;
+
+   // convert the huffman code to the symbol id
+   c = ((j->code_buffer >> (32 - k)) & stbi__bmask[k]) + h->delta[k];
+   STBI_ASSERT((((j->code_buffer) >> (32 - h->size[c])) & stbi__bmask[h->size[c]]) == h->code[c]);
+
+   // convert the id to a symbol
+   j->code_bits -= k;
+   j->code_buffer <<= k;
+   return h->values[c];
+}
+
+// bias[n] = (-1<<n) + 1
+static const int stbi__jbias[16] = {0,-1,-3,-7,-15,-31,-63,-127,-255,-511,-1023,-2047,-4095,-8191,-16383,-32767};
+
+// combined JPEG 'receive' and JPEG 'extend', since baseline
+// always extends everything it receives.
+stbi_inline static int stbi__extend_receive(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   int sgn;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+
+   sgn = j->code_buffer >> 31; // sign bit always in MSB; 0 if MSB clear (positive), 1 if MSB set (negative)
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k + (stbi__jbias[n] & (sgn - 1));
+}
+
+// get some unsigned bits
+stbi_inline static int stbi__jpeg_get_bits(stbi__jpeg *j, int n)
+{
+   unsigned int k;
+   if (j->code_bits < n) stbi__grow_buffer_unsafe(j);
+   k = stbi_lrot(j->code_buffer, n);
+   j->code_buffer = k & ~stbi__bmask[n];
+   k &= stbi__bmask[n];
+   j->code_bits -= n;
+   return k;
+}
+
+stbi_inline static int stbi__jpeg_get_bit(stbi__jpeg *j)
+{
+   unsigned int k;
+   if (j->code_bits < 1) stbi__grow_buffer_unsafe(j);
+   k = j->code_buffer;
+   j->code_buffer <<= 1;
+   --j->code_bits;
+   return k & 0x80000000;
+}
+
+// given a value that's at position X in the zigzag stream,
+// where does it appear in the 8x8 matrix coded as row-major?
+static const stbi_uc stbi__jpeg_dezigzag[64+15] =
+{
+    0,  1,  8, 16,  9,  2,  3, 10,
+   17, 24, 32, 25, 18, 11,  4,  5,
+   12, 19, 26, 33, 40, 48, 41, 34,
+   27, 20, 13,  6,  7, 14, 21, 28,
+   35, 42, 49, 56, 57, 50, 43, 36,
+   29, 22, 15, 23, 30, 37, 44, 51,
+   58, 59, 52, 45, 38, 31, 39, 46,
+   53, 60, 61, 54, 47, 55, 62, 63,
+   // let corrupt input sample past end
+   63, 63, 63, 63, 63, 63, 63, 63,
+   63, 63, 63, 63, 63, 63, 63
+};
+
+// decode one 64-entry block--
+static int stbi__jpeg_decode_block(stbi__jpeg *j, short data[64], stbi__huffman *hdc, stbi__huffman *hac, stbi__int16 *fac, int b, stbi__uint16 *dequant)
+{
+   int diff,dc,k;
+   int t;
+
+   if (j->code_bits < 16) stbi__grow_buffer_unsafe(j);
+   t = stbi__jpeg_huff_decode(j, hdc);
+   if (t < 0 || t > 15) return stbi__err("bad huffman code","Corrupt JPEG");
+
+   // 0 all the ac values now so we can do it 32-bits at a time
+   memset(data,0,64*sizeof(data[0]));
+
+   diff = t ?
stbi__extend_receive(j, t) : 0; + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short) (dc * dequant[0]); + + // decode AC components, see JPEG spec + k = 1; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + j->code_buffer <<= s; + j->code_bits -= s; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * dequant[zig]); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (rs != 0xf0) break; // end block + k += 16; + } else { + k += r; + // decode into unzigzag'd location + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * dequant[zig]); + } + } + } while (k < 64); + return 1; +} + +static int stbi__jpeg_decode_block_prog_dc(stbi__jpeg *j, short data[64], stbi__huffman *hdc, int b) +{ + int diff,dc; + int t; + if (j->spec_end != 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + + if (j->succ_high == 0) { + // first scan for DC coefficient, must be first + memset(data,0,64*sizeof(data[0])); // 0 all the ac values now + t = stbi__jpeg_huff_decode(j, hdc); + if (t < 0 || t > 15) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + diff = t ? stbi__extend_receive(j, t) : 0; + + dc = j->img_comp[b].dc_pred + diff; + j->img_comp[b].dc_pred = dc; + data[0] = (short) (dc * (1 << j->succ_low)); + } else { + // refinement scan for DC coefficient + if (stbi__jpeg_get_bit(j)) + data[0] += (short) (1 << j->succ_low); + } + return 1; +} + +// @OPTIMIZE: store non-zigzagged during the decode passes, +// and only de-zigzag when dequantizing +static int stbi__jpeg_decode_block_prog_ac(stbi__jpeg *j, short data[64], stbi__huffman *hac, stbi__int16 *fac) +{ + int k; + if (j->spec_start == 0) return stbi__err("can't merge dc and ac", "Corrupt JPEG"); + + if (j->succ_high == 0) { + int shift = j->succ_low; + + if (j->eob_run) { + --j->eob_run; + return 1; + } + + k = j->spec_start; + do { + unsigned int zig; + int c,r,s; + if (j->code_bits < 16) stbi__grow_buffer_unsafe(j); + c = (j->code_buffer >> (32 - FAST_BITS)) & ((1 << FAST_BITS)-1); + r = fac[c]; + if (r) { // fast-AC path + k += (r >> 4) & 15; // run + s = r & 15; // combined length + j->code_buffer <<= s; + j->code_bits -= s; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) ((r >> 8) * (1 << shift)); + } else { + int rs = stbi__jpeg_huff_decode(j, hac); + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r); + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + --j->eob_run; + break; + } + k += 16; + } else { + k += r; + zig = stbi__jpeg_dezigzag[k++]; + data[zig] = (short) (stbi__extend_receive(j,s) * (1 << shift)); + } + } + } while (k <= j->spec_end); + } else { + // refinement scan for these AC coefficients + + short bit = (short) (1 << j->succ_low); + + if (j->eob_run) { + --j->eob_run; + for (k = j->spec_start; k <= j->spec_end; ++k) { + short *p = &data[stbi__jpeg_dezigzag[k]]; + if (*p != 0) + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } + } else { + k = 
j->spec_start; + do { + int r,s; + int rs = stbi__jpeg_huff_decode(j, hac); // @OPTIMIZE see if we can use the fast path here, advance-by-r is so slow, eh + if (rs < 0) return stbi__err("bad huffman code","Corrupt JPEG"); + s = rs & 15; + r = rs >> 4; + if (s == 0) { + if (r < 15) { + j->eob_run = (1 << r) - 1; + if (r) + j->eob_run += stbi__jpeg_get_bits(j, r); + r = 64; // force end of block + } else { + // r=15 s=0 should write 16 0s, so we just do + // a run of 15 0s and then write s (which is 0), + // so we don't have to do anything special here + } + } else { + if (s != 1) return stbi__err("bad huffman code", "Corrupt JPEG"); + // sign bit + if (stbi__jpeg_get_bit(j)) + s = bit; + else + s = -bit; + } + + // advance by r + while (k <= j->spec_end) { + short *p = &data[stbi__jpeg_dezigzag[k++]]; + if (*p != 0) { + if (stbi__jpeg_get_bit(j)) + if ((*p & bit)==0) { + if (*p > 0) + *p += bit; + else + *p -= bit; + } + } else { + if (r == 0) { + *p = (short) s; + break; + } + --r; + } + } + } while (k <= j->spec_end); + } + } + return 1; +} + +// take a -128..127 value and stbi__clamp it and convert to 0..255 +stbi_inline static stbi_uc stbi__clamp(int x) +{ + // trick to use a single test to catch both cases + if ((unsigned int) x > 255) { + if (x < 0) return 0; + if (x > 255) return 255; + } + return (stbi_uc) x; +} + +#define stbi__f2f(x) ((int) (((x) * 4096 + 0.5))) +#define stbi__fsh(x) ((x) * 4096) + +// derived from jidctint -- DCT_ISLOW +#define STBI__IDCT_1D(s0,s1,s2,s3,s4,s5,s6,s7) \ + int t0,t1,t2,t3,p1,p2,p3,p4,p5,x0,x1,x2,x3; \ + p2 = s2; \ + p3 = s6; \ + p1 = (p2+p3) * stbi__f2f(0.5411961f); \ + t2 = p1 + p3*stbi__f2f(-1.847759065f); \ + t3 = p1 + p2*stbi__f2f( 0.765366865f); \ + p2 = s0; \ + p3 = s4; \ + t0 = stbi__fsh(p2+p3); \ + t1 = stbi__fsh(p2-p3); \ + x0 = t0+t3; \ + x3 = t0-t3; \ + x1 = t1+t2; \ + x2 = t1-t2; \ + t0 = s7; \ + t1 = s5; \ + t2 = s3; \ + t3 = s1; \ + p3 = t0+t2; \ + p4 = t1+t3; \ + p1 = t0+t3; \ + p2 = t1+t2; \ + p5 = (p3+p4)*stbi__f2f( 1.175875602f); \ + t0 = t0*stbi__f2f( 0.298631336f); \ + t1 = t1*stbi__f2f( 2.053119869f); \ + t2 = t2*stbi__f2f( 3.072711026f); \ + t3 = t3*stbi__f2f( 1.501321110f); \ + p1 = p5 + p1*stbi__f2f(-0.899976223f); \ + p2 = p5 + p2*stbi__f2f(-2.562915447f); \ + p3 = p3*stbi__f2f(-1.961570560f); \ + p4 = p4*stbi__f2f(-0.390180644f); \ + t3 += p1+p4; \ + t2 += p2+p3; \ + t1 += p2+p4; \ + t0 += p1+p3; + +static void stbi__idct_block(stbi_uc *out, int out_stride, short data[64]) +{ + int i,val[64],*v=val; + stbi_uc *o; + short *d = data; + + // columns + for (i=0; i < 8; ++i,++d, ++v) { + // if all zeroes, shortcut -- this avoids dequantizing 0s and IDCTing + if (d[ 8]==0 && d[16]==0 && d[24]==0 && d[32]==0 + && d[40]==0 && d[48]==0 && d[56]==0) { + // no shortcut 0 seconds + // (1|2|3|4|5|6|7)==0 0 seconds + // all separate -0.047 seconds + // 1 && 2|3 && 4|5 && 6|7: -0.047 seconds + int dcterm = d[0]*4; + v[0] = v[8] = v[16] = v[24] = v[32] = v[40] = v[48] = v[56] = dcterm; + } else { + STBI__IDCT_1D(d[ 0],d[ 8],d[16],d[24],d[32],d[40],d[48],d[56]) + // constants scaled things up by 1<<12; let's bring them back + // down, but keep 2 extra bits of precision + x0 += 512; x1 += 512; x2 += 512; x3 += 512; + v[ 0] = (x0+t3) >> 10; + v[56] = (x0-t3) >> 10; + v[ 8] = (x1+t2) >> 10; + v[48] = (x1-t2) >> 10; + v[16] = (x2+t1) >> 10; + v[40] = (x2-t1) >> 10; + v[24] = (x3+t0) >> 10; + v[32] = (x3-t0) >> 10; + } + } + + for (i=0, v=val, o=out; i < 8; ++i,v+=8,o+=out_stride) { + // no fast case since the first 1D IDCT spread components out 
+ STBI__IDCT_1D(v[0],v[1],v[2],v[3],v[4],v[5],v[6],v[7]) + // constants scaled things up by 1<<12, plus we had 1<<2 from first + // loop, plus horizontal and vertical each scale by sqrt(8) so together + // we've got an extra 1<<3, so 1<<17 total we need to remove. + // so we want to round that, which means adding 0.5 * 1<<17, + // aka 65536. Also, we'll end up with -128 to 127 that we want + // to encode as 0..255 by adding 128, so we'll add that before the shift + x0 += 65536 + (128<<17); + x1 += 65536 + (128<<17); + x2 += 65536 + (128<<17); + x3 += 65536 + (128<<17); + // tried computing the shifts into temps, or'ing the temps to see + // if any were out of range, but that was slower + o[0] = stbi__clamp((x0+t3) >> 17); + o[7] = stbi__clamp((x0-t3) >> 17); + o[1] = stbi__clamp((x1+t2) >> 17); + o[6] = stbi__clamp((x1-t2) >> 17); + o[2] = stbi__clamp((x2+t1) >> 17); + o[5] = stbi__clamp((x2-t1) >> 17); + o[3] = stbi__clamp((x3+t0) >> 17); + o[4] = stbi__clamp((x3-t0) >> 17); + } +} + +#ifdef STBI_SSE2 +// sse2 integer IDCT. not the fastest possible implementation but it +// produces bit-identical results to the generic C version so it's +// fully "transparent". +static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + // This is constructed to match our regular (generic) integer IDCT exactly. + __m128i row0, row1, row2, row3, row4, row5, row6, row7; + __m128i tmp; + + // dot product constant: even elems=x, odd elems=y + #define dct_const(x,y) _mm_setr_epi16((x),(y),(x),(y),(x),(y),(x),(y)) + + // out(0) = c0[even]*x + c0[odd]*y (c0, x, y 16-bit, out 32-bit) + // out(1) = c1[even]*x + c1[odd]*y + #define dct_rot(out0,out1, x,y,c0,c1) \ + __m128i c0##lo = _mm_unpacklo_epi16((x),(y)); \ + __m128i c0##hi = _mm_unpackhi_epi16((x),(y)); \ + __m128i out0##_l = _mm_madd_epi16(c0##lo, c0); \ + __m128i out0##_h = _mm_madd_epi16(c0##hi, c0); \ + __m128i out1##_l = _mm_madd_epi16(c0##lo, c1); \ + __m128i out1##_h = _mm_madd_epi16(c0##hi, c1) + + // out = in << 12 (in 16-bit, out 32-bit) + #define dct_widen(out, in) \ + __m128i out##_l = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), (in)), 4); \ + __m128i out##_h = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), (in)), 4) + + // wide add + #define dct_wadd(out, a, b) \ + __m128i out##_l = _mm_add_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_add_epi32(a##_h, b##_h) + + // wide sub + #define dct_wsub(out, a, b) \ + __m128i out##_l = _mm_sub_epi32(a##_l, b##_l); \ + __m128i out##_h = _mm_sub_epi32(a##_h, b##_h) + + // butterfly a/b, add bias, then shift by "s" and pack + #define dct_bfly32o(out0, out1, a,b,bias,s) \ + { \ + __m128i abiased_l = _mm_add_epi32(a##_l, bias); \ + __m128i abiased_h = _mm_add_epi32(a##_h, bias); \ + dct_wadd(sum, abiased, b); \ + dct_wsub(dif, abiased, b); \ + out0 = _mm_packs_epi32(_mm_srai_epi32(sum_l, s), _mm_srai_epi32(sum_h, s)); \ + out1 = _mm_packs_epi32(_mm_srai_epi32(dif_l, s), _mm_srai_epi32(dif_h, s)); \ + } + + // 8-bit interleave step (for transposes) + #define dct_interleave8(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi8(a, b); \ + b = _mm_unpackhi_epi8(tmp, b) + + // 16-bit interleave step (for transposes) + #define dct_interleave16(a, b) \ + tmp = a; \ + a = _mm_unpacklo_epi16(a, b); \ + b = _mm_unpackhi_epi16(tmp, b) + + #define dct_pass(bias,shift) \ + { \ + /* even part */ \ + dct_rot(t2e,t3e, row2,row6, rot0_0,rot0_1); \ + __m128i sum04 = _mm_add_epi16(row0, row4); \ + __m128i dif04 = _mm_sub_epi16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + 
dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + dct_rot(y0o,y2o, row7,row3, rot2_0,rot2_1); \ + dct_rot(y1o,y3o, row5,row1, rot3_0,rot3_1); \ + __m128i sum17 = _mm_add_epi16(row1, row7); \ + __m128i sum35 = _mm_add_epi16(row3, row5); \ + dct_rot(y4o,y5o, sum17,sum35, rot1_0,rot1_1); \ + dct_wadd(x4, y0o, y4o); \ + dct_wadd(x5, y1o, y5o); \ + dct_wadd(x6, y2o, y5o); \ + dct_wadd(x7, y3o, y4o); \ + dct_bfly32o(row0,row7, x0,x7,bias,shift); \ + dct_bfly32o(row1,row6, x1,x6,bias,shift); \ + dct_bfly32o(row2,row5, x2,x5,bias,shift); \ + dct_bfly32o(row3,row4, x3,x4,bias,shift); \ + } + + __m128i rot0_0 = dct_const(stbi__f2f(0.5411961f), stbi__f2f(0.5411961f) + stbi__f2f(-1.847759065f)); + __m128i rot0_1 = dct_const(stbi__f2f(0.5411961f) + stbi__f2f( 0.765366865f), stbi__f2f(0.5411961f)); + __m128i rot1_0 = dct_const(stbi__f2f(1.175875602f) + stbi__f2f(-0.899976223f), stbi__f2f(1.175875602f)); + __m128i rot1_1 = dct_const(stbi__f2f(1.175875602f), stbi__f2f(1.175875602f) + stbi__f2f(-2.562915447f)); + __m128i rot2_0 = dct_const(stbi__f2f(-1.961570560f) + stbi__f2f( 0.298631336f), stbi__f2f(-1.961570560f)); + __m128i rot2_1 = dct_const(stbi__f2f(-1.961570560f), stbi__f2f(-1.961570560f) + stbi__f2f( 3.072711026f)); + __m128i rot3_0 = dct_const(stbi__f2f(-0.390180644f) + stbi__f2f( 2.053119869f), stbi__f2f(-0.390180644f)); + __m128i rot3_1 = dct_const(stbi__f2f(-0.390180644f), stbi__f2f(-0.390180644f) + stbi__f2f( 1.501321110f)); + + // rounding biases in column/row passes, see stbi__idct_block for explanation. + __m128i bias_0 = _mm_set1_epi32(512); + __m128i bias_1 = _mm_set1_epi32(65536 + (128<<17)); + + // load + row0 = _mm_load_si128((const __m128i *) (data + 0*8)); + row1 = _mm_load_si128((const __m128i *) (data + 1*8)); + row2 = _mm_load_si128((const __m128i *) (data + 2*8)); + row3 = _mm_load_si128((const __m128i *) (data + 3*8)); + row4 = _mm_load_si128((const __m128i *) (data + 4*8)); + row5 = _mm_load_si128((const __m128i *) (data + 5*8)); + row6 = _mm_load_si128((const __m128i *) (data + 6*8)); + row7 = _mm_load_si128((const __m128i *) (data + 7*8)); + + // column pass + dct_pass(bias_0, 10); + + { + // 16bit 8x8 transpose pass 1 + dct_interleave16(row0, row4); + dct_interleave16(row1, row5); + dct_interleave16(row2, row6); + dct_interleave16(row3, row7); + + // transpose pass 2 + dct_interleave16(row0, row2); + dct_interleave16(row1, row3); + dct_interleave16(row4, row6); + dct_interleave16(row5, row7); + + // transpose pass 3 + dct_interleave16(row0, row1); + dct_interleave16(row2, row3); + dct_interleave16(row4, row5); + dct_interleave16(row6, row7); + } + + // row pass + dct_pass(bias_1, 17); + + { + // pack + __m128i p0 = _mm_packus_epi16(row0, row1); // a0a1a2a3...a7b0b1b2b3...b7 + __m128i p1 = _mm_packus_epi16(row2, row3); + __m128i p2 = _mm_packus_epi16(row4, row5); + __m128i p3 = _mm_packus_epi16(row6, row7); + + // 8bit 8x8 transpose pass 1 + dct_interleave8(p0, p2); // a0e0a1e1... + dct_interleave8(p1, p3); // c0g0c1g1... + + // transpose pass 2 + dct_interleave8(p0, p1); // a0c0e0g0... + dct_interleave8(p2, p3); // b0d0f0h0... + + // transpose pass 3 + dct_interleave8(p0, p2); // a0b0c0d0... + dct_interleave8(p1, p3); // a4b4c4d4... 
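+ // each 128-bit register now holds two 8-byte output rows; storel writes
+ // the low row, and _mm_shuffle_epi32(x, 0x4e) swaps the 64-bit halves so
+ // the high row can be written with the same store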
+ + // store + _mm_storel_epi64((__m128i *) out, p0); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p0, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p2); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p2, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p1); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p1, 0x4e)); out += out_stride; + _mm_storel_epi64((__m128i *) out, p3); out += out_stride; + _mm_storel_epi64((__m128i *) out, _mm_shuffle_epi32(p3, 0x4e)); + } + +#undef dct_const +#undef dct_rot +#undef dct_widen +#undef dct_wadd +#undef dct_wsub +#undef dct_bfly32o +#undef dct_interleave8 +#undef dct_interleave16 +#undef dct_pass +} + +#endif // STBI_SSE2 + +#ifdef STBI_NEON + +// NEON integer IDCT. should produce bit-identical +// results to the generic C version. +static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64]) +{ + int16x8_t row0, row1, row2, row3, row4, row5, row6, row7; + + int16x4_t rot0_0 = vdup_n_s16(stbi__f2f(0.5411961f)); + int16x4_t rot0_1 = vdup_n_s16(stbi__f2f(-1.847759065f)); + int16x4_t rot0_2 = vdup_n_s16(stbi__f2f( 0.765366865f)); + int16x4_t rot1_0 = vdup_n_s16(stbi__f2f( 1.175875602f)); + int16x4_t rot1_1 = vdup_n_s16(stbi__f2f(-0.899976223f)); + int16x4_t rot1_2 = vdup_n_s16(stbi__f2f(-2.562915447f)); + int16x4_t rot2_0 = vdup_n_s16(stbi__f2f(-1.961570560f)); + int16x4_t rot2_1 = vdup_n_s16(stbi__f2f(-0.390180644f)); + int16x4_t rot3_0 = vdup_n_s16(stbi__f2f( 0.298631336f)); + int16x4_t rot3_1 = vdup_n_s16(stbi__f2f( 2.053119869f)); + int16x4_t rot3_2 = vdup_n_s16(stbi__f2f( 3.072711026f)); + int16x4_t rot3_3 = vdup_n_s16(stbi__f2f( 1.501321110f)); + +#define dct_long_mul(out, inq, coeff) \ + int32x4_t out##_l = vmull_s16(vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmull_s16(vget_high_s16(inq), coeff) + +#define dct_long_mac(out, acc, inq, coeff) \ + int32x4_t out##_l = vmlal_s16(acc##_l, vget_low_s16(inq), coeff); \ + int32x4_t out##_h = vmlal_s16(acc##_h, vget_high_s16(inq), coeff) + +#define dct_widen(out, inq) \ + int32x4_t out##_l = vshll_n_s16(vget_low_s16(inq), 12); \ + int32x4_t out##_h = vshll_n_s16(vget_high_s16(inq), 12) + +// wide add +#define dct_wadd(out, a, b) \ + int32x4_t out##_l = vaddq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vaddq_s32(a##_h, b##_h) + +// wide sub +#define dct_wsub(out, a, b) \ + int32x4_t out##_l = vsubq_s32(a##_l, b##_l); \ + int32x4_t out##_h = vsubq_s32(a##_h, b##_h) + +// butterfly a/b, then shift using "shiftop" by "s" and pack +#define dct_bfly32o(out0,out1, a,b,shiftop,s) \ + { \ + dct_wadd(sum, a, b); \ + dct_wsub(dif, a, b); \ + out0 = vcombine_s16(shiftop(sum_l, s), shiftop(sum_h, s)); \ + out1 = vcombine_s16(shiftop(dif_l, s), shiftop(dif_h, s)); \ + } + +#define dct_pass(shiftop, shift) \ + { \ + /* even part */ \ + int16x8_t sum26 = vaddq_s16(row2, row6); \ + dct_long_mul(p1e, sum26, rot0_0); \ + dct_long_mac(t2e, p1e, row6, rot0_1); \ + dct_long_mac(t3e, p1e, row2, rot0_2); \ + int16x8_t sum04 = vaddq_s16(row0, row4); \ + int16x8_t dif04 = vsubq_s16(row0, row4); \ + dct_widen(t0e, sum04); \ + dct_widen(t1e, dif04); \ + dct_wadd(x0, t0e, t3e); \ + dct_wsub(x3, t0e, t3e); \ + dct_wadd(x1, t1e, t2e); \ + dct_wsub(x2, t1e, t2e); \ + /* odd part */ \ + int16x8_t sum15 = vaddq_s16(row1, row5); \ + int16x8_t sum17 = vaddq_s16(row1, row7); \ + int16x8_t sum35 = vaddq_s16(row3, row5); \ + int16x8_t sum37 = vaddq_s16(row3, row7); \ + int16x8_t sumodd = vaddq_s16(sum17, sum35); \ + 
dct_long_mul(p5o, sumodd, rot1_0); \ + dct_long_mac(p1o, p5o, sum17, rot1_1); \ + dct_long_mac(p2o, p5o, sum35, rot1_2); \ + dct_long_mul(p3o, sum37, rot2_0); \ + dct_long_mul(p4o, sum15, rot2_1); \ + dct_wadd(sump13o, p1o, p3o); \ + dct_wadd(sump24o, p2o, p4o); \ + dct_wadd(sump23o, p2o, p3o); \ + dct_wadd(sump14o, p1o, p4o); \ + dct_long_mac(x4, sump13o, row7, rot3_0); \ + dct_long_mac(x5, sump24o, row5, rot3_1); \ + dct_long_mac(x6, sump23o, row3, rot3_2); \ + dct_long_mac(x7, sump14o, row1, rot3_3); \ + dct_bfly32o(row0,row7, x0,x7,shiftop,shift); \ + dct_bfly32o(row1,row6, x1,x6,shiftop,shift); \ + dct_bfly32o(row2,row5, x2,x5,shiftop,shift); \ + dct_bfly32o(row3,row4, x3,x4,shiftop,shift); \ + } + + // load + row0 = vld1q_s16(data + 0*8); + row1 = vld1q_s16(data + 1*8); + row2 = vld1q_s16(data + 2*8); + row3 = vld1q_s16(data + 3*8); + row4 = vld1q_s16(data + 4*8); + row5 = vld1q_s16(data + 5*8); + row6 = vld1q_s16(data + 6*8); + row7 = vld1q_s16(data + 7*8); + + // add DC bias + row0 = vaddq_s16(row0, vsetq_lane_s16(1024, vdupq_n_s16(0), 0)); + + // column pass + dct_pass(vrshrn_n_s32, 10); + + // 16bit 8x8 transpose + { +// these three map to a single VTRN.16, VTRN.32, and VSWP, respectively. +// whether compilers actually get this is another story, sadly. +#define dct_trn16(x, y) { int16x8x2_t t = vtrnq_s16(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn32(x, y) { int32x4x2_t t = vtrnq_s32(vreinterpretq_s32_s16(x), vreinterpretq_s32_s16(y)); x = vreinterpretq_s16_s32(t.val[0]); y = vreinterpretq_s16_s32(t.val[1]); } +#define dct_trn64(x, y) { int16x8_t x0 = x; int16x8_t y0 = y; x = vcombine_s16(vget_low_s16(x0), vget_low_s16(y0)); y = vcombine_s16(vget_high_s16(x0), vget_high_s16(y0)); } + + // pass 1 + dct_trn16(row0, row1); // a0b0a2b2a4b4a6b6 + dct_trn16(row2, row3); + dct_trn16(row4, row5); + dct_trn16(row6, row7); + + // pass 2 + dct_trn32(row0, row2); // a0b0c0d0a4b4c4d4 + dct_trn32(row1, row3); + dct_trn32(row4, row6); + dct_trn32(row5, row7); + + // pass 3 + dct_trn64(row0, row4); // a0b0c0d0e0f0g0h0 + dct_trn64(row1, row5); + dct_trn64(row2, row6); + dct_trn64(row3, row7); + +#undef dct_trn16 +#undef dct_trn32 +#undef dct_trn64 + } + + // row pass + // vrshrn_n_s32 only supports shifts up to 16, we need + // 17. so do a non-rounding shift of 16 first then follow + // up with a rounding shift by 1. + dct_pass(vshrn_n_s32, 16); + + { + // pack and round + uint8x8_t p0 = vqrshrun_n_s16(row0, 1); + uint8x8_t p1 = vqrshrun_n_s16(row1, 1); + uint8x8_t p2 = vqrshrun_n_s16(row2, 1); + uint8x8_t p3 = vqrshrun_n_s16(row3, 1); + uint8x8_t p4 = vqrshrun_n_s16(row4, 1); + uint8x8_t p5 = vqrshrun_n_s16(row5, 1); + uint8x8_t p6 = vqrshrun_n_s16(row6, 1); + uint8x8_t p7 = vqrshrun_n_s16(row7, 1); + + // again, these can translate into one instruction, but often don't. +#define dct_trn8_8(x, y) { uint8x8x2_t t = vtrn_u8(x, y); x = t.val[0]; y = t.val[1]; } +#define dct_trn8_16(x, y) { uint16x4x2_t t = vtrn_u16(vreinterpret_u16_u8(x), vreinterpret_u16_u8(y)); x = vreinterpret_u8_u16(t.val[0]); y = vreinterpret_u8_u16(t.val[1]); } +#define dct_trn8_32(x, y) { uint32x2x2_t t = vtrn_u32(vreinterpret_u32_u8(x), vreinterpret_u32_u8(y)); x = vreinterpret_u8_u32(t.val[0]); y = vreinterpret_u8_u32(t.val[1]); } + + // sadly can't use interleaved stores here since we only write + // 8 bytes to each scan line! 
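+ // so instead: transpose the 8x8 byte block in-register (three vtrn
+ // passes at 8-, 16- and 32-bit granularity), then store one row at a time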
+
+ // 8x8 8-bit transpose pass 1
+ dct_trn8_8(p0, p1);
+ dct_trn8_8(p2, p3);
+ dct_trn8_8(p4, p5);
+ dct_trn8_8(p6, p7);
+
+ // pass 2
+ dct_trn8_16(p0, p2);
+ dct_trn8_16(p1, p3);
+ dct_trn8_16(p4, p6);
+ dct_trn8_16(p5, p7);
+
+ // pass 3
+ dct_trn8_32(p0, p4);
+ dct_trn8_32(p1, p5);
+ dct_trn8_32(p2, p6);
+ dct_trn8_32(p3, p7);
+
+ // store
+ vst1_u8(out, p0); out += out_stride;
+ vst1_u8(out, p1); out += out_stride;
+ vst1_u8(out, p2); out += out_stride;
+ vst1_u8(out, p3); out += out_stride;
+ vst1_u8(out, p4); out += out_stride;
+ vst1_u8(out, p5); out += out_stride;
+ vst1_u8(out, p6); out += out_stride;
+ vst1_u8(out, p7);
+
+#undef dct_trn8_8
+#undef dct_trn8_16
+#undef dct_trn8_32
+ }
+
+#undef dct_long_mul
+#undef dct_long_mac
+#undef dct_widen
+#undef dct_wadd
+#undef dct_wsub
+#undef dct_bfly32o
+#undef dct_pass
+}
+
+#endif // STBI_NEON
+
+#define STBI__MARKER_none 0xff
+// if there's a pending marker from the entropy stream, return that
+// otherwise, fetch from the stream and get a marker. if there's no
+// marker, return 0xff, which is never a valid marker value
+static stbi_uc stbi__get_marker(stbi__jpeg *j)
+{
+ stbi_uc x;
+ if (j->marker != STBI__MARKER_none) { x = j->marker; j->marker = STBI__MARKER_none; return x; }
+ x = stbi__get8(j->s);
+ if (x != 0xff) return STBI__MARKER_none;
+ while (x == 0xff)
+ x = stbi__get8(j->s); // consume repeated 0xff fill bytes
+ return x;
+}
+
+// in each scan, we'll have scan_n components, and the order
+// of the components is specified by order[]
+#define STBI__RESTART(x) ((x) >= 0xd0 && (x) <= 0xd7)
+
+// after a restart interval, stbi__jpeg_reset the entropy decoder and
+// the dc prediction
+static void stbi__jpeg_reset(stbi__jpeg *j)
+{
+ j->code_bits = 0;
+ j->code_buffer = 0;
+ j->nomore = 0;
+ j->img_comp[0].dc_pred = j->img_comp[1].dc_pred = j->img_comp[2].dc_pred = j->img_comp[3].dc_pred = 0;
+ j->marker = STBI__MARKER_none;
+ j->todo = j->restart_interval ? j->restart_interval : 0x7fffffff;
+ j->eob_run = 0;
+ // no more than 1<<31 MCUs if no restart_interval? that's plenty safe,
+ // since we don't even allow 1<<30 pixels
+}
+
+static int stbi__parse_entropy_coded_data(stbi__jpeg *z)
+{
+ stbi__jpeg_reset(z);
+ if (!z->progressive) {
+ if (z->scan_n == 1) {
+ int i,j;
+ STBI_SIMD_ALIGN(short, data[64]);
+ int n = z->order[0];
+ // non-interleaved data, we just need to process one block at a time,
+ // in trivial scanline order
+ // number of blocks to do just depends on how many actual "pixels" this
+ // component has, independent of interleaved MCU blocking and such
+ int w = (z->img_comp[n].x+7) >> 3;
+ int h = (z->img_comp[n].y+7) >> 3;
+ for (j=0; j < h; ++j) {
+ for (i=0; i < w; ++i) {
+ int ha = z->img_comp[n].ha;
+ if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0;
+ z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data);
+ // every data block is an MCU, so countdown the restart interval
+ if (--z->todo <= 0) {
+ if (z->code_bits < 24) stbi__grow_buffer_unsafe(z);
+ // if it's NOT a restart, then just bail, so we get corrupt data
+ // rather than no data
+ if (!STBI__RESTART(z->marker)) return 1;
+ stbi__jpeg_reset(z);
+ }
+ }
+ }
+ return 1;
+ } else { // interleaved
+ int i,j,k,x,y;
+ STBI_SIMD_ALIGN(short, data[64]);
+ for (j=0; j < z->img_mcu_y; ++j) {
+ for (i=0; i < z->img_mcu_x; ++i) {
+ // scan an interleaved mcu...
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x)*8; + int y2 = (j*z->img_comp[n].v + y)*8; + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block(z, data, z->huff_dc+z->img_comp[n].hd, z->huff_ac+ha, z->fast_ac[ha], n, z->dequant[z->img_comp[n].tq])) return 0; + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*y2+x2, z->img_comp[n].w2, data); + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } else { + if (z->scan_n == 1) { + int i,j; + int n = z->order[0]; + // non-interleaved data, we just need to process one block at a time, + // in trivial scanline order + // number of blocks to do just depends on how many actual "pixels" this + // component has, independent of interleaved MCU blocking and such + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + if (z->spec_start == 0) { + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } else { + int ha = z->img_comp[n].ha; + if (!stbi__jpeg_decode_block_prog_ac(z, data, &z->huff_ac[ha], z->fast_ac[ha])) + return 0; + } + // every data block is an MCU, so countdown the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } else { // interleaved + int i,j,k,x,y; + for (j=0; j < z->img_mcu_y; ++j) { + for (i=0; i < z->img_mcu_x; ++i) { + // scan an interleaved mcu... 
process scan_n components in order + for (k=0; k < z->scan_n; ++k) { + int n = z->order[k]; + // scan out an mcu's worth of this component; that's just determined + // by the basic H and V specified for the component + for (y=0; y < z->img_comp[n].v; ++y) { + for (x=0; x < z->img_comp[n].h; ++x) { + int x2 = (i*z->img_comp[n].h + x); + int y2 = (j*z->img_comp[n].v + y); + short *data = z->img_comp[n].coeff + 64 * (x2 + y2 * z->img_comp[n].coeff_w); + if (!stbi__jpeg_decode_block_prog_dc(z, data, &z->huff_dc[z->img_comp[n].hd], n)) + return 0; + } + } + } + // after all interleaved components, that's an interleaved MCU, + // so now count down the restart interval + if (--z->todo <= 0) { + if (z->code_bits < 24) stbi__grow_buffer_unsafe(z); + if (!STBI__RESTART(z->marker)) return 1; + stbi__jpeg_reset(z); + } + } + } + return 1; + } + } +} + +static void stbi__jpeg_dequantize(short *data, stbi__uint16 *dequant) +{ + int i; + for (i=0; i < 64; ++i) + data[i] *= dequant[i]; +} + +static void stbi__jpeg_finish(stbi__jpeg *z) +{ + if (z->progressive) { + // dequantize and idct the data + int i,j,n; + for (n=0; n < z->s->img_n; ++n) { + int w = (z->img_comp[n].x+7) >> 3; + int h = (z->img_comp[n].y+7) >> 3; + for (j=0; j < h; ++j) { + for (i=0; i < w; ++i) { + short *data = z->img_comp[n].coeff + 64 * (i + j * z->img_comp[n].coeff_w); + stbi__jpeg_dequantize(data, z->dequant[z->img_comp[n].tq]); + z->idct_block_kernel(z->img_comp[n].data+z->img_comp[n].w2*j*8+i*8, z->img_comp[n].w2, data); + } + } + } + } +} + +static int stbi__process_marker(stbi__jpeg *z, int m) +{ + int L; + switch (m) { + case STBI__MARKER_none: // no marker found + return stbi__err("expected marker","Corrupt JPEG"); + + case 0xDD: // DRI - specify restart interval + if (stbi__get16be(z->s) != 4) return stbi__err("bad DRI len","Corrupt JPEG"); + z->restart_interval = stbi__get16be(z->s); + return 1; + + case 0xDB: // DQT - define quantization table + L = stbi__get16be(z->s)-2; + while (L > 0) { + int q = stbi__get8(z->s); + int p = q >> 4, sixteen = (p != 0); + int t = q & 15,i; + if (p != 0 && p != 1) return stbi__err("bad DQT type","Corrupt JPEG"); + if (t > 3) return stbi__err("bad DQT table","Corrupt JPEG"); + + for (i=0; i < 64; ++i) + z->dequant[t][stbi__jpeg_dezigzag[i]] = (stbi__uint16)(sixteen ? stbi__get16be(z->s) : stbi__get8(z->s)); + L -= (sixteen ? 
129 : 65); + } + return L==0; + + case 0xC4: // DHT - define huffman table + L = stbi__get16be(z->s)-2; + while (L > 0) { + stbi_uc *v; + int sizes[16],i,n=0; + int q = stbi__get8(z->s); + int tc = q >> 4; + int th = q & 15; + if (tc > 1 || th > 3) return stbi__err("bad DHT header","Corrupt JPEG"); + for (i=0; i < 16; ++i) { + sizes[i] = stbi__get8(z->s); + n += sizes[i]; + } + L -= 17; + if (tc == 0) { + if (!stbi__build_huffman(z->huff_dc+th, sizes)) return 0; + v = z->huff_dc[th].values; + } else { + if (!stbi__build_huffman(z->huff_ac+th, sizes)) return 0; + v = z->huff_ac[th].values; + } + for (i=0; i < n; ++i) + v[i] = stbi__get8(z->s); + if (tc != 0) + stbi__build_fast_ac(z->fast_ac[th], z->huff_ac + th); + L -= n; + } + return L==0; + } + + // check for comment block or APP blocks + if ((m >= 0xE0 && m <= 0xEF) || m == 0xFE) { + L = stbi__get16be(z->s); + if (L < 2) { + if (m == 0xFE) + return stbi__err("bad COM len","Corrupt JPEG"); + else + return stbi__err("bad APP len","Corrupt JPEG"); + } + L -= 2; + + if (m == 0xE0 && L >= 5) { // JFIF APP0 segment + static const unsigned char tag[5] = {'J','F','I','F','\0'}; + int ok = 1; + int i; + for (i=0; i < 5; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 5; + if (ok) + z->jfif = 1; + } else if (m == 0xEE && L >= 12) { // Adobe APP14 segment + static const unsigned char tag[6] = {'A','d','o','b','e','\0'}; + int ok = 1; + int i; + for (i=0; i < 6; ++i) + if (stbi__get8(z->s) != tag[i]) + ok = 0; + L -= 6; + if (ok) { + stbi__get8(z->s); // version + stbi__get16be(z->s); // flags0 + stbi__get16be(z->s); // flags1 + z->app14_color_transform = stbi__get8(z->s); // color transform + L -= 6; + } + } + + stbi__skip(z->s, L); + return 1; + } + + return stbi__err("unknown marker","Corrupt JPEG"); +} + +// after we see SOS +static int stbi__process_scan_header(stbi__jpeg *z) +{ + int i; + int Ls = stbi__get16be(z->s); + z->scan_n = stbi__get8(z->s); + if (z->scan_n < 1 || z->scan_n > 4 || z->scan_n > (int) z->s->img_n) return stbi__err("bad SOS component count","Corrupt JPEG"); + if (Ls != 6+2*z->scan_n) return stbi__err("bad SOS len","Corrupt JPEG"); + for (i=0; i < z->scan_n; ++i) { + int id = stbi__get8(z->s), which; + int q = stbi__get8(z->s); + for (which = 0; which < z->s->img_n; ++which) + if (z->img_comp[which].id == id) + break; + if (which == z->s->img_n) return 0; // no match + z->img_comp[which].hd = q >> 4; if (z->img_comp[which].hd > 3) return stbi__err("bad DC huff","Corrupt JPEG"); + z->img_comp[which].ha = q & 15; if (z->img_comp[which].ha > 3) return stbi__err("bad AC huff","Corrupt JPEG"); + z->order[i] = which; + } + + { + int aa; + z->spec_start = stbi__get8(z->s); + z->spec_end = stbi__get8(z->s); // should be 63, but might be 0 + aa = stbi__get8(z->s); + z->succ_high = (aa >> 4); + z->succ_low = (aa & 15); + if (z->progressive) { + if (z->spec_start > 63 || z->spec_end > 63 || z->spec_start > z->spec_end || z->succ_high > 13 || z->succ_low > 13) + return stbi__err("bad SOS", "Corrupt JPEG"); + } else { + if (z->spec_start != 0) return stbi__err("bad SOS","Corrupt JPEG"); + if (z->succ_high != 0 || z->succ_low != 0) return stbi__err("bad SOS","Corrupt JPEG"); + z->spec_end = 63; + } + } + + return 1; +} + +static int stbi__free_jpeg_components(stbi__jpeg *z, int ncomp, int why) +{ + int i; + for (i=0; i < ncomp; ++i) { + if (z->img_comp[i].raw_data) { + STBI_FREE(z->img_comp[i].raw_data); + z->img_comp[i].raw_data = NULL; + z->img_comp[i].data = NULL; + } + if (z->img_comp[i].raw_coeff) { + 
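+ // raw_coeff is the unaligned allocation; coeff is just the 16-byte-aligned
+ // view into it, so only the raw pointer is freed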
STBI_FREE(z->img_comp[i].raw_coeff); + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].coeff = 0; + } + if (z->img_comp[i].linebuf) { + STBI_FREE(z->img_comp[i].linebuf); + z->img_comp[i].linebuf = NULL; + } + } + return why; +} + +static int stbi__process_frame_header(stbi__jpeg *z, int scan) +{ + stbi__context *s = z->s; + int Lf,p,i,q, h_max=1,v_max=1,c; + Lf = stbi__get16be(s); if (Lf < 11) return stbi__err("bad SOF len","Corrupt JPEG"); // JPEG + p = stbi__get8(s); if (p != 8) return stbi__err("only 8-bit","JPEG format not supported: 8-bit only"); // JPEG baseline + s->img_y = stbi__get16be(s); if (s->img_y == 0) return stbi__err("no header height", "JPEG format not supported: delayed height"); // Legal, but we don't handle it--but neither does IJG + s->img_x = stbi__get16be(s); if (s->img_x == 0) return stbi__err("0 width","Corrupt JPEG"); // JPEG requires + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + c = stbi__get8(s); + if (c != 3 && c != 1 && c != 4) return stbi__err("bad component count","Corrupt JPEG"); + s->img_n = c; + for (i=0; i < c; ++i) { + z->img_comp[i].data = NULL; + z->img_comp[i].linebuf = NULL; + } + + if (Lf != 8+3*s->img_n) return stbi__err("bad SOF len","Corrupt JPEG"); + + z->rgb = 0; + for (i=0; i < s->img_n; ++i) { + static const unsigned char rgb[3] = { 'R', 'G', 'B' }; + z->img_comp[i].id = stbi__get8(s); + if (s->img_n == 3 && z->img_comp[i].id == rgb[i]) + ++z->rgb; + q = stbi__get8(s); + z->img_comp[i].h = (q >> 4); if (!z->img_comp[i].h || z->img_comp[i].h > 4) return stbi__err("bad H","Corrupt JPEG"); + z->img_comp[i].v = q & 15; if (!z->img_comp[i].v || z->img_comp[i].v > 4) return stbi__err("bad V","Corrupt JPEG"); + z->img_comp[i].tq = stbi__get8(s); if (z->img_comp[i].tq > 3) return stbi__err("bad TQ","Corrupt JPEG"); + } + + if (scan != STBI__SCAN_load) return 1; + + if (!stbi__mad3sizes_valid(s->img_x, s->img_y, s->img_n, 0)) return stbi__err("too large", "Image too large to decode"); + + for (i=0; i < s->img_n; ++i) { + if (z->img_comp[i].h > h_max) h_max = z->img_comp[i].h; + if (z->img_comp[i].v > v_max) v_max = z->img_comp[i].v; + } + + // compute interleaved mcu info + z->img_h_max = h_max; + z->img_v_max = v_max; + z->img_mcu_w = h_max * 8; + z->img_mcu_h = v_max * 8; + // these sizes can't be more than 17 bits + z->img_mcu_x = (s->img_x + z->img_mcu_w-1) / z->img_mcu_w; + z->img_mcu_y = (s->img_y + z->img_mcu_h-1) / z->img_mcu_h; + + for (i=0; i < s->img_n; ++i) { + // number of effective pixels (e.g. for non-interleaved MCU) + z->img_comp[i].x = (s->img_x * z->img_comp[i].h + h_max-1) / h_max; + z->img_comp[i].y = (s->img_y * z->img_comp[i].v + v_max-1) / v_max; + // to simplify generation, we'll allocate enough memory to decode + // the bogus oversized data from using interleaved MCUs and their + // big blocks (e.g. 
a 16x16 iMCU on an image of width 33); we won't + // discard the extra data until colorspace conversion + // + // img_mcu_x, img_mcu_y: <=17 bits; comp[i].h and .v are <=4 (checked earlier) + // so these muls can't overflow with 32-bit ints (which we require) + z->img_comp[i].w2 = z->img_mcu_x * z->img_comp[i].h * 8; + z->img_comp[i].h2 = z->img_mcu_y * z->img_comp[i].v * 8; + z->img_comp[i].coeff = 0; + z->img_comp[i].raw_coeff = 0; + z->img_comp[i].linebuf = NULL; + z->img_comp[i].raw_data = stbi__malloc_mad2(z->img_comp[i].w2, z->img_comp[i].h2, 15); + if (z->img_comp[i].raw_data == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + // align blocks for idct using mmx/sse + z->img_comp[i].data = (stbi_uc*) (((size_t) z->img_comp[i].raw_data + 15) & ~15); + if (z->progressive) { + // w2, h2 are multiples of 8 (see above) + z->img_comp[i].coeff_w = z->img_comp[i].w2 / 8; + z->img_comp[i].coeff_h = z->img_comp[i].h2 / 8; + z->img_comp[i].raw_coeff = stbi__malloc_mad3(z->img_comp[i].w2, z->img_comp[i].h2, sizeof(short), 15); + if (z->img_comp[i].raw_coeff == NULL) + return stbi__free_jpeg_components(z, i+1, stbi__err("outofmem", "Out of memory")); + z->img_comp[i].coeff = (short*) (((size_t) z->img_comp[i].raw_coeff + 15) & ~15); + } + } + + return 1; +} + +// use comparisons since in some cases we handle more than one case (e.g. SOF) +#define stbi__DNL(x) ((x) == 0xdc) +#define stbi__SOI(x) ((x) == 0xd8) +#define stbi__EOI(x) ((x) == 0xd9) +#define stbi__SOF(x) ((x) == 0xc0 || (x) == 0xc1 || (x) == 0xc2) +#define stbi__SOS(x) ((x) == 0xda) + +#define stbi__SOF_progressive(x) ((x) == 0xc2) + +static int stbi__decode_jpeg_header(stbi__jpeg *z, int scan) +{ + int m; + z->jfif = 0; + z->app14_color_transform = -1; // valid values are 0,1,2 + z->marker = STBI__MARKER_none; // initialize cached marker to empty + m = stbi__get_marker(z); + if (!stbi__SOI(m)) return stbi__err("no SOI","Corrupt JPEG"); + if (scan == STBI__SCAN_type) return 1; + m = stbi__get_marker(z); + while (!stbi__SOF(m)) { + if (!stbi__process_marker(z,m)) return 0; + m = stbi__get_marker(z); + while (m == STBI__MARKER_none) { + // some files have extra padding after their blocks, so ok, we'll scan + if (stbi__at_eof(z->s)) return stbi__err("no SOF", "Corrupt JPEG"); + m = stbi__get_marker(z); + } + } + z->progressive = stbi__SOF_progressive(m); + if (!stbi__process_frame_header(z, scan)) return 0; + return 1; +} + +// decode image to YCbCr format +static int stbi__decode_jpeg_image(stbi__jpeg *j) +{ + int m; + for (m = 0; m < 4; m++) { + j->img_comp[m].raw_data = NULL; + j->img_comp[m].raw_coeff = NULL; + } + j->restart_interval = 0; + if (!stbi__decode_jpeg_header(j, STBI__SCAN_load)) return 0; + m = stbi__get_marker(j); + while (!stbi__EOI(m)) { + if (stbi__SOS(m)) { + if (!stbi__process_scan_header(j)) return 0; + if (!stbi__parse_entropy_coded_data(j)) return 0; + if (j->marker == STBI__MARKER_none ) { + // handle 0s at the end of image data from IP Kamera 9060 + while (!stbi__at_eof(j->s)) { + int x = stbi__get8(j->s); + if (x == 255) { + j->marker = stbi__get8(j->s); + break; + } + } + // if we reach eof without hitting a marker, stbi__get_marker() below will fail and we'll eventually return 0 + } + } else if (stbi__DNL(m)) { + int Ld = stbi__get16be(j->s); + stbi__uint32 NL = stbi__get16be(j->s); + if (Ld != 4) return stbi__err("bad DNL len", "Corrupt JPEG"); + if (NL != j->s->img_y) return stbi__err("bad DNL height", "Corrupt JPEG"); + } else { + if (!stbi__process_marker(j, 
m)) return 0; + } + m = stbi__get_marker(j); + } + if (j->progressive) + stbi__jpeg_finish(j); + return 1; +} + +// static jfif-centered resampling (across block boundaries) + +typedef stbi_uc *(*resample_row_func)(stbi_uc *out, stbi_uc *in0, stbi_uc *in1, + int w, int hs); + +#define stbi__div4(x) ((stbi_uc) ((x) >> 2)) + +static stbi_uc *resample_row_1(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + STBI_NOTUSED(out); + STBI_NOTUSED(in_far); + STBI_NOTUSED(w); + STBI_NOTUSED(hs); + return in_near; +} + +static stbi_uc* stbi__resample_row_v_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples vertically for every one in input + int i; + STBI_NOTUSED(hs); + for (i=0; i < w; ++i) + out[i] = stbi__div4(3*in_near[i] + in_far[i] + 2); + return out; +} + +static stbi_uc* stbi__resample_row_h_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate two samples horizontally for every one in input + int i; + stbi_uc *input = in_near; + + if (w == 1) { + // if only one sample, can't do any interpolation + out[0] = out[1] = input[0]; + return out; + } + + out[0] = input[0]; + out[1] = stbi__div4(input[0]*3 + input[1] + 2); + for (i=1; i < w-1; ++i) { + int n = 3*input[i]+2; + out[i*2+0] = stbi__div4(n+input[i-1]); + out[i*2+1] = stbi__div4(n+input[i+1]); + } + out[i*2+0] = stbi__div4(input[w-2]*3 + input[w-1] + 2); + out[i*2+1] = input[w-1]; + + STBI_NOTUSED(in_far); + STBI_NOTUSED(hs); + + return out; +} + +#define stbi__div16(x) ((stbi_uc) ((x) >> 4)) + +static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i,t0,t1; + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + out[0] = stbi__div4(t1+2); + for (i=1; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // need to generate 2x2 samples for every one in input + int i=0,t0,t1; + + if (w == 1) { + out[0] = out[1] = stbi__div4(3*in_near[0] + in_far[0] + 2); + return out; + } + + t1 = 3*in_near[0] + in_far[0]; + // process groups of 8 pixels for as long as we can. + // note we can't handle the last pixel in a row in this loop + // because we need to handle the filter boundary conditions. + for (; i < ((w-1) & ~7); i += 8) { +#if defined(STBI_SSE2) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + __m128i zero = _mm_setzero_si128(); + __m128i farb = _mm_loadl_epi64((__m128i *) (in_far + i)); + __m128i nearb = _mm_loadl_epi64((__m128i *) (in_near + i)); + __m128i farw = _mm_unpacklo_epi8(farb, zero); + __m128i nearw = _mm_unpacklo_epi8(nearb, zero); + __m128i diff = _mm_sub_epi16(farw, nearw); + __m128i nears = _mm_slli_epi16(nearw, 2); + __m128i curr = _mm_add_epi16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. 
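+ // (note: _mm_slli_si128 shifts by bytes toward higher lanes, so this
+ // 2-byte shift moves each 16-bit pixel up one lane; lane i then holds
+ // pixel i-1, which is the "previous pixel" view described above)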
+ __m128i prv0 = _mm_slli_si128(curr, 2); + __m128i nxt0 = _mm_srli_si128(curr, 2); + __m128i prev = _mm_insert_epi16(prv0, t1, 0); + __m128i next = _mm_insert_epi16(nxt0, 3*in_near[i+8] + in_far[i+8], 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. + __m128i bias = _mm_set1_epi16(8); + __m128i curs = _mm_slli_epi16(curr, 2); + __m128i prvd = _mm_sub_epi16(prev, curr); + __m128i nxtd = _mm_sub_epi16(next, curr); + __m128i curb = _mm_add_epi16(curs, bias); + __m128i even = _mm_add_epi16(prvd, curb); + __m128i odd = _mm_add_epi16(nxtd, curb); + + // interleave even and odd pixels, then undo scaling. + __m128i int0 = _mm_unpacklo_epi16(even, odd); + __m128i int1 = _mm_unpackhi_epi16(even, odd); + __m128i de0 = _mm_srli_epi16(int0, 4); + __m128i de1 = _mm_srli_epi16(int1, 4); + + // pack and write output + __m128i outv = _mm_packus_epi16(de0, de1); + _mm_storeu_si128((__m128i *) (out + i*2), outv); +#elif defined(STBI_NEON) + // load and perform the vertical filtering pass + // this uses 3*x + y = 4*x + (y - x) + uint8x8_t farb = vld1_u8(in_far + i); + uint8x8_t nearb = vld1_u8(in_near + i); + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(farb, nearb)); + int16x8_t nears = vreinterpretq_s16_u16(vshll_n_u8(nearb, 2)); + int16x8_t curr = vaddq_s16(nears, diff); // current row + + // horizontal filter works the same based on shifted vers of current + // row. "prev" is current row shifted right by 1 pixel; we need to + // insert the previous pixel value (from t1). + // "next" is current row shifted left by 1 pixel, with first pixel + // of next block of 8 pixels added in. + int16x8_t prv0 = vextq_s16(curr, curr, 7); + int16x8_t nxt0 = vextq_s16(curr, curr, 1); + int16x8_t prev = vsetq_lane_s16(t1, prv0, 0); + int16x8_t next = vsetq_lane_s16(3*in_near[i+8] + in_far[i+8], nxt0, 7); + + // horizontal filter, polyphase implementation since it's convenient: + // even pixels = 3*cur + prev = cur*4 + (prev - cur) + // odd pixels = 3*cur + next = cur*4 + (next - cur) + // note the shared term. 
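+ // i.e. out_even = 3*cur + prev and out_odd = 3*cur + next, still at 16x
+ // scale; unlike the SSE2 path there is no explicit +8 bias here, because
+ // the vqrshrun_n_s16(..., 4) narrowing below rounds as it shifts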
+ int16x8_t curs = vshlq_n_s16(curr, 2); + int16x8_t prvd = vsubq_s16(prev, curr); + int16x8_t nxtd = vsubq_s16(next, curr); + int16x8_t even = vaddq_s16(curs, prvd); + int16x8_t odd = vaddq_s16(curs, nxtd); + + // undo scaling and round, then store with even/odd phases interleaved + uint8x8x2_t o; + o.val[0] = vqrshrun_n_s16(even, 4); + o.val[1] = vqrshrun_n_s16(odd, 4); + vst2_u8(out + i*2, o); +#endif + + // "previous" value for next iter + t1 = 3*in_near[i+7] + in_far[i+7]; + } + + t0 = t1; + t1 = 3*in_near[i] + in_far[i]; + out[i*2] = stbi__div16(3*t1 + t0 + 8); + + for (++i; i < w; ++i) { + t0 = t1; + t1 = 3*in_near[i]+in_far[i]; + out[i*2-1] = stbi__div16(3*t0 + t1 + 8); + out[i*2 ] = stbi__div16(3*t1 + t0 + 8); + } + out[w*2-1] = stbi__div4(t1+2); + + STBI_NOTUSED(hs); + + return out; +} +#endif + +static stbi_uc *stbi__resample_row_generic(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs) +{ + // resample with nearest-neighbor + int i,j; + STBI_NOTUSED(in_far); + for (i=0; i < w; ++i) + for (j=0; j < hs; ++j) + out[i*hs+j] = in_near[i]; + return out; +} + +// this is a reduced-precision calculation of YCbCr-to-RGB introduced +// to make sure the code produces the same results in both SIMD and scalar +#define stbi__float2fixed(x) (((int) ((x) * 4096.0f + 0.5f)) << 8) +static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc *pcb, const stbi_uc *pcr, int count, int step) +{ + int i; + for (i=0; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + (cr*-stbi__float2fixed(0.71414f)) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} + +#if defined(STBI_SSE2) || defined(STBI_NEON) +static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step) +{ + int i = 0; + +#ifdef STBI_SSE2 + // step == 3 is pretty ugly on the final interleave, and i'm not convinced + // it's useful in practice (you wouldn't use it for textures, for example). + // so just accelerate step == 4 case. + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
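+ // fixed-point scheme: y is unpacked into the high byte (y*256, plus a
+ // +128 rounding bias from y_bias in the low byte) and shifted down to a
+ // 1<<4 scale; cr/cb become (c-128)*256, so _mm_mulhi_epi16 against the
+ // round(k*4096) constants lands on the same 1<<4 scale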
+ __m128i signflip = _mm_set1_epi8(-0x80); + __m128i cr_const0 = _mm_set1_epi16( (short) ( 1.40200f*4096.0f+0.5f)); + __m128i cr_const1 = _mm_set1_epi16( - (short) ( 0.71414f*4096.0f+0.5f)); + __m128i cb_const0 = _mm_set1_epi16( - (short) ( 0.34414f*4096.0f+0.5f)); + __m128i cb_const1 = _mm_set1_epi16( (short) ( 1.77200f*4096.0f+0.5f)); + __m128i y_bias = _mm_set1_epi8((char) (unsigned char) 128); + __m128i xw = _mm_set1_epi16(255); // alpha channel + + for (; i+7 < count; i += 8) { + // load + __m128i y_bytes = _mm_loadl_epi64((__m128i *) (y+i)); + __m128i cr_bytes = _mm_loadl_epi64((__m128i *) (pcr+i)); + __m128i cb_bytes = _mm_loadl_epi64((__m128i *) (pcb+i)); + __m128i cr_biased = _mm_xor_si128(cr_bytes, signflip); // -128 + __m128i cb_biased = _mm_xor_si128(cb_bytes, signflip); // -128 + + // unpack to short (and left-shift cr, cb by 8) + __m128i yw = _mm_unpacklo_epi8(y_bias, y_bytes); + __m128i crw = _mm_unpacklo_epi8(_mm_setzero_si128(), cr_biased); + __m128i cbw = _mm_unpacklo_epi8(_mm_setzero_si128(), cb_biased); + + // color transform + __m128i yws = _mm_srli_epi16(yw, 4); + __m128i cr0 = _mm_mulhi_epi16(cr_const0, crw); + __m128i cb0 = _mm_mulhi_epi16(cb_const0, cbw); + __m128i cb1 = _mm_mulhi_epi16(cbw, cb_const1); + __m128i cr1 = _mm_mulhi_epi16(crw, cr_const1); + __m128i rws = _mm_add_epi16(cr0, yws); + __m128i gwt = _mm_add_epi16(cb0, yws); + __m128i bws = _mm_add_epi16(yws, cb1); + __m128i gws = _mm_add_epi16(gwt, cr1); + + // descale + __m128i rw = _mm_srai_epi16(rws, 4); + __m128i bw = _mm_srai_epi16(bws, 4); + __m128i gw = _mm_srai_epi16(gws, 4); + + // back to byte, set up for transpose + __m128i brb = _mm_packus_epi16(rw, bw); + __m128i gxb = _mm_packus_epi16(gw, xw); + + // transpose to interleave channels + __m128i t0 = _mm_unpacklo_epi8(brb, gxb); + __m128i t1 = _mm_unpackhi_epi8(brb, gxb); + __m128i o0 = _mm_unpacklo_epi16(t0, t1); + __m128i o1 = _mm_unpackhi_epi16(t0, t1); + + // store + _mm_storeu_si128((__m128i *) (out + 0), o0); + _mm_storeu_si128((__m128i *) (out + 16), o1); + out += 32; + } + } +#endif + +#ifdef STBI_NEON + // in this version, step=3 support would be easy to add. but is there demand? + if (step == 4) { + // this is a fairly straightforward implementation and not super-optimized. 
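+ // same 1<<4 fixed-point scale as the SSE2 path, but via vqdmulhq_s16,
+ // which computes (2*a*b)>>16 -- hence cr/cb are widened by 7 bits rather
+ // than 8, and vqrshrun_n_s16(..., 4) does the rounding shift, saturation
+ // and narrowing in one step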
+ uint8x8_t signflip = vdup_n_u8(0x80); + int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f)); + int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f)); + int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f)); + int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f)); + + for (; i+7 < count; i += 8) { + // load + uint8x8_t y_bytes = vld1_u8(y + i); + uint8x8_t cr_bytes = vld1_u8(pcr + i); + uint8x8_t cb_bytes = vld1_u8(pcb + i); + int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip)); + int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip)); + + // expand to s16 + int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4)); + int16x8_t crw = vshll_n_s8(cr_biased, 7); + int16x8_t cbw = vshll_n_s8(cb_biased, 7); + + // color transform + int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0); + int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0); + int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1); + int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1); + int16x8_t rws = vaddq_s16(yws, cr0); + int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1); + int16x8_t bws = vaddq_s16(yws, cb1); + + // undo scaling, round, convert to byte + uint8x8x4_t o; + o.val[0] = vqrshrun_n_s16(rws, 4); + o.val[1] = vqrshrun_n_s16(gws, 4); + o.val[2] = vqrshrun_n_s16(bws, 4); + o.val[3] = vdup_n_u8(255); + + // store, interleaving r/g/b/a + vst4_u8(out, o); + out += 8*4; + } + } +#endif + + for (; i < count; ++i) { + int y_fixed = (y[i] << 20) + (1<<19); // rounding + int r,g,b; + int cr = pcr[i] - 128; + int cb = pcb[i] - 128; + r = y_fixed + cr* stbi__float2fixed(1.40200f); + g = y_fixed + cr*-stbi__float2fixed(0.71414f) + ((cb*-stbi__float2fixed(0.34414f)) & 0xffff0000); + b = y_fixed + cb* stbi__float2fixed(1.77200f); + r >>= 20; + g >>= 20; + b >>= 20; + if ((unsigned) r > 255) { if (r < 0) r = 0; else r = 255; } + if ((unsigned) g > 255) { if (g < 0) g = 0; else g = 255; } + if ((unsigned) b > 255) { if (b < 0) b = 0; else b = 255; } + out[0] = (stbi_uc)r; + out[1] = (stbi_uc)g; + out[2] = (stbi_uc)b; + out[3] = 255; + out += step; + } +} +#endif + +// set up the kernels +static void stbi__setup_jpeg(stbi__jpeg *j) +{ + j->idct_block_kernel = stbi__idct_block; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_row; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2; + +#ifdef STBI_SSE2 + if (stbi__sse2_available()) { + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; + } +#endif + +#ifdef STBI_NEON + j->idct_block_kernel = stbi__idct_simd; + j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd; + j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd; +#endif +} + +// clean up the temporary component buffers +static void stbi__cleanup_jpeg(stbi__jpeg *j) +{ + stbi__free_jpeg_components(j, j->s->img_n, 0); +} + +typedef struct +{ + resample_row_func resample; + stbi_uc *line0,*line1; + int hs,vs; // expansion factor in each axis + int w_lores; // horizontal pixels pre-expansion + int ystep; // how far through vertical expansion we are + int ypos; // which pre-expansion row we're on +} stbi__resample; + +// fast 0..255 * 0..255 => 0..255 rounded multiplication +static stbi_uc stbi__blinn_8x8(stbi_uc x, stbi_uc y) +{ + unsigned int t = x*y + 128; + return (stbi_uc) ((t + (t >>8)) >> 8); +} + +static stbi_uc *load_jpeg_image(stbi__jpeg *z, int *out_x, int *out_y, int *comp, int req_comp) +{ + int n, decode_n, is_rgb; + z->s->img_n = 0; // make 
stbi__cleanup_jpeg safe
+
+ // validate req_comp
+ if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error");
+
+ // load a jpeg image from whichever source, but leave in YCbCr format
+ if (!stbi__decode_jpeg_image(z)) { stbi__cleanup_jpeg(z); return NULL; }
+
+ // determine actual number of components to generate
+ n = req_comp ? req_comp : z->s->img_n >= 3 ? 3 : 1;
+
+ is_rgb = z->s->img_n == 3 && (z->rgb == 3 || (z->app14_color_transform == 0 && !z->jfif));
+
+ if (z->s->img_n == 3 && n < 3 && !is_rgb)
+ decode_n = 1;
+ else
+ decode_n = z->s->img_n;
+
+ // nothing to do if no components requested; check this now to avoid
+ // accessing uninitialized coutput[0] later
+ if (decode_n <= 0) { stbi__cleanup_jpeg(z); return NULL; }
+
+ // resample and color-convert
+ {
+ int k;
+ unsigned int i,j;
+ stbi_uc *output;
+ stbi_uc *coutput[4] = { NULL, NULL, NULL, NULL };
+
+ stbi__resample res_comp[4];
+
+ for (k=0; k < decode_n; ++k) {
+ stbi__resample *r = &res_comp[k];
+
+ // allocate line buffer big enough for upsampling off the edges
+ // with upsample factor of 4
+ z->img_comp[k].linebuf = (stbi_uc *) stbi__malloc(z->s->img_x + 3);
+ if (!z->img_comp[k].linebuf) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+ r->hs = z->img_h_max / z->img_comp[k].h;
+ r->vs = z->img_v_max / z->img_comp[k].v;
+ r->ystep = r->vs >> 1;
+ r->w_lores = (z->s->img_x + r->hs-1) / r->hs;
+ r->ypos = 0;
+ r->line0 = r->line1 = z->img_comp[k].data;
+
+ if (r->hs == 1 && r->vs == 1) r->resample = resample_row_1;
+ else if (r->hs == 1 && r->vs == 2) r->resample = stbi__resample_row_v_2;
+ else if (r->hs == 2 && r->vs == 1) r->resample = stbi__resample_row_h_2;
+ else if (r->hs == 2 && r->vs == 2) r->resample = z->resample_row_hv_2_kernel;
+ else r->resample = stbi__resample_row_generic;
+ }
+
+ // can't error after this, so this is safe
+ output = (stbi_uc *) stbi__malloc_mad3(n, z->s->img_x, z->s->img_y, 1);
+ if (!output) { stbi__cleanup_jpeg(z); return stbi__errpuc("outofmem", "Out of memory"); }
+
+ // now go ahead and resample
+ for (j=0; j < z->s->img_y; ++j) {
+ stbi_uc *out = output + n * z->s->img_x * j;
+ for (k=0; k < decode_n; ++k) {
+ stbi__resample *r = &res_comp[k];
+ int y_bot = r->ystep >= (r->vs >> 1);
+ coutput[k] = r->resample(z->img_comp[k].linebuf,
+ y_bot ? r->line1 : r->line0,
+ y_bot ?
r->line0 : r->line1, + r->w_lores, r->hs); + if (++r->ystep >= r->vs) { + r->ystep = 0; + r->line0 = r->line1; + if (++r->ypos < z->img_comp[k].y) + r->line1 += z->img_comp[k].w2; + } + } + if (n >= 3) { + stbi_uc *y = coutput[0]; + if (z->s->img_n == 3) { + if (is_rgb) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = y[i]; + out[1] = coutput[1][i]; + out[2] = coutput[2][i]; + out[3] = 255; + out += n; + } + } else { + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else if (z->s->img_n == 4) { + if (z->app14_color_transform == 0) { // CMYK + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(coutput[0][i], m); + out[1] = stbi__blinn_8x8(coutput[1][i], m); + out[2] = stbi__blinn_8x8(coutput[2][i], m); + out[3] = 255; + out += n; + } + } else if (z->app14_color_transform == 2) { // YCCK + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + out[0] = stbi__blinn_8x8(255 - out[0], m); + out[1] = stbi__blinn_8x8(255 - out[1], m); + out[2] = stbi__blinn_8x8(255 - out[2], m); + out += n; + } + } else { // YCbCr + alpha? Ignore the fourth channel for now + z->YCbCr_to_RGB_kernel(out, y, coutput[1], coutput[2], z->s->img_x, n); + } + } else + for (i=0; i < z->s->img_x; ++i) { + out[0] = out[1] = out[2] = y[i]; + out[3] = 255; // not used if n==3 + out += n; + } + } else { + if (is_rgb) { + if (n == 1) + for (i=0; i < z->s->img_x; ++i) + *out++ = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + else { + for (i=0; i < z->s->img_x; ++i, out += 2) { + out[0] = stbi__compute_y(coutput[0][i], coutput[1][i], coutput[2][i]); + out[1] = 255; + } + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 0) { + for (i=0; i < z->s->img_x; ++i) { + stbi_uc m = coutput[3][i]; + stbi_uc r = stbi__blinn_8x8(coutput[0][i], m); + stbi_uc g = stbi__blinn_8x8(coutput[1][i], m); + stbi_uc b = stbi__blinn_8x8(coutput[2][i], m); + out[0] = stbi__compute_y(r, g, b); + out[1] = 255; + out += n; + } + } else if (z->s->img_n == 4 && z->app14_color_transform == 2) { + for (i=0; i < z->s->img_x; ++i) { + out[0] = stbi__blinn_8x8(255 - coutput[0][i], coutput[3][i]); + out[1] = 255; + out += n; + } + } else { + stbi_uc *y = coutput[0]; + if (n == 1) + for (i=0; i < z->s->img_x; ++i) out[i] = y[i]; + else + for (i=0; i < z->s->img_x; ++i) { *out++ = y[i]; *out++ = 255; } + } + } + } + stbi__cleanup_jpeg(z); + *out_x = z->s->img_x; + *out_y = z->s->img_y; + if (comp) *comp = z->s->img_n >= 3 ? 
3 : 1; // report original components, not output
+ return output;
+ }
+}
+
+static void *stbi__jpeg_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+ unsigned char* result;
+ stbi__jpeg* j = (stbi__jpeg*) stbi__malloc(sizeof(stbi__jpeg));
+ if (!j) return stbi__errpuc("outofmem", "Out of memory");
+ STBI_NOTUSED(ri);
+ j->s = s;
+ stbi__setup_jpeg(j);
+ result = load_jpeg_image(j, x,y,comp,req_comp);
+ STBI_FREE(j);
+ return result;
+}
+
+static int stbi__jpeg_test(stbi__context *s)
+{
+ int r;
+ stbi__jpeg* j = (stbi__jpeg*)stbi__malloc(sizeof(stbi__jpeg));
+ if (!j) return stbi__err("outofmem", "Out of memory");
+ j->s = s;
+ stbi__setup_jpeg(j);
+ r = stbi__decode_jpeg_header(j, STBI__SCAN_type);
+ stbi__rewind(s);
+ STBI_FREE(j);
+ return r;
+}
+
+static int stbi__jpeg_info_raw(stbi__jpeg *j, int *x, int *y, int *comp)
+{
+ if (!stbi__decode_jpeg_header(j, STBI__SCAN_header)) {
+ stbi__rewind( j->s );
+ return 0;
+ }
+ if (x) *x = j->s->img_x;
+ if (y) *y = j->s->img_y;
+ if (comp) *comp = j->s->img_n >= 3 ? 3 : 1;
+ return 1;
+}
+
+static int stbi__jpeg_info(stbi__context *s, int *x, int *y, int *comp)
+{
+ int result;
+ stbi__jpeg* j = (stbi__jpeg*) (stbi__malloc(sizeof(stbi__jpeg)));
+ if (!j) return stbi__err("outofmem", "Out of memory");
+ j->s = s;
+ result = stbi__jpeg_info_raw(j, x, y, comp);
+ STBI_FREE(j);
+ return result;
+}
+#endif
+
+// public domain zlib decode v0.2 Sean Barrett 2006-11-18
+// simple implementation
+// - all input must be provided in an upfront buffer
+// - all output is written to a single output buffer (can malloc/realloc)
+// performance
+// - fast huffman
+
+#ifndef STBI_NO_ZLIB
+
+// fast-way is faster to check than jpeg huffman, but slow way is slower
+#define STBI__ZFAST_BITS 9 // accelerate all cases in default tables
+#define STBI__ZFAST_MASK ((1 << STBI__ZFAST_BITS) - 1)
+#define STBI__ZNSYMS 288 // number of symbols in literal/length alphabet
+
+// zlib-style huffman encoding
+// (jpegs pack from left, zlib from right, so can't share code)
+typedef struct
+{
+ stbi__uint16 fast[1 << STBI__ZFAST_BITS];
+ stbi__uint16 firstcode[16];
+ int maxcode[17];
+ stbi__uint16 firstsymbol[16];
+ stbi_uc size[STBI__ZNSYMS];
+ stbi__uint16 value[STBI__ZNSYMS];
+} stbi__zhuffman;
+
+stbi_inline static int stbi__bitreverse16(int n)
+{
+ n = ((n & 0xAAAA) >> 1) | ((n & 0x5555) << 1);
+ n = ((n & 0xCCCC) >> 2) | ((n & 0x3333) << 2);
+ n = ((n & 0xF0F0) >> 4) | ((n & 0x0F0F) << 4);
+ n = ((n & 0xFF00) >> 8) | ((n & 0x00FF) << 8);
+ return n;
+}
+
+stbi_inline static int stbi__bit_reverse(int v, int bits)
+{
+ STBI_ASSERT(bits <= 16);
+ // to bit reverse n bits, reverse 16 and shift
+ // e.g.
11 bits, bit reverse and shift away 5 + return stbi__bitreverse16(v) >> (16-bits); +} + +static int stbi__zbuild_huffman(stbi__zhuffman *z, const stbi_uc *sizelist, int num) +{ + int i,k=0; + int code, next_code[16], sizes[17]; + + // DEFLATE spec for generating codes + memset(sizes, 0, sizeof(sizes)); + memset(z->fast, 0, sizeof(z->fast)); + for (i=0; i < num; ++i) + ++sizes[sizelist[i]]; + sizes[0] = 0; + for (i=1; i < 16; ++i) + if (sizes[i] > (1 << i)) + return stbi__err("bad sizes", "Corrupt PNG"); + code = 0; + for (i=1; i < 16; ++i) { + next_code[i] = code; + z->firstcode[i] = (stbi__uint16) code; + z->firstsymbol[i] = (stbi__uint16) k; + code = (code + sizes[i]); + if (sizes[i]) + if (code-1 >= (1 << i)) return stbi__err("bad codelengths","Corrupt PNG"); + z->maxcode[i] = code << (16-i); // preshift for inner loop + code <<= 1; + k += sizes[i]; + } + z->maxcode[16] = 0x10000; // sentinel + for (i=0; i < num; ++i) { + int s = sizelist[i]; + if (s) { + int c = next_code[s] - z->firstcode[s] + z->firstsymbol[s]; + stbi__uint16 fastv = (stbi__uint16) ((s << 9) | i); + z->size [c] = (stbi_uc ) s; + z->value[c] = (stbi__uint16) i; + if (s <= STBI__ZFAST_BITS) { + int j = stbi__bit_reverse(next_code[s],s); + while (j < (1 << STBI__ZFAST_BITS)) { + z->fast[j] = fastv; + j += (1 << s); + } + } + ++next_code[s]; + } + } + return 1; +} + +// zlib-from-memory implementation for PNG reading +// because PNG allows splitting the zlib stream arbitrarily, +// and it's annoying structurally to have PNG call ZLIB call PNG, +// we require PNG read all the IDATs and combine them into a single +// memory buffer + +typedef struct +{ + stbi_uc *zbuffer, *zbuffer_end; + int num_bits; + stbi__uint32 code_buffer; + + char *zout; + char *zout_start; + char *zout_end; + int z_expandable; + + stbi__zhuffman z_length, z_distance; +} stbi__zbuf; + +stbi_inline static int stbi__zeof(stbi__zbuf *z) +{ + return (z->zbuffer >= z->zbuffer_end); +} + +stbi_inline static stbi_uc stbi__zget8(stbi__zbuf *z) +{ + return stbi__zeof(z) ? 0 : *z->zbuffer++; +} + +static void stbi__fill_bits(stbi__zbuf *z) +{ + do { + if (z->code_buffer >= (1U << z->num_bits)) { + z->zbuffer = z->zbuffer_end; /* treat this as EOF so we fail. */ + return; + } + z->code_buffer |= (unsigned int) stbi__zget8(z) << z->num_bits; + z->num_bits += 8; + } while (z->num_bits <= 24); +} + +stbi_inline static unsigned int stbi__zreceive(stbi__zbuf *z, int n) +{ + unsigned int k; + if (z->num_bits < n) stbi__fill_bits(z); + k = z->code_buffer & ((1 << n) - 1); + z->code_buffer >>= n; + z->num_bits -= n; + return k; +} + +static int stbi__zhuffman_decode_slowpath(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s,k; + // not resolved by fast table, so compute it the slow way + // use jpeg approach, which requires MSbits at top + k = stbi__bit_reverse(a->code_buffer, 16); + for (s=STBI__ZFAST_BITS+1; ; ++s) + if (k < z->maxcode[s]) + break; + if (s >= 16) return -1; // invalid code! + // code size is s, so: + b = (k >> (16-s)) - z->firstcode[s] + z->firstsymbol[s]; + if (b >= STBI__ZNSYMS) return -1; // some data was corrupt somewhere! + if (z->size[b] != s) return -1; // was originally an assert, but report failure instead. + a->code_buffer >>= s; + a->num_bits -= s; + return z->value[b]; +} + +stbi_inline static int stbi__zhuffman_decode(stbi__zbuf *a, stbi__zhuffman *z) +{ + int b,s; + if (a->num_bits < 16) { + if (stbi__zeof(a)) { + return -1; /* report error for unexpected end of data. 
*/ + } + stbi__fill_bits(a); + } + b = z->fast[a->code_buffer & STBI__ZFAST_MASK]; + if (b) { + s = b >> 9; + a->code_buffer >>= s; + a->num_bits -= s; + return b & 511; + } + return stbi__zhuffman_decode_slowpath(a, z); +} + +static int stbi__zexpand(stbi__zbuf *z, char *zout, int n) // need to make room for n bytes +{ + char *q; + unsigned int cur, limit, old_limit; + z->zout = zout; + if (!z->z_expandable) return stbi__err("output buffer limit","Corrupt PNG"); + cur = (unsigned int) (z->zout - z->zout_start); + limit = old_limit = (unsigned) (z->zout_end - z->zout_start); + if (UINT_MAX - cur < (unsigned) n) return stbi__err("outofmem", "Out of memory"); + while (cur + n > limit) { + if(limit > UINT_MAX / 2) return stbi__err("outofmem", "Out of memory"); + limit *= 2; + } + q = (char *) STBI_REALLOC_SIZED(z->zout_start, old_limit, limit); + STBI_NOTUSED(old_limit); + if (q == NULL) return stbi__err("outofmem", "Out of memory"); + z->zout_start = q; + z->zout = q + cur; + z->zout_end = q + limit; + return 1; +} + +static const int stbi__zlength_base[31] = { + 3,4,5,6,7,8,9,10,11,13, + 15,17,19,23,27,31,35,43,51,59, + 67,83,99,115,131,163,195,227,258,0,0 }; + +static const int stbi__zlength_extra[31]= +{ 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0,0,0 }; + +static const int stbi__zdist_base[32] = { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193, +257,385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577,0,0}; + +static const int stbi__zdist_extra[32] = +{ 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13}; + +static int stbi__parse_huffman_block(stbi__zbuf *a) +{ + char *zout = a->zout; + for(;;) { + int z = stbi__zhuffman_decode(a, &a->z_length); + if (z < 256) { + if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); // error in huffman codes + if (zout >= a->zout_end) { + if (!stbi__zexpand(a, zout, 1)) return 0; + zout = a->zout; + } + *zout++ = (char) z; + } else { + stbi_uc *p; + int len,dist; + if (z == 256) { + a->zout = zout; + return 1; + } + z -= 257; + len = stbi__zlength_base[z]; + if (stbi__zlength_extra[z]) len += stbi__zreceive(a, stbi__zlength_extra[z]); + z = stbi__zhuffman_decode(a, &a->z_distance); + if (z < 0) return stbi__err("bad huffman code","Corrupt PNG"); + dist = stbi__zdist_base[z]; + if (stbi__zdist_extra[z]) dist += stbi__zreceive(a, stbi__zdist_extra[z]); + if (zout - a->zout_start < dist) return stbi__err("bad dist","Corrupt PNG"); + if (zout + len > a->zout_end) { + if (!stbi__zexpand(a, zout, len)) return 0; + zout = a->zout; + } + p = (stbi_uc *) (zout - dist); + if (dist == 1) { // run of one byte; common in images. 
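+ // (when dist < len the match overlaps its own output; the else branch
+ // below copies byte-by-byte so earlier output bytes feed later ones, and
+ // dist == 1 is just the fully degenerate fill-with-one-byte case)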
+ stbi_uc v = *p; + if (len) { do *zout++ = v; while (--len); } + } else { + if (len) { do *zout++ = *p++; while (--len); } + } + } + } +} + +static int stbi__compute_huffman_codes(stbi__zbuf *a) +{ + static const stbi_uc length_dezigzag[19] = { 16,17,18,0,8,7,9,6,10,5,11,4,12,3,13,2,14,1,15 }; + stbi__zhuffman z_codelength; + stbi_uc lencodes[286+32+137];//padding for maximum single op + stbi_uc codelength_sizes[19]; + int i,n; + + int hlit = stbi__zreceive(a,5) + 257; + int hdist = stbi__zreceive(a,5) + 1; + int hclen = stbi__zreceive(a,4) + 4; + int ntot = hlit + hdist; + + memset(codelength_sizes, 0, sizeof(codelength_sizes)); + for (i=0; i < hclen; ++i) { + int s = stbi__zreceive(a,3); + codelength_sizes[length_dezigzag[i]] = (stbi_uc) s; + } + if (!stbi__zbuild_huffman(&z_codelength, codelength_sizes, 19)) return 0; + + n = 0; + while (n < ntot) { + int c = stbi__zhuffman_decode(a, &z_codelength); + if (c < 0 || c >= 19) return stbi__err("bad codelengths", "Corrupt PNG"); + if (c < 16) + lencodes[n++] = (stbi_uc) c; + else { + stbi_uc fill = 0; + if (c == 16) { + c = stbi__zreceive(a,2)+3; + if (n == 0) return stbi__err("bad codelengths", "Corrupt PNG"); + fill = lencodes[n-1]; + } else if (c == 17) { + c = stbi__zreceive(a,3)+3; + } else if (c == 18) { + c = stbi__zreceive(a,7)+11; + } else { + return stbi__err("bad codelengths", "Corrupt PNG"); + } + if (ntot - n < c) return stbi__err("bad codelengths", "Corrupt PNG"); + memset(lencodes+n, fill, c); + n += c; + } + } + if (n != ntot) return stbi__err("bad codelengths","Corrupt PNG"); + if (!stbi__zbuild_huffman(&a->z_length, lencodes, hlit)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, lencodes+hlit, hdist)) return 0; + return 1; +} + +static int stbi__parse_uncompressed_block(stbi__zbuf *a) +{ + stbi_uc header[4]; + int len,nlen,k; + if (a->num_bits & 7) + stbi__zreceive(a, a->num_bits & 7); // discard + // drain the bit-packed data into header + k = 0; + while (a->num_bits > 0) { + header[k++] = (stbi_uc) (a->code_buffer & 255); // suppress MSVC run-time check + a->code_buffer >>= 8; + a->num_bits -= 8; + } + if (a->num_bits < 0) return stbi__err("zlib corrupt","Corrupt PNG"); + // now fill header the normal way + while (k < 4) + header[k++] = stbi__zget8(a); + len = header[1] * 256 + header[0]; + nlen = header[3] * 256 + header[2]; + if (nlen != (len ^ 0xffff)) return stbi__err("zlib corrupt","Corrupt PNG"); + if (a->zbuffer + len > a->zbuffer_end) return stbi__err("read past buffer","Corrupt PNG"); + if (a->zout + len > a->zout_end) + if (!stbi__zexpand(a, a->zout, len)) return 0; + memcpy(a->zout, a->zbuffer, len); + a->zbuffer += len; + a->zout += len; + return 1; +} + +static int stbi__parse_zlib_header(stbi__zbuf *a) +{ + int cmf = stbi__zget8(a); + int cm = cmf & 15; + /* int cinfo = cmf >> 4; */ + int flg = stbi__zget8(a); + if (stbi__zeof(a)) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if ((cmf*256+flg) % 31 != 0) return stbi__err("bad zlib header","Corrupt PNG"); // zlib spec + if (flg & 32) return stbi__err("no preset dict","Corrupt PNG"); // preset dictionary not allowed in png + if (cm != 8) return stbi__err("bad compression","Corrupt PNG"); // DEFLATE required for png + // window = 1 << (8 + cinfo)... 
but who cares, we fully buffer output + return 1; +} + +static const stbi_uc stbi__zdefault_length[STBI__ZNSYMS] = +{ + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8 +}; +static const stbi_uc stbi__zdefault_distance[32] = +{ + 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 +}; +/* +Init algorithm: +{ + int i; // use <= to match clearly with spec + for (i=0; i <= 143; ++i) stbi__zdefault_length[i] = 8; + for ( ; i <= 255; ++i) stbi__zdefault_length[i] = 9; + for ( ; i <= 279; ++i) stbi__zdefault_length[i] = 7; + for ( ; i <= 287; ++i) stbi__zdefault_length[i] = 8; + + for (i=0; i <= 31; ++i) stbi__zdefault_distance[i] = 5; +} +*/ + +static int stbi__parse_zlib(stbi__zbuf *a, int parse_header) +{ + int final, type; + if (parse_header) + if (!stbi__parse_zlib_header(a)) return 0; + a->num_bits = 0; + a->code_buffer = 0; + do { + final = stbi__zreceive(a,1); + type = stbi__zreceive(a,2); + if (type == 0) { + if (!stbi__parse_uncompressed_block(a)) return 0; + } else if (type == 3) { + return 0; + } else { + if (type == 1) { + // use fixed code lengths + if (!stbi__zbuild_huffman(&a->z_length , stbi__zdefault_length , STBI__ZNSYMS)) return 0; + if (!stbi__zbuild_huffman(&a->z_distance, stbi__zdefault_distance, 32)) return 0; + } else { + if (!stbi__compute_huffman_codes(a)) return 0; + } + if (!stbi__parse_huffman_block(a)) return 0; + } + } while (!final); + return 1; +} + +static int stbi__do_zlib(stbi__zbuf *a, char *obuf, int olen, int exp, int parse_header) +{ + a->zout_start = obuf; + a->zout = obuf; + a->zout_end = obuf + olen; + a->z_expandable = exp; + + return stbi__parse_zlib(a, parse_header); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize(const char *buffer, int len, int initial_size, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, 1)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF char *stbi_zlib_decode_malloc(char const *buffer, int len, int *outlen) +{ + return stbi_zlib_decode_malloc_guesssize(buffer, len, 16384, outlen); +} + +STBIDEF char *stbi_zlib_decode_malloc_guesssize_headerflag(const char *buffer, int len, int initial_size, int *outlen, int parse_header) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(initial_size); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer + len; + if (stbi__do_zlib(&a, p, initial_size, 1, parse_header)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_buffer(char *obuffer, int olen, char const *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + 
ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 1)) + return (int) (a.zout - a.zout_start); + else + return -1; +} + +STBIDEF char *stbi_zlib_decode_noheader_malloc(char const *buffer, int len, int *outlen) +{ + stbi__zbuf a; + char *p = (char *) stbi__malloc(16384); + if (p == NULL) return NULL; + a.zbuffer = (stbi_uc *) buffer; + a.zbuffer_end = (stbi_uc *) buffer+len; + if (stbi__do_zlib(&a, p, 16384, 1, 0)) { + if (outlen) *outlen = (int) (a.zout - a.zout_start); + return a.zout_start; + } else { + STBI_FREE(a.zout_start); + return NULL; + } +} + +STBIDEF int stbi_zlib_decode_noheader_buffer(char *obuffer, int olen, const char *ibuffer, int ilen) +{ + stbi__zbuf a; + a.zbuffer = (stbi_uc *) ibuffer; + a.zbuffer_end = (stbi_uc *) ibuffer + ilen; + if (stbi__do_zlib(&a, obuffer, olen, 0, 0)) + return (int) (a.zout - a.zout_start); + else + return -1; +} +#endif + +// public domain "baseline" PNG decoder v0.10 Sean Barrett 2006-11-18 +// simple implementation +// - only 8-bit samples +// - no CRC checking +// - allocates lots of intermediate memory +// - avoids problem of streaming data between subsystems +// - avoids explicit window management +// performance +// - uses stb_zlib, a PD zlib implementation with fast huffman decoding + +#ifndef STBI_NO_PNG +typedef struct +{ + stbi__uint32 length; + stbi__uint32 type; +} stbi__pngchunk; + +static stbi__pngchunk stbi__get_chunk_header(stbi__context *s) +{ + stbi__pngchunk c; + c.length = stbi__get32be(s); + c.type = stbi__get32be(s); + return c; +} + +static int stbi__check_png_header(stbi__context *s) +{ + static const stbi_uc png_sig[8] = { 137,80,78,71,13,10,26,10 }; + int i; + for (i=0; i < 8; ++i) + if (stbi__get8(s) != png_sig[i]) return stbi__err("bad png sig","Not a PNG"); + return 1; +} + +typedef struct +{ + stbi__context *s; + stbi_uc *idata, *expanded, *out; + int depth; +} stbi__png; + + +enum { + STBI__F_none=0, + STBI__F_sub=1, + STBI__F_up=2, + STBI__F_avg=3, + STBI__F_paeth=4, + // synthetic filters used for first scanline to avoid needing a dummy row of 0s + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static stbi_uc first_row_filter[5] = +{ + STBI__F_none, + STBI__F_sub, + STBI__F_none, + STBI__F_avg_first, + STBI__F_paeth_first +}; + +static int stbi__paeth(int a, int b, int c) +{ + int p = a + b - c; + int pa = abs(p-a); + int pb = abs(p-b); + int pc = abs(p-c); + if (pa <= pb && pa <= pc) return a; + if (pb <= pc) return b; + return c; +} + +static const stbi_uc stbi__depth_scale_table[9] = { 0, 0xff, 0x55, 0, 0x11, 0,0,0, 0x01 }; + +// create the png data from post-deflated data +static int stbi__create_png_image_raw(stbi__png *a, stbi_uc *raw, stbi__uint32 raw_len, int out_n, stbi__uint32 x, stbi__uint32 y, int depth, int color) +{ + int bytes = (depth == 16? 
2 : 1); + stbi__context *s = a->s; + stbi__uint32 i,j,stride = x*out_n*bytes; + stbi__uint32 img_len, img_width_bytes; + int k; + int img_n = s->img_n; // copy it into a local for later + + int output_bytes = out_n*bytes; + int filter_bytes = img_n*bytes; + int width = x; + + STBI_ASSERT(out_n == s->img_n || out_n == s->img_n+1); + a->out = (stbi_uc *) stbi__malloc_mad3(x, y, output_bytes, 0); // extra bytes to write off the end into + if (!a->out) return stbi__err("outofmem", "Out of memory"); + + if (!stbi__mad3sizes_valid(img_n, x, depth, 7)) return stbi__err("too large", "Corrupt PNG"); + img_width_bytes = (((img_n * x * depth) + 7) >> 3); + img_len = (img_width_bytes + 1) * y; + + // we used to check for exact match between raw_len and img_len on non-interlaced PNGs, + // but issue #276 reported a PNG in the wild that had extra data at the end (all zeros), + // so just check for raw_len < img_len always. + if (raw_len < img_len) return stbi__err("not enough pixels","Corrupt PNG"); + + for (j=0; j < y; ++j) { + stbi_uc *cur = a->out + stride*j; + stbi_uc *prior; + int filter = *raw++; + + if (filter > 4) + return stbi__err("invalid filter","Corrupt PNG"); + + if (depth < 8) { + if (img_width_bytes > x) return stbi__err("invalid width","Corrupt PNG"); + cur += x*out_n - img_width_bytes; // store output to the rightmost img_len bytes, so we can decode in place + filter_bytes = 1; + width = img_width_bytes; + } + prior = cur - stride; // bugfix: need to compute this after 'cur +=' computation above + + // if first row, use special filter that doesn't sample previous row + if (j == 0) filter = first_row_filter[filter]; + + // handle first byte explicitly + for (k=0; k < filter_bytes; ++k) { + switch (filter) { + case STBI__F_none : cur[k] = raw[k]; break; + case STBI__F_sub : cur[k] = raw[k]; break; + case STBI__F_up : cur[k] = STBI__BYTECAST(raw[k] + prior[k]); break; + case STBI__F_avg : cur[k] = STBI__BYTECAST(raw[k] + (prior[k]>>1)); break; + case STBI__F_paeth : cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(0,prior[k],0)); break; + case STBI__F_avg_first : cur[k] = raw[k]; break; + case STBI__F_paeth_first: cur[k] = raw[k]; break; + } + } + + if (depth == 8) { + if (img_n != out_n) + cur[img_n] = 255; // first pixel + raw += img_n; + cur += out_n; + prior += out_n; + } else if (depth == 16) { + if (img_n != out_n) { + cur[filter_bytes] = 255; // first pixel top byte + cur[filter_bytes+1] = 255; // first pixel bottom byte + } + raw += filter_bytes; + cur += output_bytes; + prior += output_bytes; + } else { + raw += 1; + cur += 1; + prior += 1; + } + + // this is a little gross, so that we don't switch per-pixel or per-component + if (depth < 8 || img_n == out_n) { + int nk = (width - 1)*filter_bytes; + #define STBI__CASE(f) \ + case f: \ + for (k=0; k < nk; ++k) + switch (filter) { + // "none" filter turns into a memcpy here; make that explicit. 
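+               // (editor's note: STBI__CASE expands to "case f: for (k=0; k < nk; ++k)",
+               // so every branch below is the same per-byte loop over the scanline
+               // with only the PNG filter arithmetic differing, e.g. F_sub adds the
+               // byte filter_bytes to the left: raw[k] + cur[k-filter_bytes])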
+               case STBI__F_none:       memcpy(cur, raw, nk); break;
+               STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k-filter_bytes]); } break;
+               STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+               STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k-filter_bytes])>>1)); } break;
+               STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],prior[k],prior[k-filter_bytes])); } break;
+               STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k-filter_bytes] >> 1)); } break;
+               STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k-filter_bytes],0,0)); } break;
+            }
+            #undef STBI__CASE
+            raw += nk;
+         } else {
+            STBI_ASSERT(img_n+1 == out_n);
+            #define STBI__CASE(f) \
+                case f: \
+                   for (i=x-1; i >= 1; --i, cur[filter_bytes]=255,raw+=filter_bytes,cur+=output_bytes,prior+=output_bytes) \
+                      for (k=0; k < filter_bytes; ++k)
+            switch (filter) {
+               STBI__CASE(STBI__F_none)         { cur[k] = raw[k]; } break;
+               STBI__CASE(STBI__F_sub)          { cur[k] = STBI__BYTECAST(raw[k] + cur[k- output_bytes]); } break;
+               STBI__CASE(STBI__F_up)           { cur[k] = STBI__BYTECAST(raw[k] + prior[k]); } break;
+               STBI__CASE(STBI__F_avg)          { cur[k] = STBI__BYTECAST(raw[k] + ((prior[k] + cur[k- output_bytes])>>1)); } break;
+               STBI__CASE(STBI__F_paeth)        { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],prior[k],prior[k- output_bytes])); } break;
+               STBI__CASE(STBI__F_avg_first)    { cur[k] = STBI__BYTECAST(raw[k] + (cur[k- output_bytes] >> 1)); } break;
+               STBI__CASE(STBI__F_paeth_first)  { cur[k] = STBI__BYTECAST(raw[k] + stbi__paeth(cur[k- output_bytes],0,0)); } break;
+            }
+            #undef STBI__CASE
+
+            // the loop above sets the high byte of the pixels' alpha, but for
+            // 16 bit png files we also need the low byte set. we'll do that here.
+            if (depth == 16) {
+               cur = a->out + stride*j; // start at the beginning of the row again
+               for (i=0; i < x; ++i,cur+=output_bytes) {
+                  cur[filter_bytes+1] = 255;
+               }
+            }
+         }
+      }
+
+      // we make a separate pass to expand bits to pixels; for performance,
+      // this could run two scanlines behind the above code, so it won't
+      // interfere with filtering but will still be in the cache.
+      if (depth < 8) {
+         for (j=0; j < y; ++j) {
+            stbi_uc *cur = a->out + stride*j;
+            stbi_uc *in  = a->out + stride*j + x*out_n - img_width_bytes;
+            // unpack 1/2/4-bit into an 8-bit buffer. allows us to keep the common 8-bit path optimal at minimal cost for 1/2/4-bit
+            // PNG guarantees byte alignment; if width is not a multiple of 8/4/2 we'll decode dummy trailing data that will be skipped in the later loop
+            stbi_uc scale = (color == 0) ? stbi__depth_scale_table[depth] : 1; // scale grayscale values to 0..255 range
+
+            // note that the final byte might overshoot and write more data than desired.
+            // we can allocate enough data that this never writes out of memory, but it
+            // could also overwrite the next scanline. can it overwrite non-empty data
+            // on the next scanline? yes, consider 1-pixel-wide scanlines with 1-bit-per-pixel.
+ // so we need to explicitly clamp the final ones + + if (depth == 4) { + for (k=x*img_n; k >= 2; k-=2, ++in) { + *cur++ = scale * ((*in >> 4) ); + *cur++ = scale * ((*in ) & 0x0f); + } + if (k > 0) *cur++ = scale * ((*in >> 4) ); + } else if (depth == 2) { + for (k=x*img_n; k >= 4; k-=4, ++in) { + *cur++ = scale * ((*in >> 6) ); + *cur++ = scale * ((*in >> 4) & 0x03); + *cur++ = scale * ((*in >> 2) & 0x03); + *cur++ = scale * ((*in ) & 0x03); + } + if (k > 0) *cur++ = scale * ((*in >> 6) ); + if (k > 1) *cur++ = scale * ((*in >> 4) & 0x03); + if (k > 2) *cur++ = scale * ((*in >> 2) & 0x03); + } else if (depth == 1) { + for (k=x*img_n; k >= 8; k-=8, ++in) { + *cur++ = scale * ((*in >> 7) ); + *cur++ = scale * ((*in >> 6) & 0x01); + *cur++ = scale * ((*in >> 5) & 0x01); + *cur++ = scale * ((*in >> 4) & 0x01); + *cur++ = scale * ((*in >> 3) & 0x01); + *cur++ = scale * ((*in >> 2) & 0x01); + *cur++ = scale * ((*in >> 1) & 0x01); + *cur++ = scale * ((*in ) & 0x01); + } + if (k > 0) *cur++ = scale * ((*in >> 7) ); + if (k > 1) *cur++ = scale * ((*in >> 6) & 0x01); + if (k > 2) *cur++ = scale * ((*in >> 5) & 0x01); + if (k > 3) *cur++ = scale * ((*in >> 4) & 0x01); + if (k > 4) *cur++ = scale * ((*in >> 3) & 0x01); + if (k > 5) *cur++ = scale * ((*in >> 2) & 0x01); + if (k > 6) *cur++ = scale * ((*in >> 1) & 0x01); + } + if (img_n != out_n) { + int q; + // insert alpha = 255 + cur = a->out + stride*j; + if (img_n == 1) { + for (q=x-1; q >= 0; --q) { + cur[q*2+1] = 255; + cur[q*2+0] = cur[q]; + } + } else { + STBI_ASSERT(img_n == 3); + for (q=x-1; q >= 0; --q) { + cur[q*4+3] = 255; + cur[q*4+2] = cur[q*3+2]; + cur[q*4+1] = cur[q*3+1]; + cur[q*4+0] = cur[q*3+0]; + } + } + } + } + } else if (depth == 16) { + // force the image data from big-endian to platform-native. + // this is done in a separate pass due to the decoding relying + // on the data being untouched, but could probably be done + // per-line during decode if care is taken. + stbi_uc *cur = a->out; + stbi__uint16 *cur16 = (stbi__uint16*)cur; + + for(i=0; i < x*y*out_n; ++i,cur16++,cur+=2) { + *cur16 = (cur[0] << 8) | cur[1]; + } + } + + return 1; +} + +static int stbi__create_png_image(stbi__png *a, stbi_uc *image_data, stbi__uint32 image_data_len, int out_n, int depth, int color, int interlaced) +{ + int bytes = (depth == 16 ? 
2 : 1); + int out_bytes = out_n * bytes; + stbi_uc *final; + int p; + if (!interlaced) + return stbi__create_png_image_raw(a, image_data, image_data_len, out_n, a->s->img_x, a->s->img_y, depth, color); + + // de-interlacing + final = (stbi_uc *) stbi__malloc_mad3(a->s->img_x, a->s->img_y, out_bytes, 0); + if (!final) return stbi__err("outofmem", "Out of memory"); + for (p=0; p < 7; ++p) { + int xorig[] = { 0,4,0,2,0,1,0 }; + int yorig[] = { 0,0,4,0,2,0,1 }; + int xspc[] = { 8,8,4,4,2,2,1 }; + int yspc[] = { 8,8,8,4,4,2,2 }; + int i,j,x,y; + // pass1_x[4] = 0, pass1_x[5] = 1, pass1_x[12] = 1 + x = (a->s->img_x - xorig[p] + xspc[p]-1) / xspc[p]; + y = (a->s->img_y - yorig[p] + yspc[p]-1) / yspc[p]; + if (x && y) { + stbi__uint32 img_len = ((((a->s->img_n * x * depth) + 7) >> 3) + 1) * y; + if (!stbi__create_png_image_raw(a, image_data, image_data_len, out_n, x, y, depth, color)) { + STBI_FREE(final); + return 0; + } + for (j=0; j < y; ++j) { + for (i=0; i < x; ++i) { + int out_y = j*yspc[p]+yorig[p]; + int out_x = i*xspc[p]+xorig[p]; + memcpy(final + out_y*a->s->img_x*out_bytes + out_x*out_bytes, + a->out + (j*x+i)*out_bytes, out_bytes); + } + } + STBI_FREE(a->out); + image_data += img_len; + image_data_len -= img_len; + } + } + a->out = final; + + return 1; +} + +static int stbi__compute_transparency(stbi__png *z, stbi_uc tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + // compute color-based transparency, assuming we've + // already got 255 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i=0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 0 : 255); + p += 2; + } + } else { + for (i=0; i < pixel_count; ++i) { + if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2]) + p[3] = 0; + p += 4; + } + } + return 1; +} + +static int stbi__compute_transparency16(stbi__png *z, stbi__uint16 tc[3], int out_n) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi__uint16 *p = (stbi__uint16*) z->out; + + // compute color-based transparency, assuming we've + // already got 65535 as the alpha value in the output + STBI_ASSERT(out_n == 2 || out_n == 4); + + if (out_n == 2) { + for (i = 0; i < pixel_count; ++i) { + p[1] = (p[0] == tc[0] ? 
0 : 65535);
+         p += 2;
+      }
+   } else {
+      for (i = 0; i < pixel_count; ++i) {
+         if (p[0] == tc[0] && p[1] == tc[1] && p[2] == tc[2])
+            p[3] = 0;
+         p += 4;
+      }
+   }
+   return 1;
+}
+
+static int stbi__expand_png_palette(stbi__png *a, stbi_uc *palette, int len, int pal_img_n)
+{
+   stbi__uint32 i, pixel_count = a->s->img_x * a->s->img_y;
+   stbi_uc *p, *temp_out, *orig = a->out;
+
+   p = (stbi_uc *) stbi__malloc_mad2(pixel_count, pal_img_n, 0);
+   if (p == NULL) return stbi__err("outofmem", "Out of memory");
+
+   // between here and free(out) below, exiting would leak
+   temp_out = p;
+
+   if (pal_img_n == 3) {
+      for (i=0; i < pixel_count; ++i) {
+         int n = orig[i]*4;
+         p[0] = palette[n  ];
+         p[1] = palette[n+1];
+         p[2] = palette[n+2];
+         p += 3;
+      }
+   } else {
+      for (i=0; i < pixel_count; ++i) {
+         int n = orig[i]*4;
+         p[0] = palette[n  ];
+         p[1] = palette[n+1];
+         p[2] = palette[n+2];
+         p[3] = palette[n+3];
+         p += 4;
+      }
+   }
+   STBI_FREE(a->out);
+   a->out = temp_out;
+
+   STBI_NOTUSED(len);
+
+   return 1;
+}
+
+static int stbi__unpremultiply_on_load_global = 0;
+static int stbi__de_iphone_flag_global = 0;
+
+STBIDEF void stbi_set_unpremultiply_on_load(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_global = flag_true_if_should_unpremultiply;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_global = flag_true_if_should_convert;
+}
+
+#ifndef STBI_THREAD_LOCAL
+#define stbi__unpremultiply_on_load  stbi__unpremultiply_on_load_global
+#define stbi__de_iphone_flag  stbi__de_iphone_flag_global
+#else
+static STBI_THREAD_LOCAL int stbi__unpremultiply_on_load_local, stbi__unpremultiply_on_load_set;
+static STBI_THREAD_LOCAL int stbi__de_iphone_flag_local, stbi__de_iphone_flag_set;
+
+STBIDEF void stbi__unpremultiply_on_load_thread(int flag_true_if_should_unpremultiply)
+{
+   stbi__unpremultiply_on_load_local = flag_true_if_should_unpremultiply;
+   stbi__unpremultiply_on_load_set = 1;
+}
+
+STBIDEF void stbi_convert_iphone_png_to_rgb_thread(int flag_true_if_should_convert)
+{
+   stbi__de_iphone_flag_local = flag_true_if_should_convert;
+   stbi__de_iphone_flag_set = 1;
+}
+
+#define stbi__unpremultiply_on_load  (stbi__unpremultiply_on_load_set           \
+                                       ? stbi__unpremultiply_on_load_local      \
+                                       : stbi__unpremultiply_on_load_global)
+#define stbi__de_iphone_flag  (stbi__de_iphone_flag_set                         \
+                                       ?
stbi__de_iphone_flag_local \ + : stbi__de_iphone_flag_global) +#endif // STBI_THREAD_LOCAL + +static void stbi__de_iphone(stbi__png *z) +{ + stbi__context *s = z->s; + stbi__uint32 i, pixel_count = s->img_x * s->img_y; + stbi_uc *p = z->out; + + if (s->img_out_n == 3) { // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 3; + } + } else { + STBI_ASSERT(s->img_out_n == 4); + if (stbi__unpremultiply_on_load) { + // convert bgr to rgb and unpremultiply + for (i=0; i < pixel_count; ++i) { + stbi_uc a = p[3]; + stbi_uc t = p[0]; + if (a) { + stbi_uc half = a / 2; + p[0] = (p[2] * 255 + half) / a; + p[1] = (p[1] * 255 + half) / a; + p[2] = ( t * 255 + half) / a; + } else { + p[0] = p[2]; + p[2] = t; + } + p += 4; + } + } else { + // convert bgr to rgb + for (i=0; i < pixel_count; ++i) { + stbi_uc t = p[0]; + p[0] = p[2]; + p[2] = t; + p += 4; + } + } + } +} + +#define STBI__PNG_TYPE(a,b,c,d) (((unsigned) (a) << 24) + ((unsigned) (b) << 16) + ((unsigned) (c) << 8) + (unsigned) (d)) + +static int stbi__parse_png_file(stbi__png *z, int scan, int req_comp) +{ + stbi_uc palette[1024], pal_img_n=0; + stbi_uc has_trans=0, tc[3]={0}; + stbi__uint16 tc16[3]; + stbi__uint32 ioff=0, idata_limit=0, i, pal_len=0; + int first=1,k,interlace=0, color=0, is_iphone=0; + stbi__context *s = z->s; + + z->expanded = NULL; + z->idata = NULL; + z->out = NULL; + + if (!stbi__check_png_header(s)) return 0; + + if (scan == STBI__SCAN_type) return 1; + + for (;;) { + stbi__pngchunk c = stbi__get_chunk_header(s); + switch (c.type) { + case STBI__PNG_TYPE('C','g','B','I'): + is_iphone = 1; + stbi__skip(s, c.length); + break; + case STBI__PNG_TYPE('I','H','D','R'): { + int comp,filter; + if (!first) return stbi__err("multiple IHDR","Corrupt PNG"); + first = 0; + if (c.length != 13) return stbi__err("bad IHDR len","Corrupt PNG"); + s->img_x = stbi__get32be(s); + s->img_y = stbi__get32be(s); + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + z->depth = stbi__get8(s); if (z->depth != 1 && z->depth != 2 && z->depth != 4 && z->depth != 8 && z->depth != 16) return stbi__err("1/2/4/8/16-bit only","PNG not supported: 1/2/4/8/16-bit only"); + color = stbi__get8(s); if (color > 6) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3 && z->depth == 16) return stbi__err("bad ctype","Corrupt PNG"); + if (color == 3) pal_img_n = 3; else if (color & 1) return stbi__err("bad ctype","Corrupt PNG"); + comp = stbi__get8(s); if (comp) return stbi__err("bad comp method","Corrupt PNG"); + filter= stbi__get8(s); if (filter) return stbi__err("bad filter method","Corrupt PNG"); + interlace = stbi__get8(s); if (interlace>1) return stbi__err("bad interlace method","Corrupt PNG"); + if (!s->img_x || !s->img_y) return stbi__err("0-pixel image","Corrupt PNG"); + if (!pal_img_n) { + s->img_n = (color & 2 ? 3 : 1) + (color & 4 ? 1 : 0); + if ((1 << 30) / s->img_x / s->img_n < s->img_y) return stbi__err("too large", "Image too large to decode"); + if (scan == STBI__SCAN_header) return 1; + } else { + // if paletted, then pal_n is our final components, and + // img_n is # components to decompress/filter. 
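+            // (e.g. an 8-bit paletted PNG is inflated and filtered at one byte
+            // per pixel -- the palette index -- and only expanded to pal_img_n
+            // bytes per pixel by stbi__expand_png_palette after IEND)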
+ s->img_n = 1; + if ((1 << 30) / s->img_x / 4 < s->img_y) return stbi__err("too large","Corrupt PNG"); + // if SCAN_header, have to scan to see if we have a tRNS + } + break; + } + + case STBI__PNG_TYPE('P','L','T','E'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (c.length > 256*3) return stbi__err("invalid PLTE","Corrupt PNG"); + pal_len = c.length / 3; + if (pal_len * 3 != c.length) return stbi__err("invalid PLTE","Corrupt PNG"); + for (i=0; i < pal_len; ++i) { + palette[i*4+0] = stbi__get8(s); + palette[i*4+1] = stbi__get8(s); + palette[i*4+2] = stbi__get8(s); + palette[i*4+3] = 255; + } + break; + } + + case STBI__PNG_TYPE('t','R','N','S'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (z->idata) return stbi__err("tRNS after IDAT","Corrupt PNG"); + if (pal_img_n) { + if (scan == STBI__SCAN_header) { s->img_n = 4; return 1; } + if (pal_len == 0) return stbi__err("tRNS before PLTE","Corrupt PNG"); + if (c.length > pal_len) return stbi__err("bad tRNS len","Corrupt PNG"); + pal_img_n = 4; + for (i=0; i < c.length; ++i) + palette[i*4+3] = stbi__get8(s); + } else { + if (!(s->img_n & 1)) return stbi__err("tRNS with alpha","Corrupt PNG"); + if (c.length != (stbi__uint32) s->img_n*2) return stbi__err("bad tRNS len","Corrupt PNG"); + has_trans = 1; + if (z->depth == 16) { + for (k = 0; k < s->img_n; ++k) tc16[k] = (stbi__uint16)stbi__get16be(s); // copy the values as-is + } else { + for (k = 0; k < s->img_n; ++k) tc[k] = (stbi_uc)(stbi__get16be(s) & 255) * stbi__depth_scale_table[z->depth]; // non 8-bit images will be larger + } + } + break; + } + + case STBI__PNG_TYPE('I','D','A','T'): { + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (pal_img_n && !pal_len) return stbi__err("no PLTE","Corrupt PNG"); + if (scan == STBI__SCAN_header) { s->img_n = pal_img_n; return 1; } + if ((int)(ioff + c.length) < (int)ioff) return 0; + if (ioff + c.length > idata_limit) { + stbi__uint32 idata_limit_old = idata_limit; + stbi_uc *p; + if (idata_limit == 0) idata_limit = c.length > 4096 ? 
c.length : 4096; + while (ioff + c.length > idata_limit) + idata_limit *= 2; + STBI_NOTUSED(idata_limit_old); + p = (stbi_uc *) STBI_REALLOC_SIZED(z->idata, idata_limit_old, idata_limit); if (p == NULL) return stbi__err("outofmem", "Out of memory"); + z->idata = p; + } + if (!stbi__getn(s, z->idata+ioff,c.length)) return stbi__err("outofdata","Corrupt PNG"); + ioff += c.length; + break; + } + + case STBI__PNG_TYPE('I','E','N','D'): { + stbi__uint32 raw_len, bpl; + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if (scan != STBI__SCAN_load) return 1; + if (z->idata == NULL) return stbi__err("no IDAT","Corrupt PNG"); + // initial guess for decoded data size to avoid unnecessary reallocs + bpl = (s->img_x * z->depth + 7) / 8; // bytes per line, per component + raw_len = bpl * s->img_y * s->img_n /* pixels */ + s->img_y /* filter mode per row */; + z->expanded = (stbi_uc *) stbi_zlib_decode_malloc_guesssize_headerflag((char *) z->idata, ioff, raw_len, (int *) &raw_len, !is_iphone); + if (z->expanded == NULL) return 0; // zlib should set error + STBI_FREE(z->idata); z->idata = NULL; + if ((req_comp == s->img_n+1 && req_comp != 3 && !pal_img_n) || has_trans) + s->img_out_n = s->img_n+1; + else + s->img_out_n = s->img_n; + if (!stbi__create_png_image(z, z->expanded, raw_len, s->img_out_n, z->depth, color, interlace)) return 0; + if (has_trans) { + if (z->depth == 16) { + if (!stbi__compute_transparency16(z, tc16, s->img_out_n)) return 0; + } else { + if (!stbi__compute_transparency(z, tc, s->img_out_n)) return 0; + } + } + if (is_iphone && stbi__de_iphone_flag && s->img_out_n > 2) + stbi__de_iphone(z); + if (pal_img_n) { + // pal_img_n == 3 or 4 + s->img_n = pal_img_n; // record the actual colors we had + s->img_out_n = pal_img_n; + if (req_comp >= 3) s->img_out_n = req_comp; + if (!stbi__expand_png_palette(z, palette, pal_len, s->img_out_n)) + return 0; + } else if (has_trans) { + // non-paletted image with tRNS -> source image has (constant) alpha + ++s->img_n; + } + STBI_FREE(z->expanded); z->expanded = NULL; + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + return 1; + } + + default: + // if critical, fail + if (first) return stbi__err("first not IHDR", "Corrupt PNG"); + if ((c.type & (1 << 29)) == 0) { + #ifndef STBI_NO_FAILURE_STRINGS + // not threadsafe + static char invalid_chunk[] = "XXXX PNG chunk not known"; + invalid_chunk[0] = STBI__BYTECAST(c.type >> 24); + invalid_chunk[1] = STBI__BYTECAST(c.type >> 16); + invalid_chunk[2] = STBI__BYTECAST(c.type >> 8); + invalid_chunk[3] = STBI__BYTECAST(c.type >> 0); + #endif + return stbi__err(invalid_chunk, "PNG not supported: unknown PNG chunk type"); + } + stbi__skip(s, c.length); + break; + } + // end of PNG chunk, read and skip CRC + stbi__get32be(s); + } +} + +static void *stbi__do_png(stbi__png *p, int *x, int *y, int *n, int req_comp, stbi__result_info *ri) +{ + void *result=NULL; + if (req_comp < 0 || req_comp > 4) return stbi__errpuc("bad req_comp", "Internal error"); + if (stbi__parse_png_file(p, STBI__SCAN_load, req_comp)) { + if (p->depth <= 8) + ri->bits_per_channel = 8; + else if (p->depth == 16) + ri->bits_per_channel = 16; + else + return stbi__errpuc("bad bits_per_channel", "PNG not supported: unsupported color depth"); + result = p->out; + p->out = NULL; + if (req_comp && req_comp != p->s->img_out_n) { + if (ri->bits_per_channel == 8) + result = stbi__convert_format((unsigned char *) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y); + else + result = stbi__convert_format16((stbi__uint16 
*) result, p->s->img_out_n, req_comp, p->s->img_x, p->s->img_y);
+         p->s->img_out_n = req_comp;
+         if (result == NULL) return result;
+      }
+      *x = p->s->img_x;
+      *y = p->s->img_y;
+      if (n) *n = p->s->img_n;
+   }
+   STBI_FREE(p->out);      p->out      = NULL;
+   STBI_FREE(p->expanded); p->expanded = NULL;
+   STBI_FREE(p->idata);    p->idata    = NULL;
+
+   return result;
+}
+
+static void *stbi__png_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri)
+{
+   stbi__png p;
+   p.s = s;
+   return stbi__do_png(&p, x,y,comp,req_comp, ri);
+}
+
+static int stbi__png_test(stbi__context *s)
+{
+   int r;
+   r = stbi__check_png_header(s);
+   stbi__rewind(s);
+   return r;
+}
+
+static int stbi__png_info_raw(stbi__png *p, int *x, int *y, int *comp)
+{
+   if (!stbi__parse_png_file(p, STBI__SCAN_header, 0)) {
+      stbi__rewind( p->s );
+      return 0;
+   }
+   if (x) *x = p->s->img_x;
+   if (y) *y = p->s->img_y;
+   if (comp) *comp = p->s->img_n;
+   return 1;
+}
+
+static int stbi__png_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   stbi__png p;
+   p.s = s;
+   return stbi__png_info_raw(&p, x, y, comp);
+}
+
+static int stbi__png_is16(stbi__context *s)
+{
+   stbi__png p;
+   p.s = s;
+   if (!stbi__png_info_raw(&p, NULL, NULL, NULL))
+      return 0;
+   if (p.depth != 16) {
+      stbi__rewind(p.s);
+      return 0;
+   }
+   return 1;
+}
+#endif
+
+// Microsoft/Windows BMP image
+
+#ifndef STBI_NO_BMP
+static int stbi__bmp_test_raw(stbi__context *s)
+{
+   int r;
+   int sz;
+   if (stbi__get8(s) != 'B') return 0;
+   if (stbi__get8(s) != 'M') return 0;
+   stbi__get32le(s); // discard filesize
+   stbi__get16le(s); // discard reserved
+   stbi__get16le(s); // discard reserved
+   stbi__get32le(s); // discard data offset
+   sz = stbi__get32le(s);
+   r = (sz == 12 || sz == 40 || sz == 56 || sz == 108 || sz == 124);
+   return r;
+}
+
+static int stbi__bmp_test(stbi__context *s)
+{
+   int r = stbi__bmp_test_raw(s);
+   stbi__rewind(s);
+   return r;
+}
+
+
+// returns 0..31 for the highest set bit
+static int stbi__high_bit(unsigned int z)
+{
+   int n=0;
+   if (z == 0) return -1;
+   if (z >= 0x10000) { n += 16; z >>= 16; }
+   if (z >= 0x00100) { n += 8; z >>= 8; }
+   if (z >= 0x00010) { n += 4; z >>= 4; }
+   if (z >= 0x00004) { n += 2; z >>= 2; }
+   if (z >= 0x00002) { n += 1;/* >>= 1;*/ }
+   return n;
+}
+
+static int stbi__bitcount(unsigned int a)
+{
+   a = (a & 0x55555555) + ((a >> 1) & 0x55555555); // max 2
+   a = (a & 0x33333333) + ((a >> 2) & 0x33333333); // max 4
+   a = (a + (a >> 4)) & 0x0f0f0f0f; // max 8 per 4, now 8 bits
+   a = (a + (a >> 8)); // max 16 per 8 bits
+   a = (a + (a >> 16)); // max 32 per 8 bits
+   return a & 0xff;
+}
+
+// extract an arbitrarily-aligned N-bit value (N=bits)
+// from v, and then make it 8-bits long and fractionally
+// extend it to the full range.
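+//
+// (worked example, added for clarity: for a 5-bit BMP channel, bits=5 and
+// v=0b10110 (22) should map to roughly 22*255/31; the mul/shift tables
+// below compute the bit replication (v<<3)|(v>>2) = 0b10110101 = 181 with
+// one multiply and one shift, and the endpoints stay exact: 0 -> 0 and
+// 31 -> 255.)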
+static int stbi__shiftsigned(unsigned int v, int shift, int bits) +{ + static unsigned int mul_table[9] = { + 0, + 0xff/*0b11111111*/, 0x55/*0b01010101*/, 0x49/*0b01001001*/, 0x11/*0b00010001*/, + 0x21/*0b00100001*/, 0x41/*0b01000001*/, 0x81/*0b10000001*/, 0x01/*0b00000001*/, + }; + static unsigned int shift_table[9] = { + 0, 0,0,1,0,2,4,6,0, + }; + if (shift < 0) + v <<= -shift; + else + v >>= shift; + STBI_ASSERT(v < 256); + v >>= (8-bits); + STBI_ASSERT(bits >= 0 && bits <= 8); + return (int) ((unsigned) v * mul_table[bits]) >> shift_table[bits]; +} + +typedef struct +{ + int bpp, offset, hsz; + unsigned int mr,mg,mb,ma, all_a; + int extra_read; +} stbi__bmp_data; + +static int stbi__bmp_set_mask_defaults(stbi__bmp_data *info, int compress) +{ + // BI_BITFIELDS specifies masks explicitly, don't override + if (compress == 3) + return 1; + + if (compress == 0) { + if (info->bpp == 16) { + info->mr = 31u << 10; + info->mg = 31u << 5; + info->mb = 31u << 0; + } else if (info->bpp == 32) { + info->mr = 0xffu << 16; + info->mg = 0xffu << 8; + info->mb = 0xffu << 0; + info->ma = 0xffu << 24; + info->all_a = 0; // if all_a is 0 at end, then we loaded alpha channel but it was all 0 + } else { + // otherwise, use defaults, which is all-0 + info->mr = info->mg = info->mb = info->ma = 0; + } + return 1; + } + return 0; // error +} + +static void *stbi__bmp_parse_header(stbi__context *s, stbi__bmp_data *info) +{ + int hsz; + if (stbi__get8(s) != 'B' || stbi__get8(s) != 'M') return stbi__errpuc("not BMP", "Corrupt BMP"); + stbi__get32le(s); // discard filesize + stbi__get16le(s); // discard reserved + stbi__get16le(s); // discard reserved + info->offset = stbi__get32le(s); + info->hsz = hsz = stbi__get32le(s); + info->mr = info->mg = info->mb = info->ma = 0; + info->extra_read = 14; + + if (info->offset < 0) return stbi__errpuc("bad BMP", "bad BMP"); + + if (hsz != 12 && hsz != 40 && hsz != 56 && hsz != 108 && hsz != 124) return stbi__errpuc("unknown BMP", "BMP type not supported: unknown"); + if (hsz == 12) { + s->img_x = stbi__get16le(s); + s->img_y = stbi__get16le(s); + } else { + s->img_x = stbi__get32le(s); + s->img_y = stbi__get32le(s); + } + if (stbi__get16le(s) != 1) return stbi__errpuc("bad BMP", "bad BMP"); + info->bpp = stbi__get16le(s); + if (hsz != 12) { + int compress = stbi__get32le(s); + if (compress == 1 || compress == 2) return stbi__errpuc("BMP RLE", "BMP type not supported: RLE"); + if (compress >= 4) return stbi__errpuc("BMP JPEG/PNG", "BMP type not supported: unsupported compression"); // this includes PNG/JPEG modes + if (compress == 3 && info->bpp != 16 && info->bpp != 32) return stbi__errpuc("bad BMP", "bad BMP"); // bitfields requires 16 or 32 bits/pixel + stbi__get32le(s); // discard sizeof + stbi__get32le(s); // discard hres + stbi__get32le(s); // discard vres + stbi__get32le(s); // discard colorsused + stbi__get32le(s); // discard max important + if (hsz == 40 || hsz == 56) { + if (hsz == 56) { + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + stbi__get32le(s); + } + if (info->bpp == 16 || info->bpp == 32) { + if (compress == 0) { + stbi__bmp_set_mask_defaults(info, compress); + } else if (compress == 3) { + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->extra_read += 12; + // not documented, but generated by photoshop and handled by mspaint + if (info->mr == info->mg && info->mg == info->mb) { + // ?!?!? 
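+                  // (identical R/G/B bitfield masks give no way to tell the
+                  // three channels apart, so reject rather than guess)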
+ return stbi__errpuc("bad BMP", "bad BMP"); + } + } else + return stbi__errpuc("bad BMP", "bad BMP"); + } + } else { + // V4/V5 header + int i; + if (hsz != 108 && hsz != 124) + return stbi__errpuc("bad BMP", "bad BMP"); + info->mr = stbi__get32le(s); + info->mg = stbi__get32le(s); + info->mb = stbi__get32le(s); + info->ma = stbi__get32le(s); + if (compress != 3) // override mr/mg/mb unless in BI_BITFIELDS mode, as per docs + stbi__bmp_set_mask_defaults(info, compress); + stbi__get32le(s); // discard color space + for (i=0; i < 12; ++i) + stbi__get32le(s); // discard color space parameters + if (hsz == 124) { + stbi__get32le(s); // discard rendering intent + stbi__get32le(s); // discard offset of profile data + stbi__get32le(s); // discard size of profile data + stbi__get32le(s); // discard reserved + } + } + } + return (void *) 1; +} + + +static void *stbi__bmp_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + unsigned int mr=0,mg=0,mb=0,ma=0, all_a; + stbi_uc pal[256][4]; + int psize=0,i,j,width; + int flip_vertically, pad, target; + stbi__bmp_data info; + STBI_NOTUSED(ri); + + info.all_a = 255; + if (stbi__bmp_parse_header(s, &info) == NULL) + return NULL; // error code already set + + flip_vertically = ((int) s->img_y) > 0; + s->img_y = abs((int) s->img_y); + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + mr = info.mr; + mg = info.mg; + mb = info.mb; + ma = info.ma; + all_a = info.all_a; + + if (info.hsz == 12) { + if (info.bpp < 24) + psize = (info.offset - info.extra_read - 24) / 3; + } else { + if (info.bpp < 16) + psize = (info.offset - info.extra_read - info.hsz) >> 2; + } + if (psize == 0) { + if (info.offset != s->callback_already_read + (s->img_buffer - s->img_buffer_original)) { + return stbi__errpuc("bad offset", "Corrupt BMP"); + } + } + + if (info.bpp == 24 && ma == 0xff000000) + s->img_n = 3; + else + s->img_n = ma ? 4 : 3; + if (req_comp && req_comp >= 3) // we can directly decode 3 or 4 + target = req_comp; + else + target = s->img_n; // if they want monochrome, we'll post-convert + + // sanity-check size + if (!stbi__mad3sizes_valid(target, s->img_x, s->img_y, 0)) + return stbi__errpuc("too large", "Corrupt BMP"); + + out = (stbi_uc *) stbi__malloc_mad3(target, s->img_x, s->img_y, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + if (info.bpp < 16) { + int z=0; + if (psize == 0 || psize > 256) { STBI_FREE(out); return stbi__errpuc("invalid", "Corrupt BMP"); } + for (i=0; i < psize; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + if (info.hsz != 12) stbi__get8(s); + pal[i][3] = 255; + } + stbi__skip(s, info.offset - info.extra_read - info.hsz - psize * (info.hsz == 12 ? 
3 : 4)); + if (info.bpp == 1) width = (s->img_x + 7) >> 3; + else if (info.bpp == 4) width = (s->img_x + 1) >> 1; + else if (info.bpp == 8) width = s->img_x; + else { STBI_FREE(out); return stbi__errpuc("bad bpp", "Corrupt BMP"); } + pad = (-width)&3; + if (info.bpp == 1) { + for (j=0; j < (int) s->img_y; ++j) { + int bit_offset = 7, v = stbi__get8(s); + for (i=0; i < (int) s->img_x; ++i) { + int color = (v>>bit_offset)&0x1; + out[z++] = pal[color][0]; + out[z++] = pal[color][1]; + out[z++] = pal[color][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + if((--bit_offset) < 0) { + bit_offset = 7; + v = stbi__get8(s); + } + } + stbi__skip(s, pad); + } + } else { + for (j=0; j < (int) s->img_y; ++j) { + for (i=0; i < (int) s->img_x; i += 2) { + int v=stbi__get8(s),v2=0; + if (info.bpp == 4) { + v2 = v & 15; + v >>= 4; + } + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + if (i+1 == (int) s->img_x) break; + v = (info.bpp == 8) ? stbi__get8(s) : v2; + out[z++] = pal[v][0]; + out[z++] = pal[v][1]; + out[z++] = pal[v][2]; + if (target == 4) out[z++] = 255; + } + stbi__skip(s, pad); + } + } + } else { + int rshift=0,gshift=0,bshift=0,ashift=0,rcount=0,gcount=0,bcount=0,acount=0; + int z = 0; + int easy=0; + stbi__skip(s, info.offset - info.extra_read - info.hsz); + if (info.bpp == 24) width = 3 * s->img_x; + else if (info.bpp == 16) width = 2*s->img_x; + else /* bpp = 32 and pad = 0 */ width=0; + pad = (-width) & 3; + if (info.bpp == 24) { + easy = 1; + } else if (info.bpp == 32) { + if (mb == 0xff && mg == 0xff00 && mr == 0x00ff0000 && ma == 0xff000000) + easy = 2; + } + if (!easy) { + if (!mr || !mg || !mb) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + // right shift amt to put high bit in position #7 + rshift = stbi__high_bit(mr)-7; rcount = stbi__bitcount(mr); + gshift = stbi__high_bit(mg)-7; gcount = stbi__bitcount(mg); + bshift = stbi__high_bit(mb)-7; bcount = stbi__bitcount(mb); + ashift = stbi__high_bit(ma)-7; acount = stbi__bitcount(ma); + if (rcount > 8 || gcount > 8 || bcount > 8 || acount > 8) { STBI_FREE(out); return stbi__errpuc("bad masks", "Corrupt BMP"); } + } + for (j=0; j < (int) s->img_y; ++j) { + if (easy) { + for (i=0; i < (int) s->img_x; ++i) { + unsigned char a; + out[z+2] = stbi__get8(s); + out[z+1] = stbi__get8(s); + out[z+0] = stbi__get8(s); + z += 3; + a = (easy == 2 ? stbi__get8(s) : 255); + all_a |= a; + if (target == 4) out[z++] = a; + } + } else { + int bpp = info.bpp; + for (i=0; i < (int) s->img_x; ++i) { + stbi__uint32 v = (bpp == 16 ? (stbi__uint32) stbi__get16le(s) : stbi__get32le(s)); + unsigned int a; + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mr, rshift, rcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mg, gshift, gcount)); + out[z++] = STBI__BYTECAST(stbi__shiftsigned(v & mb, bshift, bcount)); + a = (ma ? 
stbi__shiftsigned(v & ma, ashift, acount) : 255); + all_a |= a; + if (target == 4) out[z++] = STBI__BYTECAST(a); + } + } + stbi__skip(s, pad); + } + } + + // if alpha channel is all 0s, replace with all 255s + if (target == 4 && all_a == 0) + for (i=4*s->img_x*s->img_y-1; i >= 0; i -= 4) + out[i] = 255; + + if (flip_vertically) { + stbi_uc t; + for (j=0; j < (int) s->img_y>>1; ++j) { + stbi_uc *p1 = out + j *s->img_x*target; + stbi_uc *p2 = out + (s->img_y-1-j)*s->img_x*target; + for (i=0; i < (int) s->img_x*target; ++i) { + t = p1[i]; p1[i] = p2[i]; p2[i] = t; + } + } + } + + if (req_comp && req_comp != target) { + out = stbi__convert_format(out, target, req_comp, s->img_x, s->img_y); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + return out; +} +#endif + +// Targa Truevision - TGA +// by Jonathan Dummer +#ifndef STBI_NO_TGA +// returns STBI_rgb or whatever, 0 on error +static int stbi__tga_get_comp(int bits_per_pixel, int is_grey, int* is_rgb16) +{ + // only RGB or RGBA (incl. 16bit) or grey allowed + if (is_rgb16) *is_rgb16 = 0; + switch(bits_per_pixel) { + case 8: return STBI_grey; + case 16: if(is_grey) return STBI_grey_alpha; + // fallthrough + case 15: if(is_rgb16) *is_rgb16 = 1; + return STBI_rgb; + case 24: // fallthrough + case 32: return bits_per_pixel/8; + default: return 0; + } +} + +static int stbi__tga_info(stbi__context *s, int *x, int *y, int *comp) +{ + int tga_w, tga_h, tga_comp, tga_image_type, tga_bits_per_pixel, tga_colormap_bpp; + int sz, tga_colormap_type; + stbi__get8(s); // discard Offset + tga_colormap_type = stbi__get8(s); // colormap type + if( tga_colormap_type > 1 ) { + stbi__rewind(s); + return 0; // only RGB or indexed allowed + } + tga_image_type = stbi__get8(s); // image type + if ( tga_colormap_type == 1 ) { // colormapped (paletted) image + if (tga_image_type != 1 && tga_image_type != 9) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) { + stbi__rewind(s); + return 0; + } + stbi__skip(s,4); // skip image x and y origin + tga_colormap_bpp = sz; + } else { // "normal" image w/o colormap - only RGB or grey allowed, +/- RLE + if ( (tga_image_type != 2) && (tga_image_type != 3) && (tga_image_type != 10) && (tga_image_type != 11) ) { + stbi__rewind(s); + return 0; // only RGB or grey allowed, +/- RLE + } + stbi__skip(s,9); // skip colormap specification and image x/y origin + tga_colormap_bpp = 0; + } + tga_w = stbi__get16le(s); + if( tga_w < 1 ) { + stbi__rewind(s); + return 0; // test width + } + tga_h = stbi__get16le(s); + if( tga_h < 1 ) { + stbi__rewind(s); + return 0; // test height + } + tga_bits_per_pixel = stbi__get8(s); // bits per pixel + stbi__get8(s); // ignore alpha bits + if (tga_colormap_bpp != 0) { + if((tga_bits_per_pixel != 8) && (tga_bits_per_pixel != 16)) { + // when using a colormap, tga_bits_per_pixel is the size of the indexes + // I don't think anything but 8 or 16bit indexes makes sense + stbi__rewind(s); + return 0; + } + tga_comp = stbi__tga_get_comp(tga_colormap_bpp, 0, NULL); + } else { + tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3) || (tga_image_type == 11), NULL); + } + if(!tga_comp) { + stbi__rewind(s); + return 0; + } + if (x) *x = tga_w; + if (y) *y = tga_h; + if (comp) *comp = tga_comp; + return 1; 
// seems to have passed everything +} + +static int stbi__tga_test(stbi__context *s) +{ + int res = 0; + int sz, tga_color_type; + stbi__get8(s); // discard Offset + tga_color_type = stbi__get8(s); // color type + if ( tga_color_type > 1 ) goto errorEnd; // only RGB or indexed allowed + sz = stbi__get8(s); // image type + if ( tga_color_type == 1 ) { // colormapped (paletted) image + if (sz != 1 && sz != 9) goto errorEnd; // colortype 1 demands image type 1 or 9 + stbi__skip(s,4); // skip index of first colormap entry and number of entries + sz = stbi__get8(s); // check bits per palette color entry + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + stbi__skip(s,4); // skip image x and y origin + } else { // "normal" image w/o colormap + if ( (sz != 2) && (sz != 3) && (sz != 10) && (sz != 11) ) goto errorEnd; // only RGB or grey allowed, +/- RLE + stbi__skip(s,9); // skip colormap specification and image x/y origin + } + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test width + if ( stbi__get16le(s) < 1 ) goto errorEnd; // test height + sz = stbi__get8(s); // bits per pixel + if ( (tga_color_type == 1) && (sz != 8) && (sz != 16) ) goto errorEnd; // for colormapped images, bpp is size of an index + if ( (sz != 8) && (sz != 15) && (sz != 16) && (sz != 24) && (sz != 32) ) goto errorEnd; + + res = 1; // if we got this far, everything's good and we can return 1 instead of 0 + +errorEnd: + stbi__rewind(s); + return res; +} + +// read 16bit value and convert to 24bit RGB +static void stbi__tga_read_rgb16(stbi__context *s, stbi_uc* out) +{ + stbi__uint16 px = (stbi__uint16)stbi__get16le(s); + stbi__uint16 fiveBitMask = 31; + // we have 3 channels with 5bits each + int r = (px >> 10) & fiveBitMask; + int g = (px >> 5) & fiveBitMask; + int b = px & fiveBitMask; + // Note that this saves the data in RGB(A) order, so it doesn't need to be swapped later + out[0] = (stbi_uc)((r * 255)/31); + out[1] = (stbi_uc)((g * 255)/31); + out[2] = (stbi_uc)((b * 255)/31); + + // some people claim that the most significant bit might be used for alpha + // (possibly if an alpha-bit is set in the "image descriptor byte") + // but that only made 16bit test images completely translucent.. + // so let's treat all 15 and 16bit TGAs as RGB with no alpha. +} + +static void *stbi__tga_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + // read in the TGA header stuff + int tga_offset = stbi__get8(s); + int tga_indexed = stbi__get8(s); + int tga_image_type = stbi__get8(s); + int tga_is_RLE = 0; + int tga_palette_start = stbi__get16le(s); + int tga_palette_len = stbi__get16le(s); + int tga_palette_bits = stbi__get8(s); + int tga_x_origin = stbi__get16le(s); + int tga_y_origin = stbi__get16le(s); + int tga_width = stbi__get16le(s); + int tga_height = stbi__get16le(s); + int tga_bits_per_pixel = stbi__get8(s); + int tga_comp, tga_rgb16=0; + int tga_inverted = stbi__get8(s); + // int tga_alpha_bits = tga_inverted & 15; // the 4 lowest bits - unused (useless?) 
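+   // bit 5 of that same descriptor byte is the screen-origin flag
+   // (1 = top-left); it is folded into tga_inverted below, so
+   // tga_inverted==1 means a bottom-up file whose rows get flipped on load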
+   // image data
+   unsigned char *tga_data;
+   unsigned char *tga_palette = NULL;
+   int i, j;
+   unsigned char raw_data[4] = {0};
+   int RLE_count = 0;
+   int RLE_repeating = 0;
+   int read_next_pixel = 1;
+   STBI_NOTUSED(ri);
+   STBI_NOTUSED(tga_x_origin); // @TODO
+   STBI_NOTUSED(tga_y_origin); // @TODO
+
+   if (tga_height > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (tga_width > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   // do a tiny bit of preprocessing
+   if ( tga_image_type >= 8 )
+   {
+      tga_image_type -= 8;
+      tga_is_RLE = 1;
+   }
+   tga_inverted = 1 - ((tga_inverted >> 5) & 1);
+
+   // If I'm paletted, then I'll use the number of bits from the palette
+   if ( tga_indexed ) tga_comp = stbi__tga_get_comp(tga_palette_bits, 0, &tga_rgb16);
+   else tga_comp = stbi__tga_get_comp(tga_bits_per_pixel, (tga_image_type == 3), &tga_rgb16);
+
+   if(!tga_comp) // shouldn't really happen, stbi__tga_test() should have ensured basic consistency
+      return stbi__errpuc("bad format", "Can't find out TGA pixelformat");
+
+   // tga info
+   *x = tga_width;
+   *y = tga_height;
+   if (comp) *comp = tga_comp;
+
+   if (!stbi__mad3sizes_valid(tga_width, tga_height, tga_comp, 0))
+      return stbi__errpuc("too large", "Corrupt TGA");
+
+   tga_data = (unsigned char*)stbi__malloc_mad3(tga_width, tga_height, tga_comp, 0);
+   if (!tga_data) return stbi__errpuc("outofmem", "Out of memory");
+
+   // skip to the data's starting position (offset usually = 0)
+   stbi__skip(s, tga_offset );
+
+   if ( !tga_indexed && !tga_is_RLE && !tga_rgb16 ) {
+      for (i=0; i < tga_height; ++i) {
+         int row = tga_inverted ? tga_height -i - 1 : i;
+         stbi_uc *tga_row = tga_data + row*tga_width*tga_comp;
+         stbi__getn(s, tga_row, tga_width * tga_comp);
+      }
+   } else  {
+      // do I need to load a palette?
+      if ( tga_indexed)
+      {
+         if (tga_palette_len == 0) {  /* you have to have at least one entry! */
+            STBI_FREE(tga_data);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+
+         // any data to skip? (offset usually = 0)
+         stbi__skip(s, tga_palette_start );
+         // load the palette
+         tga_palette = (unsigned char*)stbi__malloc_mad2(tga_palette_len, tga_comp, 0);
+         if (!tga_palette) {
+            STBI_FREE(tga_data);
+            return stbi__errpuc("outofmem", "Out of memory");
+         }
+         if (tga_rgb16) {
+            stbi_uc *pal_entry = tga_palette;
+            STBI_ASSERT(tga_comp == STBI_rgb);
+            for (i=0; i < tga_palette_len; ++i) {
+               stbi__tga_read_rgb16(s, pal_entry);
+               pal_entry += tga_comp;
+            }
+         } else if (!stbi__getn(s, tga_palette, tga_palette_len * tga_comp)) {
+            STBI_FREE(tga_data);
+            STBI_FREE(tga_palette);
+            return stbi__errpuc("bad palette", "Corrupt TGA");
+         }
+      }
+      // load the data
+      for (i=0; i < tga_width * tga_height; ++i)
+      {
+         // if I'm in RLE mode, do I need to get an RLE packet?
+         if ( tga_is_RLE )
+         {
+            if ( RLE_count == 0 )
+            {
+               // yep, get the next byte as an RLE command
+               int RLE_cmd = stbi__get8(s);
+               RLE_count = 1 + (RLE_cmd & 127);
+               RLE_repeating = RLE_cmd >> 7;
+               read_next_pixel = 1;
+            } else if ( !RLE_repeating )
+            {
+               read_next_pixel = 1;
+            }
+         } else
+         {
+            read_next_pixel = 1;
+         }
+         // OK, if I need to read a pixel, do it now
+         if ( read_next_pixel )
+         {
+            // load however much data we did have
+            if ( tga_indexed )
+            {
+               // read in index, then perform the lookup
+               int pal_idx = (tga_bits_per_pixel == 8) ?
stbi__get8(s) : stbi__get16le(s); + if ( pal_idx >= tga_palette_len ) { + // invalid index + pal_idx = 0; + } + pal_idx *= tga_comp; + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = tga_palette[pal_idx+j]; + } + } else if(tga_rgb16) { + STBI_ASSERT(tga_comp == STBI_rgb); + stbi__tga_read_rgb16(s, raw_data); + } else { + // read in the data raw + for (j = 0; j < tga_comp; ++j) { + raw_data[j] = stbi__get8(s); + } + } + // clear the reading flag for the next pixel + read_next_pixel = 0; + } // end of reading a pixel + + // copy data + for (j = 0; j < tga_comp; ++j) + tga_data[i*tga_comp+j] = raw_data[j]; + + // in case we're in RLE mode, keep counting down + --RLE_count; + } + // do I need to invert the image? + if ( tga_inverted ) + { + for (j = 0; j*2 < tga_height; ++j) + { + int index1 = j * tga_width * tga_comp; + int index2 = (tga_height - 1 - j) * tga_width * tga_comp; + for (i = tga_width * tga_comp; i > 0; --i) + { + unsigned char temp = tga_data[index1]; + tga_data[index1] = tga_data[index2]; + tga_data[index2] = temp; + ++index1; + ++index2; + } + } + } + // clear my palette, if I had one + if ( tga_palette != NULL ) + { + STBI_FREE( tga_palette ); + } + } + + // swap RGB - if the source data was RGB16, it already is in the right order + if (tga_comp >= 3 && !tga_rgb16) + { + unsigned char* tga_pixel = tga_data; + for (i=0; i < tga_width * tga_height; ++i) + { + unsigned char temp = tga_pixel[0]; + tga_pixel[0] = tga_pixel[2]; + tga_pixel[2] = temp; + tga_pixel += tga_comp; + } + } + + // convert to target component count + if (req_comp && req_comp != tga_comp) + tga_data = stbi__convert_format(tga_data, tga_comp, req_comp, tga_width, tga_height); + + // the things I do to get rid of an error message, and yet keep + // Microsoft's C compilers happy... [8^( + tga_palette_start = tga_palette_len = tga_palette_bits = + tga_x_origin = tga_y_origin = 0; + STBI_NOTUSED(tga_palette_start); + // OK, done + return tga_data; +} +#endif + +// ************************************************************************************************* +// Photoshop PSD loader -- PD by Thatcher Ulrich, integration by Nicolas Schulz, tweaked by STB + +#ifndef STBI_NO_PSD +static int stbi__psd_test(stbi__context *s) +{ + int r = (stbi__get32be(s) == 0x38425053); + stbi__rewind(s); + return r; +} + +static int stbi__psd_decode_rle(stbi__context *s, stbi_uc *p, int pixelCount) +{ + int count, nleft, len; + + count = 0; + while ((nleft = pixelCount - count) > 0) { + len = stbi__get8(s); + if (len == 128) { + // No-op. + } else if (len < 128) { + // Copy next len+1 bytes literally. + len++; + if (len > nleft) return 0; // corrupt data + count += len; + while (len) { + *p = stbi__get8(s); + p += 4; + len--; + } + } else if (len > 128) { + stbi_uc val; + // Next -len+1 bytes in the dest are replicated from next source byte. + // (Interpret len as a negative 8-bit int.) + len = 257 - len; + if (len > nleft) return 0; // corrupt data + val = stbi__get8(s); + count += len; + while (len) { + *p = val; + p += 4; + len--; + } + } + } + + return 1; +} + +static void *stbi__psd_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri, int bpc) +{ + int pixelCount; + int channelCount, compression; + int channel, i; + int bitdepth; + int w,h; + stbi_uc *out; + STBI_NOTUSED(ri); + + // Check identifier + if (stbi__get32be(s) != 0x38425053) // "8BPS" + return stbi__errpuc("not PSD", "Corrupt PSD image"); + + // Check file type version. 
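+   // (a version of 2 would indicate a PSB "large document" file, which
+   // this decoder does not attempt to handle)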
+ if (stbi__get16be(s) != 1) + return stbi__errpuc("wrong version", "Unsupported version of PSD image"); + + // Skip 6 reserved bytes. + stbi__skip(s, 6 ); + + // Read the number of channels (R, G, B, A, etc). + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) + return stbi__errpuc("wrong channel count", "Unsupported number of channels in PSD image"); + + // Read the rows and columns of the image. + h = stbi__get32be(s); + w = stbi__get32be(s); + + if (h > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (w > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + // Make sure the depth is 8 bits. + bitdepth = stbi__get16be(s); + if (bitdepth != 8 && bitdepth != 16) + return stbi__errpuc("unsupported bit depth", "PSD bit depth is not 8 or 16 bit"); + + // Make sure the color mode is RGB. + // Valid options are: + // 0: Bitmap + // 1: Grayscale + // 2: Indexed color + // 3: RGB color + // 4: CMYK color + // 7: Multichannel + // 8: Duotone + // 9: Lab color + if (stbi__get16be(s) != 3) + return stbi__errpuc("wrong color format", "PSD is not in RGB color format"); + + // Skip the Mode Data. (It's the palette for indexed color; other info for other modes.) + stbi__skip(s,stbi__get32be(s) ); + + // Skip the image resources. (resolution, pen tool paths, etc) + stbi__skip(s, stbi__get32be(s) ); + + // Skip the reserved data. + stbi__skip(s, stbi__get32be(s) ); + + // Find out if the data is compressed. + // Known values: + // 0: no compression + // 1: RLE compressed + compression = stbi__get16be(s); + if (compression > 1) + return stbi__errpuc("bad compression", "PSD has an unknown compression format"); + + // Check size + if (!stbi__mad3sizes_valid(4, w, h, 0)) + return stbi__errpuc("too large", "Corrupt PSD"); + + // Create the destination image. + + if (!compression && bitdepth == 16 && bpc == 16) { + out = (stbi_uc *) stbi__malloc_mad3(8, w, h, 0); + ri->bits_per_channel = 16; + } else + out = (stbi_uc *) stbi__malloc(4 * w*h); + + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + pixelCount = w*h; + + // Initialize the data to zero. + //memset( out, 0, pixelCount * 4 ); + + // Finally, the image data. + if (compression) { + // RLE as used by .PSD and .TIFF + // Loop until you get the number of unpacked bytes you are expecting: + // Read the next source byte into n. + // If n is between 0 and 127 inclusive, copy the next n+1 bytes literally. + // Else if n is between -127 and -1 inclusive, copy the next byte -n+1 times. + // Else if n is 128, noop. + // Endloop + + // The RLE-compressed data is preceded by a 2-byte data count for each row in the data, + // which we're going to just skip. + stbi__skip(s, h * channelCount * 2 ); + + // Read the RLE data by channel. + for (channel = 0; channel < 4; channel++) { + stbi_uc *p; + + p = out+channel; + if (channel >= channelCount) { + // Fill this channel with default data. + for (i = 0; i < pixelCount; i++, p += 4) + *p = (channel == 3 ? 255 : 0); + } else { + // Read the RLE data. + if (!stbi__psd_decode_rle(s, p, pixelCount)) { + STBI_FREE(out); + return stbi__errpuc("corrupt", "bad RLE data"); + } + } + } + + } else { + // We're at the raw image data. It's each channel in order (Red, Green, Blue, Alpha, ...) + // where each channel consists of an 8-bit (or 16-bit) value for each pixel in the image. + + // Read the data by channel. 
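+      // PSD image data is planar -- every Red sample, then every Green, and
+      // so on -- so each pass below reads one source plane and scatters it
+      // into the interleaved RGBA output with a stride of 4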
+ for (channel = 0; channel < 4; channel++) { + if (channel >= channelCount) { + // Fill this channel with default data. + if (bitdepth == 16 && bpc == 16) { + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + stbi__uint16 val = channel == 3 ? 65535 : 0; + for (i = 0; i < pixelCount; i++, q += 4) + *q = val; + } else { + stbi_uc *p = out+channel; + stbi_uc val = channel == 3 ? 255 : 0; + for (i = 0; i < pixelCount; i++, p += 4) + *p = val; + } + } else { + if (ri->bits_per_channel == 16) { // output bpc + stbi__uint16 *q = ((stbi__uint16 *) out) + channel; + for (i = 0; i < pixelCount; i++, q += 4) + *q = (stbi__uint16) stbi__get16be(s); + } else { + stbi_uc *p = out+channel; + if (bitdepth == 16) { // input bpc + for (i = 0; i < pixelCount; i++, p += 4) + *p = (stbi_uc) (stbi__get16be(s) >> 8); + } else { + for (i = 0; i < pixelCount; i++, p += 4) + *p = stbi__get8(s); + } + } + } + } + } + + // remove weird white matte from PSD + if (channelCount >= 4) { + if (ri->bits_per_channel == 16) { + for (i=0; i < w*h; ++i) { + stbi__uint16 *pixel = (stbi__uint16 *) out + 4*i; + if (pixel[3] != 0 && pixel[3] != 65535) { + float a = pixel[3] / 65535.0f; + float ra = 1.0f / a; + float inv_a = 65535.0f * (1 - ra); + pixel[0] = (stbi__uint16) (pixel[0]*ra + inv_a); + pixel[1] = (stbi__uint16) (pixel[1]*ra + inv_a); + pixel[2] = (stbi__uint16) (pixel[2]*ra + inv_a); + } + } + } else { + for (i=0; i < w*h; ++i) { + unsigned char *pixel = out + 4*i; + if (pixel[3] != 0 && pixel[3] != 255) { + float a = pixel[3] / 255.0f; + float ra = 1.0f / a; + float inv_a = 255.0f * (1 - ra); + pixel[0] = (unsigned char) (pixel[0]*ra + inv_a); + pixel[1] = (unsigned char) (pixel[1]*ra + inv_a); + pixel[2] = (unsigned char) (pixel[2]*ra + inv_a); + } + } + } + } + + // convert to desired output format + if (req_comp && req_comp != 4) { + if (ri->bits_per_channel == 16) + out = (stbi_uc *) stbi__convert_format16((stbi__uint16 *) out, 4, req_comp, w, h); + else + out = stbi__convert_format(out, 4, req_comp, w, h); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + + if (comp) *comp = 4; + *y = h; + *x = w; + + return out; +} +#endif + +// ************************************************************************************************* +// Softimage PIC loader +// by Tom Seddon +// +// See http://softimage.wiki.softimage.com/index.php/INFO:_PIC_file_format +// See http://ozviz.wasp.uwa.edu.au/~pbourke/dataformats/softimagepic/ + +#ifndef STBI_NO_PIC +static int stbi__pic_is4(stbi__context *s,const char *str) +{ + int i; + for (i=0; i<4; ++i) + if (stbi__get8(s) != (stbi_uc)str[i]) + return 0; + + return 1; +} + +static int stbi__pic_test_core(stbi__context *s) +{ + int i; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) + return 0; + + for(i=0;i<84;++i) + stbi__get8(s); + + if (!stbi__pic_is4(s,"PICT")) + return 0; + + return 1; +} + +typedef struct +{ + stbi_uc size,type,channel; +} stbi__pic_packet; + +static stbi_uc *stbi__readval(stbi__context *s, int channel, stbi_uc *dest) +{ + int mask=0x80, i; + + for (i=0; i<4; ++i, mask>>=1) { + if (channel & mask) { + if (stbi__at_eof(s)) return stbi__errpuc("bad file","PIC file too short"); + dest[i]=stbi__get8(s); + } + } + + return dest; +} + +static void stbi__copyval(int channel,stbi_uc *dest,const stbi_uc *src) +{ + int mask=0x80,i; + + for (i=0;i<4; ++i, mask>>=1) + if (channel&mask) + dest[i]=src[i]; +} + +static stbi_uc *stbi__pic_load_core(stbi__context *s,int width,int height,int *comp, stbi_uc *result) +{ + int 
act_comp=0,num_packets=0,y,chained;
+   stbi__pic_packet packets[10];
+
+   // this will (should...) cater for even some bizarre stuff like having data
+    // for the same channel in multiple packets.
+   do {
+      stbi__pic_packet *packet;
+
+      if (num_packets==sizeof(packets)/sizeof(packets[0]))
+         return stbi__errpuc("bad format","too many packets");
+
+      packet = &packets[num_packets++];
+
+      chained = stbi__get8(s);
+      packet->size    = stbi__get8(s);
+      packet->type    = stbi__get8(s);
+      packet->channel = stbi__get8(s);
+
+      act_comp |= packet->channel;
+
+      if (stbi__at_eof(s))    return stbi__errpuc("bad file","file too short (reading packets)");
+      if (packet->size != 8)  return stbi__errpuc("bad format","packet isn't 8bpp");
+   } while (chained);
+
+   *comp = (act_comp & 0x10 ? 4 : 3); // has alpha channel?
+
+   for(y=0; y<height; ++y) {
+      int packet_idx;
+
+      for(packet_idx=0; packet_idx < num_packets; ++packet_idx) {
+         stbi__pic_packet *packet = &packets[packet_idx];
+         stbi_uc *dest = result+y*width*4;
+
+         switch (packet->type) {
+            default:
+               return stbi__errpuc("bad format","packet has bad compression type");
+
+            case 0: {//uncompressed
+               int x;
+
+               for(x=0;x<width;++x, dest+=4)
+                  if (!stbi__readval(s,packet->channel,dest))
+                     return 0;
+               break;
+            }
+
+            case 1://Pure RLE
+               {
+                  int left=width, i;
+
+                  while (left>0) {
+                     stbi_uc count,value[4];
+
+                     count=stbi__get8(s);
+                     if (stbi__at_eof(s))   return stbi__errpuc("bad file","file too short (pure read count)");
+
+                     if (count > left)
+                        count = (stbi_uc) left;
+
+                     if (!stbi__readval(s,packet->channel,value))  return 0;
+
+                     for(i=0; i<count; ++i,dest+=4)
+                        stbi__copyval(packet->channel,dest,value);
+                     left -= count;
+                  }
+               }
+               break;
+
+            case 2: {//Mixed RLE
+               int left=width;
+               while (left>0) {
+                  int count = stbi__get8(s), i;
+                  if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (mixed read count)");
+
+                  if (count >= 128) { // Repeated
+                     stbi_uc value[4];
+
+                     if (count==128)
+                        count = stbi__get16be(s);
+                     else
+                        count -= 127;
+                     if (count > left)
+                        return stbi__errpuc("bad file","scanline overrun");
+
+                     if (!stbi__readval(s,packet->channel,value))
+                        return 0;
+
+                     for(i=0;i<count;++i, dest += 4)
+                        stbi__copyval(packet->channel,dest,value);
+                  } else { // Raw
+                     ++count;
+                     if (count>left) return stbi__errpuc("bad file","scanline overrun");
+
+                     for(i=0;i<count;++i, dest+=4)
+                        if (!stbi__readval(s,packet->channel,dest))
+                           return 0;
+                  }
+                  left-=count;
+               }
+               break;
+            }
+         }
+      }
+   }
+
+   return result;
+}
+
+static void *stbi__pic_load(stbi__context *s,int *px,int *py,int *comp,int req_comp, stbi__result_info *ri)
+{
+   stbi_uc *result;
+   int i, x,y, internal_comp;
+   STBI_NOTUSED(ri);
+
+   if (!comp) comp = &internal_comp;
+
+   for (i=0; i<92; ++i)
+      stbi__get8(s);
+
+   x = stbi__get16be(s);
+   y = stbi__get16be(s);
+
+   if (y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+   if (x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)");
+
+   if (stbi__at_eof(s))  return stbi__errpuc("bad file","file too short (pic header)");
+   if (!stbi__mad3sizes_valid(x, y, 4, 0)) return stbi__errpuc("too large", "PIC image too large to decode");
+
+   stbi__get32be(s); //skip `ratio'
+   stbi__get16be(s); //skip `fields'
+   stbi__get16be(s); //skip `pad'
+
+   // intermediate buffer is RGBA
+   result = (stbi_uc *) stbi__malloc_mad3(x, y, 4, 0);
+   if (!result) return stbi__errpuc("outofmem", "Out of memory");
+   memset(result, 0xff, x*y*4);
+
+   if (!stbi__pic_load_core(s,x,y,comp, result)) {
+      STBI_FREE(result);
+      result=0;
+   }
+   *px = x;
+   *py = y;
+   if (req_comp == 0) req_comp = *comp;
+   result=stbi__convert_format(result,4,req_comp,x,y);
+
+   return result;
+}
+
+static int stbi__pic_test(stbi__context *s)
+{
+   int r = stbi__pic_test_core(s);
+   stbi__rewind(s);
+   return r;
+}
+#endif
+
+// *************************************************************************************************
+// GIF loader -- public domain by
Jean-Marc Lienher -- simplified/shrunk by stb + +#ifndef STBI_NO_GIF +typedef struct +{ + stbi__int16 prefix; + stbi_uc first; + stbi_uc suffix; +} stbi__gif_lzw; + +typedef struct +{ + int w,h; + stbi_uc *out; // output buffer (always 4 components) + stbi_uc *background; // The current "background" as far as a gif is concerned + stbi_uc *history; + int flags, bgindex, ratio, transparent, eflags; + stbi_uc pal[256][4]; + stbi_uc lpal[256][4]; + stbi__gif_lzw codes[8192]; + stbi_uc *color_table; + int parse, step; + int lflags; + int start_x, start_y; + int max_x, max_y; + int cur_x, cur_y; + int line_size; + int delay; +} stbi__gif; + +static int stbi__gif_test_raw(stbi__context *s) +{ + int sz; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') return 0; + sz = stbi__get8(s); + if (sz != '9' && sz != '7') return 0; + if (stbi__get8(s) != 'a') return 0; + return 1; +} + +static int stbi__gif_test(stbi__context *s) +{ + int r = stbi__gif_test_raw(s); + stbi__rewind(s); + return r; +} + +static void stbi__gif_parse_colortable(stbi__context *s, stbi_uc pal[256][4], int num_entries, int transp) +{ + int i; + for (i=0; i < num_entries; ++i) { + pal[i][2] = stbi__get8(s); + pal[i][1] = stbi__get8(s); + pal[i][0] = stbi__get8(s); + pal[i][3] = transp == i ? 0 : 255; + } +} + +static int stbi__gif_header(stbi__context *s, stbi__gif *g, int *comp, int is_info) +{ + stbi_uc version; + if (stbi__get8(s) != 'G' || stbi__get8(s) != 'I' || stbi__get8(s) != 'F' || stbi__get8(s) != '8') + return stbi__err("not GIF", "Corrupt GIF"); + + version = stbi__get8(s); + if (version != '7' && version != '9') return stbi__err("not GIF", "Corrupt GIF"); + if (stbi__get8(s) != 'a') return stbi__err("not GIF", "Corrupt GIF"); + + stbi__g_failure_reason = ""; + g->w = stbi__get16le(s); + g->h = stbi__get16le(s); + g->flags = stbi__get8(s); + g->bgindex = stbi__get8(s); + g->ratio = stbi__get8(s); + g->transparent = -1; + + if (g->w > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + if (g->h > STBI_MAX_DIMENSIONS) return stbi__err("too large","Very large image (corrupt?)"); + + if (comp != 0) *comp = 4; // can't actually tell whether it's 3 or 4 until we parse the comments + + if (is_info) return 1; + + if (g->flags & 0x80) + stbi__gif_parse_colortable(s,g->pal, 2 << (g->flags & 7), -1); + + return 1; +} + +static int stbi__gif_info_raw(stbi__context *s, int *x, int *y, int *comp) +{ + stbi__gif* g = (stbi__gif*) stbi__malloc(sizeof(stbi__gif)); + if (!g) return stbi__err("outofmem", "Out of memory"); + if (!stbi__gif_header(s, g, comp, 1)) { + STBI_FREE(g); + stbi__rewind( s ); + return 0; + } + if (x) *x = g->w; + if (y) *y = g->h; + STBI_FREE(g); + return 1; +} + +static void stbi__out_gif_code(stbi__gif *g, stbi__uint16 code) +{ + stbi_uc *p, *c; + int idx; + + // recurse to decode the prefixes, since the linked-list is backwards, + // and working backwards through an interleaved image would be nasty + if (g->codes[code].prefix >= 0) + stbi__out_gif_code(g, g->codes[code].prefix); + + if (g->cur_y >= g->max_y) return; + + idx = g->cur_x + g->cur_y; + p = &g->out[idx]; + g->history[idx / 4] = 1; + + c = &g->color_table[g->codes[code].suffix * 4]; + if (c[3] > 128) { // don't render transparent pixels; + p[0] = c[2]; + p[1] = c[1]; + p[2] = c[0]; + p[3] = c[3]; + } + g->cur_x += 4; + + if (g->cur_x >= g->max_x) { + g->cur_x = g->start_x; + g->cur_y += g->step; + + while (g->cur_y >= g->max_y && g->parse > 0) { + g->step = (1 << 
g->parse) * g->line_size; + g->cur_y = g->start_y + (g->step >> 1); + --g->parse; + } + } +} + +static stbi_uc *stbi__process_gif_raster(stbi__context *s, stbi__gif *g) +{ + stbi_uc lzw_cs; + stbi__int32 len, init_code; + stbi__uint32 first; + stbi__int32 codesize, codemask, avail, oldcode, bits, valid_bits, clear; + stbi__gif_lzw *p; + + lzw_cs = stbi__get8(s); + if (lzw_cs > 12) return NULL; + clear = 1 << lzw_cs; + first = 1; + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + bits = 0; + valid_bits = 0; + for (init_code = 0; init_code < clear; init_code++) { + g->codes[init_code].prefix = -1; + g->codes[init_code].first = (stbi_uc) init_code; + g->codes[init_code].suffix = (stbi_uc) init_code; + } + + // support no starting clear code + avail = clear+2; + oldcode = -1; + + len = 0; + for(;;) { + if (valid_bits < codesize) { + if (len == 0) { + len = stbi__get8(s); // start new block + if (len == 0) + return g->out; + } + --len; + bits |= (stbi__int32) stbi__get8(s) << valid_bits; + valid_bits += 8; + } else { + stbi__int32 code = bits & codemask; + bits >>= codesize; + valid_bits -= codesize; + // @OPTIMIZE: is there some way we can accelerate the non-clear path? + if (code == clear) { // clear code + codesize = lzw_cs + 1; + codemask = (1 << codesize) - 1; + avail = clear + 2; + oldcode = -1; + first = 0; + } else if (code == clear + 1) { // end of stream code + stbi__skip(s, len); + while ((len = stbi__get8(s)) > 0) + stbi__skip(s,len); + return g->out; + } else if (code <= avail) { + if (first) { + return stbi__errpuc("no clear code", "Corrupt GIF"); + } + + if (oldcode >= 0) { + p = &g->codes[avail++]; + if (avail > 8192) { + return stbi__errpuc("too many codes", "Corrupt GIF"); + } + + p->prefix = (stbi__int16) oldcode; + p->first = g->codes[oldcode].first; + p->suffix = (code == avail) ? p->first : g->codes[code].first; + } else if (code == avail) + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + + stbi__out_gif_code(g, (stbi__uint16) code); + + if ((avail & codemask) == 0 && avail <= 0x0FFF) { + codesize++; + codemask = (1 << codesize) - 1; + } + + oldcode = code; + } else { + return stbi__errpuc("illegal code in raster", "Corrupt GIF"); + } + } + } +} + +// this function is designed to support animated gifs, although stb_image doesn't support it +// two back is the image from two frames ago, used for a very specific disposal format +static stbi_uc *stbi__gif_load_next(stbi__context *s, stbi__gif *g, int *comp, int req_comp, stbi_uc *two_back) +{ + int dispose; + int first_frame; + int pi; + int pcount; + STBI_NOTUSED(req_comp); + + // on first frame, any non-written pixels get the background colour (non-transparent) + first_frame = 0; + if (g->out == 0) { + if (!stbi__gif_header(s, g, comp,0)) return 0; // stbi__g_failure_reason set by stbi__gif_header + if (!stbi__mad3sizes_valid(4, g->w, g->h, 0)) + return stbi__errpuc("too large", "GIF image is too large"); + pcount = g->w * g->h; + g->out = (stbi_uc *) stbi__malloc(4 * pcount); + g->background = (stbi_uc *) stbi__malloc(4 * pcount); + g->history = (stbi_uc *) stbi__malloc(pcount); + if (!g->out || !g->background || !g->history) + return stbi__errpuc("outofmem", "Out of memory"); + + // image is treated as "transparent" at the start - ie, nothing overwrites the current background; + // background colour is only used for pixels that are not rendered first frame, after that "background" + // color refers to the color that was there the previous frame. 
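+      // For example, a pixel never drawn by the first frame stays transparent
+      // here, and is only filled with the background colour at the end of the
+      // first Image Descriptor (see the bgindex fix-up further below).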
+ memset(g->out, 0x00, 4 * pcount); + memset(g->background, 0x00, 4 * pcount); // state of the background (starts transparent) + memset(g->history, 0x00, pcount); // pixels that were affected previous frame + first_frame = 1; + } else { + // second frame - how do we dispose of the previous one? + dispose = (g->eflags & 0x1C) >> 2; + pcount = g->w * g->h; + + if ((dispose == 3) && (two_back == 0)) { + dispose = 2; // if I don't have an image to revert back to, default to the old background + } + + if (dispose == 3) { // use previous graphic + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy( &g->out[pi * 4], &two_back[pi * 4], 4 ); + } + } + } else if (dispose == 2) { + // restore what was changed last frame to background before that frame; + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi]) { + memcpy( &g->out[pi * 4], &g->background[pi * 4], 4 ); + } + } + } else { + // This is a non-disposal case eithe way, so just + // leave the pixels as is, and they will become the new background + // 1: do not dispose + // 0: not specified. + } + + // background is what out is after the undoing of the previou frame; + memcpy( g->background, g->out, 4 * g->w * g->h ); + } + + // clear my history; + memset( g->history, 0x00, g->w * g->h ); // pixels that were affected previous frame + + for (;;) { + int tag = stbi__get8(s); + switch (tag) { + case 0x2C: /* Image Descriptor */ + { + stbi__int32 x, y, w, h; + stbi_uc *o; + + x = stbi__get16le(s); + y = stbi__get16le(s); + w = stbi__get16le(s); + h = stbi__get16le(s); + if (((x + w) > (g->w)) || ((y + h) > (g->h))) + return stbi__errpuc("bad Image Descriptor", "Corrupt GIF"); + + g->line_size = g->w * 4; + g->start_x = x * 4; + g->start_y = y * g->line_size; + g->max_x = g->start_x + w * 4; + g->max_y = g->start_y + h * g->line_size; + g->cur_x = g->start_x; + g->cur_y = g->start_y; + + // if the width of the specified rectangle is 0, that means + // we may not see *any* pixels or the image is malformed; + // to make sure this is caught, move the current y down to + // max_y (which is what out_gif_code checks). + if (w == 0) + g->cur_y = g->max_y; + + g->lflags = stbi__get8(s); + + if (g->lflags & 0x40) { + g->step = 8 * g->line_size; // first interlaced spacing + g->parse = 3; + } else { + g->step = g->line_size; + g->parse = 0; + } + + if (g->lflags & 0x80) { + stbi__gif_parse_colortable(s,g->lpal, 2 << (g->lflags & 7), g->eflags & 0x01 ? g->transparent : -1); + g->color_table = (stbi_uc *) g->lpal; + } else if (g->flags & 0x80) { + g->color_table = (stbi_uc *) g->pal; + } else + return stbi__errpuc("missing color table", "Corrupt GIF"); + + o = stbi__process_gif_raster(s, g); + if (!o) return NULL; + + // if this was the first frame, + pcount = g->w * g->h; + if (first_frame && (g->bgindex > 0)) { + // if first frame, any pixel not drawn to gets the background color + for (pi = 0; pi < pcount; ++pi) { + if (g->history[pi] == 0) { + g->pal[g->bgindex][3] = 255; // just in case it was made transparent, undo that; It will be reset next frame if need be; + memcpy( &g->out[pi * 4], &g->pal[g->bgindex], 4 ); + } + } + } + + return o; + } + + case 0x21: // Comment Extension. + { + int len; + int ext = stbi__get8(s); + if (ext == 0xF9) { // Graphic Control Extension. + len = stbi__get8(s); + if (len == 4) { + g->eflags = stbi__get8(s); + g->delay = 10 * stbi__get16le(s); // delay - 1/100th of a second, saving as 1/1000ths. 
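+            // For example, a stored value of 10 (bytes 0x0A 0x00,
+            // little-endian) becomes g->delay = 100 milliseconds.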
+ + // unset old transparent + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 255; + } + if (g->eflags & 0x01) { + g->transparent = stbi__get8(s); + if (g->transparent >= 0) { + g->pal[g->transparent][3] = 0; + } + } else { + // don't need transparent + stbi__skip(s, 1); + g->transparent = -1; + } + } else { + stbi__skip(s, len); + break; + } + } + while ((len = stbi__get8(s)) != 0) { + stbi__skip(s, len); + } + break; + } + + case 0x3B: // gif stream termination code + return (stbi_uc *) s; // using '1' causes warning on some compilers + + default: + return stbi__errpuc("unknown code", "Corrupt GIF"); + } + } +} + +static void *stbi__load_gif_main_outofmem(stbi__gif *g, stbi_uc *out, int **delays) +{ + STBI_FREE(g->out); + STBI_FREE(g->history); + STBI_FREE(g->background); + + if (out) STBI_FREE(out); + if (delays && *delays) STBI_FREE(*delays); + return stbi__errpuc("outofmem", "Out of memory"); +} + +static void *stbi__load_gif_main(stbi__context *s, int **delays, int *x, int *y, int *z, int *comp, int req_comp) +{ + if (stbi__gif_test(s)) { + int layers = 0; + stbi_uc *u = 0; + stbi_uc *out = 0; + stbi_uc *two_back = 0; + stbi__gif g; + int stride; + int out_size = 0; + int delays_size = 0; + + STBI_NOTUSED(out_size); + STBI_NOTUSED(delays_size); + + memset(&g, 0, sizeof(g)); + if (delays) { + *delays = 0; + } + + do { + u = stbi__gif_load_next(s, &g, comp, req_comp, two_back); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + + if (u) { + *x = g.w; + *y = g.h; + ++layers; + stride = g.w * g.h * 4; + + if (out) { + void *tmp = (stbi_uc*) STBI_REALLOC_SIZED( out, out_size, layers * stride ); + if (!tmp) + return stbi__load_gif_main_outofmem(&g, out, delays); + else { + out = (stbi_uc*) tmp; + out_size = layers * stride; + } + + if (delays) { + int *new_delays = (int*) STBI_REALLOC_SIZED( *delays, delays_size, sizeof(int) * layers ); + if (!new_delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + *delays = new_delays; + delays_size = layers * sizeof(int); + } + } else { + out = (stbi_uc*)stbi__malloc( layers * stride ); + if (!out) + return stbi__load_gif_main_outofmem(&g, out, delays); + out_size = layers * stride; + if (delays) { + *delays = (int*) stbi__malloc( layers * sizeof(int) ); + if (!*delays) + return stbi__load_gif_main_outofmem(&g, out, delays); + delays_size = layers * sizeof(int); + } + } + memcpy( out + ((layers - 1) * stride), u, stride ); + if (layers >= 2) { + two_back = out - 2 * stride; + } + + if (delays) { + (*delays)[layers - 1U] = g.delay; + } + } + } while (u != 0); + + // free temp buffer; + STBI_FREE(g.out); + STBI_FREE(g.history); + STBI_FREE(g.background); + + // do the final conversion after loading everything; + if (req_comp && req_comp != 4) + out = stbi__convert_format(out, 4, req_comp, layers * g.w, g.h); + + *z = layers; + return out; + } else { + return stbi__errpuc("not GIF", "Image was not as a gif type."); + } +} + +static void *stbi__gif_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *u = 0; + stbi__gif g; + memset(&g, 0, sizeof(g)); + STBI_NOTUSED(ri); + + u = stbi__gif_load_next(s, &g, comp, req_comp, 0); + if (u == (stbi_uc *) s) u = 0; // end of animated gif marker + if (u) { + *x = g.w; + *y = g.h; + + // moved conversion to after successful load so that the same + // can be done for multiple frames. 
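+      // For example, a caller requesting req_comp == 3 gets the 4-channel
+      // RGBA decode collapsed to RGB here, dropping the alpha channel.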
+ if (req_comp && req_comp != 4) + u = stbi__convert_format(u, 4, req_comp, g.w, g.h); + } else if (g.out) { + // if there was an error and we allocated an image buffer, free it! + STBI_FREE(g.out); + } + + // free buffers needed for multiple frame loading; + STBI_FREE(g.history); + STBI_FREE(g.background); + + return u; +} + +static int stbi__gif_info(stbi__context *s, int *x, int *y, int *comp) +{ + return stbi__gif_info_raw(s,x,y,comp); +} +#endif + +// ************************************************************************************************* +// Radiance RGBE HDR loader +// originally by Nicolas Schulz +#ifndef STBI_NO_HDR +static int stbi__hdr_test_core(stbi__context *s, const char *signature) +{ + int i; + for (i=0; signature[i]; ++i) + if (stbi__get8(s) != signature[i]) + return 0; + stbi__rewind(s); + return 1; +} + +static int stbi__hdr_test(stbi__context* s) +{ + int r = stbi__hdr_test_core(s, "#?RADIANCE\n"); + stbi__rewind(s); + if(!r) { + r = stbi__hdr_test_core(s, "#?RGBE\n"); + stbi__rewind(s); + } + return r; +} + +#define STBI__HDR_BUFLEN 1024 +static char *stbi__hdr_gettoken(stbi__context *z, char *buffer) +{ + int len=0; + char c = '\0'; + + c = (char) stbi__get8(z); + + while (!stbi__at_eof(z) && c != '\n') { + buffer[len++] = c; + if (len == STBI__HDR_BUFLEN-1) { + // flush to end of line + while (!stbi__at_eof(z) && stbi__get8(z) != '\n') + ; + break; + } + c = (char) stbi__get8(z); + } + + buffer[len] = 0; + return buffer; +} + +static void stbi__hdr_convert(float *output, stbi_uc *input, int req_comp) +{ + if ( input[3] != 0 ) { + float f1; + // Exponent + f1 = (float) ldexp(1.0f, input[3] - (int)(128 + 8)); + if (req_comp <= 2) + output[0] = (input[0] + input[1] + input[2]) * f1 / 3; + else { + output[0] = input[0] * f1; + output[1] = input[1] * f1; + output[2] = input[2] * f1; + } + if (req_comp == 2) output[1] = 1; + if (req_comp == 4) output[3] = 1; + } else { + switch (req_comp) { + case 4: output[3] = 1; /* fallthrough */ + case 3: output[0] = output[1] = output[2] = 0; + break; + case 2: output[1] = 1; /* fallthrough */ + case 1: output[0] = 0; + break; + } + } +} + +static float *stbi__hdr_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + char buffer[STBI__HDR_BUFLEN]; + char *token; + int valid = 0; + int width, height; + stbi_uc *scanline; + float *hdr_data; + int len; + unsigned char count, value; + int i, j, k, c1,c2, z; + const char *headerToken; + STBI_NOTUSED(ri); + + // Check identifier + headerToken = stbi__hdr_gettoken(s,buffer); + if (strcmp(headerToken, "#?RADIANCE") != 0 && strcmp(headerToken, "#?RGBE") != 0) + return stbi__errpf("not HDR", "Corrupt HDR image"); + + // Parse header + for(;;) { + token = stbi__hdr_gettoken(s,buffer); + if (token[0] == 0) break; + if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1; + } + + if (!valid) return stbi__errpf("unsupported format", "Unsupported HDR format"); + + // Parse width and height + // can't use sscanf() if we're not using stdio! 
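+   // For example, a dimensions line of "-Y 480 +X 640" yields height = 480
+   // and width = 640; only this standard orientation is accepted here.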
+   token = stbi__hdr_gettoken(s,buffer);
+   if (strncmp(token, "-Y ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   height = (int) strtol(token, &token, 10);
+   while (*token == ' ') ++token;
+   if (strncmp(token, "+X ", 3))  return stbi__errpf("unsupported data layout", "Unsupported HDR format");
+   token += 3;
+   width = (int) strtol(token, NULL, 10);
+
+   if (height > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+   if (width > STBI_MAX_DIMENSIONS) return stbi__errpf("too large","Very large image (corrupt?)");
+
+   *x = width;
+   *y = height;
+
+   if (comp) *comp = 3;
+   if (req_comp == 0) req_comp = 3;
+
+   if (!stbi__mad4sizes_valid(width, height, req_comp, sizeof(float), 0))
+      return stbi__errpf("too large", "HDR image is too large");
+
+   // Read data
+   hdr_data = (float *) stbi__malloc_mad4(width, height, req_comp, sizeof(float), 0);
+   if (!hdr_data)
+      return stbi__errpf("outofmem", "Out of memory");
+
+   // Load image data
+   // image data is stored as some number of scan lines
+   if ( width < 8 || width >= 32768) {
+      // Read flat data
+      for (j=0; j < height; ++j) {
+         for (i=0; i < width; ++i) {
+            stbi_uc rgbe[4];
+           main_decode_loop:
+            stbi__getn(s, rgbe, 4);
+            stbi__hdr_convert(hdr_data + j * width * req_comp + i * req_comp, rgbe, req_comp);
+         }
+      }
+   } else {
+      // Read RLE-encoded data
+      scanline = NULL;
+
+      for (j = 0; j < height; ++j) {
+         c1 = stbi__get8(s);
+         c2 = stbi__get8(s);
+         len = stbi__get8(s);
+         if (c1 != 2 || c2 != 2 || (len & 0x80)) {
+            // not run-length encoded, so we have to actually use THIS data as a decoded
+            // pixel (note this can't be a valid pixel--one of RGB must be >= 128)
+            stbi_uc rgbe[4];
+            rgbe[0] = (stbi_uc) c1;
+            rgbe[1] = (stbi_uc) c2;
+            rgbe[2] = (stbi_uc) len;
+            rgbe[3] = (stbi_uc) stbi__get8(s);
+            stbi__hdr_convert(hdr_data, rgbe, req_comp);
+            i = 1;
+            j = 0;
+            STBI_FREE(scanline);
+            goto main_decode_loop; // yes, this makes no sense
+         }
+         len <<= 8;
+         len |= stbi__get8(s);
+         if (len != width) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("invalid decoded scanline length", "corrupt HDR"); }
+         if (scanline == NULL) {
+            scanline = (stbi_uc *) stbi__malloc_mad2(width, 4, 0);
+            if (!scanline) {
+               STBI_FREE(hdr_data);
+               return stbi__errpf("outofmem", "Out of memory");
+            }
+         }
+
+         for (k = 0; k < 4; ++k) {
+            int nleft;
+            i = 0;
+            while ((nleft = width - i) > 0) {
+               count = stbi__get8(s);
+               if (count > 128) {
+                  // Run
+                  value = stbi__get8(s);
+                  count -= 128;
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = value;
+               } else {
+                  // Dump
+                  if (count > nleft) { STBI_FREE(hdr_data); STBI_FREE(scanline); return stbi__errpf("corrupt", "bad RLE data in HDR"); }
+                  for (z = 0; z < count; ++z)
+                     scanline[i++ * 4 + k] = stbi__get8(s);
+               }
+            }
+         }
+         for (i=0; i < width; ++i)
+            stbi__hdr_convert(hdr_data+(j*width + i)*req_comp, scanline + i*4, req_comp);
+      }
+      if (scanline)
+         STBI_FREE(scanline);
+   }
+
+   return hdr_data;
+}
+
+static int stbi__hdr_info(stbi__context *s, int *x, int *y, int *comp)
+{
+   char buffer[STBI__HDR_BUFLEN];
+   char *token;
+   int valid = 0;
+   int dummy;
+
+   if (!x) x = &dummy;
+   if (!y) y = &dummy;
+   if (!comp) comp = &dummy;
+
+   if (stbi__hdr_test(s) == 0) {
+       stbi__rewind( s );
+       return 0;
+   }
+
+   for(;;) {
+      token = stbi__hdr_gettoken(s,buffer);
+      if (token[0] == 0) break;
+      if (strcmp(token, "FORMAT=32-bit_rle_rgbe") == 0) valid = 1;
+   }
+
+ if (!valid) { + stbi__rewind( s ); + return 0; + } + token = stbi__hdr_gettoken(s,buffer); + if (strncmp(token, "-Y ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *y = (int) strtol(token, &token, 10); + while (*token == ' ') ++token; + if (strncmp(token, "+X ", 3)) { + stbi__rewind( s ); + return 0; + } + token += 3; + *x = (int) strtol(token, NULL, 10); + *comp = 3; + return 1; +} +#endif // STBI_NO_HDR + +#ifndef STBI_NO_BMP +static int stbi__bmp_info(stbi__context *s, int *x, int *y, int *comp) +{ + void *p; + stbi__bmp_data info; + + info.all_a = 255; + p = stbi__bmp_parse_header(s, &info); + if (p == NULL) { + stbi__rewind( s ); + return 0; + } + if (x) *x = s->img_x; + if (y) *y = s->img_y; + if (comp) { + if (info.bpp == 24 && info.ma == 0xff000000) + *comp = 3; + else + *comp = info.ma ? 4 : 3; + } + return 1; +} +#endif + +#ifndef STBI_NO_PSD +static int stbi__psd_info(stbi__context *s, int *x, int *y, int *comp) +{ + int channelCount, dummy, depth; + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + *y = stbi__get32be(s); + *x = stbi__get32be(s); + depth = stbi__get16be(s); + if (depth != 8 && depth != 16) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 3) { + stbi__rewind( s ); + return 0; + } + *comp = 4; + return 1; +} + +static int stbi__psd_is16(stbi__context *s) +{ + int channelCount, depth; + if (stbi__get32be(s) != 0x38425053) { + stbi__rewind( s ); + return 0; + } + if (stbi__get16be(s) != 1) { + stbi__rewind( s ); + return 0; + } + stbi__skip(s, 6); + channelCount = stbi__get16be(s); + if (channelCount < 0 || channelCount > 16) { + stbi__rewind( s ); + return 0; + } + STBI_NOTUSED(stbi__get32be(s)); + STBI_NOTUSED(stbi__get32be(s)); + depth = stbi__get16be(s); + if (depth != 16) { + stbi__rewind( s ); + return 0; + } + return 1; +} +#endif + +#ifndef STBI_NO_PIC +static int stbi__pic_info(stbi__context *s, int *x, int *y, int *comp) +{ + int act_comp=0,num_packets=0,chained,dummy; + stbi__pic_packet packets[10]; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + if (!stbi__pic_is4(s,"\x53\x80\xF6\x34")) { + stbi__rewind(s); + return 0; + } + + stbi__skip(s, 88); + + *x = stbi__get16be(s); + *y = stbi__get16be(s); + if (stbi__at_eof(s)) { + stbi__rewind( s); + return 0; + } + if ( (*x) != 0 && (1 << 28) / (*x) < (*y)) { + stbi__rewind( s ); + return 0; + } + + stbi__skip(s, 8); + + do { + stbi__pic_packet *packet; + + if (num_packets==sizeof(packets)/sizeof(packets[0])) + return 0; + + packet = &packets[num_packets++]; + chained = stbi__get8(s); + packet->size = stbi__get8(s); + packet->type = stbi__get8(s); + packet->channel = stbi__get8(s); + act_comp |= packet->channel; + + if (stbi__at_eof(s)) { + stbi__rewind( s ); + return 0; + } + if (packet->size != 8) { + stbi__rewind( s ); + return 0; + } + } while (chained); + + *comp = (act_comp & 0x10 ? 
4 : 3); + + return 1; +} +#endif + +// ************************************************************************************************* +// Portable Gray Map and Portable Pixel Map loader +// by Ken Miller +// +// PGM: http://netpbm.sourceforge.net/doc/pgm.html +// PPM: http://netpbm.sourceforge.net/doc/ppm.html +// +// Known limitations: +// Does not support comments in the header section +// Does not support ASCII image data (formats P2 and P3) + +#ifndef STBI_NO_PNM + +static int stbi__pnm_test(stbi__context *s) +{ + char p, t; + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind( s ); + return 0; + } + return 1; +} + +static void *stbi__pnm_load(stbi__context *s, int *x, int *y, int *comp, int req_comp, stbi__result_info *ri) +{ + stbi_uc *out; + STBI_NOTUSED(ri); + + ri->bits_per_channel = stbi__pnm_info(s, (int *)&s->img_x, (int *)&s->img_y, (int *)&s->img_n); + if (ri->bits_per_channel == 0) + return 0; + + if (s->img_y > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + if (s->img_x > STBI_MAX_DIMENSIONS) return stbi__errpuc("too large","Very large image (corrupt?)"); + + *x = s->img_x; + *y = s->img_y; + if (comp) *comp = s->img_n; + + if (!stbi__mad4sizes_valid(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0)) + return stbi__errpuc("too large", "PNM too large"); + + out = (stbi_uc *) stbi__malloc_mad4(s->img_n, s->img_x, s->img_y, ri->bits_per_channel / 8, 0); + if (!out) return stbi__errpuc("outofmem", "Out of memory"); + stbi__getn(s, out, s->img_n * s->img_x * s->img_y * (ri->bits_per_channel / 8)); + + if (req_comp && req_comp != s->img_n) { + out = stbi__convert_format(out, s->img_n, req_comp, s->img_x, s->img_y); + if (out == NULL) return out; // stbi__convert_format frees input on failure + } + return out; +} + +static int stbi__pnm_isspace(char c) +{ + return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'; +} + +static void stbi__pnm_skip_whitespace(stbi__context *s, char *c) +{ + for (;;) { + while (!stbi__at_eof(s) && stbi__pnm_isspace(*c)) + *c = (char) stbi__get8(s); + + if (stbi__at_eof(s) || *c != '#') + break; + + while (!stbi__at_eof(s) && *c != '\n' && *c != '\r' ) + *c = (char) stbi__get8(s); + } +} + +static int stbi__pnm_isdigit(char c) +{ + return c >= '0' && c <= '9'; +} + +static int stbi__pnm_getinteger(stbi__context *s, char *c) +{ + int value = 0; + + while (!stbi__at_eof(s) && stbi__pnm_isdigit(*c)) { + value = value*10 + (*c - '0'); + *c = (char) stbi__get8(s); + } + + return value; +} + +static int stbi__pnm_info(stbi__context *s, int *x, int *y, int *comp) +{ + int maxv, dummy; + char c, p, t; + + if (!x) x = &dummy; + if (!y) y = &dummy; + if (!comp) comp = &dummy; + + stbi__rewind(s); + + // Get identifier + p = (char) stbi__get8(s); + t = (char) stbi__get8(s); + if (p != 'P' || (t != '5' && t != '6')) { + stbi__rewind(s); + return 0; + } + + *comp = (t == '6') ? 
3 : 1; // '5' is 1-component .pgm; '6' is 3-component .ppm + + c = (char) stbi__get8(s); + stbi__pnm_skip_whitespace(s, &c); + + *x = stbi__pnm_getinteger(s, &c); // read width + stbi__pnm_skip_whitespace(s, &c); + + *y = stbi__pnm_getinteger(s, &c); // read height + stbi__pnm_skip_whitespace(s, &c); + + maxv = stbi__pnm_getinteger(s, &c); // read max value + if (maxv > 65535) + return stbi__err("max value > 65535", "PPM image supports only 8-bit and 16-bit images"); + else if (maxv > 255) + return 16; + else + return 8; +} + +static int stbi__pnm_is16(stbi__context *s) +{ + if (stbi__pnm_info(s, NULL, NULL, NULL) == 16) + return 1; + return 0; +} +#endif + +static int stbi__info_main(stbi__context *s, int *x, int *y, int *comp) +{ + #ifndef STBI_NO_JPEG + if (stbi__jpeg_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNG + if (stbi__png_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_GIF + if (stbi__gif_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_BMP + if (stbi__bmp_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PIC + if (stbi__pic_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_info(s, x, y, comp)) return 1; + #endif + + #ifndef STBI_NO_HDR + if (stbi__hdr_info(s, x, y, comp)) return 1; + #endif + + // test tga last because it's a crappy test! + #ifndef STBI_NO_TGA + if (stbi__tga_info(s, x, y, comp)) + return 1; + #endif + return stbi__err("unknown image type", "Image not of any known type, or corrupt"); +} + +static int stbi__is_16_main(stbi__context *s) +{ + #ifndef STBI_NO_PNG + if (stbi__png_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PSD + if (stbi__psd_is16(s)) return 1; + #endif + + #ifndef STBI_NO_PNM + if (stbi__pnm_is16(s)) return 1; + #endif + return 0; +} + +#ifndef STBI_NO_STDIO +STBIDEF int stbi_info(char const *filename, int *x, int *y, int *comp) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_info_from_file(f, x, y, comp); + fclose(f); + return result; +} + +STBIDEF int stbi_info_from_file(FILE *f, int *x, int *y, int *comp) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__info_main(&s,x,y,comp); + fseek(f,pos,SEEK_SET); + return r; +} + +STBIDEF int stbi_is_16_bit(char const *filename) +{ + FILE *f = stbi__fopen(filename, "rb"); + int result; + if (!f) return stbi__err("can't fopen", "Unable to open file"); + result = stbi_is_16_bit_from_file(f); + fclose(f); + return result; +} + +STBIDEF int stbi_is_16_bit_from_file(FILE *f) +{ + int r; + stbi__context s; + long pos = ftell(f); + stbi__start_file(&s, f); + r = stbi__is_16_main(&s); + fseek(f,pos,SEEK_SET); + return r; +} +#endif // !STBI_NO_STDIO + +STBIDEF int stbi_info_from_memory(stbi_uc const *buffer, int len, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_info_from_callbacks(stbi_io_callbacks const *c, void *user, int *x, int *y, int *comp) +{ + stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__info_main(&s,x,y,comp); +} + +STBIDEF int stbi_is_16_bit_from_memory(stbi_uc const *buffer, int len) +{ + stbi__context s; + stbi__start_mem(&s,buffer,len); + return stbi__is_16_main(&s); +} + +STBIDEF int stbi_is_16_bit_from_callbacks(stbi_io_callbacks const *c, void *user) +{ + 
stbi__context s; + stbi__start_callbacks(&s, (stbi_io_callbacks *) c, user); + return stbi__is_16_main(&s); +} + +#endif // STB_IMAGE_IMPLEMENTATION + +/* + revision history: + 2.20 (2019-02-07) support utf8 filenames in Windows; fix warnings and platform ifdefs + 2.19 (2018-02-11) fix warning + 2.18 (2018-01-30) fix warnings + 2.17 (2018-01-29) change sbti__shiftsigned to avoid clang -O2 bug + 1-bit BMP + *_is_16_bit api + avoid warnings + 2.16 (2017-07-23) all functions have 16-bit variants; + STBI_NO_STDIO works again; + compilation fixes; + fix rounding in unpremultiply; + optimize vertical flip; + disable raw_len validation; + documentation fixes + 2.15 (2017-03-18) fix png-1,2,4 bug; now all Imagenet JPGs decode; + warning fixes; disable run-time SSE detection on gcc; + uniform handling of optional "return" values; + thread-safe initialization of zlib tables + 2.14 (2017-03-03) remove deprecated STBI_JPEG_OLD; fixes for Imagenet JPGs + 2.13 (2016-11-29) add 16-bit API, only supported for PNG right now + 2.12 (2016-04-02) fix typo in 2.11 PSD fix that caused crashes + 2.11 (2016-04-02) allocate large structures on the stack + remove white matting for transparent PSD + fix reported channel count for PNG & BMP + re-enable SSE2 in non-gcc 64-bit + support RGB-formatted JPEG + read 16-bit PNGs (only as 8-bit) + 2.10 (2016-01-22) avoid warning introduced in 2.09 by STBI_REALLOC_SIZED + 2.09 (2016-01-16) allow comments in PNM files + 16-bit-per-pixel TGA (not bit-per-component) + info() for TGA could break due to .hdr handling + info() for BMP to shares code instead of sloppy parse + can use STBI_REALLOC_SIZED if allocator doesn't support realloc + code cleanup + 2.08 (2015-09-13) fix to 2.07 cleanup, reading RGB PSD as RGBA + 2.07 (2015-09-13) fix compiler warnings + partial animated GIF support + limited 16-bpc PSD support + #ifdef unused functions + bug with < 92 byte PIC,PNM,HDR,TGA + 2.06 (2015-04-19) fix bug where PSD returns wrong '*comp' value + 2.05 (2015-04-19) fix bug in progressive JPEG handling, fix warning + 2.04 (2015-04-15) try to re-enable SIMD on MinGW 64-bit + 2.03 (2015-04-12) extra corruption checking (mmozeiko) + stbi_set_flip_vertically_on_load (nguillemot) + fix NEON support; fix mingw support + 2.02 (2015-01-19) fix incorrect assert, fix warning + 2.01 (2015-01-17) fix various warnings; suppress SIMD on gcc 32-bit without -msse2 + 2.00b (2014-12-25) fix STBI_MALLOC in progressive JPEG + 2.00 (2014-12-25) optimize JPG, including x86 SSE2 & NEON SIMD (ryg) + progressive JPEG (stb) + PGM/PPM support (Ken Miller) + STBI_MALLOC,STBI_REALLOC,STBI_FREE + GIF bugfix -- seemingly never worked + STBI_NO_*, STBI_ONLY_* + 1.48 (2014-12-14) fix incorrectly-named assert() + 1.47 (2014-12-14) 1/2/4-bit PNG support, both direct and paletted (Omar Cornut & stb) + optimize PNG (ryg) + fix bug in interlaced PNG with user-specified channel count (stb) + 1.46 (2014-08-26) + fix broken tRNS chunk (colorkey-style transparency) in non-paletted PNG + 1.45 (2014-08-16) + fix MSVC-ARM internal compiler error by wrapping malloc + 1.44 (2014-08-07) + various warning fixes from Ronny Chevalier + 1.43 (2014-07-15) + fix MSVC-only compiler problem in code changed in 1.42 + 1.42 (2014-07-09) + don't define _CRT_SECURE_NO_WARNINGS (affects user code) + fixes to stbi__cleanup_jpeg path + added STBI_ASSERT to avoid requiring assert.h + 1.41 (2014-06-25) + fix search&replace from 1.36 that messed up comments/error messages + 1.40 (2014-06-22) + fix gcc struct-initialization warning + 1.39 (2014-06-15) 
+      fix to TGA optimization when req_comp != number of components in TGA;
+      fix to GIF loading because BMP wasn't rewinding (whoops, no GIFs in my test suite)
+      add support for BMP version 5 (more ignored fields)
+      1.38  (2014-06-06)
+              suppress MSVC warnings on integer casts truncating values
+              fix accidental rename of 'skip' field of I/O
+      1.37  (2014-06-04)
+              remove duplicate typedef
+      1.36  (2014-06-03)
+              convert to header file single-file library
+              if de-iphone isn't set, load iphone images color-swapped instead of returning NULL
+      1.35  (2014-05-27)
+              various warnings
+              fix broken STBI_SIMD path
+              fix bug where stbi_load_from_file no longer left file pointer in correct place
+              fix broken non-easy path for 32-bit BMP (possibly never used)
+              TGA optimization by Arseny Kapoulkine
+      1.34  (unknown)
+              use STBI_NOTUSED in stbi__resample_row_generic(), fix one more leak in tga failure case
+      1.33  (2011-07-14)
+              make stbi_is_hdr work in STBI_NO_HDR (as specified), minor compiler-friendly improvements
+      1.32  (2011-07-13)
+              support for "info" function for all supported filetypes (SpartanJ)
+      1.31  (2011-06-20)
+              a few more leak fixes, bug in PNG handling (SpartanJ)
+      1.30  (2011-06-11)
+              added ability to load files via callbacks to accommodate custom input streams (Ben Wenger)
+              removed deprecated format-specific test/load functions
+              removed support for installable file formats (stbi_loader) -- would have been broken for IO callbacks anyway
+              error cases in bmp and tga give messages and don't leak (Raymond Barbiero, grisha)
+              fix inefficiency in decoding 32-bit BMP (David Woo)
+      1.29  (2010-08-16)
+              various warning fixes from Aurelien Pocheville
+      1.28  (2010-08-01)
+              fix bug in GIF palette transparency (SpartanJ)
+      1.27  (2010-08-01)
+              cast-to-stbi_uc to fix warnings
+      1.26  (2010-07-24)
+              fix bug in file buffering for PNG reported by SpartanJ
+      1.25  (2010-07-17)
+              refix trans_data warning (Won Chun)
+      1.24  (2010-07-12)
+              perf improvements reading from files on platforms with lock-heavy fgetc()
+              minor perf improvements for jpeg
+              deprecated type-specific functions so we'll get feedback if they're needed
+              attempt to fix trans_data warning (Won Chun)
+      1.23    fixed bug in iPhone support
+      1.22  (2010-07-10)
+              removed image *writing* support
+              stbi_info support from Jetro Lauha
+              GIF support from Jean-Marc Lienher
+              iPhone PNG-extensions from James Brown
+              warning-fixes from Nicolas Schulz and Janez Zemva (i.stbi__err. Janez Žemva)
+      1.21    fix use of 'stbi_uc' in header (reported by jon blow)
+      1.20    added support for Softimage PIC, by Tom Seddon
+      1.19    bug in interlaced PNG corruption check (found by ryg)
+      1.18  (2008-08-02)
+              fix a threading bug (local mutable static)
+      1.17    support interlaced PNG
+      1.16    major bugfix - stbi__convert_format converted one too many pixels
+      1.15    initialize some fields for thread safety
+      1.14    fix threadsafe conversion bug
+              header-file-only version (#define STBI_HEADER_FILE_ONLY before including)
+      1.13    threadsafe
+      1.12    const qualifiers in the API
+      1.11    Support installable IDCT, colorspace conversion routines
+      1.10    Fixes for 64-bit (don't use "unsigned long")
+              optimized upsampling by Fabian "ryg" Giesen
+      1.09    Fix format-conversion for PSD code (bad global variables!)
+ 1.08 Thatcher Ulrich's PSD code integrated by Nicolas Schulz + 1.07 attempt to fix C++ warning/errors again + 1.06 attempt to fix C++ warning/errors again + 1.05 fix TGA loading to return correct *comp and use good luminance calc + 1.04 default float alpha is 1, not 255; use 'void *' for stbi_image_free + 1.03 bugfixes to STBI_NO_STDIO, STBI_NO_HDR + 1.02 support for (subset of) HDR files, float interface for preferred access to them + 1.01 fix bug: possible bug in handling right-side up bmps... not sure + fix bug: the stbi__bmp_load() and stbi__tga_load() functions didn't work at all + 1.00 interface to zlib that skips zlib header + 0.99 correct handling of alpha in palette + 0.98 TGA loader by lonesock; dynamically add loaders (untested) + 0.97 jpeg errors on too large a file; also catch another malloc failure + 0.96 fix detection of invalid v value - particleman@mollyrocket forum + 0.95 during header scan, seek to markers in case of padding + 0.94 STBI_NO_STDIO to disable stdio usage; rename all #defines the same + 0.93 handle jpegtran output; verbose errors + 0.92 read 4,8,16,24,32-bit BMP files of several formats + 0.91 output 24-bit Windows 3.0 BMP files + 0.90 fix a few more warnings; bump version number to approach 1.0 + 0.61 bugfixes due to Marc LeBlanc, Christopher Lloyd + 0.60 fix compiling as c++ + 0.59 fix warnings: merge Dave Moore's -Wall fixes + 0.58 fix bug: zlib uncompressed mode len/nlen was wrong endian + 0.57 fix bug: jpg last huffman symbol before marker was >9 bits but less than 16 available + 0.56 fix bug: zlib uncompressed mode len vs. nlen + 0.55 fix bug: restart_interval not initialized to 0 + 0.54 allow NULL for 'int *comp' + 0.53 fix bug in png 3->4; speedup png decoding + 0.52 png handles req_comp=3,4 directly; minor cleanup; jpeg comments + 0.51 obey req_comp requests, 1-component jpegs return as 1-component, + on 'test' only check type, not whether we support this variant + 0.50 (2006-11-19) + first released version +*/ + + +/* +------------------------------------------------------------------------------ +This software is available under 2 licenses -- choose whichever you prefer. +------------------------------------------------------------------------------ +ALTERNATIVE A - MIT License +Copyright (c) 2017 Sean Barrett +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +------------------------------------------------------------------------------ +ALTERNATIVE B - Public Domain (www.unlicense.org) +This is free and unencumbered software released into the public domain. 
+Anyone is free to copy, modify, publish, use, compile, sell, or distribute this
+software, either in source code form or as a compiled binary, for any purpose,
+commercial or non-commercial, and by any means.
+In jurisdictions that recognize copyright laws, the author or authors of this
+software dedicate any and all copyright interest in the software to the public
+domain. We make this dedication for the benefit of the public at large and to
+the detriment of our heirs and successors. We intend this dedication to be an
+overt act of relinquishment in perpetuity of all present and future rights to
+this software under copyright law.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+------------------------------------------------------------------------------
+*/
diff --git a/nvblox/include/nvblox/datasets/image_loader.h b/nvblox/include/nvblox/datasets/image_loader.h
new file mode 100644
index 00000000..078ccba4
--- /dev/null
+++ b/nvblox/include/nvblox/datasets/image_loader.h
@@ -0,0 +1,86 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <deque>
+#include <functional>
+#include <future>
+#include <string>
+
+#include "nvblox/core/image.h"
+#include "nvblox/core/types.h"
+
+namespace nvblox {
+namespace datasets {
+
+// Loads a depth image encoded as a 16-bit PNG, assumed to express the depth in
+// mm, and converts it to a float with meters.
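+// (For example, with millimeter encoding a stored 16-bit value of 1500
+// becomes a float depth of 1.5 in the returned DepthImage.)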
+bool load16BitDepthImage(const std::string& filename,
+                         DepthImage* depth_frame_ptr,
+                         MemoryType memory_type = kDefaultImageMemoryType);
+bool load8BitColorImage(const std::string& filename,
+                        ColorImage* color_image_ptr,
+                        MemoryType memory_type = kDefaultImageMemoryType);
+
+using IndexToFilepathFunction = std::function<std::string(int)>;
+
+// Single-threaded image loader
+template <typename ImageType>
+class ImageLoader {
+ public:
+  ImageLoader(IndexToFilepathFunction index_to_filepath,
+              MemoryType memory_type = kDefaultImageMemoryType)
+      : index_to_filepath_(index_to_filepath), memory_type_(memory_type) {}
+  virtual ~ImageLoader() {}
+
+  virtual bool getNextImage(ImageType* image_ptr);
+
+ protected:
+  bool getImage(int image_idx, ImageType*);
+
+  int image_idx_ = 0;
+  IndexToFilepathFunction index_to_filepath_;
+  MemoryType memory_type_;
+};
+
+// Result of an image load: a success flag plus the image itself.
+template <typename ImageType>
+using ImageOptional = std::pair<bool, ImageType>;
+
+// Multi-threaded image loader
+template <typename ImageType>
+class MultiThreadedImageLoader : public ImageLoader<ImageType> {
+ public:
+  MultiThreadedImageLoader(IndexToFilepathFunction index_to_filepath,
+                           int num_threads,
+                           MemoryType memory_type = kDefaultImageMemoryType);
+  ~MultiThreadedImageLoader();
+
+  bool getNextImage(ImageType* image_ptr) override;
+
+ protected:
+  void initLoadQueue();
+  void emptyLoadQueue();
+  void addNextImageToQueue();
+  ImageOptional<ImageType> getImageAsOptional(int image_idx);
+
+  const int num_threads_;
+  std::deque<std::future<ImageOptional<ImageType>>> load_queue_;
+};
+
+}  // namespace datasets
+}  // namespace nvblox
+
+#include "nvblox/datasets/impl/image_loader_impl.h"
diff --git a/nvblox/include/nvblox/datasets/impl/image_loader_impl.h b/nvblox/include/nvblox/datasets/impl/image_loader_impl.h
new file mode 100644
index 00000000..e8794cd1
--- /dev/null
+++ b/nvblox/include/nvblox/datasets/impl/image_loader_impl.h
@@ -0,0 +1,88 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+namespace nvblox {
+namespace datasets {
+
+template <typename ImageType>
+bool ImageLoader<ImageType>::getNextImage(ImageType* image_ptr) {
+  bool res = getImage(image_idx_, image_ptr);
+  ++image_idx_;
+  return res;
+}
+
+template <typename ImageType>
+MultiThreadedImageLoader<ImageType>::MultiThreadedImageLoader(
+    IndexToFilepathFunction index_to_filepath, int num_threads,
+    MemoryType memory_type)
+    : ImageLoader<ImageType>(index_to_filepath, memory_type),
+      num_threads_(num_threads) {
+  initLoadQueue();
+}
+
+template <typename ImageType>
+MultiThreadedImageLoader<ImageType>::~MultiThreadedImageLoader() {
+  emptyLoadQueue();
+}
+
+template <typename ImageType>
+bool MultiThreadedImageLoader<ImageType>::getNextImage(ImageType* image_ptr) {
+  CHECK_NOTNULL(image_ptr);
+  auto images_optional_future = std::move(load_queue_.front());
+  ImageOptional<ImageType> image_optional =
+      std::move(images_optional_future.get());
+  if (image_optional.first) {
+    *image_ptr = std::move(image_optional.second);
+  }
+  load_queue_.pop_front();
+  addNextImageToQueue();
+  return image_optional.first;
+}
+
+template <typename ImageType>
+void MultiThreadedImageLoader<ImageType>::initLoadQueue() {
+  for (int i = 0; i < num_threads_; i++) {
+    addNextImageToQueue();
+  }
+}
+
+template <typename ImageType>
+void MultiThreadedImageLoader<ImageType>::emptyLoadQueue() {
+  while (!load_queue_.empty()) {
+    load_queue_.pop_front();
+  }
+}
+
+template <typename ImageType>
+void MultiThreadedImageLoader<ImageType>::addNextImageToQueue() {
+  load_queue_.push_back(
+      std::async(std::launch::async,
+                 &MultiThreadedImageLoader<ImageType>::getImageAsOptional, this,
+                 this->image_idx_));
+  ++this->image_idx_;
+}
+
+template <typename ImageType>
+ImageOptional<ImageType>
+MultiThreadedImageLoader<ImageType>::getImageAsOptional(int image_idx) {
+  ImageType image;
+  bool success_flag = ImageLoader<ImageType>::getImage(image_idx, &image);
+  return {success_flag, std::move(image)};
+}
+
+}  // namespace datasets
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/include/nvblox/datasets/parse_3dmatch.h b/nvblox/include/nvblox/datasets/parse_3dmatch.h
new file mode 100644
index 00000000..774dfdaf
--- /dev/null
+++ b/nvblox/include/nvblox/datasets/parse_3dmatch.h
@@ -0,0 +1,63 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "nvblox/core/types.h"
+#include "nvblox/datasets/image_loader.h"
+
+namespace nvblox {
+namespace datasets {
+namespace threedmatch {
+
+/// Parses a 4x4 matrix from a text file in 3D Match format: space-separated.
+bool parsePoseFromFile(const std::string& filename, Transform* transform);
+
+/// Parse 3x3 camera intrinsics matrix from 3D Match format: space-separated.
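+/// (For example, a camera-intrinsics file in this format holds nine
+/// whitespace-separated numbers, read row-major into the 3x3 matrix.)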
+bool parseCameraFromFile(const std::string& filename,
+                         Eigen::Matrix3f* intrinsics);
+
+std::string getPathForCameraIntrinsics(const std::string& base_path);
+std::string getPathForFramePose(const std::string& base_path, const int seq_id,
+                                const int frame_id);
+std::string getPathForDepthImage(const std::string& base_path, const int seq_id,
+                                 const int frame_id);
+std::string getPathForColorImage(const std::string& base_path, const int seq_id,
+                                 const int frame_id);
+
+// Factory functions for single-threaded 3DMatch image loaders
+std::unique_ptr<ImageLoader<DepthImage>>
+createDepthImageLoader(const std::string& base_path, const int seq_id,
+                       MemoryType memory_type = kDefaultImageMemoryType);
+std::unique_ptr<ImageLoader<ColorImage>>
+createColorImageLoader(const std::string& base_path, const int seq_id,
+                       MemoryType memory_type = kDefaultImageMemoryType);
+
+// Factory functions for multi-threaded 3DMatch image loaders
+std::unique_ptr<MultiThreadedImageLoader<DepthImage>>
+createMultithreadedDepthImageLoader(
+    const std::string& base_path, const int seq_id, const int num_threads,
+    MemoryType memory_type = kDefaultImageMemoryType);
+std::unique_ptr<MultiThreadedImageLoader<ColorImage>>
+createMultithreadedColorImageLoader(
+    const std::string& base_path, const int seq_id, const int num_threads,
+    MemoryType memory_type = kDefaultImageMemoryType);
+
+}  // namespace threedmatch
+}  // namespace datasets
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/gpu_hash/cuda/gpu_hash_interface.cuh b/nvblox/include/nvblox/gpu_hash/cuda/gpu_hash_interface.cuh
new file mode 100644
index 00000000..7df672fa
--- /dev/null
+++ b/nvblox/include/nvblox/gpu_hash/cuda/gpu_hash_interface.cuh
@@ -0,0 +1,58 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include <thrust/pair.h>
+
+#include <stdgpu/cstddef.h>
+#include <stdgpu/memory.h>
+#include <stdgpu/unordered_map.cuh>
+
+#include <glog/logging.h>
+
+#include "nvblox/core/hash.h"
+#include "nvblox/core/types.h"
+
+namespace nvblox {
+
+template <typename BlockType>
+using IndexBlockPair = thrust::pair<Index3D, BlockType*>;
+
+template <typename BlockType>
+using ConstIndexBlockPair = thrust::pair<Index3D, const BlockType*>;
+
+template <typename BlockType>
+using Index3DDeviceHashMapType =
+    stdgpu::unordered_map<Index3D, BlockType*, Index3DHash,
+                          std::equal_to<Index3D>>;
+
+template <typename BlockType>
+class GPUHashImpl {
+ public:
+  GPUHashImpl() = default;
+  GPUHashImpl(int max_num_blocks);
+  ~GPUHashImpl();
+
+  size_t size() const { return static_cast<size_t>(max_num_blocks_); }
+
+  stdgpu::index_t max_num_blocks_;
+  Index3DDeviceHashMapType<BlockType> impl_;
+};
+
+}  // namespace nvblox
+
+#include "nvblox/gpu_hash/cuda/impl/gpu_hash_interface_impl.cuh"
diff --git a/nvblox/include/nvblox/gpu_hash/cuda/gpu_indexing.cuh b/nvblox/include/nvblox/gpu_hash/cuda/gpu_indexing.cuh
new file mode 100644
index 00000000..5eacd1b7
--- /dev/null
+++ b/nvblox/include/nvblox/gpu_hash/cuda/gpu_indexing.cuh
@@ -0,0 +1,46 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include "nvblox/core/blox.h" +#include "nvblox/core/indexing.h" +#include "nvblox/core/types.h" +#include "nvblox/core/voxels.h" + +#include "nvblox/gpu_hash/cuda/gpu_hash_interface.cuh" + +namespace nvblox { + +template +__device__ inline bool getVoxelAtPosition( + Index3DDeviceHashMapType> block_hash, + const Vector3f& p_L, float block_size, VoxelType** voxel_ptr) { + Index3D block_idx; + Index3D voxel_idx; + getBlockAndVoxelIndexFromPositionInLayer(block_size, p_L, &block_idx, + &voxel_idx); + auto it = block_hash.find(block_idx); + if (it != block_hash.end()) { + *voxel_ptr = + &it->second->voxels[voxel_idx.x()][voxel_idx.y()][voxel_idx.z()]; + return true; + } + return false; +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/gpu_hash/cuda/gpu_set.cuh b/nvblox/include/nvblox/gpu_hash/cuda/gpu_set.cuh new file mode 100644 index 00000000..858427b5 --- /dev/null +++ b/nvblox/include/nvblox/gpu_hash/cuda/gpu_set.cuh @@ -0,0 +1,41 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include +#include +#include + +#include "nvblox/core/hash.h" +#include "nvblox/core/types.h" + +namespace nvblox { + +typedef class stdgpu::unordered_set> + Index3DDeviceSet_t; + +struct Index3DDeviceSet { + Index3DDeviceSet(size_t size); + ~Index3DDeviceSet(); + + Index3DDeviceSet_t set; +}; + +// Copies the contents of a Index3DDeviceSet to an std::vector. +void copySetToVector(const Index3DDeviceSet_t& set, std::vector* vec); + +} // namespace nvblox diff --git a/nvblox/include/nvblox/gpu_hash/cuda/impl/gpu_hash_interface_impl.cuh b/nvblox/include/nvblox/gpu_hash/cuda/impl/gpu_hash_interface_impl.cuh new file mode 100644 index 00000000..a0012df3 --- /dev/null +++ b/nvblox/include/nvblox/gpu_hash/cuda/impl/gpu_hash_interface_impl.cuh @@ -0,0 +1,37 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#pragma once
+
+namespace nvblox {
+
+template <typename BlockType>
+GPUHashImpl<BlockType>::GPUHashImpl(int max_num_blocks)
+    : max_num_blocks_(max_num_blocks),
+      impl_(Index3DDeviceHashMapType<BlockType>::createDeviceObject(
+          max_num_blocks_)) {
+  VLOG(3) << "Creating a GPUHashImpl with max capacity of " << max_num_blocks_
+          << " blocks";
+}
+
+template <typename BlockType>
+GPUHashImpl<BlockType>::~GPUHashImpl() {
+  if (impl_.size() > 0) {
+    Index3DDeviceHashMapType<BlockType>::destroyDeviceObject(impl_);
+    VLOG(3) << "Destroying a GPUHashImpl";
+  }
+}
+
+}  // namespace nvblox
diff --git a/nvblox/include/nvblox/gpu_hash/cuda/impl/gpu_layer_view_impl.cuh b/nvblox/include/nvblox/gpu_hash/cuda/impl/gpu_layer_view_impl.cuh
new file mode 100644
index 00000000..f4b9bf9b
--- /dev/null
+++ b/nvblox/include/nvblox/gpu_hash/cuda/impl/gpu_layer_view_impl.cuh
@@ -0,0 +1,132 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#pragma once
+
+#include "nvblox/gpu_hash/cuda/gpu_hash_interface.cuh"
+#include "nvblox/gpu_hash/gpu_layer_view.h"
+#include "nvblox/utils/timing.h"
+
+namespace nvblox {
+
+template <typename BlockType>
+GPULayerView<BlockType>::GPULayerView(LayerType* layer_ptr)
+    : gpu_hash_ptr_(std::make_shared<GPUHashImpl<BlockType>>()) {
+  reset(layer_ptr);
+}
+
+template <typename BlockType>
+GPULayerView<BlockType>::GPULayerView(size_t max_num_blocks)
+    : gpu_hash_ptr_(std::make_shared<GPUHashImpl<BlockType>>(max_num_blocks)) {
+  // The GPUHashImpl takes care of allocating GPU memory.
+}
+
+template <typename BlockType>
+GPULayerView<BlockType>::~GPULayerView() {
+  // The GPUHashImpl takes care of cleaning up GPU memory.
+}
+
+template <typename BlockType>
+GPULayerView<BlockType>::GPULayerView(const GPULayerView<BlockType>& other)
+    : block_size_(other.block_size_), gpu_hash_ptr_(other.gpu_hash_ptr_) {}
+
+template <typename BlockType>
+GPULayerView<BlockType>::GPULayerView(GPULayerView<BlockType>&& other)
+    : block_size_(other.block_size_),
+      gpu_hash_ptr_(std::move(other.gpu_hash_ptr_)) {}
+
+template <typename BlockType>
+GPULayerView<BlockType>& GPULayerView<BlockType>::operator=(
+    const GPULayerView<BlockType>& other) {
+  block_size_ = other.block_size_;
+  gpu_hash_ptr_ = other.gpu_hash_ptr_;
+  return *this;
+}
+
+template <typename BlockType>
+GPULayerView<BlockType>& GPULayerView<BlockType>::operator=(
+    GPULayerView<BlockType>&& other) {
+  block_size_ = other.block_size_;
+  gpu_hash_ptr_ = std::move(other.gpu_hash_ptr_);
+  return *this;
+}
+
+template <typename BlockType>
+void GPULayerView<BlockType>::reset(LayerType* layer_ptr) {
+  CHECK_NOTNULL(layer_ptr);
+
+  // Allocate gpu hash if not allocated already
+  if (!gpu_hash_ptr_) {
+    gpu_hash_ptr_ = std::make_shared<GPUHashImpl<BlockType>>(
+        layer_ptr->numAllocatedBlocks() * size_expansion_factor_);
+  }
+
+  // Check the load factor and increase the size if required.
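+  // (A worked example of this rule, under the defaults declared in
+  // gpu_layer_view.h (max_load_factor_ = 0.75, size_expansion_factor_ = 2):
+  // a hash sized for 1000 blocks is regrown once the layer holds more than
+  // 750 allocated blocks, and the new capacity becomes
+  // ceil(2 * max(old_capacity, num_allocated_blocks)), i.e. at least 2000.)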
+ const float current_load_factor = + static_cast(layer_ptr->numAllocatedBlocks()) / + static_cast(gpu_hash_ptr_->max_num_blocks_); + if (current_load_factor > max_load_factor_) { + const size_t new_max_num_blocks = std::ceil( + size_expansion_factor_ * std::max(gpu_hash_ptr_->max_num_blocks_, + layer_ptr->numAllocatedBlocks())); + reset(new_max_num_blocks); + CHECK_LT(layer_ptr->numAllocatedBlocks(), gpu_hash_ptr_->max_num_blocks_); + } + + timing::Timer timer("gpu_hash/transfer"); + + gpu_hash_ptr_->impl_.clear(); + + block_size_ = layer_ptr->block_size(); + + // Arange blocks in continuous host memory for transfer + thrust::host_vector> host_block_vector; + host_block_vector.reserve(layer_ptr->numAllocatedBlocks()); + for (const auto& index : layer_ptr->getAllBlockIndices()) { + host_block_vector.push_back(IndexBlockPair( + index, layer_ptr->getBlockAtIndex(index).get())); + } + + // CPU -> GPU + thrust::device_vector> device_block_vector( + host_block_vector); + + // GPU Insert + // NOTE(alexmillane): We have to do some unfortunate casting here. The problem + // is that the hash stores pairs with const keys, but the IndexBlockPair + // vector CANNOT have const keys. So the only way is to perform this cast. + ConstIndexBlockPair* block_start_raw_ptr = + reinterpret_cast*>( + device_block_vector.data().get()); + gpu_hash_ptr_->impl_.insert( + stdgpu::make_device(block_start_raw_ptr), + stdgpu::make_device(block_start_raw_ptr + device_block_vector.size())); + + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaPeekAtLastError()); +} + +template +void GPULayerView::reset(size_t new_max_num_blocks) { + timing::Timer timer("gpu_hash/reallocation"); + gpu_hash_ptr_ = std::make_shared>(new_max_num_blocks); +} + +template +size_t GPULayerView::size() const { + return gpu_hash_ptr_->size(); +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/gpu_hash/gpu_layer_view.h b/nvblox/include/nvblox/gpu_hash/gpu_layer_view.h new file mode 100644 index 00000000..f1e5dc71 --- /dev/null +++ b/nvblox/include/nvblox/gpu_hash/gpu_layer_view.h @@ -0,0 +1,84 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#pragma once
+
+#include <memory>
+
+namespace nvblox {
+
+template <typename BlockType>
+class GPUHashImpl;
+
+template <typename BlockType>
+class BlockLayer;
+
+template <typename BlockType>
+class GPULayerView {
+ public:
+  using LayerType = BlockLayer<BlockType>;
+
+  GPULayerView() = default;
+  GPULayerView(size_t max_num_blocks);
+  GPULayerView(LayerType* layer_ptr);
+
+  GPULayerView(const GPULayerView& other);
+  GPULayerView(GPULayerView&& other);
+  GPULayerView& operator=(const GPULayerView& other);
+  GPULayerView& operator=(GPULayerView&& other);
+
+  ~GPULayerView();
+
+  // Creates a new GPULayerView from a layer
+  void reset(LayerType* layer_ptr);
+
+  // Resizes the underlying GPU hash as well as deleting its contents
+  void reset(size_t new_max_num_blocks);
+
+  float block_size() const { return block_size_; }
+
+  const GPUHashImpl<BlockType>& getHash() const { return *gpu_hash_ptr_; }
+
+  size_t size() const;
+
+  // Accessors
+  float max_load_factor() const { return max_load_factor_; }
+  float size_expansion_factor() const { return size_expansion_factor_; }
+
+ private:
+  // Layer params
+  float block_size_;
+
+  // The load factor at which we reallocate space.
+  const float max_load_factor_ = 0.75;
+
+  // This is the factor by which we overallocate space
+  const float size_expansion_factor_ = 2.0f;
+
+  // NOTE(alexmillane): To keep GPU code out of the header we use PIMPL to hide
+  // the details of the GPU hash.
+  std::shared_ptr<GPUHashImpl<BlockType>> gpu_hash_ptr_;
+};
+
+}  // namespace nvblox
+
+// NOTE(alexmillane):
+// - I am leaving this here as a reminder to NOT include the implementation.
+// - The implementation file is instead included by .cu files which declare
+// specializations.
+// - The problem is that we don't want the GPULayerView implementation, which
+// contains CUDA calls and stdgpu code, bleeding into layer.h, one of our
+// main interface headers.
+//#include "nvblox/gpu_hash/cuda/impl/gpu_layer_view_impl.cuh"
diff --git a/nvblox/include/nvblox/integrators/cuda/impl/projective_integrators_common_impl.cuh b/nvblox/include/nvblox/integrators/cuda/impl/projective_integrators_common_impl.cuh
new file mode 100644
index 00000000..8565d4f0
--- /dev/null
+++ b/nvblox/include/nvblox/integrators/cuda/impl/projective_integrators_common_impl.cuh
@@ -0,0 +1,49 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#pragma once + +namespace nvblox { + +__device__ inline bool projectThreadVoxel( + const Index3D* block_indices_device_ptr, const Camera& camera, + const Transform& T_C_L, const float block_size, Eigen::Vector2f* u_px_ptr, + float* u_depth_ptr) { + // The indices of the voxel this thread will work on + // block_indices_device_ptr[blockIdx.x]: + // - The index of the block we're working on (blockIdx.y/z + // should be zero) + // threadIdx.x/y/z - The indices of the voxel within the block (we + // expect the threadBlockDims == voxelBlockDims) + const Index3D block_idx = block_indices_device_ptr[blockIdx.x]; + const Index3D voxel_idx(threadIdx.z, threadIdx.y, threadIdx.x); + + // Voxel center point + const Vector3f p_voxel_center_L = getCenterPostionFromBlockIndexAndVoxelIndex( + block_size, block_idx, voxel_idx); + // To camera frame + const Vector3f p_voxel_center_C = T_C_L * p_voxel_center_L; + + // Project to image plane + if (!camera.project(p_voxel_center_C, u_px_ptr)) { + return false; + } + + // Depth + *u_depth_ptr = p_voxel_center_C.z(); + return true; +} + +} // namespace nvblox diff --git a/nvblox/include/nvblox/integrators/cuda/projective_integrators_common.cuh b/nvblox/include/nvblox/integrators/cuda/projective_integrators_common.cuh new file mode 100644 index 00000000..f1e8c3ba --- /dev/null +++ b/nvblox/include/nvblox/integrators/cuda/projective_integrators_common.cuh @@ -0,0 +1,32 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/camera.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/image.h" +#include "nvblox/core/types.h" + +namespace nvblox { + +__device__ inline bool projectThreadVoxel( + const Index3D* block_indices_device_ptr, const Camera& camera, + const Transform& T_C_L, const float block_size, Eigen::Vector2f* u_px_ptr, + float* u_depth_ptr); + +} // namespace nvblox + +#include "nvblox/integrators/cuda/impl/projective_integrators_common_impl.cuh" diff --git a/nvblox/include/nvblox/integrators/esdf_integrator.h b/nvblox/include/nvblox/integrators/esdf_integrator.h new file mode 100644 index 00000000..a5b0c348 --- /dev/null +++ b/nvblox/include/nvblox/integrators/esdf_integrator.h @@ -0,0 +1,167 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#pragma once + +#include "nvblox/core/blox.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" +#include "nvblox/core/unified_vector.h" +#include "nvblox/core/voxels.h" + +namespace nvblox { + +class EsdfIntegrator { + public: + // Just takes care of the cuda stream. + ~EsdfIntegrator(); + + // Integrate the entire layer. + void integrateLayer(const TsdfLayer& tsdf_layer, EsdfLayer* esdf_layer); + + // Integrate specific blocks in a layer. + void integrateBlocks(const TsdfLayer& tsdf_layer, + const std::vector& block_indices, + EsdfLayer* esdf_layer); + + // Separate CPU and GPU methods. + void integrateBlocksOnCPU(const TsdfLayer& tsdf_layer, + const std::vector& block_indices, + EsdfLayer* esdf_layer); + void integrateBlocksOnGPU(const TsdfLayer& tsdf_layer, + const std::vector& block_indices, + EsdfLayer* esdf_layer); + void integrateSliceOnGPU(const TsdfLayer& tsdf_layer, + const std::vector& block_indices, + float z_min, float z_max, float z_output, + EsdfLayer* esdf_layer); + + // Setting setters and getting getters + float max_distance_m() const { return max_distance_m_; } + float& max_distance_m() { return max_distance_m_; } + + float min_site_distance_vox() const { return min_site_distance_vox_; } + float& min_site_distance_vox() { return min_site_distance_vox_; } + + float min_weight() const { return min_weight_; } + float& min_weight() { return min_weight_; } + + float max_squared_distance_vox(float voxel_size) const { + return max_distance_m_ * max_distance_m_ / (voxel_size * voxel_size); + } + + protected: + void allocateBlocksOnCPU(const std::vector& block_indices, + EsdfLayer* esdf_layer); + void markAllSitesOnCPU(const TsdfLayer& tsdf_layer, + const std::vector& block_indices, + EsdfLayer* esdf_layer, + std::vector* blocks_with_sites, + std::vector* blocks_to_clear); + void computeEsdfOnCPU(const std::vector& blocks_with_sites, + EsdfLayer* esdf_layer); + + // Internal functions for ESDF computation. + void sweepBlockBandOnCPU(int axis_to_sweep, float voxel_size, + EsdfBlock* esdf_block); + void updateLocalNeighborBandsOnCPU(const Index3D& block_index, + EsdfLayer* esdf_layer, + std::vector* updated_blocks); + void propagateEdgesOnCPU(int axis_to_sweep, float voxel_size, + EsdfBlock* esdf_block); + + // CPU-based incremental. + void clearInvalidOnCPU(const std::vector& blocks_with_sites, + EsdfLayer* esdf_layer, + std::vector* updated_blocks); + void clearNeighborsOnCPU(const Index3D& block_index, EsdfLayer* esdf_layer, + Index3DSet* neighbor_clearing_set); + void clearVoxel(EsdfVoxel* voxel, float max_squared_distance_vox); + + // Test function to just clear everything in the whole map. + void clearAllInvalidOnCPU(EsdfLayer* esdf_layer, + std::vector* updated_blocks); + + // GPU computation functions. + void markAllSitesOnGPU(const TsdfLayer& tsdf_layer, + const std::vector& block_indices, + EsdfLayer* esdf_layer, + std::vector* blocks_with_sites, + std::vector* cleared_blocks); + + // Same as the other function but basically makes the whole operation 2D. + // Considers a min and max z in a bounding box which is compressed down into a + // single layer. 
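
A usage sketch of the public entry points declared above, assuming a populated TsdfLayer and an EsdfLayer of matching voxel size; the distance value and function name are illustrative:

#include "nvblox/integrators/esdf_integrator.h"

void buildEsdf(const nvblox::TsdfLayer& tsdf_layer,
               nvblox::EsdfLayer* esdf_layer) {
  nvblox::EsdfIntegrator integrator;
  // Parameter setters return mutable references; 4 m is illustrative
  // (the default declared above is 10 m).
  integrator.max_distance_m() = 4.0f;
  // Full-layer integration; the incremental path takes explicit indices.
  integrator.integrateLayer(tsdf_layer, esdf_layer);
}
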
+ void markSitesInSliceOnGPU(const TsdfLayer& tsdf_layer, + const std::vector& block_indices, + float min_z, float max_z, float output_z, + EsdfLayer* esdf_layer, + std::vector* output_blocks, + std::vector* cleared_blocks); + + void clearInvalidOnGPU(const std::vector& blocks_to_clear, + EsdfLayer* esdf_layer, + std::vector* updated_blocks); + + void computeEsdfOnGPU(const std::vector& blocks_with_sites, + EsdfLayer* esdf_layer); + + // Internal helpers for GPU computation. + void sweepBlockBandOnGPU(device_vector& block_pointers, + float max_squared_distance_vox); + void updateLocalNeighborBandsOnGPU( + const std::vector& block_indices, + device_vector& block_pointers, float max_squared_distance_vox, + EsdfLayer* esdf_layer, std::vector* updated_blocks, + device_vector* updated_block_pointers); + void createNeighborTable(const std::vector& block_indices, + EsdfLayer* esdf_layer, + std::vector* neighbor_indices, + host_vector* neighbor_pointers, + host_vector* neighbor_table); + void clearBlockNeighbors(std::vector& clear_list, + EsdfLayer* esdf_layer, + std::vector* new_clear_list); + + // Helper function to figure out which axes to sweep first and second. + void getSweepAxes(int axis_to_sweep, int* first_axis, int* second_axis) const; + + /// Maximum distance to compute the ESDF. + float max_distance_m_ = 10.0; + /// Minimum distance to a site + float min_site_distance_vox_ = 1.0; + /// Minimum weight to consider a TSDF voxel observed. + float min_weight_ = 1e-4; + + // State. + cudaStream_t cuda_stream_ = nullptr; + + // Temporary storage variables so we don't have to reallocate as much. + host_vector updated_blocks_host_; + device_vector updated_blocks_device_; + host_vector cleared_blocks_host_; + device_vector cleared_blocks_device_; + host_vector tsdf_pointers_host_; + device_vector tsdf_pointers_device_; + host_vector block_pointers_host_; + device_vector block_pointers_device_; + host_vector neighbor_table_host_; + device_vector neighbor_table_device_; + host_vector neighbor_pointers_host_; + device_vector neighbor_pointers_device_; +}; + +} // namespace nvblox diff --git a/nvblox/include/nvblox/integrators/frustum.h b/nvblox/include/nvblox/integrators/frustum.h new file mode 100644 index 00000000..aa0526d3 --- /dev/null +++ b/nvblox/include/nvblox/integrators/frustum.h @@ -0,0 +1,101 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include "nvblox/core/camera.h" +#include "nvblox/core/image.h" +#include "nvblox/core/types.h" +#include "nvblox/core/unified_vector.h" + +namespace nvblox { + +// Helper class for frustum calculations. 
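
A sketch of querying visible blocks with the FrustumCalculator declared just below; the pose and camera come from the caller, and the size constants are illustrative stand-ins:

#include <vector>

#include "nvblox/integrators/frustum.h"

std::vector<nvblox::Index3D> blocksInView(const nvblox::Transform& T_L_C,
                                          const nvblox::Camera& camera) {
  const float block_size_m = 0.4f;    // e.g. 8 voxels of 5 cm
  const float max_distance_m = 10.0f;
  // AABB-based static variant; the image-based variants additionally need a
  // depth frame plus truncation/integration distances.
  return nvblox::FrustumCalculator::getBlocksInView(
      T_L_C, camera, block_size_m, max_distance_m);
}
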
+class FrustumCalculator { + public: + FrustumCalculator(); + ~FrustumCalculator(); + + // Gets blocks which fall into the camera view up to: max camera distance + static std::vector getBlocksInView(const Transform& T_L_C, + const Camera& camera, + const float block_size, + const float max_distance); + + // Gets blocks which fall into the camera view up to: max distance in the + // depth frame + static std::vector getBlocksInImageView( + const DepthImage& depth_frame, const Transform& T_L_C, + const Camera& camera, const float block_size, + const float truncation_distance_m, + const float max_integration_distance_m); + + // Performs ray casting to get the blocks in view + // NOTE(helenol) The only non-static member function of this class. + std::vector getBlocksInImageViewCuda( + const DepthImage& depth_frame, const Transform& T_L_C, + const Camera& camera, const float block_size, + const float truncation_distance_m, + const float max_integration_distance_m); + + // Params + unsigned int raycast_subsampling_factor() const; + void raycast_subsampling_factor(unsigned int raycast_subsampling_rate); + + private: + // Raycasts to all corners of all blocks touched by the endpoints of depth + // rays. + void getBlocksByRaycastingCorners( + const Transform& T_L_C, // NOLINT + const Camera& camera, // NOLINT + const DepthImage& depth_frame, // NOLINT + float block_size, // NOLINT + const float truncation_distance_m, // NOLINT + const float max_integration_distance_m, // NOLINT + const Index3D& min_index, // NOLINT + const Index3D& aabb_size, // NOLINT + bool* aabb_updated_cuda); + + // Raycasts through (possibly subsampled) pixels in the image. + void getBlocksByRaycastingPixels( + const Transform& T_L_C, // NOLINT + const Camera& camera, // NOLINT + const DepthImage& depth_frame, // NOLINT + float block_size, // NOLINT + const float truncation_distance_m, // NOLINT + const float max_integration_distance_m, // NOLINT + const Index3D& min_index, // NOLINT + const Index3D& aabb_size, // NOLINT + bool* aabb_updated_cuda); + + // A 3D grid of bools, one for each block in the AABB, which indicates if it + // is in the view. The 3D grid is represented as a flat vector. + device_vector aabb_device_buffer_; + host_vector aabb_host_buffer_; + + // Whether to use the single kernel or double kernel computation for the + // CUDA version. Single kernel raycasts to every pixel, double kernel + // raycasts to every block corner. + bool raycast_to_pixels_ = true; + // The rate at which we subsample pixels to raycast. Note that we always + // raycast the extremes of the frame, no matter the subsample rate. + unsigned int raycast_subsampling_factor_ = 1; + + cudaStream_t cuda_stream_; +}; + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/integrators/impl/integrators_common_impl.h b/nvblox/include/nvblox/integrators/impl/integrators_common_impl.h new file mode 100644 index 00000000..e0a3fe1f --- /dev/null +++ b/nvblox/include/nvblox/integrators/impl/integrators_common_impl.h @@ -0,0 +1,57 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +namespace nvblox { + +template +__host__ std::vector getBlockPtrsFromIndices( + const std::vector& block_indices, + BlockLayer* layer_ptr) { + std::vector block_ptrs; + block_ptrs.reserve(block_indices.size()); + for (const Index3D& block_index : block_indices) { + typename BlockType::Ptr block_ptr = layer_ptr->getBlockAtIndex(block_index); + CHECK(block_ptr); + block_ptrs.push_back(block_ptr.get()); + } + return block_ptrs; +} + +template +__host__ std::vector getBlockPtrsFromIndices( + const std::vector& block_indices, + const BlockLayer& layer) { + std::vector block_ptrs; + block_ptrs.reserve(block_indices.size()); + for (const Index3D& block_index : block_indices) { + const typename BlockType::ConstPtr block_ptr = + layer.getBlockAtIndex(block_index); + CHECK(block_ptr); + block_ptrs.push_back(block_ptr.get()); + } + return block_ptrs; +} + +template +void allocateBlocksWhereRequired(const std::vector& block_indices, + BlockLayer>* layer) { + for (const Index3D& block_index : block_indices) { + layer->allocateBlockAtIndex(block_index); + } +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/integrators/impl/ray_caster_impl.h b/nvblox/include/nvblox/integrators/impl/ray_caster_impl.h new file mode 100644 index 00000000..004242a0 --- /dev/null +++ b/nvblox/include/nvblox/integrators/impl/ray_caster_impl.h @@ -0,0 +1,90 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/indexing.h" + +namespace nvblox { + +__host__ __device__ int inline signum(float x) { + return (x > 0.0f) ? 1 : ((x < 0.0f) ? -1 : 0); +} + +RayCaster::RayCaster(const Vector3f& origin, const Vector3f& destination, + float scale) + : scale_(scale) { + Vector3f start_scaled = origin / scale_; + Vector3f end_scaled = destination / scale_; + + current_index_ = getBlockIndexFromPositionInLayer(scale, origin); + const Index3D end_index = + getBlockIndexFromPositionInLayer(scale, destination); + const Index3D diff_index = end_index - current_index_; + + current_step_ = 0; + + ray_length_in_steps_ = diff_index.cwiseAbs().sum(); + const Vector3f ray_scaled = end_scaled - start_scaled; + + ray_step_signs_ = Index3D(signum(ray_scaled.x()), signum(ray_scaled.y()), + signum(ray_scaled.z())); + + const Index3D corrected_step = ray_step_signs_.cwiseMax(0); + + const Vector3f start_scaled_shifted = + start_scaled - current_index_.cast(); + + Vector3f distance_to_boundaries(corrected_step.cast() - + start_scaled_shifted); + + // NaNs are fine in the next 2 lines. + t_to_next_boundary_ = distance_to_boundaries.cwiseQuotient(ray_scaled); + + // Distance to cross one grid cell along the ray in t. + // Same as absolute inverse value of delta_coord. 
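+  // (Worked example, illustrative: a ray from (0, 0, 0) to (2.5, 0, 0) with
+  // scale 1 has ray_scaled = (2.5, 0, 0). Parameterizing the ray by
+  // t in [0, 1], crossing one grid cell along x costs
+  // t_step_size_.x() = 1 / 2.5 = 0.4, and the caster visits the indices
+  // (0, 0, 0), (1, 0, 0), and (2, 0, 0).)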
+ t_step_size_ = ray_step_signs_.cast().cwiseQuotient(ray_scaled); +} + +bool RayCaster::nextRayIndex(Index3D* ray_index) { + if (current_step_++ > ray_length_in_steps_) { + return false; + } + + DCHECK(ray_index != nullptr); + *ray_index = current_index_; + + int t_min_idx; + t_to_next_boundary_.minCoeff(&t_min_idx); + current_index_[t_min_idx] += ray_step_signs_[t_min_idx]; + t_to_next_boundary_[t_min_idx] += t_step_size_[t_min_idx]; + + return true; +} + +bool RayCaster::nextRayPositionScaled(Vector3f* ray_position) { + Index3D ray_index; + bool success = nextRayIndex(&ray_index); + *ray_position = scale_ * ray_index.cast(); + return success; +} + +void RayCaster::getAllIndices(std::vector* indices) { + indices->clear(); + Index3D next_index; + while (nextRayIndex(&next_index)) { + indices->push_back(next_index); + } +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/integrators/integrators_common.h b/nvblox/include/nvblox/integrators/integrators_common.h new file mode 100644 index 00000000..e2633f8a --- /dev/null +++ b/nvblox/include/nvblox/integrators/integrators_common.h @@ -0,0 +1,42 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/blox.h" +#include "nvblox/core/layer.h" + +namespace nvblox { + +// Convert a list of BlockIndices on host, to a list of device pointers. +template +__host__ std::vector getBlockPtrsFromIndices( + const std::vector& block_indices, + BlockLayer* layer_ptr); + +// Const version +template +__host__ std::vector getBlockPtrsFromIndices( + const std::vector& block_indices, + const BlockLayer& layer_ptr); + +// Allocates blocks in the block_indices list which are not already allocated. +template +void allocateBlocksWhereRequired(const std::vector& block_indices, + BlockLayer>* layer); + +} // namespace nvblox + +#include "nvblox/integrators/impl/integrators_common_impl.h" diff --git a/nvblox/include/nvblox/integrators/projective_color_integrator.h b/nvblox/include/nvblox/integrators/projective_color_integrator.h new file mode 100644 index 00000000..fe1fc1ec --- /dev/null +++ b/nvblox/include/nvblox/integrators/projective_color_integrator.h @@ -0,0 +1,74 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#pragma once
+
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/integrators/projective_integrator_base.h"
+#include "nvblox/ray_tracing/sphere_tracer.h"
+
+namespace nvblox {
+
+class ProjectiveColorIntegrator : public ProjectiveIntegratorBase {
+ public:
+  ProjectiveColorIntegrator();
+  virtual ~ProjectiveColorIntegrator();
+
+  void finish() const override;
+
+  // Main interface - integrate a ColorImage
+  void integrateFrame(const ColorImage& color_frame, const Transform& T_L_C,
+                      const Camera& camera, const TsdfLayer& tsdf_layer,
+                      ColorLayer* color_layer,
+                      std::vector<Index3D>* updated_blocks = nullptr);
+
+ protected:
+  void updateBlocks(const std::vector<Index3D>& block_indices,
+                    const ColorImage& color_frame,
+                    const DepthImage& depth_frame, const Transform& T_L_C,
+                    const Camera& camera, const float truncation_distance_m,
+                    ColorLayer* layer);
+
+  std::vector<Index3D> reduceBlocksToThoseInTruncationBand(
+      const std::vector<Index3D>& block_indices, const TsdfLayer& tsdf_layer,
+      const float truncation_distance_m);
+
+  // Params
+  int depth_render_ray_subsampling_factor_ = 4;
+
+  // Object to do ray tracing to generate occlusions
+  // TODO(alexmillane): Expose params.
+  SphereTracer sphere_tracer_;
+
+  // Blocks to integrate on the current call and their indices
+  // NOTE(alexmillane): We have one pinned host and one device vector and
+  // transfer between them.
+  device_vector<Index3D> block_indices_device_;
+  device_vector<ColorBlock*> block_ptrs_device_;
+  host_vector<Index3D> block_indices_host_;
+  host_vector<ColorBlock*> block_ptrs_host_;
+
+  // Buffers for getting blocks in truncation band
+  device_vector<const TsdfBlock*> truncation_band_block_ptrs_device_;
+  host_vector<const TsdfBlock*> truncation_band_block_ptrs_host_;
+  device_vector<bool> block_in_truncation_band_device_;
+  host_vector<bool> block_in_truncation_band_host_;
+
+  // CUDA stream to process integration on
+  cudaStream_t integration_stream_;
+};
+
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/include/nvblox/integrators/projective_integrator_base.h b/nvblox/include/nvblox/integrators/projective_integrator_base.h
new file mode 100644
index 00000000..dad3028b
--- /dev/null
+++ b/nvblox/include/nvblox/integrators/projective_integrator_base.h
@@ -0,0 +1,79 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#pragma once + +#include + +#include "nvblox/core/camera.h" +#include "nvblox/core/image.h" +#include "nvblox/core/types.h" +#include "nvblox/integrators/frustum.h" + +namespace nvblox { + +class ProjectiveIntegratorBase { + public: + ProjectiveIntegratorBase(){}; + virtual ~ProjectiveIntegratorBase() {} + + // Getters + float truncation_distance_vox() const; + float max_weight() const; + float max_integration_distance_m() const; + + // Setters + void truncation_distance_vox(float truncation_distance_vox); + void max_weight(float max_weight); + void max_integration_distance_m(float max_integration_distance_m); + + // Scale dependent getter + float truncation_distance_m(float block_size) const; + + // Gets blocks in view (called by sub-classes) + // Using the image: + std::vector getBlocksInView(const DepthImage& depth_frame, + const Transform& T_L_C, + const Camera& camera, + float block_size) const; + // Does not use the image, just uses the AABB. + std::vector getBlocksInView(const Transform& T_L_C, + const Camera& camera, + float block_size) const; + // Using the image and raycasting: + std::vector getBlocksInViewUsingRaycasting( + const DepthImage& depth_frame, const Transform& T_L_C, + const Camera& camera, const float block_size) const; + + // Ensure outstanding operations are finished (relevant for integrators + // launching asynchronous work) + virtual void finish() const = 0; + + const FrustumCalculator& frustum_calculator() const { + return frustum_calculator_; + } + FrustumCalculator& frustum_calculator() { return frustum_calculator_; } + + protected: + // Params + float truncation_distance_vox_ = 4.0f; + float max_weight_ = 100.0f; + float max_integration_distance_m_ = 10.0f; + + // Frustum calculation. + mutable FrustumCalculator frustum_calculator_; +}; + +} // namespace nvblox diff --git a/nvblox/include/nvblox/integrators/projective_tsdf_integrator.h b/nvblox/include/nvblox/integrators/projective_tsdf_integrator.h new file mode 100644 index 00000000..38ee4e93 --- /dev/null +++ b/nvblox/include/nvblox/integrators/projective_tsdf_integrator.h @@ -0,0 +1,61 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#pragma once + +#include "nvblox/core/blox.h" +#include "nvblox/core/camera.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/image.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" +#include "nvblox/core/voxels.h" +#include "nvblox/gpu_hash/gpu_layer_view.h" +#include "nvblox/integrators/frustum.h" +#include "nvblox/integrators/projective_integrator_base.h" + +namespace nvblox { + +class ProjectiveTsdfIntegrator : public ProjectiveIntegratorBase { + public: + ProjectiveTsdfIntegrator(); + virtual ~ProjectiveTsdfIntegrator(); + + void integrateFrame(const DepthImage& depth_frame, const Transform& T_L_C, + const Camera& camera, TsdfLayer* layer, + std::vector* updated_blocks = nullptr); + + void finish() const override; + + protected: + virtual void updateBlocks(const std::vector& block_indices, + const DepthImage& depth_frame, + const Transform& T_L_C, const Camera& camera, + const float truncation_distance_m, + TsdfLayer* layer); + + // Blocks to integrate on the current call and their indices + // NOTE(alexmillane): We have one pinned host and one device vector and + // transfer between them. + device_vector block_indices_device_; + device_vector block_ptrs_device_; + host_vector block_indices_host_; + host_vector block_ptrs_host_; + + // CUDA stream to process ingration on + cudaStream_t integration_stream_; +}; + +} // namespace nvblox diff --git a/nvblox/include/nvblox/integrators/ray_caster.h b/nvblox/include/nvblox/integrators/ray_caster.h new file mode 100644 index 00000000..53e8a0b7 --- /dev/null +++ b/nvblox/include/nvblox/integrators/ray_caster.h @@ -0,0 +1,50 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/types.h" + +namespace nvblox { + +class RayCaster { + public: + __host__ __device__ inline RayCaster(const Vector3f& origin, + const Vector3f& destination, + float scale = 1.0f); + + // Returns the index, so in "unscaled" coordinates. + __host__ __device__ inline bool nextRayIndex(Index3D* ray_index); + // Returns scaled coordinates. Just the above multiplied by the scale factor. + __host__ __device__ inline bool nextRayPositionScaled(Vector3f* ray_position); + + // Just raycasts over the whole thing and puts them in a vector for you. + __host__ void getAllIndices(std::vector* indices); + + private: + const float scale_ = 1.0f; + + uint32_t ray_length_in_steps_ = 0; + uint32_t current_step_ = 0; + + Vector3f t_to_next_boundary_; + Index3D current_index_; + Index3D ray_step_signs_; + Vector3f t_step_size_; +}; + +} // namespace nvblox + +#include "nvblox/integrators/impl/ray_caster_impl.h" \ No newline at end of file diff --git a/nvblox/include/nvblox/io/csv.h b/nvblox/include/nvblox/io/csv.h new file mode 100644 index 00000000..151108e7 --- /dev/null +++ b/nvblox/include/nvblox/io/csv.h @@ -0,0 +1,35 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include "nvblox/core/image.h" + +namespace nvblox { +namespace io { + +void writeToCsv(const std::string& filepath, const DepthImage& frame); +void writeToCsv(const std::string& filepath, const ColorImage& frame); + +template +void writeToCsv(const std::string& filepath, + const Eigen::DenseBase& eig); + +} // namespace io +} // namespace nvblox + +#include "nvblox/io/impl/csv_impl.h" \ No newline at end of file diff --git a/nvblox/include/nvblox/io/impl/csv_impl.h b/nvblox/include/nvblox/io/impl/csv_impl.h new file mode 100644 index 00000000..2103d45e --- /dev/null +++ b/nvblox/include/nvblox/io/impl/csv_impl.h @@ -0,0 +1,35 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include + +namespace nvblox { +namespace io { + +template +void writeToCsv(const std::string& filepath, + const Eigen::DenseBase& eig) { + std::ofstream file_stream(filepath, std::ofstream::out); + const Eigen::IOFormat csv_format(Eigen::StreamPrecision, Eigen::DontAlignCols, + ", ", "\n"); + file_stream << eig.format(csv_format); +} + +} // namespace io +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/io/impl/pointcloud_io_impl.h b/nvblox/include/nvblox/io/impl/pointcloud_io_impl.h new file mode 100644 index 00000000..ff8a4018 --- /dev/null +++ b/nvblox/include/nvblox/io/impl/pointcloud_io_impl.h @@ -0,0 +1,91 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/accessors.h" +#include "nvblox/io/ply_writer.h" + +namespace nvblox { +namespace io { + +/// Outputs a voxel layer as a pointcloud with the lambda function deciding the +/// intensity. +template +bool outputVoxelLayerToPly( + const VoxelBlockLayer& layer, const std::string& filename, + std::function lambda) { + // Create a ply writer object. + io::PlyWriter writer(filename); + + // Combine all the voxels in the mesh into a pointcloud. 
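+  // (Worked example, illustrative: with 5 cm voxels and the default 8
+  // voxels per block side, block_size is 0.4 m, and the voxel at block
+  // index (1, 0, 0), voxel index (0, 0, 0) gets its center at
+  // (0.425, 0.025, 0.025) m.)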
+ std::vector points; + std::vector intensities; + + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const float block_size = layer.block_size(); + const float voxel_size = layer.voxel_size(); + + auto new_lambda = [&points, &intensities, &block_size, &voxel_size, &lambda]( + const Index3D& block_index, const Index3D& voxel_index, + const VoxelType* voxel) { + float intensity = 0.0f; + if (lambda(voxel, &intensity)) { + points.push_back(getCenterPostionFromBlockIndexAndVoxelIndex( + block_size, block_index, voxel_index)); + intensities.push_back(intensity); + } + }; + + // Call above lambda on every voxel in the layer. + callFunctionOnAllVoxels(layer, new_lambda); + + // Add the pointcloud to the ply writer. + writer.setPoints(&points); + writer.setIntensities(&intensities); + + // Write out the ply. + return writer.write(); +} + +/// Specializations for the TSDF type. +template <> +bool outputVoxelLayerToPly(const TsdfLayer& layer, + const std::string& filename) { + constexpr float kMinWeight = 0.1f; + auto lambda = [&kMinWeight](const TsdfVoxel* voxel, float* distance) -> bool { + *distance = voxel->distance; + return voxel->weight > kMinWeight; + }; + return outputVoxelLayerToPly(layer, filename, lambda); +} + +/// Specialization for the ESDF type. +template <> +bool outputVoxelLayerToPly(const EsdfLayer& layer, + const std::string& filename) { + const float voxel_size = layer.voxel_size(); + auto lambda = [&voxel_size](const EsdfVoxel* voxel, float* distance) -> bool { + *distance = voxel_size * std::sqrt(voxel->squared_distance_vox); + if (voxel->is_inside) { + *distance = -*distance; + } + return voxel->observed; + }; + return outputVoxelLayerToPly(layer, filename, lambda); +} + +} // namespace io +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/io/mesh_io.h b/nvblox/include/nvblox/io/mesh_io.h new file mode 100644 index 00000000..f7ae9716 --- /dev/null +++ b/nvblox/include/nvblox/io/mesh_io.h @@ -0,0 +1,37 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include "nvblox/core/layer.h" +#include "nvblox/mesh/mesh_block.h" + +namespace nvblox { +namespace io { + +// Combines mesh blocks in a layer into a single list of vertices, triangles, +// and normals. +void combineMeshBlocks(const BlockLayer& layer, + std::vector* vertices_ptr, + std::vector* normals_ptr, + std::vector* triangles_ptr); + +bool outputMeshLayerToPly(const BlockLayer& layer, + const std::string& filename); + +} // namespace io +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/io/ply_writer.h b/nvblox/include/nvblox/io/ply_writer.h new file mode 100644 index 00000000..ee69f8a6 --- /dev/null +++ b/nvblox/include/nvblox/io/ply_writer.h @@ -0,0 +1,67 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include +#include + +#include "nvblox/core/color.h" +#include "nvblox/core/types.h" + +namespace nvblox { +namespace io { +/** + * Writes a mesh to a .ply file. For reference on the format, see: + * http://paulbourke.net/dataformats/ply/ + */ +class PlyWriter { + public: + explicit PlyWriter(const std::string& filename) : file_(filename) {} + + ~PlyWriter() { file_.close(); } + + // Add however many of these you want. + void setPoints(const std::vector* points) { points_ = points; } + void setColors(const std::vector* colors) { colors_ = colors; } + void setIntensities(const std::vector* intensities) { + intensities_ = intensities; + } + void setNormals(const std::vector* normals) { normals_ = normals; } + void setTriangles(const std::vector* triangles) { + triangles_ = triangles; + } + + // Call this after points, normals, triangles, etc. have been added to write + // to file. + bool write(); + + private: + void writeHeader(); + void writePoints(); + void writeTriangles(); + + const std::vector* points_ = nullptr; + const std::vector* normals_ = nullptr; + const std::vector* intensities_ = nullptr; + const std::vector* colors_ = nullptr; + const std::vector* triangles_ = nullptr; + + std::ofstream file_; +}; + +} // namespace io + +} // namespace nvblox diff --git a/nvblox/include/nvblox/io/pointcloud_io.h b/nvblox/include/nvblox/io/pointcloud_io.h new file mode 100644 index 00000000..01318728 --- /dev/null +++ b/nvblox/include/nvblox/io/pointcloud_io.h @@ -0,0 +1,52 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include "nvblox/core/common_names.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/voxels.h" + +namespace nvblox { +namespace io { + +/// Outputs a voxel layer as a pointcloud with the lambda function deciding the +/// intensity. +/// The lambda outputs a boolean, saying whether that voxel should be +/// visualized, and an intensity which will be written to the pointcloud. +template +bool outputVoxelLayerToPly( + const VoxelBlockLayer& layer, const std::string& filename, + std::function lambda); + +/// Without specifying a lambda, this outputs the distance as intensity. +template +bool outputVoxelLayerToPly(const VoxelBlockLayer& layer, + const std::string& filename); + +/// Specializations for the TSDF type. +template <> +bool outputVoxelLayerToPly(const TsdfLayer& layer, const std::string& filename); + +/// Specialization for the ESDF type. 
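
A usage sketch of the lambda-based overload declared above; the weight threshold and function name are illustrative:

#include <string>

#include "nvblox/io/pointcloud_io.h"

bool writeObservedTsdfWeights(const nvblox::TsdfLayer& layer,
                              const std::string& filename) {
  // Emit only reasonably-observed voxels, using the weight as intensity.
  auto lambda = [](const nvblox::TsdfVoxel* voxel, float* intensity) -> bool {
    *intensity = voxel->weight;
    return voxel->weight > 0.5f;
  };
  return nvblox::io::outputVoxelLayerToPly(layer, filename, lambda);
}
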
+template <>
+bool outputVoxelLayerToPly(const EsdfLayer& layer, const std::string& filename);
+
+}  // namespace io
+}  // namespace nvblox
+
+#include "nvblox/io/impl/pointcloud_io_impl.h"
\ No newline at end of file
diff --git a/nvblox/include/nvblox/mesh/impl/marching_cubes_table.h b/nvblox/include/nvblox/mesh/impl/marching_cubes_table.h
new file mode 100644
index 00000000..d06a0d32
--- /dev/null
+++ b/nvblox/include/nvblox/mesh/impl/marching_cubes_table.h
@@ -0,0 +1,334 @@
+// The MIT License (MIT)
+// Copyright (c) 2014 Matthew Klingensmith and Ivan Dryanovski
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.

+#include <cuda_runtime.h>
+#include <cstdint>
+
+namespace nvblox {
+namespace marching_cubes {
+
+// Lookup table from the 256 possible cube configurations from
+// CalculateVertexConfigurationIndex() to the 0-5 triplets that give the edges
+// where the triangle vertices lie (a short sketch of how one row is consumed
+// follows, before the table itself).
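
As referenced above, a sketch of how a consumer might walk one row of the table; in compiling code this would follow the table declaration. The function and argument names are hypothetical, and Eigen types are assumed available via nvblox/core/types.h:

__device__ inline int emitTriangles(int config,
                                    const Eigen::Vector3f edge_vertex[12],
                                    Eigen::Vector3f* vertices_out) {
  const int8_t* row = kTriangleTable[config];
  int n = 0;
  // Each row stores up to 5 triangles as triplets of edge ids (0-11),
  // terminated by -1; 16 entries leave room for 5 * 3 ids plus the sentinel.
  for (int i = 0; row[i] != -1; i += 3) {
    vertices_out[n++] = edge_vertex[row[i]];
    vertices_out[n++] = edge_vertex[row[i + 1]];
    vertices_out[n++] = edge_vertex[row[i + 2]];
  }
  return n;  // At most 15 vertices (5 triangles).
}
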
+__device__ __constant__ const int8_t kTriangleTable[256][16] = { + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 1, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 8, 3, 9, 8, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 3, 1, 2, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {9, 2, 10, 0, 2, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {2, 8, 3, 2, 10, 8, 10, 9, 8, -1, -1, -1, -1, -1, -1, -1}, + {3, 11, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 11, 2, 8, 11, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 9, 0, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 11, 2, 1, 9, 11, 9, 8, 11, -1, -1, -1, -1, -1, -1, -1}, + {3, 10, 1, 11, 10, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 10, 1, 0, 8, 10, 8, 11, 10, -1, -1, -1, -1, -1, -1, -1}, + {3, 9, 0, 3, 11, 9, 11, 10, 9, -1, -1, -1, -1, -1, -1, -1}, + {9, 8, 10, 10, 8, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 3, 0, 7, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 1, 9, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 1, 9, 4, 7, 1, 7, 3, 1, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 10, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 4, 7, 3, 0, 4, 1, 2, 10, -1, -1, -1, -1, -1, -1, -1}, + {9, 2, 10, 9, 0, 2, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1}, + {2, 10, 9, 2, 9, 7, 2, 7, 3, 7, 9, 4, -1, -1, -1, -1}, + {8, 4, 7, 3, 11, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {11, 4, 7, 11, 2, 4, 2, 0, 4, -1, -1, -1, -1, -1, -1, -1}, + {9, 0, 1, 8, 4, 7, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1}, + {4, 7, 11, 9, 4, 11, 9, 11, 2, 9, 2, 1, -1, -1, -1, -1}, + {3, 10, 1, 3, 11, 10, 7, 8, 4, -1, -1, -1, -1, -1, -1, -1}, + {1, 11, 10, 1, 4, 11, 1, 0, 4, 7, 11, 4, -1, -1, -1, -1}, + {4, 7, 8, 9, 0, 11, 9, 11, 10, 11, 0, 3, -1, -1, -1, -1}, + {4, 7, 11, 4, 11, 9, 9, 11, 10, -1, -1, -1, -1, -1, -1, -1}, + {9, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {9, 5, 4, 0, 8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 5, 4, 1, 5, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {8, 5, 4, 8, 3, 5, 3, 1, 5, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 10, 9, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 0, 8, 1, 2, 10, 4, 9, 5, -1, -1, -1, -1, -1, -1, -1}, + {5, 2, 10, 5, 4, 2, 4, 0, 2, -1, -1, -1, -1, -1, -1, -1}, + {2, 10, 5, 3, 2, 5, 3, 5, 4, 3, 4, 8, -1, -1, -1, -1}, + {9, 5, 4, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 11, 2, 0, 8, 11, 4, 9, 5, -1, -1, -1, -1, -1, -1, -1}, + {0, 5, 4, 0, 1, 5, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1}, + {2, 1, 5, 2, 5, 8, 2, 8, 11, 4, 8, 5, -1, -1, -1, -1}, + {10, 3, 11, 10, 1, 3, 9, 5, 4, -1, -1, -1, -1, -1, -1, -1}, + {4, 9, 5, 0, 8, 1, 8, 10, 1, 8, 11, 10, -1, -1, -1, -1}, + {5, 4, 0, 5, 0, 11, 5, 11, 10, 11, 0, 3, -1, -1, -1, -1}, + {5, 4, 8, 5, 8, 10, 10, 8, 11, -1, -1, -1, -1, -1, -1, -1}, + {9, 7, 8, 5, 7, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {9, 3, 0, 9, 5, 3, 5, 7, 3, -1, -1, -1, -1, -1, -1, -1}, + {0, 7, 8, 0, 1, 7, 1, 5, 7, -1, -1, -1, -1, -1, -1, -1}, + {1, 5, 3, 3, 5, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {9, 7, 8, 9, 5, 7, 10, 1, 2, -1, -1, -1, -1, -1, -1, -1}, + {10, 1, 2, 9, 5, 0, 5, 3, 0, 5, 7, 3, -1, -1, -1, -1}, + {8, 0, 2, 8, 2, 5, 8, 5, 7, 10, 5, 2, -1, -1, -1, -1}, + {2, 10, 5, 2, 5, 3, 3, 5, 7, -1, -1, -1, -1, -1, -1, -1}, + {7, 9, 5, 7, 8, 9, 3, 11, 2, -1, -1, 
-1, -1, -1, -1, -1}, + {9, 5, 7, 9, 7, 2, 9, 2, 0, 2, 7, 11, -1, -1, -1, -1}, + {2, 3, 11, 0, 1, 8, 1, 7, 8, 1, 5, 7, -1, -1, -1, -1}, + {11, 2, 1, 11, 1, 7, 7, 1, 5, -1, -1, -1, -1, -1, -1, -1}, + {9, 5, 8, 8, 5, 7, 10, 1, 3, 10, 3, 11, -1, -1, -1, -1}, + {5, 7, 0, 5, 0, 9, 7, 11, 0, 1, 0, 10, 11, 10, 0, -1}, + {11, 10, 0, 11, 0, 3, 10, 5, 0, 8, 0, 7, 5, 7, 0, -1}, + {11, 10, 5, 7, 11, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {10, 6, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 3, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {9, 0, 1, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 8, 3, 1, 9, 8, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1}, + {1, 6, 5, 2, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 6, 5, 1, 2, 6, 3, 0, 8, -1, -1, -1, -1, -1, -1, -1}, + {9, 6, 5, 9, 0, 6, 0, 2, 6, -1, -1, -1, -1, -1, -1, -1}, + {5, 9, 8, 5, 8, 2, 5, 2, 6, 3, 2, 8, -1, -1, -1, -1}, + {2, 3, 11, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {11, 0, 8, 11, 2, 0, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1}, + {0, 1, 9, 2, 3, 11, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1}, + {5, 10, 6, 1, 9, 2, 9, 11, 2, 9, 8, 11, -1, -1, -1, -1}, + {6, 3, 11, 6, 5, 3, 5, 1, 3, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 11, 0, 11, 5, 0, 5, 1, 5, 11, 6, -1, -1, -1, -1}, + {3, 11, 6, 0, 3, 6, 0, 6, 5, 0, 5, 9, -1, -1, -1, -1}, + {6, 5, 9, 6, 9, 11, 11, 9, 8, -1, -1, -1, -1, -1, -1, -1}, + {5, 10, 6, 4, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 3, 0, 4, 7, 3, 6, 5, 10, -1, -1, -1, -1, -1, -1, -1}, + {1, 9, 0, 5, 10, 6, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1}, + {10, 6, 5, 1, 9, 7, 1, 7, 3, 7, 9, 4, -1, -1, -1, -1}, + {6, 1, 2, 6, 5, 1, 4, 7, 8, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 5, 5, 2, 6, 3, 0, 4, 3, 4, 7, -1, -1, -1, -1}, + {8, 4, 7, 9, 0, 5, 0, 6, 5, 0, 2, 6, -1, -1, -1, -1}, + {7, 3, 9, 7, 9, 4, 3, 2, 9, 5, 9, 6, 2, 6, 9, -1}, + {3, 11, 2, 7, 8, 4, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1}, + {5, 10, 6, 4, 7, 2, 4, 2, 0, 2, 7, 11, -1, -1, -1, -1}, + {0, 1, 9, 4, 7, 8, 2, 3, 11, 5, 10, 6, -1, -1, -1, -1}, + {9, 2, 1, 9, 11, 2, 9, 4, 11, 7, 11, 4, 5, 10, 6, -1}, + {8, 4, 7, 3, 11, 5, 3, 5, 1, 5, 11, 6, -1, -1, -1, -1}, + {5, 1, 11, 5, 11, 6, 1, 0, 11, 7, 11, 4, 0, 4, 11, -1}, + {0, 5, 9, 0, 6, 5, 0, 3, 6, 11, 6, 3, 8, 4, 7, -1}, + {6, 5, 9, 6, 9, 11, 4, 7, 9, 7, 11, 9, -1, -1, -1, -1}, + {10, 4, 9, 6, 4, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 10, 6, 4, 9, 10, 0, 8, 3, -1, -1, -1, -1, -1, -1, -1}, + {10, 0, 1, 10, 6, 0, 6, 4, 0, -1, -1, -1, -1, -1, -1, -1}, + {8, 3, 1, 8, 1, 6, 8, 6, 4, 6, 1, 10, -1, -1, -1, -1}, + {1, 4, 9, 1, 2, 4, 2, 6, 4, -1, -1, -1, -1, -1, -1, -1}, + {3, 0, 8, 1, 2, 9, 2, 4, 9, 2, 6, 4, -1, -1, -1, -1}, + {0, 2, 4, 4, 2, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {8, 3, 2, 8, 2, 4, 4, 2, 6, -1, -1, -1, -1, -1, -1, -1}, + {10, 4, 9, 10, 6, 4, 11, 2, 3, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 2, 2, 8, 11, 4, 9, 10, 4, 10, 6, -1, -1, -1, -1}, + {3, 11, 2, 0, 1, 6, 0, 6, 4, 6, 1, 10, -1, -1, -1, -1}, + {6, 4, 1, 6, 1, 10, 4, 8, 1, 2, 1, 11, 8, 11, 1, -1}, + {9, 6, 4, 9, 3, 6, 9, 1, 3, 11, 6, 3, -1, -1, -1, -1}, + {8, 11, 1, 8, 1, 0, 11, 6, 1, 9, 1, 4, 6, 4, 1, -1}, + {3, 11, 6, 3, 6, 0, 0, 6, 4, -1, -1, -1, -1, -1, -1, -1}, + {6, 4, 8, 11, 6, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {7, 10, 6, 7, 8, 10, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1}, + {0, 7, 3, 0, 10, 7, 0, 9, 10, 6, 7, 10, -1, -1, -1, -1}, + {10, 6, 7, 1, 10, 7, 1, 7, 8, 1, 8, 0, -1, -1, -1, -1}, + {10, 6, 7, 10, 7, 1, 1, 7, 3, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 6, 1, 6, 8, 1, 8, 9, 8, 
6, 7, -1, -1, -1, -1}, + {2, 6, 9, 2, 9, 1, 6, 7, 9, 0, 9, 3, 7, 3, 9, -1}, + {7, 8, 0, 7, 0, 6, 6, 0, 2, -1, -1, -1, -1, -1, -1, -1}, + {7, 3, 2, 6, 7, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {2, 3, 11, 10, 6, 8, 10, 8, 9, 8, 6, 7, -1, -1, -1, -1}, + {2, 0, 7, 2, 7, 11, 0, 9, 7, 6, 7, 10, 9, 10, 7, -1}, + {1, 8, 0, 1, 7, 8, 1, 10, 7, 6, 7, 10, 2, 3, 11, -1}, + {11, 2, 1, 11, 1, 7, 10, 6, 1, 6, 7, 1, -1, -1, -1, -1}, + {8, 9, 6, 8, 6, 7, 9, 1, 6, 11, 6, 3, 1, 3, 6, -1}, + {0, 9, 1, 11, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {7, 8, 0, 7, 0, 6, 3, 11, 0, 11, 6, 0, -1, -1, -1, -1}, + {7, 11, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {7, 6, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 0, 8, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 1, 9, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {8, 1, 9, 8, 3, 1, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1}, + {10, 1, 2, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 10, 3, 0, 8, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1}, + {2, 9, 0, 2, 10, 9, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1}, + {6, 11, 7, 2, 10, 3, 10, 8, 3, 10, 9, 8, -1, -1, -1, -1}, + {7, 2, 3, 6, 2, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {7, 0, 8, 7, 6, 0, 6, 2, 0, -1, -1, -1, -1, -1, -1, -1}, + {2, 7, 6, 2, 3, 7, 0, 1, 9, -1, -1, -1, -1, -1, -1, -1}, + {1, 6, 2, 1, 8, 6, 1, 9, 8, 8, 7, 6, -1, -1, -1, -1}, + {10, 7, 6, 10, 1, 7, 1, 3, 7, -1, -1, -1, -1, -1, -1, -1}, + {10, 7, 6, 1, 7, 10, 1, 8, 7, 1, 0, 8, -1, -1, -1, -1}, + {0, 3, 7, 0, 7, 10, 0, 10, 9, 6, 10, 7, -1, -1, -1, -1}, + {7, 6, 10, 7, 10, 8, 8, 10, 9, -1, -1, -1, -1, -1, -1, -1}, + {6, 8, 4, 11, 8, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 6, 11, 3, 0, 6, 0, 4, 6, -1, -1, -1, -1, -1, -1, -1}, + {8, 6, 11, 8, 4, 6, 9, 0, 1, -1, -1, -1, -1, -1, -1, -1}, + {9, 4, 6, 9, 6, 3, 9, 3, 1, 11, 3, 6, -1, -1, -1, -1}, + {6, 8, 4, 6, 11, 8, 2, 10, 1, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 10, 3, 0, 11, 0, 6, 11, 0, 4, 6, -1, -1, -1, -1}, + {4, 11, 8, 4, 6, 11, 0, 2, 9, 2, 10, 9, -1, -1, -1, -1}, + {10, 9, 3, 10, 3, 2, 9, 4, 3, 11, 3, 6, 4, 6, 3, -1}, + {8, 2, 3, 8, 4, 2, 4, 6, 2, -1, -1, -1, -1, -1, -1, -1}, + {0, 4, 2, 4, 6, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 9, 0, 2, 3, 4, 2, 4, 6, 4, 3, 8, -1, -1, -1, -1}, + {1, 9, 4, 1, 4, 2, 2, 4, 6, -1, -1, -1, -1, -1, -1, -1}, + {8, 1, 3, 8, 6, 1, 8, 4, 6, 6, 10, 1, -1, -1, -1, -1}, + {10, 1, 0, 10, 0, 6, 6, 0, 4, -1, -1, -1, -1, -1, -1, -1}, + {4, 6, 3, 4, 3, 8, 6, 10, 3, 0, 3, 9, 10, 9, 3, -1}, + {10, 9, 4, 6, 10, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 9, 5, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 3, 4, 9, 5, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1}, + {5, 0, 1, 5, 4, 0, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1}, + {11, 7, 6, 8, 3, 4, 3, 5, 4, 3, 1, 5, -1, -1, -1, -1}, + {9, 5, 4, 10, 1, 2, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1}, + {6, 11, 7, 1, 2, 10, 0, 8, 3, 4, 9, 5, -1, -1, -1, -1}, + {7, 6, 11, 5, 4, 10, 4, 2, 10, 4, 0, 2, -1, -1, -1, -1}, + {3, 4, 8, 3, 5, 4, 3, 2, 5, 10, 5, 2, 11, 7, 6, -1}, + {7, 2, 3, 7, 6, 2, 5, 4, 9, -1, -1, -1, -1, -1, -1, -1}, + {9, 5, 4, 0, 8, 6, 0, 6, 2, 6, 8, 7, -1, -1, -1, -1}, + {3, 6, 2, 3, 7, 6, 1, 5, 0, 5, 4, 0, -1, -1, -1, -1}, + {6, 2, 8, 6, 8, 7, 2, 1, 8, 4, 8, 5, 1, 5, 8, -1}, + {9, 5, 4, 10, 1, 6, 1, 7, 6, 1, 3, 7, -1, -1, -1, -1}, + {1, 6, 10, 1, 7, 6, 1, 0, 7, 8, 7, 0, 9, 5, 4, -1}, + {4, 0, 10, 4, 10, 5, 0, 3, 10, 6, 10, 7, 3, 7, 10, -1}, + {7, 6, 10, 7, 10, 8, 5, 4, 10, 4, 8, 10, -1, -1, -1, -1}, + {6, 9, 5, 6, 11, 9, 11, 8, 9, -1, 
-1, -1, -1, -1, -1, -1}, + {3, 6, 11, 0, 6, 3, 0, 5, 6, 0, 9, 5, -1, -1, -1, -1}, + {0, 11, 8, 0, 5, 11, 0, 1, 5, 5, 6, 11, -1, -1, -1, -1}, + {6, 11, 3, 6, 3, 5, 5, 3, 1, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 10, 9, 5, 11, 9, 11, 8, 11, 5, 6, -1, -1, -1, -1}, + {0, 11, 3, 0, 6, 11, 0, 9, 6, 5, 6, 9, 1, 2, 10, -1}, + {11, 8, 5, 11, 5, 6, 8, 0, 5, 10, 5, 2, 0, 2, 5, -1}, + {6, 11, 3, 6, 3, 5, 2, 10, 3, 10, 5, 3, -1, -1, -1, -1}, + {5, 8, 9, 5, 2, 8, 5, 6, 2, 3, 8, 2, -1, -1, -1, -1}, + {9, 5, 6, 9, 6, 0, 0, 6, 2, -1, -1, -1, -1, -1, -1, -1}, + {1, 5, 8, 1, 8, 0, 5, 6, 8, 3, 8, 2, 6, 2, 8, -1}, + {1, 5, 6, 2, 1, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 3, 6, 1, 6, 10, 3, 8, 6, 5, 6, 9, 8, 9, 6, -1}, + {10, 1, 0, 10, 0, 6, 9, 5, 0, 5, 6, 0, -1, -1, -1, -1}, + {0, 3, 8, 5, 6, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {10, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {11, 5, 10, 7, 5, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {11, 5, 10, 11, 7, 5, 8, 3, 0, -1, -1, -1, -1, -1, -1, -1}, + {5, 11, 7, 5, 10, 11, 1, 9, 0, -1, -1, -1, -1, -1, -1, -1}, + {10, 7, 5, 10, 11, 7, 9, 8, 1, 8, 3, 1, -1, -1, -1, -1}, + {11, 1, 2, 11, 7, 1, 7, 5, 1, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 3, 1, 2, 7, 1, 7, 5, 7, 2, 11, -1, -1, -1, -1}, + {9, 7, 5, 9, 2, 7, 9, 0, 2, 2, 11, 7, -1, -1, -1, -1}, + {7, 5, 2, 7, 2, 11, 5, 9, 2, 3, 2, 8, 9, 8, 2, -1}, + {2, 5, 10, 2, 3, 5, 3, 7, 5, -1, -1, -1, -1, -1, -1, -1}, + {8, 2, 0, 8, 5, 2, 8, 7, 5, 10, 2, 5, -1, -1, -1, -1}, + {9, 0, 1, 5, 10, 3, 5, 3, 7, 3, 10, 2, -1, -1, -1, -1}, + {9, 8, 2, 9, 2, 1, 8, 7, 2, 10, 2, 5, 7, 5, 2, -1}, + {1, 3, 5, 3, 7, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 7, 0, 7, 1, 1, 7, 5, -1, -1, -1, -1, -1, -1, -1}, + {9, 0, 3, 9, 3, 5, 5, 3, 7, -1, -1, -1, -1, -1, -1, -1}, + {9, 8, 7, 5, 9, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {5, 8, 4, 5, 10, 8, 10, 11, 8, -1, -1, -1, -1, -1, -1, -1}, + {5, 0, 4, 5, 11, 0, 5, 10, 11, 11, 3, 0, -1, -1, -1, -1}, + {0, 1, 9, 8, 4, 10, 8, 10, 11, 10, 4, 5, -1, -1, -1, -1}, + {10, 11, 4, 10, 4, 5, 11, 3, 4, 9, 4, 1, 3, 1, 4, -1}, + {2, 5, 1, 2, 8, 5, 2, 11, 8, 4, 5, 8, -1, -1, -1, -1}, + {0, 4, 11, 0, 11, 3, 4, 5, 11, 2, 11, 1, 5, 1, 11, -1}, + {0, 2, 5, 0, 5, 9, 2, 11, 5, 4, 5, 8, 11, 8, 5, -1}, + {9, 4, 5, 2, 11, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {2, 5, 10, 3, 5, 2, 3, 4, 5, 3, 8, 4, -1, -1, -1, -1}, + {5, 10, 2, 5, 2, 4, 4, 2, 0, -1, -1, -1, -1, -1, -1, -1}, + {3, 10, 2, 3, 5, 10, 3, 8, 5, 4, 5, 8, 0, 1, 9, -1}, + {5, 10, 2, 5, 2, 4, 1, 9, 2, 9, 4, 2, -1, -1, -1, -1}, + {8, 4, 5, 8, 5, 3, 3, 5, 1, -1, -1, -1, -1, -1, -1, -1}, + {0, 4, 5, 1, 0, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {8, 4, 5, 8, 5, 3, 9, 0, 5, 0, 3, 5, -1, -1, -1, -1}, + {9, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 11, 7, 4, 9, 11, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1}, + {0, 8, 3, 4, 9, 7, 9, 11, 7, 9, 10, 11, -1, -1, -1, -1}, + {1, 10, 11, 1, 11, 4, 1, 4, 0, 7, 4, 11, -1, -1, -1, -1}, + {3, 1, 4, 3, 4, 8, 1, 10, 4, 7, 4, 11, 10, 11, 4, -1}, + {4, 11, 7, 9, 11, 4, 9, 2, 11, 9, 1, 2, -1, -1, -1, -1}, + {9, 7, 4, 9, 11, 7, 9, 1, 11, 2, 11, 1, 0, 8, 3, -1}, + {11, 7, 4, 11, 4, 2, 2, 4, 0, -1, -1, -1, -1, -1, -1, -1}, + {11, 7, 4, 11, 4, 2, 8, 3, 4, 3, 2, 4, -1, -1, -1, -1}, + {2, 9, 10, 2, 7, 9, 2, 3, 7, 7, 4, 9, -1, -1, -1, -1}, + {9, 10, 7, 9, 7, 4, 10, 2, 7, 8, 7, 0, 2, 0, 7, -1}, + {3, 7, 10, 3, 10, 2, 7, 4, 10, 1, 10, 0, 4, 0, 10, -1}, + {1, 10, 2, 8, 7, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 9, 1, 4, 1, 7, 7, 1, 3, -1, -1, -1, -1, -1, -1, 
-1}, + {4, 9, 1, 4, 1, 7, 0, 8, 1, 8, 7, 1, -1, -1, -1, -1}, + {4, 0, 3, 7, 4, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {4, 8, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {9, 10, 8, 10, 11, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 0, 9, 3, 9, 11, 11, 9, 10, -1, -1, -1, -1, -1, -1, -1}, + {0, 1, 10, 0, 10, 8, 8, 10, 11, -1, -1, -1, -1, -1, -1, -1}, + {3, 1, 10, 11, 3, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 2, 11, 1, 11, 9, 9, 11, 8, -1, -1, -1, -1, -1, -1, -1}, + {3, 0, 9, 3, 9, 11, 1, 2, 9, 2, 11, 9, -1, -1, -1, -1}, + {0, 2, 11, 8, 0, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {3, 2, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {2, 3, 8, 2, 8, 10, 10, 8, 9, -1, -1, -1, -1, -1, -1, -1}, + {9, 10, 2, 0, 9, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {2, 3, 8, 2, 8, 10, 0, 1, 8, 1, 10, 8, -1, -1, -1, -1}, + {1, 10, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {1, 3, 8, 9, 1, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 9, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {0, 3, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}, + {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}}; + +// Lookup table mapping the number of vertices in the table above. +__device__ __constant__ const uint8_t kNumVertsTable[256] = { + 0, 3, 3, 6, 3, 6, 6, 9, 3, 6, 6, 9, 6, 9, 9, 6, 3, 6, 6, + 9, 6, 9, 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 3, 6, 6, 9, 6, 9, + 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 6, 9, 9, 6, 9, 12, 12, 9, 9, + 12, 12, 9, 12, 15, 15, 6, 3, 6, 6, 9, 6, 9, 9, 12, 6, 9, 9, 12, + 9, 12, 12, 9, 6, 9, 9, 12, 9, 12, 12, 15, 9, 12, 12, 15, 12, 15, 15, + 12, 6, 9, 9, 12, 9, 12, 6, 9, 9, 12, 12, 15, 12, 15, 9, 6, 9, 12, + 12, 9, 12, 15, 9, 6, 12, 15, 15, 12, 15, 6, 12, 3, 3, 6, 6, 9, 6, + 9, 9, 12, 6, 9, 9, 12, 9, 12, 12, 9, 6, 9, 9, 12, 9, 12, 12, 15, + 9, 6, 12, 9, 12, 9, 15, 6, 6, 9, 9, 12, 9, 12, 12, 15, 9, 12, 12, + 15, 12, 15, 15, 12, 9, 12, 12, 9, 12, 15, 15, 12, 12, 9, 15, 6, 15, 12, + 6, 3, 6, 9, 9, 12, 9, 12, 12, 15, 9, 12, 12, 15, 6, 9, 9, 6, 9, + 12, 12, 15, 12, 15, 15, 6, 12, 9, 15, 12, 9, 6, 12, 3, 9, 12, 12, 15, + 12, 15, 9, 12, 12, 15, 15, 6, 9, 12, 6, 3, 6, 9, 9, 6, 9, 12, 6, + 3, 9, 6, 12, 3, 6, 3, 3, 0, +}; + +// Number of triangles for each case +__device__ __constant__ const uint8_t kNumTrianglesTable[256] = { + 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 2, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 3, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 3, + 2, 3, 3, 2, 3, 4, 4, 3, 3, 4, 4, 3, 4, 5, 5, 2, 1, 2, 2, 3, 2, 3, 3, 4, + 2, 3, 3, 4, 3, 4, 4, 3, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 4, + 2, 3, 3, 4, 3, 4, 2, 3, 3, 4, 4, 5, 4, 5, 3, 2, 3, 4, 4, 3, 4, 5, 3, 2, + 4, 5, 5, 4, 5, 2, 4, 1, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 3, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 2, 4, 3, 4, 3, 5, 2, 2, 3, 3, 4, 3, 4, 4, 5, + 3, 4, 4, 5, 4, 5, 5, 4, 3, 4, 4, 3, 4, 5, 5, 4, 4, 3, 5, 2, 5, 4, 2, 1, + 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 2, 3, 3, 2, 3, 4, 4, 5, 4, 5, 5, 2, + 4, 3, 5, 4, 3, 2, 4, 1, 3, 4, 4, 5, 4, 5, 3, 4, 4, 5, 5, 2, 3, 4, 2, 1, + 2, 3, 3, 2, 3, 4, 2, 1, 3, 2, 4, 1, 2, 1, 1, 0}; + +// Lookup table from the 12 cube edge indices to their corresponding corner +// indices. +__device__ __constant__ const uint8_t kEdgeIndexPairs[12][2] = { + {0, 1}, {1, 2}, {2, 3}, {3, 0}, {4, 5}, {5, 6}, + {6, 7}, {7, 4}, {0, 4}, {1, 5}, {2, 6}, {3, 7}}; + +// Lookup table for voxel neighbors. 
+__device__ __constant__ const uint8_t kCornerIndexOffsets[8][3] = { + {0, 0, 0}, {1, 0, 0}, {1, 1, 0}, {0, 1, 0}, + {0, 0, 1}, {1, 0, 1}, {1, 1, 1}, {0, 1, 1}}; + +} // namespace marching_cubes +} // namespace nvblox diff --git a/nvblox/include/nvblox/mesh/marching_cubes.h b/nvblox/include/nvblox/mesh/marching_cubes.h new file mode 100644 index 00000000..a6714b3b --- /dev/null +++ b/nvblox/include/nvblox/mesh/marching_cubes.h @@ -0,0 +1,90 @@ +// The MIT License (MIT) +// Copyright (c) 2014 Matthew Klingensmith and Ivan Dryanovski +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#pragma once + +#include + +#include "nvblox/core/types.h" +#include "nvblox/mesh/mesh_block.h" + +namespace nvblox { +namespace marching_cubes { + +// This (internal) struct contains intermediate results of marching cubes. +struct PerVoxelMarchingCubesResults { + // The 3D positions of the corners of a 2x2x2 cube of voxels formed by the + // surrounding voxel neighbours in the positive direction of each coordinate. + Vector3f vertex_coords[8]; + // The value of the TSDF at each of the neighbouring voxels described above. + float vertex_sdf[8]; + // Does this voxel contain a mesh? (Does it straddle a zero level set?) + bool contains_mesh = false; + // The index into the marching cubes triangle table (found in + // nvblox/mesh/impl/marching_cubes_table.h). This index is determined based on + // the tsdf configuration (+/-) of the surrounding voxels and is the main + // contribution of marching cubes algorithm. + uint8_t marching_cubes_table_index = 0; + // At the end of marching cubes, vertices calculated for this and other voxels + // in this MeshBlock are stored in a single vector. This member indicates where + // in this vertex vector the vertices associated with this voxel begin. It is + // calculated through an exclusive prefix sum of the numbers of vertices + // in each voxel of this MeshBlock. + int vertex_vector_start_index; +}; + +// Performs the marching cubes algorithm to generate a mesh layer from a TSDF. +// Implementation taken heavily from Open Chisel +// https://github.com/personalrobotics/OpenChisel + +// Calculate the vertex configuration of a given set of neighbor distances. +__host__ __device__ int calculateVertexConfiguration(const float vertex_sdf[8]); + +// This is for blocks access in the kernel. 
+__host__ __device__ int neighborIndexFromDirection(const Index3D& direction); +__host__ __device__ Index3D directionFromNeighborIndex(const int index); + +// Output (edge coords) is 12 long. Should be preallocated. +__host__ __device__ void interpolateEdgeVertices( + const PerVoxelMarchingCubesResults& marching_cubes_results, + Eigen::Matrix* edge_coords); + +/// Performs linear interpolation on two cube corners to find the approximate +/// zero crossing (surface) value. +__host__ __device__ Vector3f interpolateVertex(const Vector3f& vertex1, + const Vector3f& vertex2, + float sdf1, float sdf2); + +// Actually populate the mesh block. +__host__ void meshCube( + const PerVoxelMarchingCubesResults& marching_cubes_results, + MeshBlock* mesh); + +__device__ void calculateOutputIndex( + PerVoxelMarchingCubesResults* marching_cubes_results, int* size); + +__device__ void calculateVertices( + const PerVoxelMarchingCubesResults& marching_cubes_results, + CudaMeshBlock* mesh); + +} // namespace marching_cubes +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/mesh/mesh.h b/nvblox/include/nvblox/mesh/mesh.h new file mode 100644 index 00000000..4e00d9c1 --- /dev/null +++ b/nvblox/include/nvblox/mesh/mesh.h @@ -0,0 +1,39 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include +#include +#include + +namespace nvblox { + +// A structure which holds a Mesh. +// Generally produced as the result of fusing MeshBlocks in a Layer +// into a single mesh. +// NOTE(alexmillane): Currently only on the CPU. +struct Mesh { + // Data + std::vector vertices; + std::vector normals; + std::vector triangles; + std::vector colors; + + // Factory + static Mesh fromLayer(const BlockLayer& layer); +}; + +} // namespace nvblox diff --git a/nvblox/include/nvblox/mesh/mesh_block.h b/nvblox/include/nvblox/mesh/mesh_block.h new file mode 100644 index 00000000..56726366 --- /dev/null +++ b/nvblox/include/nvblox/mesh/mesh_block.h @@ -0,0 +1,86 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include + +#include "nvblox/core/blox.h" +#include "nvblox/core/color.h" +#include "nvblox/core/types.h" +#include "nvblox/core/unified_ptr.h" +#include "nvblox/core/unified_vector.h" + +namespace nvblox { + +// A mesh block containing all of the triangles from this block. +// Each block contains only the UPPER part of its neighbors: i.e., the max +// x, y, and z axes. Its neighbors are responsible for the rest. 
+struct MeshBlock { + typedef std::shared_ptr Ptr; + typedef std::shared_ptr ConstPtr; + + MeshBlock(MemoryType memory_type = MemoryType::kDevice); + + // Mesh Data + // These unified vectors contain the mesh data for this block. Note that + // Colors and/or intensities are optional. The "triangles" vector is a vector + // of indices into the vertices vector. Triplets of consecutive elements form + // triangles with the indexed vertices as their corners. + unified_vector vertices; + unified_vector normals; + unified_vector colors; + unified_vector intensities; + unified_vector triangles; + + void clear(); + + // Resize and reserve. + void resizeToNumberOfVertices(size_t new_size); + void reserveNumberOfVertices(size_t new_capacity); + + size_t size() const; + size_t capacity() const; + + // Resize colors/intensities such that: + // `colors.size()/intensities.size() == vertices.size()` + void expandColorsToMatchVertices(); + void expandIntensitiesToMatchVertices(); + + // Copy mesh data to the CPU. + std::vector getVertexVectorOnCPU() const; + std::vector getNormalVectorOnCPU() const; + std::vector getTriangleVectorOnCPU() const; + std::vector getColorVectorOnCPU() const; + + // Note(alexmillane): Memory type ignored, MeshBlocks live in CPU memory. + static Ptr allocate(MemoryType memory_type); +}; + +// Helper struct for mesh blocks on CUDA. +// NOTE: We need this because we cant pass MeshBlock to kernel functions because +// of the presence of unified_vector members. +struct CudaMeshBlock { + CudaMeshBlock() = default; + CudaMeshBlock(MeshBlock* block); + + Vector3f* vertices; + Vector3f* normals; + int* triangles; + Color* colors; + int size; +}; + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/mesh/mesh_integrator.h b/nvblox/include/nvblox/mesh/mesh_integrator.h new file mode 100644 index 00000000..e17852e6 --- /dev/null +++ b/nvblox/include/nvblox/mesh/mesh_integrator.h @@ -0,0 +1,133 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/common_names.h" +#include "nvblox/core/layer.h" +#include "nvblox/mesh/marching_cubes.h" +#include "nvblox/mesh/mesh_block.h" + +namespace nvblox { + +class MeshIntegrator { + public: + MeshIntegrator(); + ~MeshIntegrator(); + + /// Chooses the default mesher between CPU and GPU. + bool integrateMeshFromDistanceField( + const TsdfLayer& distance_layer, BlockLayer* mesh_layer, + const DeviceType device_type = DeviceType::kGPU); + + /// Integrates only the selected blocks from the distance layer. + bool integrateBlocksCPU(const TsdfLayer& distance_layer, + const std::vector& block_indices, + BlockLayer* mesh_layer); + + bool integrateBlocksGPU(const TsdfLayer& distance_layer, + const std::vector& block_indices, + BlockLayer* mesh_layer); + + // Color mesh layer. + // TODO(alexmillane): Currently these functions color vertices by taking the + // CLOSEST color. Would be good to have an option at least for interpolation. 
+ void colorMesh(const ColorLayer& color_layer, MeshLayer* mesh_layer); + void colorMesh(const ColorLayer& color_layer, + const std::vector& block_indices, + MeshLayer* mesh_layer); + void colorMeshGPU(const ColorLayer& color_layer, MeshLayer* mesh_layer); + void colorMeshGPU(const ColorLayer& color_layer, + const std::vector& block_indices, + MeshLayer* mesh_layer); + void colorMeshCPU(const ColorLayer& color_layer, MeshLayer* mesh_layer); + void colorMeshCPU(const ColorLayer& color_layer, + const std::vector& block_indices, + MeshLayer* mesh_layer); + + float min_weight() const { return min_weight_; } + float& min_weight() { return min_weight_; } + + bool weld_vertices() const { return weld_vertices_; } + bool& weld_vertices() { return weld_vertices_; } + + private: + bool isBlockMeshable(const VoxelBlock::ConstPtr block, + float cutoff) const; + + bool getTriangleCandidatesAroundVoxel( + const VoxelBlock::ConstPtr block, + const std::vector::ConstPtr>& neighbor_blocks, + const Index3D& index, const Vector3f& voxel_position, + const float voxel_size, + marching_cubes::PerVoxelMarchingCubesResults* neighbors); + + void getTriangleCandidatesInBlock( + const VoxelBlock::ConstPtr block, + const std::vector::ConstPtr>& neighbor_blocks, + const Index3D& block_index, const float block_size, + std::vector* + triangle_candidates); + + void getMeshableBlocksGPU(const TsdfLayer& distance_layer, + const std::vector& block_indices, + float cutoff_distance, + std::vector* meshable_blocks); + + void meshBlocksGPU(const TsdfLayer& distance_layer, + const std::vector& block_indices, + BlockLayer* mesh_layer); + + void weldVertices(const std::vector& block_indices, + BlockLayer* mesh_layer); + + // Minimum weight to actually mesh. + float min_weight_ = 1e-4; + + /// Whether to perform vertex welding or not. It's slow but cuts down number + /// of vertices by 5x. + bool weld_vertices_ = false; + + // Offsets for cube indices. + Eigen::Matrix cube_index_offsets_; + + // The color that the mesh takes if no coloring is available. + Color default_mesh_color_ = Color::Gray(); + + // State. + cudaStream_t cuda_stream_ = nullptr; + + // These are temporary variables so we don't have to allocate every single + // frame. + host_vector*> block_ptrs_host_; + device_vector*> block_ptrs_device_; + host_vector meshable_host_; + device_vector meshable_device_; + host_vector block_positions_host_; + device_vector block_positions_device_; + host_vector mesh_blocks_host_; + device_vector mesh_blocks_device_; + // Caches for welding. + device_vector input_vertices_; + device_vector input_normals_; + + // Intermediate marching cube results. + device_vector + marching_cubes_results_device_; + device_vector mesh_block_sizes_device_; + host_vector mesh_block_sizes_host_; +}; + +} // namespace nvblox diff --git a/nvblox/include/nvblox/nvblox.h b/nvblox/include/nvblox/nvblox.h new file mode 100644 index 00000000..bb13f1ab --- /dev/null +++ b/nvblox/include/nvblox/nvblox.h @@ -0,0 +1,28 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/blox.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/mapper.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/voxels.h" +#include "nvblox/integrators/esdf_integrator.h" +#include "nvblox/integrators/projective_color_integrator.h" +#include "nvblox/integrators/projective_integrator_base.h" +#include "nvblox/integrators/projective_tsdf_integrator.h" +#include "nvblox/io/csv.h" +#include "nvblox/mesh/mesh_integrator.h" diff --git a/nvblox/include/nvblox/primitives/impl/scene_impl.h b/nvblox/include/nvblox/primitives/impl/scene_impl.h new file mode 100644 index 00000000..be00ec60 --- /dev/null +++ b/nvblox/include/nvblox/primitives/impl/scene_impl.h @@ -0,0 +1,112 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +---- Original voxblox license, which this file is heavily based on: ---- +Copyright (c) 2016, ETHZ ASL +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of voxblox nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#pragma once + +#include "nvblox/core/accessors.h" +#include "nvblox/core/bounding_boxes.h" +#include "nvblox/core/indexing.h" +#include "nvblox/core/types.h" +#include "nvblox/core/voxels.h" +#include "nvblox/utils/timing.h" + +namespace nvblox { +namespace primitives { + +template <> +inline void Scene::setVoxel(float dist, TsdfVoxel* voxel) const { + voxel->distance = dist; + voxel->weight = 1.0f; // Just to make sure it gets visualized/meshed/etc. 
+} + +template +void Scene::generateSdfFromScene(float max_dist, + VoxelBlockLayer* layer) const { + CHECK(layer->memory_type() == MemoryType::kUnified) + << "For scene generation the layer must be CPU accessible " + "(MemoryType::kUnified)."; + timing::Timer sim_timer("primitives/generate_sdf"); + + CHECK_NOTNULL(layer); + + float block_size = layer->block_size(); + + // First allocate all the blocks within the AABB. + std::vector block_indices = + getBlockIndicesTouchedByBoundingBox(block_size, aabb_); + + for (const Index3D& block_index : block_indices) { + layer->allocateBlockAtIndex(block_index); + } + + // Iterate over every voxel in the layer and compute its distance to all + // objects. + auto lambda = [&block_size, &max_dist, this](const Index3D& block_index, + const Index3D& voxel_index, + VoxelType* voxel) { + const Vector3f position = getCenterPostionFromBlockIndexAndVoxelIndex( + block_size, block_index, voxel_index); + + if (!aabb_.contains(position)) { + return; + } + + // Iterate over all objects and get distances to this thing. + // Only computes up to max_distance away from the voxel (to reduce amount + // of ray casting). + float voxel_dist = getSignedDistanceToPoint(position, max_dist); + + // Then update the thing. + // Also truncate the distance *inside* the obstacle, which is not truncated + // by the previous function. + voxel_dist = std::max(voxel_dist, -max_dist); + setVoxel(voxel_dist, voxel); + }; + + // Call above lambda on every voxel in the layer. + callFunctionOnAllVoxels(layer, lambda); +} + +} // namespace primitives +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/primitives/primitives.h b/nvblox/include/nvblox/primitives/primitives.h new file mode 100644 index 00000000..d532914c --- /dev/null +++ b/nvblox/include/nvblox/primitives/primitives.h @@ -0,0 +1,169 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +---- Original voxblox license, which this file is heavily based on: ---- +Copyright (c) 2016, ETHZ ASL +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of voxblox nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#pragma once + +#include + +#include "nvblox/core/color.h" +#include "nvblox/core/types.h" + +namespace nvblox { +namespace primitives { + +/// Base class for primitive objects. +class Primitive { + public: + enum class Type { kPlane, kCube, kSphere, kCylinder }; + + // Epsilon for ray intersection and computation. + static constexpr float kEpsilon = 1e-4; + + Primitive(const Vector3f& center, Type type) + : Primitive(center, type, Color::White()) {} + Primitive(const Vector3f& center, Type type, const Color& color) + : center_(center), type_(type), color_(color) {} + virtual ~Primitive() {} + + /// Map-building accessors. + virtual float getDistanceToPoint(const Vector3f& point) const = 0; + + Color getColor() const { return color_; } + Type getType() const { return type_; } + + /// Raycasting accessors. + virtual bool getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* intersect_point, + float* intersect_dist) const = 0; + + protected: + Vector3f center_; + Type type_; + Color color_; +}; + +class Sphere : public Primitive { + public: + Sphere(const Vector3f& center, float radius) + : Primitive(center, Type::kSphere), radius_(radius) {} + Sphere(const Vector3f& center, float radius, const Color& color) + : Primitive(center, Type::kSphere, color), radius_(radius) {} + + virtual float getDistanceToPoint(const Vector3f& point) const override; + + virtual bool getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* intersect_point, + float* intersect_dist) const override; + + protected: + float radius_; +}; + +class Cube : public Primitive { + public: + Cube(const Vector3f& center, const Vector3f& size) + : Primitive(center, Type::kCube), size_(size) {} + Cube(const Vector3f& center, const Vector3f& size, const Color& color) + : Primitive(center, Type::kCube, color), size_(size) {} + + virtual float getDistanceToPoint(const Vector3f& point) const override; + virtual bool getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* intersect_point, + float* intersect_dist) const override; + + protected: + Vector3f size_; +}; + +/// Requires normal being passed in to ALREADY BE NORMALIZED!!!! +class Plane : public Primitive { + public: + Plane(const Vector3f& center, const Vector3f& normal) + : Primitive(center, Type::kPlane), normal_(normal) { + CHECK_NEAR(normal.norm(), 1.0, 1e-3); + } + Plane(const Vector3f& center, const Vector3f& normal, const Color& color) + : Primitive(center, Type::kPlane, color), normal_(normal) { + CHECK_NEAR(normal.norm(), 1.0, 1e-3); + } + + virtual float getDistanceToPoint(const Vector3f& point) const override; + + virtual bool getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* intersect_point, + float* intersect_dist) const override; + + protected: + Vector3f normal_; +}; + +/// Cylinder centered on the XY plane! 
+class Cylinder : public Primitive { + public: + Cylinder(const Vector3f& center, float radius, float height) + : Primitive(center, Type::kCylinder), radius_(radius), height_(height) {} + Cylinder(const Vector3f& center, float radius, float height, + const Color& color) + : Primitive(center, Type::kCylinder, color), + radius_(radius), + height_(height) {} + + virtual float getDistanceToPoint(const Vector3f& point) const override; + + virtual bool getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* intersect_point, + float* intersect_dist) const override; + + protected: + float radius_; + float height_; +}; + +} // namespace primitives +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/include/nvblox/primitives/scene.h b/nvblox/include/nvblox/primitives/scene.h new file mode 100644 index 00000000..57a9bead --- /dev/null +++ b/nvblox/include/nvblox/primitives/scene.h @@ -0,0 +1,118 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +---- Original voxblox license, which this file is heavily based on: ---- +Copyright (c) 2016, ETHZ ASL +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of voxblox nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ +#pragma once + +#include +#include + +#include "nvblox/core/blox.h" +#include "nvblox/core/camera.h" +#include "nvblox/core/image.h" +#include "nvblox/core/layer.h" +#include "nvblox/primitives/primitives.h" + +namespace nvblox { +namespace primitives { + +class Scene { + public: + Scene(); + + /// === Creating an environment === + void addPrimitive(std::unique_ptr primitive); + + /// Convenience functions for setting up bounded areas. + void addGroundLevel(float height); + void addCeiling(float height); + + /// Add 4 walls (infinite planes) bounding the space. In case this is not the + /// desired behavior, can use addObject to add walls manually one by one. + /// If infinite walls are undesirable, then use cubes. + void addPlaneBoundaries(float x_min, float x_max, float y_min, float y_max); + + /// Deletes all objects! + void clear(); + + /// === Generating synthetic data from environment === + /// Generates a synthetic view given camera parameters and a transformation + /// of the camera to the scene. + void generateDepthImageFromScene(const Camera& camera, const Transform& T_S_C, + float max_dist, + DepthImage* depth_frame) const; + + /// === Computing ground truth SDFs === + template + void generateSdfFromScene(float max_dist, + VoxelBlockLayer* layer) const; + + /// Computes distance to an arbitrary point across all objects. + /// Positive distance is computed only up to max_dist, though negative + /// distance may be smaller than -max_dist. + float getSignedDistanceToPoint(const Vector3f& coords, float max_dist) const; + + /// Get the intersection of a ray with the first hit object in the scene. + bool getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* ray_intersection, float* ray_dist) const; + + const AxisAlignedBoundingBox& aabb() const { return aabb_; } + AxisAlignedBoundingBox& aabb() { return aabb_; } + + protected: + template + inline void setVoxel(float dist, VoxelType* voxel) const; + + /// Vector storing pointers to all the objects in this world. + std::vector> primitives_; + + /// World boundaries... Can be changed arbitrarily, just sets ground truth + /// generation and visualization bounds, accurate only up to block size. + AxisAlignedBoundingBox aabb_; +}; + +} // namespace primitives +} // namespace nvblox + +#include "nvblox/primitives/impl/scene_impl.h" diff --git a/nvblox/include/nvblox/ray_tracing/sphere_tracer.h b/nvblox/include/nvblox/ray_tracing/sphere_tracer.h new file mode 100644 index 00000000..9516de2f --- /dev/null +++ b/nvblox/include/nvblox/ray_tracing/sphere_tracer.h @@ -0,0 +1,74 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#pragma once + +#include +#include + +#include "nvblox/core/camera.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/image.h" +#include "nvblox/gpu_hash/gpu_layer_view.h" + +namespace nvblox { + +struct Ray { + Vector3f direction; + Vector3f origin; +}; + +class SphereTracer { + public: + struct Params { + Params() {} + + int maximum_steps = 100; + float maximum_ray_length_m = 15.0f; + float surface_distance_epsilon_vox = 0.1f; + }; + + SphereTracer(Params params = Params()); + ~SphereTracer(); + + // Render an image + // We return a pointer to the buffer on GPU where the image is rendered. + // Note that additional calls to this function will change the contents of the + // pointed-to image. + std::shared_ptr renderImageOnGPU( + const Camera& camera, const Transform& T_S_C, const TsdfLayer& tsdf_layer, + const float truncation_distance_m, + const MemoryType output_image_memory_type = MemoryType::kDevice, + const int ray_subsampling_factor = 1); + + // NOTE(alexmillane): This starts a single threaded kernel to cast the ray + // and so is not efficient. Intended for testing rather than production. + bool castOnGPU(const Ray& ray, const TsdfLayer& tsdf_layer, + const float truncation_distance_m, float* t) const; + + const Params& params() const { return params_; } + Params& params() { return params_; } + + private: + // Device working space + float* t_device_; + bool* success_flag_device_; + std::shared_ptr depth_image_; + + Params params_; + cudaStream_t tracing_stream_; +}; + +} // namespace nvblox diff --git a/nvblox/include/nvblox/utils/nvtx_ranges.h b/nvblox/include/nvblox/utils/nvtx_ranges.h new file mode 100644 index 00000000..6be20dc3 --- /dev/null +++ b/nvblox/include/nvblox/utils/nvtx_ranges.h @@ -0,0 +1,55 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#pragma once + +#include + +#include + +#include "nvblox/core/color.h" + +namespace nvblox { +namespace timing { + +class NvtxRange { + public: + NvtxRange(const std::string& message, const Color& color, + bool constructed_stopped = false); + NvtxRange(const std::string& message, bool constructed_stopped = false); + ~NvtxRange(); + + NvtxRange() = delete; + NvtxRange(const NvtxRange& other) = delete; + + void Start(); + void Stop(); + + bool Started() const { return started_; }; + + private: + void Init(const std::string& message, const Color& color); + void Init(const std::string& message, const uint32_t color); + + bool started_; + nvtxEventAttributes_t event_attributes_; + nvtxRangeId_t id_; +}; + +void mark(const std::string& message, const Color& color); +void mark(const std::string& message); + +} // namespace timing +} // namespace nvblox diff --git a/nvblox/include/nvblox/utils/timing.h b/nvblox/include/nvblox/utils/timing.h new file mode 100644 index 00000000..8286e66e --- /dev/null +++ b/nvblox/include/nvblox/utils/timing.h @@ -0,0 +1,225 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +---- Original license for this file: ---- + * Copyright (C) 2012-2013 Simon Lynen, ASL, ETH Zurich, Switzerland + * You can contact the author at + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/* Adapted from Paul Furgale Schweizer Messer sm_timing */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "nvblox/utils/nvtx_ranges.h" + +namespace nvblox { +namespace timing { + +// Forward declaration +class TimerChrono; +class TimerNvtx; +class DummyTimer; + +// The project wide typedef defining the type of timer to be used +typedef TimerNvtx Timer; + +template +class Accumulator { + public: + Accumulator() + : window_samples_(0), + totalsamples_(0), + window_sum_(0), + sum_(0), + min_(std::numeric_limits::max()), + max_(std::numeric_limits::min()) {} + + void Add(T sample) { + if (window_samples_ < N) { + samples_[window_samples_++] = sample; + window_sum_ += sample; + } else { + T& oldest = samples_[window_samples_++ % N]; + window_sum_ += sample - oldest; + oldest = sample; + } + sum_ += sample; + ++totalsamples_; + if (sample > max_) { + max_ = sample; + } + if (sample < min_) { + min_ = sample; + } + } + + int TotalSamples() const { return totalsamples_; } + + double Sum() const { return sum_; } + + double Mean() const { return sum_ / totalsamples_; } + + double RollingMean() const { + return window_sum_ / std::min(window_samples_, N); + } + + double Max() const { return max_; } + + double Min() const { return min_; } + + double LazyVariance() const { + if (window_samples_ == 0) { + return 0.0; + } + double var = 0; + double mean = RollingMean(); + for (int i = 0; i < std::min(window_samples_, N); ++i) { + var += (samples_[i] - mean) * (samples_[i] - mean); + } + var /= std::min(window_samples_, N); + return var; + } + + private: + int window_samples_; + int totalsamples_; + Total window_sum_; + Total sum_; + T min_; + T max_; + T samples_[N]; +}; + +struct TimerMapValue { + TimerMapValue() {} + + /// Create an accumulator with specified window size. + Accumulator acc_; +}; + +/** + * A class that has the timer interface but does nothing. Swapping this in in + * place of the Timer class (say with a typedef) should allow one to disable + * timing. Because all of the functions are inline, they should just disappear. + */ +class DummyTimer { + public: + explicit DummyTimer(size_t /*handle*/, bool /*constructStopped*/ = false) {} + explicit DummyTimer(std::string const& /*tag*/, + bool /*constructStopped*/ = false) {} + ~DummyTimer() {} + + void Start() {} + void Stop() {} + bool IsTiming() { return false; } +}; + +class TimerChrono { + public: + explicit TimerChrono(size_t handle, bool constructStopped = false); + explicit TimerChrono(std::string const& tag, bool constructStopped = false); + ~TimerChrono(); + + void Start(); + void Stop(); + bool IsTiming() const; + + private: + std::chrono::time_point time_; + + bool timing_; + size_t handle_; +}; + +class TimerNvtx { + public: + explicit TimerNvtx(std::string const& tag, const Color& color, + bool constructStopped = false); + explicit TimerNvtx(std::string const& tag, bool constructStopped = false); + + void Start(); + void Stop(); + bool IsTiming() const; + + private: + NvtxRange nvtx_range_; + TimerChrono timer_; +}; + +class Timing { + public: + typedef std::map map_t; + friend class TimerChrono; + // Definition of static functions to query the timers. 
+ static size_t GetHandle(std::string const& tag); + static std::string GetTag(size_t handle); + static double GetTotalSeconds(size_t handle); + static double GetTotalSeconds(std::string const& tag); + static double GetMeanSeconds(size_t handle); + static double GetMeanSeconds(std::string const& tag); + static size_t GetNumSamples(size_t handle); + static size_t GetNumSamples(std::string const& tag); + static double GetVarianceSeconds(size_t handle); + static double GetVarianceSeconds(std::string const& tag); + static double GetMinSeconds(size_t handle); + static double GetMinSeconds(std::string const& tag); + static double GetMaxSeconds(size_t handle); + static double GetMaxSeconds(std::string const& tag); + static double GetHz(size_t handle); + static double GetHz(std::string const& tag); + static void Print(std::ostream& out); + static std::string Print(); + static std::string SecondsToTimeString(double seconds); + static void Reset(); + static const map_t& GetTimers() { return Instance().tagMap_; } + + private: + void AddTime(size_t handle, double seconds); + + static Timing& Instance(); + + Timing(); + ~Timing(); + + typedef std::vector list_t; + + list_t timers_; + map_t tagMap_; + size_t maxTagLength_; + std::mutex mutex_; +}; + +} // namespace timing +} // namespace nvblox diff --git a/nvblox/src/benchmarks/sphere_benchmark.cpp b/nvblox/src/benchmarks/sphere_benchmark.cpp new file mode 100644 index 00000000..7a2a48b7 --- /dev/null +++ b/nvblox/src/benchmarks/sphere_benchmark.cpp @@ -0,0 +1,179 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#include + +#include "nvblox/core/accessors.h" +#include "nvblox/core/camera.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/cuda/warmup.h" +#include "nvblox/core/image.h" +#include "nvblox/core/layer.h" +#include "nvblox/integrators/esdf_integrator.h" +#include "nvblox/integrators/projective_tsdf_integrator.h" +#include "nvblox/io/mesh_io.h" +#include "nvblox/io/ply_writer.h" +#include "nvblox/io/pointcloud_io.h" +#include "nvblox/mesh/mesh_block.h" +#include "nvblox/mesh/mesh_integrator.h" +#include "nvblox/primitives/scene.h" +#include "nvblox/utils/timing.h" + +namespace nvblox { + +class SphereBenchmark { + public: + SphereBenchmark(); + + void runBenchmark(const std::string& csv_output_path = ""); + bool outputMesh(const std::string& ply_output_path); + + private: + // Settings. Do not modify or the benchmark isn't comparable. + static constexpr float kVoxelSize = 0.05; + static constexpr float kBlockSize = + VoxelBlock::kVoxelsPerSide * kVoxelSize; + static constexpr int kNumTrajectoryPoints = 80; + static constexpr float kSphereRadius = 2.0f; + static constexpr float kTrajectoryRadius = 4.0f; + static constexpr float kMaxEnvironmentDimension = 5.0f; + + // Actual layers. + TsdfLayer tsdf_layer_; + EsdfLayer esdf_layer_; + MeshLayer mesh_layer_; + + // Simulated camera. 
+ constexpr static float fu_ = 300; + constexpr static float fv_ = 300; + constexpr static int width_ = 640; + constexpr static int height_ = 480; + constexpr static float cu_ = static_cast(width_) / 2.0f; + constexpr static float cv_ = static_cast(height_) / 2.0f; + Camera camera_; +}; + +SphereBenchmark::SphereBenchmark() + : tsdf_layer_(kVoxelSize, MemoryType::kDevice), + esdf_layer_(kVoxelSize, MemoryType::kUnified), + mesh_layer_(kBlockSize, MemoryType::kUnified), + camera_(Camera(fu_, fv_, cu_, cv_, width_, height_)) {} + +void SphereBenchmark::runBenchmark(const std::string& csv_output_path) { + // Create an integrator with default settings. + ProjectiveTsdfIntegrator integrator; + MeshIntegrator mesh_integrator; + EsdfIntegrator esdf_integrator; + esdf_integrator.max_distance_m() = 4.0; + + // Scene is bounded to the dimensions above. + primitives::Scene scene; + scene.aabb() = AxisAlignedBoundingBox( + Vector3f(-kMaxEnvironmentDimension, -kMaxEnvironmentDimension, 0.0f), + Vector3f(kMaxEnvironmentDimension, kMaxEnvironmentDimension, + kMaxEnvironmentDimension)); + // Create a scene with a ground plane and a sphere. + scene.addGroundLevel(0.0f); + scene.addCeiling(kMaxEnvironmentDimension); + scene.addPrimitive(std::make_unique( + Vector3f(0.0f, 0.0f, kSphereRadius), kSphereRadius)); + // Add bounding planes at 5 meters. Basically makes it sphere in a box. + scene.addPlaneBoundaries(-kMaxEnvironmentDimension, kMaxEnvironmentDimension, + -kMaxEnvironmentDimension, kMaxEnvironmentDimension); + + // Simulate a trajectory of the requisite amount of points, on the circle + // around the sphere. + const float radians_increment = 2 * M_PI / (kNumTrajectoryPoints); + + // Create a depth frame. We share this memory buffer for the entire + // trajectory. + DepthImage depth_frame(camera_.height(), camera_.width(), + MemoryType::kUnified); + + for (size_t i = 0; i < kNumTrajectoryPoints; i++) { + const float theta = radians_increment * i; + // Convert polar to cartesian coordinates. + Vector3f cartesian_coordinates(kTrajectoryRadius * std::cos(theta), + kTrajectoryRadius * std::sin(theta), + kSphereRadius); + // The camera has its z axis pointing towards the origin. + Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5); + Eigen::Quaternionf rotation_theta( + Eigen::AngleAxisf(M_PI + theta, Vector3f::UnitZ())); + + // Construct a transform from camera to scene with this. + Transform T_S_C = Transform::Identity(); + T_S_C.prerotate(rotation_theta * rotation_base); + T_S_C.pretranslate(cartesian_coordinates); + + // Generate a depth image of the scene. + scene.generateDepthImageFromScene( + camera_, T_S_C, 2 * kMaxEnvironmentDimension, &depth_frame); + + std::vector updated_blocks; + // Integrate this depth image. + { + timing::Timer integration_timer("benchmark/integrate_tsdf"); + integrator.integrateFrame(depth_frame, T_S_C, camera_, &tsdf_layer_, + &updated_blocks); + } + + // Integrate the mesh. + { + timing::Timer mesh_timer("benchmark/integrate_mesh"); + mesh_integrator.integrateBlocksGPU(tsdf_layer_, updated_blocks, + &mesh_layer_); + } + + // Integrate the ESDF. 
+ { + timing::Timer esdf_timer("benchmark/integrate_esdf"); + esdf_integrator.integrateBlocksOnGPU(tsdf_layer_, updated_blocks, + &esdf_layer_); + } + } +} + +bool SphereBenchmark::outputMesh(const std::string& ply_output_path) { + timing::Timer timer_write("mesh/write"); + return io::outputMeshLayerToPly(mesh_layer_, ply_output_path); +} + +} // namespace nvblox + +int main(int argc, char* argv[]) { + google::InitGoogleLogging(argv[0]); + FLAGS_alsologtostderr = true; + google::InstallFailureSignalHandler(); + + nvblox::warmupCuda(); + + std::string output_mesh_path = ""; + if (argc >= 2) { + output_mesh_path = argv[1]; + } + + nvblox::SphereBenchmark benchmark; + benchmark.runBenchmark(""); + + if (!output_mesh_path.empty()) { + benchmark.outputMesh(output_mesh_path); + } + + std::cout << nvblox::timing::Timing::Print(); + + return 0; +} diff --git a/nvblox/src/core/bounding_boxes.cpp b/nvblox/src/core/bounding_boxes.cpp new file mode 100644 index 00000000..6752035f --- /dev/null +++ b/nvblox/src/core/bounding_boxes.cpp @@ -0,0 +1,73 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/bounding_boxes.h" + +namespace nvblox { + +AxisAlignedBoundingBox getAABBOfObservedVoxels(const EsdfLayer& layer) { + AxisAlignedBoundingBox aabb; + auto lambda = [&aabb, &layer](const Index3D& block_index, + const Index3D& voxel_index, + const EsdfVoxel* voxel) -> void { + if (voxel->observed) { + Vector3f p = getPositionFromBlockIndexAndVoxelIndex( + layer.block_size(), block_index, voxel_index); + aabb.extend(p); + p += layer.block_size() * Vector3f::Ones(); + aabb.extend(p); + } + }; + callFunctionOnAllVoxels(layer, lambda); + return aabb; +} + +AxisAlignedBoundingBox getAABBOfObservedVoxels(const TsdfLayer& layer, + const float min_weight) { + AxisAlignedBoundingBox aabb; + auto lambda = [&aabb, &layer, min_weight](const Index3D& block_index, + const Index3D& voxel_index, + const TsdfVoxel* voxel) -> void { + if (voxel->weight > min_weight) { + Vector3f p = getPositionFromBlockIndexAndVoxelIndex( + layer.block_size(), block_index, voxel_index); + aabb.extend(p); + p += layer.block_size() * Vector3f::Ones(); + aabb.extend(p); + } + }; + callFunctionOnAllVoxels(layer, lambda); + return aabb; +} + +AxisAlignedBoundingBox getAABBOfObservedVoxels(const ColorLayer& layer, + const float min_weight) { + AxisAlignedBoundingBox aabb; + auto lambda = [&aabb, &layer, min_weight](const Index3D& block_index, + const Index3D& voxel_index, + const ColorVoxel* voxel) -> void { + if (voxel->weight > min_weight) { + Vector3f p = getPositionFromBlockIndexAndVoxelIndex( + layer.block_size(), block_index, voxel_index); + aabb.extend(p); + p += layer.block_size() * Vector3f::Ones(); + aabb.extend(p); + } + }; + callFunctionOnAllVoxels(layer, lambda); + return aabb; +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/core/camera.cpp b/nvblox/src/core/camera.cpp new file mode 100644 index 00000000..97d6e898 --- /dev/null +++ b/nvblox/src/core/camera.cpp @@ 
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/core/camera.h"
+
+namespace nvblox {
+
+AxisAlignedBoundingBox Camera::getViewAABB(const Transform& T_L_C,
+                                           const float min_depth,
+                                           const float max_depth) const {
+  // Get the bounding corners of this view.
+  Eigen::Matrix<float, 8, 3> corners_C = getViewCorners(min_depth, max_depth);
+
+  Vector3f aabb_min, aabb_max;
+  aabb_min.setConstant(std::numeric_limits<float>::max());
+  aabb_max.setConstant(std::numeric_limits<float>::lowest());
+
+  // Transform it into the layer coordinate frame.
+  for (int i = 0; i < corners_C.rows(); i++) {
+    const Vector3f& corner_C = corners_C.row(i);
+    Vector3f corner_L = T_L_C * corner_C;
+    for (int j = 0; j < 3; j++) {
+      aabb_min(j) = std::min(aabb_min(j), corner_L(j));
+      aabb_max(j) = std::max(aabb_max(j), corner_L(j));
+    }
+  }
+
+  return AxisAlignedBoundingBox(aabb_min, aabb_max);
+}
+
+Frustum Camera::getViewFrustum(const Transform& T_L_C, const float min_depth,
+                               const float max_depth) const {
+  return Frustum(*this, T_L_C, min_depth, max_depth);
+}
+
+Eigen::Matrix<float, 8, 3> Camera::getViewCorners(const float min_depth,
+                                                  const float max_depth) const {
+  // Rays through the corners of the image plane
+  // Clockwise from the top left corner of the image.
+  const Vector3f ray_0_C =
+      rayFromImagePlaneCoordinates(Vector2f(0.0f, 0.0f));      // NOLINT
+  const Vector3f ray_1_C =
+      rayFromImagePlaneCoordinates(Vector2f(width_, 0.0f));    // NOLINT
+  const Vector3f ray_2_C =
+      rayFromImagePlaneCoordinates(Vector2f(width_, height_)); // NOLINT
+  const Vector3f ray_3_C =
+      rayFromImagePlaneCoordinates(Vector2f(0.0f, height_));   // NOLINT
+
+  // True bounding box from the 3D points
+  Eigen::Matrix<float, 8, 3> corners_C;
+  corners_C.row(0) = min_depth * ray_2_C;
+  corners_C.row(1) = min_depth * ray_1_C;
+  corners_C.row(2) = min_depth * ray_0_C;
+  corners_C.row(3) = min_depth * ray_3_C;
+  corners_C.row(4) = max_depth * ray_2_C;
+  corners_C.row(5) = max_depth * ray_1_C;
+  corners_C.row(6) = max_depth * ray_0_C;
+  corners_C.row(7) = max_depth * ray_3_C;
+  return corners_C;
+}
+
+// Frustum definitions.
+Frustum::Frustum(const Camera& camera, const Transform& T_L_C, float min_depth,
+                 float max_depth) {
+  Eigen::Matrix<float, 8, 3> corners_C =
+      camera.getViewCorners(min_depth, max_depth);
+  computeBoundingPlanes(corners_C, T_L_C);
+}
+void Frustum::computeBoundingPlanes(const Eigen::Matrix<float, 8, 3>& corners_C,
+                                    const Transform& T_L_C) {
+  // Transform the corners.
+  const Eigen::Matrix<float, 8, 3> corners_L =
+      (T_L_C * corners_C.transpose()).transpose();
+
+  // Near plane first.
+  bounding_planes_L_[0].setFromPoints(corners_L.row(0), corners_L.row(2),
+                                      corners_L.row(1));
+  // Far plane.
+  bounding_planes_L_[1].setFromPoints(corners_L.row(4), corners_L.row(5),
+                                      corners_L.row(6));
+  // Left.
+  bounding_planes_L_[2].setFromPoints(corners_L.row(3), corners_L.row(7),
+                                      corners_L.row(6));
+  // Right.
+  bounding_planes_L_[3].setFromPoints(corners_L.row(0), corners_L.row(5),
+                                      corners_L.row(4));
+  // Top.
+  bounding_planes_L_[4].setFromPoints(corners_L.row(3), corners_L.row(4),
+                                      corners_L.row(7));
+  // Bottom.
+  bounding_planes_L_[5].setFromPoints(corners_L.row(2), corners_L.row(6),
+                                      corners_L.row(5));
+
+  // Calculate AABB.
+  Vector3f aabb_min, aabb_max;
+
+  aabb_min.setConstant(std::numeric_limits<float>::max());
+  aabb_max.setConstant(std::numeric_limits<float>::lowest());
+
+  for (int i = 0; i < corners_L.cols(); i++) {
+    for (int j = 0; j < corners_L.rows(); j++) {
+      aabb_min(i) = std::min(aabb_min(i), corners_L(j, i));
+      aabb_max(i) = std::max(aabb_max(i), corners_L(j, i));
+    }
+  }
+
+  aabb_ = AxisAlignedBoundingBox(aabb_min, aabb_max);
+}
+
+bool Frustum::isPointInView(const Vector3f& point) const {
+  // Skip the AABB check; assume it has already been done.
+  for (size_t i = 0; i < bounding_planes_L_.size(); i++) {
+    if (!bounding_planes_L_[i].isPointInside(point)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool Frustum::isAABBInView(const AxisAlignedBoundingBox& aabb) const {
+  // If we're not even close, don't bother checking the planes.
+  if (!aabb_.intersects(aabb)) {
+    return false;
+  }
+  constexpr int kNumCorners = 8;
+
+  // Check the center of the bounding box to see if it's within the frustum.
+  // This covers a corner case where the given AABB is larger than the
+  // frustum.
+  if (isPointInView(aabb.center())) {
+    return true;
+  }
+
+  // Iterate over all the corners of the bounding box and see if any are
+  // within the view frustum.
+  for (int i = 0; i < kNumCorners; i++) {
+    if (isPointInView(
+            aabb.corner(static_cast<AxisAlignedBoundingBox::CornerType>(i)))) {
+      return true;
+    }
+  }
+  return false;
+}
+
+// Bounding plane definitions.
+void BoundingPlane::setFromPoints(const Vector3f& p1, const Vector3f& p2,
+                                  const Vector3f& p3) {
+  Vector3f p1p2 = p2 - p1;
+  Vector3f p1p3 = p3 - p1;
+
+  Vector3f cross = p1p2.cross(p1p3);
+  normal_ = cross.normalized();
+  distance_ = normal_.dot(p1);
+}
+
+void BoundingPlane::setFromDistanceNormal(const Vector3f& normal,
+                                          float distance) {
+  normal_ = normal;
+  distance_ = distance;
+}
+
+bool BoundingPlane::isPointInside(const Vector3f& point) const {
+  return point.dot(normal_) >= distance_;
+}
+
+}  // namespace nvblox
diff --git a/nvblox/src/core/color.cpp b/nvblox/src/core/color.cpp
new file mode 100644
index 00000000..3132cc3c
--- /dev/null
+++ b/nvblox/src/core/color.cpp
@@ -0,0 +1,38 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#include "nvblox/core/color.h" + +namespace nvblox { + +Color Color::blendTwoColors(const Color& first_color, float first_weight, + const Color& second_color, float second_weight) { + float total_weight = first_weight + second_weight; + + first_weight /= total_weight; + second_weight /= total_weight; + + Color new_color; + new_color.r = static_cast(std::round( + first_color.r * first_weight + second_color.r * second_weight)); + new_color.g = static_cast(std::round( + first_color.g * first_weight + second_color.g * second_weight)); + new_color.b = static_cast(std::round( + first_color.b * first_weight + second_color.b * second_weight)); + + return new_color; +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/core/cuda/blox.cu b/nvblox/src/core/cuda/blox.cu new file mode 100644 index 00000000..b5872055 --- /dev/null +++ b/nvblox/src/core/cuda/blox.cu @@ -0,0 +1,44 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/blox.h" +#include "nvblox/core/common_names.h" + +namespace nvblox { + +// Must be called with: +// - a single block +// - one thread per voxel +__global__ void setColorBlockGray(ColorBlock* block_device_ptr) { + ColorVoxel* voxel_ptr = + &block_device_ptr->voxels[threadIdx.z][threadIdx.y][threadIdx.x]; + voxel_ptr->color.r = 127; + voxel_ptr->color.g = 127; + voxel_ptr->color.b = 127; + voxel_ptr->weight = 0.0f; +} + +void setColorBlockGrayOnGPU(ColorBlock* block_device_ptr) { + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide); + setColorBlockGray<<<1, kThreadsPerBlock>>>(block_device_ptr); + // NOTE(alexmillane): At the moment we launch this allocation on the default + // stream which implicitly synchronizes. At some point in the future we should + // probably move this to a stream. + // checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaPeekAtLastError()); +} + +} // namespace nvblox diff --git a/nvblox/src/core/cuda/error_check.cu b/nvblox/src/core/cuda/error_check.cu new file mode 100644 index 00000000..0d4bc34d --- /dev/null +++ b/nvblox/src/core/cuda/error_check.cu @@ -0,0 +1,34 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "nvblox/core/cuda/error_check.cuh" + +#include + +namespace nvblox { + +void check_cuda(cudaError_t result, char const* const func, + const char* const file, int const line) { + if (result) { + std::cerr << "CUDA error = " << static_cast(result) << " at " + << file << ":" << line << " '" << func + << "'. Error string: " << cudaGetErrorString(result) << ".\n"; + // Make sure we call CUDA Device Reset before exiting + cudaDeviceReset(); + exit(99); + } +} + +} // namespace nvblox diff --git a/nvblox/src/core/cuda/image_cuda.cu b/nvblox/src/core/cuda/image_cuda.cu new file mode 100644 index 00000000..51e3a861 --- /dev/null +++ b/nvblox/src/core/cuda/image_cuda.cu @@ -0,0 +1,51 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/cuda/image_cuda.h" + +#include +#include +#include + +#include "nvblox/core/cuda/error_check.cuh" + +namespace nvblox { +namespace cuda { + +float max(const int rows, const int cols, const float* image) { + const thrust::device_ptr dev_ptr(image); + const thrust::device_ptr max_elem = + thrust::max_element(thrust::device, dev_ptr, dev_ptr + (rows * cols)); + return *max_elem; +} + +float min(const int rows, const int cols, const float* image) { + const thrust::device_ptr dev_ptr(image); + const thrust::device_ptr min_elem = + thrust::min_element(thrust::device, dev_ptr, dev_ptr + (rows * cols)); + return *min_elem; +} + +std::pair minmax(const int rows, const int cols, + const float* image) { + // Wrap our memory and reduce using thrust + const thrust::device_ptr dev_ptr(image); + const auto minmax_elem = + thrust::minmax_element(thrust::device, dev_ptr, dev_ptr + (rows * cols)); + return {*minmax_elem.first, *minmax_elem.second}; +} + +} // namespace cuda +} // namespace nvblox diff --git a/nvblox/src/core/cuda/warmup.cu b/nvblox/src/core/cuda/warmup.cu new file mode 100644 index 00000000..4a3cb861 --- /dev/null +++ b/nvblox/src/core/cuda/warmup.cu @@ -0,0 +1,32 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "nvblox/core/cuda/warmup.h" + +namespace nvblox { + +__global__ void warm_up_gpu() { + unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; + float ia, ib; + ia = ib = 0.0f; + ib += ia + tid ; +} + +void warmupCuda() { + warm_up_gpu<<<64, 128>>>(); + cudaDeviceSynchronize(); +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/core/image.cpp b/nvblox/src/core/image.cpp new file mode 100644 index 00000000..8f9f74db --- /dev/null +++ b/nvblox/src/core/image.cpp @@ -0,0 +1,71 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/image.h" + +#include + +#include "nvblox/core/cuda/image_cuda.h" +#include "nvblox/io/csv.h" + +namespace nvblox { +namespace image { + +float max(const DepthImage& image) { return maxGPU(image); } + +float min(const DepthImage& image) { return minGPU(image); } + +std::pair minmax(const DepthImage& image) { + return minmaxGPU(image); +} + +float maxGPU(const DepthImage& image) { + return cuda::max(image.rows(), image.cols(), image.dataConstPtr()); +} + +float minGPU(const DepthImage& image) { + return cuda::min(image.rows(), image.cols(), image.dataConstPtr()); +} + +std::pair minmaxGPU(const DepthImage& image) { + const auto minmax_elem = + cuda::minmax(image.rows(), image.cols(), image.dataConstPtr()); + return {minmax_elem.first, minmax_elem.second}; +} + +float maxCPU(const DepthImage& image) { + CHECK(image.memory_type() == MemoryType::kUnified) + << "CPU function called on kDevice image."; + return *std::max_element(image.dataConstPtr(), + image.dataConstPtr() + image.numel()); +} + +float minCPU(const DepthImage& image) { + CHECK(image.memory_type() == MemoryType::kUnified) + << "CPU function called on kDevice image."; + return *std::min_element(image.dataConstPtr(), + image.dataConstPtr() + image.numel()); +} + +std::pair minmaxCPU(const DepthImage& image) { + CHECK(image.memory_type() == MemoryType::kUnified) + << "CPU function called on kDevice image."; + const auto res = std::minmax_element(image.dataConstPtr(), + image.dataConstPtr() + image.numel()); + return {*res.first, *res.second}; +} + +} // namespace image +} // namespace nvblox diff --git a/nvblox/src/core/interpolation_3d.cpp b/nvblox/src/core/interpolation_3d.cpp new file mode 100644 index 00000000..ccb1fc7d --- /dev/null +++ b/nvblox/src/core/interpolation_3d.cpp @@ -0,0 +1,57 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "nvblox/core/interpolation_3d.h" + +#include + +namespace nvblox { +namespace interpolation { + +bool interpolateOnCPU(const Vector3f& p_L, const TsdfLayer& layer, + float* distance_ptr) { + CHECK_NOTNULL(distance_ptr); + CHECK(layer.memory_type() == MemoryType::kUnified) + << "For CPU-based interpolation, the layer must be CPU accessible (ie " + "MemoryType::kUnified)."; + auto get_distance_lambda = [](const TsdfVoxel& voxel) -> float { + return voxel.distance; + }; + constexpr float kMinWeight = 1e-4; + auto voxel_valid_lambda = [](const TsdfVoxel& voxel) -> bool { + return voxel.weight > kMinWeight; + }; + return internal::interpolateMemberOnCPU( + p_L, layer, get_distance_lambda, voxel_valid_lambda, distance_ptr); +} + +bool interpolateOnCPU(const Vector3f& p_L, const EsdfLayer& layer, + float* distance_ptr) { + CHECK_NOTNULL(distance_ptr); + CHECK(layer.memory_type() == MemoryType::kUnified) + << "For CPU-based interpolation, the layer must be CPU accessible (ie " + "MemoryType::kUnified)."; + auto get_distance_lambda = [](const EsdfVoxel& voxel) -> float { + return std::sqrt(voxel.squared_distance_vox); + }; + auto voxel_valid_lambda = [](const EsdfVoxel& voxel) -> bool { + return voxel.observed; + }; + return internal::interpolateMemberOnCPU( + p_L, layer, get_distance_lambda, voxel_valid_lambda, distance_ptr); +} + +} // namespace interpolation +} // namespace nvblox diff --git a/nvblox/src/core/mapper.cpp b/nvblox/src/core/mapper.cpp new file mode 100644 index 00000000..1b691462 --- /dev/null +++ b/nvblox/src/core/mapper.cpp @@ -0,0 +1,90 @@ +#include "nvblox/core/mapper.h" + +namespace nvblox { + +RgbdMapper::RgbdMapper(float voxel_size_m, MemoryType memory_type) + : voxel_size_m_(voxel_size_m) { + layers_ = LayerCake::create( + voxel_size_m_, memory_type); +} + +void RgbdMapper::integrateDepth(const DepthImage& depth_frame, + const Transform& T_L_C, const Camera& camera) { + // Call the integrator. + std::vector updated_blocks; + tsdf_integrator_.integrateFrame(depth_frame, T_L_C, camera, + layers_.getPtr(), &updated_blocks); + + // Update all the relevant queues. + mesh_blocks_to_update_.insert(updated_blocks.begin(), updated_blocks.end()); + esdf_blocks_to_update_.insert(updated_blocks.begin(), updated_blocks.end()); +} + +void RgbdMapper::integrateColor(const ColorImage& color_frame, + const Transform& T_L_C, const Camera& camera) { + color_integrator_.integrateFrame(color_frame, T_L_C, camera, + layers_.get(), + layers_.getPtr()); +} + +std::vector RgbdMapper::updateMesh() { + // Convert the set of MeshBlocks needing an update to a vector + std::vector mesh_blocks_to_update_vector( + mesh_blocks_to_update_.begin(), mesh_blocks_to_update_.end()); + + // Call the integrator. + mesh_integrator_.integrateBlocksGPU(layers_.get(), + mesh_blocks_to_update_vector, + layers_.getPtr()); + + mesh_integrator_.colorMesh(layers_.get(), + mesh_blocks_to_update_vector, + layers_.getPtr()); + + // Mark blocks as updated + mesh_blocks_to_update_.clear(); + + return mesh_blocks_to_update_vector; +} + +std::vector RgbdMapper::updateEsdf() { + CHECK(esdf_mode_ != EsdfMode::k2D) + << "Currently, we limit computation of the ESDF to 2d *or* 3d. 
Not both."; + esdf_mode_ = EsdfMode::k3D; + + // Convert the set of EsdfBlocks needing an update to a vector + std::vector esdf_blocks_to_update_vector( + esdf_blocks_to_update_.begin(), esdf_blocks_to_update_.end()); + + esdf_integrator_.integrateBlocksOnGPU(layers_.get(), + esdf_blocks_to_update_vector, + layers_.getPtr()); + + // Mark blocks as updated + esdf_blocks_to_update_.clear(); + + return esdf_blocks_to_update_vector; +} + +std::vector RgbdMapper::updateEsdfSlice(float slice_input_z_min, + float slice_input_z_max, + float slice_output_z) { + CHECK(esdf_mode_ != EsdfMode::k3D) + << "Currently, we limit computation of the ESDF to 2d *or* 3d. Not both."; + esdf_mode_ = EsdfMode::k2D; + + // Convert the set of MeshBlocks needing an update to a vector + std::vector esdf_blocks_to_update_vector( + esdf_blocks_to_update_.begin(), esdf_blocks_to_update_.end()); + + esdf_integrator_.integrateSliceOnGPU( + layers_.get(), esdf_blocks_to_update_vector, slice_input_z_min, + slice_input_z_max, slice_output_z, layers_.getPtr()); + + // Mark blocks as updated + esdf_blocks_to_update_.clear(); + + return esdf_blocks_to_update_vector; +} + +} // namespace nvblox diff --git a/nvblox/src/datasets/image_loader.cpp b/nvblox/src/datasets/image_loader.cpp new file mode 100644 index 00000000..af952e2c --- /dev/null +++ b/nvblox/src/datasets/image_loader.cpp @@ -0,0 +1,101 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/datasets/image_loader.h" + +#include + +#include "nvblox/utils/timing.h" + +#define STB_IMAGE_IMPLEMENTATION +#include "nvblox/datasets/external/stb_image.h" + +namespace nvblox { +namespace datasets { + +bool load16BitDepthImage(const std::string& filename, + DepthImage* depth_frame_ptr, MemoryType memory_type) { + CHECK_NOTNULL(depth_frame_ptr); + timing::Timer stbi_timer("file_loading/depth_image/stbi"); + int width, height, num_channels; + uint16_t* image_data = + stbi_load_16(filename.c_str(), &width, &height, &num_channels, 0); + stbi_timer.Stop(); + + if (image_data == nullptr) { + return false; + } + CHECK_EQ(num_channels, 1); + + // The image is expressed in mm. We need to divide by 1000 and convert to + // float. + // TODO(alexmillane): It's likely better to do this on the GPU. + // Follow up: I measured and this scaling + cast takes + // ~1ms. So only do this when 1ms is relevant. 
+  std::vector<float> float_image_data(height * width);
+  for (size_t lin_idx = 0; lin_idx < float_image_data.size(); lin_idx++) {
+    float_image_data[lin_idx] =
+        static_cast<float>(image_data[lin_idx]) / 1000.0f;
+  }
+
+  *depth_frame_ptr = DepthImage::fromBuffer(
+      height, width, float_image_data.data(), memory_type);
+
+  stbi_image_free(image_data);
+  return true;
+}
+
+bool load8BitColorImage(const std::string& filename,
+                        ColorImage* color_image_ptr, MemoryType memory_type) {
+  CHECK_NOTNULL(color_image_ptr);
+  timing::Timer stbi_timer("file_loading/color_image/stbi");
+  int width, height, num_channels;
+  uint8_t* image_data =
+      stbi_load(filename.c_str(), &width, &height, &num_channels, 0);
+  stbi_timer.Stop();
+
+  if (image_data == nullptr) {
+    return false;
+  }
+  CHECK_EQ(num_channels, 3);
+
+  CHECK_EQ(sizeof(Color), 3 * sizeof(uint8_t))
+      << "Color is padded so image loading won't work.";
+
+  *color_image_ptr = ColorImage::fromBuffer(
+      height, width, reinterpret_cast<const Color*>(image_data), memory_type);
+
+  stbi_image_free(image_data);
+  return true;
+}
+
+template <>
+bool ImageLoader<DepthImage>::getImage(int image_idx, DepthImage* image_ptr) {
+  CHECK_NOTNULL(image_ptr);
+  bool res = load16BitDepthImage(index_to_filepath_(image_idx), image_ptr,
+                                 memory_type_);
+  return res;
+}
+
+template <>
+bool ImageLoader<ColorImage>::getImage(int image_idx, ColorImage* image_ptr) {
+  CHECK_NOTNULL(image_ptr);
+  bool res = load8BitColorImage(index_to_filepath_(image_idx), image_ptr,
+                                memory_type_);
+  return res;
+}
+
+}  // namespace datasets
+}  // namespace nvblox
diff --git a/nvblox/src/datasets/parse_3dmatch.cpp b/nvblox/src/datasets/parse_3dmatch.cpp
new file mode 100644
index 00000000..cb720493
--- /dev/null
+++ b/nvblox/src/datasets/parse_3dmatch.cpp
@@ -0,0 +1,138 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/datasets/parse_3dmatch.h"
+
+#include <glog/logging.h>
+
+#include <fstream>
+#include <functional>
+#include <iomanip>
+#include <sstream>
+
+namespace nvblox {
+namespace datasets {
+namespace threedmatch {
+
+// Parses a 4x4 matrix from a text file in 3D Match format: space-separated.
+bool parsePoseFromFile(const std::string& filename, Transform* transform) {
+  CHECK_NOTNULL(transform);
+  constexpr int kDimension = 4;
+
+  std::ifstream fin(filename);
+  if (fin.is_open()) {
+    for (int row = 0; row < kDimension; row++)
+      for (int col = 0; col < kDimension; col++) {
+        float item = 0.0;
+        fin >> item;
+        (*transform)(row, col) = item;
+      }
+    fin.close();
+    return true;
+  }
+  return false;
+}
+
+// Parse 3x3 camera intrinsics matrix from 3D Match format: space-separated.
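+// The file holds a standard pinhole matrix K, read row by row:
+// fx 0 cx / 0 fy cy / 0 0 1.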
+bool parseCameraFromFile(const std::string& filename,
+                         Eigen::Matrix3f* intrinsics) {
+  CHECK_NOTNULL(intrinsics);
+  constexpr int kDimension = 3;
+
+  std::ifstream fin(filename);
+  if (fin.is_open()) {
+    for (int row = 0; row < kDimension; row++)
+      for (int col = 0; col < kDimension; col++) {
+        float item = 0.0;
+        fin >> item;
+        (*intrinsics)(row, col) = item;
+      }
+    fin.close();
+
+    return true;
+  }
+  return false;
+}
+
+std::string getPathForCameraIntrinsics(const std::string& base_path) {
+  return base_path + "/camera-intrinsics.txt";
+}
+
+std::string getPathForFramePose(const std::string& base_path, const int seq_id,
+                                const int frame_id) {
+  std::stringstream ss;
+  ss << base_path << "/seq-" << std::setfill('0') << std::setw(2) << seq_id
+     << "/frame-" << std::setw(6) << frame_id << ".pose.txt";
+
+  return ss.str();
+}
+
+std::string getPathForDepthImage(const std::string& base_path,
+                                 const int seq_id, const int frame_id) {
+  std::stringstream ss;
+  ss << base_path << "/seq-" << std::setfill('0') << std::setw(2) << seq_id
+     << "/frame-" << std::setw(6) << frame_id << ".depth.png";
+
+  return ss.str();
+}
+
+std::string getPathForColorImage(const std::string& base_path,
+                                 const int seq_id, const int frame_id) {
+  std::stringstream ss;
+  ss << base_path << "/seq-" << std::setfill('0') << std::setw(2) << seq_id
+     << "/frame-" << std::setw(6) << frame_id << ".color.png";
+
+  return ss.str();
+}
+
+std::unique_ptr<ImageLoader<DepthImage>>
+createDepthImageLoader(const std::string& base_path, const int seq_id,
+                       MemoryType memory_type) {
+  return std::make_unique<ImageLoader<DepthImage>>(
+      std::bind(getPathForDepthImage, base_path, seq_id,
+                std::placeholders::_1),
+      memory_type);
+}
+
+std::unique_ptr<ImageLoader<ColorImage>>
+createColorImageLoader(const std::string& base_path, const int seq_id,
+                       MemoryType memory_type) {
+  return std::make_unique<ImageLoader<ColorImage>>(
+      std::bind(getPathForColorImage, base_path, seq_id,
+                std::placeholders::_1),
+      memory_type);
+}
+
+std::unique_ptr<ImageLoader<DepthImage>>
+createMultithreadedDepthImageLoader(const std::string& base_path,
+                                    const int seq_id, const int num_threads,
+                                    MemoryType memory_type) {
+  return std::make_unique<
+      nvblox::datasets::MultiThreadedImageLoader<DepthImage>>(
+      std::bind(getPathForDepthImage, base_path, seq_id,
+                std::placeholders::_1),
+      num_threads, memory_type);
+}
+
+std::unique_ptr<ImageLoader<ColorImage>>
+createMultithreadedColorImageLoader(const std::string& base_path,
+                                    const int seq_id, const int num_threads,
+                                    MemoryType memory_type) {
+  return std::make_unique<
+      nvblox::datasets::MultiThreadedImageLoader<ColorImage>>(
+      std::bind(getPathForColorImage, base_path, seq_id,
+                std::placeholders::_1),
+      num_threads, memory_type);
+}
+
+}  // namespace threedmatch
+}  // namespace datasets
+}  // namespace nvblox
diff --git a/nvblox/src/gpu_hash/cuda/gpu_layer_view.cu b/nvblox/src/gpu_hash/cuda/gpu_layer_view.cu
new file mode 100644
index 00000000..6bb0bdaf
--- /dev/null
+++ b/nvblox/src/gpu_hash/cuda/gpu_layer_view.cu
@@ -0,0 +1,33 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/ +#include "nvblox/gpu_hash/gpu_layer_view.h" + +#include +#include +#include + +#include "nvblox/core/common_names.h" +#include "nvblox/gpu_hash/cuda/impl/gpu_layer_view_impl.cuh" + +namespace nvblox { + +// Compile Specializations for the standard block types. +template class GPULayerView; +template class GPULayerView; +template class GPULayerView; +template class GPULayerView; + +} // namespace nvblox diff --git a/nvblox/src/gpu_hash/cuda/gpu_set.cu b/nvblox/src/gpu_hash/cuda/gpu_set.cu new file mode 100644 index 00000000..3d1e00ea --- /dev/null +++ b/nvblox/src/gpu_hash/cuda/gpu_set.cu @@ -0,0 +1,33 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/gpu_hash/cuda/gpu_set.cuh" + +namespace nvblox { + +Index3DDeviceSet::Index3DDeviceSet(size_t size) { + set = Index3DDeviceSet_t::createDeviceObject(static_cast(size)); +} +Index3DDeviceSet::~Index3DDeviceSet() { + Index3DDeviceSet_t::destroyDeviceObject(set); +} + +void copySetToVector(const Index3DDeviceSet_t& set, std::vector* vec) { + vec->resize(set.size()); + auto set_iter = set.device_range(); + thrust::copy_n(set_iter.begin(), set_iter.size(), vec->begin()); +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/integrators/cuda/esdf_integrator.cu b/nvblox/src/integrators/cuda/esdf_integrator.cu new file mode 100644 index 00000000..7c7eeb43 --- /dev/null +++ b/nvblox/src/integrators/cuda/esdf_integrator.cu @@ -0,0 +1,1214 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/gpu_hash/cuda/gpu_hash_interface.cuh" +#include "nvblox/gpu_hash/cuda/gpu_indexing.cuh" +#include "nvblox/utils/timing.h" + +#include "nvblox/integrators/esdf_integrator.h" + +namespace nvblox { + +EsdfIntegrator::~EsdfIntegrator() { + if (cuda_stream_ != nullptr) { + cudaStreamDestroy(cuda_stream_); + } +} + +void EsdfIntegrator::integrateBlocksOnGPU( + const TsdfLayer& tsdf_layer, const std::vector& block_indices, + EsdfLayer* esdf_layer) { + timing::Timer esdf_timer("esdf/integrate"); + + if (block_indices.empty()) { + return; + } + + // First, check if the stream exists. If not, create one. + if (cuda_stream_ == nullptr) { + checkCudaErrors(cudaStreamCreate(&cuda_stream_)); + } + + timing::Timer allocate_timer("esdf/integrate/allocate"); + // First, allocate all the destination blocks. + allocateBlocksOnCPU(block_indices, esdf_layer); + allocate_timer.Stop(); + + timing::Timer mark_timer("esdf/integrate/mark_sites"); + // Then, mark all the sites on GPU. 
+  // This finds all the blocks that are eligible to be parents.
+  std::vector<Index3D> blocks_with_sites;
+  std::vector<Index3D> blocks_to_clear;
+  markAllSitesOnGPU(tsdf_layer, block_indices, esdf_layer, &blocks_with_sites,
+                    &blocks_to_clear);
+  mark_timer.Stop();
+
+  std::vector<Index3D> cleared_blocks;
+  if (!blocks_to_clear.empty()) {
+    timing::Timer compute_timer("esdf/integrate/clear");
+    clearInvalidOnGPU(blocks_to_clear, esdf_layer, &cleared_blocks);
+    std::vector<Index3D> all_clear_updated;
+  }
+
+  timing::Timer compute_timer("esdf/integrate/compute");
+  // Parallel block banding on GPU.
+  computeEsdfOnGPU(blocks_with_sites, esdf_layer);
+  if (!cleared_blocks.empty()) {
+    computeEsdfOnGPU(cleared_blocks, esdf_layer);
+  }
+  compute_timer.Stop();
+}
+
+void EsdfIntegrator::integrateSliceOnGPU(
+    const TsdfLayer& tsdf_layer, const std::vector<Index3D>& block_indices,
+    float z_min, float z_max, float z_output, EsdfLayer* esdf_layer) {
+  timing::Timer esdf_timer("esdf/integrate_slice");
+
+  if (block_indices.empty()) {
+    return;
+  }
+
+  // First, check if the stream exists. If not, create one.
+  if (cuda_stream_ == nullptr) {
+    checkCudaErrors(cudaStreamCreate(&cuda_stream_));
+  }
+
+  timing::Timer allocate_timer("esdf/integrate_slice/allocate");
+  // First, allocate all the destination blocks.
+  allocateBlocksOnCPU(block_indices, esdf_layer);
+  allocate_timer.Stop();
+
+  timing::Timer mark_timer("esdf/integrate_slice/mark_sites");
+  // Then, mark all the sites on GPU.
+  // This finds all the blocks that are eligible to be parents.
+  std::vector<Index3D> blocks_with_sites;
+  std::vector<Index3D> blocks_to_clear;
+  markSitesInSliceOnGPU(tsdf_layer, block_indices, z_min, z_max, z_output,
+                        esdf_layer, &blocks_with_sites, &blocks_to_clear);
+  mark_timer.Stop();
+
+  std::vector<Index3D> cleared_blocks;
+  if (!blocks_to_clear.empty()) {
+    timing::Timer compute_timer("esdf/integrate_slice/clear");
+    clearInvalidOnGPU(blocks_to_clear, esdf_layer, &cleared_blocks);
+    std::vector<Index3D> all_clear_updated;
+  }
+
+  timing::Timer compute_timer("esdf/integrate_slice/compute");
+  // Parallel block banding on GPU.
+  computeEsdfOnGPU(blocks_with_sites, esdf_layer);
+
+  if (!cleared_blocks.empty()) {
+    computeEsdfOnGPU(cleared_blocks, esdf_layer);
+  }
+  compute_timer.Stop();
+}
+
+__device__ void clearVoxelDevice(EsdfVoxel* voxel,
+                                 float max_squared_distance_vox) {
+  voxel->parent_direction.setZero();
+  voxel->squared_distance_vox = max_squared_distance_vox;
+}
+
+// Takes in a vector of blocks and marks all site voxels, flagging blocks that
+// gained sites (updated) and blocks containing invalidated voxels (to_clear).
+// Thread block size MUST be voxels_per_side x voxels_per_side x
+// voxels_per_side. Grid size can be anything.
+__global__ void markAllSitesKernel(int num_blocks,
+                                   const TsdfBlock** tsdf_blocks,
+                                   EsdfBlock** esdf_blocks,
+                                   float min_site_distance_m, float min_weight,
+                                   float max_squared_distance_vox,
+                                   bool* updated, bool* to_clear) {
+  dim3 voxel_index = threadIdx;
+  // This for loop allows us to have fewer threadblocks than there are
+  // blocks in this computation. We assume the threadblock size is constant
+  // though to make our lives easier.
+  for (int block_index = blockIdx.x; block_index < num_blocks;
+       block_index += gridDim.x) {
+    // Get the correct voxel for this index.
+    const TsdfVoxel* tsdf_voxel =
+        &tsdf_blocks[block_index]
+             ->voxels[voxel_index.x][voxel_index.y][voxel_index.z];
+    EsdfVoxel* esdf_voxel =
+        &esdf_blocks[block_index]
+             ->voxels[voxel_index.x][voxel_index.y][voxel_index.z];
+    if (tsdf_voxel->weight >= min_weight) {
+      // Mark as inside if the voxel distance is negative.
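+      // A sign flip (inside to outside) invalidates any distance previously
+      // propagated through this voxel, so the block is queued for clearing.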
+      bool is_inside = tsdf_voxel->distance <= 0.0f;
+      if (esdf_voxel->is_inside && !is_inside) {
+        clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+        to_clear[block_index] = true;
+      }
+      esdf_voxel->is_inside = is_inside;
+      if (is_inside && fabsf(tsdf_voxel->distance) <= min_site_distance_m) {
+        esdf_voxel->is_site = true;
+        esdf_voxel->squared_distance_vox = 0.0f;
+        esdf_voxel->parent_direction.setZero();
+        updated[block_index] = true;
+      } else {
+        if (esdf_voxel->is_site) {
+          esdf_voxel->is_site = false;
+          // This voxel needs to be cleared.
+          clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+          to_clear[block_index] = true;
+        } else if (!esdf_voxel->observed) {
+          // This is a brand new voxel.
+          clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+        } else if (esdf_voxel->squared_distance_vox <= 1e-4) {
+          // This is an invalid voxel that should be cleared.
+          clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+          to_clear[block_index] = true;
+        }
+      }
+      esdf_voxel->observed = true;
+    }
+  }
+}
+
+// From:
+// https://stackoverflow.com/questions/17399119/how-do-i-use-atomicmax-on-floating-point-values-in-cuda
+__device__ __forceinline__ float atomicMinFloat(float* addr, float value) {
+  float old;
+  old = (value >= 0)
+            ? __int_as_float(atomicMin((int*)addr, __float_as_int(value)))
+            : __uint_as_float(
+                  atomicMax((unsigned int*)addr, __float_as_uint(value)));
+
+  return old;
+}
+
+/// Thread size MUST be 8x8x8, block size can be anything.
+__global__ void markSitesInSliceKernel(
+    int num_input_blocks, int num_output_blocks, const TsdfBlock** tsdf_blocks,
+    EsdfBlock** esdf_blocks, int output_voxel_index, int input_min_voxel_index,
+    int input_max_voxel_index, float min_site_distance_m, float min_weight,
+    float max_squared_distance_vox, bool* updated, bool* cleared) {
+  dim3 voxel_index = threadIdx;
+  voxel_index.z = output_voxel_index;
+  int layer_index = threadIdx.z;
+  int num_layers = blockDim.z;
+
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+
+  __shared__ EsdfVoxel new_values[kVoxelsPerSide][kVoxelsPerSide];
+  __shared__ bool observed[kVoxelsPerSide][kVoxelsPerSide];
+  __shared__ float min_distance[kVoxelsPerSide][kVoxelsPerSide];
+
+  // Initialize these.
+  if (layer_index == 0) {
+    observed[voxel_index.x][voxel_index.y] = false;
+    min_distance[voxel_index.x][voxel_index.y] = 100.0f;
+  }
+  __syncthreads();
+
+  // This for loop allows us to have fewer threadblocks than there are
+  // blocks in this computation. We assume the threadblock size is constant
+  // though to make our lives easier.
+  for (int block_index = blockIdx.x; block_index < num_output_blocks;
+       block_index += gridDim.x) {
+    // Get the correct block for this.
+    const TsdfBlock* tsdf_block =
+        tsdf_blocks[block_index + num_output_blocks * layer_index];
+    // There's also null pointers in there.
+    if (tsdf_block != nullptr) {
+      // Iterate over all of the voxels in this block.
+      int start_index = 0;
+      int end_index = kVoxelsPerSide;
+      if (layer_index == 0) {
+        start_index = input_min_voxel_index;
+      }
+      if (layer_index == num_layers - 1) {
+        end_index = input_max_voxel_index;
+      }
+      for (int i = start_index; i < end_index; i++) {
+        const TsdfVoxel* tsdf_voxel =
+            &tsdf_block->voxels[voxel_index.x][voxel_index.y][i];
+        // EsdfVoxel* new_voxel = &new_values[voxel_index.x][voxel_index.y];
+        // Get the correct voxel for this index.
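+        // Fold this layer's TSDF distance into the shared-memory minimum for
+        // the (x, y) column; atomicMinFloat resolves races between the
+        // vertical layers.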
+        if (tsdf_voxel->weight >= min_weight) {
+          observed[voxel_index.x][voxel_index.y] = true;
+          atomicMinFloat(&min_distance[voxel_index.x][voxel_index.y],
+                         tsdf_voxel->distance);
+        }
+      }
+    }
+
+    // sync threads across everyone trying to update this voxel
+    __syncthreads();
+
+    // Ok now only if we're layer 0 do we compare the new and old values and
+    // decide what to output.
+    if (layer_index == 0) {
+      EsdfVoxel* esdf_voxel =
+          &esdf_blocks[block_index]
+               ->voxels[voxel_index.x][voxel_index.y][voxel_index.z];
+
+      // Case 0: Just skip it if it's unobserved. We don't care.
+      if (!observed[voxel_index.x][voxel_index.y]) {
+        continue;
+      }
+      // Determine if the new value puts us inside or in a site.
+      bool is_inside = min_distance[voxel_index.x][voxel_index.y] <= 0.0f;
+      bool is_site = fabsf(min_distance[voxel_index.x][voxel_index.y]) <=
+                         min_site_distance_m &&
+                     is_inside;
+
+      // First handle the case where the voxel is a site.
+      if (is_site) {
+        if (esdf_voxel->is_site) {
+          // Ok whatever. Add to the site list.
+          // Its existing values are fine.
+          updated[block_index] = true;
+        } else {
+          // Wasn't a site before, is now.
+          esdf_voxel->observed = true;
+          esdf_voxel->is_site = true;
+          clearVoxelDevice(esdf_voxel, 0.0f);
+          updated[block_index] = true;
+        }
+      } else {
+        // Here we have to double-check what's going on.
+        // If it was a site before, and isn't anymore, we have to clear it.
+        if (esdf_voxel->is_site) {
+          esdf_voxel->is_site = false;
+          clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+          cleared[block_index] = true;
+        }
+        // Otherwise just leave it alone unless it's brand new.
+        if (!esdf_voxel->observed) {
+          esdf_voxel->observed = true;
+          clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+        } else if (esdf_voxel->is_inside != is_inside) {
+          // In case the sidedness swapped, clear the voxel.
+          clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+          cleared[block_index] = true;
+        } else if (esdf_voxel->squared_distance_vox <= 0.0f) {
+          // This is somehow invalidly marked as a site despite the fact
+          // it shouldn't be.
+          clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+          cleared[block_index] = true;
+        }
+      }
+      // Make the sidedness match.
+      esdf_voxel->is_inside = is_inside;
+    }
+  }
+}
+
+__device__ void sweepSingleBand(Index3D voxel_index, int sweep_axis,
+                                float max_squared_distance_vox,
+                                EsdfBlock* esdf_block) {
+  constexpr int kVoxelsPerSide = VoxelBlock<EsdfVoxel>::kVoxelsPerSide;
+  Index3D last_site;
+  bool site_found;
+  // Sweep sweep sweep.
+  // First we sweep forward, then backwards.
+  for (int i = 0; i < 2; i++) {
+    last_site = Index3D::Zero();
+    site_found = false;
+    int direction = 1;
+    int start_voxel = 0;
+    int end_voxel = kVoxelsPerSide;
+    if (i == 1) {
+      direction = -1;
+      start_voxel = kVoxelsPerSide - 1;
+      end_voxel = -1;
+    }
+
+    for (voxel_index(sweep_axis) = start_voxel;
+         voxel_index(sweep_axis) != end_voxel;
+         voxel_index(sweep_axis) += direction) {
+      EsdfVoxel* esdf_voxel =
+          &esdf_block
+               ->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()];
+      if (!esdf_voxel->observed) {
+        continue;
+      }
+      // If this voxel is itself a site, then mark this for future voxels.
+      if (esdf_voxel->is_site) {
+        last_site = voxel_index;
+        site_found = true;
+      } else if (!site_found) {
+        // If this voxel isn't a site but we haven't found a site yet,
+        // then if this voxel is valid we set it as the site.
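+        // ("Valid" means its distance is below the maximum, i.e. it already
+        // inherited a parent in an earlier pass.)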
+        if (esdf_voxel->squared_distance_vox < max_squared_distance_vox) {
+          site_found = true;
+          last_site = esdf_voxel->parent_direction + voxel_index;
+        }
+      } else {
+        // If we've found the site, then we should just decide what to do
+        // here.
+        Index3D potential_direction = last_site - voxel_index;
+        float potential_distance = potential_direction.squaredNorm();
+        // Either it hasn't been set at all or it's closer to the site
+        // than to its current value.
+        if (esdf_voxel->squared_distance_vox > potential_distance) {
+          esdf_voxel->parent_direction = potential_direction;
+          esdf_voxel->squared_distance_vox = potential_distance;
+        } else if (esdf_voxel->squared_distance_vox <
+                   max_squared_distance_vox) {
+          // If the current value is a better site, then set it as a site.
+          last_site = esdf_voxel->parent_direction + voxel_index;
+        }
+      }
+    }
+  }
+}
+__device__ bool updateSingleNeighbor(const EsdfBlock* esdf_block,
+                                     const Index3D& voxel_index,
+                                     const Index3D& neighbor_voxel_index,
+                                     int axis, int direction,
+                                     float max_squared_distance_vox,
+                                     EsdfBlock* neighbor_block) {
+  const EsdfVoxel* esdf_voxel =
+      &esdf_block->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()];
+  EsdfVoxel* neighbor_voxel =
+      &neighbor_block
+           ->voxels[neighbor_voxel_index.x()][neighbor_voxel_index.y()]
+                   [neighbor_voxel_index.z()];
+  if (!esdf_voxel->observed || !neighbor_voxel->observed ||
+      neighbor_voxel->is_site ||
+      esdf_voxel->squared_distance_vox >= max_squared_distance_vox) {
+    return false;
+  }
+  // Determine if we can update this.
+  Eigen::Vector3i potential_direction = esdf_voxel->parent_direction;
+  potential_direction(axis) -= direction;
+  float potential_distance = potential_direction.squaredNorm();
+  // TODO: there might be some concurrency issues here; we have to be a bit
+  // careful on the corners/edges.
+  if (neighbor_voxel->squared_distance_vox > potential_distance) {
+    neighbor_voxel->parent_direction = potential_direction;
+    neighbor_voxel->squared_distance_vox = potential_distance;
+    return true;
+  }
+  return false;
+}
+
+__device__ bool clearSingleNeighbor(const EsdfBlock* esdf_block,
+                                    const Index3D& voxel_index,
+                                    const Index3D& neighbor_voxel_index,
+                                    int axis, int direction,
+                                    float max_squared_distance_vox,
+                                    EsdfBlock* neighbor_block) {
+  const EsdfVoxel* esdf_voxel =
+      &esdf_block->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()];
+  EsdfVoxel* neighbor_voxel =
+      &neighbor_block
+           ->voxels[neighbor_voxel_index.x()][neighbor_voxel_index.y()]
+                   [neighbor_voxel_index.z()];
+
+  if (esdf_voxel->squared_distance_vox < max_squared_distance_vox ||
+      !esdf_voxel->observed || neighbor_voxel->is_site ||
+      neighbor_voxel->squared_distance_vox >= max_squared_distance_vox) {
+    return false;
+  }
+  // Determine if we can update this.
+  Index3D parent_voxel_dir = neighbor_voxel->parent_direction;
+  if ((direction > 0 && parent_voxel_dir(axis) > 0) ||
+      (direction < 0 && parent_voxel_dir(axis) < 0)) {
+    return false;
+  }
+
+  clearVoxelDevice(neighbor_voxel, max_squared_distance_vox);
+  return true;
+}
+
+/// Thread size MUST be 8x8xN (where N is a number of blocks up to 8), block
+/// size can be anything.
+__global__ void sweepBlockBandKernel(int num_blocks, EsdfBlock** esdf_blocks,
+                                     float max_squared_distance_vox) {
+  // We go one axis at a time, syncing threads in between.
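+  // Each 8x8 sheet of threads performs the in-block lower/upper sweep of the
+  // banding algorithm along X, then Y, then Z.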
+  dim3 thread_index = threadIdx;
+  thread_index.z = 0;
+
+  for (int block_index = blockIdx.x * blockDim.z + threadIdx.z;
+       block_index < num_blocks; block_index += gridDim.x * blockDim.z) {
+    // For simplicity we have to have the same number of blocks in the CUDA
+    // kernel call as we have actual blocks.
+    EsdfBlock* esdf_block = esdf_blocks[block_index];
+    Index3D voxel_index(0, thread_index.x, thread_index.y);
+
+    // X axis done.
+    sweepSingleBand(voxel_index, 0, max_squared_distance_vox, esdf_block);
+    __syncthreads();
+
+    // Y axis done.
+    voxel_index << thread_index.x, 0, thread_index.y;
+    sweepSingleBand(voxel_index, 1, max_squared_distance_vox, esdf_block);
+    __syncthreads();
+
+    // Z axis done.
+    voxel_index << thread_index.x, thread_index.y, 0;
+    sweepSingleBand(voxel_index, 2, max_squared_distance_vox, esdf_block);
+    __syncthreads();
+  }
+}
+
+/// Thread size MUST be 8x8xN, where N is the number of blocks processed at
+/// a time, block size can be anything.
+__global__ void updateLocalNeighborBandsKernel(int num_blocks, int i,
+                                               EsdfBlock** esdf_blocks,
+                                               int* neighbor_table,
+                                               EsdfBlock** neighbor_pointers,
+                                               float max_squared_distance_vox,
+                                               bool* updated_neighbors) {
+  // We go one axis at a time, syncing threads in between.
+  dim3 thread_index = threadIdx;
+  thread_index.z = 0;
+
+  constexpr int kNumNeighbors = 6;
+  constexpr int kVoxelsPerSide = VoxelBlock<EsdfVoxel>::kVoxelsPerSide;
+
+  for (int block_index = blockIdx.x * blockDim.z + threadIdx.z;
+       block_index < num_blocks; block_index += gridDim.x * blockDim.z) {
+    EsdfBlock* esdf_block = esdf_blocks[block_index];
+    Index3D voxel_index;
+    Index3D neighbor_voxel_index;
+    // Each thread updates 1 neighbor, selected by "i".
+    // Get the neighbor block.
+    int neighbor_index = neighbor_table[block_index * kNumNeighbors + i];
+    if (neighbor_index < 0) {
+      continue;
+    }
+    EsdfBlock* neighbor_block = neighbor_pointers[neighbor_index];
+    // Now we have the neighbor block... Let's figure out which voxels we
+    // should look at.
+    int axis = i / 2;
+    int direction = i % 2 ? -1 : 1;
+
+    // Fill in the axes.
+    if (axis == 0) {
+      voxel_index << 0, thread_index.x, thread_index.y;
+    } else if (axis == 1) {
+      voxel_index << thread_index.x, 0, thread_index.y;
+    } else if (axis == 2) {
+      voxel_index << thread_index.x, thread_index.y, 0;
+    }
+    neighbor_voxel_index = voxel_index;
+    // If we're looking backwards...
+    if (direction < 0) {
+      voxel_index(axis) = 0;
+      neighbor_voxel_index(axis) = kVoxelsPerSide - 1;
+    } else {
+      voxel_index(axis) = kVoxelsPerSide - 1;
+      neighbor_voxel_index(axis) = 0;
+    }
+
+    bool updated = updateSingleNeighbor(
+        esdf_block, voxel_index, neighbor_voxel_index, axis, direction,
+        max_squared_distance_vox, neighbor_block);
+    if (updated) {
+      updated_neighbors[neighbor_index] = true;
+    }
+  }
+}
+
+/// Thread size MUST be 8x8x8, block size can be anything.
+__global__ void clearWithinBlockKernel(int num_blocks, EsdfBlock** esdf_blocks,
+                                       float max_squared_distance_vox) {
+  dim3 voxel_index = threadIdx;
+  constexpr int kVoxelsPerSide = VoxelBlock<EsdfVoxel>::kVoxelsPerSide;
+
+  // Allow block size to be whatever.
+  for (int block_index = blockIdx.x; block_index < num_blocks;
+       block_index += gridDim.x) {
+    // Get the voxel.
+    EsdfVoxel* esdf_voxel =
+        &esdf_blocks[block_index]
+             ->voxels[voxel_index.x][voxel_index.y][voxel_index.z];
+    // Check if its parent is in the same block.
+    if (!esdf_voxel->observed || esdf_voxel->is_site ||
+        esdf_voxel->squared_distance_vox >= max_squared_distance_vox) {
+      continue;
+    }
+    // Get the parent.
+    Index3D parent_index =
+        Index3D(voxel_index.x, voxel_index.y, voxel_index.z) +
+        esdf_voxel->parent_direction;
+
+    // Check if the voxel is within the same block.
+    if (parent_index.x() < 0 || parent_index.x() >= kVoxelsPerSide ||
+        parent_index.y() < 0 || parent_index.y() >= kVoxelsPerSide ||
+        parent_index.z() < 0 || parent_index.z() >= kVoxelsPerSide) {
+      continue;
+    }
+
+    // Ok check if the parent index is a site.
+    if (!esdf_blocks[block_index]
+             ->voxels[parent_index.x()][parent_index.y()][parent_index.z()]
+             .is_site) {
+      clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+    }
+  }
+}
+
+/// Thread size MUST be 8x8x8, block size can be anything.
+__global__ void clearInternalVoxelsKernel(int num_blocks,
+                                          EsdfBlock** esdf_blocks,
+                                          float max_squared_distance_vox) {
+  dim3 voxel_index = threadIdx;
+  constexpr int kVoxelsPerSide = VoxelBlock<EsdfVoxel>::kVoxelsPerSide;
+
+  // Allow block size to be whatever.
+  for (int block_index = blockIdx.x; block_index < num_blocks;
+       block_index += gridDim.x) {
+    // Get the voxel.
+    EsdfVoxel* esdf_voxel =
+        &esdf_blocks[block_index]
+             ->voxels[voxel_index.x][voxel_index.y][voxel_index.z];
+    if (!esdf_voxel->observed || esdf_voxel->is_site ||
+        esdf_voxel->squared_distance_vox >= max_squared_distance_vox) {
+      continue;
+    }
+    // Get the parent.
+    Index3D parent_index =
+        Index3D(voxel_index.x, voxel_index.y, voxel_index.z) +
+        esdf_voxel->parent_direction;
+
+    // Check if we're our own parent. This is definitely wrong since we're not
+    // a site.
+    if (esdf_voxel->parent_direction.isZero()) {
+      clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+      continue;
+    }
+
+    // Get the closest index to the parent within the same block.
+    // We just get the nearest neighbor.
+    Index3D closest_parent(min(max(parent_index.x(), 0), kVoxelsPerSide - 1),
+                           min(max(parent_index.y(), 0), kVoxelsPerSide - 1),
+                           min(max(parent_index.z(), 0), kVoxelsPerSide - 1));
+
+    // Ok check if the parent index is a site.
+    // TODO: Check if we need the observed rule or not...
+    const EsdfVoxel& neighbor_voxel =
+        esdf_blocks[block_index]
+            ->voxels[closest_parent.x()][closest_parent.y()]
+                    [closest_parent.z()];
+    if (!neighbor_voxel.observed ||
+        neighbor_voxel.squared_distance_vox >= max_squared_distance_vox) {
+      clearVoxelDevice(esdf_voxel, max_squared_distance_vox);
+    }
+  }
+}
+
+/// Thread size MUST be 8x8xN, where N is the number of blocks processed at
+/// a time, block size can be anything.
+__global__ void clearLocalNeighborBandsKernel(int num_blocks, int i,
+                                              EsdfBlock** esdf_blocks,
+                                              int* neighbor_table,
+                                              EsdfBlock** neighbor_pointers,
+                                              float max_squared_distance_vox,
+                                              bool* updated_neighbors) {
+  // We go one axis at a time, syncing threads in between.
+  dim3 thread_index = threadIdx;
+  thread_index.z = 0;
+
+  constexpr int kNumNeighbors = 6;
+  constexpr int kVoxelsPerSide = VoxelBlock<EsdfVoxel>::kVoxelsPerSide;
+
+  for (int block_index = blockIdx.x * blockDim.z + threadIdx.z;
+       block_index < num_blocks; block_index += gridDim.x * blockDim.z) {
+    EsdfBlock* esdf_block = esdf_blocks[block_index];
+    Index3D voxel_index;
+    Index3D neighbor_voxel_index;
+    // Each thread updates 1 neighbor, selected by "i".
+    // Get the neighbor block.
+    int neighbor_index = neighbor_table[block_index * kNumNeighbors + i];
+    if (neighbor_index < 0) {
+      continue;
+    }
+    EsdfBlock* neighbor_block = neighbor_pointers[neighbor_index];
+    // Now we have the neighbor block... Let's figure out which voxels we
+    // should look at.
+    int axis = i / 2;
+    int direction = i % 2 ? -1 : 1;
+
+    // Fill in the axes.
+    if (axis == 0) {
+      voxel_index << 0, thread_index.x, thread_index.y;
+    } else if (axis == 1) {
+      voxel_index << thread_index.x, 0, thread_index.y;
+    } else if (axis == 2) {
+      voxel_index << thread_index.x, thread_index.y, 0;
+    }
+    neighbor_voxel_index = voxel_index;
+    // If we're looking backwards...
+    if (direction < 0) {
+      voxel_index(axis) = 0;
+      neighbor_voxel_index(axis) = kVoxelsPerSide - 1;
+    } else {
+      voxel_index(axis) = kVoxelsPerSide - 1;
+      neighbor_voxel_index(axis) = 0;
+    }
+
+    bool updated = clearSingleNeighbor(
+        esdf_block, voxel_index, neighbor_voxel_index, axis, direction,
+        max_squared_distance_vox, neighbor_block);
+    if (updated) {
+      updated_neighbors[neighbor_index] = true;
+    }
+  }
+}
+
+void EsdfIntegrator::markAllSitesOnGPU(
+    const TsdfLayer& tsdf_layer, const std::vector<Index3D>& block_indices,
+    EsdfLayer* esdf_layer, std::vector<Index3D>* blocks_with_sites,
+    std::vector<Index3D>* cleared_blocks) {
+  CHECK_NOTNULL(esdf_layer);
+  CHECK_NOTNULL(blocks_with_sites);
+
+  // Caching.
+  const float voxel_size = tsdf_layer.voxel_size();
+  const float max_distance_vox = max_distance_m_ / voxel_size;
+  const float max_squared_distance_vox = max_distance_vox * max_distance_vox;
+  // Cache the minimum distance in metric size.
+  const float min_site_distance_m = min_site_distance_vox_ * voxel_size;
+
+  int num_blocks = block_indices.size();
+
+  // Get all of the block pointers we need.
+  tsdf_pointers_host_.resize(num_blocks);
+  block_pointers_host_.resize(num_blocks);
+
+  // Have an updated output variable as well.
+  updated_blocks_device_.resize(num_blocks);
+  updated_blocks_device_.setZero();
+  cleared_blocks_device_.resize(num_blocks);
+  cleared_blocks_device_.setZero();
+
+  // Populate all the input vectors.
+  for (int i = 0; i < num_blocks; i++) {
+    const Index3D& block_index = block_indices[i];
+    EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(block_index);
+    TsdfBlock::ConstPtr tsdf_block = tsdf_layer.getBlockAtIndex(block_index);
+
+    if (!esdf_block || !tsdf_block) {
+      LOG(ERROR) << "Somehow trying to update non-existent blocks!";
+      continue;
+    }
+
+    tsdf_pointers_host_[i] = tsdf_block.get();
+    block_pointers_host_[i] = esdf_block.get();
+  }
+
+  // Copy what we need over to the device.
+  tsdf_pointers_device_ = tsdf_pointers_host_;
+  block_pointers_device_ = block_pointers_host_;
+
+  // Call the kernel.
+  int dim_block = num_blocks;
+  constexpr int kVoxelsPerSide = EsdfBlock::kVoxelsPerSide;
+  dim3 dim_threads(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide);
+  markAllSitesKernel<<<dim_block, dim_threads, 0, cuda_stream_>>>(
+      num_blocks, tsdf_pointers_device_.data(), block_pointers_device_.data(),
+      min_site_distance_m, min_weight_, max_squared_distance_vox,
+      updated_blocks_device_.data(), cleared_blocks_device_.data());
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream_));
+  checkCudaErrors(cudaPeekAtLastError());
+
+  // Copy out.
+  updated_blocks_host_ = updated_blocks_device_;
+  cleared_blocks_host_ = cleared_blocks_device_;
+
+  // Get the output vector.
+  // TODO(helen): swap this to a kernel operation.
+  for (int i = 0; i < num_blocks; i++) {
+    if (updated_blocks_host_[i]) {
+      blocks_with_sites->push_back(block_indices[i]);
+    }
+    if (cleared_blocks_host_[i]) {
+      cleared_blocks->push_back(block_indices[i]);
+    }
+  }
+}
+
+// 2D slice version of the markAllSites function above.
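+// Collapses the vertical stack of TSDF blocks between min_z and max_z into a
+// single output ESDF layer at output_z, taking the minimum TSDF distance over
+// each column.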
+void EsdfIntegrator::markSitesInSliceOnGPU(
+    const TsdfLayer& tsdf_layer, const std::vector<Index3D>& block_indices,
+    float min_z, float max_z, float output_z, EsdfLayer* esdf_layer,
+    std::vector<Index3D>* updated_blocks,
+    std::vector<Index3D>* cleared_blocks) {
+  CHECK_NOTNULL(esdf_layer);
+  CHECK_NOTNULL(updated_blocks);
+  CHECK_NOTNULL(cleared_blocks);
+
+  // Caching.
+  const float voxel_size = tsdf_layer.voxel_size();
+  const float max_distance_vox = max_distance_m_ / voxel_size;
+  const float max_squared_distance_vox = max_distance_vox * max_distance_vox;
+  // Cache the minimum distance in metric size.
+  const float min_site_distance_m = min_site_distance_vox_ * voxel_size;
+
+  // We are going to subsample the block_indices.
+  // We need to figure out all the output blocks, which will be a subset
+  // of the input blocks. At the same time we need to get all of the stacks
+  // of input blocks at all levels.
+  // We are going to pull some "clever" stuff: the input block list will be
+  // of length N * n_input_blocks, where "N" is the number of vertical
+  // layers there could be that fall into the min z to max z range.
+
+  // Ok first figure out how many layers we could have.
+  Index3D min_block_index;
+  Index3D min_voxel_index;
+  getBlockAndVoxelIndexFromPositionInLayer(tsdf_layer.block_size(),
+                                           Vector3f(0.0f, 0.0f, min_z),
+                                           &min_block_index, &min_voxel_index);
+  const int min_block_index_z = min_block_index.z();
+  const int min_voxel_index_z = min_voxel_index.z();
+  Index3D max_block_index;
+  Index3D max_voxel_index;
+  getBlockAndVoxelIndexFromPositionInLayer(tsdf_layer.block_size(),
+                                           Vector3f(0.0f, 0.0f, max_z),
+                                           &max_block_index, &max_voxel_index);
+  const int max_block_index_z = max_block_index.z();
+  const int max_voxel_index_z = max_voxel_index.z();
+
+  // There is always at least 1 layer.
+  int num_vertical_layers = max_block_index_z - min_block_index_z + 1;
+
+  // And figure out what the index of the output voxel is.
+  // std::pair<Index3D, Index3D> output_block_and_voxel_index
+  Index3D output_block_index;
+  Index3D output_voxel_index;
+  getBlockAndVoxelIndexFromPositionInLayer(
+      tsdf_layer.block_size(), Vector3f(0.0f, 0.0f, output_z),
+      &output_block_index, &output_voxel_index);
+  const int output_block_index_z = output_block_index.z();
+  const int output_voxel_index_z = output_voxel_index.z();
+
+  // Next get a list of all the valid input blocks.
+  Index3DSet output_block_set;
+  for (const Index3D& block_index : block_indices) {
+    if (block_index.z() >= min_block_index_z &&
+        block_index.z() <= max_block_index_z) {
+      output_block_set.insert(
+          Index3D(block_index.x(), block_index.y(), output_block_index_z));
+    }
+  }
+
+  // Ok now we have all the indices we actually need.
+  // Just have to get their pointers and we're good.
+  size_t num_blocks = output_block_set.size();
+  if (num_blocks == 0) {
+    return;
+  }
+
+  std::vector<Index3D> input_blocks(num_blocks * num_vertical_layers);
+  std::vector<Index3D> output_blocks(num_blocks);
+  tsdf_pointers_host_.resize(num_blocks * num_vertical_layers);
+  tsdf_pointers_host_.setZero();
+  block_pointers_host_.resize(num_blocks);
+
+  size_t i = 0;
+  for (const Index3D& block_index : output_block_set) {
+    // This is for the output block, which we allocate along the way.
+ output_blocks[i] = block_index; + block_pointers_host_[i] = + esdf_layer->allocateBlockAtIndex(block_index).get(); + + // Go through all the relevant input pointers: + Index3D input_block_index = block_index; + + int j = 0; + for (input_block_index.z() = min_block_index_z; + input_block_index.z() <= max_block_index_z; input_block_index.z()++) { + input_blocks[i + num_blocks * j] = input_block_index; + // This can be null. It's fine. + tsdf_pointers_host_[i + num_blocks * j] = + tsdf_layer.getBlockAtIndex(input_block_index).get(); + j++; + } + i++; + } + + // Copy what we need over to the device. + tsdf_pointers_device_ = tsdf_pointers_host_; + block_pointers_device_ = block_pointers_host_; + + // Finally, set up the updated and cleared vectors. + updated_blocks_device_.resize(num_blocks); + updated_blocks_device_.setZero(); + cleared_blocks_device_.resize(num_blocks); + cleared_blocks_device_.setZero(); + + // Call the kernel! + int dim_block = num_blocks; + constexpr int kVoxelsPerSide = EsdfBlock::kVoxelsPerSide; + dim3 dim_threads(kVoxelsPerSide, kVoxelsPerSide, num_vertical_layers); + markSitesInSliceKernel<<>>( + num_blocks, num_blocks, tsdf_pointers_device_.data(), + block_pointers_device_.data(), output_voxel_index_z, min_voxel_index_z, + max_voxel_index_z, min_site_distance_m, min_weight_, + max_squared_distance_vox, updated_blocks_device_.data(), + cleared_blocks_device_.data()); + checkCudaErrors(cudaStreamSynchronize(cuda_stream_)); + checkCudaErrors(cudaPeekAtLastError()); + + // Copy out. + updated_blocks_host_ = updated_blocks_device_; + cleared_blocks_host_ = cleared_blocks_device_; + + // Pack the outputs. The rest of the functions should work as before. + for (size_t i = 0; i < output_blocks.size(); i++) { + if (updated_blocks_host_[i]) { + updated_blocks->push_back(output_blocks[i]); + } + if (cleared_blocks_host_[i]) { + cleared_blocks->push_back(output_blocks[i]); + } + } +} + +void EsdfIntegrator::clearInvalidOnGPU( + const std::vector& blocks_to_clear, EsdfLayer* esdf_layer, + std::vector* updated_blocks) { + CHECK_NOTNULL(esdf_layer); + CHECK_NOTNULL(updated_blocks); + + // Caching. + const float voxel_size = esdf_layer->voxel_size(); + const float max_distance_vox = max_distance_m_ / voxel_size; + const float max_squared_distance_vox = max_distance_vox * max_distance_vox; + + int num_blocks = blocks_to_clear.size(); + block_pointers_host_.resize(num_blocks); + + // Have an updated output variable as well. + updated_blocks_device_.resize(num_blocks); + updated_blocks_device_.setZero(); + + // Populate all the input vectors. + for (size_t i = 0; i < num_blocks; i++) { + const Index3D& block_index = blocks_to_clear[i]; + block_pointers_host_[i] = esdf_layer->getBlockAtIndex(block_index).get(); + } + + block_pointers_device_ = block_pointers_host_; + + // Alright now run a kernel to clear all the voxels within a block. + constexpr int kVoxelsPerSide = EsdfBlock::kVoxelsPerSide; + dim3 dim_threads(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide); + clearWithinBlockKernel<<>>( + num_blocks, block_pointers_device_.data(), max_squared_distance_vox); + checkCudaErrors(cudaStreamSynchronize(cuda_stream_)); + checkCudaErrors(cudaPeekAtLastError()); + + // Then clear all the neighbors. 
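+  // Illustrative sketch (simplified shape, not the exact signatures below):
+  // this is a breadth-first frontier expansion. Each pass resets, in the
+  // neighbors of the current frontier, the voxels whose parent sites were
+  // invalidated; any neighbor that changed becomes the next frontier, and the
+  // loop runs until no block changes:
+  //
+  //   frontier = blocks_to_clear;
+  //   while (!frontier.empty()) {
+  //     frontier = clearBlockNeighbors(frontier);  // returns changed blocks
+  //   }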
+ Index3DSet all_cleared_blocks; + std::copy(blocks_to_clear.begin(), blocks_to_clear.end(), + std::inserter(all_cleared_blocks, all_cleared_blocks.end())); + + std::vector clear_list = blocks_to_clear; + std::vector new_clear_list; + VLOG(3) << "Blocks to clear: " << blocks_to_clear.size(); + while (!clear_list.empty()) { + clearBlockNeighbors(clear_list, esdf_layer, &new_clear_list); + std::copy(new_clear_list.begin(), new_clear_list.end(), + std::inserter(all_cleared_blocks, all_cleared_blocks.end())); + std::swap(clear_list, new_clear_list); + new_clear_list.clear(); + VLOG(3) << "Clear list size: " << clear_list.size(); + } + + for (const Index3D& index : all_cleared_blocks) { + updated_blocks->push_back(index); + } +} + +void EsdfIntegrator::clearBlockNeighbors(std::vector& clear_list, + EsdfLayer* esdf_layer, + std::vector* new_clear_list) { + int num_blocks = clear_list.size(); + + if (num_blocks == 0) { + return; + } + constexpr int kNumNeighbors = 6; + const float voxel_size = esdf_layer->voxel_size(); + const float max_distance_vox = max_distance_m_ / voxel_size; + const float max_squared_distance_vox = max_distance_vox * max_distance_vox; + + constexpr int kVoxelsPerSide = EsdfBlock::kVoxelsPerSide; + dim3 dim_threads_per_voxel(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide); + + // Step 0: block pointers. + block_pointers_host_.resize(num_blocks); + for (size_t i = 0; i < num_blocks; i++) { + const Index3D& block_index = clear_list[i]; + block_pointers_host_[i] = esdf_layer->getBlockAtIndex(block_index).get(); + } + block_pointers_device_ = block_pointers_host_; + + // Step 0a: fix up the blocks so their neighbors are valid. + clearInternalVoxelsKernel<<>>( + num_blocks, block_pointers_device_.data(), max_squared_distance_vox); + checkCudaErrors(cudaStreamSynchronize(cuda_stream_)); + + // Step one: set up the neighbor table. + std::vector neighbor_indices; + neighbor_table_host_.resize(num_blocks * kNumNeighbors); + neighbor_table_host_.setZero(); + neighbor_pointers_host_.resize(0); + + createNeighborTable(clear_list, esdf_layer, &neighbor_indices, + &neighbor_pointers_host_, &neighbor_table_host_); + + // Step two: run the neighbor updating kernel. + updated_blocks_device_.resize(neighbor_indices.size()); + updated_blocks_device_.setZero(); + + neighbor_pointers_device_ = neighbor_pointers_host_; + neighbor_table_device_ = neighbor_table_host_; + + constexpr int kNumBlocksPerCudaBlock = 8; + int dim_block = std::max( + static_cast( + std::ceil(num_blocks / static_cast(kNumBlocksPerCudaBlock))), + 1); + dim3 dim_threads(kVoxelsPerSide, kVoxelsPerSide, kNumBlocksPerCudaBlock); + // We have to do the neighbors one at a time basically for concurrency + // issues. + // No clue if the concurrency issues hold for the clearing operation. + // But this is easier to copy-and-paste. + for (int i = 0; i < kNumNeighbors; i++) { + clearLocalNeighborBandsKernel<<>>( + num_blocks, i, block_pointers_device_.data(), + neighbor_table_device_.data(), neighbor_pointers_device_.data(), + max_squared_distance_vox, updated_blocks_device_.data()); + } + checkCudaErrors(cudaStreamSynchronize(cuda_stream_)); + checkCudaErrors(cudaPeekAtLastError()); + + // Repack into output vector. 
+ updated_blocks_host_ = updated_blocks_device_; + block_pointers_host_.resize(0); + + new_clear_list->clear(); + for (size_t i = 0; i < neighbor_indices.size(); i++) { + if (updated_blocks_host_[i]) { + new_clear_list->push_back(neighbor_indices[i]); + block_pointers_host_.push_back(neighbor_pointers_host_[i]); + } + } + + // Step three: clear any remaining voxels on the interior of the blocks + int num_updated_blocks = new_clear_list->size(); + if (num_updated_blocks == 0) { + return; + } + + block_pointers_device_ = block_pointers_host_; + clearInternalVoxelsKernel<<>>(block_pointers_device_.size(), + block_pointers_device_.data(), + max_squared_distance_vox); + checkCudaErrors(cudaStreamSynchronize(cuda_stream_)); + checkCudaErrors(cudaPeekAtLastError()); +} + +void EsdfIntegrator::computeEsdfOnGPU( + const std::vector& blocks_with_sites, EsdfLayer* esdf_layer) { + CHECK_NOTNULL(esdf_layer); + // Cache everything. + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const float voxel_size = esdf_layer->block_size() / kVoxelsPerSide; + const float max_distance_vox = max_distance_m_ / voxel_size; + const float max_squared_distance_vox = max_distance_vox * max_distance_vox; + + block_pointers_host_.resize(blocks_with_sites.size()); + for (size_t i = 0; i < blocks_with_sites.size(); i++) { + block_pointers_host_[i] = + esdf_layer->getBlockAtIndex(blocks_with_sites[i]).get(); + } + + // First we go over all of the blocks with sites. + // We compute all the proximal sites inside the block first. + block_pointers_device_ = block_pointers_host_; + sweepBlockBandOnGPU(block_pointers_device_, max_squared_distance_vox); + + // Get the neighbors of all the blocks with sites. + std::vector blocks_to_run = blocks_with_sites; + std::vector updated_blocks; + + int i = 0; + while (!blocks_to_run.empty()) { + updateLocalNeighborBandsOnGPU(blocks_to_run, block_pointers_device_, + max_squared_distance_vox, esdf_layer, + &updated_blocks, &neighbor_pointers_device_); + VLOG(3) << "Iteration: " << i + << " Number of updated blocks: " << updated_blocks.size() + << " blocks with sites: " << blocks_with_sites.size(); + i++; + sweepBlockBandOnGPU(neighbor_pointers_device_, max_squared_distance_vox); + blocks_to_run = std::move(updated_blocks); + block_pointers_device_ = neighbor_pointers_device_; + } +} + +void EsdfIntegrator::sweepBlockBandOnGPU( + device_vector& block_pointers, float max_squared_distance_vox) { + if (block_pointers.empty()) { + return; + } + timing::Timer sweep_timer("esdf/integrate/compute/sweep"); + + // Caching. + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const int num_blocks = block_pointers.size(); + + // Call the kernel. + // We do 2-dimensional sweeps in this kernel. Each thread does 3 sweeps. + // We do 8 blocks at a time. 
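+  // Worked example (illustrative numbers): with num_blocks = 20 ESDF blocks
+  // and 8 voxel blocks per CUDA block, the launch below uses
+  // dim_block = ceil(20 / 8) = 3 CUDA blocks, each of
+  // kVoxelsPerSide x kVoxelsPerSide x 8 threads.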
+  constexpr int kNumBlocksPerCudaBlock = 8;
+  int dim_block = std::max(
+      static_cast<int>(
+          std::ceil(num_blocks / static_cast<float>(kNumBlocksPerCudaBlock))),
+      1);
+  dim3 dim_threads(kVoxelsPerSide, kVoxelsPerSide, kNumBlocksPerCudaBlock);
+  sweepBlockBandKernel<<<dim_block, dim_threads, 0, cuda_stream_>>>(
+      num_blocks, block_pointers.data(), max_squared_distance_vox);
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream_));
+  checkCudaErrors(cudaPeekAtLastError());
+}
+
+void EsdfIntegrator::updateLocalNeighborBandsOnGPU(
+    const std::vector<Index3D>& block_indices,
+    device_vector<EsdfBlock*>& block_pointers, float max_squared_distance_vox,
+    EsdfLayer* esdf_layer, std::vector<Index3D>* updated_blocks,
+    device_vector<EsdfBlock*>* updated_block_pointers) {
+  if (block_indices.empty()) {
+    return;
+  }
+
+  timing::Timer neighbors_timer("esdf/integrate/compute/neighbors");
+
+  CHECK_EQ(block_indices.size(), block_pointers.size());
+
+  constexpr int kNumNeighbors = 6;
+  constexpr int kVoxelsPerSide = VoxelBlock<EsdfVoxel>::kVoxelsPerSide;
+  const int num_blocks = block_pointers.size();
+
+  timing::Timer table_timer("esdf/integrate/compute/neighbors/table");
+
+  // This one is quite a bit more complicated.
+  // For each block, we need to get its 6 neighbors.
+  std::vector<Index3D> neighbor_indices;
+  neighbor_table_host_.resize(num_blocks * kNumNeighbors);
+  neighbor_table_host_.setZero();
+  neighbor_pointers_host_.resize(0);
+
+  createNeighborTable(block_indices, esdf_layer, &neighbor_indices,
+                      &neighbor_pointers_host_, &neighbor_table_host_);
+
+  table_timer.Stop();
+
+  // Set up an updated map.
+  updated_blocks_device_.resize(neighbor_indices.size());
+  updated_blocks_device_.setZero();
+
+  neighbor_pointers_device_ = neighbor_pointers_host_;
+  neighbor_table_device_ = neighbor_table_host_;
+
+  timing::Timer kernel_timer("esdf/integrate/compute/neighbors/kernel");
+
+  // Ok now we have to give all this stuff to the kernel.
+  // TODO(helen): you get weird-ass concurrency issues if this is not 1.
+  constexpr int kNumBlocksPerCudaBlock = 8;
+  int dim_block = std::max(
+      static_cast<int>(
+          std::ceil(num_blocks / static_cast<float>(kNumBlocksPerCudaBlock))),
+      1);
+  dim3 dim_threads(kVoxelsPerSide, kVoxelsPerSide, kNumBlocksPerCudaBlock);
+  // We have to do the neighbors one at a time basically for concurrency
+  // issues.
+  for (int i = 0; i < kNumNeighbors; i++) {
+    updateLocalNeighborBandsKernel<<<dim_block, dim_threads, 0,
+                                     cuda_stream_>>>(
+        num_blocks, i, block_pointers.data(), neighbor_table_device_.data(),
+        neighbor_pointers_device_.data(), max_squared_distance_vox,
+        updated_blocks_device_.data());
+  }
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream_));
+  checkCudaErrors(cudaPeekAtLastError());
+
+  kernel_timer.Stop();
+
+  // Unpack the kernel results.
+  // TODO(helen): swap this to a kernel operation.
+  updated_blocks->clear();
+  updated_blocks_host_ = updated_blocks_device_;
+  block_pointers_host_.resize(0);
+
+  for (size_t i = 0; i < neighbor_indices.size(); i++) {
+    if (updated_blocks_host_[i]) {
+      updated_blocks->push_back(neighbor_indices[i]);
+      block_pointers_host_.push_back(neighbor_pointers_host_[i]);
+    }
+  }
+  *updated_block_pointers = block_pointers_host_;
+}
+
+void EsdfIntegrator::createNeighborTable(
+    const std::vector<Index3D>& block_indices, EsdfLayer* esdf_layer,
+    std::vector<Index3D>* neighbor_indices,
+    host_vector<EsdfBlock*>* neighbor_pointers,
+    host_vector<int>* neighbor_table) {
+  // TODO(helen): make this extensible to different number of neighbors.
+  constexpr int kNumNeighbors = 6;
+  int num_blocks = block_indices.size();
+
+  // Hash map mapping the neighbor index to the pointers above.
+ Index3DHashMapType::type neighbor_map; + + // Direction Shorthand: axis = neighbor_index/2 + // direction = neighbor_index%2 ? -1 : 1 + Index3D direction = Index3D::Zero(); + for (int block_number = 0; block_number < num_blocks; block_number++) { + const Index3D& block_index = block_indices[block_number]; + for (int neighbor_number = 0; neighbor_number < kNumNeighbors; + neighbor_number++) { + direction.setZero(); + // Change just one axis of the direction. + direction(neighbor_number / 2) = neighbor_number % 2 ? -1 : 1; + // Check if this is already in our hash. + Index3D neighbor_index = block_index + direction; + auto res = neighbor_map.find(neighbor_index); + if (res != neighbor_map.end()) { + (*neighbor_table)[block_number * kNumNeighbors + neighbor_number] = + res->second; + } else { + // Doesn't exist in the neighbor list yet. + EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(neighbor_index); + if (esdf_block) { + int next_index = neighbor_indices->size(); + neighbor_indices->push_back(neighbor_index); + neighbor_pointers->push_back(esdf_block.get()); + neighbor_map[neighbor_index] = next_index; + (*neighbor_table)[block_number * kNumNeighbors + neighbor_number] = + next_index; + } else { + (*neighbor_table)[block_number * kNumNeighbors + neighbor_number] = + -1; + neighbor_map[neighbor_index] = -1; + } + } + } + } + CHECK_EQ(neighbor_table->size(), kNumNeighbors * block_indices.size()); + CHECK_EQ(neighbor_indices->size(), neighbor_pointers->size()); +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/integrators/cuda/frustum.cu b/nvblox/src/integrators/cuda/frustum.cu new file mode 100644 index 00000000..ffc4af0e --- /dev/null +++ b/nvblox/src/integrators/cuda/frustum.cu @@ -0,0 +1,364 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/integrators/frustum.h" + +#include "nvblox/core/hash.h" +#include "nvblox/core/indexing.h" +#include "nvblox/core/types.h" +#include "nvblox/core/unified_vector.h" +#include "nvblox/integrators/ray_caster.h" +#include "nvblox/utils/timing.h" + +namespace nvblox { + +FrustumCalculator::FrustumCalculator() { cudaStreamCreate(&cuda_stream_); } +FrustumCalculator::~FrustumCalculator() { cudaStreamDestroy(cuda_stream_); } + +unsigned int FrustumCalculator::raycast_subsampling_factor() const { + return raycast_subsampling_factor_; +} + +void FrustumCalculator::raycast_subsampling_factor( + unsigned int raycast_subsampling_factor) { + CHECK_GT(raycast_subsampling_factor, 0); + raycast_subsampling_factor_ = raycast_subsampling_factor; +} + +// AABB linear indexing +// - We index in x-major, i.e. x is varied first, then y, then z. +// - Linear indexing within an AABB is relative and starts at zero. This is +// not true for AABB 3D indexing which is w.r.t. the layer origin. 
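+// Worked example (illustrative numbers): with aabb_min = (10, 10, 10) and
+// aabb_size = (4, 3, 2), the layer index (11, 12, 10) shifts to the relative
+// index (1, 2, 0) and maps to linear index 1 + 2 * 4 + 0 * 4 * 3 = 9;
+// aabbLinearIndexToLayerIndex(9, ...) inverts this back to (11, 12, 10).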
+__host__ __device__ inline size_t layerIndexToAabbLinearIndex(
+    const Index3D& index, const Index3D& aabb_min, const Index3D& aabb_size) {
+  const Index3D index_shifted = index - aabb_min;
+  return index_shifted.x() +                                 // NOLINT
+         index_shifted.y() * aabb_size.x() +                 // NOLINT
+         index_shifted.z() * aabb_size.x() * aabb_size.y();  // NOLINT
+}
+
+__host__ __device__ inline Index3D aabbLinearIndexToLayerIndex(
+    const size_t lin_idx, const Index3D& aabb_min, const Index3D& aabb_size) {
+  const Index3D index(lin_idx % aabb_size.x(),                   // NOLINT
+                      (lin_idx / aabb_size.x()) % aabb_size.y(),  // NOLINT
+                      lin_idx / (aabb_size.x() * aabb_size.y())); // NOLINT
+  return index + aabb_min;
+}
+
+__device__ void setIndexUpdated(const Index3D& index_to_update,
+                                const Index3D& aabb_min,
+                                const Index3D& aabb_size, bool* aabb_updated) {
+  const size_t linear_size = aabb_size.x() * aabb_size.y() * aabb_size.z();
+  const size_t lin_idx =
+      layerIndexToAabbLinearIndex(index_to_update, aabb_min, aabb_size);
+  if (lin_idx < linear_size) {
+    aabb_updated[lin_idx] = true;
+  }
+}
+
+template <typename T>
+void convertAabbUpdatedToVector(const Index3D& aabb_min,
+                                const Index3D& aabb_size,
+                                size_t aabb_linear_size, bool* aabb_updated,
+                                T* indices) {
+  indices->reserve(aabb_linear_size);
+  for (size_t i = 0; i < aabb_linear_size; i++) {
+    if (aabb_updated[i]) {
+      indices->push_back(aabbLinearIndexToLayerIndex(i, aabb_min, aabb_size));
+    }
+  }
+}
+
+__global__ void getBlockIndicesInImageKernel(
+    const Transform T_L_C, const Camera camera, const float* image, int rows,
+    int cols, const float block_size, const float max_integration_distance_m,
+    const float truncation_distance_m, const Index3D aabb_min,
+    const Index3D aabb_size, bool* aabb_updated) {
+  // First, figure out which pixel we're in.
+  int pixel_row = blockIdx.x * blockDim.x + threadIdx.x;
+  int pixel_col = blockIdx.y * blockDim.y + threadIdx.y;
+
+  // Hooray we do nothing.
+  if (pixel_row >= rows || pixel_col >= cols) {
+    return;
+  }
+
+  // Look up the pixel we care about.
+  float depth = image::access(pixel_row, pixel_col, cols, image);
+  if (depth <= 0.0f) {
+    return;
+  }
+  if (max_integration_distance_m > 0.0f &&
+      depth > max_integration_distance_m) {
+    depth = max_integration_distance_m;
+  }
+
+  // Ok now project this thing into space.
+  Vector3f p_C = (depth + truncation_distance_m) *
+                 camera.rayFromPixelIndices(Index2D(pixel_col, pixel_row));
+  Vector3f p_L = T_L_C * p_C;
+
+  // Now we have the position of the thing in space. Now we need the block
+  // index.
+ Index3D block_index = getBlockIndexFromPositionInLayer(block_size, p_L); + setIndexUpdated(block_index, aabb_min, aabb_size, aabb_updated); +} + +__global__ void raycastToBlocksKernel(int num_blocks, Index3D* block_indices, + const Transform T_L_C, float block_size, + const Index3D aabb_min, + const Index3D aabb_size, + bool* aabb_updated) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int corner_index = threadIdx.y; + + if (index >= num_blocks) { + return; + } + + constexpr float corner_increment_table[9][3] = { + {0.0f, 0.0f, 0.0f}, // NOLINT + {1.0f, 0.0f, 0.0f}, // NOLINT + {0.0f, 1.0f, 0.0f}, // NOLINT + {0.0f, 0.0f, 1.0f}, // NOLINT + {1.0f, 1.0f, 0.0f}, // NOLINT + {1.0f, 0.0f, 1.0f}, // NOLINT + {0.0f, 1.0f, 1.0f}, // NOLINT + {1.0f, 1.0f, 1.0f}, // NOLINT + {0.5f, 0.5f, 0.5f}, // NOLINT + }; + + const Vector3f increment(corner_increment_table[corner_index][0], + corner_increment_table[corner_index][1], + corner_increment_table[corner_index][2]); + + const Index3D& block_index = block_indices[index]; + + RayCaster raycaster(T_L_C.translation() / block_size, + block_index.cast() + increment); + Index3D ray_index; + while (raycaster.nextRayIndex(&ray_index)) { + setIndexUpdated(ray_index, aabb_min, aabb_size, aabb_updated); + } +} + +__global__ void combinedBlockIndicesInImageKernel( + const Transform T_L_C, const Camera camera, const float* image, int rows, + int cols, const float block_size, const float max_integration_distance_m, + const float truncation_distance_m, int raycast_subsampling_factor, + const Index3D aabb_min, const Index3D aabb_size, bool* aabb_updated) { + // First, figure out which pixel we're in. + const int ray_idx_row = blockIdx.x * blockDim.x + threadIdx.x; + const int ray_idx_col = blockIdx.y * blockDim.y + threadIdx.y; + int pixel_row = ray_idx_row * raycast_subsampling_factor; + int pixel_col = ray_idx_col * raycast_subsampling_factor; + + // Hooray we do nothing. + if (pixel_row >= (rows + raycast_subsampling_factor - 1) || + pixel_col >= (cols + raycast_subsampling_factor - 1)) { + return; + } else { + // Move remaining overhanging pixels back to the borders. + if (pixel_row >= rows) { + pixel_row = rows - 1; + } + if (pixel_col >= cols) { + pixel_col = cols - 1; + } + } + + // Look up the pixel we care about. + float depth = image::access(pixel_row, pixel_col, cols, image); + if (depth <= 0.0f) { + return; + } + if (max_integration_distance_m > 0.0f && depth > max_integration_distance_m) { + depth = max_integration_distance_m; + } + + // Ok now project this thing into space. + Vector3f p_C = (depth + truncation_distance_m) * + camera.rayFromPixelIndices(Index2D(pixel_col, pixel_row)); + Vector3f p_L = T_L_C * p_C; + + // Now we have the position of the thing in space. Now we need the block + // index. + Index3D block_index = getBlockIndexFromPositionInLayer(block_size, p_L); + setIndexUpdated(block_index, aabb_min, aabb_size, aabb_updated); + + // Ok raycast to the correct point in the block. 
+  RayCaster raycaster(T_L_C.translation() / block_size, p_L / block_size);
+  Index3D ray_index = Index3D::Zero();
+  while (raycaster.nextRayIndex(&ray_index)) {
+    setIndexUpdated(ray_index, aabb_min, aabb_size, aabb_updated);
+  }
+}
+
+std::vector<Index3D> FrustumCalculator::getBlocksInImageViewCuda(
+    const DepthImage& depth_frame, const Transform& T_L_C,
+    const Camera& camera, const float block_size,
+    const float truncation_distance_m,
+    const float max_integration_distance_m) {
+  timing::Timer setup_timer("in_view/setup");
+
+  // Aight so first we have to get the AABB of this guy.
+  const Frustum frustum =
+      camera.getViewFrustum(T_L_C, 0.0f, max_integration_distance_m);
+  const AxisAlignedBoundingBox aabb_L = frustum.getAABB();
+
+  // Get the min index and the max index.
+  const Index3D min_index =
+      getBlockIndexFromPositionInLayer(block_size, aabb_L.min());
+  const Index3D max_index =
+      getBlockIndexFromPositionInLayer(block_size, aabb_L.max());
+  const Index3D aabb_size = max_index - min_index + Index3D::Ones();
+  const size_t aabb_linear_size =
+      aabb_size.x() * aabb_size.y() * aabb_size.z();
+
+  // A 3D grid of bools, one for each block in the AABB, which indicates if it
+  // is in the view. The 3D grid is represented as a flat vector.
+  if (aabb_linear_size > aabb_device_buffer_.size()) {
+    constexpr float kBufferExpansionFactor = 1.5f;
+    const int new_size =
+        static_cast<int>(kBufferExpansionFactor * aabb_linear_size);
+    aabb_device_buffer_.reserve(new_size);
+    aabb_host_buffer_.reserve(new_size);
+  }
+  checkCudaErrors(cudaMemsetAsync(aabb_device_buffer_.data(), 0,
+                                  sizeof(bool) * aabb_linear_size));
+  aabb_device_buffer_.resize(aabb_linear_size);
+  aabb_host_buffer_.resize(aabb_linear_size);
+
+  setup_timer.Stop();
+
+  // Raycast
+  if (raycast_to_pixels_) {
+    getBlocksByRaycastingPixels(T_L_C, camera, depth_frame, block_size,
+                                truncation_distance_m,
+                                max_integration_distance_m, min_index,
+                                aabb_size, aabb_device_buffer_.data());
+  } else {
+    getBlocksByRaycastingCorners(T_L_C, camera, depth_frame, block_size,
+                                 truncation_distance_m,
+                                 max_integration_distance_m, min_index,
+                                 aabb_size, aabb_device_buffer_.data());
+  }
+
+  // Output vector.
+  timing::Timer output_timer("in_view/output");
+  cudaMemcpyAsync(aabb_host_buffer_.data(), aabb_device_buffer_.data(),
+                  sizeof(bool) * aabb_linear_size, cudaMemcpyDeviceToHost,
+                  cuda_stream_);
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream_));
+  checkCudaErrors(cudaPeekAtLastError());
+
+  std::vector<Index3D> output_vector;
+  convertAabbUpdatedToVector<std::vector<Index3D>>(
+      min_index, aabb_size, aabb_linear_size, aabb_host_buffer_.data(),
+      &output_vector);
+  output_timer.Stop();
+
+  // We have to manually destruct this. :(
+  timing::Timer destroy_timer("in_view/destroy");
+  destroy_timer.Stop();
+  return output_vector;
+}
+
+void FrustumCalculator::getBlocksByRaycastingCorners(
+    const Transform& T_L_C, const Camera& camera,
+    const DepthImage& depth_frame, float block_size,
+    const float truncation_distance_m,
+    const float max_integration_distance_m, const Index3D& min_index,
+    const Index3D& aabb_size, bool* aabb_updated_cuda) {
+  // Get the blocks touched by the ray endpoints.
+  // We tile the image into thread blocks of 16x16 pixels (256 threads each).
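+  // Worked example (illustrative numbers): a 480x640 depth image with
+  // kThreadDim = 16 gives a grid of ceil(480 / 16) x ceil(640 / 16) =
+  // 30 x 40 CUDA blocks, one thread per pixel.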
+ constexpr int kThreadDim = 16; + int rounded_rows = static_cast( + std::ceil(depth_frame.rows() / static_cast(kThreadDim))); + int rounded_cols = static_cast( + std::ceil(depth_frame.cols() / static_cast(kThreadDim))); + dim3 block_dim(rounded_rows, rounded_cols); + dim3 thread_dim(kThreadDim, kThreadDim); + + timing::Timer image_blocks_timer("in_view/get_image_blocks"); + getBlockIndicesInImageKernel<<>>( + T_L_C, camera, depth_frame.dataConstPtr(), depth_frame.rows(), + depth_frame.cols(), block_size, max_integration_distance_m, + truncation_distance_m, min_index, aabb_size, aabb_updated_cuda); + checkCudaErrors(cudaStreamSynchronize(cuda_stream_)); + checkCudaErrors(cudaPeekAtLastError()); + + image_blocks_timer.Stop(); + + timing::Timer image_blocks_copy_timer("in_view/image_blocks_copy"); + + unified_vector initial_vector; + const size_t aabb_linear_size = aabb_size.x() * aabb_size.y() * aabb_size.z(); + initial_vector.reserve(aabb_linear_size / 3); + convertAabbUpdatedToVector>( + min_index, aabb_size, aabb_linear_size, aabb_updated_cuda, + &initial_vector); + image_blocks_copy_timer.Stop(); + + // Call the kernel to do raycasting. + timing::Timer raycast_blocks_timer("in_view/raycast_blocks"); + + int num_initial_blocks = initial_vector.size(); + constexpr int kNumCorners = 9; + constexpr int kNumBlocksPerThreadBlock = 40; + int raycast_block_dim = static_cast(std::ceil( + static_cast(num_initial_blocks) / kNumBlocksPerThreadBlock)); + dim3 raycast_thread_dim(kNumBlocksPerThreadBlock, kNumCorners); + raycastToBlocksKernel<<>>( + num_initial_blocks, initial_vector.data(), T_L_C, block_size, min_index, + aabb_size, aabb_updated_cuda); + checkCudaErrors(cudaStreamSynchronize(cuda_stream_)); + checkCudaErrors(cudaPeekAtLastError()); + raycast_blocks_timer.Stop(); +} + +void FrustumCalculator::getBlocksByRaycastingPixels( + const Transform& T_L_C, const Camera& camera, const DepthImage& depth_frame, + float block_size, const float truncation_distance_m, + const float max_integration_distance_m, const Index3D& min_index, + const Index3D& aabb_size, bool* aabb_updated_cuda) { + // Number of rays per dimension. Depth frame size / subsampling rate. + const int num_subsampled_rows = + std::ceil(static_cast(depth_frame.rows() + 1) / + static_cast(raycast_subsampling_factor_)); + const int num_subsampled_cols = + std::ceil(static_cast(depth_frame.cols() + 1) / + static_cast(raycast_subsampling_factor_)); + + // We'll do warps of 32x32 pixels in the image. This is 1024 threads which is + // in the recommended 512-1024 range. 
+ constexpr int kThreadDim = 16; + const int rounded_rows = static_cast( + std::ceil(num_subsampled_rows / static_cast(kThreadDim))); + const int rounded_cols = static_cast( + std::ceil(num_subsampled_cols / static_cast(kThreadDim))); + dim3 block_dim(rounded_rows, rounded_cols); + dim3 thread_dim(kThreadDim, kThreadDim); + + timing::Timer combined_kernel_timer("in_view/combined_kernel"); + combinedBlockIndicesInImageKernel<<>>( + T_L_C, camera, depth_frame.dataConstPtr(), depth_frame.rows(), + depth_frame.cols(), block_size, max_integration_distance_m, + truncation_distance_m, raycast_subsampling_factor_, min_index, aabb_size, + aabb_updated_cuda); + checkCudaErrors(cudaStreamSynchronize(cuda_stream_)); + checkCudaErrors(cudaPeekAtLastError()); + combined_kernel_timer.Stop(); +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/integrators/cuda/projective_color_integrator.cu b/nvblox/src/integrators/cuda/projective_color_integrator.cu new file mode 100644 index 00000000..4ba7d815 --- /dev/null +++ b/nvblox/src/integrators/cuda/projective_color_integrator.cu @@ -0,0 +1,362 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/integrators/projective_color_integrator.h" + +#include "nvblox/integrators/cuda/projective_integrators_common.cuh" +#include "nvblox/integrators/integrators_common.h" +#include "nvblox/utils/timing.h" + +namespace nvblox { + +ProjectiveColorIntegrator::ProjectiveColorIntegrator() + : ProjectiveIntegratorBase() { + sphere_tracer_.params().maximum_ray_length_m = max_integration_distance_m_; + checkCudaErrors(cudaStreamCreate(&integration_stream_)); +} + +ProjectiveColorIntegrator::~ProjectiveColorIntegrator() { + finish(); + checkCudaErrors(cudaStreamDestroy(integration_stream_)); +} + +void ProjectiveColorIntegrator::finish() const { + cudaStreamSynchronize(integration_stream_); +} + +void ProjectiveColorIntegrator::integrateFrame( + const ColorImage& color_frame, const Transform& T_L_C, const Camera& camera, + const TsdfLayer& tsdf_layer, ColorLayer* color_layer, + std::vector* updated_blocks) { + CHECK_NOTNULL(color_layer); + CHECK_EQ(tsdf_layer.block_size(), color_layer->block_size()); + + // Metric truncation distance for this layer + const float voxel_size = + color_layer->block_size() / VoxelBlock::kVoxelsPerSide; + const float truncation_distance_m = truncation_distance_vox_ * voxel_size; + + timing::Timer blocks_in_view_timer("color/integrate/get_blocks_in_view"); + std::vector block_indices = + getBlocksInView(T_L_C, camera, color_layer->block_size()); + blocks_in_view_timer.Stop(); + + // Check which of these blocks are: + // - Allocated in the TSDF, and + // - have at least a single voxel within the truncation band + // This is because: + // - We don't allocate new geometry here, we just color existing geometry + // - We don't color freespace. 
+  timing::Timer blocks_in_band_timer(
+      "color/integrate/reduce_to_blocks_in_band");
+  block_indices = reduceBlocksToThoseInTruncationBand(
+      block_indices, tsdf_layer, truncation_distance_m);
+  blocks_in_band_timer.Stop();
+
+  // Allocate blocks (CPU)
+  // We allocate color blocks where
+  // - there are allocated TSDF blocks, AND
+  // - these blocks are within the truncation band
+  timing::Timer allocate_blocks_timer("color/integrate/allocate_blocks");
+  allocateBlocksWhereRequired(block_indices, color_layer);
+  allocate_blocks_timer.Stop();
+
+  // Create a synthetic depth image
+  timing::Timer sphere_trace_timer("color/integrate/sphere_trace");
+  std::shared_ptr<const DepthImage> synthetic_depth_image_ptr =
+      sphere_tracer_.renderImageOnGPU(
+          camera, T_L_C, tsdf_layer, truncation_distance_m,
+          MemoryType::kDevice, depth_render_ray_subsampling_factor_);
+  sphere_trace_timer.Stop();
+
+  // Update identified blocks
+  // Calls out to the child-class implementing the integration (GPU)
+  timing::Timer update_blocks_timer("color/integrate/update_blocks");
+  updateBlocks(block_indices, color_frame, *synthetic_depth_image_ptr, T_L_C,
+               camera, truncation_distance_m, color_layer);
+  update_blocks_timer.Stop();
+
+  if (updated_blocks != nullptr) {
+    *updated_blocks = block_indices;
+  }
+}
+
+__device__ inline Color blendTwoColors(const Color& first_color,
+                                       float first_weight,
+                                       const Color& second_color,
+                                       float second_weight) {
+  float total_weight = first_weight + second_weight;
+
+  first_weight /= total_weight;
+  second_weight /= total_weight;
+
+  Color new_color;
+  new_color.r = static_cast<uint8_t>(std::round(
+      first_color.r * first_weight + second_color.r * second_weight));
+  new_color.g = static_cast<uint8_t>(std::round(
+      first_color.g * first_weight + second_color.g * second_weight));
+  new_color.b = static_cast<uint8_t>(std::round(
+      first_color.b * first_weight + second_color.b * second_weight));
+
+  return new_color;
+}
+
+__device__ inline bool updateVoxel(const Color color_measured,
+                                   ColorVoxel* voxel_ptr,
+                                   const float voxel_depth_m,
+                                   const float truncation_distance_m,
+                                   const float max_weight) {
+  // NOTE(alexmillane): We integrate all voxels passed to this function. We
+  // should probably not do this. We should not update some based on occlusion
+  // and their distance in the distance field....
+  // TODO(alexmillane): The above.
+
+  // Read CURRENT voxel values (from global GPU memory)
+  const Color voxel_color_current = voxel_ptr->color;
+  const float voxel_weight_current = voxel_ptr->weight;
+  // Fuse
+  constexpr float measurement_weight = 1.0f;
+  const Color fused_color =
+      blendTwoColors(voxel_color_current, voxel_weight_current,
+                     color_measured, measurement_weight);
+  const float weight =
+      fmin(measurement_weight + voxel_weight_current, max_weight);
+  // Write NEW voxel values (to global GPU memory)
+  voxel_ptr->color = fused_color;
+  voxel_ptr->weight = weight;
+  return true;
+}
+
+__global__ void integrateBlocks(
+    const Index3D* block_indices_device_ptr, const Camera camera,
+    const Color* color_image, const int color_rows, const int color_cols,
+    const float* depth_image, const int depth_rows, const int depth_cols,
+    const Transform T_C_L, const float block_size,
+    const float truncation_distance_m, const float max_weight,
+    const float max_integration_distance, const int depth_subsample_factor,
+    ColorBlock** block_device_ptrs) {
+  // Get - the image-space projection of the voxel associated with this thread
+  //     - the depth associated with the projection.
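+  // Illustrative note: this kernel is launched with one CUDA block per
+  // VoxelBlock (see updateBlocks() below), so blockIdx.x selects the voxel
+  // block and threadIdx = (x, y, z) selects a single voxel within it; each
+  // thread projects exactly one voxel into the color image.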
+  Eigen::Vector2f u_px;
+  float voxel_depth_m;
+  if (!projectThreadVoxel(block_indices_device_ptr, camera, T_C_L, block_size,
+                          &u_px, &voxel_depth_m)) {
+    return;
+  }
+
+  // If voxel further away than the limit, skip this voxel
+  if (max_integration_distance > 0.0f) {
+    if (voxel_depth_m > max_integration_distance) {
+      return;
+    }
+  }
+
+  const Eigen::Vector2f u_px_depth =
+      u_px / static_cast<float>(depth_subsample_factor);
+  float surface_depth_m;
+  if (!interpolation::interpolate2DLinear(
+          depth_image, u_px_depth, depth_rows, depth_cols, &surface_depth_m)) {
+    return;
+  }
+
+  // Occlusion testing
+  // Get the distance of the voxel from the rendered surface. If outside
+  // truncation band, skip.
+  const float voxel_distance_from_surface = surface_depth_m - voxel_depth_m;
+  if (fabsf(voxel_distance_from_surface) > truncation_distance_m) {
+    return;
+  }
+
+  Color image_value;
+  if (!interpolation::interpolate2DLinear(color_image, u_px, color_rows,
+                                          color_cols, &image_value)) {
+    return;
+  }
+
+  // Get the Voxel we'll update in this thread
+  // NOTE(alexmillane): Note that we've reversed the voxel indexing order such
+  // that adjacent threads (x-major) access adjacent memory locations in the
+  // block (z-major).
+  ColorVoxel* voxel_ptr =
+      &(block_device_ptrs[blockIdx.x]
+            ->voxels[threadIdx.z][threadIdx.y][threadIdx.x]);
+
+  // Update the voxel using the update rule for this layer type
+  updateVoxel(image_value, voxel_ptr, voxel_depth_m, truncation_distance_m,
+              max_weight);
+}
+
+void ProjectiveColorIntegrator::updateBlocks(
+    const std::vector<Index3D>& block_indices, const ColorImage& color_frame,
+    const DepthImage& depth_frame, const Transform& T_L_C,
+    const Camera& camera, const float truncation_distance_m,
+    ColorLayer* layer_ptr) {
+  CHECK_NOTNULL(layer_ptr);
+  CHECK_EQ(color_frame.rows() % depth_frame.rows(), 0);
+  CHECK_EQ(color_frame.cols() % depth_frame.cols(), 0);
+
+  if (block_indices.empty()) {
+    return;
+  }
+  const int num_blocks = block_indices.size();
+  const int depth_subsampling_factor = color_frame.rows() / depth_frame.rows();
+  CHECK_EQ(color_frame.cols() / depth_frame.cols(), depth_subsampling_factor);
+
+  // Expand the buffers when needed
+  if (num_blocks > block_indices_device_.size()) {
+    constexpr float kBufferExpansionFactor = 1.5f;
+    const int new_size =
+        static_cast<int>(kBufferExpansionFactor * num_blocks);
+    block_indices_device_.reserve(new_size);
+    block_ptrs_device_.reserve(new_size);
+    block_indices_host_.reserve(new_size);
+    block_ptrs_host_.reserve(new_size);
+  }
+
+  // Stage on the host pinned memory
+  block_indices_host_ = block_indices;
+  block_ptrs_host_ = getBlockPtrsFromIndices(block_indices, layer_ptr);
+
+  // Transfer to the device
+  block_indices_device_ = block_indices_host_;
+  block_ptrs_device_ = block_ptrs_host_;
+
+  // We need the inverse transform in the kernel
+  const Transform T_C_L = T_L_C.inverse();
+
+  // Kernel call - One ThreadBlock launched per VoxelBlock
+  constexpr int kVoxelsPerSide = VoxelBlock<ColorVoxel>::kVoxelsPerSide;
+  const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide);
+  const int num_thread_blocks = block_indices.size();
+  // clang-format off
+  integrateBlocks<<<num_thread_blocks, kThreadsPerBlock, 0, integration_stream_>>>(
+      block_indices_device_.data(),
+      camera,
+      color_frame.dataConstPtr(),
+      color_frame.rows(),
+      color_frame.cols(),
+      depth_frame.dataConstPtr(),
+      depth_frame.rows(),
+      depth_frame.cols(),
+      T_C_L,
+      layer_ptr->block_size(),
+      truncation_distance_m,
+      max_weight_,
+      max_integration_distance_m_,
+      depth_subsampling_factor,
+      block_ptrs_device_.data());
+  // clang-format on
+  checkCudaErrors(cudaPeekAtLastError());
+
+  // Finish processing of the frame before returning control
+  finish();
+}
+
+__global__ void checkBlocksInTruncationBand(
+    const VoxelBlock<TsdfVoxel>** block_device_ptrs,
+    const float truncation_distance_m,
+    bool* contains_truncation_band_device_ptr) {
+  // A single thread in each block initializes the output to 0
+  if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) {
+    contains_truncation_band_device_ptr[blockIdx.x] = 0;
+  }
+  __syncthreads();
+
+  // Get the Voxel we'll check in this thread
+  const TsdfVoxel voxel = block_device_ptrs[blockIdx.x]
+                              ->voxels[threadIdx.z][threadIdx.y][threadIdx.x];
+
+  // If this voxel is in the truncation band, write the flag to say that the
+  // block should be processed.
+  // NOTE(alexmillane): There will be collision on write here. However, from my
+  // reading, all threads' writes will result in a single write to global
+  // memory. Because we only write a single value (1) it doesn't matter which
+  // thread "wins".
+  if (std::abs(voxel.distance) <= truncation_distance_m) {
+    contains_truncation_band_device_ptr[blockIdx.x] = true;
+  }
+}
+
+std::vector<Index3D>
+ProjectiveColorIntegrator::reduceBlocksToThoseInTruncationBand(
+    const std::vector<Index3D>& block_indices, const TsdfLayer& tsdf_layer,
+    const float truncation_distance_m) {
+  // Check 1) Are the blocks allocated
+  // - performed on the CPU because the hash-map is on the CPU
+  std::vector<Index3D> block_indices_check_1;
+  block_indices_check_1.reserve(block_indices.size());
+  for (const Index3D& block_idx : block_indices) {
+    if (tsdf_layer.isBlockAllocated(block_idx)) {
+      block_indices_check_1.push_back(block_idx);
+    }
+  }
+
+  if (block_indices_check_1.empty()) {
+    return block_indices_check_1;
+  }
+
+  // Check 2) Does each of the blocks have a voxel within the truncation band
+  // - performed on the GPU because the blocks are there
+  // Get the blocks we need to check
+  std::vector<const TsdfBlock*> block_ptrs =
+      getBlockPtrsFromIndices(block_indices_check_1, tsdf_layer);
+
+  const int num_blocks = block_ptrs.size();
+
+  // Expand the buffers when needed
+  if (num_blocks > truncation_band_block_ptrs_device_.size()) {
+    constexpr float kBufferExpansionFactor = 1.5f;
+    const int new_size =
+        static_cast<int>(kBufferExpansionFactor * num_blocks);
+    truncation_band_block_ptrs_host_.reserve(new_size);
+    truncation_band_block_ptrs_device_.reserve(new_size);
+    block_in_truncation_band_device_.reserve(new_size);
+    block_in_truncation_band_host_.reserve(new_size);
+  }
+
+  // Host -> Device
+  truncation_band_block_ptrs_host_ = block_ptrs;
+  truncation_band_block_ptrs_device_ = truncation_band_block_ptrs_host_;
+
+  // Prepare output space
+  block_in_truncation_band_device_.resize(num_blocks);
+
+  // Do the check on GPU
+  // Kernel call - One ThreadBlock launched per VoxelBlock
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide);
+  const int num_thread_blocks = num_blocks;
+  // clang-format off
+  checkBlocksInTruncationBand<<<num_thread_blocks, kThreadsPerBlock, 0, integration_stream_>>>(
+      truncation_band_block_ptrs_device_.data(),
+      truncation_distance_m,
+      block_in_truncation_band_device_.data());
+  // clang-format on
+  checkCudaErrors(cudaStreamSynchronize(integration_stream_));
+  checkCudaErrors(cudaPeekAtLastError());
+
+  // Copy results back
+  block_in_truncation_band_host_ = block_in_truncation_band_device_;
+
+  // Filter the indices using the result
+  std::vector<Index3D> block_indices_check_2;
+  block_indices_check_2.reserve(block_indices_check_1.size());
+  for (int i = 0; i < block_indices_check_1.size(); i++) {
+    if (block_in_truncation_band_host_[i] == true) {
+      block_indices_check_2.push_back(block_indices_check_1[i]);
+    }
+  }
+
+  return block_indices_check_2;
+}
+
+}  // namespace nvblox
diff --git a/nvblox/src/integrators/cuda/projective_tsdf_integrator.cu b/nvblox/src/integrators/cuda/projective_tsdf_integrator.cu
new file mode 100644
index 00000000..0c28e1db
--- /dev/null
+++ b/nvblox/src/integrators/cuda/projective_tsdf_integrator.cu
@@ -0,0 +1,215 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/integrators/projective_tsdf_integrator.h"
+
+#include "nvblox/core/color.h"
+#include "nvblox/core/cuda/error_check.cuh"
+#include "nvblox/core/interpolation_2d.h"
+#include "nvblox/integrators/cuda/projective_integrators_common.cuh"
+#include "nvblox/integrators/integrators_common.h"
+#include "nvblox/utils/timing.h"
+
+namespace nvblox {
+
+ProjectiveTsdfIntegrator::ProjectiveTsdfIntegrator()
+    : ProjectiveIntegratorBase() {
+  checkCudaErrors(cudaStreamCreate(&integration_stream_));
+}
+
+ProjectiveTsdfIntegrator::~ProjectiveTsdfIntegrator() {
+  finish();
+  checkCudaErrors(cudaStreamDestroy(integration_stream_));
+}
+
+void ProjectiveTsdfIntegrator::finish() const {
+  cudaStreamSynchronize(integration_stream_);
+}
+
+void ProjectiveTsdfIntegrator::integrateFrame(
+    const DepthImage& depth_frame, const Transform& T_L_C,
+    const Camera& camera, TsdfLayer* layer,
+    std::vector<Index3D>* updated_blocks) {
+  CHECK_NOTNULL(layer);
+  timing::Timer tsdf_timer("tsdf/integrate");
+  // Metric truncation distance for this layer
+  const float voxel_size =
+      layer->block_size() / VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  const float truncation_distance_m = truncation_distance_vox_ * voxel_size;
+
+  // Identify blocks we can (potentially) see (CPU)
+  timing::Timer blocks_in_view_timer("tsdf/integrate/get_blocks_in_view");
+  const std::vector<Index3D> block_indices = getBlocksInViewUsingRaycasting(
+      depth_frame, T_L_C, camera, layer->block_size());
+  blocks_in_view_timer.Stop();
+
+  // Allocate blocks (CPU)
+  timing::Timer allocate_blocks_timer("tsdf/integrate/allocate_blocks");
+  allocateBlocksWhereRequired(block_indices, layer);
+  allocate_blocks_timer.Stop();
+
+  // Update identified blocks
+  // Calls out to the child-class implementing the integration (CPU or GPU)
+  timing::Timer update_blocks_timer("tsdf/integrate/update_blocks");
+  updateBlocks(block_indices, depth_frame, T_L_C, camera,
+               truncation_distance_m, layer);
+  update_blocks_timer.Stop();
+
+  if (updated_blocks != nullptr) {
+    *updated_blocks = block_indices;
+  }
+}
+
+__device__ inline bool updateVoxel(const float surface_depth_measured,
+                                   TsdfVoxel* voxel_ptr,
+                                   const float voxel_depth_m,
+                                   const float truncation_distance_m,
+                                   const float max_weight) {
+  // Get the MEASURED depth of the VOXEL
+  const float voxel_distance_measured = surface_depth_measured - voxel_depth_m;
+
+  // If we're behind the negative truncation distance, just continue.
+ if (voxel_distance_measured < -truncation_distance_m) { + return false; + } + + // Read CURRENT voxel values (from global GPU memory) + const float voxel_distance_current = voxel_ptr->distance; + const float voxel_weight_current = voxel_ptr->weight; + + // NOTE(alexmillane): We could try to use CUDA math functions to speed up + // below + // https://docs.nvidia.com/cuda/cuda-math-api/group__CUDA__MATH__SINGLE.html#group__CUDA__MATH__SINGLE + + // Fuse + constexpr float measurement_weight = 1.0f; + float fused_distance = (voxel_distance_measured * measurement_weight + + voxel_distance_current * voxel_weight_current) / + (measurement_weight + voxel_weight_current); + + // Clip + if (fused_distance > 0.0f) { + fused_distance = fmin(truncation_distance_m, fused_distance); + } else { + fused_distance = fmax(-truncation_distance_m, fused_distance); + } + const float weight = + fmin(measurement_weight + voxel_weight_current, max_weight); + + // Write NEW voxel values (to global GPU memory) + voxel_ptr->distance = fused_distance; + voxel_ptr->weight = weight; + return true; +} + +__global__ void integrateBlocks(const Index3D* block_indices_device_ptr, + const Camera camera, const float* image, + int rows, int cols, const Transform T_C_L, + const float block_size, + const float truncation_distance_m, + const float max_weight, + const float max_integration_distance, + TsdfBlock** block_device_ptrs) { + // Get - the image-space projection of the voxel associated with this thread + // - the depth associated with the projection. + Eigen::Vector2f u_px; + float voxel_depth_m; + if (!projectThreadVoxel(block_indices_device_ptr, camera, T_C_L, block_size, + &u_px, &voxel_depth_m)) { + return; + } + + // If voxel further away than the limit, skip this voxel + if (max_integration_distance > 0.0f) { + if (voxel_depth_m > max_integration_distance) { + return; + } + } + + float image_value; + if (!interpolation::interpolate2DLinear(image, u_px, rows, cols, + &image_value)) { + return; + } + + // Get the Voxel we'll update in this thread + // NOTE(alexmillane): Note that we've reverse the voxel indexing order such + // that adjacent threads (x-major) access adjacent memory locations in the + // block (z-major). 
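+  // Illustrative example of the access pattern: threads (0,0,0), (1,0,0),
+  // (2,0,0), ... of a warp read voxels[0][0][0], voxels[0][0][1],
+  // voxels[0][0][2], ..., i.e. consecutive addresses, so the global-memory
+  // reads and writes coalesce.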
+ TsdfVoxel* voxel_ptr = &(block_device_ptrs[blockIdx.x] + ->voxels[threadIdx.z][threadIdx.y][threadIdx.x]); + + // Update the voxel using the update rule for this layer type + updateVoxel(image_value, voxel_ptr, voxel_depth_m, truncation_distance_m, + max_weight); +} + +void ProjectiveTsdfIntegrator::updateBlocks( + const std::vector& block_indices, const DepthImage& depth_frame, + const Transform& T_L_C, const Camera& camera, + const float truncation_distance_m, TsdfLayer* layer_ptr) { + CHECK_NOTNULL(layer_ptr); + + if (block_indices.empty()) { + return; + } + const int num_blocks = block_indices.size(); + + // Expand the buffers when needed + if (num_blocks > block_indices_device_.size()) { + constexpr float kBufferExpansionFactor = 1.5f; + const int new_size = + static_cast(kBufferExpansionFactor * num_blocks); + block_indices_device_.reserve(new_size); + block_ptrs_device_.reserve(new_size); + block_indices_host_.reserve(new_size); + block_ptrs_host_.reserve(new_size); + } + + // Stage on the host pinned memory + block_indices_host_ = block_indices; + block_ptrs_host_ = getBlockPtrsFromIndices(block_indices, layer_ptr); + + // Transfer to the device + block_indices_device_ = block_indices_host_; + block_ptrs_device_ = block_ptrs_host_; + + // We need the inverse transform in the kernel + const Transform T_C_L = T_L_C.inverse(); + + // Kernel call - One ThreadBlock launched per VoxelBlock + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide); + const int num_thread_blocks = num_blocks; + // clang-format off + integrateBlocks<<>>( + block_indices_device_.data(), + camera, + depth_frame.dataConstPtr(), + depth_frame.rows(), + depth_frame.cols(), + T_C_L, + layer_ptr->block_size(), + truncation_distance_m, + max_weight_, + max_integration_distance_m_, + block_ptrs_device_.data()); + // clang-format on + checkCudaErrors(cudaPeekAtLastError()); + + // Finish processing of the frame before returning control + finish(); +} + +} // namespace nvblox diff --git a/nvblox/src/integrators/esdf_integrator.cpp b/nvblox/src/integrators/esdf_integrator.cpp new file mode 100644 index 00000000..7311e8a8 --- /dev/null +++ b/nvblox/src/integrators/esdf_integrator.cpp @@ -0,0 +1,830 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/hash.h" +#include "nvblox/utils/timing.h" + +#include "nvblox/integrators/esdf_integrator.h" + +namespace nvblox { + +// Integrate the entire layer. +void EsdfIntegrator::integrateLayer(const TsdfLayer& tsdf_layer, + EsdfLayer* esdf_layer) { + std::vector block_indices = tsdf_layer.getAllBlockIndices(); + constexpr DeviceType device_type = DeviceType::kGPU; + if (device_type == DeviceType::kGPU) { + integrateBlocksOnGPU(tsdf_layer, block_indices, esdf_layer); + } else { + integrateBlocksOnCPU(tsdf_layer, block_indices, esdf_layer); + } +} + +// Integrate specific blocks in a layer. 
+void EsdfIntegrator::integrateBlocks(const TsdfLayer& tsdf_layer,
+                                     const std::vector<Index3D>& block_indices,
+                                     EsdfLayer* esdf_layer) {
+  integrateBlocksOnGPU(tsdf_layer, block_indices, esdf_layer);
+}
+
+void EsdfIntegrator::integrateBlocksOnCPU(
+    const TsdfLayer& tsdf_layer, const std::vector<Index3D>& block_indices,
+    EsdfLayer* esdf_layer) {
+  timing::Timer esdf_timer("esdf/integrate");
+
+  timing::Timer allocate_timer("esdf/integrate/allocate");
+  // First, allocate all the destination blocks.
+  allocateBlocksOnCPU(block_indices, esdf_layer);
+  allocate_timer.Stop();
+
+  timing::Timer mark_timer("esdf/integrate/mark_sites");
+  // Then, mark all the sites on CPU.
+  // This finds all the blocks that are eligible to be parents.
+  std::vector<Index3D> blocks_with_sites;
+  std::vector<Index3D> blocks_to_clear;
+  markAllSitesOnCPU(tsdf_layer, block_indices, esdf_layer, &blocks_with_sites,
+                    &blocks_to_clear);
+  mark_timer.Stop();
+
+  timing::Timer clear_timer("esdf/integrate/clear_invalid");
+  std::vector<Index3D> cleared_blocks;
+
+  clearInvalidOnCPU(blocks_to_clear, esdf_layer, &cleared_blocks);
+
+  VLOG(3) << "Invalid blocks cleared: " << cleared_blocks.size();
+  clear_timer.Stop();
+
+  timing::Timer compute_timer("esdf/integrate/compute");
+  // Parallel block banding on CPU.
+  // First we call all the blocks with sites to propagate the distances out
+  // from the sites.
+  computeEsdfOnCPU(blocks_with_sites, esdf_layer);
+  // In case some blocks were cleared (otherwise this is a no-op), we also need
+  // to make sure any cleared blocks get their values updated. This simply
+  // ensures that their values are up-to-date.
+  // TODO(helen): check if we can just roll cleared blocks into the list with
+  // blocks with sites and only call it once. Compare how the speed varies.
+  computeEsdfOnCPU(cleared_blocks, esdf_layer);
+  compute_timer.Stop();
+}
+
+void EsdfIntegrator::allocateBlocksOnCPU(
+    const std::vector<Index3D>& block_indices, EsdfLayer* esdf_layer) {
+  // We want to allocate all ESDF layer blocks and copy over the sites.
+  for (const Index3D& block_index : block_indices) {
+    esdf_layer->allocateBlockAtIndex(block_index);
+  }
+}
+
+void EsdfIntegrator::markAllSitesOnCPU(
+    const TsdfLayer& tsdf_layer, const std::vector<Index3D>& block_indices,
+    EsdfLayer* esdf_layer, std::vector<Index3D>* blocks_with_sites,
+    std::vector<Index3D>* blocks_to_clear) {
+  CHECK_NOTNULL(esdf_layer);
+  CHECK_NOTNULL(blocks_with_sites);
+
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  const float voxel_size = tsdf_layer.block_size() / kVoxelsPerSide;
+  const float max_distance_vox = max_distance_m_ / voxel_size;
+  const float max_squared_distance_vox = max_distance_vox * max_distance_vox;
+
+  // Cache the minimum distance in metric size.
+ const float min_site_distance_m = min_site_distance_vox_ * voxel_size; + int num_observed = 0; + for (const Index3D& block_index : block_indices) { + EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(block_index); + TsdfBlock::ConstPtr tsdf_block = tsdf_layer.getBlockAtIndex(block_index); + + if (!esdf_block || !tsdf_block) { + LOG(ERROR) << "Somehow trying to update non-existent blocks!"; + continue; + } + + bool block_has_sites = false; + bool has_observed = false; + bool block_to_clear = false; + // Iterate over all the voxels: + Index3D voxel_index; + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + // Get the voxel and call the callback on it: + const TsdfVoxel* tsdf_voxel = + &tsdf_block + ->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + EsdfVoxel* esdf_voxel = + &esdf_block + ->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + if (tsdf_voxel->weight >= min_weight_) { + esdf_voxel->observed = true; + // Mark as inside if the voxel distance is negative. + has_observed = true; + bool now_inside = tsdf_voxel->distance <= 0; + if (esdf_voxel->is_inside && now_inside == false) { + block_to_clear = true; + esdf_voxel->squared_distance_vox = max_squared_distance_vox; + esdf_voxel->parent_direction.setZero(); + } + esdf_voxel->is_inside = now_inside; + if (now_inside && + std::abs(tsdf_voxel->distance) <= min_site_distance_m) { + esdf_voxel->is_site = true; + esdf_voxel->squared_distance_vox = 0.0f; + esdf_voxel->parent_direction.setZero(); + block_has_sites = true; + } else { + if (esdf_voxel->is_site) { + esdf_voxel->is_site = false; + block_to_clear = true; + // This block needs to be cleared since something became an + // un-site. + esdf_voxel->squared_distance_vox = max_squared_distance_vox; + esdf_voxel->parent_direction.setZero(); + // Other case is a brand new voxel. + } else if (esdf_voxel->squared_distance_vox <= 0.0f) { + esdf_voxel->squared_distance_vox = max_squared_distance_vox; + esdf_voxel->parent_direction.setZero(); + } + } + } + } + } + } + if (block_has_sites) { + blocks_with_sites->push_back(block_index); + } + if (block_to_clear) { + blocks_to_clear->push_back(block_index); + } + if (has_observed) { + num_observed++; + } + } +} + +void EsdfIntegrator::clearInvalidOnCPU( + const std::vector& blocks_to_clear, EsdfLayer* esdf_layer, + std::vector* updated_blocks) { + CHECK_NOTNULL(esdf_layer); + CHECK_NOTNULL(updated_blocks); + + updated_blocks->clear(); + Index3DSet all_cleared_blocks; + + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const float voxel_size = esdf_layer->block_size() / kVoxelsPerSide; + const float max_distance_vox = max_distance_m_ / voxel_size; + const float max_squared_distance_vox = max_distance_vox * max_distance_vox; + + Index3DSet neighbor_clearing_set; + + int cleared_voxels = 0; + + // Ok so the goal is to basically check the neighbors along the edges, + // and then continue sweeping if necessary. + + // Assumptions: all blocks with sites are correct now (they got updated + // earlier). If a block contains an invalid site, it is also present along the + // edge. 
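+  // Concrete example (an illustration, assuming the usual parent-chain
+  // semantics): if a site voxel S was removed this frame, every voxel whose
+  // parent chain terminates at S now stores a stale distance. The loop below
+  // resets such voxels inside each flagged block to max_squared_distance_vox;
+  // the neighbor sweep afterwards chases chains that cross block boundaries.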
+ for (const Index3D& block_index : blocks_to_clear) { + VLOG(3) << "Clearing WITHIN block: " << block_index.transpose(); + + EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(block_index); + if (!esdf_block) { + continue; + } + bool any_cleared = false; + // First we just reset all the voxels within the block that point to + // non-sites. + Index3D voxel_index; + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + EsdfVoxel* esdf_voxel = + &esdf_block + ->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + if (!esdf_voxel->observed || esdf_voxel->is_site || + esdf_voxel->squared_distance_vox >= max_squared_distance_vox) { + continue; + } + + // Get the parent. + Index3D parent_index = voxel_index + esdf_voxel->parent_direction; + + // Check if the voxel is within the same block. + if (parent_index.x() < 0 || parent_index.x() >= kVoxelsPerSide || + parent_index.y() < 0 || parent_index.y() >= kVoxelsPerSide || + parent_index.z() < 0 || parent_index.z() >= kVoxelsPerSide) { + continue; + } + + // Ok check if the parent index is a site. + if (!esdf_block + ->voxels[parent_index.x()][parent_index.y()] + [parent_index.z()] + .is_site) { + clearVoxel(esdf_voxel, max_squared_distance_vox); + cleared_voxels++; + any_cleared = true; + } + } + } + } + + // Iterate over all the neighbors. + // If any of the neighbors reach the edge in clearing, we need to continue + // updating neighbors. + if (any_cleared) { + neighbor_clearing_set.insert(block_index); + all_cleared_blocks.insert(block_index); + } + } + // Ok after we're done with everything, also update any neighbors that need + // clearing. + Index3DSet new_neighbor_clearing_set; + + while (!neighbor_clearing_set.empty()) { + for (const Index3D& block_index : neighbor_clearing_set) { + VLOG(3) << "Clearing block: " << block_index.transpose(); + clearNeighborsOnCPU(block_index, esdf_layer, &new_neighbor_clearing_set); + all_cleared_blocks.insert(block_index); + } + VLOG(3) << "Num neighbors to clear: " << new_neighbor_clearing_set.size(); + std::swap(new_neighbor_clearing_set, neighbor_clearing_set); + new_neighbor_clearing_set.clear(); + } + + VLOG(3) << "Updated voxels by clearing: " << cleared_voxels; + + std::copy(all_cleared_blocks.begin(), all_cleared_blocks.end(), + std::back_inserter(*updated_blocks)); +} + +// This function was written for testing -- rather than incrementally searching +// for cleared blocks from an initial seed lsit, this iterates over the entire +// map and checks the parent of every single voxel. This cannot be parallelized +// without having the whole hash table on GPU, which is why we went with the +// other method for parallelization. +void EsdfIntegrator::clearAllInvalidOnCPU( + EsdfLayer* esdf_layer, std::vector* updated_blocks) { + // Go through all blocks in the map, clearing any that point to an invalid + // parent. 
+ std::vector block_indices = esdf_layer->getAllBlockIndices(); + + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const float voxel_size = esdf_layer->block_size() / kVoxelsPerSide; + const float max_distance_vox = max_distance_m_ / voxel_size; + const float max_squared_distance_vox = max_distance_vox * max_distance_vox; + + Index3DSet neighbor_clearing_set; + + int cleared_voxels = 0; + + // Ok so the goal is to basically check the neighbors along the edges, + // and then continue sweeping if necessary. + + // Assumptions: all blocks with sites are correct now (they got updated + // earlier). If a block contains an invalid site, it is also present along the + // edge. + for (const Index3D& block_index : block_indices) { + EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(block_index); + if (!esdf_block) { + continue; + } + bool any_cleared = false; + + Index3D voxel_index; + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + EsdfVoxel* esdf_voxel = + &esdf_block + ->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + if (!esdf_voxel->observed || esdf_voxel->is_site) { + continue; + } + + // Get the parent. + Index3D parent_index = voxel_index + esdf_voxel->parent_direction; + + if (esdf_voxel->parent_direction == Index3D::Zero()) { + continue; + } + + // Check if the voxel is within the same block. + if (parent_index.x() < 0 || parent_index.x() >= kVoxelsPerSide || + parent_index.y() < 0 || parent_index.y() >= kVoxelsPerSide || + parent_index.z() < 0 || parent_index.z() >= kVoxelsPerSide) { + // Then we need to get the block index. + Index3D neighbor_block_index = block_index; + Index3D neighbor_voxel_index = parent_index; + + // Find the parent index. + while (parent_index.x() >= kVoxelsPerSide) { + parent_index.x() -= kVoxelsPerSide; + neighbor_block_index.x()++; + } + while (parent_index.y() >= kVoxelsPerSide) { + parent_index.y() -= kVoxelsPerSide; + neighbor_block_index.y()++; + } + while (parent_index.z() >= kVoxelsPerSide) { + parent_index.z() -= kVoxelsPerSide; + neighbor_block_index.z()++; + } + + while (parent_index.x() < 0) { + parent_index.x() += kVoxelsPerSide; + neighbor_block_index.x()--; + } + while (parent_index.y() < 0) { + parent_index.y() += kVoxelsPerSide; + neighbor_block_index.y()--; + } + while (parent_index.z() < 0) { + parent_index.z() += kVoxelsPerSide; + neighbor_block_index.z()--; + } + + EsdfBlock::ConstPtr neighbor_block = + esdf_layer->getBlockAtIndex(neighbor_block_index); + const EsdfVoxel* neighbor_voxel = + &neighbor_block->voxels[parent_index.x()][parent_index.y()] + [parent_index.z()]; + if (!neighbor_voxel->is_site) { + clearVoxel(esdf_voxel, max_squared_distance_vox); + cleared_voxels++; + any_cleared = true; + } + } else { + // Ok check if the parent index is a site. 
+ if (!esdf_block + ->voxels[parent_index.x()][parent_index.y()] + [parent_index.z()] + .is_site) { + clearVoxel(esdf_voxel, max_squared_distance_vox); + + cleared_voxels++; + any_cleared = true; + } + } + } + } + } + if (any_cleared) { + updated_blocks->push_back(block_index); + } + } + VLOG(3) << "Cleared voxels in batch: " << cleared_voxels; +} + +void EsdfIntegrator::clearNeighborsOnCPU(const Index3D& block_index, + EsdfLayer* esdf_layer, + Index3DSet* neighbor_clearing_set) { + CHECK_NOTNULL(esdf_layer); + CHECK_NOTNULL(neighbor_clearing_set); + + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const float voxel_size = esdf_layer->block_size() / kVoxelsPerSide; + const float max_distance_vox = max_distance_m_ / voxel_size; + const float max_squared_distance_vox = max_distance_vox * max_distance_vox; + constexpr int kNumSweepDir = 3; + + int cleared_voxels = 0; + + EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(block_index); + if (!esdf_block) { + LOG(ERROR) << "For some reason trying to clear an unallocated block."; + return; + } + + Index3D neighbor_index = block_index; + for (int axis_to_sweep = 0; axis_to_sweep < kNumSweepDir; axis_to_sweep++) { + for (int direction = -1; direction <= 1; direction += 2) { + // Get this neighbor. + bool block_updated = false; + neighbor_index = block_index; + neighbor_index(axis_to_sweep) += direction; + + EsdfBlock::Ptr neighbor_block = + esdf_layer->getBlockAtIndex(neighbor_index); + if (!neighbor_block) { + continue; + } + + VLOG(3) << "Block index: " << block_index.transpose() + << " Neighbor index: " << neighbor_index.transpose() + << " axis: " << axis_to_sweep << " dir: " << direction; + + // Iterate over the axes per neighbor. + // First we look negative. + int voxel_index = 0; + int opposite_voxel_index = kVoxelsPerSide; + // Then we look positive. + if (direction > 0) { + voxel_index = kVoxelsPerSide - 1; + opposite_voxel_index = -1; + } + + Index3D index = Index3D::Zero(); + index(axis_to_sweep) = voxel_index; + int first_axis = 0, second_axis = 1; + getSweepAxes(axis_to_sweep, &first_axis, &second_axis); + for (index(first_axis) = 0; index(first_axis) < kVoxelsPerSide; + index(first_axis)++) { + for (index(second_axis) = 0; index(second_axis) < kVoxelsPerSide; + index(second_axis)++) { + // This is the closest ESDF voxel in our block to this guy. + const EsdfVoxel* esdf_voxel = + &esdf_block->voxels[index.x()][index.y()][index.z()]; + + // If this isn't a cleared voxel, don't bother. + if (/*!esdf_voxel->observed || */ + esdf_voxel->parent_direction != Index3D::Zero()) { + continue; + } + + for (int increment = 1; increment <= kVoxelsPerSide; increment++) { + // Basically we just want to check if the edges' neighbors point + // to something valid. + Index3D neighbor_voxel_index = index; + // Select the opposite voxel index. + neighbor_voxel_index(axis_to_sweep) = + opposite_voxel_index + increment * direction; + EsdfVoxel* neighbor_voxel = + &neighbor_block->voxels[neighbor_voxel_index.x()] + [neighbor_voxel_index.y()] + [neighbor_voxel_index.z()]; + + // If either this was never set or is a site itself, we don't + // care. 
+ if (!neighbor_voxel->observed || neighbor_voxel->is_site || + neighbor_voxel->squared_distance_vox >= + max_squared_distance_vox) { + VLOG(3) << "Skipping because the voxel isn't updateable."; + continue; + } + VLOG(3) << "Block index: " << block_index.transpose() + << " voxel index: " << index.transpose() + << " Neighbor voxel index: " + << neighbor_voxel_index.transpose() + << " Distance: " << neighbor_voxel->squared_distance_vox + << " Parent direction: " + << neighbor_voxel->parent_direction.transpose() + << " Sweep axis: " << axis_to_sweep + << " direction: " << direction + << " increment: " << increment; + + // If the direction of this neighbor isn't pointing in our + // direction, we also don't care. + Index3D parent_voxel_dir = neighbor_voxel->parent_direction; + if ((direction > 0 && parent_voxel_dir(axis_to_sweep) >= 0) || + (direction < 0 && parent_voxel_dir(axis_to_sweep) <= 0)) { + VLOG(3) << "Skipping because direction doesn't match. Parent " + "voxel dir: " + << parent_voxel_dir(axis_to_sweep) + << " Axis: " << axis_to_sweep + << " direction: " << direction; + break; + } + + // If it is, and the matching ESDF voxel parent doesn't match, + // then it needs to be cleared. + // Actually we can just clear it if the neighbor is cleared. + clearVoxel(neighbor_voxel, max_squared_distance_vox); + block_updated = true; + cleared_voxels++; + VLOG(3) << "Clearing voxel."; + + // We need to continue sweeping in this direction to clear the + // rest of the band. Luckily this is already in the structure of + // the for loop. + } + } + } + if (block_updated) { + neighbor_clearing_set->insert(neighbor_index); + } + } + } + VLOG(3) << "Updated neighbor voxels by clearing: " << cleared_voxels + << " updated blocks: " << neighbor_clearing_set->size(); +} + +void EsdfIntegrator::computeEsdfOnCPU( + const std::vector& blocks_with_sites, EsdfLayer* esdf_layer) { + CHECK_NOTNULL(esdf_layer); + + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const float voxel_size = esdf_layer->block_size() / kVoxelsPerSide; + + // First we go over all of the blocks with sites. + // We compute all the proximal sites inside the block first. + for (const Index3D& block_index : blocks_with_sites) { + EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(block_index); + for (int i = 0; i < 3; i++) { + sweepBlockBandOnCPU(i, voxel_size, esdf_block.get()); + } + } + + std::vector updated_blocks; + std::vector blocks_to_run = blocks_with_sites; + int i = 0; + do { + updated_blocks.clear(); + // We then propagate the edges of these updated blocks to their immediate + // neighbors. + for (const Index3D& block_index : blocks_to_run) { + updateLocalNeighborBandsOnCPU(block_index, esdf_layer, &updated_blocks); + } + + // Then we update the inside values of these neighbors. + for (const Index3D& block_index : updated_blocks) { + EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(block_index); + for (int i = 0; i < 3; i++) { + sweepBlockBandOnCPU(i, voxel_size, esdf_block.get()); + } + } + Index3DSet updated_set(updated_blocks.begin(), updated_blocks.end()); + blocks_to_run.assign(updated_set.begin(), updated_set.end()); + VLOG(3) << "Update " << i << " updated blocks: " << blocks_to_run.size(); + i++; + // We continue updating in a brushfire pattern until no more blocks are + // getting updated. 
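+    // For example: a new site in block (0,0,0) first updates that block's
+    // interior, pass 1 then pushes its edge values into the six face
+    // neighbors, pass 2 into their neighbors, and so on until a pass leaves
+    // updated_blocks empty.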
+ } while (!updated_blocks.empty()); +} + +void EsdfIntegrator::updateLocalNeighborBandsOnCPU( + const Index3D& block_index, EsdfLayer* esdf_layer, + std::vector* updated_blocks) { + constexpr int kNumSweepDir = 3; + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + constexpr float kAlmostZero = 1e-4; + const float voxel_size = esdf_layer->block_size() / kVoxelsPerSide; + const float max_distance_vox = max_distance_m_ / voxel_size; + const float max_squared_distance_vox = max_distance_vox * max_distance_vox; + + // Get the center block. + EsdfBlock::Ptr esdf_block = esdf_layer->getBlockAtIndex(block_index); + if (!esdf_block) { + return; + } + + // Iterate over all the neighbors. + Index3D neighbor_index = block_index; + for (int axis_to_sweep = 0; axis_to_sweep < kNumSweepDir; axis_to_sweep++) { + for (int direction = -1; direction <= 1; direction += 2) { + // Get this neighbor. + bool block_updated = false; + neighbor_index = block_index; + neighbor_index(axis_to_sweep) += direction; + + EsdfBlock::Ptr neighbor_block = + esdf_layer->getBlockAtIndex(neighbor_index); + if (!neighbor_block) { + continue; + } + + // Iterate over the axes per neighbor. + // First we look negative. + int voxel_index = 0; + int opposite_voxel_index = kVoxelsPerSide - 1; + // Then we look positive. + if (direction > 0) { + voxel_index = opposite_voxel_index; + opposite_voxel_index = 0; + } + + Index3D index = Index3D::Zero(); + index(axis_to_sweep) = voxel_index; + int first_axis = 0, second_axis = 1; + getSweepAxes(axis_to_sweep, &first_axis, &second_axis); + for (index(first_axis) = 0; index(first_axis) < kVoxelsPerSide; + index(first_axis)++) { + for (index(second_axis) = 0; index(second_axis) < kVoxelsPerSide; + index(second_axis)++) { + // This is the bottom interface. So pixel 0 of our block should be + // pixel kVoxelsPerSide-1 of the search block. + const EsdfVoxel* esdf_voxel = + &esdf_block->voxels[index.x()][index.y()][index.z()]; + Index3D neighbor_voxel_index = index; + // Select the opposite voxel index. + neighbor_voxel_index(axis_to_sweep) = opposite_voxel_index; + EsdfVoxel* neighbor_voxel = + &neighbor_block + ->voxels[neighbor_voxel_index.x()][neighbor_voxel_index.y()] + [neighbor_voxel_index.z()]; + + if (!esdf_voxel->observed || !neighbor_voxel->observed || + neighbor_voxel->is_site || + esdf_voxel->squared_distance_vox >= max_squared_distance_vox) { + continue; + } + // Determine if we can update this. + Eigen::Vector3i potential_direction = esdf_voxel->parent_direction; + potential_direction(axis_to_sweep) += -direction; + float potential_distance = potential_direction.squaredNorm(); + if (neighbor_voxel->squared_distance_vox > potential_distance) { + neighbor_voxel->parent_direction = potential_direction; + neighbor_voxel->squared_distance_vox = potential_distance; + block_updated = true; + } + } + } + if (block_updated) { + updated_blocks->push_back(neighbor_index); + } + } + } +} + +void EsdfIntegrator::propagateEdgesOnCPU(int axis_to_sweep, float voxel_size, + EsdfBlock* esdf_block) { + CHECK_NOTNULL(esdf_block); + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + constexpr float kAlmostZero = 1e-4; + const float max_distance_vox = max_distance_m_ / voxel_size; + const float max_squared_distance_vox = max_distance_vox * max_distance_vox; + + // Select the axes to increment. + int first_axis = 0, second_axis = 1; + getSweepAxes(axis_to_sweep, &first_axis, &second_axis); + + // Just need to sweep once through. 
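+  // Sketch of the update rule: for a band voxel at position k along the
+  // sweep axis, the candidate parent through the bottom edge (k = 0) is
+  // (bottom_parent_relative - index). E.g. if the bottom voxel's parent sits
+  // two voxels below the block boundary, the voxel at k = 3 sees a candidate
+  // squared distance of (3 + 2)^2 = 25 vox^2 along that axis (plus any
+  // lateral offset), and takes it only if it beats its current value.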
+  Index3D index = Index3D::Zero();
+  for (index(first_axis) = 0; index(first_axis) < kVoxelsPerSide;
+       index(first_axis)++) {
+    for (index(second_axis) = 0; index(second_axis) < kVoxelsPerSide;
+         index(second_axis)++) {
+      // Get the min and max of the band.
+      Index3D bottom_index = index;
+      bottom_index(axis_to_sweep) = 0;
+      Index3D top_index = index;
+      top_index(axis_to_sweep) = kVoxelsPerSide - 1;
+      EsdfVoxel* bottom_voxel =
+          &esdf_block
+               ->voxels[bottom_index.x()][bottom_index.y()][bottom_index.z()];
+      EsdfVoxel* top_voxel =
+          &esdf_block->voxels[top_index.x()][top_index.y()][top_index.z()];
+      Eigen::Vector3i bottom_parent_relative = bottom_voxel->parent_direction;
+      bottom_parent_relative += bottom_index;
+      Eigen::Vector3i top_parent_relative = top_voxel->parent_direction;
+      top_parent_relative += top_index;
+
+      // Check whether the band end voxels are actually valid (observed and
+      // within the maximum distance).
+      bool top_valid =
+          top_voxel->observed &&
+          top_voxel->squared_distance_vox < max_squared_distance_vox;
+      bool bottom_valid =
+          bottom_voxel->observed &&
+          bottom_voxel->squared_distance_vox < max_squared_distance_vox;
+
+      for (index(axis_to_sweep) = 0; index(axis_to_sweep) < kVoxelsPerSide;
+           index(axis_to_sweep)++) {
+        EsdfVoxel* esdf_voxel =
+            &esdf_block->voxels[index.x()][index.y()][index.z()];
+        if (!esdf_voxel->observed || esdf_voxel->is_site) {
+          continue;
+        }
+        // Check if we should update this with either the top or the bottom
+        // voxel.
+        if (bottom_valid) {
+          Eigen::Vector3i potential_direction = bottom_parent_relative - index;
+          float potential_distance = potential_direction.squaredNorm();
+          if (esdf_voxel->squared_distance_vox > potential_distance) {
+            esdf_voxel->parent_direction = potential_direction;
+            esdf_voxel->squared_distance_vox = potential_distance;
+          }
+        }
+        if (top_valid) {
+          Eigen::Vector3i potential_direction = top_parent_relative - index;
+          float potential_distance = potential_direction.squaredNorm();
+          if (esdf_voxel->squared_distance_vox > potential_distance) {
+            esdf_voxel->parent_direction = potential_direction;
+            esdf_voxel->squared_distance_vox = potential_distance;
+          }
+        }
+      }
+    }
+  }
+}
+
+void EsdfIntegrator::sweepBlockBandOnCPU(int axis_to_sweep, float voxel_size,
+                                         EsdfBlock* esdf_block) {
+  constexpr int kVoxelsPerSide = VoxelBlock<EsdfVoxel>::kVoxelsPerSide;
+  const float max_distance_vox = max_distance_m_ / voxel_size;
+  const float max_squared_distance_vox = max_distance_vox * max_distance_vox;
+
+  // Select the axes to increment.
+  int first_axis = 0, second_axis = 1;
+  getSweepAxes(axis_to_sweep, &first_axis, &second_axis);
+
+  Index3D index = Index3D::Zero();
+
+  // Keep track of whether we've seen any sites so far.
+  // The secondary pass has "fake" sites; i.e., sites outside of the block.
+  Index3D last_site = Index3D::Zero();
+  bool site_found = false;
+  for (index(first_axis) = 0; index(first_axis) < kVoxelsPerSide;
+       index(first_axis)++) {
+    for (index(second_axis) = 0; index(second_axis) < kVoxelsPerSide;
+         index(second_axis)++) {
+      // First we sweep forward, then backwards.
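+      // 1-D example along the sweep axis (8 voxels per side, all non-site
+      // voxels observed but still at the max distance): with a single site
+      // at k = 2, the forward pass assigns k = 3..7 squared distances
+      // 1, 4, 9, 16, 25, and the backward pass then fills k = 1 and k = 0
+      // with 1 and 4.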
+ for (int i = 0; i < 2; i++) { + last_site = Index3D::Zero(); + site_found = false; + int direction = 1; + int start_voxel = 0; + int end_voxel = kVoxelsPerSide; + if (i == 1) { + direction = -1; + start_voxel = kVoxelsPerSide - 1; + end_voxel = -1; + } + for (index(axis_to_sweep) = start_voxel; + index(axis_to_sweep) != end_voxel; + index(axis_to_sweep) += direction) { + EsdfVoxel* esdf_voxel = + &esdf_block->voxels[index.x()][index.y()][index.z()]; + if (!esdf_voxel->observed) { + continue; + } + // If this voxel is itself a site, then mark this for future voxels. + if (esdf_voxel->is_site) { + last_site = index; + site_found = true; + } else if (!site_found) { + // If this voxel isn't a site but we haven't found a site yet, + // then if this voxel is valid we set it as the site. + if (esdf_voxel->squared_distance_vox < max_squared_distance_vox) { + site_found = true; + last_site = esdf_voxel->parent_direction + index; + } + } else { + // If we've found the site, then should just decide what to do + // here. + Index3D potential_direction = last_site - index; + float potential_distance = potential_direction.squaredNorm(); + // Either it hasn't been set at all or it's closer to the site + // than to its current value. + if (esdf_voxel->squared_distance_vox > potential_distance) { + esdf_voxel->parent_direction = potential_direction; + esdf_voxel->squared_distance_vox = potential_distance; + } else if (esdf_voxel->squared_distance_vox < + max_squared_distance_vox) { + // If the current value is a better site, then set it as a site. + last_site = esdf_voxel->parent_direction + index; + } + } + } + } + } + } +} + +void EsdfIntegrator::getSweepAxes(int axis_to_sweep, int* first_axis, + int* second_axis) const { + // Pick an order of the axes to sweep. + switch (axis_to_sweep) { + case 0: + *first_axis = 1; + *second_axis = 2; + break; + case 1: + *first_axis = 0; + *second_axis = 2; + break; + case 2: + *first_axis = 0; + *second_axis = 1; + break; + } +} + +void EsdfIntegrator::clearVoxel(EsdfVoxel* voxel, + float max_squared_distance_vox) { + voxel->parent_direction.setZero(); + voxel->squared_distance_vox = max_squared_distance_vox; +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/integrators/frustum.cpp b/nvblox/src/integrators/frustum.cpp new file mode 100644 index 00000000..70f86dd0 --- /dev/null +++ b/nvblox/src/integrators/frustum.cpp @@ -0,0 +1,61 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "nvblox/integrators/frustum.h" + +#include "nvblox/core/bounding_boxes.h" + +namespace nvblox { + +std::vector FrustumCalculator::getBlocksInView( + const Transform& T_L_C, const Camera& camera, const float block_size, + const float max_distance) { + CHECK_GT(max_distance, 0.0f); + + // View frustum + const Frustum frustum = camera.getViewFrustum(T_L_C, 0.0f, max_distance); + + // Coarse bound: AABB + const AxisAlignedBoundingBox aabb_L = frustum.getAABB(); + std::vector block_indices_in_aabb = + getBlockIndicesTouchedByBoundingBox(block_size, aabb_L); + + // Tight bound: View frustum + std::vector block_indices_in_frustum; + for (const Index3D& block_index : block_indices_in_aabb) { + const AxisAlignedBoundingBox& aabb_block = + getAABBOfBlock(block_size, block_index); + if (frustum.isAABBInView(aabb_block)) { + block_indices_in_frustum.push_back(block_index); + } + } + return block_indices_in_frustum; +} + +std::vector FrustumCalculator::getBlocksInImageView( + const DepthImage& depth_frame, const Transform& T_L_C, const Camera& camera, + const float block_size, const float truncation_distance_m, + const float max_integration_distance_m) { + float min_depth, max_depth; + std::tie(min_depth, max_depth) = image::minmaxGPU(depth_frame); + float max_depth_plus_trunc = max_depth + truncation_distance_m; + if (max_integration_distance_m > 0.0f) { + max_depth_plus_trunc = + std::min(max_depth_plus_trunc, max_integration_distance_m); + } + return getBlocksInView(T_L_C, camera, block_size, max_depth_plus_trunc); +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/integrators/projective_integrator_base.cpp b/nvblox/src/integrators/projective_integrator_base.cpp new file mode 100644 index 00000000..5e5dd7ac --- /dev/null +++ b/nvblox/src/integrators/projective_integrator_base.cpp @@ -0,0 +1,78 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+#include "nvblox/integrators/projective_integrator_base.h"
+
+#include "nvblox/core/bounding_boxes.h"
+
+namespace nvblox {
+
+float ProjectiveIntegratorBase::truncation_distance_vox() const {
+  return truncation_distance_vox_;
+}
+
+float ProjectiveIntegratorBase::max_weight() const { return max_weight_; }
+
+float ProjectiveIntegratorBase::max_integration_distance_m() const {
+  return max_integration_distance_m_;
+}
+
+void ProjectiveIntegratorBase::truncation_distance_vox(
+    float truncation_distance_vox) {
+  CHECK_GT(truncation_distance_vox, 0.0f);
+  truncation_distance_vox_ = truncation_distance_vox;
+}
+
+void ProjectiveIntegratorBase::max_weight(float max_weight) {
+  CHECK_GT(max_weight, 0.0f);
+  max_weight_ = max_weight;
+}
+
+void ProjectiveIntegratorBase::max_integration_distance_m(
+    float max_integration_distance_m) {
+  CHECK_GT(max_integration_distance_m, 0.0f);
+  max_integration_distance_m_ = max_integration_distance_m;
+}
+
+float ProjectiveIntegratorBase::truncation_distance_m(float block_size) const {
+  return truncation_distance_vox_ * block_size /
+         VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+}
+
+std::vector<Index3D> ProjectiveIntegratorBase::getBlocksInView(
+    const DepthImage& depth_frame, const Transform& T_L_C,
+    const Camera& camera, const float block_size) const {
+  return FrustumCalculator::getBlocksInImageView(
+      depth_frame, T_L_C, camera, block_size,
+      truncation_distance_m(block_size), max_integration_distance_m_);
+}
+
+std::vector<Index3D> ProjectiveIntegratorBase::getBlocksInView(
+    const Transform& T_L_C, const Camera& camera,
+    const float block_size) const {
+  return frustum_calculator_.getBlocksInView(
+      T_L_C, camera, block_size,
+      max_integration_distance_m_ + truncation_distance_m(block_size));
+}
+
+std::vector<Index3D> ProjectiveIntegratorBase::getBlocksInViewUsingRaycasting(
+    const DepthImage& depth_frame, const Transform& T_L_C,
+    const Camera& camera, const float block_size) const {
+  return frustum_calculator_.getBlocksInImageViewCuda(
+      depth_frame, T_L_C, camera, block_size,
+      truncation_distance_m(block_size), max_integration_distance_m_);
+}
+
+}  // namespace nvblox
diff --git a/nvblox/src/io/csv.cpp b/nvblox/src/io/csv.cpp
new file mode 100644
index 00000000..d3a45fcc
--- /dev/null
+++ b/nvblox/src/io/csv.cpp
@@ -0,0 +1,79 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/io/csv.h"
+
+#include <fstream>
+#include <glog/logging.h>
+
+namespace nvblox {
+namespace io {
+
+void writeToCsv(const std::string& filepath, const DepthImage& frame) {
+  LOG(INFO) << "Writing DepthImage to: " << filepath;
+
+  // Create a temporary image in unified memory if the image resides in device
+  // memory.
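+  // (Plain device memory is not dereferenceable from host code, so we stage
+  // a unified-memory copy that the CPU write loop below can read directly.)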
+  const DepthImage* write_frame_ptr;
+  std::unique_ptr<DepthImage> tmp;
+  if (frame.memory_type() == MemoryType::kDevice) {
+    tmp = std::make_unique<DepthImage>(frame, MemoryType::kUnified);
+    write_frame_ptr = tmp.get();
+  } else {
+    write_frame_ptr = &frame;
+  }
+
+  std::ofstream file_stream(filepath, std::ofstream::out);
+  for (int row_idx = 0; row_idx < write_frame_ptr->rows(); row_idx++) {
+    for (int col_idx = 0; col_idx < write_frame_ptr->cols(); col_idx++) {
+      file_stream << (*write_frame_ptr)(row_idx, col_idx) << " ";
+    }
+    file_stream << "\n";
+  }
+  file_stream.close();
+}
+
+void writeToCsv(const std::string& filepath, const ColorImage& frame) {
+  LOG(INFO) << "Writing ColorImage to: " << filepath;
+
+  // Create a temporary image in unified memory if the image resides in device
+  // memory.
+  const ColorImage* write_frame_ptr;
+  std::unique_ptr<ColorImage> tmp;
+  if (frame.memory_type() == MemoryType::kDevice) {
+    tmp = std::make_unique<ColorImage>(frame, MemoryType::kUnified);
+    write_frame_ptr = tmp.get();
+  } else {
+    write_frame_ptr = &frame;
+  }
+
+  std::ofstream file_stream(filepath, std::ofstream::out);
+  for (int row_idx = 0; row_idx < write_frame_ptr->rows(); row_idx++) {
+    for (int col_idx = 0; col_idx < write_frame_ptr->cols(); col_idx++) {
+      file_stream << std::to_string((*write_frame_ptr)(row_idx, col_idx).r)
+                  << " "
+                  << std::to_string((*write_frame_ptr)(row_idx, col_idx).g)
+                  << " "
+                  << std::to_string((*write_frame_ptr)(row_idx, col_idx).b)
+                  << " ";
+    }
+    // Just write one long line per row and let the loader sort it out.
+    file_stream << "\n";
+  }
+  file_stream.close();
+}
+
+}  // namespace io
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/src/io/mesh_io.cpp b/nvblox/src/io/mesh_io.cpp
new file mode 100644
index 00000000..57132199
--- /dev/null
+++ b/nvblox/src/io/mesh_io.cpp
@@ -0,0 +1,45 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/io/mesh_io.h"
+
+#include <string>
+
+#include "nvblox/io/ply_writer.h"
+#include "nvblox/mesh/mesh.h"
+
+namespace nvblox {
+namespace io {
+
+bool outputMeshLayerToPly(const BlockLayer<MeshBlock>& layer,
+                          const std::string& filename) {
+  // TODO: intensity is not supported yet.
+  const Mesh mesh = Mesh::fromLayer(layer);
+
+  // Create the ply writer object
+  io::PlyWriter writer(filename);
+  writer.setPoints(&mesh.vertices);
+  writer.setTriangles(&mesh.triangles);
+  if (mesh.normals.size() > 0) {
+    writer.setNormals(&mesh.normals);
+  }
+  if (mesh.colors.size() > 0) {
+    writer.setColors(&mesh.colors);
+  }
+  return writer.write();
+}
+
+}  // namespace io
+}  // namespace nvblox
diff --git a/nvblox/src/io/ply_writer.cpp b/nvblox/src/io/ply_writer.cpp
new file mode 100644
index 00000000..43529b2d
--- /dev/null
+++ b/nvblox/src/io/ply_writer.cpp
@@ -0,0 +1,137 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include + +#include "nvblox/io/ply_writer.h" + +namespace nvblox { +namespace io { + +bool PlyWriter::write() { + if (!file_) { + // Output a warning -- couldn't open file? + LOG(WARNING) << "Could not open file for PLY output."; + return false; + } + if (points_ == nullptr || points_->size() == 0) { + LOG(ERROR) << "No points added, nothing to output."; + return false; + } + + // Double-check that the rest of the sizes match. + if (intensities_ != nullptr && points_->size() != intensities_->size()) { + LOG(ERROR) << "Intensity size does not match point size."; + return false; + } + + if (colors_ != nullptr && points_->size() != colors_->size()) { + LOG(ERROR) << "Color size does not match point size."; + return false; + } + if (normals_ != nullptr && points_->size() != normals_->size()) { + LOG(ERROR) << "Normal size does not match point size."; + return false; + } + // Triangles can be whatever size they like. + + // Output the header first. + writeHeader(); + + // Then output all the points. + writePoints(); + + // Then triangles. + writeTriangles(); + + return true; +} + +void PlyWriter::writeHeader() { + file_ << "ply" << std::endl; + file_ << "format ascii 1.0" << std::endl; + file_ << "element vertex " << points_->size() << std::endl; + file_ << "property float x" << std::endl; + file_ << "property float y" << std::endl; + file_ << "property float z" << std::endl; + + if (normals_) { + // TODO: should this be normal_x or nx? + file_ << "property float nx" << std::endl; + file_ << "property float ny" << std::endl; + file_ << "property float nz" << std::endl; + } + + if (intensities_) { + file_ << "property float intensity" << std::endl; + } + + if (colors_) { + file_ << "property uchar red" << std::endl; + file_ << "property uchar green" << std::endl; + file_ << "property uchar blue" << std::endl; + } + + if (triangles_) { + // TODO: check if "vertex_index" or "vertex_indices" + file_ << "element face " << triangles_->size() / 3 << std::endl; + file_ << "property list uchar int vertex_indices" + << std::endl; // pcl-1.7(ros::kinetic) breaks ply convention by not + // using "vertex_index" + } + file_ << "end_header" << std::endl; +} + +void PlyWriter::writePoints() { + // We've already checked that all the ones that exist have a matching size. 
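+  // Example row for a cloud with normals and colors enabled:
+  //   "0.1 0.2 0.3 0.0 0.0 1.0 255 0 0"
+  // following the property order declared in writeHeader() above.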
+ for (size_t i = 0; i < points_->size(); i++) { + const Vector3f vertex = (*points_)[i]; + file_ << vertex.x() << " " << vertex.y() << " " << vertex.z(); + + if (normals_) { + const Vector3f& normal = (*normals_)[i]; + file_ << " " << normal.x() << " " << normal.y() << " " << normal.z(); + } + + if (intensities_) { + file_ << " " << (*intensities_)[i]; + } + + if (colors_) { + const Color& color = (*colors_)[i]; + file_ << " " << std::to_string(color.r) << " " << std::to_string(color.g) + << " " << std::to_string(color.b); + } + file_ << std::endl; + } +} + +void PlyWriter::writeTriangles() { + constexpr int kTriangleSize = 3; + if (triangles_) { + for (size_t i = 0; i < triangles_->size(); i += kTriangleSize) { + file_ << kTriangleSize << " "; + + for (int j = 0; j < kTriangleSize; j++) { + file_ << (*triangles_)[i + j] << " "; + } + + file_ << std::endl; + } + } +} + +} // namespace io +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/mesh/marching_cubes.cu b/nvblox/src/mesh/marching_cubes.cu new file mode 100644 index 00000000..7b73c49d --- /dev/null +++ b/nvblox/src/mesh/marching_cubes.cu @@ -0,0 +1,189 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/types.h" +#include "nvblox/mesh/impl/marching_cubes_table.h" + +#include "nvblox/mesh/marching_cubes.h" + +namespace nvblox { +namespace marching_cubes { + +int calculateVertexConfiguration(const float vertex_sdf[8]) { + return (vertex_sdf[0] < 0 ? (1 << 0) : 0) | + (vertex_sdf[1] < 0 ? (1 << 1) : 0) | + (vertex_sdf[2] < 0 ? (1 << 2) : 0) | + (vertex_sdf[3] < 0 ? (1 << 3) : 0) | + (vertex_sdf[4] < 0 ? (1 << 4) : 0) | + (vertex_sdf[5] < 0 ? (1 << 5) : 0) | + (vertex_sdf[6] < 0 ? (1 << 6) : 0) | + (vertex_sdf[7] < 0 ? (1 << 7) : 0); +} + +int neighborIndexFromDirection(const Index3D& direction) { + return (direction.x() << 2 | direction.y() << 1 | direction.z() << 0); +} + +Index3D directionFromNeighborIndex(const int index) { + return Index3D((index & (1 << 2)) >> 2, (index & (1 << 1)) >> 1, + (index & (1 << 0)) >> 0); +} + +/// Performs linear interpolation on two cube corners to find the approximate +/// zero crossing (surface) value. +Vector3f interpolateVertex(const Vector3f& vertex1, const Vector3f& vertex2, + float sdf1, float sdf2) { + constexpr float kMinSdfDifference = 1e-4; + const float sdf_diff = sdf1 - sdf2; + // Only compute the actual interpolation value if the sdf_difference is not + // too small, this is to counteract issues with floating point precision. 
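+  // Worked example: sdf1 = 0.02 and sdf2 = -0.06 give sdf_diff = 0.08 and
+  // t = 0.25, so the zero crossing lands a quarter of the way from vertex1
+  // to vertex2.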
+ if (abs(sdf_diff) >= kMinSdfDifference) { + const float t = sdf1 / sdf_diff; + return Vector3f(vertex1 + t * (vertex2 - vertex1)); + } else { + return Vector3f(0.5 * (vertex1 + vertex2)); + } +} + +void interpolateEdgeVertices( + const PerVoxelMarchingCubesResults& marching_cubes_results, + Eigen::Matrix* edge_coords) { + CHECK(edge_coords != NULL); + for (size_t i = 0; i < 12; ++i) { + const uint8_t* pairs = kEdgeIndexPairs[i]; + const int edge0 = pairs[0]; + const int edge1 = pairs[1]; + // Only interpolate along edges where there is a zero crossing. + if ((marching_cubes_results.vertex_sdf[edge0] < 0 && + marching_cubes_results.vertex_sdf[edge1] >= 0) || + (marching_cubes_results.vertex_sdf[edge0] >= 0 && + marching_cubes_results.vertex_sdf[edge1] < 0)) + edge_coords->col(i) = + interpolateVertex(marching_cubes_results.vertex_coords[edge0], + marching_cubes_results.vertex_coords[edge1], + marching_cubes_results.vertex_sdf[edge0], + marching_cubes_results.vertex_sdf[edge1]); + } +} + +void meshCube(const PerVoxelMarchingCubesResults& marching_cubes_results, + MeshBlock* mesh) { + CHECK_NOTNULL(mesh); + const int table_index = marching_cubes_results.marching_cubes_table_index; + + // No edges in this cube. + if (table_index == 0 || table_index == 255) { + return; + } + + Eigen::Matrix edge_vertex_coordinates; + interpolateEdgeVertices(marching_cubes_results, &edge_vertex_coordinates); + + const int8_t* table_row = kTriangleTable[table_index]; + + int table_col = 0; + int next_index = mesh->vertices.size(); + + // Resize the mesh block to contain all the vertices. + int num_triangles = kNumTrianglesTable[table_index]; + + table_col = 0; + for (int i = 0; i < num_triangles; i++) { + mesh->vertices.push_back( + edge_vertex_coordinates.col(table_row[table_col + 2])); + mesh->vertices.push_back( + edge_vertex_coordinates.col(table_row[table_col + 1])); + mesh->vertices.push_back(edge_vertex_coordinates.col(table_row[table_col])); + mesh->triangles.push_back(next_index); + mesh->triangles.push_back(next_index + 1); + mesh->triangles.push_back(next_index + 2); + const Vector3f& p0 = mesh->vertices[next_index]; + const Vector3f& p1 = mesh->vertices[next_index + 1]; + const Vector3f& p2 = mesh->vertices[next_index + 2]; + Vector3f px = (p1 - p0); + Vector3f py = (p2 - p0); + Vector3f n = px.cross(py).normalized(); + mesh->normals.push_back(n); + mesh->normals.push_back(n); + mesh->normals.push_back(n); + next_index += 3; + table_col += 3; + } +} + +__device__ void calculateOutputIndex( + PerVoxelMarchingCubesResults* marching_cubes_results, int* size) { + // How many vertices in this voxel + const uint8_t table_index = + marching_cubes_results->marching_cubes_table_index; + const int num_vertices_in_voxel = kNumVertsTable[table_index]; + + // No edges in this cube. + if (num_vertices_in_voxel == 0) { + return; + } + + // Calculate: + // - the start index where this voxel starts outputing, and + // - the total number of vertices in this mesh block (once all threads + // finish). + marching_cubes_results->vertex_vector_start_index = + atomicAdd(size, num_vertices_in_voxel); +} + +__device__ void calculateVertices( + const PerVoxelMarchingCubesResults& marching_cubes_results, + CudaMeshBlock* mesh) { + const uint8_t table_index = marching_cubes_results.marching_cubes_table_index; + const int num_triangles_in_voxel = kNumTrianglesTable[table_index]; + + if (num_triangles_in_voxel == 0) { + return; + } + + // The position in the block that we start output for this voxel. 
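+  // calculateOutputIndex() above reserved this voxel's output slots with an
+  // atomicAdd; e.g. if two voxels in a block emit 6 and 3 vertices, one
+  // thread gets start index 0 and the other 6 (in whichever order the
+  // atomics resolve), so the writes below never overlap.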
+  int next_index = marching_cubes_results.vertex_vector_start_index;
+
+  Eigen::Matrix<float, 3, 12> edge_vertex_coordinates;
+  interpolateEdgeVertices(marching_cubes_results, &edge_vertex_coordinates);
+
+  const int8_t* table_row = kTriangleTable[table_index];
+  int table_col = 0;
+  for (int i = 0; i < num_triangles_in_voxel; i++) {
+    mesh->vertices[next_index] =
+        edge_vertex_coordinates.col(table_row[table_col + 2]);
+    mesh->vertices[next_index + 1] =
+        edge_vertex_coordinates.col(table_row[table_col + 1]);
+    mesh->vertices[next_index + 2] =
+        edge_vertex_coordinates.col(table_row[table_col]);
+    mesh->triangles[next_index] = next_index;
+    mesh->triangles[next_index + 1] = next_index + 1;
+    mesh->triangles[next_index + 2] = next_index + 2;
+    const Vector3f& p0 = mesh->vertices[next_index];
+    const Vector3f& p1 = mesh->vertices[next_index + 1];
+    const Vector3f& p2 = mesh->vertices[next_index + 2];
+    Vector3f px = (p1 - p0);
+    Vector3f py = (p2 - p0);
+    Vector3f n = px.cross(py).normalized();
+    mesh->normals[next_index] = n;
+    mesh->normals[next_index + 1] = n;
+    mesh->normals[next_index + 2] = n;
+    next_index += 3;
+    table_col += 3;
+  }
+}
+
+}  // namespace marching_cubes
+}  // namespace nvblox
diff --git a/nvblox/src/mesh/mesh.cpp b/nvblox/src/mesh/mesh.cpp
new file mode 100644
index 00000000..dab4a45b
--- /dev/null
+++ b/nvblox/src/mesh/mesh.cpp
@@ -0,0 +1,69 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <algorithm>
+
+#include "nvblox/mesh/mesh.h"
+
+namespace nvblox {
+
+Mesh Mesh::fromLayer(const BlockLayer<MeshBlock>& layer) {
+  Mesh mesh;
+
+  // Keep track of the vertex index.
+  int next_index = 0;
+
+  // Iterate over every block in the layer.
+  const std::vector<Index3D> indices = layer.getAllBlockIndices();
+
+  for (const Index3D& index : indices) {
+    MeshBlock::ConstPtr block = layer.getBlockAtIndex(index);
+
+    // Copy over.
+    const std::vector<Vector3f> vertices = block->getVertexVectorOnCPU();
+    mesh.vertices.resize(mesh.vertices.size() + vertices.size());
+    std::copy(vertices.begin(), vertices.end(),
+              mesh.vertices.begin() + next_index);
+
+    const std::vector<Vector3f> normals = block->getNormalVectorOnCPU();
+    mesh.normals.resize(mesh.normals.size() + normals.size());
+    std::copy(normals.begin(), normals.end(),
+              mesh.normals.begin() + next_index);
+
+    const std::vector<Color> colors = block->getColorVectorOnCPU();
+    mesh.colors.resize(mesh.colors.size() + colors.size());
+    std::copy(colors.begin(), colors.end(), mesh.colors.begin() + next_index);
+
+    // Our simple mesh implementation has:
+    // - per vertex colors
+    // - per vertex normals
+    CHECK((vertices.size() == normals.size()) || (normals.size() == 0));
+    CHECK((vertices.size() == colors.size()) || (colors.size() == 0));
+
+    // Copy over the triangles.
+    const std::vector<int> triangles = block->getTriangleVectorOnCPU();
+    std::vector<int> triangle_indices(triangles.size());
+    // Increment all triangle indices by this block's starting vertex index.
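+    // E.g. a block-local triangle list {0, 1, 2, 0, 2, 3} appended when
+    // next_index == 100 becomes {100, 101, 102, 100, 102, 103} in the
+    // combined mesh.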
+    std::transform(triangles.begin(), triangles.end(),
+                   triangle_indices.begin(),
+                   [next_index](int t) { return t + next_index; });
+
+    mesh.triangles.insert(mesh.triangles.end(), triangle_indices.begin(),
+                          triangle_indices.end());
+
+    next_index += vertices.size();
+  }
+
+  return mesh;
+}
+
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/src/mesh/mesh_block.cu b/nvblox/src/mesh/mesh_block.cu
new file mode 100644
index 00000000..bb1f1234
--- /dev/null
+++ b/nvblox/src/mesh/mesh_block.cu
@@ -0,0 +1,90 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/mesh/mesh_block.h"
+
+namespace nvblox {
+
+MeshBlock::MeshBlock(MemoryType memory_type)
+    : vertices(memory_type),
+      normals(memory_type),
+      triangles(memory_type),
+      colors(memory_type) {}
+
+void MeshBlock::clear() {
+  vertices.resize(0);
+  normals.resize(0);
+  triangles.resize(0);
+  colors.resize(0);
+}
+
+void MeshBlock::resizeToNumberOfVertices(size_t new_size) {
+  vertices.resize(new_size);
+  normals.resize(new_size);
+  triangles.resize(new_size);
+}
+
+void MeshBlock::reserveNumberOfVertices(size_t new_capacity) {
+  vertices.reserve(new_capacity);
+  normals.reserve(new_capacity);
+  triangles.reserve(new_capacity);
+}
+
+MeshBlock::Ptr MeshBlock::allocate(MemoryType memory_type) {
+  return std::make_shared<MeshBlock>(memory_type);
+}
+
+std::vector<Vector3f> MeshBlock::getVertexVectorOnCPU() const {
+  return vertices.toVector();
+}
+
+std::vector<Vector3f> MeshBlock::getNormalVectorOnCPU() const {
+  return normals.toVector();
+}
+
+std::vector<int> MeshBlock::getTriangleVectorOnCPU() const {
+  return triangles.toVector();
+}
+
+std::vector<Color> MeshBlock::getColorVectorOnCPU() const {
+  return colors.toVector();
+}
+
+size_t MeshBlock::size() const { return vertices.size(); }
+
+size_t MeshBlock::capacity() const { return vertices.capacity(); }
+
+void MeshBlock::expandColorsToMatchVertices() {
+  colors.reserve(vertices.capacity());
+  colors.resize(vertices.size());
+}
+
+void MeshBlock::expandIntensitiesToMatchVertices() {
+  intensities.reserve(vertices.capacity());
+  intensities.resize(vertices.size());
+}
+
+// Set the pointers to point into the mesh block's buffers.
+CudaMeshBlock::CudaMeshBlock(MeshBlock* block) {
+  CHECK_NOTNULL(block);
+  vertices = block->vertices.data();
+  normals = block->normals.data();
+  triangles = block->triangles.data();
+  colors = block->colors.data();
+
+  size = block->vertices.size();
+}
+
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/src/mesh/mesh_integrator.cpp b/nvblox/src/mesh/mesh_integrator.cpp
new file mode 100644
index 00000000..e97f3cc6
--- /dev/null
+++ b/nvblox/src/mesh/mesh_integrator.cpp
@@ -0,0 +1,221 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/utils/timing.h" + +#include "nvblox/mesh/mesh_integrator.h" + +namespace nvblox { + +MeshIntegrator::MeshIntegrator() { + // clang-format off + cube_index_offsets_ << 0, 1, 1, 0, 0, 1, 1, 0, + 0, 0, 1, 1, 0, 0, 1, 1, + 0, 0, 0, 0, 1, 1, 1, 1; + // clang-format on +} + +bool MeshIntegrator::integrateMeshFromDistanceField( + const TsdfLayer& distance_layer, BlockLayer* mesh_layer, + const DeviceType device_type) { + // First, get all the blocks. + std::vector block_indices = distance_layer.getAllBlockIndices(); + if (device_type == DeviceType::kCPU) { + return integrateBlocksCPU(distance_layer, block_indices, mesh_layer); + } else { + return integrateBlocksGPU(distance_layer, block_indices, mesh_layer); + } +} + +bool MeshIntegrator::integrateBlocksCPU( + const TsdfLayer& distance_layer, const std::vector& block_indices, + BlockLayer* mesh_layer) { + timing::Timer mesh_timer("mesh/integrate"); + CHECK_NOTNULL(mesh_layer); + CHECK_NEAR(distance_layer.block_size(), mesh_layer->block_size(), 1e-4); + + // Figure out which of these actually contain something worth meshing. + const float block_size = distance_layer.block_size(); + const float voxel_size = distance_layer.voxel_size(); + + // For each block... + for (const Index3D& block_index : block_indices) { + // Get the block. + VoxelBlock::ConstPtr block = + distance_layer.getBlockAtIndex(block_index); + + // Check meshability - basically if this contains anything near the + // border. + if (!isBlockMeshable(block, voxel_size * 2)) { + continue; + } + + // Get all the neighbor blocks. + std::vector::ConstPtr> neighbor_blocks(8); + for (int i = 0; i < 8; i++) { + Index3D neighbor_index = + block_index + marching_cubes::directionFromNeighborIndex(i); + neighbor_blocks[i] = distance_layer.getBlockAtIndex(neighbor_index); + } + + // Get all the potential triangles: + std::vector + triangle_candidates; + getTriangleCandidatesInBlock(block, neighbor_blocks, block_index, + block_size, &triangle_candidates); + + if (triangle_candidates.size() <= 0) { + continue; + } + + // Allocate the mesh block. + MeshBlock::Ptr mesh_block = mesh_layer->allocateBlockAtIndex(block_index); + + // Then actually calculate the triangles. + for (const marching_cubes::PerVoxelMarchingCubesResults& candidate : + triangle_candidates) { + marching_cubes::meshCube(candidate, mesh_block.get()); + } + } + + return true; +} + +bool MeshIntegrator::isBlockMeshable( + const VoxelBlock::ConstPtr block, float cutoff) const { + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + + Index3D voxel_index; + // Iterate over all the voxels: + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + const TsdfVoxel* voxel = + &block->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()]; + + // Check if voxel distance is within the cutoff to determine if + // there's going to be a surface boundary in this block. 
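+        // Example (hypothetical sizes): with a 0.05 m voxel the CPU path
+        // passes cutoff = 2 * 0.05 = 0.10 m, so a block is considered
+        // meshable as soon as one well-observed voxel lies within 0.10 m of
+        // the surface.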
+ if (voxel->weight >= min_weight_ && + std::abs(voxel->distance) <= cutoff) { + return true; + } + } + } + } + return false; +} + +void MeshIntegrator::getTriangleCandidatesInBlock( + const TsdfBlock::ConstPtr block, + const std::vector& neighbor_blocks, + const Index3D& block_index, const float block_size, + std::vector* + triangle_candidates) { + CHECK_NOTNULL(block); + CHECK_NOTNULL(triangle_candidates); + + constexpr int kVoxelsPerSide = TsdfBlock::kVoxelsPerSide; + const float voxel_size = block_size / kVoxelsPerSide; + + Index3D voxel_index; + // Iterate over all inside voxels. + for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide; + voxel_index.x()++) { + for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide; + voxel_index.y()++) { + for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide; + voxel_index.z()++) { + // Get the position of this voxel. + Vector3f voxel_position = getCenterPostionFromBlockIndexAndVoxelIndex( + block_size, block_index, voxel_index); + + marching_cubes::PerVoxelMarchingCubesResults neighbors; + // Figure out if this voxel is actually a triangle candidate. + if (getTriangleCandidatesAroundVoxel(block, neighbor_blocks, + voxel_index, voxel_position, + voxel_size, &neighbors)) { + triangle_candidates->push_back(neighbors); + } + } + } + } +} + +bool MeshIntegrator::getTriangleCandidatesAroundVoxel( + const TsdfBlock::ConstPtr block, + const std::vector::ConstPtr>& neighbor_blocks, + const Index3D& voxel_index, const Vector3f& voxel_position, + const float voxel_size, + marching_cubes::PerVoxelMarchingCubesResults* neighbors) { + DCHECK_EQ(neighbor_blocks.size(), 8); + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + for (unsigned int i = 0; i < 8; ++i) { + Index3D corner_index = voxel_index + cube_index_offsets_.col(i); + + // Are we in bounds? If not, have to get a neighbor. + // The neighbor should correspond to the index in neighbor blocks. + + bool search_neighbor = false; + Index3D block_offset(0, 0, 0); + for (int j = 0; j < 3; j++) { + if (corner_index[j] >= kVoxelsPerSide) { + // Here the index is too much. + corner_index(j) -= kVoxelsPerSide; + block_offset(j) = 1; + search_neighbor = true; + } + } + + const TsdfVoxel* voxel = nullptr; + // Get the voxel either from the current block or from the corresponding + // neighbor. + if (search_neighbor) { + // Find the correct neighbor block. + int neighbor_index = + marching_cubes::neighborIndexFromDirection(block_offset); + if (neighbor_blocks[neighbor_index] == nullptr) { + return false; + } + voxel = + &neighbor_blocks[neighbor_index] + ->voxels[corner_index.x()][corner_index.y()][corner_index.z()]; + } else { + voxel = + &block->voxels[corner_index.x()][corner_index.y()][corner_index.z()]; + } + + // If any of the neighbors are not observed, this can't be a mesh + // triangle. + if (voxel->weight < min_weight_) { + return false; + } + neighbors->vertex_sdf[i] = voxel->distance; + neighbors->vertex_coords[i] = + voxel_position + voxel_size * cube_index_offsets_.col(i).cast(); + } + + // Figure out the index if we've made it this far. + neighbors->marching_cubes_table_index = + marching_cubes::calculateVertexConfiguration(neighbors->vertex_sdf); + + // Index 0 & 255 contain no triangles so not worth outputting it. 
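+  // Configuration 0 means all eight corner SDFs are non-negative (cube
+  // entirely outside the surface) and 255 means all are negative (entirely
+  // inside); neither contains a zero crossing, so both are skipped.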
+ return neighbors->marching_cubes_table_index != 0 && + neighbors->marching_cubes_table_index != 255; +} + +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/mesh/mesh_integrator.cu b/nvblox/src/mesh/mesh_integrator.cu new file mode 100644 index 00000000..0135a80b --- /dev/null +++ b/nvblox/src/mesh/mesh_integrator.cu @@ -0,0 +1,480 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include + +#include +#include +#include +#include +#include +#include + +#include "nvblox/core/accessors.h" +#include "nvblox/core/common_names.h" +#include "nvblox/integrators/integrators_common.h" +#include "nvblox/mesh/impl/marching_cubes_table.h" +#include "nvblox/mesh/marching_cubes.h" +#include "nvblox/mesh/mesh_integrator.h" +#include "nvblox/utils/timing.h" + +namespace nvblox { + +MeshIntegrator::~MeshIntegrator() { + if (cuda_stream_ != nullptr) { + cudaStreamDestroy(cuda_stream_); + } +} + +bool MeshIntegrator::integrateBlocksGPU( + const TsdfLayer& distance_layer, const std::vector& block_indices, + BlockLayer* mesh_layer) { + timing::Timer mesh_timer("mesh/gpu/integrate"); + CHECK_NOTNULL(mesh_layer); + CHECK_NEAR(distance_layer.block_size(), mesh_layer->block_size(), 1e-4); + if (block_indices.empty()) { + return true; + } + + // Initialize the stream if not done yet. + if (cuda_stream_ == nullptr) { + checkCudaErrors(cudaStreamCreate(&cuda_stream_)); + } + + // Figure out which of these actually contain something worth meshing. + float block_size = distance_layer.block_size(); + float voxel_size = distance_layer.voxel_size(); + + // Clear all blocks if they exist. + for (const Index3D& block_index : block_indices) { + MeshBlock::Ptr mesh_block = mesh_layer->getBlockAtIndex(block_index); + if (mesh_block) { + mesh_block->clear(); + } + } + + // First create a list of meshable blocks. + std::vector meshable_blocks; + timing::Timer meshable_timer("mesh/gpu/get_meshable"); + getMeshableBlocksGPU(distance_layer, block_indices, 5 * voxel_size, + &meshable_blocks); + meshable_timer.Stop(); + + // Then get all the candidates and mesh each block. + timing::Timer mesh_blocks_timer("mesh/gpu/mesh_blocks"); + + meshBlocksGPU(distance_layer, meshable_blocks, mesh_layer); + + // TODO: optionally weld here as well. + mesh_blocks_timer.Stop(); + + return true; +} + +// Kernels + +// Takes in a vector of blocks, and outputs an integer true if that block is +// meshable. +// Block size MUST be voxels_per_side x voxels_per_side x voxel_per_size. +// Grid size can be anything. +__global__ void isBlockMeshableKernel(int num_blocks, + const VoxelBlock** blocks, + float cutoff_distance, float min_weight, + bool* meshable) { + dim3 voxel_index = threadIdx; + // This for loop allows us to have fewer threadblocks than there are + // blocks in this computation. We assume the threadblock size is constant + // though to make our lives easier. + for (int block_index = blockIdx.x; block_index < num_blocks; + block_index += gridDim.x) { + // Get the correct voxel for this index. 
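+    // One thread per voxel: the wrapper launches this kernel with a
+    // kVoxelsPerSide^3 thread block (512 threads for the usual 8 voxels per
+    // side), so threadIdx addresses exactly one voxel of each block.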
+ const TsdfVoxel& voxel = + blocks[block_index] + ->voxels[voxel_index.z][voxel_index.y][voxel_index.x]; + if (fabs(voxel.distance) <= cutoff_distance && voxel.weight >= min_weight) { + meshable[block_index] = true; + } + } +} + +// Takes in a set of blocks arranged in neighbor sets and their relative +// positions, then finds vertex candidates, and finally creates the output +// meshes for them. +// Block size MUST be voxels_per_side x voxels_per_side x voxel_per_size. +// Grid size can be anything. +__global__ void meshBlocksCalculateTableIndicesKernel( + int num_blocks, const VoxelBlock** blocks, + const Vector3f* block_positions, float voxel_size, float min_weight, + marching_cubes::PerVoxelMarchingCubesResults* marching_cubes_results, + int* mesh_block_sizes) { + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + constexpr int kVoxelsPerBlock = + kVoxelsPerSide * kVoxelsPerSide * kVoxelsPerSide; + constexpr int kCubeNeighbors = 8; + + const dim3 voxel_index = dim3(threadIdx.z, threadIdx.y, threadIdx.x); + + const int linear_thread_idx = + threadIdx.x + + kVoxelsPerSide * (threadIdx.y + kVoxelsPerSide * threadIdx.z); + + // Preallocate a half voxel size. + const Vector3f half_voxel(0.5f, 0.5f, 0.5f); + + marching_cubes::PerVoxelMarchingCubesResults marching_cubes_results_local; + + // This for loop allows us to have fewer threadblocks than there are + // blocks in this computation. We assume the threadblock size is constant + // though to make our lives easier. + for (int block_index = blockIdx.x; block_index < num_blocks; + block_index += gridDim.x) { + // Initialize the calculated output size for this block. + __shared__ int mesh_block_size; + if (linear_thread_idx == 0) { + mesh_block_size = 0; + } + __syncthreads(); + + // Getting the block pointer is complicated now so let's just get it. + const VoxelBlock* block = blocks[block_index * kCubeNeighbors]; + + // Get the linear index of the this voxel in this block + const int vertex_neighbor_idx = + block_index * kVoxelsPerBlock + linear_thread_idx; + + // Check all 8 neighbors. + bool skip_voxel = false; + for (unsigned int i = 0; i < 8; ++i) { + Index3D corner_index( + voxel_index.x + marching_cubes::kCornerIndexOffsets[i][0], + voxel_index.y + marching_cubes::kCornerIndexOffsets[i][1], + voxel_index.z + marching_cubes::kCornerIndexOffsets[i][2]); + Index3D block_offset(0, 0, 0); + bool search_neighbor = false; + // Are we in bounds? If not, have to get a neighbor. + // The neighbor should correspond to the index in neighbor blocks. + for (int j = 0; j < 3; j++) { + if (corner_index[j] >= kVoxelsPerSide) { + // Here the index is too much. + corner_index(j) -= kVoxelsPerSide; + block_offset(j) = 1; + search_neighbor = true; + } + } + + const TsdfVoxel* voxel = nullptr; + // Don't look for neighbors for now. + if (search_neighbor) { + int neighbor_index = + marching_cubes::neighborIndexFromDirection(block_offset); + const VoxelBlock* neighbor_block = + blocks[block_index * kCubeNeighbors + neighbor_index]; + if (neighbor_block == nullptr) { + skip_voxel = true; + break; + } + voxel = + &neighbor_block + ->voxels[corner_index.x()][corner_index.y()][corner_index.z()]; + } else { + voxel = + &block + ->voxels[corner_index.x()][corner_index.y()][corner_index.z()]; + } + // If any of the neighbors are not observed, this can't be a mesh + // triangle. + if (voxel->weight < min_weight) { + skip_voxel = true; + break; + } + + // Calculate the position of this voxel. 
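+      // Worked example (hypothetical 0.05 m voxels, block at the layer
+      // origin): a corner that wrapped from x = 8 to x = 0 with
+      // block_offset = (1, 0, 0) lands at x = 0.05 * (0 + 0.5 + 8) =
+      // 0.425 m, i.e. the first voxel center of the +x neighbor block.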
+      marching_cubes_results_local.vertex_sdf[i] = voxel->distance;
+      marching_cubes_results_local.vertex_coords[i] =
+          block_positions[block_index] +
+          voxel_size * (corner_index.cast<float>() + half_voxel +
+                        (kVoxelsPerSide * block_offset).cast<float>());
+    }
+
+    if (!skip_voxel) {
+      // If we've made it this far, this needs to be meshed.
+      marching_cubes_results_local.contains_mesh = true;
+
+      // Calculate the index into the magic marching cubes table
+      marching_cubes_results_local.marching_cubes_table_index =
+          marching_cubes::calculateVertexConfiguration(
+              marching_cubes_results_local.vertex_sdf);
+
+      // Mesh this cube. This will keep track of what index we're at within
+      // the cube.
+      marching_cubes::calculateOutputIndex(&marching_cubes_results_local,
+                                           &mesh_block_size);
+
+      // Write out to global memory
+      marching_cubes_results[vertex_neighbor_idx] =
+          marching_cubes_results_local;
+    }
+
+    // Writing the shared variable block size to global memory (per block)
+    __syncthreads();
+    if (linear_thread_idx == 0) {
+      mesh_block_sizes[block_index] = mesh_block_size;
+    }
+  }
+}
+
+__global__ void meshBlocksCalculateVerticesKernel(
+    int num_blocks,
+    const marching_cubes::PerVoxelMarchingCubesResults* marching_cubes_results,
+    const int* mesh_block_sizes, CudaMeshBlock* mesh_blocks) {
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+
+  const int linear_thread_idx =
+      threadIdx.x +
+      kVoxelsPerSide * (threadIdx.y + kVoxelsPerSide * threadIdx.z);
+
+  // This for loop allows us to have fewer threadblocks than there are
+  // blocks in this computation. We assume the threadblock size is constant
+  // though to make our lives easier.
+  for (int block_index = blockIdx.x; block_index < num_blocks;
+       block_index += gridDim.x) {
+    // If this block contains a mesh
+    if (mesh_block_sizes[block_index] > 0) {
+      // Get the linear index of this voxel in this block
+      constexpr int kVoxelsPerBlock =
+          kVoxelsPerSide * kVoxelsPerSide * kVoxelsPerSide;
+      const int vertex_neighbor_idx =
+          block_index * kVoxelsPerBlock + linear_thread_idx;
+
+      // If this voxel contains a mesh
+      if (marching_cubes_results[vertex_neighbor_idx].contains_mesh) {
+        // Convert the marching cube table index into vertex coordinates
+        marching_cubes::calculateVertices(
+            marching_cubes_results[vertex_neighbor_idx],
+            &mesh_blocks[block_index]);
+      }
+    }
+  }
+}
+
+// Wrappers
+
+void MeshIntegrator::getMeshableBlocksGPU(
+    const TsdfLayer& distance_layer, const std::vector<Index3D>& block_indices,
+    float cutoff_distance, std::vector<Index3D>* meshable_blocks) {
+  CHECK_NOTNULL(meshable_blocks);
+  if (block_indices.size() == 0) {
+    return;
+  }
+
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  // One threadblock per block, 1 thread per voxel. :)
+  // Dim block can be smaller, but dim_threads must be the same.
+  int dim_block = block_indices.size();
+  dim3 dim_threads(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide);
+
+  // Collect all the meshable blocks as raw pointers.
+  // Get all the block pointers and positions.
+  block_ptrs_host_.resize(block_indices.size());
+
+  for (size_t i = 0; i < block_indices.size(); i++) {
+    block_ptrs_host_[i] =
+        distance_layer.getBlockAtIndex(block_indices[i]).get();
+  }
+
+  block_ptrs_device_ = block_ptrs_host_;
+
+  // Allocate a device vector that holds the meshable result.
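+  // (One flag per candidate block: the kernel raises a flag if any voxel in
+  // the block is near the surface, and the host compacts the flagged indices
+  // below.)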
+  meshable_device_.resize(block_indices.size());
+  meshable_device_.setZero();
+
+  checkCudaErrors(cudaPeekAtLastError());
+  isBlockMeshableKernel<<<dim_block, dim_threads, 0, cuda_stream_>>>(
+      block_indices.size(), block_ptrs_device_.data(), cutoff_distance,
+      min_weight_, meshable_device_.data());
+  checkCudaErrors(cudaPeekAtLastError());
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream_));
+
+  meshable_host_ = meshable_device_;
+
+  for (size_t i = 0; i < block_indices.size(); i++) {
+    if (meshable_host_[i]) {
+      meshable_blocks->push_back(block_indices[i]);
+    }
+  }
+}
+
+void MeshIntegrator::meshBlocksGPU(const TsdfLayer& distance_layer,
+                                   const std::vector<Index3D>& block_indices,
+                                   BlockLayer<MeshBlock>* mesh_layer) {
+  if (block_indices.empty()) {
+    return;
+  }
+  timing::Timer mesh_prep_timer("mesh/gpu/mesh_blocks/prep");
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  constexpr int kCubeNeighbors = 8;
+
+  // One threadblock per block, 1 thread per voxel. :)
+  // Dim block can be smaller, but dim_threads must be the same.
+  int dim_block = block_indices.size();
+  dim3 dim_threads(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide);
+
+  // Get the block and voxel size.
+  const float block_size = distance_layer.block_size();
+  const float voxel_size = distance_layer.voxel_size();
+
+  // Get all the block pointers and positions.
+  // Block pointers are actually a 2D array that also holds the neighbor block
+  // pointers. The neighbors CAN be null so they need to be checked.
+  block_ptrs_host_.resize(block_indices.size() * kCubeNeighbors);
+  block_positions_host_.resize(block_indices.size());
+  for (size_t i = 0; i < block_indices.size(); i++) {
+    block_ptrs_host_[i * kCubeNeighbors] =
+        distance_layer.getBlockAtIndex(block_indices[i]).get();
+    for (size_t j = 1; j < kCubeNeighbors; j++) {
+      // Get the pointers to all the neighbors as well.
+      block_ptrs_host_[i * kCubeNeighbors + j] =
+          distance_layer
+              .getBlockAtIndex(block_indices[i] +
+                               marching_cubes::directionFromNeighborIndex(j))
+              .get();
+    }
+    block_positions_host_[i] =
+        getPositionFromBlockIndex(block_size, block_indices[i]);
+  }
+
+  // Create an output mesh blocks vector.
+  mesh_blocks_host_.resize(block_indices.size());
+
+  block_ptrs_device_ = block_ptrs_host_;
+  block_positions_device_ = block_positions_host_;
+
+  // Allocate working space
+  constexpr int kNumVoxelsPerBlock =
+      kVoxelsPerSide * kVoxelsPerSide * kVoxelsPerSide;
+  marching_cubes_results_device_.resize(block_indices.size() *
+                                        kNumVoxelsPerBlock);
+  marching_cubes_results_device_.setZero();
+  mesh_block_sizes_device_.resize(block_indices.size());
+  mesh_block_sizes_device_.setZero();
+  mesh_prep_timer.Stop();
+
+  // Run the first half of marching cubes and calculate:
+  // - the per-vertex indexes into the magic triangle table
+  // - the number of vertices in each mesh block.
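+  // (Sizing first means each MeshBlock can be allocated exactly once before
+  // the second kernel writes vertices into it.)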
+  timing::Timer mesh_kernel_1_timer("mesh/gpu/mesh_blocks/kernel_table");
+  meshBlocksCalculateTableIndicesKernel<<<dim_block, dim_threads, 0,
+                                          cuda_stream_>>>(
+      block_indices.size(), block_ptrs_device_.data(),
+      block_positions_device_.data(), voxel_size, min_weight_,
+      marching_cubes_results_device_.data(), mesh_block_sizes_device_.data());
+  checkCudaErrors(cudaPeekAtLastError());
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream_));
+
+  mesh_kernel_1_timer.Stop();
+
+  // Copy back the new mesh block sizes (so we can allocate space)
+  timing::Timer mesh_copy_timer("mesh/gpu/mesh_blocks/copy_out");
+  mesh_block_sizes_host_ = mesh_block_sizes_device_;
+  mesh_copy_timer.Stop();
+
+  // Allocate mesh blocks
+  timing::Timer mesh_allocation_timer("mesh/gpu/mesh_blocks/block_allocation");
+  for (size_t i = 0; i < block_indices.size(); i++) {
+    const int num_vertices = mesh_block_sizes_host_[i];
+
+    if (num_vertices > 0) {
+      MeshBlock::Ptr output_block =
+          mesh_layer->allocateBlockAtIndex(block_indices[i]);
+
+      // Grow the vector with a growth factor and a minimum allocation to avoid
+      // repeated reallocation
+      if (num_vertices > output_block->capacity()) {
+        constexpr int kMinimumMeshBlockTrianglesPerVoxel = 1;
+        constexpr int kMinimumMeshBlockVertices =
+            kNumVoxelsPerBlock * kMinimumMeshBlockTrianglesPerVoxel * 3;
+        constexpr int kMeshBlockOverallocationFactor = 2;
+        const int num_vertices_to_allocate =
+            std::max(kMinimumMeshBlockVertices,
+                     num_vertices * kMeshBlockOverallocationFactor);
+        output_block->reserveNumberOfVertices(num_vertices_to_allocate);
+      }
+      output_block->resizeToNumberOfVertices(num_vertices);
+      mesh_blocks_host_[i] = CudaMeshBlock(output_block.get());
+    }
+  }
+  mesh_blocks_device_ = mesh_blocks_host_;
+  mesh_allocation_timer.Stop();
+
+  // Run the second half of marching cubes
+  // - Translating the magic table indices into triangle vertices and writing
+  //   them into the mesh layer.
+  timing::Timer mesh_kernel_2_timer("mesh/gpu/mesh_blocks/kernel_vertices");
+  meshBlocksCalculateVerticesKernel<<<dim_block, dim_threads, 0,
+                                      cuda_stream_>>>(
+      block_indices.size(), marching_cubes_results_device_.data(),
+      mesh_block_sizes_device_.data(), mesh_blocks_device_.data());
+  checkCudaErrors(cudaPeekAtLastError());
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream_));
+  mesh_kernel_2_timer.Stop();
+
+  // Optional third stage: welding.
+  if (weld_vertices_) {
+    timing::Timer welding_timer("mesh/gpu/mesh_blocks/welding");
+    weldVertices(block_indices, mesh_layer);
+  }
+}
+
+void MeshIntegrator::weldVertices(const std::vector<Index3D>& block_indices,
+                                  BlockLayer<MeshBlock>* mesh_layer) {
+  for (const Index3D& index : block_indices) {
+    MeshBlock::Ptr mesh_block = mesh_layer->getBlockAtIndex(index);
+
+    if (!mesh_block || mesh_block->size() <= 3) {
+      continue;
+    }
+
+    // Store a copy of the input vertices.
+    input_vertices_ = mesh_block->vertices;
+    input_normals_ = mesh_block->normals;
+
+    // Sort vertices to bring duplicates together.
+    thrust::sort(thrust::device, mesh_block->vertices.begin(),
+                 mesh_block->vertices.end(), VectorCompare());
+
+    // Find unique vertices and erase redundancies. The iterator will point to
+    // the new last index.
+    auto iterator = thrust::unique(thrust::device,
+                                   mesh_block->vertices.begin(),
+                                   mesh_block->vertices.end());
+
+    // Figure out the new size.
+    size_t new_size = iterator - mesh_block->vertices.begin();
+    mesh_block->vertices.resize(new_size);
+    mesh_block->normals.resize(new_size);
+
+    // Find the indices of the original triangles.
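+    // (Each original vertex is searched for in the sorted, deduplicated
+    // vertex list; the position at which it is found becomes its index in
+    // the triangle index buffer.)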
+    thrust::lower_bound(thrust::device, mesh_block->vertices.begin(),
+                        mesh_block->vertices.end(), input_vertices_.begin(),
+                        input_vertices_.end(), mesh_block->triangles.begin(),
+                        VectorCompare());
+
+    // Reshuffle the normals to match.
+    thrust::scatter(thrust::device, input_normals_.begin(),
+                    input_normals_.end(), mesh_block->triangles.begin(),
+                    mesh_block->normals.begin());
+  }
+}
+
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/src/mesh/mesh_integrator_color.cu b/nvblox/src/mesh/mesh_integrator_color.cu
new file mode 100644
index 00000000..162995d9
--- /dev/null
+++ b/nvblox/src/mesh/mesh_integrator_color.cu
@@ -0,0 +1,295 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <cuda_runtime.h>
+
+#include "nvblox/core/accessors.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/integrators/integrators_common.h"
+#include "nvblox/mesh/impl/marching_cubes_table.h"
+#include "nvblox/mesh/marching_cubes.h"
+#include "nvblox/mesh/mesh_integrator.h"
+#include "nvblox/utils/timing.h"
+
+namespace nvblox {
+
+void MeshIntegrator::colorMesh(const ColorLayer& color_layer,
+                               BlockLayer<MeshBlock>* mesh_layer) {
+  colorMesh(color_layer, mesh_layer->getAllBlockIndices(), mesh_layer);
+}
+void MeshIntegrator::colorMesh(const ColorLayer& color_layer,
+                               const std::vector<Index3D>& block_indices,
+                               BlockLayer<MeshBlock>* mesh_layer) {
+  // Default choice is GPU
+  colorMeshGPU(color_layer, block_indices, mesh_layer);
+}
+
+//
+/* Color Mesh blocks on the GPU
+ *
+ * Call with
+ * - one ThreadBlock per VoxelBlock, GridDim 1D
+ * - BlockDim 1D, any size: we implement a strided access pattern over
+ *   MeshBlock vertices
+ *
+ * @param: color_blocks: a list of color blocks which correspond in position
+ *         to mesh_blocks
+ * @param: block_indices: a list of block indices.
+ * @param: cuda_mesh_blocks: a list of mesh_blocks to be colored.
+ */
+__global__ void colorMeshBlockByClosestColorVoxel(
+    const ColorBlock** color_blocks, const Index3D* block_indices,
+    const float block_size, const float voxel_size,
+    CudaMeshBlock* cuda_mesh_blocks) {
+  // Block
+  const ColorBlock* color_block_ptr = color_blocks[blockIdx.x];
+  const Index3D block_index = block_indices[blockIdx.x];
+  CudaMeshBlock cuda_mesh_block = cuda_mesh_blocks[blockIdx.x];
+
+  // The position of this block in the layer
+  const Vector3f p_L_B_m = getPositionFromBlockIndex(block_size, block_index);
+
+  // Iterate through MeshBlock vertices - strided access pattern
+  for (int i = threadIdx.x; i < cuda_mesh_block.size; i += blockDim.x) {
+    // The position of this vertex in the layer
+    const Vector3f p_L_V_m = cuda_mesh_block.vertices[i];
+
+    // The position of this vertex in the block
+    const Vector3f p_B_V_m = p_L_V_m - p_L_B_m;
+
+    // Convert this to a voxel index
+    Index3D voxel_idx_in_block = (p_B_V_m.array() / voxel_size).cast<int>();
+
+    // NOTE(alexmillane): Here we make some assumptions.
+    // - We assume that the closest voxel to p_L_V is in the ColorBlock
+    //   co-located with the MeshBlock from which p_L_V was drawn.
+    // - This will (very?) occasionally be incorrect when mesh vertices
+    //   escape block boundaries. However, making this assumption saves us any
+    //   neighbour calculations.
+    constexpr size_t kVoxelsPerSideMinusOne =
+        VoxelBlock<ColorVoxel>::kVoxelsPerSide - 1;
+    voxel_idx_in_block =
+        voxel_idx_in_block.array().min(kVoxelsPerSideMinusOne).max(0);
+
+    // Get the color voxel
+    const ColorVoxel color_voxel =
+        color_block_ptr->voxels[voxel_idx_in_block.x()]  // NOLINT
+                               [voxel_idx_in_block.y()]  // NOLINT
+                               [voxel_idx_in_block.z()];
+
+    // Write the color out to global memory
+    cuda_mesh_block.colors[i] = color_voxel.color;
+  }
+}
+
+__global__ void colorMeshBlocksConstant(Color color,
+                                        CudaMeshBlock* cuda_mesh_blocks) {
+  // Each threadBlock operates on a single MeshBlock
+  CudaMeshBlock cuda_mesh_block = cuda_mesh_blocks[blockIdx.x];
+  // Iterate through MeshBlock vertices - strided access pattern
+  for (int i = threadIdx.x; i < cuda_mesh_block.size; i += blockDim.x) {
+    cuda_mesh_block.colors[i] = color;
+  }
+}
+
+void colorMeshBlocksConstantGPU(const std::vector<Index3D>& block_indices,
+                                const Color& color, MeshLayer* mesh_layer,
+                                cudaStream_t cuda_stream) {
+  CHECK_NOTNULL(mesh_layer);
+  if (block_indices.size() == 0) {
+    return;
+  }
+
+  // Prepare CudaMeshBlocks, which are effectively containers of device
+  // pointers
+  std::vector<CudaMeshBlock> cuda_mesh_blocks;
+  cuda_mesh_blocks.resize(block_indices.size());
+  for (int i = 0; i < block_indices.size(); i++) {
+    cuda_mesh_blocks[i] =
+        CudaMeshBlock(mesh_layer->getBlockAtIndex(block_indices[i]).get());
+  }
+
+  // Allocate
+  CudaMeshBlock* cuda_mesh_block_device_ptrs;
+  checkCudaErrors(cudaMalloc(&cuda_mesh_block_device_ptrs,
+                             cuda_mesh_blocks.size() * sizeof(CudaMeshBlock)));
+
+  // Host -> GPU
+  checkCudaErrors(
+      cudaMemcpyAsync(cuda_mesh_block_device_ptrs, cuda_mesh_blocks.data(),
+                      cuda_mesh_blocks.size() * sizeof(CudaMeshBlock),
+                      cudaMemcpyHostToDevice, cuda_stream));
+
+  // Kernel call - One ThreadBlock launched per VoxelBlock
+  constexpr int kThreadsPerBlock = 8 * 32;  // Chosen at random
+  const int num_blocks = block_indices.size();
+  colorMeshBlocksConstant<<<num_blocks, kThreadsPerBlock, 0, cuda_stream>>>(
+      color,  // NOLINT
+      cuda_mesh_block_device_ptrs);
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream));
+  checkCudaErrors(cudaPeekAtLastError());
+
+  // Deallocate
+  checkCudaErrors(cudaFree(cuda_mesh_block_device_ptrs));
+}
+
+void colorMeshBlockByClosestColorVoxelGPU(
+    const ColorLayer& color_layer, const std::vector<Index3D>& block_indices,
+    MeshLayer* mesh_layer, cudaStream_t cuda_stream) {
+  CHECK_NOTNULL(mesh_layer);
+  if (block_indices.size() == 0) {
+    return;
+  }
+
+  // Get the locations (on device) of the color blocks
+  // NOTE(alexmillane): This function assumes that all block_indices have been
+  // checked to exist in color_layer.
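+  // (colorMeshGPU() below establishes this guarantee by splitting the
+  // requested indices into in-color-layer and not-in-color-layer groups.)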
+  std::vector<const ColorBlock*> color_blocks =
+      getBlockPtrsFromIndices(block_indices, color_layer);
+
+  // Prepare CudaMeshBlocks, which are effectively containers of device
+  // pointers
+  std::vector<CudaMeshBlock> cuda_mesh_blocks;
+  cuda_mesh_blocks.resize(block_indices.size());
+  for (int i = 0; i < block_indices.size(); i++) {
+    cuda_mesh_blocks[i] =
+        CudaMeshBlock(mesh_layer->getBlockAtIndex(block_indices[i]).get());
+  }
+
+  // Allocate
+  const ColorBlock** color_block_device_ptrs;
+  checkCudaErrors(cudaMalloc(&color_block_device_ptrs,
+                             color_blocks.size() * sizeof(ColorBlock*)));
+  Index3D* block_indices_device_ptr;
+  checkCudaErrors(cudaMalloc(&block_indices_device_ptr,
+                             block_indices.size() * sizeof(Index3D)));
+  CudaMeshBlock* cuda_mesh_block_device_ptrs;
+  checkCudaErrors(cudaMalloc(&cuda_mesh_block_device_ptrs,
+                             cuda_mesh_blocks.size() * sizeof(CudaMeshBlock)));
+
+  // Host -> GPU transfers
+  checkCudaErrors(cudaMemcpyAsync(color_block_device_ptrs, color_blocks.data(),
+                                  color_blocks.size() * sizeof(ColorBlock*),
+                                  cudaMemcpyHostToDevice, cuda_stream));
+  checkCudaErrors(cudaMemcpyAsync(block_indices_device_ptr,
+                                  block_indices.data(),
+                                  block_indices.size() * sizeof(Index3D),
+                                  cudaMemcpyHostToDevice, cuda_stream));
+  checkCudaErrors(
+      cudaMemcpyAsync(cuda_mesh_block_device_ptrs, cuda_mesh_blocks.data(),
+                      cuda_mesh_blocks.size() * sizeof(CudaMeshBlock),
+                      cudaMemcpyHostToDevice, cuda_stream));
+
+  // Kernel call - One ThreadBlock launched per VoxelBlock
+  constexpr int kThreadsPerBlock = 8 * 32;  // Chosen at random
+  const int num_blocks = block_indices.size();
+  const float voxel_size =
+      mesh_layer->block_size() / VoxelBlock<ColorVoxel>::kVoxelsPerSide;
+  colorMeshBlockByClosestColorVoxel<<<num_blocks, kThreadsPerBlock, 0,
+                                      cuda_stream>>>(
+      color_block_device_ptrs,   // NOLINT
+      block_indices_device_ptr,  // NOLINT
+      mesh_layer->block_size(),  // NOLINT
+      voxel_size,                // NOLINT
+      cuda_mesh_block_device_ptrs);
+  checkCudaErrors(cudaStreamSynchronize(cuda_stream));
+  checkCudaErrors(cudaPeekAtLastError());
+
+  // Deallocate
+  checkCudaErrors(cudaFree(color_block_device_ptrs));
+  checkCudaErrors(cudaFree(block_indices_device_ptr));
+  checkCudaErrors(cudaFree(cuda_mesh_block_device_ptrs));
+}
+
+void MeshIntegrator::colorMeshGPU(const ColorLayer& color_layer,
+                                  MeshLayer* mesh_layer) {
+  colorMeshGPU(color_layer, mesh_layer->getAllBlockIndices(), mesh_layer);
+}
+
+void MeshIntegrator::colorMeshGPU(
+    const ColorLayer& color_layer,
+    const std::vector<Index3D>& requested_block_indices,
+    MeshLayer* mesh_layer) {
+  CHECK_NOTNULL(mesh_layer);
+  CHECK_EQ(color_layer.block_size(), mesh_layer->block_size());
+
+  // NOTE(alexmillane): Generally, some of the MeshBlocks which we are
+  // "coloring" will not have data in the color layer. HOWEVER, for colored
+  // MeshBlocks (ie with non-empty color members), the size of the colors must
+  // match vertices. Therefore we "color" all requested block_indices in two
+  // parts:
+  // - The first part using the color layer, and
+  // - the second part a constant color.
+
+  // Check, for each index, that the MeshBlock exists, and if it does,
+  // allocate space for color.
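+  // (expandColorsToMatchVertices() sizes each block's color vector to its
+  // vertex count, so the kernels can write exactly one color per vertex.)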
+  std::vector<Index3D> block_indices;
+  block_indices.reserve(requested_block_indices.size());
+  std::for_each(
+      requested_block_indices.begin(), requested_block_indices.end(),
+      [&mesh_layer, &block_indices](const Index3D& block_idx) {
+        if (mesh_layer->isBlockAllocated(block_idx)) {
+          mesh_layer->getBlockAtIndex(block_idx)->expandColorsToMatchVertices();
+          block_indices.push_back(block_idx);
+        }
+      });
+
+  // Split block indices into two groups, one group containing indices with
+  // corresponding ColorBlocks, and one without.
+  std::vector<Index3D> block_indices_in_color_layer;
+  std::vector<Index3D> block_indices_not_in_color_layer;
+  block_indices_in_color_layer.reserve(block_indices.size());
+  block_indices_not_in_color_layer.reserve(block_indices.size());
+  for (const Index3D& block_idx : block_indices) {
+    if (color_layer.isBlockAllocated(block_idx)) {
+      block_indices_in_color_layer.push_back(block_idx);
+    } else {
+      block_indices_not_in_color_layer.push_back(block_idx);
+    }
+  }
+
+  // Color
+  colorMeshBlockByClosestColorVoxelGPU(
+      color_layer, block_indices_in_color_layer, mesh_layer, cuda_stream_);
+  colorMeshBlocksConstantGPU(block_indices_not_in_color_layer,
+                             default_mesh_color_, mesh_layer, cuda_stream_);
+}
+
+void MeshIntegrator::colorMeshCPU(const ColorLayer& color_layer,
+                                  BlockLayer<MeshBlock>* mesh_layer) {
+  colorMeshCPU(color_layer, mesh_layer->getAllBlockIndices(), mesh_layer);
+}
+
+void MeshIntegrator::colorMeshCPU(const ColorLayer& color_layer,
+                                  const std::vector<Index3D>& block_indices,
+                                  BlockLayer<MeshBlock>* mesh_layer) {
+  // For each vertex just grab the closest color
+  for (const Index3D& block_idx : block_indices) {
+    MeshBlock::Ptr block = mesh_layer->getBlockAtIndex(block_idx);
+    if (block == nullptr) {
+      continue;
+    }
+    block->colors.resize(block->vertices.size());
+    for (int i = 0; i < block->vertices.size(); i++) {
+      const Vector3f& vertex = block->vertices[i];
+      const ColorVoxel* color_voxel;
+      if (getVoxelAtPosition(color_layer, vertex, &color_voxel)) {
+        block->colors[i] = color_voxel->color;
+      } else {
+        block->colors[i] = Color::Gray();
+      }
+    }
+  }
+}
+
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/src/primitives/primitives.cpp b/nvblox/src/primitives/primitives.cpp
new file mode 100644
index 00000000..efb097e4
--- /dev/null
+++ b/nvblox/src/primitives/primitives.cpp
@@ -0,0 +1,332 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+---- Original voxblox license, which this file is heavily based on: ----
+Copyright (c) 2016, ETHZ ASL
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+ +* Neither the name of voxblox nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ +#include "nvblox/primitives/primitives.h" + +namespace nvblox { +namespace primitives { + +float Sphere::getDistanceToPoint(const Vector3f& point) const { + float distance = (center_ - point).norm() - radius_; + return distance; +} + +bool Sphere::getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* intersect_point, + float* intersect_dist) const { + // From https://en.wikipedia.org/wiki/Line%E2%80%93sphere_intersection + // x = o + dl is the ray equation + // r = sphere radius, c = sphere center + float under_square_root = pow(ray_direction.dot(ray_origin - center_), 2) - + (ray_origin - center_).squaredNorm() + + pow(radius_, 2); + + // No real roots = no intersection. + if (under_square_root < 0.0) { + return false; + } + + float d = + -(ray_direction.dot(ray_origin - center_)) - sqrt(under_square_root); + + // Intersection behind the origin. + if (d < 0.0) { + return false; + } + // Intersection greater than max dist, so no intersection in the sensor + // range. + if (d > max_dist) { + return false; + } + + *intersect_point = ray_origin + d * ray_direction; + *intersect_dist = d; + return true; +} + +float Cube::getDistanceToPoint(const Vector3f& point) const { + // Solution from http://stackoverflow.com/questions/5254838/ + // calculating-distance-between-a-point-and-a-rectangular-box-nearest-point + + Vector3f distance_vector = Vector3f::Zero(); + distance_vector.x() = + std::max(std::max(center_.x() - size_.x() / 2.0 - point.x(), 0.0), + point.x() - center_.x() - size_.x() / 2.0); + distance_vector.y() = + std::max(std::max(center_.y() - size_.y() / 2.0 - point.y(), 0.0), + point.y() - center_.y() - size_.y() / 2.0); + distance_vector.z() = + std::max(std::max(center_.z() - size_.z() / 2.0 - point.z(), 0.0), + point.z() - center_.z() - size_.z() / 2.0); + + float distance = distance_vector.norm(); + + // Basically 0... Means it's inside! 
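+  // (The clamped per-axis distances above are all zero inside the cube, so
+  // we fall back to the largest unclamped per-axis signed distance, which is
+  // negative inside.)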
+ if (distance < kEpsilon) { + distance_vector.x() = std::max(center_.x() - size_.x() / 2.0 - point.x(), + point.x() - center_.x() - size_.x() / 2.0); + distance_vector.y() = std::max(center_.y() - size_.y() / 2.0 - point.y(), + point.y() - center_.y() - size_.y() / 2.0); + distance_vector.z() = std::max(center_.z() - size_.z() / 2.0 - point.z(), + point.z() - center_.z() - size_.z() / 2.0); + distance = distance_vector.maxCoeff(); + } + + return distance; +} + +bool Cube::getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* intersect_point, + float* intersect_dist) const { + // Adapted from https://www.scratchapixel.com/lessons/3d-basic-rendering/ + // minimal-ray-tracer-rendering-simple-shapes/ray-box-intersection + // Compute min and max limits in 3D. + + // Precalculate signs and inverse directions. + Vector3f inv_dir(1.0 / ray_direction.x(), 1.0 / ray_direction.y(), + 1.0 / ray_direction.z()); + Eigen::Vector3i ray_sign(inv_dir.x() < 0.0, inv_dir.y() < 0.0, + inv_dir.z() < 0.0); + + Vector3f bounds[2]; + bounds[0] = center_ - size_ / 2.0; + bounds[1] = center_ + size_ / 2.0; + + float tmin = (bounds[ray_sign.x()].x() - ray_origin.x()) * inv_dir.x(); + float tmax = (bounds[1 - ray_sign.x()].x() - ray_origin.x()) * inv_dir.x(); + float tymin = (bounds[ray_sign.y()].y() - ray_origin.y()) * inv_dir.y(); + float tymax = (bounds[1 - ray_sign.y()].y() - ray_origin.y()) * inv_dir.y(); + + if ((tmin > tymax) || (tymin > tmax)) return false; + if (tymin > tmin) tmin = tymin; + if (tymax < tmax) tmax = tymax; + + float tzmin = (bounds[ray_sign.z()].z() - ray_origin.z()) * inv_dir.z(); + float tzmax = (bounds[1 - ray_sign.z()].z() - ray_origin.z()) * inv_dir.z(); + + if ((tmin > tzmax) || (tzmin > tmax)) return false; + if (tzmin > tmin) tmin = tzmin; + if (tzmax < tmax) tmax = tzmax; + + float t = tmin; + if (t < 0.0) { + t = tmax; + if (t < 0.0) { + return false; + } + } + + if (t > max_dist) { + return false; + } + + *intersect_dist = t; + *intersect_point = ray_origin + ray_direction * t; + + return true; +} + +float Plane::getDistanceToPoint(const Vector3f& point) const { + // Compute the 'd' in ax + by + cz + d = 0: + // This is actually the scalar product I guess. + float d = -normal_.dot(center_); + float p = d / normal_.norm(); + + float distance = normal_.dot(point) + p; + return distance; +} + +bool Plane::getRayIntersection(const Vector3f& ray_origin, + const Vector3f& ray_direction, float max_dist, + Vector3f* intersect_point, + float* intersect_dist) const { + // From https://en.wikipedia.org/wiki/Line%E2%80%93plane_intersection + // Following notation of sphere more... + // x = o + dl is the ray equation + // n = normal, c = plane 'origin' + float denominator = ray_direction.dot(normal_); + if (std::abs(denominator) < kEpsilon) { + // Lines are parallel, no intersection. + return false; + } + float d = (center_ - ray_origin).dot(normal_) / denominator; + if (d < 0.0) { + return false; + } + if (d > max_dist) { + return false; + } + *intersect_point = ray_origin + d * ray_direction; + *intersect_dist = d; + return true; +} + +float Cylinder::getDistanceToPoint(const Vector3f& point) const { + // From: https://math.stackexchange.com/questions/2064745/ + // 3 cases, depending on z of point. + // First case: in plane with the cylinder. This also takes care of inside + // case. 
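+  // (For that case the distance is the 2D radial distance to the axis minus
+  // the radius, which comes out negative inside the cylinder.)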
+  float distance = 0.0;
+
+  float min_z_limit = center_.z() - height_ / 2.0;
+  float max_z_limit = center_.z() + height_ / 2.0;
+  if (point.z() >= min_z_limit && point.z() <= max_z_limit) {
+    distance = (point.head<2>() - center_.head<2>()).norm() - radius_;
+  } else if (point.z() > max_z_limit) {
+    // Case 2: above the cylinder.
+    distance =
+        std::sqrt(std::max((point.head<2>() - center_.head<2>()).squaredNorm() -
+                               radius_ * radius_,
+                           static_cast<float>(0.0)) +
+                  (point.z() - max_z_limit) * (point.z() - max_z_limit));
+  } else {
+    // Case 3: below cylinder.
+    distance =
+        std::sqrt(std::max((point.head<2>() - center_.head<2>()).squaredNorm() -
+                               radius_ * radius_,
+                           static_cast<float>(0.0)) +
+                  (point.z() - min_z_limit) * (point.z() - min_z_limit));
+  }
+  return distance;
+}
+
+bool Cylinder::getRayIntersection(const Vector3f& ray_origin,
+                                  const Vector3f& ray_direction, float max_dist,
+                                  Vector3f* intersect_point,
+                                  float* intersect_dist) const {
+  // From http://woo4.me/wootracer/cylinder-intersection/
+  // and http://www.cl.cam.ac.uk/teaching/1999/AGraphHCI/SMAG/node2.html
+  // Define ray as P = E + tD, where E is ray_origin and D is ray_direction.
+  // We define our cylinder as centered in the xy coordinate system, so
+  // E in this case is actually ray_origin - center_.
+  Vector3f vector_E = ray_origin - center_;
+  Vector3f vector_D = ray_direction;  // Axis aligned.
+
+  float a = vector_D.x() * vector_D.x() + vector_D.y() * vector_D.y();
+  float b = 2 * vector_E.x() * vector_D.x() + 2 * vector_E.y() * vector_D.y();
+  float c = vector_E.x() * vector_E.x() + vector_E.y() * vector_E.y() -
+            radius_ * radius_;
+
+  // t = (-b +- sqrt(b^2 - 4ac))/2a
+  // t only has solutions if b^2 - 4ac >= 0
+  float t1 = -1.0;
+  float t2 = -1.0;
+
+  // Make sure we don't divide by 0.
+  if (std::abs(a) < kEpsilon) {
+    return false;
+  }
+
+  float under_square_root = b * b - 4 * a * c;
+  if (under_square_root < 0) {
+    return false;
+  }
+  if (under_square_root <= kEpsilon) {
+    t1 = -b / (2 * a);
+    // Just keep t2 at invalid default value.
+  } else {
+    // 2 ts.
+    t1 = (-b + std::sqrt(under_square_root)) / (2 * a);
+    t2 = (-b - std::sqrt(under_square_root)) / (2 * a);
+  }
+
+  // Great, now we got some ts, time to figure out whether we hit the cylinder
+  // or the endcaps.
+  float t = max_dist;
+
+  float z1 = vector_E.z() + t1 * vector_D.z();
+  float z2 = vector_E.z() + t2 * vector_D.z();
+  bool t1_valid = t1 >= 0.0 && z1 >= -height_ / 2.0 && z1 <= height_ / 2.0;
+  bool t2_valid = t2 >= 0.0 && z2 >= -height_ / 2.0 && z2 <= height_ / 2.0;
+
+  // Get the endcaps and their validity.
+  // Check end-cap intersections now... :(
+  float t3, t4;
+  bool t3_valid = false, t4_valid = false;
+
+  // Make sure we don't divide by 0.
+  if (std::abs(vector_D.z()) > kEpsilon) {
+    // t3 is the bottom end-cap, t4 is the top.
+    t3 = (-height_ / 2.0 - vector_E.z()) / vector_D.z();
+    t4 = (height_ / 2.0 - vector_E.z()) / vector_D.z();
+
+    Vector3f q3 = vector_E + t3 * vector_D;
+    Vector3f q4 = vector_E + t4 * vector_D;
+
+    t3_valid = t3 >= 0.0 && q3.head<2>().norm() < radius_;
+    t4_valid = t4 >= 0.0 && q4.head<2>().norm() < radius_;
+  }
+
+  if (!(t1_valid || t2_valid || t3_valid || t4_valid)) {
+    return false;
+  }
+  if (t1_valid) {
+    t = std::min(t, t1);
+  }
+  if (t2_valid) {
+    t = std::min(t, t2);
+  }
+  if (t3_valid) {
+    t = std::min(t, t3);
+  }
+  if (t4_valid) {
+    t = std::min(t, t4);
+  }
+
+  // Intersection greater than max dist, so no intersection in the sensor
+  // range.
+ if (t >= max_dist) { + return false; + } + + // Back to normal coordinates now. + *intersect_point = ray_origin + t * ray_direction; + *intersect_dist = t; + return true; +} + +} // namespace primitives +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/src/primitives/scene.cpp b/nvblox/src/primitives/scene.cpp new file mode 100644 index 00000000..51298290 --- /dev/null +++ b/nvblox/src/primitives/scene.cpp @@ -0,0 +1,161 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +---- Original voxblox license, which this file is heavily based on: ---- +Copyright (c) 2016, ETHZ ASL +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of voxblox nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/
+#include "nvblox/primitives/scene.h"
+
+namespace nvblox {
+namespace primitives {
+
+Scene::Scene() : aabb_(Vector3f(-5.0, -5.0, -1.0), Vector3f(5.0, 5.0, 9.0)) {}
+
+void Scene::addPrimitive(std::unique_ptr<Primitive> primitive) {
+  primitives_.emplace_back(std::move(primitive));
+}
+
+void Scene::addGroundLevel(float height) {
+  primitives_.emplace_back(
+      new Plane(Vector3f(0.0, 0.0, height), Vector3f(0.0, 0.0, 1.0)));
+}
+
+void Scene::addCeiling(float height) {
+  primitives_.emplace_back(
+      new Plane(Vector3f(0.0, 0.0, height), Vector3f(0.0, 0.0, -1.0)));
+}
+
+void Scene::addPlaneBoundaries(float x_min, float x_max, float y_min,
+                               float y_max) {
+  // X planes:
+  primitives_.emplace_back(
+      new Plane(Vector3f(x_min, 0.0, 0.0), Vector3f(1.0, 0.0, 0.0)));
+  primitives_.emplace_back(
+      new Plane(Vector3f(x_max, 0.0, 0.0), Vector3f(-1.0, 0.0, 0.0)));
+
+  // Y planes:
+  primitives_.emplace_back(
+      new Plane(Vector3f(0.0, y_min, 0.0), Vector3f(0.0, 1.0, 0.0)));
+  primitives_.emplace_back(
+      new Plane(Vector3f(0.0, y_max, 0.0), Vector3f(0.0, -1.0, 0.0)));
+}
+
+void Scene::clear() { primitives_.clear(); }
+
+float Scene::getSignedDistanceToPoint(const Vector3f& coords,
+                                      float max_dist) const {
+  float min_dist = max_dist;
+  for (const std::unique_ptr<Primitive>& primitive : primitives_) {
+    float primitive_dist = primitive->getDistanceToPoint(coords);
+    if (primitive_dist < min_dist) {
+      min_dist = primitive_dist;
+    }
+  }
+
+  return min_dist;
+}
+
+bool Scene::getRayIntersection(const Vector3f& ray_origin,
+                               const Vector3f& ray_direction, float max_dist,
+                               Vector3f* ray_intersection,
+                               float* ray_dist) const {
+  CHECK_NOTNULL(ray_intersection);
+  CHECK_NOTNULL(ray_dist);
+  *ray_intersection = Vector3f::Zero();
+  *ray_dist = max_dist;
+  bool ray_valid = false;
+  for (const std::unique_ptr<Primitive>& primitive : primitives_) {
+    Vector3f primitive_intersection;
+    float primitive_dist;
+    bool intersects =
+        primitive->getRayIntersection(ray_origin, ray_direction, max_dist,
+                                      &primitive_intersection, &primitive_dist);
+    if (intersects) {
+      if (!ray_valid || primitive_dist < *ray_dist) {
+        ray_valid = true;
+        *ray_dist = primitive_dist;
+        *ray_intersection = primitive_intersection;
+      }
+    }
+  }
+
+  return ray_valid;
+}
+
+void Scene::generateDepthImageFromScene(const Camera& camera,
+                                        const Transform& T_S_C, float max_dist,
+                                        DepthImage* depth_frame) const {
+  CHECK_NOTNULL(depth_frame);
+  CHECK(depth_frame->memory_type() == MemoryType::kUnified)
+      << "For scene generation DepthImage with memory_type == kUnified is "
+         "required.";
+  CHECK_EQ(depth_frame->rows(), camera.height());
+  CHECK_EQ(depth_frame->cols(), camera.width());
+
+  const Transform T_C_S = T_S_C.inverse();
+
+  // Iterate over the entire image.
+  Index2D u_C;
+  for (u_C.x() = 0; u_C.x() < camera.width(); u_C.x()++) {
+    for (u_C.y() = 0; u_C.y() < camera.height(); u_C.y()++) {
+      // Get the ray going through this pixel.
+      const Vector3f ray_direction =
+          T_S_C.linear() * camera.rayFromPixelIndices(u_C).normalized();
+      // Get the intersection point for this ray.
+      Vector3f ray_intersection;
+      float ray_dist;
+      if (getRayIntersection(T_S_C.translation(), ray_direction, max_dist,
+                             &ray_intersection, &ray_dist)) {
+        // The ray intersection is expressed in the world coordinate frame.
+        // We must transform it back to the camera coordinate frame.
+        const Vector3f p_C = T_C_S * ray_intersection;
+        // Then we use the z coordinate in the camera frame to set the depth.
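+        // (This is the pinhole z-depth, not the ray length ray_dist.)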
+        (*depth_frame)(u_C.y(), u_C.x()) = p_C.z();
+      } else {
+        // Otherwise set the depth to 0.0 to mark it as invalid.
+        (*depth_frame)(u_C.y(), u_C.x()) = 0.0f;
+      }
+    }
+  }
+}
+
+}  // namespace primitives
+}  // namespace nvblox
\ No newline at end of file
diff --git a/nvblox/src/ray_tracing/cuda/sphere_tracer.cu b/nvblox/src/ray_tracing/cuda/sphere_tracer.cu
new file mode 100644
index 00000000..de08621d
--- /dev/null
+++ b/nvblox/src/ray_tracing/cuda/sphere_tracer.cu
@@ -0,0 +1,278 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/ray_tracing/sphere_tracer.h"
+
+#include <cuda_runtime.h>
+#include <thrust/pair.h>
+#include <memory>
+
+#include "nvblox/gpu_hash/cuda/gpu_hash_interface.cuh"
+#include "nvblox/gpu_hash/cuda/gpu_indexing.cuh"
+#include "nvblox/utils/timing.h"
+
+namespace nvblox {
+
+__device__ inline bool isTsdfVoxelValid(const TsdfVoxel& voxel) {
+  constexpr float kMinWeight = 1e-4;
+  return voxel.weight > kMinWeight;
+}
+
+__device__ thrust::pair<float, bool> cast(
+    const Ray& ray,                                // NOLINT
+    Index3DDeviceHashMapType<TsdfBlock> block_hash,  // NOLINT
+    float truncation_distance_m,                   // NOLINT
+    float block_size_m,                            // NOLINT
+    int maximum_steps,                             // NOLINT
+    float maximum_ray_length_m,                    // NOLINT
+    float surface_distance_epsilon_m) {
+  // Approach: Step along the ray until we find the surface, or fail to find
+  // it.
+  bool last_distance_positive = false;
+  // t captures the parameter scaling along ray.direction. We assume
+  // that the ray is normalized such that t has units of meters.
+  float t = 0.0f;
+  for (int i = 0; (i < maximum_steps) && (t < maximum_ray_length_m); i++) {
+    // Current point to sample
+    const Vector3f p_L = ray.origin + t * ray.direction;
+
+    // Evaluate the distance at this point
+    float step;
+    TsdfVoxel* voxel_ptr;
+
+    // Can't get distance, let's see what to do...
+    if (!getVoxelAtPosition(block_hash, p_L, block_size_m, &voxel_ptr) ||
+        !isTsdfVoxelValid(*voxel_ptr)) {
+      // 1) We weren't in observed space before this, let's step through this
+      // (unobserved) space and hope to hit something allocated.
+      if (!last_distance_positive) {
+        // step forward by the truncation distance
+        step = truncation_distance_m;
+        last_distance_positive = false;
+      }
+      // 2) We were in observed space, now we've left it... let's kill this
+      // ray, it's risky to continue.
+      // Note(alexmillane): The "risk" here is that we've somehow passed
+      // through the truncation band. This occurs occasionally. The risk
+      // of continuing is that we can then see through an object. It's safer
+      // to stop here and hope for better luck in the next frame.
+      else {
+        return {t, false};
+      }
+    }
+    // We got a valid distance
+    else {
+      // Distance negative (or close to it)!
+      // We're gonna terminate, let's determine how.
+      if (voxel_ptr->distance < surface_distance_epsilon_m) {
+        // 1) We found a zero crossing. Terminate successfully.
+        if (last_distance_positive) {
+          // We "refine" the distance by back stepping the (now negative)
+          // distance value
+          t += voxel_ptr->distance;
+          // Output - Success!
+          return {t, true};
+        }
+        // 2) We just went from unobserved to negative. We're observing
+        // something from behind, terminate.
+        else {
+          return {t, false};
+        }
+      }
+      // Distance positive!
+      else {
+        // Step by this amount
+        step = voxel_ptr->distance;
+        last_distance_positive = true;
+      }
+    }
+
+    // Step further along the ray
+    t += step;
+  }
+  // Ran out of number of steps or distance... Fail
+  return {t, false};
+}
+
+__global__ void sphereTracingKernel(
+    const Ray ray,                                 // NOLINT
+    Index3DDeviceHashMapType<TsdfBlock> block_hash,  // NOLINT
+    float* t,                                      // NOLINT
+    bool* success_flag,                            // NOLINT
+    float truncation_distance_m,                   // NOLINT
+    float block_size_m,                            // NOLINT
+    int maximum_steps,                             // NOLINT
+    float maximum_ray_length_m,                    // NOLINT
+    float surface_distance_epsilon_m) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx != 0) return;
+
+  thrust::pair<float, bool> res =
+      cast(ray, block_hash, truncation_distance_m, block_size_m, maximum_steps,
+           maximum_ray_length_m, surface_distance_epsilon_m);
+
+  *t = res.first;
+  *success_flag = res.second;
+}
+
+__global__ void sphereTraceImageKernel(
+    const Camera camera,                           // NOLINT
+    const Transform T_S_C,                         // NOLINT
+    Index3DDeviceHashMapType<TsdfBlock> block_hash,  // NOLINT
+    float* image,                                  // NOLINT
+    float truncation_distance_m,                   // NOLINT
+    float block_size_m,                            // NOLINT
+    int maximum_steps,                             // NOLINT
+    float maximum_ray_length_m,                    // NOLINT
+    float surface_distance_epsilon_m,              // NOLINT
+    int ray_subsampling_factor) {
+  const int ray_col_idx = threadIdx.x + blockIdx.x * blockDim.x;
+  const int ray_row_idx = threadIdx.y + blockIdx.y * blockDim.y;
+  // Note: we ensure that this division works cleanly before getting here.
+  const int ray_rows = camera.rows() / ray_subsampling_factor;
+  const int ray_cols = camera.cols() / ray_subsampling_factor;
+  if ((ray_row_idx >= ray_rows) || (ray_col_idx >= ray_cols)) {
+    return;
+  }
+
+  // Get the image-plane coordinates of where this ray should pass such that it
+  // is in the center of the patch it will represent.
+  constexpr float kHalf = 1.0f / 2.0f;
+  const Index2D ray_indices(ray_col_idx, ray_row_idx);
+  const Vector2f pixel_coords =
+      (ray_indices * ray_subsampling_factor).cast<float>() +
+      kHalf * static_cast<float>(ray_subsampling_factor) * Vector2f::Ones();
+
+  // Get the ray going through this pixel (in layer coordinates)
+  const Vector3f ray_direction_C =
+      camera.rayFromImagePlaneCoordinates(pixel_coords).normalized();
+  const Ray ray_L{T_S_C.linear() * ray_direction_C, T_S_C.translation()};
+
+  // Cast the ray into the layer
+  thrust::pair<float, bool> t_optional =
+      cast(ray_L, block_hash, truncation_distance_m, block_size_m,
+           maximum_steps, maximum_ray_length_m, surface_distance_epsilon_m);
+
+  // If success, write depth to image, otherwise write -1.
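+  // (t is a length along the unit ray; multiplying by the ray's camera-frame
+  // z component converts it to pinhole z-depth.)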
+  if (t_optional.second == true) {
+    const float depth = t_optional.first * ray_direction_C.z();
+    image::access(ray_row_idx, ray_col_idx, ray_cols, image) = depth;
+  } else {
+    image::access(ray_row_idx, ray_col_idx, ray_cols, image) = -1.0f;
+  }
+}
+
+SphereTracer::SphereTracer(Params params) : params_(std::move(params)) {
+  checkCudaErrors(cudaStreamCreate(&tracing_stream_));
+  cudaMalloc(&t_device_, sizeof(float));
+  cudaMalloc(&success_flag_device_, sizeof(bool));
+}
+
+SphereTracer::~SphereTracer() {
+  cudaStreamSynchronize(tracing_stream_);
+  cudaFree(t_device_);
+  cudaFree(success_flag_device_);
+  checkCudaErrors(cudaStreamDestroy(tracing_stream_));
+}
+
+bool SphereTracer::castOnGPU(const Ray& ray, const TsdfLayer& tsdf_layer,
+                             const float truncation_distance_m,
+                             float* t) const {
+  constexpr float eps = 1e-5;
+  CHECK_NEAR(ray.direction.norm(), 1.0, eps);
+
+  // Get the GPU hash
+  GPULayerView<TsdfBlock> gpu_layer_view = tsdf_layer.getGpuLayerView();
+
+  // Kernel
+  const float surface_distance_epsilon_m =
+      params_.surface_distance_epsilon_vox * tsdf_layer.voxel_size();
+  sphereTracingKernel<<<1, 1, 0, tracing_stream_>>>(
+      ray,                             // NOLINT
+      gpu_layer_view.getHash().impl_,  // NOLINT
+      t_device_,                       // NOLINT
+      success_flag_device_,            // NOLINT
+      truncation_distance_m,           // NOLINT
+      gpu_layer_view.block_size(),     // NOLINT
+      params_.maximum_steps,           // NOLINT
+      params_.maximum_ray_length_m,    // NOLINT
+      surface_distance_epsilon_m);
+
+  // GPU -> CPU
+  cudaMemcpyAsync(t, t_device_, sizeof(float), cudaMemcpyDeviceToHost,
+                  tracing_stream_);
+
+  checkCudaErrors(cudaStreamSynchronize(tracing_stream_));
+  checkCudaErrors(cudaPeekAtLastError());
+
+  return true;
+}
+
+std::shared_ptr<DepthImage> SphereTracer::renderImageOnGPU(
+    const Camera& camera, const Transform& T_S_C, const TsdfLayer& tsdf_layer,
+    const float truncation_distance_m,
+    const MemoryType output_image_memory_type,
+    const int ray_subsampling_factor) {
+  CHECK_EQ(camera.width() % ray_subsampling_factor, 0);
+  CHECK_EQ(camera.height() % ray_subsampling_factor, 0);
+  // Output space
+  const int image_height = camera.height() / ray_subsampling_factor;
+  const int image_width = camera.width() / ray_subsampling_factor;
+  // If we get a request for a different size image, reallocate.
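+  // (The output image is cached between calls and only reallocated when the
+  // requested size or memory type changes.)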
+  if (!depth_image_ || depth_image_->width() != image_width ||
+      depth_image_->height() != image_height ||
+      depth_image_->memory_type() != output_image_memory_type) {
+    depth_image_ = std::make_shared<DepthImage>(image_height, image_width,
+                                                output_image_memory_type);
+  }
+
+  // Get the GPU hash
+  timing::Timer hash_transfer_timer(
+      "color/integrate/sphere_trace/hash_transfer");
+  GPULayerView<TsdfBlock> gpu_layer_view = tsdf_layer.getGpuLayerView();
+  hash_transfer_timer.Stop();
+
+  // Get metric surface distance epsilon
+  const float surface_distance_epsilon_m =
+      params_.surface_distance_epsilon_vox * tsdf_layer.voxel_size();
+
+  // Kernel
+  // Call params
+  // - 1 thread per pixel
+  // - 8 x 8 threads per thread block
+  // - N x M thread blocks get 1 thread per pixel
+  constexpr dim3 kThreadsPerThreadBlock(8, 8, 1);
+  const dim3 num_blocks(
+      depth_image_->cols() / kThreadsPerThreadBlock.y + 1,  // NOLINT
+      depth_image_->rows() / kThreadsPerThreadBlock.x + 1,  // NOLINT
+      1);
+  sphereTraceImageKernel<<<num_blocks, kThreadsPerThreadBlock, 0,
+                           tracing_stream_>>>(
+      camera,                          // NOLINT
+      T_S_C,                           // NOLINT
+      gpu_layer_view.getHash().impl_,  // NOLINT
+      depth_image_->dataPtr(),         // NOLINT
+      truncation_distance_m,           // NOLINT
+      gpu_layer_view.block_size(),     // NOLINT
+      params_.maximum_steps,           // NOLINT
+      params_.maximum_ray_length_m,    // NOLINT
+      surface_distance_epsilon_m,      // NOLINT
+      ray_subsampling_factor);
+  checkCudaErrors(cudaStreamSynchronize(tracing_stream_));
+  checkCudaErrors(cudaPeekAtLastError());
+
+  return depth_image_;
+}
+
+}  // namespace nvblox
diff --git a/nvblox/src/utils/nvtx_ranges.cpp b/nvblox/src/utils/nvtx_ranges.cpp
new file mode 100644
index 00000000..86b6acf3
--- /dev/null
+++ b/nvblox/src/utils/nvtx_ranges.cpp
@@ -0,0 +1,106 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/utils/nvtx_ranges.h"
+
+#include <nvToolsExt.h>
+
+#include <string>
+
+namespace nvblox {
+namespace timing {
+
+uint32_t colorToUint32(const Color& color) {
+  return 0xFF << 24 |                              // NOLINT
+         static_cast<uint32_t>(color.r) << 16 |    // NOLINT
+         static_cast<uint32_t>(color.g) << 8 |     // NOLINT
+         static_cast<uint32_t>(color.b);           // NOLINT
+}
+
+const uint32_t nvtx_ranges_colors[] = {
+    colorToUint32(Color::Black()),  colorToUint32(Color::Gray()),
+    colorToUint32(Color::Red()),    colorToUint32(Color::Green()),
+    colorToUint32(Color::Blue()),   colorToUint32(Color::Yellow()),
+    colorToUint32(Color::Orange()), colorToUint32(Color::Purple()),
+    colorToUint32(Color::Teal()),   colorToUint32(Color::Pink())};
+const int num_colors = sizeof(nvtx_ranges_colors) / sizeof(uint32_t);
+
+uint32_t colorFromString(const std::string& str) {
+  return nvtx_ranges_colors[std::hash<std::string>{}(str) % num_colors];
+}
+
+NvtxRange::NvtxRange(const std::string& message, const Color& color,
+                     bool construct_stopped)
+    : started_(false), event_attributes_{0} {
+  Init(message, color);
+  if (!construct_stopped) Start();
+}
+
+NvtxRange::NvtxRange(const std::string& message, bool construct_stopped)
+    : started_(false), event_attributes_{0} {
+  Init(message, colorFromString(message));
+  if (!construct_stopped) Start();
+}
+
+NvtxRange::~NvtxRange() { Stop(); }
+
+void NvtxRange::Start() {
+  started_ = true;
+  id_ = nvtxRangeStartEx(&event_attributes_);
+}
+
+void NvtxRange::Stop() {
+  if (started_) nvtxRangeEnd(id_);
+  started_ = false;
+}
+
+void NvtxRange::Init(const std::string& message, const Color& color) {
+  Init(message, colorToUint32(color));
+}
+
+void NvtxRange::Init(const std::string& message, const uint32_t color) {
+  // Initialize
+  event_attributes_ = nvtxEventAttributes_t{0};
+  event_attributes_.version = NVTX_VERSION;
+  event_attributes_.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  // Configure the Attributes
+  event_attributes_.colorType = NVTX_COLOR_ARGB;
+  event_attributes_.color = color;
+  event_attributes_.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  event_attributes_.message.ascii = message.c_str();
+}
+
+void mark(const std::string& message, const uint32_t color) {
+  // Initialize
+  nvtxEventAttributes_t event_attributes = {0};
+  event_attributes.version = NVTX_VERSION;
+  event_attributes.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
+  event_attributes.colorType = NVTX_COLOR_ARGB;
+  event_attributes.color = color;
+  event_attributes.messageType = NVTX_MESSAGE_TYPE_ASCII;
+  event_attributes.message.ascii = message.c_str();
+  nvtxMarkEx(&event_attributes);
+}
+
+void mark(const std::string& message) {
+  mark(message, colorFromString(message));
+}
+
+void mark(const std::string& message, const Color& color) {
+  mark(message, colorToUint32(color));
+}
+
+}  // namespace timing
+}  // namespace nvblox
diff --git a/nvblox/src/utils/timing.cpp b/nvblox/src/utils/timing.cpp
new file mode 100644
index 00000000..fafa535c
--- /dev/null
+++ b/nvblox/src/utils/timing.cpp
@@ -0,0 +1,258 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+---- Original license for this file: ----
+
+ * Copyright (C) 2012-2013 Simon Lynen, ASL, ETH Zurich, Switzerland
+ * You can contact the author at
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/* Adapted from Paul Furgale Schweizer Messer sm_timing*/
+
+#include "nvblox/utils/timing.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <algorithm>
+#include <mutex>
+#include <ostream>
+#include <sstream>
+
+namespace nvblox {
+
+namespace timing {
+
+const double kNumSecondsPerNanosecond = 1.e-9;
+
+Timing& Timing::Instance() {
+  static Timing t;
+  return t;
+}
+
+Timing::Timing() : maxTagLength_(0) {}
+
+Timing::~Timing() {}
+
+// Static functions to query the timers:
+size_t Timing::GetHandle(std::string const& tag) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  // Search for an existing tag.
+  map_t::iterator i = Instance().tagMap_.find(tag);
+  if (i == Instance().tagMap_.end()) {
+    // If it is not there, create a tag.
+    size_t handle = Instance().timers_.size();
+    Instance().tagMap_[tag] = handle;
+    Instance().timers_.push_back(TimerMapValue());
+    // Track the maximum tag length to help printing a table of timing values
+    // later.
+    Instance().maxTagLength_ = std::max(Instance().maxTagLength_, tag.size());
+    return handle;
+  } else {
+    return i->second;
+  }
+}
+
+std::string Timing::GetTag(size_t handle) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  std::string tag;
+
+  // Perform a linear search for the tag.
+  for (typename map_t::value_type current_tag : Instance().tagMap_) {
+    if (current_tag.second == handle) {
+      return current_tag.first;
+    }
+  }
+  return tag;
+}
+
+// Class functions used for timing.
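+// Usage sketch (illustrative, mirroring how the mesh integrator above uses
+// these timers): construct a named timer around a scope, stop it explicitly
+// or let the destructor do so, then print the accumulated statistics.
+//
+//   timing::Timer timer("mesh/gpu/integrate");
+//   // ... timed work ...
+//   timer.Stop();
+//   std::cout << timing::Timing::Print();
+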
+TimerChrono::TimerChrono(size_t handle, bool constructStopped)
+    : timing_(false), handle_(handle) {
+  if (!constructStopped) Start();
+}
+
+TimerChrono::TimerChrono(std::string const& tag, bool constructStopped)
+    : timing_(false), handle_(Timing::GetHandle(tag)) {
+  if (!constructStopped) Start();
+}
+
+TimerChrono::~TimerChrono() {
+  if (IsTiming()) Stop();
+}
+
+void TimerChrono::Start() {
+  timing_ = true;
+  time_ = std::chrono::system_clock::now();
+}
+
+void TimerChrono::Stop() {
+  std::chrono::time_point<std::chrono::system_clock> now =
+      std::chrono::system_clock::now();
+  double dt =
+      static_cast<double>(
+          std::chrono::duration_cast<std::chrono::nanoseconds>(now - time_)
+              .count()) *
+      kNumSecondsPerNanosecond;
+
+  Timing::Instance().AddTime(handle_, dt);
+  timing_ = false;
+}
+
+bool TimerChrono::IsTiming() const { return timing_; }
+
+TimerNvtx::TimerNvtx(std::string const& tag, bool constructStopped)
+    : timer_(tag, constructStopped), nvtx_range_(tag, constructStopped) {}
+
+TimerNvtx::TimerNvtx(std::string const& tag, const Color& color,
+                     bool constructStopped)
+    : timer_(tag, constructStopped),
+      nvtx_range_(tag, color, constructStopped) {}
+
+void TimerNvtx::Start() {
+  timer_.Start();
+  nvtx_range_.Start();
+}
+
+void TimerNvtx::Stop() {
+  timer_.Stop();
+  nvtx_range_.Stop();
+}
+
+bool TimerNvtx::IsTiming() const { return timer_.IsTiming(); }
+
+void Timing::AddTime(size_t handle, double seconds) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  timers_[handle].acc_.Add(seconds);
+}
+
+double Timing::GetTotalSeconds(size_t handle) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  return Instance().timers_[handle].acc_.Sum();
+}
+double Timing::GetTotalSeconds(std::string const& tag) {
+  return GetTotalSeconds(GetHandle(tag));
+}
+double Timing::GetMeanSeconds(size_t handle) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  return Instance().timers_[handle].acc_.Mean();
+}
+double Timing::GetMeanSeconds(std::string const& tag) {
+  return GetMeanSeconds(GetHandle(tag));
+}
+size_t Timing::GetNumSamples(size_t handle) {
+  return Instance().timers_[handle].acc_.TotalSamples();
+}
+size_t Timing::GetNumSamples(std::string const& tag) {
+  return GetNumSamples(GetHandle(tag));
+}
+double Timing::GetVarianceSeconds(size_t handle) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  return Instance().timers_[handle].acc_.LazyVariance();
+}
+double Timing::GetVarianceSeconds(std::string const& tag) {
+  return GetVarianceSeconds(GetHandle(tag));
+}
+double Timing::GetMinSeconds(size_t handle) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  return Instance().timers_[handle].acc_.Min();
+}
+double Timing::GetMinSeconds(std::string const& tag) {
+  return GetMinSeconds(GetHandle(tag));
+}
+double Timing::GetMaxSeconds(size_t handle) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  return Instance().timers_[handle].acc_.Max();
+}
+double Timing::GetMaxSeconds(std::string const& tag) {
+  return GetMaxSeconds(GetHandle(tag));
+}
+
+double Timing::GetHz(size_t handle) {
+  std::lock_guard<std::mutex> lock(Instance().mutex_);
+  const double rolling_mean = Instance().timers_[handle].acc_.RollingMean();
+  CHECK_GT(rolling_mean, 0.0);
+  return 1.0 / rolling_mean;
+}
+
+double Timing::GetHz(std::string const& tag) { return GetHz(GetHandle(tag)); }
+
+std::string Timing::SecondsToTimeString(double seconds) {
+  char buffer[256];
+  snprintf(buffer, sizeof(buffer), "%09.6f", seconds);
+  return buffer;
+}
+
+void Timing::Print(std::ostream& out) {
+  map_t& tagMap = Instance().tagMap_;
+
+  if (tagMap.empty()) {
+    return;
+  }
+
+  out << "NVBlox Timing\n";
+  out << "-----------\n";
+  for (typename map_t::value_type t : tagMap) {
(typename map_t::value_type t : tagMap) { + size_t i = t.second; + out.width((std::streamsize)Instance().maxTagLength_); + out.setf(std::ios::left, std::ios::adjustfield); + out << t.first << "\t"; + out.width(7); + + out.setf(std::ios::right, std::ios::adjustfield); + out << GetNumSamples(i) << "\t"; + if (GetNumSamples(i) > 0) { + out << SecondsToTimeString(GetTotalSeconds(i)) << "\t"; + double meansec = GetMeanSeconds(i); + double stddev = sqrt(GetVarianceSeconds(i)); + out << "(" << SecondsToTimeString(meansec) << " +- "; + out << SecondsToTimeString(stddev) << ")\t"; + + double minsec = GetMinSeconds(i); + double maxsec = GetMaxSeconds(i); + + // Print the observed min and max values. + out << "[" << SecondsToTimeString(minsec) << "," + << SecondsToTimeString(maxsec) << "]"; + } + out << std::endl; + } +} +std::string Timing::Print() { + std::stringstream ss; + Print(ss); + return ss.str(); +} + +void Timing::Reset() { + std::lock_guard<std::mutex> lock(Instance().mutex_); + Instance().tagMap_.clear(); +} + +} // namespace timing +} // namespace nvblox diff --git a/nvblox/tests/CMakeLists.txt b/nvblox/tests/CMakeLists.txt new file mode 100644 index 00000000..2d4556b4 --- /dev/null +++ b/nvblox/tests/CMakeLists.txt @@ -0,0 +1,120 @@ +find_package(GTest REQUIRED) +enable_testing() + +# Copy test data into the testing space (build/tests) +file(INSTALL "${CMAKE_CURRENT_SOURCE_DIR}/data" DESTINATION "${CMAKE_CURRENT_BINARY_DIR}") + +# A library containing functions used by many tests. +add_library(nvblox_test_utils SHARED + lib/utils.cpp + lib/projective_tsdf_integrator_cpu.cpp + lib/cuda/increment_kernel.cu + lib/cuda/gpu_image_routines.cu + lib/cuda/interpolation_2d_gpu.cu + lib/cuda/projective_tsdf_integrator_cuda_components.cu + lib/cuda/test_utils_cuda.cu + lib/cuda/blox.cu + lib/cuda/blox_utils.cu + lib/cuda/gpu_layer_utils.cu + lib/cuda/gpu_indexing.cu +) +target_include_directories(nvblox_test_utils PUBLIC + include + ${GTEST_INCLUDE_DIRS} +) +target_link_libraries(nvblox_test_utils + nvblox_lib ${GTEST_LIBRARIES} ${CUDA_LIBRARIES} pthread +) +set_target_properties(nvblox_test_utils PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +set(TEST_OPTIONS DISCOVERY_TIMEOUT 30) + +# The tests +add_executable(test_camera test_camera.cpp) +target_link_libraries(test_camera nvblox_test_utils) +gtest_discover_tests(test_camera ${TEST_OPTIONS}) + +add_executable(test_indexing test_indexing.cpp) +target_link_libraries(test_indexing nvblox_test_utils) +gtest_discover_tests(test_indexing ${TEST_OPTIONS}) + +add_executable(test_layer test_layer.cpp) +target_link_libraries(test_layer nvblox_test_utils) +gtest_discover_tests(test_layer ${TEST_OPTIONS}) + +add_executable(test_3d_interpolation test_3d_interpolation.cpp) +target_link_libraries(test_3d_interpolation nvblox_test_utils) +gtest_discover_tests(test_3d_interpolation ${TEST_OPTIONS}) + +add_executable(test_tsdf_integrator test_tsdf_integrator.cpp) +target_link_libraries(test_tsdf_integrator nvblox_test_utils) +gtest_discover_tests(test_tsdf_integrator ${TEST_OPTIONS}) + +add_executable(test_3dmatch test_3dmatch.cpp) +target_link_libraries(test_3dmatch nvblox_test_utils nvblox_lib) +gtest_discover_tests(test_3dmatch ${TEST_OPTIONS}) +set_target_properties(test_3dmatch PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + +add_executable(test_unified_ptr test_unified_ptr.cpp) +target_link_libraries(test_unified_ptr nvblox_test_utils) +gtest_discover_tests(test_unified_ptr ${TEST_OPTIONS}) + +add_executable(test_mesh test_mesh.cpp) +target_link_libraries(test_mesh
nvblox_test_utils) +gtest_discover_tests(test_mesh ${TEST_OPTIONS}) + +add_executable(test_scene test_scene.cpp) +target_link_libraries(test_scene nvblox_test_utils) +gtest_discover_tests(test_scene ${TEST_OPTIONS}) + +add_executable(test_depth_image test_depth_image.cpp) +target_link_libraries(test_depth_image nvblox_test_utils) +gtest_discover_tests(test_depth_image ${TEST_OPTIONS}) + +add_executable(test_tsdf_integrator_cuda_components test_tsdf_integrator_cuda_components.cpp) +target_link_libraries(test_tsdf_integrator_cuda_components nvblox_test_utils) +gtest_discover_tests(test_tsdf_integrator_cuda_components ${TEST_OPTIONS}) + +add_executable(test_unified_vector test_unified_vector.cpp) +target_link_libraries(test_unified_vector nvblox_test_utils) +gtest_discover_tests(test_unified_vector ${TEST_OPTIONS}) + +add_executable(test_esdf_integrator test_esdf_integrator.cpp) +target_link_libraries(test_esdf_integrator nvblox_test_utils) +gtest_discover_tests(test_esdf_integrator ${TEST_OPTIONS}) + +add_executable(test_color_image test_color_image.cpp) +target_link_libraries(test_color_image nvblox_test_utils) +gtest_discover_tests(test_color_image ${TEST_OPTIONS}) + +add_executable(test_color_integrator test_color_integrator.cpp) +target_link_libraries(test_color_integrator nvblox_test_utils) +gtest_discover_tests(test_color_integrator ${TEST_OPTIONS}) + +add_executable(test_mesh_coloring test_mesh_coloring.cpp) +target_link_libraries(test_mesh_coloring nvblox_test_utils) +gtest_discover_tests(test_mesh_coloring ${TEST_OPTIONS}) + +add_executable(test_for_memory_leaks test_for_memory_leaks.cpp) +target_link_libraries(test_for_memory_leaks nvblox_test_utils) +gtest_discover_tests(test_for_memory_leaks ${TEST_OPTIONS}) + +add_executable(test_frustum test_frustum.cpp) +target_link_libraries(test_frustum nvblox_test_utils) +gtest_discover_tests(test_frustum ${TEST_OPTIONS}) + +add_executable(test_gpu_layer_view test_gpu_layer_view.cpp) +target_link_libraries(test_gpu_layer_view nvblox_test_utils) +gtest_discover_tests(test_gpu_layer_view ${TEST_OPTIONS}) + +add_executable(test_sphere_tracing test_sphere_tracing.cpp) +target_link_libraries(test_sphere_tracing nvblox_test_utils) +gtest_discover_tests(test_sphere_tracing ${TEST_OPTIONS}) + +add_executable(test_cake test_cake.cpp) +target_link_libraries(test_cake nvblox_test_utils) +gtest_discover_tests(test_cake ${TEST_OPTIONS}) + +add_executable(test_traits test_traits.cpp) +target_link_libraries(test_traits nvblox_test_utils) +gtest_discover_tests(test_traits ${TEST_OPTIONS}) diff --git a/nvblox/tests/data/3dmatch/camera-intrinsics.txt b/nvblox/tests/data/3dmatch/camera-intrinsics.txt new file mode 100644 index 00000000..de8eb850 --- /dev/null +++ b/nvblox/tests/data/3dmatch/camera-intrinsics.txt @@ -0,0 +1,3 @@ + 5.70342205e+02 0.00000000e+00 3.20000000e+02 + 0.00000000e+00 5.70342205e+02 2.40000000e+02 + 0.00000000e+00 0.00000000e+00 1.00000000e+00 diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000000.color.png b/nvblox/tests/data/3dmatch/seq-01/frame-000000.color.png new file mode 100644 index 00000000..c28ce57b Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000000.color.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000000.depth.png b/nvblox/tests/data/3dmatch/seq-01/frame-000000.depth.png new file mode 100644 index 00000000..da1962a9 Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000000.depth.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000000.pose.txt 
b/nvblox/tests/data/3dmatch/seq-01/frame-000000.pose.txt new file mode 100644 index 00000000..3f98b68e --- /dev/null +++ b/nvblox/tests/data/3dmatch/seq-01/frame-000000.pose.txt @@ -0,0 +1,4 @@ + 3.13181000e-01 3.09473000e-01 -8.97856000e-01 1.97304600e+00 +-8.73910000e-02 -9.32015000e-01 -3.51729000e-01 1.12573400e+00 +-9.45665000e-01 1.88620000e-01 -2.64844000e-01 3.09820000e-01 + 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000001.color.png b/nvblox/tests/data/3dmatch/seq-01/frame-000001.color.png new file mode 100644 index 00000000..ab3031e6 Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000001.color.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000001.depth.png b/nvblox/tests/data/3dmatch/seq-01/frame-000001.depth.png new file mode 100644 index 00000000..578292d7 Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000001.depth.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000001.pose.txt b/nvblox/tests/data/3dmatch/seq-01/frame-000001.pose.txt new file mode 100644 index 00000000..25a0f7c6 --- /dev/null +++ b/nvblox/tests/data/3dmatch/seq-01/frame-000001.pose.txt @@ -0,0 +1,4 @@ + 3.18994000e-01 3.09870000e-01 -8.95670000e-01 1.96907500e+00 +-8.40980000e-02 -9.32059000e-01 -3.52412000e-01 1.12539500e+00 +-9.44019000e-01 1.87741000e-01 -2.71262000e-01 3.24652000e-01 + 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000002.color.png b/nvblox/tests/data/3dmatch/seq-01/frame-000002.color.png new file mode 100644 index 00000000..4a812643 Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000002.color.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000002.depth.png b/nvblox/tests/data/3dmatch/seq-01/frame-000002.depth.png new file mode 100644 index 00000000..621162ae Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000002.depth.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000002.pose.txt b/nvblox/tests/data/3dmatch/seq-01/frame-000002.pose.txt new file mode 100644 index 00000000..890e7d6e --- /dev/null +++ b/nvblox/tests/data/3dmatch/seq-01/frame-000002.pose.txt @@ -0,0 +1,4 @@ + 3.24792000e-01 3.10273000e-01 -8.93444000e-01 1.96510400e+00 +-8.08000000e-02 -9.32101000e-01 -3.53072000e-01 1.12505500e+00 +-9.42328000e-01 1.86865000e-01 -2.77669000e-01 3.39484000e-01 + 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000116.color.png b/nvblox/tests/data/3dmatch/seq-01/frame-000116.color.png new file mode 100644 index 00000000..d61f31a2 Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000116.color.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000116.depth.png b/nvblox/tests/data/3dmatch/seq-01/frame-000116.depth.png new file mode 100644 index 00000000..9a152c85 Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000116.depth.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000116.pose.txt b/nvblox/tests/data/3dmatch/seq-01/frame-000116.pose.txt new file mode 100644 index 00000000..e8cdf0fe --- /dev/null +++ b/nvblox/tests/data/3dmatch/seq-01/frame-000116.pose.txt @@ -0,0 +1,4 @@ + 7.66485000e-01 1.65186000e-01 -6.20656000e-01 1.77984700e+00 +-4.74910000e-02 -9.49137000e-01 -3.11260000e-01 1.10068500e+00 +-6.40504000e-01 2.68052000e-01 -7.19655000e-01 7.44543000e-01 + 0.00000000e+00 0.00000000e+00 
0.00000000e+00 1.00000000e+00 diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000422.color.png b/nvblox/tests/data/3dmatch/seq-01/frame-000422.color.png new file mode 100644 index 00000000..5a5b1aa5 Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000422.color.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000422.depth.png b/nvblox/tests/data/3dmatch/seq-01/frame-000422.depth.png new file mode 100644 index 00000000..e313084b Binary files /dev/null and b/nvblox/tests/data/3dmatch/seq-01/frame-000422.depth.png differ diff --git a/nvblox/tests/data/3dmatch/seq-01/frame-000422.pose.txt b/nvblox/tests/data/3dmatch/seq-01/frame-000422.pose.txt new file mode 100644 index 00000000..f86f5112 --- /dev/null +++ b/nvblox/tests/data/3dmatch/seq-01/frame-000422.pose.txt @@ -0,0 +1,4 @@ + 8.59463000e-01 8.99460000e-02 -5.03222000e-01 6.44752000e-01 + 2.90020000e-02 -9.91393000e-01 -1.27668000e-01 1.10310800e+00 +-5.10374000e-01 9.51310000e-02 -8.54675000e-01 7.29334000e-01 + 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.00000000e+00 diff --git a/nvblox/tests/include/nvblox/tests/blox.h b/nvblox/tests/include/nvblox/tests/blox.h new file mode 100644 index 00000000..58d90428 --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/blox.h @@ -0,0 +1,47 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/types.h" +#include "nvblox/core/unified_ptr.h" + +namespace nvblox { + +// Dummy block type that just stores its own index. +struct IndexBlock { + typedef unified_ptr<IndexBlock> Ptr; + typedef unified_ptr<const IndexBlock> ConstPtr; + + Index3D data; + + static Ptr allocate(MemoryType memory_type) { + return make_unified<IndexBlock>(memory_type); + } +}; + +// Dummy block that stores a single bool +struct DummyBlock { + typedef unified_ptr<DummyBlock> Ptr; + typedef unified_ptr<const DummyBlock> ConstPtr; + + bool data = false; + + static Ptr allocate(MemoryType memory_type) { + return make_unified<DummyBlock>(memory_type); + } +}; + +} // namespace nvblox diff --git a/nvblox/tests/include/nvblox/tests/blox_utils.h b/nvblox/tests/include/nvblox/tests/blox_utils.h new file mode 100644 index 00000000..23be14d1 --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/blox_utils.h @@ -0,0 +1,66 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ +#pragma once + +#include "nvblox/core/blox.h" +#include "nvblox/core/common_names.h" + +namespace nvblox { + +template <typename BlockType> +void setBlockBytesConstantOnGPU(int value, BlockType* block_device_ptr) { + cudaMemset(block_device_ptr, value, sizeof(BlockType)); +} + +struct TestVoxel { + static constexpr uint8_t kCPUInitializationValue = 0; + static constexpr uint8_t kGPUInitializationValue = 1; + + uint8_t data = kCPUInitializationValue; +}; + +template <> +inline void VoxelBlock<TestVoxel>::initOnGPU( + VoxelBlock<TestVoxel>* voxel_block_ptr) { + setBlockBytesConstantOnGPU(TestVoxel::kGPUInitializationValue, + voxel_block_ptr); +} + +// A custom (non-voxel) block which forgets to define allocate(), and therefore +// will fail the type-trait check nvblox::traits::has_allocate() +struct TestBlockNoAllocation { + typedef unified_ptr<TestBlockNoAllocation> Ptr; + typedef unified_ptr<const TestBlockNoAllocation> ConstPtr; + + static constexpr uint8_t kCPUInitializationValue = 0; + + uint8_t data = kCPUInitializationValue; +}; + +using TestBlock = VoxelBlock<TestVoxel>; + +namespace test_utils { + +// Fills a TsdfBlock such that the voxels distance and weight values are their +// linear index (as a float) +void setTsdfBlockVoxelsInSequence(TsdfBlock::Ptr block); + +bool checkBlockAllConstant(const TsdfBlock::Ptr block, TsdfVoxel voxel_cpu); +bool checkBlockAllConstant(const TestBlock::Ptr block, TestVoxel voxel_cpu); +bool checkBlockAllConstant(const ColorBlock::Ptr block, ColorVoxel voxel_cpu); + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/include/nvblox/tests/gpu_image_routines.h b/nvblox/tests/include/nvblox/tests/gpu_image_routines.h new file mode 100644 index 00000000..c2b9a237 --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/gpu_image_routines.h @@ -0,0 +1,34 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/color.h" +#include "nvblox/core/image.h" +#include "nvblox/core/types.h" + +namespace nvblox { +namespace test_utils { + +void setImageConstantOnGpu(const float value, DepthImage* image_ptr); +void setImageConstantOnGpu(const Color value, ColorImage* image_ptr); + +void getDifferenceImageOnGPU(const ColorImage& image_1, + const ColorImage& image_2, + ColorImage* diff_image_ptr); +void getDifferenceImageOnGPU(const DepthImage& image_1, + const DepthImage& image_2, + DepthImage* diff_image_ptr); + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/include/nvblox/tests/gpu_indexing.h b/nvblox/tests/include/nvblox/tests/gpu_indexing.h new file mode 100644 index 00000000..0a4a8732 --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/gpu_indexing.h @@ -0,0 +1,31 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/types.h" + +namespace nvblox { +namespace test_utils { + +void getBlockAndVoxelIndexFromPositionInLayerOnGPU(const float block_size, + const Vector3f& position, + Index3D* block_idx, + Index3D* voxel_idx); + +void getBlockAndVoxelIndexFromPositionInLayerOnGPU( + const float block_size, const std::vector<Vector3f>& positions, + std::vector<Index3D>* block_indices, std::vector<Index3D>* voxel_indices); + +} // namespace test_utils +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/tests/include/nvblox/tests/gpu_layer_utils.h b/nvblox/tests/include/nvblox/tests/gpu_layer_utils.h new file mode 100644 index 00000000..03b175c8 --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/gpu_layer_utils.h @@ -0,0 +1,36 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include <utility> + +#include <vector> +#include "nvblox/gpu_hash/gpu_layer_view.h" + +namespace nvblox { +namespace test_utils { + +// Returns a vector of flags indicating if the query indices exist. +std::vector<bool> getContainsFlags(const GPULayerView<TsdfBlock>& gpu_layer, + const std::vector<Index3D>& indices); + +// Retrieves Voxels on the GPU and brings them back to CPU. +std::pair<std::vector<TsdfVoxel>, std::vector<bool>> getVoxelsAtPositionsOnGPU( + const GPULayerView<TsdfBlock>& gpu_layer, + const std::vector<Vector3f>& p_L_vec); + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/include/nvblox/tests/increment_on_gpu.h b/nvblox/tests/include/nvblox/tests/increment_on_gpu.h new file mode 100644 index 00000000..254477df --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/increment_on_gpu.h @@ -0,0 +1,25 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +namespace nvblox { +namespace test_utils { + +void incrementOnGPU(int* number); + +void incrementOnGPU(const int num_elements, int* number); + +} // namespace test_utils +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/tests/include/nvblox/tests/interpolation_2d_gpu.h b/nvblox/tests/include/nvblox/tests/interpolation_2d_gpu.h new file mode 100644 index 00000000..7a032c2b --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/interpolation_2d_gpu.h @@ -0,0 +1,35 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/image.h" +#include "nvblox/core/types.h" + +namespace nvblox { +namespace test_utils { + +void linearInterpolateImageGpu(const DepthImage& image, + const std::vector<Vector2f>& u_px_vec, + std::vector<float>* values_ptr, + std::vector<int>* success_flags_ptr); + +void linearInterpolateImageGpu(const ColorImage& image, + const std::vector<Vector2f>& u_px_vec, + std::vector<Color>* values_ptr, + std::vector<int>* success_flags_ptr); + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/include/nvblox/tests/projective_tsdf_integrator_cpu.h b/nvblox/tests/include/nvblox/tests/projective_tsdf_integrator_cpu.h new file mode 100644 index 00000000..0cf54e89 --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/projective_tsdf_integrator_cpu.h @@ -0,0 +1,37 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/core/common_names.h" +#include "nvblox/core/image.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" +#include "nvblox/integrators/projective_tsdf_integrator.h" + +namespace nvblox { + +class ProjectiveTsdfIntegratorCPU : public ProjectiveTsdfIntegrator { + public: + ProjectiveTsdfIntegratorCPU() : ProjectiveTsdfIntegrator() {} + virtual ~ProjectiveTsdfIntegratorCPU() {} + + protected: + // We override the GPU version of this function and run it on the CPU.
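+ // (Presumably this exists so the tests can compare the GPU integrator's
+ // output against a simple serial reference implementation.)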
+ void updateBlocks(const std::vector<Index3D>& block_indices, + const DepthImage& depth_frame, const Transform& T_L_C, + const Camera& camera, const float truncation_distance_m, + TsdfLayer* layer) override; +}; + +} // namespace nvblox diff --git a/nvblox/tests/include/nvblox/tests/projective_tsdf_integrator_cuda_components.h b/nvblox/tests/include/nvblox/tests/projective_tsdf_integrator_cuda_components.h new file mode 100644 index 00000000..5c04ae01 --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/projective_tsdf_integrator_cuda_components.h @@ -0,0 +1,50 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/blox.h" +#include "nvblox/core/camera.h" +#include "nvblox/core/image.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" +#include "nvblox/core/voxels.h" + +namespace nvblox { +namespace test_utils { + +// An Nx2 matrix representing the results of projecting all voxels in a block +// into the image plane +using BlockProjectionResult = + Eigen::Matrix<float, VoxelBlock<TsdfVoxel>::kVoxelsPerSide * + VoxelBlock<TsdfVoxel>::kVoxelsPerSide * + VoxelBlock<TsdfVoxel>::kVoxelsPerSide, + 2>; + +Eigen::Matrix3Xf transformPointsOnGPU(const Transform& T_B_A, + const Eigen::Matrix3Xf& vecs_A); + +std::vector<BlockProjectionResult> projectBlocksOnGPU( + const std::vector<Index3D>& block_indices, const Camera& camera, + const Transform& T_C_L, TsdfLayer* distance_layer_ptr); + +Eigen::VectorXf interpolatePointsOnGPU(const DepthImage& depth_frame, + const Eigen::MatrixX2f& u_px_vec); + +void setVoxelBlockOnGPU(TsdfLayer* layer); + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/include/nvblox/tests/test_utils_cuda.h b/nvblox/tests/include/nvblox/tests/test_utils_cuda.h new file mode 100644 index 00000000..f362bd8c --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/test_utils_cuda.h @@ -0,0 +1,39 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ +#include "nvblox/core/blox.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/types.h" +#include "nvblox/core/unified_vector.h" + +namespace nvblox { +namespace test_utils { + +void fillVectorWithConstant(float value, unified_vector* vec_ptr); +void fillVectorWithConstant(int value, unified_vector* vec_ptr); + +void fillWithConstant(float value, size_t num_elems, float* vec_ptr); +void fillWithConstant(int value, size_t num_elems, int* vec_ptr); + +bool checkVectorAllConstant(const unified_vector& vec_ptr, float value); +bool checkVectorAllConstant(const unified_vector& vec_ptr, int value); + +bool checkAllConstant(const float* vec_ptr, float value, size_t num_elems); +bool checkAllConstant(const int* vec_ptr, int value, size_t num_elems); + +void addOneToAllGPU(unified_vector* vec_ptr); + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/include/nvblox/tests/utils.h b/nvblox/tests/include/nvblox/tests/utils.h new file mode 100644 index 00000000..2f58ba14 --- /dev/null +++ b/nvblox/tests/include/nvblox/tests/utils.h @@ -0,0 +1,44 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#pragma once + +#include "nvblox/core/blox.h" +#include "nvblox/core/color.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" +#include "nvblox/core/voxels.h" + +namespace nvblox { +namespace test_utils { + +float randomFloatInRange(float f_min, float f_max); + +float randomIntInRange(int i_min, int i_max); + +float randomSign(); + +Index3D getRandomIndex3dInRange(const int min, const int max); + +Vector3f getRandomVector3fInRange(const float min, const float max); + +Vector3f getRandomVector3fInRange(const Vector3f& min, const Vector3f& max); + +Vector3f getRandomUnitVector3f(); + +Color randomColor(); + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/lib/cuda/blox.cu b/nvblox/tests/lib/cuda/blox.cu new file mode 100644 index 00000000..2bf085bf --- /dev/null +++ b/nvblox/tests/lib/cuda/blox.cu @@ -0,0 +1,29 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include "nvblox/tests/blox.h" + +#include "nvblox/core/layer.h" +#include "nvblox/gpu_hash/cuda/impl/gpu_layer_view_impl.cuh" + +namespace nvblox { + +using IndexBlockLayer = BlockLayer; +template class GPULayerView; + +using DummyBlockLayer = BlockLayer; +template class GPULayerView; + +} // nvblox diff --git a/nvblox/tests/lib/cuda/blox_utils.cu b/nvblox/tests/lib/cuda/blox_utils.cu new file mode 100644 index 00000000..ea537074 --- /dev/null +++ b/nvblox/tests/lib/cuda/blox_utils.cu @@ -0,0 +1,140 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/tests/blox_utils.h" + +namespace nvblox { + +// The requirement for this second definition of this variable here is a +// language issue with c++14. Can be removed in c++17. +constexpr uint8_t TestBlockNoAllocation::kCPUInitializationValue; + +namespace test_utils { + +__global__ void setTsdfBlockVoxelsInSequenceKernel(TsdfBlock* block) { + const int lin_idx = + threadIdx.x + TsdfBlock::kVoxelsPerSide * + (threadIdx.y + TsdfBlock::kVoxelsPerSide * threadIdx.z); + TsdfVoxel* voxel_ptr = &block->voxels[threadIdx.z][threadIdx.y][threadIdx.x]; + voxel_ptr->distance = static_cast(lin_idx); + voxel_ptr->weight = static_cast(lin_idx); +} + +void setTsdfBlockVoxelsInSequence(TsdfBlock::Ptr block) { + CHECK(block.memory_type() != MemoryType::kHost); + constexpr int kNumBlocks = 1; + constexpr int kVoxelsPerSide = VoxelBlock::kVoxelsPerSide; + const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide); + setTsdfBlockVoxelsInSequenceKernel<<>>( + block.get()); +} + +__global__ void checkBlockAllConstant(const TsdfBlock* block, + const TsdfVoxel* voxel_constant, + bool* flag) { + if ((blockIdx.x == 0) && (threadIdx.x == 0)) { + *flag = true; + } + __syncthreads(); + const TsdfVoxel voxel = block->voxels[threadIdx.z][threadIdx.y][threadIdx.x]; + const float distance_diff = + std::abs(voxel.distance - voxel_constant->distance); + const float weight_diff = std::abs(voxel.weight - voxel_constant->weight); + constexpr float eps = 1e-4; + if ((distance_diff > eps) || (weight_diff > eps)) { + *flag = false; + } +} + +__global__ void checkBlockAllConstant(const TestBlock* block, + const TestVoxel* voxel_constant, + bool* flag) { + if ((blockIdx.x == 0) && (threadIdx.x == 0)) { + *flag = true; + } + __syncthreads(); + const TestVoxel voxel = block->voxels[threadIdx.z][threadIdx.y][threadIdx.x]; + if (voxel.data != voxel_constant->data) { + *flag = false; + } +} + +__global__ void checkBlockAllConstant(const ColorBlock* block, + const ColorVoxel* voxel_constant, + bool* flag) { + if ((blockIdx.x == 0) && (threadIdx.x == 0)) { + *flag = true; + } + __syncthreads(); + const ColorVoxel voxel = block->voxels[threadIdx.z][threadIdx.y][threadIdx.x]; + if ((voxel.color.r != voxel_constant->color.r) || + (voxel.color.g != voxel_constant->color.g) || + (voxel.color.b != voxel_constant->color.b)) { + *flag = false; + } + const float weight_diff = std::abs(voxel.weight - voxel_constant->weight); + constexpr float eps 
= 1e-4; + if (weight_diff > eps) { + *flag = false; + } +} + +template <typename VoxelType> +bool checkBlockAllConstantTemplate( + const typename VoxelBlock<VoxelType>::Ptr block, VoxelType voxel_cpu) { + // Allocate memory for the flag + bool* flag_device_ptr; + checkCudaErrors(cudaMalloc(&flag_device_ptr, sizeof(bool))); + + // Transfer the CPU voxel to GPU + VoxelType* voxel_device_ptr; + checkCudaErrors(cudaMalloc(&voxel_device_ptr, sizeof(VoxelType))); + checkCudaErrors(cudaMemcpy(voxel_device_ptr, &voxel_cpu, sizeof(VoxelType), + cudaMemcpyHostToDevice)); + + constexpr int kNumBlocks = 1; + constexpr int kVoxelsPerSide = VoxelBlock<VoxelType>::kVoxelsPerSide; + const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide); + checkBlockAllConstant<<<kNumBlocks, kThreadsPerBlock>>>( + block.get(), voxel_device_ptr, flag_device_ptr); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaPeekAtLastError()); + + // Copy the flag back + bool flag; + checkCudaErrors( + cudaMemcpy(&flag, flag_device_ptr, sizeof(bool), cudaMemcpyDeviceToHost)); + + // Free the flag + checkCudaErrors(cudaFree(flag_device_ptr)); + checkCudaErrors(cudaFree(voxel_device_ptr)); + + return flag; +} + +bool checkBlockAllConstant(const TsdfBlock::Ptr block, TsdfVoxel voxel_cpu) { + return checkBlockAllConstantTemplate(block, voxel_cpu); +} + +bool checkBlockAllConstant(const TestBlock::Ptr block, TestVoxel voxel_cpu) { + return checkBlockAllConstantTemplate(block, voxel_cpu); +} + +bool checkBlockAllConstant(const ColorBlock::Ptr block, ColorVoxel voxel_cpu) { + return checkBlockAllConstantTemplate(block, voxel_cpu); +} + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/lib/cuda/gpu_image_routines.cu b/nvblox/tests/lib/cuda/gpu_image_routines.cu new file mode 100644 index 00000000..8325b83e --- /dev/null +++ b/nvblox/tests/lib/cuda/gpu_image_routines.cu @@ -0,0 +1,126 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/tests/gpu_image_routines.h" + +#include "nvblox/core/cuda/error_check.cuh" +#include "nvblox/core/image.h" + +namespace nvblox { +namespace test_utils { + +template <typename ElementType> +__global__ void setImageConstantKernel(ElementType* image, const int rows, + const int cols, ElementType value) { + // NOTE(alexmillane): Memory access is fully coalesced because neighbouring + // threads in the grid x dimension access neighbouring memory elements + // (row-major images). + const int row_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int col_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (col_idx < cols && row_idx < rows) { + image::access(row_idx, col_idx, cols, image) = value; + } +} + +template <typename ElementType> +void setImageConstantOnGpuTemplate(const ElementType value, + Image<ElementType>* image_ptr) { + // Set the pixels to a constant value.
One thread per pixel. + constexpr int kThreadsPerBlockInEachDimension = 8; + dim3 blockShape(kThreadsPerBlockInEachDimension, + kThreadsPerBlockInEachDimension); + dim3 gridShape((image_ptr->rows() / kThreadsPerBlockInEachDimension) + 1, + (image_ptr->cols() / kThreadsPerBlockInEachDimension) + 1); + setImageConstantKernel<<<gridShape, blockShape>>>( + image_ptr->dataPtr(), image_ptr->rows(), image_ptr->cols(), value); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); +} + +void setImageConstantOnGpu(const float value, DepthImage* image_ptr) { + setImageConstantOnGpuTemplate(value, image_ptr); +} + +void setImageConstantOnGpu(const Color value, ColorImage* image_ptr) { + setImageConstantOnGpuTemplate(value, image_ptr); +} + +__device__ Color diff(const Color& color_1, const Color& color_2) { + return Color(static_cast<uint8_t>(std::abs(static_cast<int>(color_1.r) - + static_cast<int>(color_2.r))), + static_cast<uint8_t>(std::abs(static_cast<int>(color_1.g) - + static_cast<int>(color_2.g))), + static_cast<uint8_t>(std::abs(static_cast<int>(color_1.b) - + static_cast<int>(color_2.b)))); +} + +__device__ float diff(const float& depth_1, const float& depth_2) { + return fabsf(depth_1 - depth_2); +} + +template <typename ElementType> +__global__ void differenceImageKernel(ElementType* diff_image_ptr, + const int rows, const int cols, + const ElementType* image_1, + const ElementType* image_2) { + // NOTE(alexmillane): Memory access is fully coalesced because neighbouring + // threads in the grid x dimension access neighbouring memory elements + // (row-major images). + const int row_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int col_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (col_idx < cols && row_idx < rows) { + const ElementType color_1 = image::access(row_idx, col_idx, cols, image_1); + const ElementType color_2 = image::access(row_idx, col_idx, cols, image_2); + const ElementType abs_color_diff = diff(color_1, color_2); + image::access(row_idx, col_idx, cols, diff_image_ptr) = abs_color_diff; + } +} + +template <typename ImageType> +void getDifferenceImageTemplate(const ImageType& image_1, + const ImageType& image_2, + ImageType* diff_image_ptr) { + CHECK_EQ(image_1.rows(), image_2.rows()); + CHECK_EQ(image_1.cols(), image_2.cols()); + CHECK(image_1.memory_type() == image_2.memory_type()); + CHECK_NOTNULL(diff_image_ptr); + // Create the image for output + *diff_image_ptr = + ImageType(image_1.rows(), image_1.cols(), image_1.memory_type()); + // Compute the difference image.
One thread per pixel. + constexpr int kThreadsPerBlockInEachDimension = 8; + dim3 blockShape(kThreadsPerBlockInEachDimension, + kThreadsPerBlockInEachDimension); + dim3 gridShape((image_1.rows() / kThreadsPerBlockInEachDimension) + 1, + (image_1.cols() / kThreadsPerBlockInEachDimension) + 1); + differenceImageKernel<<<gridShape, blockShape>>>( + diff_image_ptr->dataPtr(), image_1.rows(), image_1.cols(), + image_1.dataConstPtr(), image_2.dataConstPtr()); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); +} + +void getDifferenceImageOnGPU(const ColorImage& image_1, const ColorImage& image_2, + ColorImage* diff_image_ptr) { + getDifferenceImageTemplate(image_1, image_2, diff_image_ptr); +} + +void getDifferenceImageOnGPU(const DepthImage& image_1, const DepthImage& image_2, + DepthImage* diff_image_ptr) { + getDifferenceImageTemplate(image_1, image_2, diff_image_ptr); +} + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/lib/cuda/gpu_indexing.cu b/nvblox/tests/lib/cuda/gpu_indexing.cu new file mode 100644 index 00000000..d3225c17 --- /dev/null +++ b/nvblox/tests/lib/cuda/gpu_indexing.cu @@ -0,0 +1,84 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/tests/gpu_indexing.h" + +#include <vector> + +#include "nvblox/core/indexing.h" +#include "nvblox/core/unified_vector.h" + +namespace nvblox { +namespace test_utils { + +__global__ void getBlockAndVoxelIndexFromPositionInLayerKernel( + const Vector3f position, const float block_size, Index3D* block_idx, + Index3D* voxel_idx) { + getBlockAndVoxelIndexFromPositionInLayer(block_size, position, block_idx, + voxel_idx); +} + +__global__ void getBlockAndVoxelIndexFromPositionInLayerKernel( + const Vector3f* positions, const float block_size, const int num_positions, + Index3D* block_indices, Index3D* voxel_indices) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_positions) { + getBlockAndVoxelIndexFromPositionInLayer( + block_size, positions[idx], &block_indices[idx], &voxel_indices[idx]); + } +} + +void getBlockAndVoxelIndexFromPositionInLayerOnGPU(const float block_size, + const Vector3f& position, + Index3D* block_idx, + Index3D* voxel_idx) { + Index3D* block_idx_device; + Index3D* voxel_idx_device; + checkCudaErrors(cudaMalloc(&block_idx_device, sizeof(Index3D))); + checkCudaErrors(cudaMalloc(&voxel_idx_device, sizeof(Index3D))); + + getBlockAndVoxelIndexFromPositionInLayerKernel<<<1, 1>>>( + position, block_size, block_idx_device, voxel_idx_device); + + checkCudaErrors(cudaMemcpy(block_idx, block_idx_device, sizeof(Index3D), + cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(voxel_idx, voxel_idx_device, sizeof(Index3D), + cudaMemcpyDeviceToHost)); + + checkCudaErrors(cudaFree(block_idx_device)); + checkCudaErrors(cudaFree(voxel_idx_device)); +} + +void getBlockAndVoxelIndexFromPositionInLayerOnGPU( + const float block_size, const std::vector<Vector3f>& positions, + std::vector<Index3D>* block_indices, std::vector<Index3D>* voxel_indices) { + device_vector<Vector3f>
positions_device(positions); + + device_vector<Index3D> block_indices_device(positions.size()); + device_vector<Index3D> voxel_indices_device(positions.size()); + + constexpr int kNumThreads = 1024; + const int kNumBlocks = positions.size() / kNumThreads + 1; + + getBlockAndVoxelIndexFromPositionInLayerKernel<<<kNumBlocks, kNumThreads>>>( + positions_device.data(), block_size, positions_device.size(), + block_indices_device.data(), voxel_indices_device.data()); + + *block_indices = block_indices_device.toVector(); + *voxel_indices = voxel_indices_device.toVector(); +} + +} // namespace test_utils +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/tests/lib/cuda/gpu_layer_utils.cu b/nvblox/tests/lib/cuda/gpu_layer_utils.cu new file mode 100644 index 00000000..37648b80 --- /dev/null +++ b/nvblox/tests/lib/cuda/gpu_layer_utils.cu @@ -0,0 +1,108 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/tests/gpu_layer_utils.h" + +#include <thrust/device_vector.h> + +#include "nvblox/gpu_hash/cuda/gpu_hash_interface.cuh" +#include "nvblox/gpu_hash/cuda/gpu_indexing.cuh" + +namespace nvblox { +namespace test_utils { + +__global__ void getContainsFlagsKernel( + Index3DDeviceHashMapType<TsdfBlock> block_hash, Index3D* indices, + int num_indices, bool* flags) { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < num_indices) { + flags[idx] = block_hash.contains(indices[idx]); + } +} + +std::vector<bool> getContainsFlags( + const GPULayerView<TsdfBlock>& gpu_layer, + const std::vector<Index3D>& indices) { + // CPU -> GPU + thrust::device_vector<Index3D> device_indices(indices); + + // Output space + thrust::device_vector<bool> device_flags(device_indices.size()); + + // Kernel + constexpr int kNumThreadsPerBlock = 32; + const int num_blocks = device_indices.size() / kNumThreadsPerBlock + 1; + getContainsFlagsKernel<<<num_blocks, kNumThreadsPerBlock>>>( + gpu_layer.getHash().impl_, device_indices.data().get(), + device_indices.size(), device_flags.data().get()); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaPeekAtLastError()); + + // GPU -> CPU + std::vector<bool> host_flags(device_flags.size()); + thrust::copy(device_flags.begin(), device_flags.end(), host_flags.begin()); + + return host_flags; +} + +__global__ void getVoxelsAtPositionsKernel( + Index3DDeviceHashMapType<TsdfBlock> block_hash, const Vector3f* p_L_vec, + float block_size, int num_points, TsdfVoxel* voxels, bool* flags) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < num_points) { + const Vector3f p_L = p_L_vec[idx]; + TsdfVoxel* voxel_ptr; + const bool flag = + getVoxelAtPosition(block_hash, p_L, block_size, &voxel_ptr); + flags[idx] = flag; + if (flag) { + voxels[idx] = *voxel_ptr; + } + } +} + +std::pair<std::vector<TsdfVoxel>, std::vector<bool>> getVoxelsAtPositionsOnGPU( + const GPULayerView<TsdfBlock>& gpu_layer, + const std::vector<Vector3f>& p_L_vec) { + // CPU -> GPU + thrust::device_vector<Vector3f> device_positions(p_L_vec); + + // Output space + thrust::device_vector<TsdfVoxel> device_voxels(device_positions.size()); + thrust::device_vector<bool> device_flags(device_positions.size()); + + // Kernel + constexpr int kNumThreadsPerBlock = 32; + const int num_blocks =
device_positions.size() / kNumThreadsPerBlock + 1; + getVoxelsAtPositionsKernel<<<num_blocks, kNumThreadsPerBlock>>>( + gpu_layer.getHash().impl_, device_positions.data().get(), + gpu_layer.block_size(), device_positions.size(), + device_voxels.data().get(), device_flags.data().get()); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaPeekAtLastError()); + + // GPU -> CPU + std::vector<TsdfVoxel> host_voxels(device_voxels.size()); + thrust::copy(device_voxels.begin(), device_voxels.end(), host_voxels.begin()); + std::vector<bool> host_flags(device_flags.size()); + thrust::copy(device_flags.begin(), device_flags.end(), host_flags.begin()); + checkCudaErrors(cudaDeviceSynchronize()); + checkCudaErrors(cudaPeekAtLastError()); + + return {std::move(host_voxels), std::move(host_flags)}; +} + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/lib/cuda/increment_kernel.cu b/nvblox/tests/lib/cuda/increment_kernel.cu new file mode 100644 index 00000000..c8b86bb2 --- /dev/null +++ b/nvblox/tests/lib/cuda/increment_kernel.cu @@ -0,0 +1,48 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/tests/increment_on_gpu.h" + +__global__ void incrementKernel(int* number) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + (*number)++; + } +} + +__global__ void incrementKernel(int* number, const int num_elements) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < num_elements) { + number[idx]++; + } +} + + +namespace nvblox { +namespace test_utils { + +void incrementOnGPU(int* number) { + incrementKernel<<<1, 1>>>(number); + cudaDeviceSynchronize(); +} + +void incrementOnGPU(const int num_elements, int* number) { + constexpr int kThreadsPerBlock = 32; + const int num_blocks = (num_elements / kThreadsPerBlock) + 1; + incrementKernel<<<num_blocks, kThreadsPerBlock>>>(number, num_elements); + cudaDeviceSynchronize(); +} + +} // namespace test_utils +} // namespace nvblox \ No newline at end of file diff --git a/nvblox/tests/lib/cuda/interpolation_2d_gpu.cu b/nvblox/tests/lib/cuda/interpolation_2d_gpu.cu new file mode 100644 index 00000000..6ee6d960 --- /dev/null +++ b/nvblox/tests/lib/cuda/interpolation_2d_gpu.cu @@ -0,0 +1,100 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ +#include "nvblox/tests/interpolation_2d_gpu.h" + +#include "nvblox/core/interpolation_2d.h" + +namespace nvblox { +namespace test_utils { + +template +__global__ void interpolate(const ElementType* image_unified_ptr, int rows, + int cols, const Vector2f* u_px_vec, + ElementType* values_ptr, int* success_flags, + int num_points) { + const int lin_idx = threadIdx.x + blockIdx.x * blockDim.x; + if (lin_idx < num_points) { + ElementType value; + if (interpolation::interpolate2DLinear(image_unified_ptr, u_px_vec[lin_idx], + rows, cols, &value)) { + success_flags[lin_idx] = 1; + values_ptr[lin_idx] = value; + } else { + success_flags[lin_idx] = 0; + } + } +} + +template +void linearInterpolateImageGpuTemplate(const Image& image, + const std::vector& u_px_vec, + std::vector* values_ptr, + std::vector* success_flags_ptr) { + CHECK_NOTNULL(values_ptr); + CHECK_NOTNULL(success_flags_ptr); + CHECK_EQ(values_ptr->size(), u_px_vec.size()); + CHECK_EQ(success_flags_ptr->size(), u_px_vec.size()); + // Move the data to the GPU + const int num_points = u_px_vec.size(); + Vector2f* u_px_vec_device_ptr; + ElementType* values_device_ptr; + int* success_flags_device_ptr; + checkCudaErrors( + cudaMalloc(&u_px_vec_device_ptr, num_points * sizeof(Eigen::Vector2f))); + checkCudaErrors( + cudaMalloc(&values_device_ptr, num_points * sizeof(ElementType))); + checkCudaErrors( + cudaMalloc(&success_flags_device_ptr, num_points * sizeof(int))); + checkCudaErrors(cudaMemcpy(u_px_vec_device_ptr, u_px_vec.data(), + num_points * sizeof(Eigen::Vector2f), + cudaMemcpyHostToDevice)); + + // Interpolate on the GPUUUU + constexpr int kThreadsPerBlock = 32; + const int num_blocks = static_cast(num_points / kThreadsPerBlock) + 1; + interpolate<<>>( + image.dataConstPtr(), image.rows(), image.cols(), u_px_vec_device_ptr, + values_device_ptr, success_flags_device_ptr, num_points); + checkCudaErrors(cudaPeekAtLastError()); + // Get results back + checkCudaErrors(cudaMemcpy(values_ptr->data(), values_device_ptr, + num_points * sizeof(ElementType), + cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(success_flags_ptr->data(), + success_flags_device_ptr, num_points * sizeof(int), + cudaMemcpyDeviceToHost)); + cudaFree(values_device_ptr); + cudaFree(success_flags_device_ptr); +} + +void linearInterpolateImageGpu(const DepthImage& image, + const std::vector& u_px_vec, + std::vector* values_ptr, + std::vector* success_flags_ptr) { + linearInterpolateImageGpuTemplate(image, u_px_vec, values_ptr, + success_flags_ptr); +} + +void linearInterpolateImageGpu(const ColorImage& image, + const std::vector& u_px_vec, + std::vector* values_ptr, + std::vector* success_flags_ptr) { + linearInterpolateImageGpuTemplate(image, u_px_vec, values_ptr, + success_flags_ptr); +} + +} // namespace test_utils +} // namespace nvblox diff --git a/nvblox/tests/lib/cuda/projective_tsdf_integrator_cuda_components.cu b/nvblox/tests/lib/cuda/projective_tsdf_integrator_cuda_components.cu new file mode 100644 index 00000000..a293cc53 --- /dev/null +++ b/nvblox/tests/lib/cuda/projective_tsdf_integrator_cuda_components.cu @@ -0,0 +1,319 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "nvblox/tests/projective_tsdf_integrator_cuda_components.h" + +#include "nvblox/core/blox.h" +#include "nvblox/core/cuda/error_check.cuh" +#include "nvblox/core/interpolation_2d.h" + +namespace nvblox { +namespace test_utils { + +__global__ void transformPointsOnGPU(const Eigen::Matrix3f* R_B_A_matrix_ptr, + const Eigen::Vector3f* t_B_A_matrix_ptr, + const float* vecs_A_ptr, + const int num_vecs, float* vecs_B_ptr) { + // We first load the transform into shared memory for use by all threads in + // the block. The rotation matrix and translation vector have 9 + 3 = 12 + // elements, so the first 12 threads of each block perform the load. + __shared__ Eigen::Matrix3f R_B_A; + if (threadIdx.x < 9) { + R_B_A.data()[threadIdx.x] = R_B_A_matrix_ptr->data()[threadIdx.x]; + } + __shared__ Eigen::Vector3f t_B_A; + if (threadIdx.x >= 9 && threadIdx.x < 12) { + t_B_A.data()[threadIdx.x - 9] = t_B_A_matrix_ptr->data()[threadIdx.x - 9]; + } + __syncthreads(); + + // Now perform transformation of the vectors. + const int vec_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (vec_idx < num_vecs) { + // Mapping the vecs + const Eigen::Map<const Eigen::Matrix3Xf> vecs_A(vecs_A_ptr, 3, num_vecs); + Eigen::Map<Eigen::Matrix3Xf> vecs_B(vecs_B_ptr, 3, num_vecs); + // Transformation + vecs_B.col(vec_idx) = R_B_A * vecs_A.col(vec_idx) + t_B_A; + } +} + +__global__ void projectBlocksToCamera( + const Index3D* block_indices_device_ptr, const Camera* camera_device_ptr, + const Eigen::Matrix3f* R_C_L_device_ptr, + const Eigen::Vector3f* t_C_L_device_ptr, const float block_size, + BlockProjectionResult* block_projection_results_device_ptr) { + // Linear index of thread within block + const int thread_index_linear = + threadIdx.z + blockDim.z * (threadIdx.y + (blockDim.y * threadIdx.x)); + + // Get data needed by all threads into shared memory + __shared__ Eigen::Matrix3f R_C_L; + if (thread_index_linear < 9) { + R_C_L.data()[thread_index_linear] = + R_C_L_device_ptr->data()[thread_index_linear]; + } + __shared__ Eigen::Vector3f t_C_L; + if (thread_index_linear >= 9 && thread_index_linear < 12) { + t_C_L.data()[thread_index_linear - 9] = + t_C_L_device_ptr->data()[thread_index_linear - 9]; + } + __syncthreads(); + + // The indices of the voxel this thread will work on + // blockIdx.x - The index of the block we're working on (blockIdx.y/z + // should be zero) + // threadIdx.x/y/z - The indices of the voxel within the block (we + // expect the threadBlockDims == voxelBlockDims) + const Index3D block_idx = block_indices_device_ptr[blockIdx.x]; + const Index3D voxel_idx(threadIdx.x, threadIdx.y, threadIdx.z); + + // Voxel center point + const Vector3f p_voxel_center_L = getCenterPostionFromBlockIndexAndVoxelIndex( + block_size, block_idx, voxel_idx); + // To camera frame + const Vector3f p_voxel_center_C = R_C_L * p_voxel_center_L + t_C_L; + // Project to image plane + Eigen::Vector2f u_px; + if (!camera_device_ptr->project(p_voxel_center_C, &u_px)) { + return; + } + + // Map outputs + BlockProjectionResult* result_ptr = + &(block_projection_results_device_ptr[blockIdx.x]); + result_ptr->row(thread_index_linear) = u_px; +} + +__global__ void interpolate(const float*
depth_frame, + const float* u_px_vec_device_ptr, const int rows, + const int cols, const int num_points, + float* interpolated_value_device_ptr) { + // Map the interpolation locations + const Eigen::Map<const Eigen::MatrixX2f> u_px_vec(u_px_vec_device_ptr, + num_points, 2); + // Interpolate one of the points + const int u_px_vec_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (u_px_vec_idx < num_points) { + interpolation::interpolate2DLinear( + depth_frame, u_px_vec.row(u_px_vec_idx), rows, cols, + &interpolated_value_device_ptr[u_px_vec_idx]); + } +} + +__global__ void setVoxelBlock(VoxelBlock<TsdfVoxel>** block_device_ptrs) { + // The VoxelBlock that this ThreadBlock is working on + VoxelBlock<TsdfVoxel>* block_ptr = block_device_ptrs[blockIdx.x]; + block_ptr->voxels[threadIdx.z][threadIdx.y][threadIdx.x].distance = + threadIdx.x + blockDim.x * (threadIdx.y + blockDim.y * threadIdx.z); +} + +void setVoxelBlockOnGPU(TsdfLayer* layer) { + // Get a list of blocks to be modified on the CPU + const std::vector<Index3D> block_indices = layer->getAllBlockIndices(); + std::vector<VoxelBlock<TsdfVoxel>*> block_ptrs; + block_ptrs.reserve(block_indices.size()); + for (const Index3D& block_index : block_indices) { + block_ptrs.push_back(layer->getBlockAtIndex(block_index).get()); + } + + // Move the list to the GPU + VoxelBlock<TsdfVoxel>** block_device_ptrs; + checkCudaErrors( + cudaMalloc(&block_device_ptrs, + block_ptrs.size() * sizeof(VoxelBlock<TsdfVoxel>*))); + checkCudaErrors( + cudaMemcpy(block_device_ptrs, block_ptrs.data(), + block_ptrs.size() * sizeof(VoxelBlock<TsdfVoxel>*), + cudaMemcpyHostToDevice)); + + // Kernel - one thread block per voxel block + constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide; + const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide); + const int num_blocks = block_indices.size(); + setVoxelBlock<<<num_blocks, kThreadsPerBlock>>>(block_device_ptrs); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); + + cudaFree(block_device_ptrs); +} + +Eigen::VectorXf interpolatePointsOnGPU(const DepthImage& depth_frame, + const Eigen::MatrixX2f& u_px_vec) { + // Transfer data to the GPU + float* depth_frame_device_ptr; + checkCudaErrors( + cudaMalloc(&depth_frame_device_ptr, depth_frame.numel() * sizeof(float))); + checkCudaErrors(cudaMemcpy(depth_frame_device_ptr, depth_frame.dataConstPtr(), + depth_frame.numel() * sizeof(float), + cudaMemcpyHostToDevice)); + float* u_px_vec_device_ptr; + checkCudaErrors( + cudaMalloc(&u_px_vec_device_ptr, u_px_vec.rows() * 2 * sizeof(float))); + checkCudaErrors(cudaMemcpy(u_px_vec_device_ptr, u_px_vec.data(), + u_px_vec.rows() * 2 * sizeof(float), + cudaMemcpyHostToDevice)); + + // Output location + float* interpolated_values_device_ptr; + checkCudaErrors(cudaMalloc(&interpolated_values_device_ptr, + u_px_vec.rows() * sizeof(float))); + + // Kernel - interpolation + const int num_points = u_px_vec.rows(); + constexpr int threadsPerBlock = 512; + const int blocksInGrid = (num_points / threadsPerBlock) + 1; + interpolate<<<blocksInGrid, threadsPerBlock>>>( + depth_frame_device_ptr, u_px_vec_device_ptr, depth_frame.rows(), depth_frame.cols(), + u_px_vec.rows(), interpolated_values_device_ptr); + checkCudaErrors(cudaGetLastError()); + checkCudaErrors(cudaDeviceSynchronize()); + + // Return the result + Eigen::VectorXf results(u_px_vec.rows()); + checkCudaErrors(cudaMemcpy(results.data(), interpolated_values_device_ptr, + u_px_vec.rows() * sizeof(float), + cudaMemcpyDeviceToHost)); + + cudaFree(depth_frame_device_ptr); + cudaFree(u_px_vec_device_ptr); + cudaFree(interpolated_values_device_ptr); + + return results; +} + +std::vector<BlockProjectionResult> projectBlocksOnGPU(
+
+std::vector<BlockProjectionResult> projectBlocksOnGPU(
+    const std::vector<Index3D>& block_indices, const Camera& camera,
+    const Transform& T_C_L, TsdfLayer* distance_layer_ptr) {
+  // Camera
+  Camera* camera_device_ptr;
+  checkCudaErrors(cudaMalloc(&camera_device_ptr, sizeof(Camera)));
+  checkCudaErrors(cudaMemcpy(camera_device_ptr, &camera, sizeof(Camera),
+                             cudaMemcpyHostToDevice));
+
+  // Transformation
+  // NOTE(alexmillane): For some reason I only got things to work by
+  // separating the Eigen::Affine3f into the rotation matrix and translation
+  // vector... I cannot explain why it didn't work, but I spent hours trying
+  // to get it working and couldn't.
+  const Eigen::Matrix3f R_C_L = T_C_L.rotation();
+  const Eigen::Vector3f t_C_L = T_C_L.translation();
+  Eigen::Matrix3f* R_C_L_device_ptr;
+  Eigen::Vector3f* t_C_L_device_ptr;
+  checkCudaErrors(cudaMalloc(&R_C_L_device_ptr, sizeof(Eigen::Matrix3f)));
+  checkCudaErrors(cudaMemcpy(R_C_L_device_ptr, R_C_L.data(),
+                             sizeof(Eigen::Matrix3f), cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMalloc(&t_C_L_device_ptr, sizeof(Eigen::Vector3f)));
+  checkCudaErrors(cudaMemcpy(t_C_L_device_ptr, t_C_L.data(),
+                             sizeof(Eigen::Vector3f), cudaMemcpyHostToDevice));
+
+  // Copy the block indices to the GPU for projection
+  Index3D* block_indices_device_ptr;
+  checkCudaErrors(cudaMalloc(&block_indices_device_ptr,
+                             block_indices.size() * sizeof(Index3D)));
+  checkCudaErrors(cudaMemcpy(block_indices_device_ptr, block_indices.data(),
+                             block_indices.size() * sizeof(Index3D),
+                             cudaMemcpyHostToDevice));
+
+  // Output space
+  BlockProjectionResult* block_projection_results_device_ptr;
+  checkCudaErrors(
+      cudaMalloc(&block_projection_results_device_ptr,
+                 block_indices.size() * sizeof(BlockProjectionResult)));
+
+  // TODO: CURRENTLY ASSUMES WE CAN LAUNCH AN UNLIMITED NUMBER OF THREAD BLOCKS
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  const dim3 kThreadsPerBlock(kVoxelsPerSide, kVoxelsPerSide, kVoxelsPerSide);
+  const int num_blocks = block_indices.size();
+
+  projectBlocksToCamera<<<num_blocks, kThreadsPerBlock>>>(
+      block_indices_device_ptr, camera_device_ptr, R_C_L_device_ptr,
+      t_C_L_device_ptr, distance_layer_ptr->block_size(),
+      block_projection_results_device_ptr);
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaDeviceSynchronize());
+
+  // Copy over results
+  std::vector<BlockProjectionResult> projection_results;
+  projection_results.resize(block_indices.size());
+  checkCudaErrors(
+      cudaMemcpy(projection_results.data(), block_projection_results_device_ptr,
+                 block_indices.size() * sizeof(BlockProjectionResult),
+                 cudaMemcpyDeviceToHost));
+
+  // Free
+  cudaFree(R_C_L_device_ptr);
+  cudaFree(t_C_L_device_ptr);
+  cudaFree(block_indices_device_ptr);
+  cudaFree(block_projection_results_device_ptr);
+
+  return projection_results;
+}
+
+Eigen::Matrix3Xf transformPointsOnGPU(const Transform& T_B_A,
+                                      const Eigen::Matrix3Xf& vecs_A) {
+  // Move inputs
+  float* vecs_A_device_ptr;
+  const int num_elements = vecs_A.rows() * vecs_A.cols();
+  checkCudaErrors(cudaMalloc(&vecs_A_device_ptr, num_elements * sizeof(float)));
+  checkCudaErrors(cudaMemcpy(vecs_A_device_ptr, vecs_A.data(),
+                             num_elements * sizeof(float),
+                             cudaMemcpyHostToDevice));
+  // Transformation
+  // NOTE(alexmillane): For some reason I only got things to work by
+  // separating the Eigen::Affine3f into the rotation matrix and translation
+  // vector... I cannot explain why it didn't work, but I spent hours trying
+  // to get it working and couldn't.
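+  // (A possible explanation, not verified here: Eigen::Affine3f stores a
+  // full 4x4 matrix with alignment requirements, so raw-copying it to the
+  // device is more fragile than the plain Matrix3f/Vector3f pair used below.
+  // This is an assumption, not a confirmed root cause.)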
+  const Eigen::Matrix3f R_B_A = T_B_A.rotation();
+  const Eigen::Vector3f t_B_A = T_B_A.translation();
+  Eigen::Matrix3f* R_B_A_device_ptr;
+  Eigen::Vector3f* t_B_A_device_ptr;
+  checkCudaErrors(cudaMalloc(&R_B_A_device_ptr, sizeof(Eigen::Matrix3f)));
+  checkCudaErrors(cudaMemcpy(R_B_A_device_ptr, R_B_A.data(),
+                             sizeof(Eigen::Matrix3f), cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMalloc(&t_B_A_device_ptr, sizeof(Eigen::Vector3f)));
+  checkCudaErrors(cudaMemcpy(t_B_A_device_ptr, t_B_A.data(),
+                             sizeof(Eigen::Vector3f), cudaMemcpyHostToDevice));
+
+  // Output space
+  float* vecs_B_device;
+  checkCudaErrors(cudaMalloc(&vecs_B_device, num_elements * sizeof(float)));
+
+  // Kernel
+  const int num_vecs = vecs_A.cols();
+  constexpr int threadsPerBlock = 512;
+  const int blocksInGrid = (num_vecs / threadsPerBlock) + 1;
+  transformPointsOnGPU<<<blocksInGrid, threadsPerBlock>>>(
+      R_B_A_device_ptr, t_B_A_device_ptr, vecs_A_device_ptr, num_vecs,
+      vecs_B_device);
+  checkCudaErrors(cudaGetLastError());
+  checkCudaErrors(cudaDeviceSynchronize());
+
+  // Retrieve output
+  Eigen::Matrix3Xf vecs_B(3, num_vecs);
+  checkCudaErrors(cudaMemcpy(vecs_B.data(), vecs_B_device,
+                             num_elements * sizeof(float),
+                             cudaMemcpyDeviceToHost));
+
+  cudaFree(vecs_A_device_ptr);
+  cudaFree(R_B_A_device_ptr);
+  cudaFree(t_B_A_device_ptr);
+  cudaFree(vecs_B_device);
+
+  return vecs_B;
+}
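+
+// Usage sketch (illustrative values only):
+//
+//   Eigen::Matrix3Xf points_A(3, 2);
+//   points_A << 1.0f, 0.0f,
+//               0.0f, 1.0f,
+//               0.0f, 0.0f;
+//   Transform T_B_A = Transform::Identity();
+//   T_B_A.translation() << 0.0f, 0.0f, 1.0f;
+//   const Eigen::Matrix3Xf points_B =
+//       test_utils::transformPointsOnGPU(T_B_A, points_A);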
+
+}  // namespace test_utils
+}  // namespace nvblox
diff --git a/nvblox/tests/lib/cuda/test_utils_cuda.cu b/nvblox/tests/lib/cuda/test_utils_cuda.cu
new file mode 100644
index 00000000..a9527c3d
--- /dev/null
+++ b/nvblox/tests/lib/cuda/test_utils_cuda.cu
@@ -0,0 +1,164 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/tests/test_utils_cuda.h"
+
+namespace nvblox {
+namespace test_utils {
+
+/* Fills a device vector with a constant value.
+ * Call with
+ * - GridDim: 1 block, 1D
+ * - BlockDim: 1D, size > num_elements (1 thread per element)
+ */
+template <typename T>
+__global__ void fillVectorWithConstant(int num_elements, T value, T* vec) {
+  if ((blockIdx.x == 0) && (threadIdx.x < num_elements)) {
+    vec[threadIdx.x] = value;
+  }
+}
+
+/* Checks that all elements of a device vector equal a constant value. The
+ * flag must be initialized to true before launch; threads that find a
+ * mismatch set it to false. (Initializing the flag inside the kernel would
+ * race across thread blocks: __syncthreads() only synchronizes within a
+ * single block.)
+ * Call with
+ * - GridDim: 1D
+ * - BlockDim: 1D (1 thread per element)
+ */
+template <typename T>
+__global__ void checkVectorAllConstant(int num_elements, T value, const T* vec,
+                                       bool* flag) {
+  const int thread_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (thread_idx < num_elements) {
+    if (vec[thread_idx] != value) {
+      *flag = false;
+    }
+  }
+}
+
+/* Adds one to every element of a device vector.
+ * Call with
+ * - GridDim: N blocks, 1D
+ * - BlockDim: 1D (1 thread per element)
+ */
+__global__ void addOneToAll(int num_elements, int* vec) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < num_elements) {
+    vec[idx] = vec[idx] + 1;
+  }
+}
+
+void fillVectorWithConstant(float value, unified_vector<float>* vec_ptr) {
+  constexpr int kMaxThreadBlockSize = 512;
+  CHECK_LT(vec_ptr->size(), kMaxThreadBlockSize);
+  // kernel
+  int num_thread_blocks = 1;
+  int num_threads = vec_ptr->size();
+  fillVectorWithConstant<<<num_thread_blocks, num_threads>>>(
+      vec_ptr->size(), value, vec_ptr->data());
+  checkCudaErrors(cudaDeviceSynchronize());
+  checkCudaErrors(cudaPeekAtLastError());
+}
+
+void fillVectorWithConstant(int value, unified_vector<int>* vec_ptr) {
+  constexpr int kMaxThreadBlockSize = 512;
+  CHECK_LT(vec_ptr->size(), kMaxThreadBlockSize);
+  // kernel
+  int num_thread_blocks = 1;
+  int num_threads = vec_ptr->size();
+  fillVectorWithConstant<<<num_thread_blocks, num_threads>>>(
+      vec_ptr->size(), value, vec_ptr->data());
+  checkCudaErrors(cudaDeviceSynchronize());
+  checkCudaErrors(cudaPeekAtLastError());
+}
+
+void fillWithConstant(float value, size_t num_elems, float* vec_ptr) {
+  constexpr int kMaxThreadBlockSize = 512;
+  CHECK_LT(num_elems, kMaxThreadBlockSize);
+  // kernel
+  int num_thread_blocks = 1;
+  int num_threads = num_elems;
+  fillVectorWithConstant<<<num_thread_blocks, num_threads>>>(
+      num_elems, value, vec_ptr);
+  checkCudaErrors(cudaDeviceSynchronize());
+  checkCudaErrors(cudaPeekAtLastError());
+}
+
+void fillWithConstant(int value, size_t num_elems, int* vec_ptr) {
+  constexpr int kMaxThreadBlockSize = 512;
+  CHECK_LT(num_elems, kMaxThreadBlockSize);
+  // kernel
+  int num_thread_blocks = 1;
+  int num_threads = num_elems;
+  fillVectorWithConstant<<<num_thread_blocks, num_threads>>>(
+      num_elems, value, vec_ptr);
+  checkCudaErrors(cudaDeviceSynchronize());
+  checkCudaErrors(cudaPeekAtLastError());
+}
+
+template <typename T>
+bool checkVectorAllConstantTemplate(const T* vec, T value, size_t size) {
+  // Allocate memory for the flag and initialize it to true on the host side
+  // (see the kernel comment above for why this is not done in the kernel).
+  bool* flag_device_ptr;
+  checkCudaErrors(cudaMalloc(&flag_device_ptr, sizeof(bool)));
+  const bool kInitialFlag = true;
+  checkCudaErrors(cudaMemcpy(flag_device_ptr, &kInitialFlag, sizeof(bool),
+                             cudaMemcpyHostToDevice));
+
+  // Kernel
+  constexpr int kNumThreads = 512;
+  const int num_thread_blocks = size / kNumThreads + 1;
+  checkVectorAllConstant<<<num_thread_blocks, kNumThreads>>>(
+      size, value, vec, flag_device_ptr);
+  checkCudaErrors(cudaDeviceSynchronize());
+  checkCudaErrors(cudaPeekAtLastError());
+
+  // Copy the flag back
+  bool flag;
+  checkCudaErrors(
+      cudaMemcpy(&flag, flag_device_ptr, sizeof(bool), cudaMemcpyDeviceToHost));
+
+  // Free the flag
+  checkCudaErrors(cudaFree(flag_device_ptr));
+
+  return flag;
+}
+
+bool checkVectorAllConstant(const unified_vector<float>& vec, float value) {
+  return checkVectorAllConstantTemplate(vec.data(), value, vec.size());
+}
+
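+// Usage sketch for the fill/check helpers (illustrative; assumes
+// unified_vector has a sizing constructor as used elsewhere in the tests):
+//
+//   unified_vector<float> vec(64);
+//   fillVectorWithConstant(1.0f, &vec);
+//   CHECK(checkVectorAllConstant(vec, 1.0f));
+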
+bool checkVectorAllConstant(const unified_vector<int>& vec, int value) {
+  return checkVectorAllConstantTemplate(vec.data(), value, vec.size());
+}
+
+bool checkAllConstant(const float* vec_ptr, float value, size_t num_elems) {
+  return checkVectorAllConstantTemplate(vec_ptr, value, num_elems);
+}
+
+bool checkAllConstant(const int* vec_ptr, int value, size_t num_elems) {
+  return checkVectorAllConstantTemplate(vec_ptr, value, num_elems);
+}
+
+void addOneToAllGPU(unified_vector<int>* vec_ptr) {
+  constexpr int kNumThreadsPerBlock = 512;
+  const int kNumBlocks = vec_ptr->size() / kNumThreadsPerBlock + 1;
+  addOneToAll<<<kNumBlocks, kNumThreadsPerBlock>>>(vec_ptr->size(),
+                                                   vec_ptr->data());
+  checkCudaErrors(cudaDeviceSynchronize());
+  checkCudaErrors(cudaPeekAtLastError());
+}
+
+}  // namespace test_utils
+}  // namespace nvblox
diff --git a/nvblox/tests/lib/projective_tsdf_integrator_cpu.cpp b/nvblox/tests/lib/projective_tsdf_integrator_cpu.cpp
new file mode 100644
index 00000000..3528e306
--- /dev/null
+++ b/nvblox/tests/lib/projective_tsdf_integrator_cpu.cpp
@@ -0,0 +1,87 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/tests/projective_tsdf_integrator_cpu.h"
+
+namespace nvblox {
+
+void ProjectiveTsdfIntegratorCPU::updateBlocks(
+    const std::vector<Index3D>& block_indices, const DepthImage& depth_frame,
+    const Transform& T_L_C, const Camera& camera,
+    const float truncation_distance_m, TsdfLayer* layer) {
+  CHECK(layer->memory_type() == MemoryType::kUnified)
+      << "For CPU-based interpolation, the layer must be CPU accessible "
+         "(i.e. MemoryType::kUnified).";
+
+  // Update each block requested using the depth map.
+  Transform T_C_L = T_L_C.inverse();
+  for (const Index3D& block_index : block_indices) {
+    VoxelBlock<TsdfVoxel>::Ptr block_ptr = layer->getBlockAtIndex(block_index);
+
+    Index3D voxel_index;
+    for (voxel_index.x() = 0; voxel_index.x() < TsdfBlock::kVoxelsPerSide;
+         voxel_index.x()++) {
+      for (voxel_index.y() = 0; voxel_index.y() < TsdfBlock::kVoxelsPerSide;
+           voxel_index.y()++) {
+        for (voxel_index.z() = 0; voxel_index.z() < TsdfBlock::kVoxelsPerSide;
+             voxel_index.z()++) {
+          Vector3f p_L = getCenterPostionFromBlockIndexAndVoxelIndex(
+              layer->block_size(), block_index, voxel_index);
+
+          // Convert the p_L to a p_C
+          Vector3f p_C = T_C_L * p_L;
+          Vector2f u_C;
+          if (!camera.project(p_C, &u_C)) {
+            // If the voxel isn't in the frame, move on to the next voxel.
+            continue;
+          }
+
+          float depth = -1.0;
+          if (!interpolation::interpolate2DLinear(depth_frame, u_C, &depth)) {
+            continue;
+          }
+
+          DCHECK_GE(depth, 0.0f);
+
+          TsdfVoxel& voxel =
+              block_ptr
+                  ->voxels[voxel_index.x()][voxel_index.y()][voxel_index.z()];
+
+          // TODO: double-check this distance.
+          const float voxel_distance = depth - p_C.z();
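+
+          // (Sign convention: depth is the measured surface depth along this
+          // pixel's ray and p_C.z() is the voxel's depth. Eg depth = 2.0m and
+          // p_C.z() = 1.5m gives +0.5m, a voxel in front of the surface,
+          // while p_C.z() = 2.5m gives -0.5m, a voxel behind it.)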
+
+          // If we're behind the negative truncation distance, just continue.
+          if (voxel_distance < -truncation_distance_m) {
+            continue;
+          }
+
+          const float measurement_weight = 1.0f;
+          const float fused_distance = (voxel_distance * measurement_weight +
+                                        voxel.distance * voxel.weight) /
+                                       (measurement_weight + voxel.weight);
+
+          voxel.distance =
+              fused_distance > 0.0f
+                  ? std::min(truncation_distance_m, fused_distance)
+                  : std::max(-truncation_distance_m, fused_distance);
+          voxel.weight =
+              std::min(measurement_weight + voxel.weight, max_weight_);
+        }
+      }
+    }
+  }
+}
+
+}  // namespace nvblox
diff --git a/nvblox/tests/lib/utils.cpp b/nvblox/tests/lib/utils.cpp
new file mode 100644
index 00000000..9e0b312e
--- /dev/null
+++ b/nvblox/tests/lib/utils.cpp
@@ -0,0 +1,70 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include "nvblox/tests/utils.h"
+
+#include <cstdlib>
+
+#include "nvblox/core/accessors.h"
+#include "nvblox/io/ply_writer.h"
+#include "nvblox/utils/timing.h"
+
+namespace nvblox {
+namespace test_utils {
+
+float randomFloatInRange(float f_min, float f_max) {
+  float f = static_cast<float>(std::rand()) / RAND_MAX;
+  return f_min + f * (f_max - f_min);
+}
+
+float randomIntInRange(int i_min, int i_max) {
+  return static_cast<int>(randomFloatInRange(i_min, i_max + 1));
+}
+
+float randomSign() {
+  constexpr int kHalfMaxRandMax = RAND_MAX / 2;
+  return (std::rand() < kHalfMaxRandMax) ? -1.0f : 1.0f;
+}
+
+Index3D getRandomIndex3dInRange(const int min, const int max) {
+  return Index3D(test_utils::randomIntInRange(min, max),
+                 test_utils::randomIntInRange(min, max),
+                 test_utils::randomIntInRange(min, max));
+}
+
+Vector3f getRandomVector3fInRange(const float min, const float max) {
+  return Vector3f(test_utils::randomFloatInRange(min, max),
+                  test_utils::randomFloatInRange(min, max),
+                  test_utils::randomFloatInRange(min, max));
+}
+
+Vector3f getRandomVector3fInRange(const Vector3f& min, const Vector3f& max) {
+  return Vector3f(test_utils::randomFloatInRange(min.x(), max.x()),
+                  test_utils::randomFloatInRange(min.y(), max.y()),
+                  test_utils::randomFloatInRange(min.z(), max.z()));
+}
+
+Vector3f getRandomUnitVector3f() {
+  return getRandomVector3fInRange(-1.0, 1.0).normalized();
+}
+
+Color randomColor() {
+  return Color(static_cast<uint8_t>(randomIntInRange(0, 255)),
+               static_cast<uint8_t>(randomIntInRange(0, 255)),
+               static_cast<uint8_t>(randomIntInRange(0, 255)));
+}
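+
+// Usage sketch (illustrative; the tests seed std::srand explicitly to stay
+// deterministic):
+//
+//   std::srand(0);
+//   const float f = test_utils::randomFloatInRange(-1.0f, 1.0f);
+//   const Index3D idx = test_utils::getRandomIndex3dInRange(-10, 10);
+//   const Vector3f v = test_utils::getRandomUnitVector3f();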
+
+}  // namespace test_utils
+}  // namespace nvblox
diff --git a/nvblox/tests/test_3d_interpolation.cpp b/nvblox/tests/test_3d_interpolation.cpp
new file mode 100644
index 00000000..f9f30592
--- /dev/null
+++ b/nvblox/tests/test_3d_interpolation.cpp
@@ -0,0 +1,292 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/interpolation_3d.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/primitives/primitives.h"
+#include "nvblox/primitives/scene.h"
+
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+constexpr float kFloatEpsilon = 1e-6;
+
+constexpr int kXAxisIndex = 0;
+constexpr int kYAxisIndex = 1;
+constexpr int kZAxisIndex = 2;
+
+template <typename VoxelType>
+using SetVoxelFunctionType = std::function<void(VoxelType&, float)>;
+
+template <typename VoxelType>
+void fillVoxelsWithIndices(VoxelBlock<VoxelType>* block_ptr,
+                           const int xyz_index,
+                           SetVoxelFunctionType<VoxelType> set_voxel,
+                           const int offset = 0) {
+  CHECK_GE(xyz_index, 0);
+  CHECK_LT(xyz_index, 3);
+  Index3D index;
+  for (index.x() = 0; index.x() < TsdfBlock::kVoxelsPerSide; index.x()++) {
+    for (index.y() = 0; index.y() < TsdfBlock::kVoxelsPerSide; index.y()++) {
+      for (index.z() = 0; index.z() < TsdfBlock::kVoxelsPerSide; index.z()++) {
+        set_voxel(block_ptr->voxels[index.x()][index.y()][index.z()],
+                  static_cast<float>(index[xyz_index] + offset));
+      }
+    }
+  }
+}
+
+void fillVoxelsWithIndices(TsdfBlock* block_ptr, const int xyz_index,
+                           const int offset = 0) {
+  auto setVoxelLambda = [](TsdfVoxel& voxel, float value) {
+    voxel.distance = value;
+    voxel.weight = 1.0f;
+  };
+  fillVoxelsWithIndices<TsdfVoxel>(block_ptr, xyz_index, setVoxelLambda,
+                                   offset);
+}
+
+void fillVoxelsWithIndices(EsdfBlock* block_ptr, const int xyz_index,
+                           const int offset = 0) {
+  auto setVoxelLambda = [](EsdfVoxel& voxel, float value) {
+    voxel.squared_distance_vox = std::pow(value, 2);
+    voxel.observed = true;
+  };
+  fillVoxelsWithIndices<EsdfVoxel>(block_ptr, xyz_index, setVoxelLambda,
+                                   offset);
+}
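+
+// (Eg with xyz_index == kYAxisIndex and offset == 0, the TSDF overload above
+// writes distance 2.0f and weight 1.0f into every voxel whose y-index is 2.)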
+
+TEST(InterpolatorTest, NeighboursTest) {
+  // Empty layer
+  constexpr float kVoxelSize = 1.0f;
+  TsdfLayer layer(kVoxelSize, MemoryType::kUnified);
+
+  // Allocate a block with its origin at the origin
+  TsdfBlock::Ptr block_ptr = layer.allocateBlockAtIndex(Index3D(0, 0, 0));
+
+  // Fill the block such that the distances are equal to the x-index.
+  fillVoxelsWithIndices(block_ptr.get(), kXAxisIndex);
+
+  // Dummy function which always says a voxel is valid
+  auto valid_lambda = [](const TsdfVoxel&) -> bool { return true; };
+
+  // Check the surrounding voxels
+  std::array<TsdfVoxel, 8> voxels;
+  Vector3f p_L = 0.0 * Vector3f::Ones();
+  EXPECT_FALSE(interpolation::internal::getSurroundingVoxels3D(
+      p_L, layer, valid_lambda, &voxels));
+  p_L = 0.1 * Vector3f::Ones();
+  EXPECT_FALSE(interpolation::internal::getSurroundingVoxels3D(
+      p_L, layer, valid_lambda, &voxels));
+  p_L = 0.6 * Vector3f::Ones();
+  EXPECT_TRUE(interpolation::internal::getSurroundingVoxels3D(
+      p_L, layer, valid_lambda, &voxels));
+  // Check that the surrounding voxels have x values {0,0,0,0,1,1,1,1}
+  for (int i = 0; i < 8; i++) {
+    if (i < 4) {
+      EXPECT_EQ(voxels[i].distance, 0);
+    } else {
+      EXPECT_EQ(voxels[i].distance, 1);
+    }
+  }
+  // Ask for neighbours outside the block bounds on the x-dimension
+  p_L = Vector3f(static_cast<float>(TsdfBlock::kVoxelsPerSide), 0.6f, 0.6f);
+  EXPECT_FALSE(interpolation::internal::getSurroundingVoxels3D(
+      p_L, layer, valid_lambda, &voxels));
+  // Add a new block
+  block_ptr = layer.allocateBlockAtIndex(Index3D(1, 0, 0));
+  fillVoxelsWithIndices(block_ptr.get(), kXAxisIndex);
+  // Now the test just outside the first block on the x-dimension should pass.
+  EXPECT_TRUE(interpolation::internal::getSurroundingVoxels3D(
+      p_L, layer, valid_lambda, &voxels));
+  // Check that the surrounding voxels have x values {7,7,7,7,0,0,0,0}
+  for (int i = 0; i < 8; i++) {
+    if (i < 4) {
+      EXPECT_EQ(voxels[i].distance, 7);
+    } else {
+      EXPECT_EQ(voxels[i].distance, 0);
+    }
+  }
+}
+
+TEST(InterpolatorTest, OffsetTest) {
+  // Empty layer
+  constexpr float kVoxelSize = 1.0f;
+  TsdfLayer layer(kVoxelSize, MemoryType::kUnified);
+
+  // Allocate a block with its origin at the origin
+  TsdfBlock::Ptr block_ptr = layer.allocateBlockAtIndex(Index3D(0, 0, 0));
+
+  // Dummy function which always says a voxel is valid
+  auto valid_lambda = [](const TsdfVoxel&) -> bool { return true; };
+
+  // Get the surrounding voxels (in this test we just look at the offset
+  // vector)
+  std::array<TsdfVoxel, 8> voxels;
+  Vector3f p_offset_L;
+  Vector3f p_L = 0.5 * Vector3f::Ones();
+  EXPECT_TRUE(interpolation::internal::getSurroundingVoxels3D(
+      p_L, layer, valid_lambda, &voxels, &p_offset_L));
+
+  // TODO(alexmillane): Let's make some eigen checks....
+  EXPECT_NEAR(p_offset_L.x(), 0.0f, kFloatEpsilon);
+  EXPECT_NEAR(p_offset_L.y(), 0.0f, kFloatEpsilon);
+  EXPECT_NEAR(p_offset_L.z(), 0.0f, kFloatEpsilon);
+
+  p_L = 1.0 * Vector3f::Ones();
+  EXPECT_TRUE(interpolation::internal::getSurroundingVoxels3D(
+      p_L, layer, valid_lambda, &voxels, &p_offset_L));
+  EXPECT_NEAR(p_offset_L.x(), 0.5f, kFloatEpsilon);
+  EXPECT_NEAR(p_offset_L.y(), 0.5f, kFloatEpsilon);
+  EXPECT_NEAR(p_offset_L.z(), 0.5f, kFloatEpsilon);
+}
+
+TEST(InterpolatorTest, InterpolationTest) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  // Create 3 layers where the distances are equal to the x, y, z indices
+  // respectively. The idea is that interpolating in these layers at point p_L
+  // gives you p_L back for all points within the block (minus half a voxel
+  // width, because indexing starts at the low-corner but interpolation is wrt
+  // the voxel center :)).
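+  // (Eg in the x-layer with kVoxelSize = 1.0, querying p_L = (2.3, y, z)
+  // should return 2.3 - 0.5 = 1.8; the EXPECT_NEAR checks below assert
+  // exactly this half-voxel shift.)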
+  constexpr float kVoxelSize = 1.0f;
+  constexpr float kTestBlockSize = TsdfBlock::kVoxelsPerSide * kVoxelSize;
+  // TSDF
+  TsdfLayer layer_x_tsdf(kVoxelSize, MemoryType::kUnified);
+  TsdfLayer layer_y_tsdf(kVoxelSize, MemoryType::kUnified);
+  TsdfLayer layer_z_tsdf(kVoxelSize, MemoryType::kUnified);
+  TsdfBlock::Ptr tsdf_block_x_ptr =
+      layer_x_tsdf.allocateBlockAtIndex(Index3D(0, 0, 0));
+  TsdfBlock::Ptr tsdf_block_y_ptr =
+      layer_y_tsdf.allocateBlockAtIndex(Index3D(0, 0, 0));
+  TsdfBlock::Ptr tsdf_block_z_ptr =
+      layer_z_tsdf.allocateBlockAtIndex(Index3D(0, 0, 0));
+  fillVoxelsWithIndices(tsdf_block_x_ptr.get(), kXAxisIndex);
+  fillVoxelsWithIndices(tsdf_block_y_ptr.get(), kYAxisIndex);
+  fillVoxelsWithIndices(tsdf_block_z_ptr.get(), kZAxisIndex);
+  // ESDF
+  EsdfLayer layer_x_esdf(kVoxelSize, MemoryType::kUnified);
+  EsdfLayer layer_y_esdf(kVoxelSize, MemoryType::kUnified);
+  EsdfLayer layer_z_esdf(kVoxelSize, MemoryType::kUnified);
+  EsdfBlock::Ptr esdf_block_x_ptr =
+      layer_x_esdf.allocateBlockAtIndex(Index3D(0, 0, 0));
+  EsdfBlock::Ptr esdf_block_y_ptr =
+      layer_y_esdf.allocateBlockAtIndex(Index3D(0, 0, 0));
+  EsdfBlock::Ptr esdf_block_z_ptr =
+      layer_z_esdf.allocateBlockAtIndex(Index3D(0, 0, 0));
+  fillVoxelsWithIndices(esdf_block_x_ptr.get(), kXAxisIndex);
+  fillVoxelsWithIndices(esdf_block_y_ptr.get(), kYAxisIndex);
+  fillVoxelsWithIndices(esdf_block_z_ptr.get(), kZAxisIndex);
+
+  // Testing random points inside the block
+  constexpr int kNumTests = 1000;
+  for (int i = 0; i < kNumTests; i++) {
+    // Random point
+    const float half_voxel_size = kVoxelSize / 2.0f;
+    const Vector3f p_L = test_utils::getRandomVector3fInRange(
+        half_voxel_size, kTestBlockSize - half_voxel_size);
+
+    // Interpolate
+    float interpolated_distance_x;
+    float interpolated_distance_y;
+    float interpolated_distance_z;
+    // TSDF
+    EXPECT_TRUE(interpolation::interpolateOnCPU(p_L, layer_x_tsdf,
+                                                &interpolated_distance_x));
+    EXPECT_TRUE(interpolation::interpolateOnCPU(p_L, layer_y_tsdf,
+                                                &interpolated_distance_y));
+    EXPECT_TRUE(interpolation::interpolateOnCPU(p_L, layer_z_tsdf,
+                                                &interpolated_distance_z));
+    // Check
+    EXPECT_NEAR(interpolated_distance_x, p_L.x() - 0.5f, kFloatEpsilon);
+    EXPECT_NEAR(interpolated_distance_y, p_L.y() - 0.5f, kFloatEpsilon);
+    EXPECT_NEAR(interpolated_distance_z, p_L.z() - 0.5f, kFloatEpsilon);
+    // ESDF
+    EXPECT_TRUE(interpolation::interpolateOnCPU(p_L, layer_x_esdf,
+                                                &interpolated_distance_x));
+    EXPECT_TRUE(interpolation::interpolateOnCPU(p_L, layer_y_esdf,
+                                                &interpolated_distance_y));
+    EXPECT_TRUE(interpolation::interpolateOnCPU(p_L, layer_z_esdf,
+                                                &interpolated_distance_z));
+    // Check
+    EXPECT_NEAR(interpolated_distance_x, p_L.x() - 0.5f, kFloatEpsilon);
+    EXPECT_NEAR(interpolated_distance_y, p_L.y() - 0.5f, kFloatEpsilon);
+    EXPECT_NEAR(interpolated_distance_z, p_L.z() - 0.5f, kFloatEpsilon);
+  }
+}
+
+TEST(InterpolatorTest, PrimitivesInterpolationTest) {
+  // Maximum distance to consider for scene generation.
+  constexpr float kMaxDist = 10.0;
+  constexpr float kMinWeight = 1.0;
+
+  // Scene is bounded to -5, -5, 0 to 5, 5, 5.
+  primitives::Scene scene;
+  scene.aabb() = AxisAlignedBoundingBox(Vector3f(-5.0f, -5.0f, 0.0f),
+                                        Vector3f(5.0f, 5.0f, 5.0f));
+  // Create a scene with a ground plane and a sphere.
+  scene.addGroundLevel(0.0f);
+  scene.addCeiling(5.0f);
+  scene.addPrimitive(
+      std::make_unique<primitives::Sphere>(Vector3f(0.0f, 0.0f, 2.0f), 2.0f));
+  // Add bounding planes at 5 meters. Basically makes it a sphere in a box.
+  scene.addPlaneBoundaries(-5.0f, 5.0f, -5.0f, 5.0f);
+
+  // Get the ground truth SDF for it.
+  constexpr float kVoxelSize_m = 0.2;
+  TsdfLayer gt_layer(kVoxelSize_m, MemoryType::kUnified);
+  constexpr float kTruncationDistanceMeters = 10;
+  scene.generateSdfFromScene(kTruncationDistanceMeters, &gt_layer);
+
+  // Generate some random points in the arena
+  constexpr int kNumPointsToTest = 1000;
+  std::vector<Vector3f> p_L_vec(kNumPointsToTest);
+  std::generate(p_L_vec.begin(), p_L_vec.end(), []() {
+    return Vector3f(test_utils::randomFloatInRange(-5.0f + kVoxelSize_m,
+                                                   5.0f - kVoxelSize_m),
+                    test_utils::randomFloatInRange(-5.0f + kVoxelSize_m,
+                                                   5.0f - kVoxelSize_m),
+                    test_utils::randomFloatInRange(0.0f + kVoxelSize_m,
+                                                   5.0f - kVoxelSize_m));
+  });
+
+  // Interpolate the distance field at these locations
+  std::vector<float> interpolated_distances;
+  std::vector<bool> success_flags;
+  interpolation::interpolateOnCPU(p_L_vec, gt_layer, &interpolated_distances,
+                                  &success_flags);
+
+  // Get the true distances
+  std::vector<float> gt_distances;
+  for (const Vector3f& p_L : p_L_vec) {
+    gt_distances.push_back(scene.getSignedDistanceToPoint(p_L, kMaxDist));
+  }
+
+  // Check
+  for (size_t i = 0; i < p_L_vec.size(); i++) {
+    EXPECT_NEAR(interpolated_distances[i], gt_distances[i], 0.5 * kVoxelSize_m);
+  }
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_3dmatch.cpp b/nvblox/tests/test_3dmatch.cpp
new file mode 100644
index 00000000..dd7780d6
--- /dev/null
+++ b/nvblox/tests/test_3dmatch.cpp
@@ -0,0 +1,133 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+
+#include "nvblox/core/image.h"
+#include "nvblox/datasets/image_loader.h"
+#include "nvblox/datasets/parse_3dmatch.h"
+
+// DEBUG
+#include <fstream>
+#include <iostream>
+
+using namespace nvblox;
+
+constexpr float kTolerance = 1e-4;
+
+class Dataset3DMatchTest : public ::testing::Test {
+ protected:
+  void SetUp() override { base_path_ = "./data/3dmatch/"; }
+
+  std::string base_path_;
+};
+
+TEST_F(Dataset3DMatchTest, ParseTransform) {
+  const std::string transform_filename =
+      datasets::threedmatch::getPathForFramePose(base_path_, 1, 0);
+
+  Transform T_L_C_test;
+
+  ASSERT_TRUE(datasets::threedmatch::parsePoseFromFile(transform_filename,
+                                                       &T_L_C_test));
+
+  Eigen::Matrix4f T_L_C_mat;
+  T_L_C_mat << 3.13181000e-01, 3.09473000e-01, -8.97856000e-01, 1.97304600e+00,
+      -8.73910000e-02, -9.32015000e-01, -3.51729000e-01, 1.12573400e+00,
+      -9.45665000e-01, 1.88620000e-01, -2.64844000e-01, 3.09820000e-01,
+      0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 1.00000000e+00;
+
+  Transform T_L_C_true(T_L_C_mat);
+
+  EXPECT_TRUE(T_L_C_test.isApprox(T_L_C_true));
+}
+
+TEST_F(Dataset3DMatchTest, ParseCameraIntrinsics) {
+  const std::string intrinsics_filename =
+      datasets::threedmatch::getPathForCameraIntrinsics(base_path_);
+
+  Eigen::Matrix3f intrinsics_test;
+
+  ASSERT_TRUE(datasets::threedmatch::parseCameraFromFile(intrinsics_filename,
+                                                         &intrinsics_test));
+
+  Eigen::Matrix3f intrinsics_true;
+  intrinsics_true << 5.70342205e+02, 0.00000000e+00, 3.20000000e+02,
+      0.00000000e+00, 5.70342205e+02, 2.40000000e+02, 0.00000000e+00,
+      0.00000000e+00, 1.00000000e+00;
+
+  EXPECT_TRUE(intrinsics_test.isApprox(intrinsics_true));
+}
+
+TEST_F(Dataset3DMatchTest, LoadImage) {
+  const std::string image_filename =
+      datasets::threedmatch::getPathForDepthImage(base_path_, 1, 0);
+
+  DepthImage image;
+  ASSERT_TRUE(datasets::load16BitDepthImage(image_filename, &image));
+
+  EXPECT_EQ(image.rows(), 480);
+  EXPECT_EQ(image.cols(), 640);
+
+  EXPECT_NEAR(image::minGPU(image), 0.0, kTolerance);
+  EXPECT_NEAR(image::maxGPU(image), 7.835, kTolerance);
+}
+
+enum class LoaderType { kSingleThreaded, kMultiThreaded };
+
+class LoaderParameterizedTest
+    : public Dataset3DMatchTest,
+      public ::testing::WithParamInterface<LoaderType> {
+  //
+};
+
+TEST_P(LoaderParameterizedTest, ImageLoaderObject) {
+  const LoaderType loader_type = GetParam();
+  std::unique_ptr<datasets::ImageLoader<DepthImage>> depth_loader_ptr;
+  if (loader_type == LoaderType::kSingleThreaded) {
+    depth_loader_ptr =
+        datasets::threedmatch::createDepthImageLoader(base_path_, 1);
+  } else {
+    constexpr int kNumThreads = 6;
+    depth_loader_ptr =
+        datasets::threedmatch::createMultithreadedDepthImageLoader(
+            base_path_, 1, kNumThreads);
+  }
+
+  DepthImage depth_image_1;
+  DepthImage depth_image_2;
+  DepthImage depth_image_3;
+  DepthImage depth_image_4;
+  EXPECT_TRUE(depth_loader_ptr->getNextImage(&depth_image_1));
+  EXPECT_TRUE(depth_loader_ptr->getNextImage(&depth_image_2));
+  EXPECT_TRUE(depth_loader_ptr->getNextImage(&depth_image_3));
+  EXPECT_FALSE(depth_loader_ptr->getNextImage(&depth_image_4));
+
+  EXPECT_EQ(depth_image_1.rows(), 480);
+  EXPECT_EQ(depth_image_1.cols(), 640);
+  EXPECT_NEAR(image::minGPU(depth_image_1), 0.0, kTolerance);
+  EXPECT_NEAR(image::maxGPU(depth_image_1), 7.835, kTolerance);
+}
+
+INSTANTIATE_TEST_CASE_P(LoaderTests, LoaderParameterizedTest,
+                        ::testing::Values(LoaderType::kSingleThreaded,
+                                          LoaderType::kMultiThreaded));
+
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_blox.cpp b/nvblox/tests/test_blox.cpp
new file mode 100644
index 00000000..c3cf8410
--- /dev/null
+++ b/nvblox/tests/test_blox.cpp
@@ -0,0 +1,96 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/accessors.h"
+#include "nvblox/core/blox.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/unified_ptr.h"
+
+#include "nvblox/tests/blox.h"
+#include "nvblox/tests/test_utils_cuda.h"
+
+using namespace nvblox;
+
+TEST(BloxTest, InitializeDistanceBloxOnGPU) {
+  TsdfBlock::Ptr block_ptr = TsdfBlock::allocate(MemoryType::kDevice);
+  TsdfVoxel zero_voxel;
+  zero_voxel.distance = 0.0f;
+  zero_voxel.weight = 0.0f;
+  EXPECT_TRUE(test_utils::checkBlockAllConstant(block_ptr, zero_voxel));
+  TsdfVoxel one_voxel;
+  one_voxel.distance = 1.0f;
+  one_voxel.weight = 1.0f;
+  EXPECT_FALSE(test_utils::checkBlockAllConstant(block_ptr, one_voxel));
+}
+
+TEST(BloxTestDeathTest, NoAllocationDefined) {
+  // This block type has not defined an allocation function
+  EXPECT_FALSE(traits::has_allocate::value);
+}
+
+TEST(BloxTest, CustomGPUInitialization) {
+  constexpr float block_size_m = 0.1;
+  BlockLayer<VoxelBlock<TestVoxel>> layer(block_size_m, MemoryType::kDevice);
+  auto block_ptr = layer.allocateBlockAtIndex(Index3D(0, 0, 0));
+  TestVoxel one_voxel;
+  one_voxel.data = 1;
+  EXPECT_TRUE(test_utils::checkBlockAllConstant(block_ptr, one_voxel));
+  TestVoxel zero_voxel;
+  zero_voxel.data = 0;
+  EXPECT_FALSE(test_utils::checkBlockAllConstant(block_ptr, zero_voxel));
+}
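+
+// For reference, the allocation trait checked above keys on a static
+// allocate() like the one TsdfBlock exposes (signature assumed from its use
+// in this file; the test block types themselves live in nvblox/tests/blox.h):
+//
+//   static TsdfBlock::Ptr allocate(MemoryType memory_type);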
+
+TEST(BloxTest, ColorInitialization) {
+  // The color block has non-trivial initialization. Therefore we test this
+  // block specifically as a representative of the class of non-trivially
+  // constructed blocks.
+
+  // kUnified
+  constexpr float block_size_m = 0.1;
+  BlockLayer<ColorBlock> layer_unified(block_size_m, MemoryType::kUnified);
+  auto block_ptr = layer_unified.allocateBlockAtIndex(Index3D(0, 0, 0));
+  auto check_color_voxels = [](const Index3D& index,
+                               const ColorVoxel* voxel_ptr) {
+    EXPECT_EQ(voxel_ptr->color, Color::Gray());
+    EXPECT_EQ(voxel_ptr->weight, 0.0f);
+  };
+  callFunctionOnAllVoxels(*block_ptr, check_color_voxels);
+
+  // kDevice
+  BlockLayer<ColorBlock> layer_device(block_size_m, MemoryType::kDevice);
+  auto block_device_ptr = layer_device.allocateBlockAtIndex(Index3D(0, 0, 0));
+  ColorVoxel gray_voxel;
+  gray_voxel.color.r = 127;
+  gray_voxel.color.g = 127;
+  gray_voxel.color.b = 127;
+  gray_voxel.weight = 0.0f;
+  EXPECT_TRUE(test_utils::checkBlockAllConstant(block_device_ptr, gray_voxel));
+  ColorVoxel zero_voxel;
+  zero_voxel.color.r = 0;
+  zero_voxel.color.g = 0;
+  zero_voxel.color.b = 0;
+  zero_voxel.weight = 0.0f;
+  EXPECT_FALSE(test_utils::checkBlockAllConstant(block_device_ptr, zero_voxel));
+}
+
+int main(int argc, char** argv) {
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_cake.cpp b/nvblox/tests/test_cake.cpp
new file mode 100644
index 00000000..f4373734
--- /dev/null
+++ b/nvblox/tests/test_cake.cpp
@@ -0,0 +1,120 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/layer_cake.h"
+#include "nvblox/core/voxels.h"
+
+#include "nvblox/tests/blox.h"
+
+using namespace nvblox;
+
+TEST(LayerCakeTest, addAndRetrieve) {
+  // Create
+  const float voxel_size = 0.1f;
+  LayerCake cake(voxel_size);
+
+  // Add a TsdfLayer with a block
+  TsdfLayer* tsdf_layer_ptr_1 = cake.add<TsdfLayer>(MemoryType::kUnified);
+  auto block_ptr_1 = tsdf_layer_ptr_1->allocateBlockAtIndex(Index3D(0, 0, 0));
+  block_ptr_1->voxels[0][0][0].distance = 1.0f;
+
+  // Retrieve
+  ASSERT_TRUE(cake.exists<TsdfLayer>());
+  const TsdfLayer* tsdf_layer_ptr_2 = cake.getConstPtr<TsdfLayer>();
+  EXPECT_TRUE(tsdf_layer_ptr_2->isBlockAllocated(Index3D(0, 0, 0)));
+  auto block_ptr_2 = tsdf_layer_ptr_2->getBlockAtIndex(Index3D(0, 0, 0));
+  EXPECT_EQ(block_ptr_2->voxels[0][0][0].distance, 1.0f);
+}
+
+TEST(LayerCakeTest, create) {
+  // Bring in a custom test layer
+  using DummyLayer = BlockLayer;
+
+  // Create
+  const float voxel_size = 0.1f;
+  auto cake = LayerCake::create(
+      voxel_size, MemoryType::kUnified);
+
+  // Checks
+  EXPECT_TRUE(cake.exists());
+  EXPECT_TRUE(cake.exists());
+  EXPECT_TRUE(cake.exists());
+  EXPECT_TRUE(cake.exists());
+
+  EXPECT_FALSE(cake.exists());
+}
+
+TEST(LayerCakeTest, moveOperations) {
+  const float voxel_size = 0.1f;
+  LayerCake cake_1 =
+      LayerCake::create(voxel_size, MemoryType::kDevice);
+
+  LayerCake cake_2 =
+      LayerCake::create(voxel_size, MemoryType::kDevice);
+
+  cake_1 = std::move(cake_2);
+
+  EXPECT_FALSE(cake_1.exists());
+  EXPECT_FALSE(cake_1.exists());
+  EXPECT_TRUE(cake_1.exists());
+  EXPECT_TRUE(cake_1.exists());
+
+  EXPECT_FALSE(cake_2.exists());
+  EXPECT_FALSE(cake_2.exists());
+  EXPECT_FALSE(cake_2.exists());
+  EXPECT_FALSE(cake_2.exists());
+}
+
+TEST(LayerCakeTest, voxelAndBlockSize) {
+  const float voxel_size = 0.1f;
+  LayerCake cake =
+      LayerCake::create<TsdfLayer>(voxel_size, MemoryType::kDevice);
+
+  const float expected_block_size = voxel_size * TsdfBlock::kVoxelsPerSide;
+
+  EXPECT_EQ(cake.getPtr<TsdfLayer>()->voxel_size(), 0.1f);
+  EXPECT_EQ(cake.getPtr<TsdfLayer>()->block_size(), expected_block_size);
+}
+
+TEST(LayerCakeTest, differentMemoryTypes) {
+  const float voxel_size = 0.1f;
+
+  LayerCake cake =
+      LayerCake::create<TsdfLayer, ColorLayer, EsdfLayer, MeshLayer>(
+          voxel_size, MemoryType::kDevice, MemoryType::kUnified,
+          MemoryType::kHost, MemoryType::kDevice);
+
+  EXPECT_EQ(cake.get<TsdfLayer>().memory_type(), MemoryType::kDevice);
+  EXPECT_EQ(cake.get<ColorLayer>().memory_type(), MemoryType::kUnified);
+  EXPECT_EQ(cake.get<EsdfLayer>().memory_type(), MemoryType::kHost);
+  EXPECT_EQ(cake.get<MeshLayer>().memory_type(), MemoryType::kDevice);
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_camera.cpp b/nvblox/tests/test_camera.cpp
new file mode 100644
index 00000000..68204968
--- /dev/null
+++ b/nvblox/tests/test_camera.cpp
@@ -0,0 +1,353 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/bounding_boxes.h"
+#include "nvblox/core/camera.h"
+#include "nvblox/core/types.h"
+
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+// TODO: Decide where to put test epsilons
+// NOTE(alexmillane): I had to crank this up slightly to get things to pass...
+// I guess this is just floating point errors accumulating?
+constexpr float kFloatEpsilon = 1e-4;
+
+std::pair<Vector3f, Vector2f> getRandomVisibleRayAndImagePoint(
+    const Camera& camera) {
+  // Random point on the image plane
+  const Vector2f u_C(test_utils::randomFloatInRange(
+                         0.0f, static_cast<float>(camera.width() - 1)),
+                     test_utils::randomFloatInRange(
+                         0.0f, static_cast<float>(camera.height() - 1)));
+  // Normalized ray
+  return {camera.rayFromImagePlaneCoordinates(u_C).normalized(), u_C};
+}
+
+Camera getTestCamera() {
+  // Arbitrary camera
+  constexpr float fu = 300;
+  constexpr float fv = 300;
+  constexpr int width = 640;
+  constexpr int height = 480;
+  constexpr float cu = static_cast<float>(width) / 2.0f;
+  constexpr float cv = static_cast<float>(height) / 2.0f;
+  return Camera(fu, fv, cu, cv, width, height);
+}
+
+TEST(CameraTest, PointsInView) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  const Camera camera = getTestCamera();
+
+  // Generate some random points (in view) and project them back
+  constexpr int kNumPoints = 1000;
+  for (int i = 0; i < kNumPoints; i++) {
+    Vector3f ray_C;
+    Vector2f u_C;
+    std::tie(ray_C, u_C) = getRandomVisibleRayAndImagePoint(camera);
+    const Vector3f p_C = test_utils::randomFloatInRange(1.0, 1000.0) * ray_C;
+    Vector2f u_reprojection_C;
+    EXPECT_TRUE(camera.project(p_C, &u_reprojection_C));
+    EXPECT_TRUE(((u_reprojection_C - u_C).array().abs() < kFloatEpsilon).all());
+  }
+}
+
+TEST(CameraTest, CenterPixel) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  const Camera camera = getTestCamera();
+
+  // Center
+  const Vector3f center_ray = Vector3f(0.0f, 0.0f, 1.0f);
+  const Vector3f p_C = test_utils::randomFloatInRange(1.0, 1000.0) * center_ray;
+  Eigen::Vector2f u;
+  EXPECT_TRUE(camera.project(p_C, &u));
+  EXPECT_TRUE(
+      ((u - Vector2f(camera.cu(), camera.cv())).array().abs() < kFloatEpsilon)
+          .all());
+}
+
+TEST(CameraTest, BehindCamera) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  const Camera camera = getTestCamera();
+
+  constexpr int kNumPoints = 1000;
+  for (int i = 0; i < kNumPoints; i++) {
+    Vector3f ray_C;
+    Vector2f u_C;
+    std::tie(ray_C, u_C) = getRandomVisibleRayAndImagePoint(camera);
+    Vector3f p_C = test_utils::randomFloatInRange(1.0, 1000.0) * ray_C;
+    // The negation here puts the point behind the camera.
+    p_C.z() = -1.0f * p_C.z();
+    Vector2f u_reprojection_C;
+    EXPECT_FALSE(camera.project(p_C, &u_reprojection_C));
+  }
+}
+
+TEST(CameraTest, OutsideImagePlane) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  const Camera camera = getTestCamera();
+
+  constexpr int kNumPoints = 1000;
+  for (int i = 0; i < kNumPoints; i++) {
+    // Random point off the image plane
+    // Add a random offset to the center pixel with sufficient magnitude to
+    // take it off the plane.
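+    // (Eg with width = 640 the x-offset is drawn from [320, 3200] px, so the
+    // perturbed pixel lands at least half an image width from the center and
+    // is guaranteed to be off the plane.)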
+    constexpr float kOffImagePlaneFactor = 5.0;
+    const Vector2f u_perturbation_C(
+        test_utils::randomSign() *
+            test_utils::randomFloatInRange(
+                camera.width() / 2.0, kOffImagePlaneFactor * camera.width()),
+        test_utils::randomSign() *
+            test_utils::randomFloatInRange(
+                camera.height() / 2.0, kOffImagePlaneFactor * camera.height()));
+    const Vector2f u_C = Vector2f(camera.cu(), camera.cv()) + u_perturbation_C;
+
+    // NOTE(alexmillane): A local ray-from-pixel helper, so that we don't
+    // trigger the checks, because the pixel is off the image plane.
+    const auto rayFromPixelNoChecks = [camera](const auto& u_C) {
+      return Vector3f((u_C[0] - camera.cu()) / camera.fu(),
+                      (u_C[1] - camera.cv()) / camera.fv(), 1.0f);
+    };
+    const Vector3f ray_C = rayFromPixelNoChecks(u_C);
+    const Vector3f p_C = test_utils::randomFloatInRange(1.0, 1000.0) * ray_C;
+    Vector2f u_reprojection_C;
+    EXPECT_FALSE(camera.project(p_C, &u_reprojection_C));
+  }
+}
+
+TEST(CameraTest, AxisAlignedBoundingBox) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  const Camera camera = getTestCamera();
+
+  // Rays through the corners of the image plane
+  const Vector3f ray_0_C =
+      camera.rayFromImagePlaneCoordinates(Vector2f(0.0f, 0.0f));
+  const Vector3f ray_2_C =
+      camera.rayFromImagePlaneCoordinates(Vector2f(0.0f, camera.height()));
+  const Vector3f ray_1_C =
+      camera.rayFromImagePlaneCoordinates(Vector2f(camera.width(), 0.0f));
+  const Vector3f ray_3_C = camera.rayFromImagePlaneCoordinates(
+      Vector2f(camera.width(), camera.height()));
+
+  // Generate random depths
+  constexpr float kMinimumDepthPx = 1.0;
+  constexpr float kMaximumDepthPx = 1000.0;
+  const float min_depth =
+      test_utils::randomFloatInRange(kMinimumDepthPx, kMaximumDepthPx);
+  const float max_depth =
+      test_utils::randomFloatInRange(kMinimumDepthPx, kMaximumDepthPx);
+
+  // True bounding box from the 3D points
+  AlignedVector<Vector3f> view_corners_C = {
+      min_depth * ray_0_C, max_depth * ray_0_C,  // NOLINT
+      min_depth * ray_1_C, max_depth * ray_1_C,  // NOLINT
+      min_depth * ray_2_C, max_depth * ray_2_C,  // NOLINT
+      min_depth * ray_3_C, max_depth * ray_3_C   // NOLINT
+  };
+  AxisAlignedBoundingBox aabb_true;
+  std::for_each(view_corners_C.begin(), view_corners_C.end(),
+                [&aabb_true](const Vector3f& p) { aabb_true.extend(p); });
+
+  // Bounding box approximated by the camera model.
+  // TODO(alexmillane): Only tested with the identity transform at the moment.
+  const Transform T_L_C = Transform::Identity();
+  const AxisAlignedBoundingBox aabb_test =
+      camera.getViewAABB(T_L_C, min_depth, max_depth);
+
+  EXPECT_TRUE(aabb_true.isApprox(aabb_test))
+      << "AABB true: " << aabb_true.min().transpose() << " "
+      << aabb_true.max().transpose()
+      << " AABB test: " << aabb_test.min().transpose() << " "
+      << aabb_test.max().transpose();
+}
+
+TEST(CameraTest, FrustumTest) {
+  constexpr float kMinDist = 1.0f;
+  constexpr float kMaxDist = 10.0f;
+
+  const Camera camera = getTestCamera();
+
+  Frustum frustum(camera, Transform::Identity(), kMinDist, kMaxDist);
+
+  // Project a point into the camera.
+  Vector3f point_C(0.5, 0.5, 5.0);
+  Vector2f u_C;
+  ASSERT_TRUE(camera.project(point_C, &u_C));
+
+  // Check that the point is within the frustum.
+  EXPECT_TRUE(frustum.isPointInView(point_C));
+
+  // Check a point further than the max dist.
+  point_C << 0.5, 0.5, kMaxDist + 10.0f;
+  ASSERT_TRUE(camera.project(point_C, &u_C));
+  EXPECT_FALSE(frustum.isPointInView(point_C));
+
+  // Check a point closer than the min dist.
+  point_C << 0.0, 0.0, 0.3f;
+  ASSERT_TRUE(camera.project(point_C, &u_C));
+  EXPECT_FALSE(frustum.isPointInView(point_C));
+}
+
+TEST(CameraTest, FrustumAABBTest) {
+  constexpr float kMinDist = 1.0f;
+  constexpr float kMaxDist = 10.0f;
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  const Camera camera = getTestCamera();
+
+  Frustum frustum(camera, Transform::Identity(), kMinDist, kMaxDist);
+  AxisAlignedBoundingBox view_aabb =
+      camera.getViewAABB(Transform::Identity(), kMinDist, kMaxDist);
+
+  // Double-check that the camera and the frustum AABB match.
+  EXPECT_TRUE(frustum.isAABBInView(view_aabb));
+  EXPECT_TRUE(view_aabb.isApprox(frustum.getAABB()));
+
+  // Get all blocks in the view AABB and make sure that some of them are
+  // actually in the view.
+  const float block_size = 1.0f;
+  std::vector<Index3D> block_indices_in_aabb =
+      getBlockIndicesTouchedByBoundingBox(block_size, view_aabb);
+  std::vector<Index3D> block_indices_in_frustum;
+  for (const Index3D& block_index : block_indices_in_aabb) {
+    const AxisAlignedBoundingBox& aabb_block =
+        getAABBOfBlock(block_size, block_index);
+    if (frustum.isAABBInView(aabb_block)) {
+      block_indices_in_frustum.push_back(block_index);
+    }
+  }
+
+  EXPECT_GT(block_indices_in_aabb.size(), block_indices_in_frustum.size());
+  EXPECT_GT(block_indices_in_frustum.size(), 0);
+
+  // Check all voxels within the view and make sure that they're correctly
+  // marked.
+  for (const Index3D& block_index : block_indices_in_aabb) {
+    Index3D voxel_index;
+
+    // Iterate over all the voxels:
+    for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide;
+         voxel_index.x()++) {
+      for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide;
+           voxel_index.y()++) {
+        for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide;
+             voxel_index.z()++) {
+          Vector3f position = getCenterPostionFromBlockIndexAndVoxelIndex(
+              block_size, block_index, voxel_index);
+          Eigen::Vector2f u_C;
+          bool in_frustum = frustum.isPointInView(position);
+          bool in_camera = camera.project(position, &u_C);
+          if (position.z() <= kMaxDist && position.z() >= kMinDist) {
+            EXPECT_EQ(in_frustum, in_camera);
+          } else {
+            // Outside the depth range the point must not be in the frustum,
+            // whether or not it projects into the camera.
+            EXPECT_FALSE(in_frustum);
+          }
+        }
+      }
+    }
+  }
+}
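+
+// Note on the AABB-based frustum tests: the view AABB is an axis-aligned and
+// therefore conservative bound on the frustum, so every block touching the
+// frustum is also in the AABB, but not vice versa. This is why the tests
+// expect strictly more blocks in the AABB than in the frustum.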
+
+TEST(CameraTest, FrustumAtLeastOneValidVoxelTest) {
+  constexpr float kMinDist = 0.0f;
+  constexpr float kMaxDist = 10.0f;
+  constexpr int kVoxelsPerSide = VoxelBlock<TsdfVoxel>::kVoxelsPerSide;
+  const Camera camera = getTestCamera();
+
+  Frustum frustum(camera, Transform::Identity(), kMinDist, kMaxDist);
+  AxisAlignedBoundingBox view_aabb =
+      camera.getViewAABB(Transform::Identity(), kMinDist, kMaxDist);
+
+  // Double-check that the camera and the frustum AABB match.
+  EXPECT_TRUE(frustum.isAABBInView(view_aabb));
+  EXPECT_TRUE(view_aabb.isApprox(frustum.getAABB()));
+
+  // Get all blocks in the view AABB and make sure that some of them are
+  // actually in the view.
+  const float block_size = 0.5f;
+  std::vector<Index3D> block_indices_in_aabb =
+      getBlockIndicesTouchedByBoundingBox(block_size, view_aabb);
+  std::vector<Index3D> block_indices_in_frustum;
+  for (const Index3D& block_index : block_indices_in_aabb) {
+    const AxisAlignedBoundingBox& aabb_block =
+        getAABBOfBlock(block_size, block_index);
+    if (frustum.isAABBInView(aabb_block)) {
+      block_indices_in_frustum.push_back(block_index);
+    }
+  }
+
+  EXPECT_GT(block_indices_in_aabb.size(), block_indices_in_frustum.size());
+  EXPECT_GT(block_indices_in_frustum.size(), 0);
+
+  // Check that for any given block in the frustum, there's AT LEAST one valid
+  // voxel.
+  int empty = 0;
+  for (const Index3D& block_index : block_indices_in_frustum) {
+    Index3D voxel_index;
+    bool any_valid = false;
+    // Iterate over all the voxels:
+    for (voxel_index.x() = 0; voxel_index.x() < kVoxelsPerSide;
+         voxel_index.x()++) {
+      for (voxel_index.y() = 0; voxel_index.y() < kVoxelsPerSide;
+           voxel_index.y()++) {
+        for (voxel_index.z() = 0; voxel_index.z() < kVoxelsPerSide;
+             voxel_index.z()++) {
+          Vector3f position = getCenterPostionFromBlockIndexAndVoxelIndex(
+              block_size, block_index, voxel_index);
+          Eigen::Vector2f u_C;
+          bool in_frustum = frustum.isPointInView(position);
+          bool in_camera = camera.project(position, &u_C);
+          any_valid = in_camera || any_valid;
+          if (position.z() >= 0.0f && position.z() < 1e-4f) {
+            // Skip points essentially on the camera plane.
+          } else if (position.z() <= kMaxDist && position.z() > kMinDist) {
+            EXPECT_EQ(in_frustum, in_camera);
+          } else {
+            // Outside the depth range the point must not be in the frustum,
+            // whether or not it projects into the camera.
+            EXPECT_FALSE(in_frustum);
+          }
+        }
+      }
+    }
+    if (!any_valid) {
+      empty++;
+    }
+  }
+  // At MOST 3% empty on the corners.
+  EXPECT_LE(static_cast<float>(empty) / block_indices_in_frustum.size(), 0.03);
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/nvblox/tests/test_color_image.cpp b/nvblox/tests/test_color_image.cpp
new file mode 100644
index 00000000..7204fa0c
--- /dev/null
+++ b/nvblox/tests/test_color_image.cpp
@@ -0,0 +1,228 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/image.h"
+#include "nvblox/core/interpolation_2d.h"
+#include "nvblox/datasets/image_loader.h"
+#include "nvblox/datasets/parse_3dmatch.h"
+#include "nvblox/io/csv.h"
+
+#include "nvblox/tests/gpu_image_routines.h"
+#include "nvblox/tests/interpolation_2d_gpu.h"
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+constexpr float kFloatEpsilon = 1e-4;
+
+TEST(ColorImageTest, SetConstant) {
+  // Set constant on the GPU
+  ColorImage image(480, 640, MemoryType::kUnified);
+  const Color color(255, 0, 0);
+  test_utils::setImageConstantOnGpu(color, &image);
+  // Check on the CPU
+  for (int row_idx = 0; row_idx < image.rows(); row_idx++) {
+    for (int col_idx = 0; col_idx < image.cols(); col_idx++) {
+      EXPECT_EQ(image(row_idx, col_idx), color);
+    }
+  }
+}
+
+TEST(ColorImageTest, NearbyImagesSimilar) {
+  // This tests that two consecutive images in the 3DMatch dataset have a
+  // small pixelwise difference.
+
+  // Load 3dmatch images
+  const std::string base_path = "./data/3dmatch";
+  constexpr int seq_id = 1;
+  ColorImage image_1;
+  EXPECT_TRUE(datasets::load8BitColorImage(
+      datasets::threedmatch::getPathForColorImage(base_path, seq_id, 0),
+      &image_1, MemoryType::kUnified));
+  ColorImage image_2;
+  EXPECT_TRUE(datasets::load8BitColorImage(
+      datasets::threedmatch::getPathForColorImage(base_path, seq_id, 1),
+      &image_2, MemoryType::kUnified));
+
+  // Compute the diff image on the GPU
+  ColorImage diff_image;
+  test_utils::getDifferenceImageOnGPU(image_1, image_2, &diff_image);
+
+  // Write the diff image
+  io::writeToCsv("./color_image_difference.csv", diff_image);
+
+  // Check that there's not much difference between the images using the CPU:
+  // - A difference of 50 pixel values constitutes a "big" difference for
+  //   this test.
+  // - We allow up to 10% of pixels to have a "big" difference.
+  constexpr float kBigDifferenceThreshold = 50.0;
+  constexpr float kAllowableBigDifferencePixelsRatio = 0.10;
+  CHECK_EQ(image_1.rows(), image_2.rows());
+  CHECK_EQ(image_1.cols(), image_2.cols());
+  int num_big_diff_pixels = 0;
+  for (int i = 0; i < image_2.numel(); i++) {
+    const float r_diff =
+        static_cast<float>(image_1(i).r) - static_cast<float>(image_2(i).r);
+    const float g_diff =
+        static_cast<float>(image_1(i).g) - static_cast<float>(image_2(i).g);
+    const float b_diff =
+        static_cast<float>(image_1(i).b) - static_cast<float>(image_2(i).b);
+    const float average_diff =
+        (std::fabs(r_diff) + std::fabs(g_diff) + std::fabs(b_diff)) / 3.0f;
+    if (average_diff > kBigDifferenceThreshold) {
+      ++num_big_diff_pixels;
+    }
+  }
+  const float big_diff_pixel_ratio = static_cast<float>(num_big_diff_pixels) /
+                                     static_cast<float>(image_1.numel());
+  std::cout << "num_big_diff_pixels: " << num_big_diff_pixels << std::endl;
+  std::cout << "big_diff_pixel_ratio: " << big_diff_pixel_ratio << std::endl;
+  EXPECT_LT(big_diff_pixel_ratio, kAllowableBigDifferencePixelsRatio);
+}
+
+TEST(ColorImageTest, InterpolationNearestNeighbour) {
+  // Tiny image with the col index in the g channel and the row index in r.
+  ColorImage image(2, 2, MemoryType::kUnified);
+  image(0, 0) = Color(0, 0, 0);
+  image(0, 1) = Color(0, 1, 0);
+  image(1, 0) = Color(1, 0, 0);
+  image(1, 1) = Color(1, 1, 0);
+
+  // Note the switching in index order below. This is because images are set:
+  //   image(row, col) = val
+  // but interpolated:
+  //   val = image(u_px(x, y)) = image(y, x)
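+  // (Eg sampling at u_px = (0.6, 0.4) reads image(0, 1), i.e. row 0 / col 1,
+  // giving r = 0 and g = 1, which is what the checks below assert.)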
+  Color color;
+  EXPECT_TRUE(interpolation::interpolate2D(
+      image, Vector2f(0.4, 0.4), &color, InterpolationType::kNearestNeighbor));
+  EXPECT_EQ(color.r, 0);
+  EXPECT_EQ(color.g, 0);
+  EXPECT_TRUE(interpolation::interpolate2D(
+      image, Vector2f(0.4, 0.6), &color, InterpolationType::kNearestNeighbor));
+  EXPECT_EQ(color.r, 1);
+  EXPECT_EQ(color.g, 0);
+  EXPECT_TRUE(interpolation::interpolate2D(
+      image, Vector2f(0.6, 0.4), &color, InterpolationType::kNearestNeighbor));
+  EXPECT_EQ(color.r, 0);
+  EXPECT_EQ(color.g, 1);
+  EXPECT_TRUE(interpolation::interpolate2D(
+      image, Vector2f(0.6, 0.6), &color, InterpolationType::kNearestNeighbor));
+  EXPECT_EQ(color.r, 1);
+  EXPECT_EQ(color.g, 1);
+
+  // Out of bounds
+  EXPECT_FALSE(interpolation::interpolate2D(
+      image, Vector2f(-0.6, 0.0), &color, InterpolationType::kNearestNeighbor));
+  EXPECT_FALSE(interpolation::interpolate2D(
+      image, Vector2f(0.0, -0.6), &color, InterpolationType::kNearestNeighbor));
+  EXPECT_FALSE(
+      interpolation::interpolate2D(image, Vector2f(-0.6, -0.6), &color,
+                                   InterpolationType::kNearestNeighbor));
+  EXPECT_FALSE(interpolation::interpolate2D(
+      image, Vector2f(1.6, 0.0), &color, InterpolationType::kNearestNeighbor));
+  EXPECT_FALSE(interpolation::interpolate2D(
+      image, Vector2f(0.0, 1.6), &color, InterpolationType::kNearestNeighbor));
+  EXPECT_FALSE(interpolation::interpolate2D(
+      image, Vector2f(1.6, 1.6), &color, InterpolationType::kNearestNeighbor));
+}
+
+TEST(ColorImageTest, InterpolationLinear) {
+  // Tiny image with the col index in the g channel and the row index in r,
+  // times 10.
+  ColorImage image(2, 2, MemoryType::kUnified);
+  image(0, 0) = Color(5, 5, 0);
+  image(0, 1) = Color(5, 15, 0);
+  image(1, 0) = Color(15, 5, 0);
+  image(1, 1) = Color(15, 15, 0);
+
+  Color color;
+  EXPECT_TRUE(interpolation::interpolate2D(image, Vector2f(0.5, 0.5), &color,
+                                           InterpolationType::kLinear));
+  EXPECT_EQ(color.r, 5);
+  EXPECT_EQ(color.g, 5);
+
+  EXPECT_TRUE(interpolation::interpolate2D(image, Vector2f(0.4, 0.4), &color,
+                                           InterpolationType::kLinear));
+  EXPECT_EQ(color.r, 4);
+  EXPECT_EQ(color.g, 4);
+  EXPECT_TRUE(interpolation::interpolate2D(image, Vector2f(0.4, 0.6), &color,
+                                           InterpolationType::kLinear));
+  EXPECT_EQ(color.r, 6);
+  EXPECT_EQ(color.g, 4);
+  EXPECT_TRUE(interpolation::interpolate2D(image, Vector2f(0.6, 0.4), &color,
+                                           InterpolationType::kLinear));
+  EXPECT_EQ(color.r, 4);
+  EXPECT_EQ(color.g, 6);
+  EXPECT_TRUE(interpolation::interpolate2D(image, Vector2f(0.6, 0.6), &color,
+                                           InterpolationType::kLinear));
+  EXPECT_EQ(color.r, 6);
+  EXPECT_EQ(color.g, 6);
+
+  // Out of bounds
+  EXPECT_FALSE(interpolation::interpolate2D(image, Vector2f(-0.6, 0.0), &color,
+                                            InterpolationType::kLinear));
+  EXPECT_FALSE(interpolation::interpolate2D(image, Vector2f(0.0, -0.6), &color,
+                                            InterpolationType::kLinear));
+  EXPECT_FALSE(interpolation::interpolate2D(image, Vector2f(-0.6, -0.6), &color,
+                                            InterpolationType::kLinear));
+  EXPECT_FALSE(interpolation::interpolate2D(image, Vector2f(1.6, 0.0), &color,
+                                            InterpolationType::kLinear));
+  EXPECT_FALSE(interpolation::interpolate2D(image, Vector2f(0.0, 1.6), &color,
+                                            InterpolationType::kLinear));
+  EXPECT_FALSE(interpolation::interpolate2D(image, Vector2f(1.6, 1.6), &color,
+                                            InterpolationType::kLinear));
+}
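+
+// For the linear test above: with pixel centers at half-integer coordinates,
+// the sampled value along one axis works out to 10 * u, eg at u = 0.6 the two
+// columns contribute 0.9 * 5 + 0.1 * 15 = 6, matching the expected g value.
+// (The half-integer-center convention is inferred from the expected values,
+// not from the interpolation code itself.)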
x10 + ColorImage image(2, 2, MemoryType::kUnified); + image(0, 0) = Color(5, 5, 0); + image(0, 1) = Color(5, 15, 0); + image(1, 0) = Color(15, 5, 0); + image(1, 1) = Color(15, 15, 0); + + constexpr int kNumTests = 1000; + std::vector<Eigen::Vector2f> u_px_vec; + u_px_vec.reserve(kNumTests); + for (int i = 0; i < kNumTests; i++) { + u_px_vec.push_back( + Eigen::Vector2f(test_utils::randomFloatInRange(0.5f, 1.5f), + test_utils::randomFloatInRange(0.5f, 1.5f))); + } + + std::vector<Color> values(kNumTests, Color(0, 0, 0)); + std::vector<int> success_flags(kNumTests, 0); + test_utils::linearInterpolateImageGpu(image, u_px_vec, &values, + &success_flags); + + for (int i = 0; i < kNumTests; i++) { + EXPECT_TRUE(success_flags[i] == 1); + const uint8_t x = + static_cast<uint8_t>(std::round((u_px_vec[i].x() * 10.0f))); + const uint8_t y = + static_cast<uint8_t>(std::round((u_px_vec[i].y() * 10.0f))); + EXPECT_NEAR(values[i].g, x, kFloatEpsilon); + EXPECT_NEAR(values[i].r, y, kFloatEpsilon); + } +} + +int main(int argc, char** argv) { + FLAGS_alsologtostderr = true; + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/nvblox/tests/test_color_integrator.cpp b/nvblox/tests/test_color_integrator.cpp new file mode 100644 index 00000000..8336554c --- /dev/null +++ b/nvblox/tests/test_color_integrator.cpp @@ -0,0 +1,479 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include <gtest/gtest.h> + +#include "nvblox/core/accessors.h" +#include "nvblox/core/blox.h" +#include "nvblox/core/bounding_boxes.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/interpolation_3d.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/voxels.h" +#include "nvblox/integrators/projective_color_integrator.h" +#include "nvblox/integrators/projective_tsdf_integrator.h" +#include "nvblox/io/mesh_io.h" +#include "nvblox/mesh/mesh_integrator.h" +#include "nvblox/primitives/primitives.h" +#include "nvblox/primitives/scene.h" + +#include "nvblox/tests/gpu_image_routines.h" +#include "nvblox/tests/utils.h" + +using namespace nvblox; + +class ColorIntegrationTest : public ::testing::Test { + protected: + ColorIntegrationTest() + : kSphereCenter(Vector3f(0.0f, 0.0f, 2.0f)), + gt_layer_(voxel_size_m_, MemoryType::kUnified), + camera_(Camera(fu_, fv_, cu_, cv_, width_, height_)) { + // Maximum distance to consider for scene generation. + constexpr float kMaxDist = 10.0; + constexpr float kMinWeight = 1.0; + + // Tolerance for error. + constexpr float kDistanceErrorTolerance = truncation_distance_m_; + + // Scene is bounded to -5, -5, 0 to 5, 5, 5. + scene.aabb() = AxisAlignedBoundingBox(Vector3f(-5.0f, -5.0f, 0.0f), + Vector3f(5.0f, 5.0f, 5.0f)); + // Create a scene with a ground plane and a sphere. + scene.addGroundLevel(0.0f); + scene.addCeiling(5.0f); + scene.addPrimitive( + std::make_unique<primitives::Sphere>(Vector3f(0.0f, 0.0f, 2.0f), 2.0f)); + // Add bounding planes at 5 meters. Basically makes it a sphere in a box.
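+ // (Why the closed box matters: generateSdfFromScene() fills every voxel in + // the scene AABB with the distance to the nearest primitive, so bounding + // the scene on all sides gives every ground-truth voxel a well-defined + // nearest surface with no unobserved holes.)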
+ scene.addPlaneBoundaries(-5.0f, 5.0f, -5.0f, 5.0f); + + // Get the ground truth SDF for it. + scene.generateSdfFromScene(truncation_distance_m_, &gt_layer_); + } + + // Scenes + constexpr static float kSphereRadius = 2.0f; + const Vector3f kSphereCenter; + + // Test layer + constexpr static float voxel_size_m_ = 0.1; + constexpr static float block_size_m_ = + VoxelBlock<TsdfVoxel>::kVoxelsPerSide * voxel_size_m_; + TsdfLayer gt_layer_; + + // Truncation distance + constexpr static float truncation_distance_vox_ = 4; + constexpr static float truncation_distance_m_ = + truncation_distance_vox_ * voxel_size_m_; + + // Test camera + constexpr static float fu_ = 300; + constexpr static float fv_ = 300; + constexpr static int width_ = 640; + constexpr static int height_ = 480; + constexpr static float cu_ = static_cast<float>(width_) / 2.0f; + constexpr static float cv_ = static_cast<float>(height_) / 2.0f; + Camera camera_; + + // Test Scene + primitives::Scene scene; +}; + +// ProjectiveTsdfIntegrator child that gives the tests access to the internal +// functions. +class TestProjectiveColorIntegratorGPU : public ProjectiveColorIntegrator { + public: + TestProjectiveColorIntegratorGPU() : ProjectiveColorIntegrator() {} + FRIEND_TEST(ColorIntegrationTest, TruncationBandTest); +}; + +ColorImage generateSolidColorImage(const Color& color, const int height, + const int width) { + // Generate a solid-color image on the GPU. + ColorImage image(height, width); + nvblox::test_utils::setImageConstantOnGpu(color, &image); + return image; +} + +std::vector<Eigen::Vector3f> getPointsOnASphere(const float radius, + const Eigen::Vector3f& center, + const int points_per_rad = 10) { + std::vector<Eigen::Vector3f> sphere_points; + for (int azimuth_idx = 0; azimuth_idx < 2 * points_per_rad; azimuth_idx++) { + for (int elevation_idx = 0; elevation_idx < points_per_rad; + elevation_idx++) { + const float azimuth = azimuth_idx * M_PI / points_per_rad - M_PI; + const float elevation = + elevation_idx * M_PI / points_per_rad - M_PI / 2.0f; + Eigen::Vector3f p = + radius * Eigen::Vector3f(cos(azimuth) * sin(elevation), + sin(azimuth) * sin(elevation), + cos(elevation)); + p += center; + sphere_points.push_back(p); + } + } + return sphere_points; +} + +float checkSphereColor(const ColorLayer& color_layer, const Vector3f& center, + const float radius, const Color& color) { + // Check that each sphere is colored appropriately (if observed) + int num_observed = 0; + int num_tested = 0; + auto check_color = [&num_tested, &num_observed]( + const ColorVoxel& voxel, + const Color& color_2) -> void { + ++num_tested; + if (voxel.weight >= 1.0f) { + EXPECT_EQ(voxel.color, color_2); + ++num_observed; + } + }; + + const std::vector<Eigen::Vector3f> sphere_points = + getPointsOnASphere(radius, center); + for (const Vector3f& p : sphere_points) { + const ColorVoxel* color_voxel; + EXPECT_TRUE(getVoxelAtPosition(color_layer, p, &color_voxel)); + check_color(*color_voxel, color); + } + + const float ratio_observed_points = + static_cast<float>(num_observed) / static_cast<float>(num_tested); + return ratio_observed_points; +} + +TEST_F(ColorIntegrationTest, TruncationBandTest) { + // Check the GPU version against a hand-rolled CPU implementation. + TestProjectiveColorIntegratorGPU integrator; + + // The distance from the surface that we "pass" blocks within.
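+ // (In other words: a block should survive the reduction if and only if at + // least one of its TSDF voxels lies within +/-kTestDistance of the surface. + // The two loops further down verify exactly that, block by block.)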
+ constexpr float kTestDistance = voxel_size_m_; + + std::vector all_indices = gt_layer_.getAllBlockIndices(); + std::vector valid_indices = + integrator.reduceBlocksToThoseInTruncationBand(all_indices, gt_layer_, + kTestDistance); + integrator.finish(); + + // Horrible N^2 complexity set_difference implementation. But easy to write :) + std::vector not_valid_indices; + for (const Index3D& idx : all_indices) { + if (std::find(valid_indices.begin(), valid_indices.end(), idx) == + valid_indices.end()) { + not_valid_indices.push_back(idx); + } + } + + // Check indices touching band + for (const Index3D& idx : valid_indices) { + const auto block_ptr = gt_layer_.getBlockAtIndex(idx); + bool touches_band = false; + auto touches_band_lambda = [&touches_band, kTestDistance]( + const Index3D& voxel_index, + const TsdfVoxel* voxel) -> void { + if (std::abs(voxel->distance) <= kTestDistance) { + touches_band = true; + } + }; + callFunctionOnAllVoxels(*block_ptr, touches_band_lambda); + EXPECT_TRUE(touches_band); + } + + // Check indices NOT touching band + for (const Index3D& idx : not_valid_indices) { + const auto block_ptr = gt_layer_.getBlockAtIndex(idx); + bool touches_band = false; + auto touches_band_lambda = [&touches_band, kTestDistance]( + const Index3D& voxel_index, + const TsdfVoxel* voxel) -> void { + if (std::abs(voxel->distance) <= kTestDistance) { + touches_band = true; + } + }; + callFunctionOnAllVoxels(*block_ptr, touches_band_lambda); + EXPECT_FALSE(touches_band); + } +} + +TEST_F(ColorIntegrationTest, IntegrateColorToGroundTruthDistanceField) { + // Create an integrator. + ProjectiveColorIntegrator color_integrator; + + // Simulate a trajectory of the requisite amount of points, on the circle + // around the sphere. + constexpr float kTrajectoryRadius = 4.0f; + constexpr float kTrajectoryHeight = 2.0f; + constexpr int kNumTrajectoryPoints = 80; + const float radians_increment = 2 * M_PI / (kNumTrajectoryPoints); + + // Color layer + ColorLayer color_layer(voxel_size_m_, MemoryType::kUnified); + + // Generate a random color for this scene + const Color color = Color::Red(); + const ColorImage image = generateSolidColorImage(color, height_, width_); + + // Set keeping track of which blocks were touched during the test + Index3DSet touched_blocks; + + for (size_t i = 0; i < kNumTrajectoryPoints; i++) { + const float theta = radians_increment * i; + // Convert polar to cartesian coordinates. + Vector3f cartesian_coordinates(kTrajectoryRadius * std::cos(theta), + kTrajectoryRadius * std::sin(theta), + kTrajectoryHeight); + // The camera has its z axis pointing towards the origin. + Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5); + Eigen::Quaternionf rotation_theta( + Eigen::AngleAxisf(M_PI + theta, Vector3f::UnitZ())); + + // Construct a transform from camera to scene with this. 
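+ // (Eigen note: prerotate()/pretranslate() compose on the left, so after the + // two calls below the transform is T_S_C = translate(p) * rotation_theta * + // rotation_base: points are rotated into the scene frame first, then + // shifted to the camera position on the circle.)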
+ Transform T_S_C = Transform::Identity(); + T_S_C.prerotate(rotation_theta * rotation_base); + T_S_C.pretranslate(cartesian_coordinates); + + // Generate an image with a single color + std::vector updated_blocks; + color_integrator.integrateFrame(image, T_S_C, camera_, gt_layer_, + &color_layer, &updated_blocks); + // Accumulate touched block indices + std::copy(updated_blocks.begin(), updated_blocks.end(), + std::inserter(touched_blocks, touched_blocks.end())); + } + + // Lambda that checks if voxels have the passed color (if they have weight > + // 0) + auto color_check_lambda = [&color](const Index3D& voxel_idx, + const ColorVoxel* voxel) -> void { + if (voxel->weight > 0.0f) { + EXPECT_EQ(voxel->color, color); + } + }; + + // Check that all touched blocks are the color we chose + for (const Index3D& block_idx : touched_blocks) { + callFunctionOnAllVoxels(*color_layer.getBlockAtIndex(block_idx), + color_check_lambda); + } + + // Check that most points on the surface of the sphere have been observed + int num_points_on_sphere_surface_observed = 0; + const std::vector sphere_points = + getPointsOnASphere(kSphereRadius, kSphereCenter); + const int num_surface_points_tested = sphere_points.size(); + for (const Vector3f p : sphere_points) { + const ColorVoxel* color_voxel; + EXPECT_TRUE(getVoxelAtPosition(color_layer, p, &color_voxel)); + if (color_voxel->weight >= 1.0f) { + ++num_points_on_sphere_surface_observed; + } + } + const float ratio_observed_surface_points = + static_cast(num_points_on_sphere_surface_observed) / + static_cast(num_surface_points_tested); + std::cout << "num_points_on_sphere_surface_observed: " + << num_points_on_sphere_surface_observed << std::endl; + std::cout << "num_surface_points_tested: " << num_surface_points_tested + << std::endl; + std::cout << "ratio_observed_surface_points: " + << ratio_observed_surface_points << std::endl; + EXPECT_GT(ratio_observed_surface_points, 0.5); + + // Check that all color blocks have a corresponding block in the tsdf layer + for (const Index3D block_idx : color_layer.getAllBlockIndices()) { + EXPECT_NE(gt_layer_.getBlockAtIndex(block_idx), nullptr); + } + + // Generate a mesh from the "reconstruction" + MeshIntegrator mesh_integrator; + BlockLayer mesh_layer(block_size_m_, MemoryType::kDevice); + EXPECT_TRUE( + mesh_integrator.integrateMeshFromDistanceField(gt_layer_, &mesh_layer)); + mesh_integrator.colorMesh(color_layer, &mesh_layer); + + // Write to file + io::outputMeshLayerToPly(mesh_layer, "color_sphere_mesh.ply"); +} + +TEST_F(ColorIntegrationTest, ColoredSpheres) { + constexpr float kTruncationDistanceVox = 2; + constexpr float truncation_distance_m_ = + kTruncationDistanceVox * voxel_size_m_; + + primitives::Scene scene_2; + + constexpr float kSphereRadius = 2.0f; + const Eigen::Vector3f center_1 = Vector3f(5.0f, 0.0f, 0.0f); + const Eigen::Vector3f center_2 = Vector3f(5.0f, 5.0f, 0.0f); + const Eigen::Vector3f center_3 = Vector3f(5.0f, 10.0f, 0.0f); + + // Scene is bounded to -5, -5, 0 to 5, 5, 5. + scene_2.aabb() = AxisAlignedBoundingBox(Vector3f(-5.0f, -5.0f, -5.0f), + Vector3f(10.0f, 15.0f, 5.0f)); + // Create a scene with a ground plane and a sphere. + scene_2.addPrimitive( + std::make_unique(center_1, kSphereRadius)); + scene_2.addPrimitive( + std::make_unique(center_2, kSphereRadius)); + scene_2.addPrimitive( + std::make_unique(center_3, kSphereRadius)); + + // Camera + // Slightly narrow field of view so each image below sees only a single + // sphere. 
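+ // (Sanity arithmetic, not part of the test: with fu = 450 and a 640 px wide + // image, the horizontal half-angle is atan(320 / 450) ~ 35 deg, i.e. ~71 deg + // full FOV, versus ~94 deg for the fu = 300 fixture camera. The spheres sit + // 5 m apart laterally at 5 m range (45 deg off-axis), so the narrower FOV + // keeps exactly one sphere per view.)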
+ constexpr float fu = 450; + constexpr float fv = 450; + constexpr int width = 640; + constexpr int height = 480; + constexpr float cu = static_cast<float>(width) / 2.0f; + constexpr float cv = static_cast<float>(height) / 2.0f; + Camera camera(fu, fv, cu, cv, width, height); + + // Simulate camera views + // Rotate 90 degrees around the y axis to point camera-Z along world-X + Transform T_S_C1 = Transform::Identity(); + T_S_C1.prerotate( + Eigen::Quaternionf(Eigen::AngleAxisf(M_PI / 2, Vector3f::UnitY()))); + T_S_C1.pretranslate(Eigen::Vector3f(0.0f, 0.0f, 0.0f)); + Transform T_S_C2 = Transform::Identity(); + T_S_C2.prerotate( + Eigen::Quaternionf(Eigen::AngleAxisf(M_PI / 2, Vector3f::UnitY()))); + T_S_C2.pretranslate(Eigen::Vector3f(0.0f, 5.0f, 0.0f)); + Transform T_S_C3 = Transform::Identity(); + T_S_C3.prerotate( + Eigen::Quaternionf(Eigen::AngleAxisf(M_PI / 2, Vector3f::UnitY()))); + T_S_C3.pretranslate(Eigen::Vector3f(0.0f, 10.0f, 0.0f)); + + // TSDF + scene_2.generateSdfFromScene(truncation_distance_m_, &gt_layer_); + + // Color layer + ProjectiveColorIntegrator color_integrator; + ColorLayer color_layer(voxel_size_m_, MemoryType::kUnified); + + const auto color_1 = Color::Red(); + const auto color_2 = Color::Green(); + const auto color_3 = Color::Blue(); + + const auto image_1 = generateSolidColorImage(color_1, height_, width_); + const auto image_2 = generateSolidColorImage(color_2, height_, width_); + const auto image_3 = generateSolidColorImage(color_3, height_, width_); + + std::vector<Index3D> updated_blocks; + color_integrator.integrateFrame(image_1, T_S_C1, camera, gt_layer_, + &color_layer, &updated_blocks); + color_integrator.integrateFrame(image_2, T_S_C2, camera, gt_layer_, + &color_layer, &updated_blocks); + color_integrator.integrateFrame(image_3, T_S_C3, camera, gt_layer_, + &color_layer, &updated_blocks); + + // Generate a mesh from the "reconstruction" + MeshIntegrator mesh_integrator; + BlockLayer<MeshBlock> mesh_layer(block_size_m_, MemoryType::kUnified); + EXPECT_TRUE( + mesh_integrator.integrateMeshFromDistanceField(gt_layer_, &mesh_layer)); + mesh_integrator.colorMesh(color_layer, &mesh_layer); + + const float sphere_1_observed_ratio = + checkSphereColor(color_layer, center_1, kSphereRadius, color_1); + const float sphere_2_observed_ratio = + checkSphereColor(color_layer, center_2, kSphereRadius, color_2); + const float sphere_3_observed_ratio = + checkSphereColor(color_layer, center_3, kSphereRadius, color_3); + + EXPECT_GT(sphere_1_observed_ratio, 0.2); + EXPECT_GT(sphere_2_observed_ratio, 0.2); + EXPECT_GT(sphere_3_observed_ratio, 0.2); + + std::cout << "sphere_1_observed_ratio: " << sphere_1_observed_ratio + << std::endl; + std::cout << "sphere_2_observed_ratio: " << sphere_2_observed_ratio + << std::endl; + std::cout << "sphere_3_observed_ratio: " << sphere_3_observed_ratio + << std::endl; + + // Write to file + io::outputMeshLayerToPly(mesh_layer, "colored_spheres.ply"); +} + +TEST_F(ColorIntegrationTest, OcclusionTesting) { + primitives::Scene scene_3; + + // Scene: Two spheres, one occluded by the other.
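+ // (Geometry: the camera at the origin looks along +x. Sphere 1 spans + // x in [3, 7] with angular radius asin(2/5) ~ 24 deg; sphere 2 spans + // x in [8, 12] with angular radius asin(2/10) ~ 12 deg, entirely inside + // sphere 1's silhouette, so it should be completely occluded.)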
+ scene_3.aabb() = AxisAlignedBoundingBox(Vector3f(-5.0f, -5.0f, -5.0f), + Vector3f(15.0f, 15.0f, 5.0f)); + constexpr float kSphereRadius = 2.0f; + + const Vector3f center_1(5.0f, 0.0f, 0.0f); + const Vector3f center_2(10.0f, 0.0f, 0.0f); + + scene_3.addPrimitive( + std::make_unique<primitives::Sphere>(center_1, kSphereRadius)); + scene_3.addPrimitive( + std::make_unique<primitives::Sphere>(center_2, kSphereRadius)); + scene_3.generateSdfFromScene(truncation_distance_m_, &gt_layer_); + + // Viewpoint + Transform T_S_C = Transform::Identity(); + T_S_C.prerotate( + Eigen::Quaternionf(Eigen::AngleAxisf(M_PI / 2, Vector3f::UnitY()))); + + // Integrate a color image + ProjectiveColorIntegrator color_integrator; + ColorLayer color_layer(voxel_size_m_, MemoryType::kUnified); + + const auto color_1 = Color::Red(); + const auto image_1 = generateSolidColorImage(color_1, height_, width_); + + std::vector<Index3D> updated_blocks; + color_integrator.integrateFrame(image_1, T_S_C, camera_, gt_layer_, + &color_layer, &updated_blocks); + + // Check front sphere (observed voxels red) + const float sphere_1_observed_ratio = + checkSphereColor(color_layer, center_1, kSphereRadius, color_1); + EXPECT_GT(sphere_1_observed_ratio, 0.2); + std::cout << "sphere_1_observed_ratio: " << sphere_1_observed_ratio + << std::endl; + + // Check back sphere (no observed voxels) + const std::vector<Eigen::Vector3f> sphere_points = + getPointsOnASphere(kSphereRadius, center_2); + for (const Vector3f& p : sphere_points) { + const ColorVoxel* color_voxel; + const bool block_allocated = + getVoxelAtPosition(color_layer, p, &color_voxel); + if (block_allocated) { + EXPECT_EQ(color_voxel->weight, 0.0f); + } + } + + // Generate a mesh from the "reconstruction" + MeshIntegrator mesh_integrator; + MeshLayer mesh_layer(block_size_m_, MemoryType::kUnified); + EXPECT_TRUE( + mesh_integrator.integrateMeshFromDistanceField(gt_layer_, &mesh_layer)); + mesh_integrator.colorMesh(color_layer, &mesh_layer); + io::outputMeshLayerToPly(mesh_layer, "colored_spheres_occluded.ply"); +} + +int main(int argc, char** argv) { + FLAGS_alsologtostderr = true; + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/nvblox/tests/test_depth_image.cpp b/nvblox/tests/test_depth_image.cpp new file mode 100644 index 00000000..c3d1d317 --- /dev/null +++ b/nvblox/tests/test_depth_image.cpp @@ -0,0 +1,200 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ +#include <gtest/gtest.h> + +#include "nvblox/core/image.h" +#include "nvblox/core/interpolation_2d.h" +#include "nvblox/core/types.h" + +#include "nvblox/tests/gpu_image_routines.h" +#include "nvblox/tests/interpolation_2d_gpu.h" +#include "nvblox/tests/utils.h" + +using namespace nvblox; + +constexpr float kFloatEpsilon = 1e-4; + +class DepthImageTest : public ::testing::Test { + protected: + void SetUp() override { + // Uninitialized depth frame + depth_frame_ = DepthImage(rows_, cols_, MemoryType::kUnified); + } + + Index2D getRandomPixel() { + return Index2D(test_utils::randomIntInRange(0, cols_ - 1), + test_utils::randomIntInRange(0, rows_ - 1)); + } + + int rows_ = 480; + int cols_ = 640; + DepthImage depth_frame_; +}; + +void setImageConstantOnCpu(const float value, DepthImage* depth_frame_ptr) { + // Set every pixel through one access method (2D indexing); the tests read + // the values back through the other (linear indexing). + for (int row_idx = 0; row_idx < depth_frame_ptr->rows(); row_idx++) { + for (int col_idx = 0; col_idx < depth_frame_ptr->cols(); col_idx++) { + (*depth_frame_ptr)(row_idx, col_idx) = value; + } + } +} + +TEST_F(DepthImageTest, Host) { + // Set constant on CPU + setImageConstantOnCpu(1.0, &depth_frame_); + + // Check on the CPU + for (int lin_idx = 0; lin_idx < depth_frame_.numel(); lin_idx++) { + EXPECT_EQ(depth_frame_(lin_idx), 1.0f); + } +} + +TEST_F(DepthImageTest, Device) { + // Set constant on GPU + constexpr float kPixelValue = 1.0f; + test_utils::setImageConstantOnGpu(kPixelValue, &depth_frame_); + + // Check on the CPU + for (int lin_idx = 0; lin_idx < depth_frame_.numel(); lin_idx++) { + EXPECT_EQ(depth_frame_(lin_idx), 1.0f); + } +} + +TEST_F(DepthImageTest, DeviceReduction) { + // Make sure this is deterministic. + std::srand(0); + + // Set constant on CPU + constexpr float kPixelValue = 1.0f; + setImageConstantOnCpu(kPixelValue, &depth_frame_); + + // Change a single value on the CPU + constexpr float kMaxValue = 100.0f; + constexpr float kMinValue = -100.0f; + const Index2D u_max = getRandomPixel(); + Index2D u_min = u_max; + while ((u_min.array() == u_max.array()).all()) { + u_min = getRandomPixel(); + } + depth_frame_(u_max.y(), u_max.x()) = kMaxValue; + depth_frame_(u_min.y(), u_min.x()) = kMinValue; + + // Reduction on the GPU + const float max = image::max(depth_frame_); + const float min = image::min(depth_frame_); + const auto minmax = image::minmax(depth_frame_); + + // Check on the CPU + EXPECT_EQ(max, kMaxValue); + EXPECT_EQ(min, kMinValue); + EXPECT_EQ(minmax.first, kMinValue); + EXPECT_EQ(minmax.second, kMaxValue); +} + +TEST_F(DepthImageTest, LinearInterpolation) { + // The images {depth_frame_col_coords, depth_frame_row_coords} are set up such + // that if you interpolate, you should get the interpolated position back.
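+ // (Pixel-center convention, as inferred from this test: pixel (row, col) is + // centered at u_px = (col + 0.5, row + 0.5), hence the +0.5 offsets below. + // Interpolating the col-coordinate image at u_px should then return exactly + // u_px.x(), and likewise the row-coordinate image returns u_px.y().)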
+ DepthImage depth_frame_col_coords(rows_, cols_, MemoryType::kUnified); + DepthImage depth_frame_row_coords(rows_, cols_, MemoryType::kUnified); + for (int col_idx = 0; col_idx < cols_; col_idx++) { + for (int row_idx = 0; row_idx < rows_; row_idx++) { + depth_frame_col_coords(row_idx, col_idx) = + static_cast(col_idx) + 0.5f; + depth_frame_row_coords(row_idx, col_idx) = + static_cast(row_idx) + 0.5f; + } + } + constexpr int kNumTests = 1000; + // int num_failures = 0; + for (int i = 0; i < kNumTests; i++) { + // Random pixel location on image plane + const Vector2f u_px(test_utils::randomFloatInRange( + 0.5f, static_cast(cols_ - 1) + 0.5f), + test_utils::randomFloatInRange( + 0.5f, static_cast(rows_ - 1) + 0.5f)); + // Interpolate x and y grids + float interpolated_value_col; + EXPECT_TRUE(interpolation::interpolate2DLinear(depth_frame_col_coords, u_px, + &interpolated_value_col)); + float interpolated_value_row; + EXPECT_TRUE(interpolation::interpolate2DLinear(depth_frame_row_coords, u_px, + &interpolated_value_row)); + // Check result + EXPECT_NEAR(interpolated_value_col, u_px.x(), kFloatEpsilon); + EXPECT_NEAR(interpolated_value_row, u_px.y(), kFloatEpsilon); + } +} + +TEST_F(DepthImageTest, DeepCopy) { + // Set constant on CPU + constexpr float kPixelValue = 1.0f; + setImageConstantOnCpu(kPixelValue, &depth_frame_); + + // Copy + DepthImage copy(depth_frame_); + + // Check the copy is actually a copy + for (int lin_idx = 0; lin_idx < copy.numel(); lin_idx++) { + EXPECT_EQ(copy(lin_idx), kPixelValue); + } +} + +TEST_F(DepthImageTest, InterpolationGPU) { + // Tiny images + DepthImage image_x(2, 2, MemoryType::kUnified); + image_x(0, 0) = 0.5f; + image_x(0, 1) = 1.5f; + image_x(1, 0) = 0.5f; + image_x(1, 1) = 1.5f; + DepthImage image_y(2, 2, MemoryType::kUnified); + image_y(0, 0) = 0.5f; + image_y(0, 1) = 0.5f; + image_y(1, 0) = 1.5f; + image_y(1, 1) = 1.5f; + + constexpr int kNumTests = 1000; + std::vector u_px_vec; + u_px_vec.reserve(kNumTests); + for (int i = 0; i < kNumTests; i++) { + u_px_vec.push_back( + Eigen::Vector2f(test_utils::randomFloatInRange(0.5f, 1.5f), + test_utils::randomFloatInRange(0.5f, 1.5f))); + } + + std::vector values_x(kNumTests, 1.0f); + std::vector success_flags_x(kNumTests, 0); + test_utils::linearInterpolateImageGpu(image_x, u_px_vec, &values_x, + &success_flags_x); + std::vector values_y(kNumTests, 1.0f); + std::vector success_flags_y(kNumTests, 0); + test_utils::linearInterpolateImageGpu(image_y, u_px_vec, &values_y, + &success_flags_y); + + for (int i = 0; i < kNumTests; i++) { + EXPECT_TRUE(success_flags_x[i] == 1); + EXPECT_NEAR(values_x[i], u_px_vec[i].x(), kFloatEpsilon); + EXPECT_NEAR(values_y[i], u_px_vec[i].y(), kFloatEpsilon); + } +} + +int main(int argc, char** argv) { + FLAGS_alsologtostderr = true; + google::InitGoogleLogging(argv[0]); + google::InstallFailureSignalHandler(); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/nvblox/tests/test_esdf_integrator.cpp b/nvblox/tests/test_esdf_integrator.cpp new file mode 100644 index 00000000..f8b27d48 --- /dev/null +++ b/nvblox/tests/test_esdf_integrator.cpp @@ -0,0 +1,883 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include +#include + +#include "nvblox/core/cuda/warmup.h" +#include "nvblox/core/indexing.h" +#include "nvblox/core/layer.h" +#include "nvblox/core/types.h" +#include "nvblox/core/voxels.h" +#include "nvblox/integrators/esdf_integrator.h" +#include "nvblox/integrators/projective_tsdf_integrator.h" +#include "nvblox/io/ply_writer.h" +#include "nvblox/io/pointcloud_io.h" +#include "nvblox/primitives/scene.h" + +#include "nvblox/tests/utils.h" + +using namespace nvblox; + +constexpr float kFloatEpsilon = 1e-4; + +// Have various obstacles that the ESDF can try to navigate around. +enum class Obstacle { + kAxisAlignedPlane, // 0 + kAngledPlane, // 1 + kSphereOrigin, // 2 + kBox, // 3 + kBoxWithSphere, // 4 + kBoxWithCube // 5 +}; + +class EsdfIntegratorTest : public ::testing::TestWithParam { + protected: + void SetUp() override; + + void addParameterizedObstacleToScene(const Obstacle& obstacle); + + bool outputFlatSliceEsdfAsPly(const EsdfLayer& layer, + const std::string& ply_path, float height); + bool outputFlatSliceTsdfAsPly(const TsdfLayer& layer, + const std::string& ply_path, float height); + + // Returns PERCENTAGE ABOVE THRESHOLD + float compareEsdfToGt(const EsdfLayer& esdf_layer, const TsdfLayer& gt_layer, + float error_threshold); + + // Returns PERCENTAGE ABOVE THRESHOLD + float compareEsdfToEsdf(const EsdfLayer& esdf_layer, + const EsdfLayer& gt_layer, float error_threshold); + + // Returns if all the voxels in the ESDF are valid. + bool validateEsdf(const EsdfLayer& esdf_layer, + float max_squared_distance_vox); + + bool output_pointclouds_ = false; + float block_size_; + float voxel_size_ = 0.10f; + float max_distance_ = 4.0f; + float very_small_cutoff_ = 2e-3f; + float small_cutoff_ = 2e-2f; + + TsdfLayer::Ptr tsdf_layer_; + TsdfLayer::Ptr gt_sdf_layer_; + EsdfLayer::Ptr esdf_layer_; + + EsdfIntegrator esdf_integrator_; + + // A simulation scene. + primitives::Scene scene_; + + // Camera and TSDF parameters for incremental integration. 
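+ // (These members are only used by the incremental tests below, which render + // depth images and integrate them frame-by-frame; the batch tests generate + // their TSDF directly from the primitive scene instead.)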
+ std::shared_ptr camera_; + ProjectiveTsdfIntegrator tsdf_integrator_; +}; + +void EsdfIntegratorTest::SetUp() { + timing::Timing::Reset(); + std::srand(0); + block_size_ = VoxelBlock::kVoxelsPerSide * voxel_size_; + + tsdf_layer_.reset(new TsdfLayer(voxel_size_, MemoryType::kUnified)); + gt_sdf_layer_.reset(new TsdfLayer(voxel_size_, MemoryType::kUnified)); + esdf_layer_.reset(new EsdfLayer(voxel_size_, MemoryType::kUnified)); + + esdf_integrator_.max_distance_m() = max_distance_; + esdf_integrator_.min_weight() = 1.0f; + + camera_.reset(new Camera(300, 300, 320, 240, 640, 480)); +} + +void EsdfIntegratorTest::addParameterizedObstacleToScene( + const Obstacle& obstacle) { + if (obstacle == Obstacle::kBox || obstacle == Obstacle::kBoxWithCube || + obstacle == Obstacle::kBoxWithSphere) { + scene_.aabb() = AxisAlignedBoundingBox(Vector3f(-5.5f, -5.5f, -0.5f), + Vector3f(5.5f, 5.5f, 5.5f)); + scene_.addPlaneBoundaries(-5.0f, 5.0f, -5.0f, 5.0f); + scene_.addGroundLevel(0.0f); + scene_.addCeiling(5.0f); + } else { + scene_.aabb() = AxisAlignedBoundingBox(Vector3f(-3.0f, -3.0f, 0.0f), + Vector3f(3.0f, 3.0f, 3.0f)); + } + + switch (obstacle) { + case Obstacle::kAxisAlignedPlane: + // Plane at the origin pointing in the -x direction. + scene_.addPrimitive(std::make_unique( + Vector3f(0.05, 0.05, 0.05), Vector3f(-1, 0, 0).normalized())); + break; + case Obstacle::kAngledPlane: + // Plane at the origin pointing in the -x -y direction. + scene_.addPrimitive(std::make_unique( + Vector3f(0.0, 0.0, 0.0), Vector3f(1, 1, 0).normalized())); + break; + case Obstacle::kSphereOrigin: + // Create a scene that's just a sphere + scene_.addPrimitive( + std::make_unique(Vector3f(0.0, 0.0, 0.0), 2.0)); + break; + case Obstacle::kBoxWithSphere: + scene_.addPrimitive(std::make_unique( + Vector3f(0.0f, 0.0f, 2.0f), 2.0f)); + break; + case Obstacle::kBoxWithCube: + scene_.addPrimitive(std::make_unique( + Vector3f(0.0f, 0.0f, 2.0f), Vector3f(2.0f, 2.0f, 2.0f))); + break; + default: + break; + }; +} + +bool EsdfIntegratorTest::outputFlatSliceEsdfAsPly(const EsdfLayer& layer, + const std::string& ply_path, + float height) { + // Create a ply writer object. + io::PlyWriter writer(ply_path); + + // Combine all the voxels in the mesh into a pointcloud. + std::vector points; + std::vector distances; + + const float block_size = layer.block_size(); + const float voxel_size = layer.voxel_size(); + + auto lambda = [&points, &distances, &block_size, &voxel_size, &height]( + const Index3D& block_index, const Index3D& voxel_index, + const EsdfVoxel* voxel) { + if (voxel->observed) { + Vector3f position = getCenterPostionFromBlockIndexAndVoxelIndex( + block_size, block_index, voxel_index); + if (position.z() - height < voxel_size && position.z() >= height) { + points.push_back(position); + float distance = voxel_size * std::sqrt(voxel->squared_distance_vox); + if (voxel->is_inside) { + distance = -distance; + } + distances.push_back(distance); + } + } + }; + + // Call above lambda on every voxel in the layer. + callFunctionOnAllVoxels(layer, lambda); + + // Add the pointcloud to the ply writer. + writer.setPoints(&points); + writer.setIntensities(&distances); + + // Write out the ply. + return writer.write(); +} + +bool EsdfIntegratorTest::outputFlatSliceTsdfAsPly(const TsdfLayer& layer, + const std::string& ply_path, + float height) { + // Create a ply writer object. + io::PlyWriter writer(ply_path); + + // Combine all the voxels in the mesh into a pointcloud. 
+ std::vector points; + std::vector distances; + + const float block_size = layer.block_size(); + const float voxel_size = layer.voxel_size(); + + auto lambda = [&points, &distances, &block_size, &voxel_size, &height]( + const Index3D& block_index, const Index3D& voxel_index, + const TsdfVoxel* voxel) { + if (voxel->weight > 1e-4f) { + Vector3f position = getCenterPostionFromBlockIndexAndVoxelIndex( + block_size, block_index, voxel_index); + if (position.z() - height < voxel_size && position.z() >= height) { + points.push_back(position); + distances.push_back(voxel->distance); + } + } + }; + + // Call above lambda on every voxel in the layer. + callFunctionOnAllVoxels(layer, lambda); + + // Add the pointcloud to the ply writer. + writer.setPoints(&points); + writer.setIntensities(&distances); + + // Write out the ply. + return writer.write(); +} + +float EsdfIntegratorTest::compareEsdfToGt(const EsdfLayer& esdf_layer, + const TsdfLayer& gt_layer, + float error_threshold) { + // Compare the layers + int total_num_voxels_observed = 0; + int num_voxels_over_threshold = 0; + for (const Index3D& block_index : esdf_layer.getAllBlockIndices()) { + const auto block_esdf = esdf_layer.getBlockAtIndex(block_index); + const auto block_gt = gt_layer.getBlockAtIndex(block_index); + if (!block_esdf || !block_gt) { + continue; + } + for (int x = 0; x < VoxelBlock::kVoxelsPerSide; x++) { + for (int y = 0; y < VoxelBlock::kVoxelsPerSide; y++) { + for (int z = 0; z < VoxelBlock::kVoxelsPerSide; z++) { + float distance = + esdf_layer.voxel_size() * + std::sqrt(block_esdf->voxels[x][y][z].squared_distance_vox); + if (block_esdf->voxels[x][y][z].is_inside) { + distance = -distance; + } + float diff = distance - block_gt->voxels[x][y][z].distance; + + if (block_esdf->voxels[x][y][z].observed) { + ++total_num_voxels_observed; + if (std::abs(diff) > error_threshold) { + ++num_voxels_over_threshold; + } + } + } + } + } + } + LOG(INFO) << "Num voxels observed: " << total_num_voxels_observed + << " over threshold: " << num_voxels_over_threshold; + + return static_cast(num_voxels_over_threshold) / + total_num_voxels_observed; +} + +float EsdfIntegratorTest::compareEsdfToEsdf(const EsdfLayer& esdf_layer, + const EsdfLayer& gt_layer, + float error_threshold) { + // Compare the layers + int total_num_voxels_observed = 0; + int num_voxels_over_threshold = 0; + for (const Index3D& block_index : esdf_layer.getAllBlockIndices()) { + const auto block_esdf = esdf_layer.getBlockAtIndex(block_index); + CHECK_NOTNULL(block_esdf.get()); + const auto block_gt = gt_layer.getBlockAtIndex(block_index); + if (!block_esdf || !block_gt) { + continue; + } + for (int x = 0; x < VoxelBlock::kVoxelsPerSide; x++) { + for (int y = 0; y < VoxelBlock::kVoxelsPerSide; y++) { + for (int z = 0; z < VoxelBlock::kVoxelsPerSide; z++) { + float distance = + esdf_layer.voxel_size() * + std::sqrt(block_esdf->voxels[x][y][z].squared_distance_vox); + if (block_esdf->voxels[x][y][z].is_inside) { + distance = -distance; + } + float distance_gt = + esdf_layer.voxel_size() * + std::sqrt(block_gt->voxels[x][y][z].squared_distance_vox); + if (block_gt->voxels[x][y][z].is_inside) { + distance_gt = -distance_gt; + } + float diff = distance - distance_gt; + + if (block_esdf->voxels[x][y][z].observed) { + ++total_num_voxels_observed; + if (std::abs(diff) > error_threshold) { + ++num_voxels_over_threshold; + } + } + } + } + } + } + LOG(INFO) << "Num ESDF voxels observed: " << total_num_voxels_observed + << " over threshold: " << num_voxels_over_threshold; + + return 
static_cast<float>(num_voxels_over_threshold) / + total_num_voxels_observed; +} + +bool EsdfIntegratorTest::validateEsdf(const EsdfLayer& esdf_layer, + float max_squared_distance_vox) { + constexpr float kTolerance = 1e-4; + constexpr int kVoxelsPerSide = VoxelBlock<EsdfVoxel>::kVoxelsPerSide; + for (const Index3D& block_index : esdf_layer.getAllBlockIndices()) { + const auto block = esdf_layer.getBlockAtIndex(block_index); + if (!block) { + continue; + } + for (int x = 0; x < kVoxelsPerSide; x++) { + for (int y = 0; y < kVoxelsPerSide; y++) { + for (int z = 0; z < kVoxelsPerSide; z++) { + const EsdfVoxel& voxel = block->voxels[x][y][z]; + + if (!voxel.observed) { + continue; + } + // If the voxel is a site, its parent must be 0, 0, 0 and vice versa. + if (voxel.parent_direction == Index3D::Zero()) { + if (!voxel.is_site && + voxel.squared_distance_vox < max_squared_distance_vox && + voxel.squared_distance_vox >= kTolerance) { + LOG(ERROR) << "Wrong distance for zero parent voxel! " + << voxel.parent_direction.transpose() << " " + << voxel.squared_distance_vox << "/" + << max_squared_distance_vox; + return false; + } + } else { + // First check that the distance matches. + if (voxel.squared_distance_vox < max_squared_distance_vox && + voxel.squared_distance_vox - + voxel.parent_direction.squaredNorm() >= + kTolerance) { + LOG(ERROR) << "Distance doesn't match parent direction."; + return false; + } + + // If a voxel has a parent direction, it should definitely point to + // a site voxel. + + // Get the parent. + Index3D parent_index = Index3D(x, y, z) + voxel.parent_direction; + + // Check if the voxel is within the same block. + if (parent_index.x() < 0 || parent_index.x() >= kVoxelsPerSide || + parent_index.y() < 0 || parent_index.y() >= kVoxelsPerSide || + parent_index.z() < 0 || parent_index.z() >= kVoxelsPerSide) { + // Then we need to get the block index. + Index3D neighbor_block_index = block_index; + + // Find the parent index. + while (parent_index.x() >= kVoxelsPerSide) { + parent_index.x() -= kVoxelsPerSide; + neighbor_block_index.x()++; + } + while (parent_index.y() >= kVoxelsPerSide) { + parent_index.y() -= kVoxelsPerSide; + neighbor_block_index.y()++; + } + while (parent_index.z() >= kVoxelsPerSide) { + parent_index.z() -= kVoxelsPerSide; + neighbor_block_index.z()++; + } + + while (parent_index.x() < 0) { + parent_index.x() += kVoxelsPerSide; + neighbor_block_index.x()--; + } + while (parent_index.y() < 0) { + parent_index.y() += kVoxelsPerSide; + neighbor_block_index.y()--; + } + while (parent_index.z() < 0) { + parent_index.z() += kVoxelsPerSide; + neighbor_block_index.z()--; + } + + EsdfBlock::ConstPtr neighbor_block = + esdf_layer.getBlockAtIndex(neighbor_block_index); + if (!neighbor_block) { + LOG(ERROR) << "Neighbor block does not exist!"; + return false; + } + const EsdfVoxel* neighbor_voxel = + &neighbor_block->voxels[parent_index.x()][parent_index.y()] + [parent_index.z()]; + if (!neighbor_voxel->is_site) { + LOG(ERROR) << "Parent in different block but is not a site!"; + LOG(ERROR) << "Block index: " + << neighbor_block_index.transpose() + << " voxel index: " << parent_index.transpose(); + return false; + } + } else { + // Ok, check if the parent index is a site.
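+ // (Invariant being checked: following parent_direction from any observed + // non-site voxel must land on a voxel with is_site == true, whether that + // parent lives in a neighboring block, handled just above, or in the same + // block, handled here.)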
+ if (!block + ->voxels[parent_index.x()][parent_index.y()] + [parent_index.z()] + .is_site) { + LOG(ERROR) << "Parent within the same block but is not a site!"; + return false; + } + } + } + if (voxel.is_site) { + if (voxel.parent_direction != Index3D::Zero()) { + LOG(ERROR) << "Site voxel has a parent!"; + return false; + } + if (voxel.squared_distance_vox >= kTolerance) { + LOG(ERROR) << "Site voxel has non-zero distance! " + << voxel.squared_distance_vox; + return false; + } + } + } + } + } + } + return true; +} + +TEST_P(EsdfIntegratorTest, SingleEsdfTestCPU) { + // Create a scene that's just a plane. + addParameterizedObstacleToScene(GetParam()); + + // Generate a TSDF + scene_.generateSdfFromScene(4 * voxel_size_, tsdf_layer_.get()); + // Get the full ground truth ESDF + scene_.generateSdfFromScene(max_distance_, gt_sdf_layer_.get()); + + // Actually run the ESDF generation. + std::vector block_indices = tsdf_layer_->getAllBlockIndices(); + esdf_integrator_.integrateBlocksOnCPU(*tsdf_layer_, block_indices, + esdf_layer_.get()); + + // Compare the results to GT. + EXPECT_LE(compareEsdfToGt(*esdf_layer_, *gt_sdf_layer_, voxel_size_), + very_small_cutoff_); + + // Check if all parents point the correct directions, etc. + EXPECT_TRUE(validateEsdf( + *esdf_layer_, esdf_integrator_.max_squared_distance_vox(voxel_size_))); + + float slice_height = 1.0f; + std::string obstacle_string = std::to_string(static_cast(GetParam())); + outputFlatSliceEsdfAsPly( + *esdf_layer_, "test_esdf_cpu_" + obstacle_string + "_esdf_slice.ply", + 1.0f); + outputFlatSliceTsdfAsPly( + *tsdf_layer_, "test_esdf_cpu_" + obstacle_string + "_tsdf_slice.ply", + 1.0f); + + if (output_pointclouds_) { + io::outputVoxelLayerToPly(*esdf_layer_, + "test_esdf_cpu_" + obstacle_string + "_esdf.ply"); + io::outputVoxelLayerToPly(*gt_sdf_layer_, + "test_esdf_cpu_" + obstacle_string + "_gt.ply"); + io::outputVoxelLayerToPly(*tsdf_layer_, + "test_esdf_cpu_" + obstacle_string + "_tsdf.ply"); + } + std::cout << timing::Timing::Print(); +} + +TEST_P(EsdfIntegratorTest, SingleEsdfTestGPU) { + // Create a scene that's just an object. + addParameterizedObstacleToScene(GetParam()); + + // Generate a TSDF + scene_.generateSdfFromScene(4 * voxel_size_, tsdf_layer_.get()); + // Get the full ground truth ESDF + scene_.generateSdfFromScene(max_distance_, gt_sdf_layer_.get()); + + // Actually run the ESDF generation. + std::vector block_indices = tsdf_layer_->getAllBlockIndices(); + esdf_integrator_.integrateBlocksOnGPU(*tsdf_layer_, block_indices, + esdf_layer_.get()); + + // Compare the results to GT. + EXPECT_LE(compareEsdfToGt(*esdf_layer_, *gt_sdf_layer_, voxel_size_), + very_small_cutoff_); + + // Check if all parents point the correct directions, etc. + EXPECT_TRUE(validateEsdf( + *esdf_layer_, esdf_integrator_.max_squared_distance_vox(voxel_size_))); + + if (output_pointclouds_) { + std::string obstacle_string = std::to_string(static_cast(GetParam())); + io::outputVoxelLayerToPly(*esdf_layer_, + "test_esdf_gpu_" + obstacle_string + "_esdf.ply"); + io::outputVoxelLayerToPly(*gt_sdf_layer_, + "test_esdf_gpu_" + obstacle_string + "_gt.ply"); + io::outputVoxelLayerToPly(*tsdf_layer_, + "test_esdf_gpu_" + obstacle_string + "_tsdf.ply"); + } + std::cout << timing::Timing::Print(); +} + +TEST_P(EsdfIntegratorTest, ComplexSceneWithTsdf) { + constexpr float kTrajectoryRadius = 4.0f; + constexpr float kTrajectoryHeight = 2.0f; + constexpr int kNumTrajectoryPoints = 80; + + // Maximum distance to consider for scene generation. 
+ constexpr float kMaxDist = 15.0; + + Obstacle obstacle = GetParam(); + // Skip the non-box cases: + if (obstacle != Obstacle::kBoxWithSphere && + obstacle != Obstacle::kBoxWithCube && obstacle != Obstacle::kBox) { + return; + } + addParameterizedObstacleToScene(obstacle); + + // Get the ground truth SDF for it. + scene_.generateSdfFromScene(max_distance_, gt_sdf_layer_.get()); + + // Set up the integrator. + tsdf_integrator_.max_integration_distance_m(kMaxDist); + + // Simulate a trajectory of the requisite amount of points, on the circle + // around the sphere. + const float radians_increment = 2 * M_PI / (kNumTrajectoryPoints); + + // Create a depth frame. We share this memory buffer for the entire + // trajectory. + DepthImage depth_image(camera_->height(), camera_->width(), + MemoryType::kUnified); + + for (size_t i = 0; i < kNumTrajectoryPoints; i++) { + const float theta = radians_increment * i; + // Convert polar to cartesian coordinates. + Vector3f cartesian_coordinates(kTrajectoryRadius * std::cos(theta), + kTrajectoryRadius * std::sin(theta), + kTrajectoryHeight); + // The camera has its z axis pointing towards the origin. + Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5); + Eigen::Quaternionf rotation_theta( + Eigen::AngleAxisf(M_PI + theta, Vector3f::UnitZ())); + + // Construct a transform from camera to scene with this. + Transform T_S_C = Transform::Identity(); + T_S_C.prerotate(rotation_theta * rotation_base); + T_S_C.pretranslate(cartesian_coordinates); + + // Generate a depth image of the scene. + scene_.generateDepthImageFromScene(*camera_, T_S_C, kMaxDist, &depth_image); + + // Integrate this depth image. + tsdf_integrator_.integrateFrame(depth_image, T_S_C, *camera_, + tsdf_layer_.get()); + } + + // Actually run the ESDF generation. + esdf_integrator_.integrateLayer(*tsdf_layer_, esdf_layer_.get()); + + // Compare the results to GT. + // Allow within 1 voxel sizes. + EXPECT_LE(compareEsdfToGt(*esdf_layer_, *gt_sdf_layer_, 4 * voxel_size_), + 0.20); + + EXPECT_TRUE(validateEsdf( + *esdf_layer_, esdf_integrator_.max_squared_distance_vox(voxel_size_))); + + if (output_pointclouds_) { + // Output the layer for inspection. + outputFlatSliceEsdfAsPly(*esdf_layer_, "test_esdf_slice_complex.ply", 1.0f); + outputFlatSliceTsdfAsPly(*gt_sdf_layer_, "test_esdf_slice_complex_gt.ply", + 1.0f); + io::outputVoxelLayerToPly(*gt_sdf_layer_, + "test_esdf_slice_complex_gt_full.ply"); + } + + std::cout << timing::Timing::Print(); +} + +TEST_P(EsdfIntegratorTest, IncrementalTsdfAndEsdfWithObjectRemovalGPU) { + constexpr float kTrajectoryRadius = 4.0f; + constexpr float kTrajectoryHeight = 2.0f; + constexpr int kNumTrajectoryPoints = 1; + + // Maximum distance to consider for scene generation. + constexpr float kMaxDist = 15.0; + + // Create a batch layer to batch to. + EsdfLayer esdf_layer_batch(voxel_size_, MemoryType::kUnified); + + Obstacle obstacle = GetParam(); + // Skip the non-box cases: + if (obstacle != Obstacle::kBoxWithSphere && + obstacle != Obstacle::kBoxWithCube && obstacle != Obstacle::kBox) { + return; + } + addParameterizedObstacleToScene(obstacle); + + // Set up the integrator. + tsdf_integrator_.max_integration_distance_m(kMaxDist); + + // Simulate a trajectory of the requisite amount of points, on the circle + // around the sphere. + const float radians_increment = 2 * M_PI / (kNumTrajectoryPoints); + + // Create a depth frame. We share this memory buffer for the entire + // trajectory. 
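+ // (Reusing one unified-memory image means the loop below does no per-frame + // allocation; the assumption here is that generateDepthImageFromScene() + // overwrites the same buffer in place each iteration, which matches how the + // buffer is passed by pointer.)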
+ DepthImage depth_image(camera_->height(), camera_->width(), + MemoryType::kUnified); + + for (size_t i = 0; i < kNumTrajectoryPoints * 2; i++) { + if (i == kNumTrajectoryPoints) { + // Clear the scene. + scene_.clear(); + addParameterizedObstacleToScene(Obstacle::kBox); + } + const float theta = radians_increment * i; + // Convert polar to cartesian coordinates. + Vector3f cartesian_coordinates(kTrajectoryRadius * std::cos(theta), + kTrajectoryRadius * std::sin(theta), + kTrajectoryHeight); + // The camera has its z axis pointing towards the origin. + Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5); + Eigen::Quaternionf rotation_theta( + Eigen::AngleAxisf(M_PI + theta, Vector3f::UnitZ())); + + // Construct a transform from camera to scene with this. + Transform T_S_C = Transform::Identity(); + T_S_C.prerotate(rotation_theta * rotation_base); + T_S_C.pretranslate(cartesian_coordinates); + + // Generate a depth image of the scene. + scene_.generateDepthImageFromScene(*camera_, T_S_C, kMaxDist, &depth_image); + + // Integrate this depth image. + std::vector updated_blocks; + tsdf_integrator_.integrateFrame(depth_image, T_S_C, *camera_, + tsdf_layer_.get(), &updated_blocks); + + // Run incremental ESDF generation. + esdf_integrator_.integrateBlocksOnGPU(*tsdf_layer_, updated_blocks, + esdf_layer_.get()); + } + + // Run batch ESDF generation to compare to. + esdf_integrator_.integrateLayer(*tsdf_layer_, &esdf_layer_batch); + + // Compare results to each other. Should really be basically identical. + EXPECT_LE(compareEsdfToEsdf(*esdf_layer_, esdf_layer_batch, voxel_size_), + small_cutoff_); + + EXPECT_TRUE(validateEsdf( + *esdf_layer_, esdf_integrator_.max_squared_distance_vox(voxel_size_))); + + if (output_pointclouds_) { + float slice_height = 3.75f; + std::string obstacle_string = std::to_string(static_cast(obstacle)); + outputFlatSliceEsdfAsPly( + *esdf_layer_, "test_incremental_" + obstacle_string + "_esdf.ply", + slice_height); + outputFlatSliceEsdfAsPly( + esdf_layer_batch, + "test_incremental_" + obstacle_string + "_esdf_batch.ply", + slice_height); + io::outputVoxelLayerToPly( + *esdf_layer_, "test_incremental_" + obstacle_string + "_esdf_full.ply"); + } + std::cout << timing::Timing::Print(); +} + +TEST_P(EsdfIntegratorTest, IncrementalEsdf2DWithObjectRemoval) { + // Create a batch layer to batch to. + EsdfLayer esdf_layer_batch(voxel_size_, MemoryType::kUnified); + + Obstacle obstacle = GetParam(); + // Skip the non-box cases: + if (obstacle != Obstacle::kBoxWithSphere && + obstacle != Obstacle::kBoxWithCube && obstacle != Obstacle::kBox) { + return; + } + addParameterizedObstacleToScene(obstacle); + + AxisAlignedBoundingBox aabb(Vector3f(-5.0f, -5.0f, 1.0f - voxel_size_ / 2.0f), + Vector3f(5.0f, 5.0f, 1.0f + voxel_size_ / 2.0f)); + scene_.aabb() = aabb; + + // Get the ground truth SDF for it. + scene_.generateSdfFromScene(max_distance_, tsdf_layer_.get()); + + for (size_t i = 0; i < 2; i++) { + if (i == 1) { + tsdf_layer_->clear(); + // Clear the scene. + scene_.clear(); + addParameterizedObstacleToScene(Obstacle::kBox); + scene_.aabb() = aabb; + scene_.generateSdfFromScene(max_distance_, tsdf_layer_.get()); + } + + // Get all blocks from the tsdf layer. + std::vector updated_blocks; + updated_blocks = tsdf_layer_->getAllBlockIndices(); + + // Run incremental ESDF generation. + esdf_integrator_.integrateBlocksOnGPU(*tsdf_layer_, updated_blocks, + esdf_layer_.get()); + } + + // Run batch ESDF generation to compare to. 
+ esdf_integrator_.integrateLayer(*tsdf_layer_, &esdf_layer_batch); + + // Compare results to each other. Should really be basically identical. + EXPECT_LE(compareEsdfToEsdf(*esdf_layer_, esdf_layer_batch, voxel_size_), + small_cutoff_); + + EXPECT_TRUE(validateEsdf( + *esdf_layer_, esdf_integrator_.max_squared_distance_vox(voxel_size_))); + if (output_pointclouds_) { + float slice_height = 1.0f; + std::string obstacle_string = std::to_string(static_cast(obstacle)); + outputFlatSliceEsdfAsPly( + *esdf_layer_, "test_2d_" + obstacle_string + "_esdf.ply", slice_height); + outputFlatSliceEsdfAsPly(esdf_layer_batch, + "test_2d_" + obstacle_string + "_esdf_batch.ply", + slice_height); + outputFlatSliceTsdfAsPly(*tsdf_layer_, + "test_2d_" + obstacle_string + "_tsdf_batch.ply", + slice_height); + } + std::cout << timing::Timing::Print(); +} + +TEST_P(EsdfIntegratorTest, IncrementalEsdfSliceWithObjectRemovalGPU) { + // Create a batch layer to batch to. + EsdfLayer esdf_layer_batch(voxel_size_, MemoryType::kUnified); + + Obstacle obstacle = GetParam(); + // Skip the non-box cases: + if (obstacle != Obstacle::kBoxWithSphere && + obstacle != Obstacle::kBoxWithCube && obstacle != Obstacle::kBox) { + return; + } + + float min_z = 1.0f; + float max_z = 2.0f; + float output_z = 1.5f; + AxisAlignedBoundingBox aabb( + Vector3f(-5.0f, -5.0f, output_z - voxel_size_ / 2.0f), + Vector3f(5.0f, 5.0f, output_z + voxel_size_ / 2.0f)); + float tsdf_truncation_distance = 4 * voxel_size_; + + addParameterizedObstacleToScene(obstacle); + + scene_.aabb() = aabb; + + // Get the ground truth SDF for it. + scene_.generateSdfFromScene(tsdf_truncation_distance, tsdf_layer_.get()); + + for (size_t i = 0; i < 2; i++) { + if (i == 1) { + tsdf_layer_->clear(); + // Clear the scene. + scene_.clear(); + addParameterizedObstacleToScene(Obstacle::kBox); + scene_.aabb() = aabb; + scene_.generateSdfFromScene(tsdf_truncation_distance, tsdf_layer_.get()); + } + + // Get all blocks from the tsdf layer. + std::vector updated_blocks; + updated_blocks = tsdf_layer_->getAllBlockIndices(); + + // Run incremental ESDF generation. + esdf_integrator_.integrateSliceOnGPU(*tsdf_layer_, updated_blocks, min_z, + max_z, output_z, esdf_layer_.get()); + } + + // Run batch ESDF generation to compare to. + esdf_integrator_.integrateLayer(*tsdf_layer_, &esdf_layer_batch); + + // Compare results to each other. Should really be basically identical. + EXPECT_LE(compareEsdfToEsdf(*esdf_layer_, esdf_layer_batch, voxel_size_), + small_cutoff_); + + EXPECT_TRUE(validateEsdf( + *esdf_layer_, esdf_integrator_.max_squared_distance_vox(voxel_size_))); + if (output_pointclouds_) { + float slice_height = output_z; + std::string obstacle_string = std::to_string(static_cast(obstacle)); + io::outputVoxelLayerToPly( + *esdf_layer_, "test_slice_" + obstacle_string + "_esdf_full.ply"); + outputFlatSliceEsdfAsPly( + esdf_layer_batch, "test_slice_" + obstacle_string + "_esdf_batch.ply", + slice_height); + io::outputVoxelLayerToPly( + *tsdf_layer_, "test_slice_" + obstacle_string + "_tsdf_batch.ply"); + } + std::cout << timing::Timing::Print(); +} + +TEST_P(EsdfIntegratorTest, IncrementalEsdfWithObjectRemoval) { + // Create a batch layer to batch to. 
+ EsdfLayer esdf_layer_batch(voxel_size_, MemoryType::kUnified); + + Obstacle obstacle = GetParam(); + // Skip the non-box cases: + if (obstacle != Obstacle::kBoxWithSphere && + obstacle != Obstacle::kBoxWithCube && obstacle != Obstacle::kBox) { + return; + } + addParameterizedObstacleToScene(obstacle); + + // Get the ground truth SDF for it. + scene_.generateSdfFromScene(max_distance_, tsdf_layer_.get()); + + for (size_t i = 0; i < 2; i++) { + if (i == 1) { + // Clear the scene. + scene_.clear(); + addParameterizedObstacleToScene(Obstacle::kBox); + + scene_.generateSdfFromScene(max_distance_, tsdf_layer_.get()); + } + + // Get all blocks from the tsdf layer. + std::vector updated_blocks; + updated_blocks = tsdf_layer_->getAllBlockIndices(); + + // Run incremental ESDF generation. + esdf_integrator_.integrateBlocksOnGPU(*tsdf_layer_, updated_blocks, + esdf_layer_.get()); + } + + // Run batch ESDF generation to compare to. + esdf_integrator_.integrateLayer(*tsdf_layer_, &esdf_layer_batch); + + // Compare results to each other. Should really be basically identical. + EXPECT_LE(compareEsdfToEsdf(*esdf_layer_, esdf_layer_batch, voxel_size_), + small_cutoff_); + + EXPECT_TRUE(validateEsdf( + *esdf_layer_, esdf_integrator_.max_squared_distance_vox(voxel_size_))); + + if (output_pointclouds_) { + float slice_height = 3.5f; + std::string obstacle_string = std::to_string(static_cast(obstacle)); + outputFlatSliceEsdfAsPly(*esdf_layer_, + "test_object_" + obstacle_string + "_esdf.ply", + slice_height); + outputFlatSliceEsdfAsPly( + esdf_layer_batch, "test_object_" + obstacle_string + "_esdf_batch.ply", + slice_height); + io::outputVoxelLayerToPly( + *esdf_layer_, "test_object_" + obstacle_string + "_esdf_full.ply"); + } + std::cout << timing::Timing::Print(); +} + +INSTANTIATE_TEST_CASE_P( + ParameterizedEsdfTests, EsdfIntegratorTest, + ::testing::Values(Obstacle::kAxisAlignedPlane, Obstacle::kAngledPlane, + Obstacle::kSphereOrigin, Obstacle::kBox, + Obstacle::kBoxWithSphere, Obstacle::kBoxWithCube)); + +int main(int argc, char** argv) { + warmupCuda(); + google::InitGoogleLogging(argv[0]); + FLAGS_alsologtostderr = true; + google::InstallFailureSignalHandler(); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/nvblox/tests/test_for_memory_leaks.cpp b/nvblox/tests/test_for_memory_leaks.cpp new file mode 100644 index 00000000..0252e80f --- /dev/null +++ b/nvblox/tests/test_for_memory_leaks.cpp @@ -0,0 +1,159 @@ +/* +Copyright 2022 NVIDIA CORPORATION + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +#include <gtest/gtest.h> + +#include "nvblox/core/camera.h" +#include "nvblox/core/common_names.h" +#include "nvblox/core/image.h" +#include "nvblox/datasets/image_loader.h" +#include "nvblox/datasets/parse_3dmatch.h" +#include "nvblox/integrators/projective_tsdf_integrator.h" +#include "nvblox/mesh/mesh_integrator.h" + +#include "nvblox/tests/test_utils_cuda.h" + +using namespace nvblox; + +std::pair<int, float> getFreeGPUMemory() { + size_t free_bytes; + size_t total_bytes; + cudaMemGetInfo(&free_bytes, &total_bytes); + const int free_mb = static_cast<int>(free_bytes / 1e6); + const int total_mb = static_cast<int>(total_bytes / 1e6); + const float free_percent = + static_cast<float>(free_mb) * 100.0f / static_cast<float>(total_mb); + return {free_mb, free_percent}; +} + +TEST(MemoryLeakTest, UnifiedVectorInt) { + // Allocate a bunch of data + unified_vector<int> vec; + constexpr int kNumTestingRounds = 10; + constexpr int kMegaBytesPerRound = 100; + constexpr int kNumVariablesToAddPerRound = + static_cast<int>(kMegaBytesPerRound * 1e6 / sizeof(int)); + int start_free_gpu_memory_mb; + std::tie(start_free_gpu_memory_mb, std::ignore) = getFreeGPUMemory(); + for (int test_idx = 0; test_idx < kNumTestingRounds; test_idx++) { + for (int i = 0; i < kNumVariablesToAddPerRound; i++) { + vec.push_back(i); + } + test_utils::addOneToAllGPU(&vec); + // Check (only on the final round) + if (test_idx == (kNumTestingRounds - 1)) { + for (int i = 0; i < kNumVariablesToAddPerRound; i++) { + EXPECT_EQ(vec[i], i + 1); + } + } + vec.clear(); + // Debug + float free_gpu_memory_percent; + std::tie(std::ignore, free_gpu_memory_percent) = getFreeGPUMemory(); + std::cout << "Percentage free: " << free_gpu_memory_percent << "%" + << std::endl; + } + int end_free_gpu_memory_mb; + std::tie(end_free_gpu_memory_mb, std::ignore) = getFreeGPUMemory(); + + // Check that there's approximately the same amount left over at the end. + int reduction_in_free_gpu_memory_mb = + start_free_gpu_memory_mb - end_free_gpu_memory_mb; + std::cout << "(Int) Memory difference: " << reduction_in_free_gpu_memory_mb + << std::endl; + + // Check that memory isn't depleting. + // NOTE(alexmillane): We don't know what else is going on on the GPU but + // hopefully nothing that allocates 100 MB during this test run... + constexpr int kMaxAllowableMemoryReductionMb = 100; + // EXPECT_LT(reduction_in_free_gpu_memory_mb, kMaxAllowableMemoryReductionMb); +} + +TEST(MemoryLeakTest, 3DMatchMeshing) { + // Load 3dmatch image + const std::string base_path = "data/3dmatch"; + constexpr int seq_id = 1; + DepthImage depth_image_1; + ColorImage color_image_1; + EXPECT_TRUE(datasets::load16BitDepthImage( + datasets::threedmatch::getPathForDepthImage(base_path, seq_id, 0), + &depth_image_1)); + EXPECT_TRUE(datasets::load8BitColorImage( + datasets::threedmatch::getPathForColorImage(base_path, seq_id, 0), + &color_image_1)); + EXPECT_EQ(depth_image_1.width(), color_image_1.width()); + EXPECT_EQ(depth_image_1.height(), color_image_1.height()); + + // Parse 3x3 camera intrinsics matrix from 3D Match format: space-separated.
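+ // (For reference, the expected pinhole layout is the standard + // K = [fu 0 cu; 0 fv cv; 0 0 1], + // which Camera::fromIntrinsicsMatrix() consumes below.)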
+  Eigen::Matrix3f camera_intrinsic_matrix;
+  EXPECT_TRUE(datasets::threedmatch::parseCameraFromFile(
+      datasets::threedmatch::getPathForCameraIntrinsics(base_path),
+      &camera_intrinsic_matrix));
+  const auto camera = Camera::fromIntrinsicsMatrix(
+      camera_intrinsic_matrix, depth_image_1.width(), depth_image_1.height());
+
+  // Params
+  constexpr float kVoxelSizeM = 0.05f;
+  const float kBlockSizeM =
+      VoxelBlock<TsdfVoxel>::kVoxelsPerSide * kVoxelSizeM;
+  ProjectiveTsdfIntegrator tsdf_integrator;
+
+  // Memory start
+  int start_free_gpu_memory_mb;
+  std::tie(start_free_gpu_memory_mb, std::ignore) = getFreeGPUMemory();
+
+  constexpr int kNumRounds = 100;
+  // Generate a mesh
+  MeshIntegrator mesh_integrator;
+  MeshLayer mesh_layer_gpu(kBlockSizeM, MemoryType::kUnified);
+  for (int i = 0; i < kNumRounds; i++) {
+    std::cout << "i: " << i << std::endl;
+    // Integrate depth
+    TsdfLayer tsdf_layer(kVoxelSizeM, MemoryType::kDevice);
+    tsdf_integrator.integrateFrame(depth_image_1, Transform::Identity(),
+                                   camera, &tsdf_layer);
+
+    EXPECT_TRUE(mesh_integrator.integrateMeshFromDistanceField(
+        tsdf_layer, &mesh_layer_gpu));
+    tsdf_layer.clear();
+    mesh_layer_gpu.clear();
+    // Debug
+    float free_gpu_memory_percent;
+    std::tie(std::ignore, free_gpu_memory_percent) = getFreeGPUMemory();
+    std::cout << "Percentage free: " << free_gpu_memory_percent << "%"
+              << std::endl;
+  }
+  int end_free_gpu_memory_mb;
+  std::tie(end_free_gpu_memory_mb, std::ignore) = getFreeGPUMemory();
+
+  // Check that approximately the same amount is left over at the end.
+  int reduction_in_free_gpu_memory_mb =
+      start_free_gpu_memory_mb - end_free_gpu_memory_mb;
+  std::cout << "(3D Matching) Memory difference: "
+            << reduction_in_free_gpu_memory_mb << std::endl;
+
+  // Check that memory isn't depleting.
+  // NOTE(alexmillane): We don't know what else is going on on the GPU, but
+  // hopefully nothing that allocates 100 MB during this test run...
+  constexpr int kMaxAllowableMemoryReductionMb = 100;
+  // EXPECT_LT(reduction_in_free_gpu_memory_mb, kMaxAllowableMemoryReductionMb);
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/nvblox/tests/test_frustum.cpp b/nvblox/tests/test_frustum.cpp
new file mode 100644
index 00000000..ecad5e31
--- /dev/null
+++ b/nvblox/tests/test_frustum.cpp
@@ -0,0 +1,463 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+#include <algorithm>
+
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/core/indexing.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/voxels.h"
+#include "nvblox/datasets/image_loader.h"
+#include "nvblox/datasets/parse_3dmatch.h"
+#include "nvblox/integrators/frustum.h"
+#include "nvblox/integrators/ray_caster.h"
+#include "nvblox/io/pointcloud_io.h"
+#include "nvblox/primitives/scene.h"
+#include "nvblox/utils/timing.h"
+
+using namespace nvblox;
+
+class FrustumTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    timing::Timing::Reset();
+    std::srand(0);
+    block_size_ = VoxelBlock<TsdfVoxel>::kVoxelsPerSide * voxel_size_;
+
+    // Make the scene 6x6x3 meters big.
+    scene_.aabb() = AxisAlignedBoundingBox(Vector3f(-3.0f, -3.0f, 0.0f),
+                                           Vector3f(3.0f, 3.0f, 3.0f));
+
+    // Arbitrary camera
+    constexpr float fu = 300;
+    constexpr float fv = 300;
+    constexpr int width = 640;
+    constexpr int height = 480;
+    constexpr float cu = static_cast<float>(width) / 2.0f;
+    constexpr float cv = static_cast<float>(height) / 2.0f;
+    camera_.reset(new Camera(fu, fv, cu, cv, width, height));
+
+    base_path_ = "./data/3dmatch/";
+  }
+
+  static constexpr float kFloatEpsilon = 1e-4;
+
+  float block_size_;
+  float voxel_size_ = 0.05;
+
+  // A simulation scene.
+  primitives::Scene scene_;
+
+  // A simulation camera.
+  std::unique_ptr<Camera> camera_;
+
+  // Base path for 3D Match dataset.
+  std::string base_path_;
+
+  FrustumCalculator frustum_;
+};
+
+TEST_F(FrustumTest, FarPlaneImageTest) {
+  // We create a scene that is a flat plane 10 meters from the origin.
+  constexpr float kPlaneDistance = 10.0f;
+  float max_distance = kPlaneDistance;
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(kPlaneDistance, 0.0, 0.0), Vector3f(-1, 0, 0)));
+
+  // Create a pose at the origin looking forward.
+  Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5);
+  Transform T_S_C = Transform::Identity();
+  T_S_C.prerotate(rotation_base);
+
+  // Generate a depth frame with max distance == plane distance.
+  DepthImage depth_frame(camera_->height(), camera_->width(),
+                         MemoryType::kUnified);
+  scene_.generateDepthImageFromScene(*camera_, T_S_C, 2 * kPlaneDistance,
+                                     &depth_frame);
+
+  // Figure out what the GT should be.
+  std::vector<Index3D> blocks_in_view = FrustumCalculator::getBlocksInView(
+      T_S_C, *camera_, block_size_, max_distance);
+  std::vector<Index3D> blocks_in_image_view =
+      FrustumCalculator::getBlocksInImageView(depth_frame, T_S_C, *camera_,
+                                              block_size_, 0.0f, max_distance);
+  EXPECT_EQ(blocks_in_view.size(), blocks_in_image_view.size());
+
+  // Now get the actual thing to test.
+  std::vector<Index3D> blocks_in_cuda_view = frustum_.getBlocksInImageViewCuda(
+      depth_frame, T_S_C, *camera_, block_size_, 0.0f,
+      max_distance + kFloatEpsilon);
+
+  // Sort all of the entries.
+  std::sort(blocks_in_view.begin(), blocks_in_view.end(),
+            VectorCompare<Index3D>());
+  std::sort(blocks_in_image_view.begin(), blocks_in_image_view.end(),
+            VectorCompare<Index3D>());
+  std::sort(blocks_in_cuda_view.begin(), blocks_in_cuda_view.end(),
+            VectorCompare<Index3D>());
+
+  size_t min_size =
+      std::min(blocks_in_cuda_view.size(), blocks_in_image_view.size());
+  for (size_t i = 0; i < min_size; i++) {
+    EXPECT_EQ(blocks_in_view[i], blocks_in_image_view[i]);
+  }
+
+  // We will now raycast through every single pixel in the original image.
+  for (int u = 0; u < camera_->rows(); u++) {
+    for (int v = 0; v < camera_->cols(); v++) {
+      // Get the depth at this image point.
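+      // (Note the indexing convention here: images are accessed as
+      // (row, column), so u iterates rows and v columns, while Index2D takes
+      // (col, row).)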
+      float depth = depth_frame(u, v);
+      Vector3f p_C = depth * camera_->rayFromPixelIndices(Index2D(v, u));
+      Vector3f p_L = T_S_C * p_C;
+
+      Index3D block_index = getBlockIndexFromPositionInLayer(block_size_, p_L);
+
+      EXPECT_TRUE(std::binary_search(blocks_in_cuda_view.begin(),
+                                     blocks_in_cuda_view.end(), block_index,
+                                     VectorCompare<Index3D>()))
+          << block_index;
+      // Now raycast back to the camera origin, checking that every block
+      // along the ray is present.
+      RayCaster raycaster(T_S_C.translation() / block_size_,
+                          p_L / block_size_);
+      Index3D ray_index = Index3D::Zero();
+      while (raycaster.nextRayIndex(&ray_index)) {
+        EXPECT_TRUE(std::binary_search(blocks_in_cuda_view.begin(),
+                                       blocks_in_cuda_view.end(), ray_index,
+                                       VectorCompare<Index3D>()))
+            << ray_index;
+      }
+    }
+  }
+
+  std::cout << timing::Timing::Print();
+}
+
+TEST_F(FrustumTest, PlaneWithGround) {
+  // We create a scene that is a flat plane 10 meters from the origin.
+  constexpr float kPlaneDistance = 10.0f;
+  float max_distance = kPlaneDistance;
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(kPlaneDistance, 0.0, 0.0), Vector3f(-1, 0, 0)));
+  scene_.addGroundLevel(-1.0f);
+
+  // Create a pose at the origin looking forward.
+  Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5);
+  Transform T_S_C = Transform::Identity();
+  T_S_C.prerotate(rotation_base);
+
+  // Generate a depth frame with max distance == plane distance.
+  DepthImage depth_frame(camera_->height(), camera_->width(),
+                         MemoryType::kUnified);
+  scene_.generateDepthImageFromScene(*camera_, T_S_C, 2 * kPlaneDistance,
+                                     &depth_frame);
+
+  // Figure out what the GT should be.
+  timing::Timer blocks_in_view_timer("blocks_in_view");
+  std::vector<Index3D> blocks_in_view = FrustumCalculator::getBlocksInView(
+      T_S_C, *camera_, block_size_, max_distance);
+  blocks_in_view_timer.Stop();
+  timing::Timer blocks_in_image_view_timer("blocks_in_image_view");
+  std::vector<Index3D> blocks_in_image_view =
+      FrustumCalculator::getBlocksInImageView(depth_frame, T_S_C, *camera_,
+                                              block_size_, 0.0f, max_distance);
+  blocks_in_image_view_timer.Stop();
+
+  EXPECT_EQ(blocks_in_view.size(), blocks_in_image_view.size());
+
+  // Now get the actual thing to test.
+  timing::Timer blocks_in_cuda_view_timer("blocks_in_cuda_view");
+  std::vector<Index3D> blocks_in_cuda_view = frustum_.getBlocksInImageViewCuda(
+      depth_frame, T_S_C, *camera_, block_size_, 0.0f,
+      max_distance + kFloatEpsilon);
+  EXPECT_LT(blocks_in_cuda_view.size(), blocks_in_image_view.size());
+  blocks_in_cuda_view_timer.Stop();
+
+  // Sort all of the entries.
+  std::sort(blocks_in_view.begin(), blocks_in_view.end(),
+            VectorCompare<Index3D>());
+  std::sort(blocks_in_image_view.begin(), blocks_in_image_view.end(),
+            VectorCompare<Index3D>());
+  std::sort(blocks_in_cuda_view.begin(), blocks_in_cuda_view.end(),
+            VectorCompare<Index3D>());
+
+  for (size_t i = 0; i < blocks_in_image_view.size(); i++) {
+    EXPECT_EQ(blocks_in_view[i], blocks_in_image_view[i]);
+  }
+
+  // Ok now the hard part. We expect the raycast to EVERY PIXEL to succeed
+  // in only going through allocated blocks.
+  // We make this easy by just making a TSDF layer.
+  TsdfLayer tsdf_layer(voxel_size_, nvblox::MemoryType::kUnified);
+  TsdfLayer tsdf_layer_cuda(voxel_size_, nvblox::MemoryType::kUnified);
+
+  for (const Index3D& block_index : blocks_in_cuda_view) {
+    TsdfBlock::Ptr block = tsdf_layer_cuda.allocateBlockAtIndex(block_index);
+    for (int x = 0; x < TsdfBlock::kVoxelsPerSide; x++) {
+      for (int y = 0; y < TsdfBlock::kVoxelsPerSide; y++) {
+        for (int z = 0; z < TsdfBlock::kVoxelsPerSide; z++) {
+          block->voxels[x][y][z].weight = 1;
+        }
+      }
+    }
+  }
+
+  // We will now raycast through every single pixel in the original image.
+  for (int u = 0; u < camera_->rows(); u++) {
+    for (int v = 0; v < camera_->cols(); v++) {
+      // Get the depth at this image point.
+      float depth = depth_frame(u, v);
+      Vector3f p_C = depth * camera_->rayFromPixelIndices(Index2D(v, u));
+      Vector3f p_L = T_S_C * p_C;
+
+      Index3D block_index = getBlockIndexFromPositionInLayer(block_size_, p_L);
+
+      EXPECT_TRUE(tsdf_layer_cuda.isBlockAllocated(block_index))
+          << block_index;
+
+      // Now raycast back to the camera origin, collecting any blocks that
+      // are unexpectedly missing.
+      RayCaster raycaster(T_S_C.translation() / block_size_,
+                          p_L / block_size_);
+      Index3D ray_index = Index3D::Zero();
+      while (raycaster.nextRayIndex(&ray_index)) {
+        EXPECT_TRUE(tsdf_layer_cuda.isBlockAllocated(ray_index)) << ray_index;
+
+        if (!tsdf_layer_cuda.isBlockAllocated(ray_index)) {
+          TsdfBlock::Ptr block = tsdf_layer.allocateBlockAtIndex(block_index);
+          for (int x = 0; x < TsdfBlock::kVoxelsPerSide; x++) {
+            for (int y = 0; y < TsdfBlock::kVoxelsPerSide; y++) {
+              for (int z = 0; z < TsdfBlock::kVoxelsPerSide; z++) {
+                block->voxels[x][y][z].weight = 1;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  bool output = false;
+  if (output) {
+    for (const Index3D& block_index : blocks_in_image_view) {
+      TsdfBlock::Ptr block = tsdf_layer.allocateBlockAtIndex(block_index);
+      for (int x = 0; x < TsdfBlock::kVoxelsPerSide; x++) {
+        for (int y = 0; y < TsdfBlock::kVoxelsPerSide; y++) {
+          for (int z = 0; z < TsdfBlock::kVoxelsPerSide; z++) {
+            block->voxels[x][y][z].weight = 1;
+          }
+        }
+      }
+    }
+
+    io::writeToCsv("test_frustum_image.csv", depth_frame);
+
+    io::outputVoxelLayerToPly(tsdf_layer, "test_frustum_blocks_image.ply");
+    io::outputVoxelLayerToPly(tsdf_layer_cuda, "test_frustum_blocks_cuda.ply");
+  }
+  std::cout << timing::Timing::Print();
+}
+
+// TODO(helen): fix this test with frustum checking.
+TEST_F(FrustumTest, BlocksInView) {
+  // Arranging a situation where we have a predictable number of blocks in
+  // view. We choose a situation with a single z-column of blocks in view by
+  // making a camera with 4 pixels viewing the furthest blocks' corners.
+
+  constexpr float kBlockSize = 1.0f;
+
+  // Let's touch a column of 10 blocks in the z direction.
+  constexpr float kDistanceToBlockCenters = 9.5f;
+
+  // Design a camera that just views the far blocks' outer corners.
+  constexpr float cu = 1.0;
+  constexpr float cv = 1.0;
+  constexpr int width = 2;
+  constexpr int height = 2;
+  // Pixel locations touched by the blocks' upper-right corner.
+  // NOTE(alexmillane): we just slightly lengthen the focal length to not
+  // have boundary effects.
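+  // From the pinhole model u = fu * x / z + cu, a ray through pixel u hits
+  // x = kBlockSize / 2 at depth kDistanceToBlockCenters when
+  // fu = (u - cu) * kDistanceToBlockCenters / (kBlockSize / 2), which is the
+  // expression below (plus a small epsilon).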
+  constexpr float u = static_cast<float>(width);
+  constexpr float v = static_cast<float>(height);
+  constexpr float fu =
+      static_cast<float>(u - cu) * 2.0 * kDistanceToBlockCenters / kBlockSize +
+      0.01;
+  constexpr float fv =
+      static_cast<float>(v - cv) * 2.0 * kDistanceToBlockCenters / kBlockSize +
+      0.01;
+  Camera camera(fu, fv, cu, cv, width, height);
+
+  // Depth frame with 4 pixels, some pixels at the far block depth, some at
+  // the first.
+  DepthImage depth_frame(2, 2, MemoryType::kUnified);
+  depth_frame(0, 0) = kBlockSize / 2.0f;
+  depth_frame(1, 0) = kBlockSize / 2.0f;
+  depth_frame(0, 1) = kDistanceToBlockCenters;
+  depth_frame(1, 1) = kDistanceToBlockCenters;
+
+  // Camera looking down the z axis, sitting centered on a block center in x
+  // and y.
+  Transform T_L_C;
+  T_L_C = Eigen::Translation3f(0.5f, 0.5f, 0.0f);
+
+  // Get the block indices
+  constexpr float kMaxDist = 10.0f;
+  constexpr float kTruncationDistance = 0.0f;
+  const std::vector<Index3D> blocks_in_view =
+      FrustumCalculator::getBlocksInImageView(depth_frame, T_L_C, camera,
+                                              kBlockSize, kTruncationDistance,
+                                              kMaxDist);
+
+  EXPECT_EQ(blocks_in_view.size(), 10);
+  for (int i = 0; i < blocks_in_view.size(); i++) {
+    EXPECT_TRUE((blocks_in_view[i].array() == Index3D(0, 0, i).array()).all());
+  }
+}
+
+TEST_F(FrustumTest, ThreeDMatch) {
+  // Get the first frame, a camera, and a pose.
+  constexpr int kSequenceNum = 1;
+  constexpr int kFrameNumber = 0;
+  float max_distance = 10.0f;
+
+  std::unique_ptr<datasets::ImageLoader<DepthImage>> depth_image_loader =
+      datasets::threedmatch::createDepthImageLoader(base_path_, kSequenceNum);
+
+  // Get the first image.
+  DepthImage depth_frame;
+  ASSERT_TRUE(depth_image_loader->getNextImage(&depth_frame));
+
+  // Get the transform.
+  Transform T_L_C;
+  ASSERT_TRUE(datasets::threedmatch::parsePoseFromFile(
+      datasets::threedmatch::getPathForFramePose(base_path_, kSequenceNum,
+                                                 kFrameNumber),
+      &T_L_C));
+
+  // Create a camera object.
+  int image_width = depth_frame.cols();
+  int image_height = depth_frame.rows();
+  const std::string intrinsics_filename =
+      datasets::threedmatch::getPathForCameraIntrinsics(base_path_);
+  Eigen::Matrix3f camera_intrinsics;
+  ASSERT_TRUE(datasets::threedmatch::parseCameraFromFile(intrinsics_filename,
+                                                         &camera_intrinsics));
+  Camera camera = Camera::fromIntrinsicsMatrix(camera_intrinsics, image_width,
+                                               image_height);
+
+  for (int i = 0; i < 100; i++) {
+    // Now get the actual thing to test.
+    timing::Timer blocks_in_cuda_view_timer("blocks_in_cuda_view");
+    std::vector<Index3D> blocks_in_cuda_view =
+        frustum_.getBlocksInImageViewCuda(depth_frame, T_L_C, camera,
+                                          block_size_, 0.0f, max_distance);
+    blocks_in_cuda_view_timer.Stop();
+
+    // Figure out what the GT should be.
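+    // (Reference values: getBlocksInView uses the frustum only, with no
+    // depth, so it upper-bounds the depth-aware getBlocksInImageView result;
+    // both serve as baselines for the GPU raycast version timed above.)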
+    timing::Timer blocks_in_view_timer("blocks_in_view");
+    std::vector<Index3D> blocks_in_view = FrustumCalculator::getBlocksInView(
+        T_L_C, camera, block_size_, max_distance);
+    blocks_in_view_timer.Stop();
+    timing::Timer blocks_in_image_view_timer("blocks_in_image_view");
+    std::vector<Index3D> blocks_in_image_view =
+        FrustumCalculator::getBlocksInImageView(
+            depth_frame, T_L_C, camera, block_size_, 0.0f, max_distance);
+    blocks_in_image_view_timer.Stop();
+  }
+
+  std::cout << timing::Timing::Print();
+}
+
+class FrustumRayTracingSubsamplingTest
+    : public FrustumTest,
+      public ::testing::WithParamInterface<unsigned int> {
+ protected:
+  // Yo dawg I heard you like params
+};
+
+TEST_P(FrustumRayTracingSubsamplingTest, RayTracePixels) {
+  // Arranging a situation where we have a predictable number of blocks in
+  // view.
+  // |--|--|
+  // |--|--|
+  // |--|--|
+  // |--|--|
+  //  \    /
+  //   * --camera
+  constexpr float kBlockSize = 1.0f;
+
+  // Let's touch a 2x2x3 column of blocks in the z direction.
+  constexpr float kDistanceToBlockCenters = 2.5f;
+
+  // Design a camera that just views the far blocks' outer corners.
+  constexpr int width = 3;
+  constexpr int height = 3;
+  constexpr float cu = static_cast<float>(width - 1) / 2.0f;
+  constexpr float cv = static_cast<float>(height - 1) / 2.0f;
+  // Calculate focal lengths such that the extreme pixels shoot rays through
+  // the back blocks' centers.
+  constexpr float u = static_cast<float>(width - 1);
+  constexpr float v = static_cast<float>(height - 1);
+  constexpr float fu = static_cast<float>(u - cu) * kDistanceToBlockCenters /
+                       (0.5f * kBlockSize);
+  constexpr float fv = static_cast<float>(v - cv) * kDistanceToBlockCenters /
+                       (0.5f * kBlockSize);
+  Camera camera(fu, fv, cu, cv, width, height);
+
+  // Depth frame with all pixels at the far blocks' center depth.
+  DepthImage depth_frame(height, width, MemoryType::kUnified);
+  for (int lin_idx = 0; lin_idx < depth_frame.numel(); lin_idx++) {
+    depth_frame(lin_idx) = kDistanceToBlockCenters;
+  }
+
+  // Camera looking down the z axis, sitting between blocks in the x and y
+  // dimensions.
+  Transform T_L_C;
+  T_L_C = Eigen::Translation3f(1.0f, 1.0f, 0.0f);
+
+  FrustumCalculator frustum_calculator;
+
+  unsigned int raycast_subsampling_factor = GetParam();
+  frustum_calculator.raycast_subsampling_factor(raycast_subsampling_factor);
+
+  const std::vector<Index3D> blocks_in_view =
+      frustum_calculator.getBlocksInImageViewCuda(
+          depth_frame, T_L_C, camera, kBlockSize, 0.0,
+          kDistanceToBlockCenters + 1.0f);
+
+  std::for_each(blocks_in_view.begin(), blocks_in_view.end(),
+                [](const auto& block_idx) {
+                  EXPECT_TRUE(block_idx.x() == 0 || block_idx.x() == 1);
+                  EXPECT_TRUE(block_idx.y() == 0 || block_idx.y() == 1);
+                  EXPECT_TRUE(block_idx.z() == 0 || block_idx.z() == 1 ||
+                              block_idx.z() == 2);
+                });
+
+  // 2 x 2 x 3 block volume
+  EXPECT_EQ(blocks_in_view.size(), 12);
+}
+
+INSTANTIATE_TEST_CASE_P(FrustumTest, FrustumRayTracingSubsamplingTest,
+                        ::testing::Values(1, 2));
+
+int main(int argc, char** argv) {
+  warmupCuda();
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_gpu_layer_view.cpp b/nvblox/tests/test_gpu_layer_view.cpp
new file mode 100644
index 00000000..ac2b11f0
--- /dev/null
+++ b/nvblox/tests/test_gpu_layer_view.cpp
@@ -0,0 +1,201 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/accessors.h"
+#include "nvblox/core/blox.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/voxels.h"
+
+#include "nvblox/gpu_hash/gpu_layer_view.h"
+
+#include "nvblox/primitives/primitives.h"
+#include "nvblox/primitives/scene.h"
+
+#include "nvblox/tests/gpu_layer_utils.h"
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+TEST(GpuHashTest, CopyOverLayer) {
+  // Indices to insert in the layer
+  std::vector<Index3D> block_indices;
+  block_indices.push_back({0, 0, 0});
+  block_indices.push_back({1, 0, 0});
+  block_indices.push_back({2, 0, 0});
+
+  // Create a layer and allocate blocks at the indices above.
+  const float voxel_size = 0.1f;
+  TsdfLayer tsdf_layer(voxel_size, MemoryType::kUnified);
+  std::for_each(block_indices.begin(), block_indices.end(),
+                [&tsdf_layer](const Index3D& idx) {
+                  tsdf_layer.allocateBlockAtIndex(idx);
+                });
+
+  // Copy block hash to GPU
+  GPULayerView<TsdfLayer> gpu_layer = tsdf_layer.getGpuLayerView();
+
+  // Search for indices on the GPU
+  block_indices.push_back({3, 0, 0});
+  const std::vector<bool> flags =
+      test_utils::getContainsFlags(gpu_layer, block_indices);
+
+  EXPECT_EQ(flags.size(), block_indices.size());
+  EXPECT_TRUE(flags[0]);
+  EXPECT_TRUE(flags[1]);
+  EXPECT_TRUE(flags[2]);
+  EXPECT_FALSE(flags[3]);
+}
+
+TEST(GpuHashTest, SphereSceneAccessTest) {
+  // Sphere in a box scene.
+  primitives::Scene scene;
+  const Vector3f bounds_min = Vector3f(-5.0f, -5.0f, 0.0f);
+  const Vector3f bounds_max = Vector3f(5.0f, 5.0f, 5.0f);
+  scene.aabb() = AxisAlignedBoundingBox(bounds_min, bounds_max);
+  scene.addGroundLevel(0.0f);
+  scene.addCeiling(5.0f);
+  scene.addPrimitive(
+      std::make_unique<primitives::Sphere>(Vector3f(0.0f, 0.0f, 2.0f), 2.0f));
+  scene.addPlaneBoundaries(-5.0f, 5.0f, -5.0f, 5.0f);
+
+  // Layer
+  constexpr float kVoxelSize = 0.1f;
+  TsdfLayer tsdf_layer(kVoxelSize, MemoryType::kUnified);
+
+  // Generate SDF
+  constexpr float kMaxDistance = 5.0f;
+  scene.generateSdfFromScene(kMaxDistance, &tsdf_layer);
+
+  // Get some random points in the scene
+  constexpr int kNumPoints = 1000;
+  std::vector<Vector3f> p_L_vec(kNumPoints);
+  std::generate(p_L_vec.begin(), p_L_vec.end(), [&bounds_min, &bounds_max]() {
+    return test_utils::getRandomVector3fInRange(bounds_min, bounds_max);
+  });
+
+  // Lookup voxels (CPU)
+  std::vector<TsdfVoxel> cpu_lookup_voxels;
+  cpu_lookup_voxels.reserve(p_L_vec.size());
+  for (const Vector3f& p_L : p_L_vec) {
+    // TODO(alexmillane): Let's also add this as a member function
+    const TsdfVoxel* voxel_ptr;
+    EXPECT_TRUE(getVoxelAtPosition(tsdf_layer, p_L, &voxel_ptr));
+    cpu_lookup_voxels.push_back(*voxel_ptr);
+  }
+
+  // CPU -> GPU
+  GPULayerView<TsdfLayer> gpu_layer = tsdf_layer.getGpuLayerView();
+
+  // Lookup voxels (GPU)
+  std::vector<TsdfVoxel> gpu_lookup_voxels;
+  std::vector<bool> flags;
+  std::tie(gpu_lookup_voxels, flags) =
+      test_utils::getVoxelsAtPositionsOnGPU(gpu_layer, p_L_vec);
+
+  std::for_each(flags.begin(), flags.end(),
+                [](bool flag) { EXPECT_TRUE(flag); });
+
+  CHECK_EQ(cpu_lookup_voxels.size(), gpu_lookup_voxels.size());
+  for (int i = 0; i < gpu_lookup_voxels.size(); i++) {
+    constexpr float eps = 1e-6;
+    EXPECT_NEAR(cpu_lookup_voxels[i].distance, gpu_lookup_voxels[i].distance,
+                eps);
+  }
+
+  // Get some points OUTSIDE the scene
+  LOG(INFO) << "Testing points outside the scene.";
+  std::generate(p_L_vec.begin(), p_L_vec.end(), [&bounds_max]() {
+    return test_utils::getRandomVector3fInRange(bounds_max.array() + 1.0,
+                                                bounds_max.array() + 5.0);
+  });
+  std::tie(gpu_lookup_voxels, flags) =
+      test_utils::getVoxelsAtPositionsOnGPU(gpu_layer, p_L_vec);
+  std::for_each(flags.begin(), flags.end(),
+                [](bool flag) { EXPECT_FALSE(flag); });
+}
+
+TEST(GpuHashTest, ResizeTest) {
+  // Indices to insert in the layer
+  std::vector<Index3D> block_indices;
+  block_indices.push_back({0, 0, 0});
+  block_indices.push_back({1, 0, 0});
+  block_indices.push_back({2, 0, 0});
+
+  // Create a layer with three blocks
+  const float voxel_size = 0.1f;
+  TsdfLayer tsdf_layer(voxel_size, MemoryType::kUnified);
+  std::for_each(block_indices.begin(), block_indices.end(),
+                [&tsdf_layer](const Index3D& idx) {
+                  tsdf_layer.allocateBlockAtIndex(idx);
+                });
+
+  // Push the layer into an undersized layer view and check it expands to fit.
+  GPULayerView<TsdfLayer> gpu_layer_view(1);
+  gpu_layer_view.reset(&tsdf_layer);
+  EXPECT_GT(gpu_layer_view.size(), 1);
+}
+
+TEST(GpuHashTest, LoadFactorTest) {
+  // Create a layer with 1000 blocks
+  constexpr size_t num_blocks = 1000;
+  std::vector<Index3D> block_indices;
+  for (int i = 0; i < num_blocks; i++) {
+    block_indices.push_back({i, 0, 0});
+  }
+
+  // Create a layer
+  const float voxel_size = 0.1f;
+  TsdfLayer tsdf_layer(voxel_size, MemoryType::kUnified);
+  std::for_each(block_indices.begin(), block_indices.end(),
+                [&tsdf_layer](const Index3D& idx) {
+                  tsdf_layer.allocateBlockAtIndex(idx);
+                });
+
+  // Allocate a GPULayerView that can *just* hold the layer without exceeding
+  // the max load factor.
+  GPULayerView<TsdfLayer> gpu_layer(1);
+  const size_t initial_gpu_layer_size =
+      std::ceil(static_cast<float>(num_blocks) / gpu_layer.max_load_factor());
+  gpu_layer = GPULayerView<TsdfLayer>(initial_gpu_layer_size);
+
+  // Send in the blocks
+  gpu_layer.reset(&tsdf_layer);
+
+  // Check it hasn't expanded
+  EXPECT_EQ(gpu_layer.size(), initial_gpu_layer_size);
+
+  // Add another block
+  tsdf_layer.allocateBlockAtIndex({0, 0, 1});
+
+  // Regenerate the layer view
+  gpu_layer.reset(&tsdf_layer);
+
+  // Check it's expanded (by the right amount)
+  EXPECT_GT(gpu_layer.size(), initial_gpu_layer_size);
+  EXPECT_NEAR(static_cast<float>(gpu_layer.size()) /
+                  static_cast<float>(initial_gpu_layer_size),
+              gpu_layer.size_expansion_factor(), 0.01);
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_indexing.cpp b/nvblox/tests/test_indexing.cpp
new file mode 100644
index 00000000..e0be4e96
--- /dev/null
+++ b/nvblox/tests/test_indexing.cpp
@@ -0,0 +1,151 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/indexing.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/voxels.h"
+
+#include "nvblox/tests/gpu_indexing.h"
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+constexpr float kFloatEpsilon = 1e-4;
+
+class IndexingTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    std::srand(0);
+    block_size_ = VoxelBlock<TsdfVoxel>::kVoxelsPerSide * voxel_size_;
+  }
+
+  float block_size_;
+  float voxel_size_ = 0.05;
+};
+
+TEST_F(IndexingTest, GetVoxelIndex) {
+  ASSERT_NEAR(voxel_size_, 0.05, kFloatEpsilon);
+  ASSERT_NEAR(block_size_, 0.4, kFloatEpsilon);
+
+  Vector3f point(0, 0, 0);
+  Index3D voxel_index = getVoxelIndexFromPositionInLayer(block_size_, point);
+  EXPECT_TRUE(voxel_index == Index3D(0, 0, 0));
+
+  point = Vector3f(block_size_, block_size_, block_size_);
+  voxel_index = getVoxelIndexFromPositionInLayer(block_size_, point);
+  EXPECT_TRUE(voxel_index == Index3D(0, 0, 0));
+
+  point = Vector3f(2.0f, 1.0f, 3.0f);
+  voxel_index = getVoxelIndexFromPositionInLayer(block_size_, point);
+  EXPECT_TRUE(voxel_index == Index3D(0, 4, 4)) << voxel_index;
+
+  point = Vector3f(-2.0f, -1.0f, -3.0f);
+  voxel_index = getVoxelIndexFromPositionInLayer(block_size_, point);
+  EXPECT_TRUE(voxel_index == Index3D(0, 4, 4)) << voxel_index;
+}
+
+TEST_F(IndexingTest, CenterIndexing) {
+  AlignedVector<Vector3f> test_points;
+  test_points.push_back({0, 0, 0});
+  test_points.push_back({0.05, 0.05, 0.05});
+  test_points.push_back({0.025, 0.025, 0.025});
+  test_points.push_back({-1, -3, 2});
+
+  for (const Vector3f& point : test_points) {
+    Index3D voxel_index = getVoxelIndexFromPositionInLayer(block_size_, point);
+    Index3D block_index = getBlockIndexFromPositionInLayer(block_size_, point);
+
+    Vector3f reconstructed_point = getPositionFromBlockIndexAndVoxelIndex(
+        block_size_, block_index, voxel_index);
+    Vector3f reconstructed_center_point =
+        getCenterPostionFromBlockIndexAndVoxelIndex(block_size_, block_index,
+                                                    voxel_index);
+
+    // Check the difference between the corner and the center.
+    Vector3f center_difference =
+        reconstructed_center_point - reconstructed_point;
+
+    // Needs to be within voxel size.
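+    // (The corner-based reconstruction floors the position to the voxel
+    // grid, so any sub-voxel offset is lost; hence the voxel_size_ bound
+    // checked below.)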
+    EXPECT_LT((reconstructed_point - point).cwiseAbs().maxCoeff(),
+              voxel_size_);
+    EXPECT_LT((reconstructed_center_point - point).cwiseAbs().maxCoeff(),
+              voxel_size_);
+    EXPECT_TRUE(center_difference.isApprox(
+        Vector3f(voxel_size_ / 2.0f, voxel_size_ / 2.0f, voxel_size_ / 2.0f),
+        kFloatEpsilon));
+  }
+}
+
+TEST_F(IndexingTest, getBlockAndVoxelIndexFromPositionInLayer) {
+  constexpr int kNumTests = 1000;
+  for (int i = 0; i < kNumTests; i++) {
+    // Random block and voxel indices
+    constexpr int kRandomBlockIndexRange = 1000;
+    const Index3D block_index = test_utils::getRandomIndex3dInRange(
+        -kRandomBlockIndexRange, kRandomBlockIndexRange);
+    const Index3D voxel_index = test_utils::getRandomIndex3dInRange(
+        0, VoxelBlock<TsdfVoxel>::kVoxelsPerSide - 1);
+
+    // 3D point from indices, including sub-voxel randomness
+    constexpr float voxel_size = 0.1;
+    constexpr float block_size =
+        VoxelBlock<TsdfVoxel>::kVoxelsPerSide * voxel_size;
+    const Vector3f delta =
+        test_utils::getRandomVector3fInRange(0.0f, voxel_size);
+    const Vector3f p_L = (block_index.cast<float>() * block_size) +
+                         (voxel_index.cast<float>() * voxel_size) + delta;
+
+    // Point back to voxel indices
+    Index3D block_index_check;
+    Index3D voxel_index_check;
+    getBlockAndVoxelIndexFromPositionInLayer(
+        block_size, p_L, &block_index_check, &voxel_index_check);
+
+    // Check we get out what we put in
+    EXPECT_EQ(block_index_check.x(), block_index.x());
+    EXPECT_EQ(block_index_check.y(), block_index.y());
+    EXPECT_EQ(block_index_check.z(), block_index.z());
+  }
+}
+
+TEST_F(IndexingTest, getBlockAndVoxelIndexFromPositionInLayerRoundingErrors) {
+  constexpr int kNumTests = 1e7;
+  std::vector<Vector3f> positions;
+  positions.reserve(kNumTests);
+  for (int i = 0; i < kNumTests; i++) {
+    positions.push_back(Vector3f::Random());
+  }
+
+  std::vector<Index3D> block_indices;
+  std::vector<Index3D> voxel_indices;
+  constexpr float kBlockSize = 0.1;
+  test_utils::getBlockAndVoxelIndexFromPositionInLayerOnGPU(
+      kBlockSize, positions, &block_indices, &voxel_indices);
+
+  for (int i = 0; i < kNumTests; i++) {
+    EXPECT_TRUE((voxel_indices[i].array() >= 0).all());
+    EXPECT_TRUE(
+        (voxel_indices[i].array() < VoxelBlock<TsdfVoxel>::kVoxelsPerSide)
+            .all());
+  }
+}
+
+int main(int argc, char** argv) {
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_layer.cpp b/nvblox/tests/test_layer.cpp
new file mode 100644
index 00000000..5eb4719a
--- /dev/null
+++ b/nvblox/tests/test_layer.cpp
@@ -0,0 +1,208 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/voxels.h"
+
+#include "nvblox/tests/blox.h"
+#include "nvblox/tests/blox_utils.h"
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+Index3D getRandomIndex3DInRange(const int min, const int max) {
+  return Index3D(test_utils::randomIntInRange(min, max),
+                 test_utils::randomIntInRange(min, max),
+                 test_utils::randomIntInRange(min, max));
+}
+
+TEST(LayerTest, InsertionAndRetrieval) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  // Empty layer
+  constexpr float KTestBlockSize = 0.1;
+  BlockLayer<IndexBlock> layer(KTestBlockSize, MemoryType::kUnified);
+
+  // Get some distinct block locations
+  constexpr int kNumTestItems = 100;
+  constexpr int kMinimumIndexValue = -100;
+  constexpr int kMaximumIndexValue = 100;
+
+  Index3DSet unique_block_indices;
+  for (int i = 0; i < kNumTestItems; i++) {
+    while (!unique_block_indices
+                .insert(getRandomIndex3DInRange(kMinimumIndexValue,
+                                                kMaximumIndexValue))
+                .second) {
+      continue;
+    }
+  }
+
+  // Allocate blocks at these indices (and store the index in each block for
+  // checking later).
+  for (auto it = unique_block_indices.begin();
+       it != unique_block_indices.end(); it++) {
+    EXPECT_FALSE(layer.getBlockAtIndex(*it));
+    IndexBlock::Ptr block_ptr = layer.allocateBlockAtIndex(*it);
+    block_ptr->data = *it;
+  }
+
+  // Retrieve the blocks at these indices.
+  for (auto it = unique_block_indices.begin();
+       it != unique_block_indices.end(); it++) {
+    IndexBlock::ConstPtr block_ptr = layer.getBlockAtIndex(*it);
+    // Check it's there
+    EXPECT_NE(block_ptr, nullptr);
+    // Check it has the data we put in.
+    EXPECT_EQ(block_ptr->data, *it);
+  }
+}
+
+TEST(LayerTest, EmptyLayer) {
+  constexpr float KTestBlockSize = 0.1;
+  BlockLayer<IndexBlock> layer(KTestBlockSize, MemoryType::kUnified);
+
+  IndexBlock::Ptr index_block_ptr =
+      layer.getBlockAtIndex(Index3D(4, 50, -10));
+
+  EXPECT_FALSE(index_block_ptr);
+  EXPECT_EQ(index_block_ptr.get(), nullptr);
+}
+
+TEST(LayerTest, MinCornerBasedIndexing) {
+  // Check that indexing is performed with the block origin at the minimum
+  // corner, spanning block_size: e.g. from [0,0,0] to [1,1,1], exclusive on
+  // the top side.
+
+  constexpr float KTestBlockSize = 0.1;
+  BlockLayer<DummyBlock> layer(KTestBlockSize, MemoryType::kUnified);
+
+  const Vector3f kPostion3DEpsilson(0.001f, 0.001f, 0.001f);
+  Vector3f position_low(0, 0, 0);
+  Vector3f position_high =
+      KTestBlockSize * Vector3f::Ones() - kPostion3DEpsilson;
+
+  // Put something in on the low side of the block's range
+  DummyBlock::Ptr block_low_ptr = layer.allocateBlockAtPosition(position_low);
+  block_low_ptr->data = true;
+
+  // Check we get the same block back on the high side.
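+  // (Indexing floors position / block_size, so every point in
+  // [0, block_size) maps to block index 0 and this allocation is a no-op.)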
+  DummyBlock::ConstPtr block_high_ptr =
+      layer.allocateBlockAtPosition(position_high);
+  EXPECT_NE(block_high_ptr, nullptr);
+  EXPECT_EQ(block_low_ptr, block_high_ptr);
+  EXPECT_TRUE(block_high_ptr->data);
+}
+
+TEST(VoxelLayerTest, CopyVoxelsToHost) {
+  constexpr float voxel_size_m = 1.0;
+  TsdfLayer tsdf_layer(voxel_size_m, MemoryType::kDevice);
+  auto block_ptr = tsdf_layer.allocateBlockAtIndex(Index3D(0, 0, 0));
+
+  test_utils::setTsdfBlockVoxelsInSequence(block_ptr);
+
+  // Generating the voxel center positions
+  std::vector<Vector3f> positions_L;
+  for (int x = 0; x < TsdfBlock::kVoxelsPerSide; x++) {
+    for (int y = 0; y < TsdfBlock::kVoxelsPerSide; y++) {
+      for (int z = 0; z < TsdfBlock::kVoxelsPerSide; z++) {
+        positions_L.push_back(Index3D(x, y, z).cast<float>() +
+                              0.5 * Vector3f::Ones());
+      }
+    }
+  }
+
+  std::vector<TsdfVoxel> voxels;
+  std::vector<bool> flags;
+  tsdf_layer.getVoxels(positions_L, &voxels, &flags);
+
+  for (int i = 0; i < voxels.size(); i++) {
+    EXPECT_TRUE(flags[i]);
+
+    EXPECT_EQ(voxels[i].distance, static_cast<float>(i));
+    EXPECT_EQ(voxels[i].weight, static_cast<float>(i));
+  }
+
+  // Now try some edge cases
+
+  // Just inside the block from {0.0f, 0.0f, 0.0f}
+  constexpr float kEps = 1e-5;
+  const Vector3f kVecEps = kEps * Vector3f::Ones();
+  tsdf_layer.getVoxels({kVecEps}, &voxels, &flags);
+  EXPECT_EQ(flags.size(), 1);
+  EXPECT_EQ(voxels.size(), 1);
+  EXPECT_TRUE(flags[0]);
+  EXPECT_EQ(voxels[0].distance, 0.0f);
+  EXPECT_EQ(voxels[0].weight, 0.0f);
+
+  // Just inside the block from its far boundary {8.0f, 8.0f, 8.0f}
+  tsdf_layer.getVoxels({Vector3f(8.0f, 8.0f, 8.0f) - kVecEps}, &voxels,
+                       &flags);
+  EXPECT_TRUE(flags[0]);
+  EXPECT_EQ(voxels[0].distance, 511.0f);
+  EXPECT_EQ(voxels[0].weight, 511.0f);
+
+  // Just outside the block from {0.0f, 0.0f, 0.0f}
+  tsdf_layer.getVoxels({-kVecEps}, &voxels, &flags);
+  EXPECT_FALSE(flags[0]);
+
+  // Just outside the block from its far boundary {8.0f, 8.0f, 8.0f}
+  tsdf_layer.getVoxels({Vector3f(8.0f, 8.0f, 8.0f) + kVecEps}, &voxels,
+                       &flags);
+  EXPECT_FALSE(flags[0]);
+}
+
+TEST(LayerTest, MoveOperations) {
+  constexpr float voxel_size_m = 1.0;
+
+  // BlockLayer (MeshLayer being the representative)
+  MeshLayer mesh_layer_1(voxel_size_m, MemoryType::kDevice);
+  mesh_layer_1.allocateBlockAtIndex(Index3D(0, 0, 0));
+
+  EXPECT_TRUE(mesh_layer_1.isBlockAllocated(Index3D(0, 0, 0)));
+
+  MeshLayer mesh_layer_2 = std::move(mesh_layer_1);
+
+  EXPECT_FALSE(mesh_layer_1.isBlockAllocated(Index3D(0, 0, 0)));
+  EXPECT_TRUE(mesh_layer_2.isBlockAllocated(Index3D(0, 0, 0)));
+
+  MeshLayer mesh_layer_3(std::move(mesh_layer_2));
+
+  EXPECT_FALSE(mesh_layer_2.isBlockAllocated(Index3D(0, 0, 0)));
+  EXPECT_TRUE(mesh_layer_3.isBlockAllocated(Index3D(0, 0, 0)));
+
+  // VoxelBlockLayer (TsdfLayer being the representative)
+  TsdfLayer tsdf_layer_1(voxel_size_m, MemoryType::kDevice);
+  tsdf_layer_1.allocateBlockAtIndex(Index3D(0, 0, 0));
+
+  TsdfLayer tsdf_layer_2 = std::move(tsdf_layer_1);
+
+  EXPECT_FALSE(tsdf_layer_1.isBlockAllocated(Index3D(0, 0, 0)));
+  EXPECT_TRUE(tsdf_layer_2.isBlockAllocated(Index3D(0, 0, 0)));
+
+  TsdfLayer tsdf_layer_3(std::move(tsdf_layer_2));
+
+  EXPECT_FALSE(tsdf_layer_2.isBlockAllocated(Index3D(0, 0, 0)));
+  EXPECT_TRUE(tsdf_layer_3.isBlockAllocated(Index3D(0, 0, 0)));
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/nvblox/tests/test_mesh.cpp
b/nvblox/tests/test_mesh.cpp
new file mode 100644
index 00000000..49cca1f2
--- /dev/null
+++ b/nvblox/tests/test_mesh.cpp
@@ -0,0 +1,404 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+#include <cmath>
+
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/core/indexing.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/voxels.h"
+#include "nvblox/datasets/image_loader.h"
+#include "nvblox/datasets/parse_3dmatch.h"
+#include "nvblox/integrators/projective_tsdf_integrator.h"
+#include "nvblox/io/mesh_io.h"
+#include "nvblox/io/ply_writer.h"
+#include "nvblox/mesh/mesh_block.h"
+#include "nvblox/mesh/mesh_integrator.h"
+#include "nvblox/primitives/scene.h"
+
+#include "nvblox/utils/timing.h"
+
+using namespace nvblox;
+
+constexpr float kFloatEpsilon = 1e-4;
+
+class MeshTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    timing::Timing::Reset();
+    std::srand(0);
+
+    sdf_layer_.reset(new TsdfLayer(voxel_size_, MemoryType::kUnified));
+    mesh_layer_.reset(
+        new MeshLayer(sdf_layer_->block_size(), MemoryType::kUnified));
+
+    block_size_ = mesh_layer_->block_size();
+
+    // Make the scene 6x6x3 meters big.
+    scene_.aabb() = AxisAlignedBoundingBox(Vector3f(-3.0f, -3.0f, 0.0f),
+                                           Vector3f(3.0f, 3.0f, 3.0f));
+  }
+
+  float block_size_;
+  float voxel_size_ = 0.10;
+
+  TsdfLayer::Ptr sdf_layer_;
+  MeshLayer::Ptr mesh_layer_;
+
+  // The actual integrator.
+  MeshIntegrator mesh_integrator_;
+
+  // A simulation scene.
+  primitives::Scene scene_;
+};
+
+TEST_F(MeshTest, DirectionFromNeighborTest) {
+  for (int i = 0; i < 8; i++) {
+    Index3D dir = marching_cubes::directionFromNeighborIndex(i);
+    EXPECT_GE(dir.minCoeff(), 0);
+    EXPECT_LE(dir.maxCoeff(), 1);
+
+    int index = marching_cubes::neighborIndexFromDirection(dir);
+    EXPECT_EQ(i, index);
+  }
+  // Make sure 0 maps to 0.
+  EXPECT_EQ(marching_cubes::neighborIndexFromDirection(Index3D::Zero()), 0);
+}
+
+TEST_F(MeshTest, BlankMap) {
+  std::vector<Index3D> block_indices_mesh = mesh_layer_->getAllBlockIndices();
+  std::vector<Index3D> block_indices_sdf = sdf_layer_->getAllBlockIndices();
+
+  ASSERT_EQ(block_indices_mesh.size(), 0);
+  ASSERT_EQ(block_indices_sdf.size(), 0);
+
+  // Integrate a blank map.
+  EXPECT_TRUE(mesh_integrator_.integrateMeshFromDistanceField(
+      *sdf_layer_, mesh_layer_.get()));
+
+  // Should still be 0.
+  block_indices_mesh = mesh_layer_->getAllBlockIndices();
+  ASSERT_EQ(block_indices_mesh.size(), 0);
+}
+
+TEST_F(MeshTest, PlaneMesh) {
+  // Create a scene that's just a plane.
+  // Plane at the origin pointing in the -x direction.
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(0.0, 0.0, 0.0), Vector3f(-1, 0, 0)));
+
+  scene_.generateSdfFromScene(4 * voxel_size_, sdf_layer_.get());
+
+  EXPECT_TRUE(mesh_integrator_.integrateMeshFromDistanceField(
+      *sdf_layer_, mesh_layer_.get(), DeviceType::kCPU));
+
+  std::vector<Index3D> block_indices_mesh = mesh_layer_->getAllBlockIndices();
+  std::vector<Index3D> block_indices_sdf = sdf_layer_->getAllBlockIndices();
+
+  EXPECT_GT(block_indices_sdf.size(), 0);
+  EXPECT_LE(block_indices_mesh.size(), block_indices_sdf.size());
+  EXPECT_GT(block_indices_mesh.size(), 0);
+
+  // Make sure there are no empty mesh blocks.
+  for (const Index3D& block_index : block_indices_mesh) {
+    MeshBlock::ConstPtr mesh_block = mesh_layer_->getBlockAtIndex(block_index);
+
+    EXPECT_GT(mesh_block->vertices.size(), 0);
+    EXPECT_GT(mesh_block->normals.size(), 0);
+    EXPECT_GT(mesh_block->triangles.size(), 0);
+
+    // Make sure everything is the same size.
+    EXPECT_EQ(mesh_block->vertices.size(), mesh_block->normals.size());
+    EXPECT_EQ(mesh_block->vertices.size(), mesh_block->triangles.size());
+
+    std::vector<Vector3f> vertices = mesh_block->getVertexVectorOnCPU();
+    std::vector<Vector3f> normals = mesh_block->getNormalVectorOnCPU();
+
+    // Make sure that the actual points are correct.
+    for (size_t i = 0; i < vertices.size(); i++) {
+      const Vector3f& vertex = vertices[i];
+      const Vector3f& normal = normals[i];
+
+      // Make sure the points on the plane are correct.
+      EXPECT_NEAR(vertex.x(), 0.0, kFloatEpsilon);
+      EXPECT_NEAR(normal.x(), -1.0, kFloatEpsilon);
+      EXPECT_NEAR(normal.y(), 0.0, kFloatEpsilon);
+      EXPECT_NEAR(normal.z(), 0.0, kFloatEpsilon);
+    }
+  }
+
+  io::outputMeshLayerToPly(*mesh_layer_, "test_mesh_cpu.ply");
+
+  std::cout << timing::Timing::Print();
+}
+
+TEST_F(MeshTest, ComplexScene) {
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(0.0, 0.0, 0.0), Vector3f(-1, 0, 0)));
+
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(2.1, 0.1, 0.1), Vector3f(0, -1, 0)));
+
+  scene_.addPrimitive(
+      std::make_unique<primitives::Sphere>(Vector3f(-2, -2, 0), 2.0));
+
+  scene_.generateSdfFromScene(4 * voxel_size_, sdf_layer_.get());
+
+  EXPECT_TRUE(mesh_integrator_.integrateMeshFromDistanceField(
+      *sdf_layer_, mesh_layer_.get(), DeviceType::kCPU));
+
+  std::vector<Index3D> block_indices_mesh = mesh_layer_->getAllBlockIndices();
+  EXPECT_GT(block_indices_mesh.size(), 0);
+
+  io::outputMeshLayerToPly(*mesh_layer_, "test_mesh_complex.ply");
+  std::cout << timing::Timing::Print();
+}
+
+TEST_F(MeshTest, GPUPlaneTest) {
+  // Create a scene that's just a plane.
+  // Plane at the origin pointing in the -x direction.
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(0.00, 0.0, 0.0), Vector3f(-1, 0, 0)));
+
+  scene_.generateSdfFromScene(4 * voxel_size_, sdf_layer_.get());
+
+  // Create a second mesh layer for the CPU.
+  BlockLayer<MeshBlock> cpu_mesh(block_size_, MemoryType::kUnified);
+  EXPECT_TRUE(mesh_integrator_.integrateMeshFromDistanceField(
+      *sdf_layer_, &cpu_mesh, DeviceType::kCPU));
+
+  // Integrate from GPU.
+  EXPECT_TRUE(mesh_integrator_.integrateMeshFromDistanceField(
+      *sdf_layer_, mesh_layer_.get(), DeviceType::kGPU));
+
+  std::vector<Index3D> block_indices_mesh = mesh_layer_->getAllBlockIndices();
+  std::vector<Index3D> block_indices_sdf = sdf_layer_->getAllBlockIndices();
+  std::vector<Index3D> block_indices_mesh_cpu = cpu_mesh.getAllBlockIndices();
+
+  EXPECT_GT(block_indices_sdf.size(), 0);
+  EXPECT_LE(block_indices_mesh.size(), block_indices_sdf.size());
+  EXPECT_GT(block_indices_mesh.size(), 0);
+  EXPECT_GE(block_indices_mesh.size(), block_indices_mesh_cpu.size());
+
+  // With the GPU integration we might actually have empty mesh blocks.
+  for (const Index3D& block_index : block_indices_mesh) {
+    MeshBlock::ConstPtr mesh_block = mesh_layer_->getBlockAtIndex(block_index);
+    MeshBlock::ConstPtr mesh_block_cpu = cpu_mesh.getBlockAtIndex(block_index);
+    if (mesh_block_cpu == nullptr) {
+      EXPECT_EQ(mesh_block->vertices.size(), 0);
+      EXPECT_EQ(mesh_block->normals.size(), 0);
+      EXPECT_EQ(mesh_block->triangles.size(), 0);
+    }
+
+    // Make sure everything is the same size.
+    EXPECT_EQ(mesh_block->vertices.size(), mesh_block->normals.size());
+    EXPECT_EQ(mesh_block->vertices.size(), mesh_block->triangles.size());
+
+    // Make sure we have the same size as on the CPU as well.
+    if (mesh_block_cpu != nullptr) {
+      EXPECT_EQ(mesh_block->vertices.size(), mesh_block_cpu->vertices.size());
+    }
+    std::vector<Vector3f> vertices = mesh_block->getVertexVectorOnCPU();
+    std::vector<Vector3f> normals = mesh_block->getNormalVectorOnCPU();
+
+    // Make sure that the actual points are correct.
+    for (size_t i = 0; i < vertices.size(); i++) {
+      const Vector3f& vertex = vertices[i];
+      const Vector3f& normal = normals[i];
+
+      // Make sure the points on the plane are correct.
+      EXPECT_NEAR(vertex.x(), 0.0, kFloatEpsilon);
+      EXPECT_NEAR(normal.x(), -1.0, kFloatEpsilon);
+      EXPECT_NEAR(normal.y(), 0.0, kFloatEpsilon);
+      EXPECT_NEAR(normal.z(), 0.0, kFloatEpsilon);
+    }
+  }
+  io::outputMeshLayerToPly(*mesh_layer_, "test_mesh_gpu.ply");
+
+  std::cout << timing::Timing::Print();
+}
+
+TEST_F(MeshTest, IncrementalMesh) {
+  constexpr float kSphereRadius = 2.0f;
+  constexpr float kTrajectoryRadius = 4.0f;
+  constexpr float kTrajectoryHeight = 2.0f;
+  constexpr int kNumTrajectoryPoints = 80;
+
+  // Maximum distance to consider for scene generation.
+  constexpr float kMaxDist = 10.0;
+  constexpr float kMinWeight = 2.0;
+
+  mesh_integrator_.min_weight() = kMinWeight;
+
+  // Create a camera.
+  Camera camera(300, 300, 320, 240, 640, 480);
+
+  // Scene is bounded to -5, -5, 0 to 5, 5, 5.
+  scene_.aabb() = AxisAlignedBoundingBox(Vector3f(-5.0f, -5.0f, 0.0f),
+                                         Vector3f(5.0f, 5.0f, 5.0f));
+  // Create a scene with a ground plane, ceiling, and a sphere.
+  scene_.addGroundLevel(0.0f);
+  scene_.addCeiling(5.0f);
+  scene_.addPrimitive(
+      std::make_unique<primitives::Sphere>(Vector3f(0.0f, 0.0f, 2.0f), 2.0f));
+  // Add bounding planes at 5 meters. Basically makes it a sphere in a box.
+  scene_.addPlaneBoundaries(-5.0f, 5.0f, -5.0f, 5.0f);
+
+  // Create an integrator.
+  ProjectiveTsdfIntegrator integrator;
+  integrator.max_integration_distance_m(10.0);
+
+  // Simulate a trajectory with the requisite number of points, on a circle
+  // around the sphere.
+  const float radians_increment = 2 * M_PI / (kNumTrajectoryPoints);
+
+  // Create a depth frame. We share this memory buffer for the entire
+  // trajectory.
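+  // (Reusing one unified-memory image avoids a device allocation and free on
+  // every iteration of the loop below.)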
+  DepthImage depth_frame(camera.height(), camera.width(),
+                         MemoryType::kUnified);
+
+  for (size_t i = 0; i < kNumTrajectoryPoints; i++) {
+    const float theta = radians_increment * i;
+    // Convert polar to cartesian coordinates.
+    Vector3f cartesian_coordinates(kTrajectoryRadius * std::cos(theta),
+                                   kTrajectoryRadius * std::sin(theta),
+                                   kTrajectoryHeight);
+    // The camera has its z axis pointing towards the origin.
+    Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5);
+    Eigen::Quaternionf rotation_theta(
+        Eigen::AngleAxisf(M_PI + theta, Vector3f::UnitZ()));
+
+    // Construct a transform from camera to scene with this.
+    Transform T_S_C = Transform::Identity();
+    T_S_C.prerotate(rotation_theta * rotation_base);
+    T_S_C.pretranslate(cartesian_coordinates);
+
+    // Generate a depth image of the scene.
+    scene_.generateDepthImageFromScene(camera, T_S_C, kMaxDist, &depth_frame);
+
+    // Integrate this depth image.
+    std::vector<Index3D> updated_blocks;
+
+    integrator.integrateFrame(depth_frame, T_S_C, camera, sdf_layer_.get(),
+                              &updated_blocks);
+
+    // Integrate the mesh.
+    mesh_integrator_.integrateBlocksGPU(*sdf_layer_, updated_blocks,
+                                        mesh_layer_.get());
+  }
+
+  // Create a comparison mesh.
+  BlockLayer<MeshBlock>::Ptr batch_mesh_layer(
+      new BlockLayer<MeshBlock>(block_size_, MemoryType::kUnified));
+
+  // Compute the batch mesh.
+  mesh_integrator_.integrateMeshFromDistanceField(*sdf_layer_,
+                                                  batch_mesh_layer.get());
+
+  io::outputMeshLayerToPly(*mesh_layer_, "test_mesh_inc.ply");
+  io::outputMeshLayerToPly(*batch_mesh_layer, "test_mesh_batch.ply");
+
+  // For each block in the batch mesh, make sure we have roughly the same
+  // number of points in the incremental mesh.
+  std::vector<Index3D> all_blocks = batch_mesh_layer->getAllBlockIndices();
+
+  for (const Index3D& block : all_blocks) {
+    MeshBlock::ConstPtr batch_block = batch_mesh_layer->getBlockAtIndex(block);
+    MeshBlock::ConstPtr inc_block = mesh_layer_->getBlockAtIndex(block);
+
+    ASSERT_NE(inc_block.get(), nullptr);
+    EXPECT_EQ(batch_block->vertices.size(), inc_block->vertices.size());
+  }
+
+  std::cout << timing::Timing::Print();
+}
+
+TEST_F(MeshTest, RepeatabilityTest) {
+  const std::string base_path = "../tests/data/3dmatch";
+  constexpr int seq_id = 1;
+  DepthImage depth_image;
+  ColorImage color_image;
+  EXPECT_TRUE(datasets::load16BitDepthImage(
+      datasets::threedmatch::getPathForDepthImage(base_path, seq_id, 0),
+      &depth_image));
+  EXPECT_TRUE(datasets::load8BitColorImage(
+      datasets::threedmatch::getPathForColorImage(base_path, seq_id, 0),
+      &color_image));
+  EXPECT_EQ(depth_image.width(), color_image.width());
+  EXPECT_EQ(depth_image.height(), color_image.height());
+
+  // Parse 3x3 camera intrinsics matrix from 3D Match format: space-separated.
+  Eigen::Matrix3f camera_intrinsic_matrix;
+  EXPECT_TRUE(datasets::threedmatch::parseCameraFromFile(
+      datasets::threedmatch::getPathForCameraIntrinsics(base_path),
+      &camera_intrinsic_matrix));
+  const auto camera = Camera::fromIntrinsicsMatrix(
+      camera_intrinsic_matrix, depth_image.width(), depth_image.height());
+
+  // Integrate depth
+  sdf_layer_ = std::make_shared<TsdfLayer>(voxel_size_, MemoryType::kUnified);
+  ProjectiveTsdfIntegrator tsdf_integrator;
+  tsdf_integrator.integrateFrame(depth_image, Transform::Identity(), camera,
+                                 sdf_layer_.get());
+
+  // Generate the mesh (twice)
+  MeshLayer mesh_layer_1(block_size_, MemoryType::kUnified);
+  MeshLayer mesh_layer_2(block_size_, MemoryType::kUnified);
+  EXPECT_TRUE(mesh_integrator_.integrateMeshFromDistanceField(*sdf_layer_,
+                                                              &mesh_layer_1));
+  EXPECT_TRUE(mesh_integrator_.integrateMeshFromDistanceField(*sdf_layer_,
+                                                              &mesh_layer_2));
+
+  // Check that the generated vertices are the same.
+  auto block_indices_1 = mesh_layer_1.getAllBlockIndices();
+  auto block_indices_2 = mesh_layer_2.getAllBlockIndices();
+  EXPECT_EQ(block_indices_1.size(), block_indices_2.size());
+  for (int block_idx = 0; block_idx < block_indices_1.size(); block_idx++) {
+    const Index3D& idx_1 = block_indices_1[block_idx];
+    const Index3D& idx_2 = block_indices_2[block_idx];
+    EXPECT_TRUE((idx_1.array() == idx_2.array()).all());
+    MeshBlock::ConstPtr block_1 = mesh_layer_1.getBlockAtIndex(idx_1);
+    MeshBlock::ConstPtr block_2 = mesh_layer_2.getBlockAtIndex(idx_2);
+    EXPECT_EQ(block_1->vertices.size(), block_2->vertices.size());
+
+    auto threed_less = [](const Vector3f& p_1, const Vector3f& p_2) -> bool {
+      if (p_1.x() != p_2.x()) {
+        return p_1.x() < p_2.x();
+      }
+      if (p_1.y() != p_2.y()) {
+        return p_1.y() < p_2.y();
+      }
+      return p_1.z() < p_2.z();
+    };
+    auto vertex_vector_1 = block_1->getVertexVectorOnCPU();
+    auto vertex_vector_2 = block_2->getVertexVectorOnCPU();
+    std::sort(vertex_vector_1.begin(), vertex_vector_1.end(), threed_less);
+    std::sort(vertex_vector_2.begin(), vertex_vector_2.end(), threed_less);
+    for (int i = 0; i < vertex_vector_1.size(); i++) {
+      EXPECT_TRUE(
+          (vertex_vector_1[i].array() == vertex_vector_2[i].array()).all());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  warmupCuda();
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_mesh_coloring.cpp b/nvblox/tests/test_mesh_coloring.cpp
new file mode 100644
index 00000000..4bfbfd44
--- /dev/null
+++ b/nvblox/tests/test_mesh_coloring.cpp
@@ -0,0 +1,239 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/accessors.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/voxels.h"
+#include "nvblox/datasets/image_loader.h"
+#include "nvblox/datasets/parse_3dmatch.h"
+#include "nvblox/integrators/projective_color_integrator.h"
+#include "nvblox/integrators/projective_tsdf_integrator.h"
+#include "nvblox/io/mesh_io.h"
+#include "nvblox/mesh/mesh_integrator.h"
+#include "nvblox/primitives/scene.h"
+
+using namespace nvblox;
+
+TEST(MeshColoringTests, UniformColorSphere) {
+  // The uniform color
+  const Color kTestColor = Color::Purple();
+
+  // Layer params
+  constexpr float voxel_size_m = 0.1;
+  constexpr float block_size_m =
+      VoxelBlock<TsdfVoxel>::kVoxelsPerSide * voxel_size_m;
+
+  // TSDF layer
+  TsdfLayer tsdf_layer(voxel_size_m, MemoryType::kUnified);
+
+  // Build the test scene
+  constexpr float kSphereRadius = 2.0f;
+  constexpr float kTruncationDistanceVox = 2;
+  constexpr float kTruncationDistanceMeters =
+      kTruncationDistanceVox * voxel_size_m;
+
+  // Maximum distance to consider for scene generation.
+  constexpr float kMaxDist = 10.0;
+  constexpr float kMinWeight = 1.0;
+
+  // Tolerance for error.
+  constexpr float kDistanceErrorTolerance = kTruncationDistanceMeters;
+
+  // Scene is bounded to -5, -5, 0 to 5, 5, 5.
+  primitives::Scene scene;
+  scene.aabb() = AxisAlignedBoundingBox(Vector3f(-5.0f, -5.0f, 0.0f),
+                                        Vector3f(5.0f, 5.0f, 5.0f));
+  // Create a scene with a ground plane and a sphere.
+  scene.addGroundLevel(0.0f);
+  scene.addCeiling(5.0f);
+  scene.addPrimitive(
+      std::make_unique<primitives::Sphere>(Vector3f(0.0f, 0.0f, 2.0f), 2.0f));
+  // Add bounding planes at 5 meters. Basically makes it a sphere in a box.
+  scene.addPlaneBoundaries(-5.0f, 5.0f, -5.0f, 5.0f);
+
+  // Get the ground truth SDF for it.
+ scene.generateSdfFromScene(kTruncationDistanceMeters, &tsdf_layer); + + // Make a ColorLayer with a solid color + ColorLayer color_layer(voxel_size_m, MemoryType::kUnified); + for (const Index3D& block_idx : tsdf_layer.getAllBlockIndices()) { + ColorBlock::Ptr color_block = color_layer.allocateBlockAtIndex(block_idx); + callFunctionOnAllVoxels( + color_block.get(), + [&kTestColor](const Index3D& voxel_index, ColorVoxel* voxel) { + voxel->color = kTestColor; + }); + } + + // Generate a mesh from the "reconstruction" + MeshIntegrator mesh_integrator; + MeshLayer mesh_layer(block_size_m, MemoryType::kUnified); + EXPECT_TRUE( + mesh_integrator.integrateMeshFromDistanceField(tsdf_layer, &mesh_layer)); + mesh_integrator.colorMesh(color_layer, &mesh_layer); + + // Check that all the mesh points are correctly colored + callFunctionOnAllBlocks( + mesh_layer, + [&kTestColor](const Index3D& block_index, const MeshBlock* mesh_block) { + EXPECT_EQ(mesh_block->vertices.size(), mesh_block->colors.size()); + for (const Color& color : mesh_block->colors) { + EXPECT_EQ(color, kTestColor); + } + }); +} + +TEST(MeshColoringTests, CPUvsGPUon3DMatch) { + // Load 3dmatch image + const std::string base_path = "../tests/data/3dmatch"; + constexpr int seq_id = 1; + DepthImage depth_image_1; + ColorImage color_image_1; + EXPECT_TRUE(datasets::load16BitDepthImage( + datasets::threedmatch::getPathForDepthImage(base_path, seq_id, 0), + &depth_image_1)); + EXPECT_TRUE(datasets::load8BitColorImage( + datasets::threedmatch::getPathForColorImage(base_path, seq_id, 0), + &color_image_1)); + EXPECT_EQ(depth_image_1.width(), color_image_1.width()); + EXPECT_EQ(depth_image_1.height(), color_image_1.height()); + + // Parse 3x3 camera intrinsics matrix from 3D Match format: space-separated. 
+  Eigen::Matrix3f camera_intrinsic_matrix;
+  EXPECT_TRUE(datasets::threedmatch::parseCameraFromFile(
+      datasets::threedmatch::getPathForCameraIntrinsics(base_path),
+      &camera_intrinsic_matrix));
+  const auto camera = Camera::fromIntrinsicsMatrix(
+      camera_intrinsic_matrix, depth_image_1.width(), depth_image_1.height());
+
+  // Integrate depth
+  constexpr float kVoxelSizeM = 0.05f;
+  const float kBlockSizeM =
+      VoxelBlock<TsdfVoxel>::kVoxelsPerSide * kVoxelSizeM;
+  ProjectiveTsdfIntegrator tsdf_integrator;
+  TsdfLayer tsdf_layer(kVoxelSizeM, MemoryType::kUnified);
+  tsdf_integrator.integrateFrame(depth_image_1, Transform::Identity(), camera,
+                                 &tsdf_layer);
+
+  // Integrate Color (GPU)
+  ProjectiveColorIntegrator color_integrator;
+  ColorLayer color_layer(kVoxelSizeM, MemoryType::kUnified);
+  color_integrator.integrateFrame(color_image_1, Transform::Identity(), camera,
+                                  tsdf_layer, &color_layer);
+
+  // Generate a mesh from the "reconstruction"
+  MeshIntegrator mesh_integrator;
+  MeshLayer mesh_layer_colored_on_gpu(kBlockSizeM, MemoryType::kUnified);
+  EXPECT_TRUE(mesh_integrator.integrateMeshFromDistanceField(
+      tsdf_layer, &mesh_layer_colored_on_gpu));
+
+  // Copy the mesh
+  MeshLayer mesh_layer_colored_on_cpu(kBlockSizeM, MemoryType::kUnified);
+  for (const Index3D& block_idx :
+       mesh_layer_colored_on_gpu.getAllBlockIndices()) {
+    MeshBlock::ConstPtr mesh_block =
+        mesh_layer_colored_on_gpu.getBlockAtIndex(block_idx);
+    MeshBlock::Ptr mesh_block_copy =
+        mesh_layer_colored_on_cpu.allocateBlockAtIndex(block_idx);
+    mesh_block_copy->vertices =
+        unified_vector<Vector3f>(mesh_block->vertices);
+    mesh_block_copy->normals = unified_vector<Vector3f>(mesh_block->normals);
+  }
+
+  // Color on GPU and CPU
+  mesh_integrator.colorMeshGPU(color_layer, &mesh_layer_colored_on_gpu);
+  mesh_integrator.colorMeshCPU(color_layer, &mesh_layer_colored_on_cpu);
+
+  // Compare colors between the two implementations
+  int num_same = 0;
+  int num_diff = 0;
+  int num_diff_outside = 0;
+  int total_vertices = 0;
+
+  auto block_indices_gpu = mesh_layer_colored_on_gpu.getAllBlockIndices();
+  auto block_indices_cpu = mesh_layer_colored_on_cpu.getAllBlockIndices();
+  EXPECT_EQ(block_indices_gpu.size(), block_indices_cpu.size());
+  for (int idx = 0; idx < block_indices_gpu.size(); idx++) {
+    const Index3D& block_idx = block_indices_gpu[idx];
+
+    MeshBlock::ConstPtr block_gpu =
+        mesh_layer_colored_on_gpu.getBlockAtIndex(block_idx);
+    MeshBlock::ConstPtr block_cpu =
+        mesh_layer_colored_on_cpu.getBlockAtIndex(block_idx);
+    CHECK(block_gpu);
+    CHECK(block_cpu);
+
+    EXPECT_EQ(block_gpu->vertices.size(), block_cpu->vertices.size());
+    EXPECT_EQ(block_gpu->colors.size(), block_cpu->colors.size());
+    EXPECT_EQ(block_gpu->vertices.size(), block_gpu->colors.size());
+    for (int i = 0; i < block_gpu->colors.size(); i++) {
+      EXPECT_TRUE(
+          (block_gpu->vertices[i].array() == block_cpu->vertices[i].array())
+              .all());
+      if (block_gpu->colors[i] == block_cpu->colors[i]) {
+        num_same++;
+      } else {
+        num_diff++;
+        // OK so we have different colors at this vertex. This CAN occur
+        // because of vertices that leave block boundaries (for speed we only
+        // take the closest color voxel in the corresponding block).
+        // Let's check that this is indeed the case here.
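+        // (A vertex whose block-frame position exceeds kVoxelsPerSide voxels
+        // lies outside this block, so its nearest color voxel legitimately
+        // differs between the two implementations.)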
+
+        // Calculate the position of this vertex in the block
+        const Vector3f p_V_B_m =
+            block_cpu->vertices[i] -
+            getPositionFromBlockIndex(kBlockSizeM, block_idx);
+        const Vector3f p_V_B_vox = p_V_B_m / kVoxelSizeM;
+        if ((p_V_B_vox.array() > VoxelBlock<TsdfVoxel>::kVoxelsPerSide)
+                .any()) {
+          num_diff_outside++;
+        }
+      }
+    }
+    total_vertices += block_cpu->vertices.size();
+  }
+
+  // OK so there are a few vertices which get different colors and are WITHIN
+  // block boundaries. From looking at them, this is because they sit exactly
+  // on voxel boundaries and presumably are rounded differently in the CPU and
+  // GPU implementations. Let's just check that the number of such occurrences
+  // is exceedingly small.
+  constexpr float kAllowablePercentageDifferingColorWithinBlock = 0.1f;  // 0.1%
+  const float percentage_different =
+      100.0f * (num_diff - num_diff_outside) / total_vertices;
+  EXPECT_LT(percentage_different,
+            kAllowablePercentageDifferingColorWithinBlock);
+
+  std::cout << "Number of vertices assigned the SAME color between CPU and GPU "
+               "implementations: "
+            << num_same << std::endl;
+  std::cout << "Number of vertices assigned DIFFERING color between CPU and "
+               "GPU implementations: "
+            << num_diff << std::endl;
+  std::cout << "of these, number with vertices outside block boundaries: "
+            << num_diff_outside << std::endl;
+  std::cout
+      << "The percentage of vertices with different colors within blocks: "
+      << percentage_different << "%" << std::endl;
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/nvblox/tests/test_nvtx_ranges.cpp b/nvblox/tests/test_nvtx_ranges.cpp
new file mode 100644
index 00000000..266d7bd4
--- /dev/null
+++ b/nvblox/tests/test_nvtx_ranges.cpp
@@ -0,0 +1,75 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/unified_vector.h"
+#include "nvblox/utils/nvtx_ranges.h"
+#include "nvblox/utils/timing.h"
+
+#include "nvblox/tests/test_utils_cuda.h"
+
+using namespace nvblox;
+
+// NOTE(alexmillane): This file doesn't really test NvtxRange per se. The
+// intent is that this test generates an nsight systems trace for inspection.
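+
+// NOTE(editor): For orientation, an RAII NVTX range of this kind is typically
+// a thin wrapper over the NVTX C API (nvtxRangePushA / nvtxRangePop), which
+// ships with the CUDA toolkit. A minimal sketch is shown below; the class
+// name ScopedNvtxRangeSketch is hypothetical and this is illustrative only,
+// not nvblox's actual NvtxRange implementation:
+//
+//   #include <nvToolsExt.h>
+//
+//   class ScopedNvtxRangeSketch {
+//    public:
+//     // Pushes a named range onto the NVTX stack for the current thread.
+//     explicit ScopedNvtxRangeSketch(const char* name) {
+//       nvtxRangePushA(name);
+//     }
+//     // Pops the range; the span between push and pop shows up as a labeled
+//     // bar on the Nsight Systems timeline (e.g. under
+//     // `nsys profile ./test_nvtx_ranges`).
+//     ~ScopedNvtxRangeSketch() { nvtxRangePop(); }
+//   };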
+
+TEST(NvtxTest, Ranges) {
+  timing::NvtxRange whole_test_nvtx_range("whole_test", Color::Orange());
+
+  constexpr int kSize = 1e6;
+  constexpr int kInitialValue = 0;
+  unified_vector<int> vec(kSize, kInitialValue);
+
+  constexpr int kNumAdds = 10;
+  for (int i = 0; i < kNumAdds; i++) {
+    timing::NvtxRange nvtx_range("add one idx: " + std::to_string(i),
+                                 Color::Red());
+    test_utils::addOneToAllGPU(&vec);
+  }
+
+  timing::NvtxRange nvtx_range("check", Color::Green());
+  EXPECT_TRUE(
+      test_utils::checkVectorAllConstant(vec, kInitialValue + kNumAdds));
+  nvtx_range.Stop();
+}
+
+TEST(NvtxTest, TimerNvtx) {
+  constexpr int kSize = 1e6;
+  constexpr int kInitialValue = 0;
+  unified_vector<int> vec(kSize, kInitialValue);
+
+  constexpr int kNumAdds = 10;
+  for (int i = 0; i < kNumAdds; i++) {
+    timing::mark("add:" + std::to_string(i), Color::Red());
+    timing::TimerNvtx timer("add_one");
+    test_utils::addOneToAllGPU(&vec);
+  }
+
+  timing::TimerNvtx timer("check", Color::Green());
+  EXPECT_TRUE(
+      test_utils::checkVectorAllConstant(vec, kInitialValue + kNumAdds));
+  timer.Stop();
+
+  std::cout << timing::Timing::Print() << std::endl;
+}
+
+int main(int argc, char** argv) {
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_ray_caster.cpp b/nvblox/tests/test_ray_caster.cpp
new file mode 100644
index 00000000..d18f2135
--- /dev/null
+++ b/nvblox/tests/test_ray_caster.cpp
@@ -0,0 +1,117 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+#include <vector>
+
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/integrators/ray_caster.h"
+#include "nvblox/utils/timing.h"
+
+using namespace nvblox;
+
+class RayCasterTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    timing::Timing::Reset();
+    std::srand(0);
+  }
+
+  static constexpr float kFloatEpsilon = 1e-4;
+};
+
+TEST_F(RayCasterTest, StraightAheadCast) {
+  Vector3f start(0.0f, 0.0f, 0.0f);
+  Vector3f end(5.0f, 0.0f, 0.0f);
+  RayCaster raycaster(start, end);
+
+  std::vector<Index3D> all_indices;
+  raycaster.getAllIndices(&all_indices);
+
+  EXPECT_EQ(all_indices.size(), 6);
+
+  // Next make a scaled raycaster.
+  float scale = 2.0f;
+  RayCaster scaled_raycaster(scale * start, scale * end, scale);
+  std::vector<Index3D> all_scaled_indices;
+  scaled_raycaster.getAllIndices(&all_scaled_indices);
+
+  ASSERT_EQ(all_indices.size(), all_scaled_indices.size());
+
+  // Finally, negative.
+  RayCaster negative_raycaster(-start, -end);
+  std::vector<Index3D> all_negative_indices;
+  negative_raycaster.getAllIndices(&all_negative_indices);
+
+  ASSERT_EQ(all_indices.size(), all_negative_indices.size());
+
+  for (size_t i = 0; i < all_indices.size(); i++) {
+    EXPECT_EQ(all_indices[i], all_scaled_indices[i]);
+    EXPECT_EQ(all_indices[i], -all_negative_indices[i]);
+    EXPECT_EQ(all_indices[i].y(), 0);
+    EXPECT_EQ(all_indices[i].z(), 0);
+  }
+}
+
+TEST_F(RayCasterTest, ObliqueCast) {
+  Vector3f start(0.5f, -1.1f, 3.1f);
+  Vector3f end(5.1f, 0.2f, 2.1f);
+  RayCaster raycaster(start, end);
+
+  std::vector<Index3D> all_indices;
+  raycaster.getAllIndices(&all_indices);
+
+  // Next make a scaled raycaster.
+  float scale = 2.0f;
+  RayCaster scaled_raycaster(scale * start, scale * end, scale);
+  std::vector<Index3D> all_scaled_indices;
+  scaled_raycaster.getAllIndices(&all_scaled_indices);
+
+  ASSERT_EQ(all_indices.size(), all_scaled_indices.size());
+
+  // Finally, backwards.
+  RayCaster backwards_raycaster(end, start);
+  std::vector<Index3D> all_backwards_indices;
+  backwards_raycaster.getAllIndices(&all_backwards_indices);
+
+  ASSERT_EQ(all_indices.size(), all_backwards_indices.size());
+
+  for (size_t i = 0; i < all_indices.size(); i++) {
+    EXPECT_EQ(all_indices[i], all_scaled_indices[i]);
+    EXPECT_EQ(all_indices[i],
+              all_backwards_indices[all_indices.size() - i - 1]);
+  }
+}
+
+TEST_F(RayCasterTest, Length0Cast) {
+  Vector3f start(0.0f, 0.0f, 0.0f);
+  Vector3f end = start;
+  RayCaster raycaster(start, end);
+
+  std::vector<Index3D> all_indices;
+  raycaster.getAllIndices(&all_indices);
+
+  EXPECT_EQ(all_indices.size(), 1);
+  EXPECT_EQ(all_indices[0], Index3D::Zero());
+}
+
+int main(int argc, char** argv) {
+  warmupCuda();
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/nvblox/tests/test_scene.cpp b/nvblox/tests/test_scene.cpp
new file mode 100644
index 00000000..92d54b77
--- /dev/null
+++ b/nvblox/tests/test_scene.cpp
@@ -0,0 +1,150 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "nvblox/io/csv.h"
+#include "nvblox/primitives/scene.h"
+
+using namespace nvblox;
+
+constexpr float kFloatEpsilon = 1e-4;
+
+class SceneTest : public ::testing::Test {
+ protected:
+  SceneTest() : camera_(Camera(fu_, fv_, cu_, cv_, width_, height_)) {}
+
+  void SetUp() override {
+    // Make the scene 6x6x3 meters big.
+    scene_.aabb() = AxisAlignedBoundingBox(Vector3f(-3.0f, -3.0f, 0.0f),
+                                           Vector3f(3.0f, 3.0f, 3.0f));
+  }
+
+  // A simulation scene.
+  primitives::Scene scene_;
+
+  // Camera parameters.
+  constexpr static float fu_ = 300;
+  constexpr static float fv_ = 300;
+  constexpr static int width_ = 640;
+  constexpr static int height_ = 480;
+  constexpr static float cu_ = static_cast<float>(width_) / 2.0f;
+  constexpr static float cv_ = static_cast<float>(height_) / 2.0f;
+  Camera camera_;
+};
+
+TEST_F(SceneTest, BlankMap) {
+  constexpr float max_dist = 1.0;
+
+  // Ensure that we don't get any distances back.
+  Vector3f point = Vector3f::Zero();
+
+  float dist = scene_.getSignedDistanceToPoint(point, max_dist);
+
+  EXPECT_NEAR(dist, max_dist, kFloatEpsilon);
+
+  // Ensure that we get a blank image.
+  DepthImage depth_frame(camera_.height(), camera_.width(),
+                         MemoryType::kUnified);
+  Transform T_S_C = Transform::Identity();
+  scene_.generateDepthImageFromScene(camera_, T_S_C, max_dist, &depth_frame);
+
+  // Check all the pixels.
+  for (int lin_idx = 0; lin_idx < depth_frame.numel(); lin_idx++) {
+    EXPECT_NEAR(depth_frame(lin_idx), 0.0f, kFloatEpsilon);
+  }
+}
+
+TEST_F(SceneTest, PlaneScene) {
+  constexpr float max_dist = 2.0;
+
+  // Create a scene that's just a plane.
+  // Plane at 1.0 in the positive z direction, pointing in -z.
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(0.0f, 0.0, 1.0), Vector3f(0, 0, -1)));
+
+  // Get a camera pointing to this plane from the origin.
+  Transform T_S_C = Transform::Identity();
+
+  DepthImage depth_frame(camera_.height(), camera_.width(),
+                         MemoryType::kUnified);
+  scene_.generateDepthImageFromScene(camera_, T_S_C, max_dist, &depth_frame);
+
+  // Check all the pixels.
+  for (int lin_idx = 0; lin_idx < depth_frame.numel(); lin_idx++) {
+    EXPECT_NEAR(depth_frame(lin_idx), 1.0f, kFloatEpsilon);
+  }
+}
+
+TEST_F(SceneTest, PlaneSceneVertical) {
+  constexpr float max_dist = 2.0;
+
+  // Create a scene that's just a plane.
+  // Plane at 1.0 in the positive x direction, pointing in -x.
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(1.0f, 0.0, 0.0), Vector3f(-1, 0, 0)));
+
+  // Get a camera pointing to this plane from the origin.
+  Eigen::Quaternionf rotation =
+      Eigen::Quaternionf::FromTwoVectors(Vector3f(0, 0, 1), Vector3f(1, 0, 0));
+
+  Transform T_S_C = Transform::Identity();
+  T_S_C.prerotate(rotation);
+
+  DepthImage depth_frame(camera_.height(), camera_.width(),
+                         MemoryType::kUnified);
+  scene_.generateDepthImageFromScene(camera_, T_S_C, max_dist, &depth_frame);
+  io::writeToCsv("test_plane_scene_vertical.csv", depth_frame);
+
+  // Check all the pixels.
+  for (int lin_idx = 0; lin_idx < depth_frame.numel(); lin_idx++) {
+    EXPECT_NEAR(depth_frame(lin_idx), 1.0f, kFloatEpsilon);
+  }
+}
+
+TEST_F(SceneTest, PlaneSceneVerticalOffset) {
+  constexpr float max_dist = 4.0;
+
+  // Create a scene that's just a plane.
+  // Plane at 1.0 in the positive x direction, pointing in -x.
+  scene_.addPrimitive(std::make_unique<primitives::Plane>(
+      Vector3f(1.0f, 0.0, 0.0), Vector3f(-1, 0, 0)));
+
+  // Get a camera pointing to this plane from (-1, 0, 0).
+  Eigen::Quaternionf rotation =
+      Eigen::Quaternionf::FromTwoVectors(Vector3f(0, 0, 1), Vector3f(1, 0, 0));
+  Vector3f translation(-1, 0, 0);
+  Transform T_S_C = Transform::Identity();
+  T_S_C.prerotate(rotation.normalized());
+  T_S_C.pretranslate(translation);
+
+  DepthImage depth_frame(camera_.height(), camera_.width(),
+                         MemoryType::kUnified);
+  scene_.generateDepthImageFromScene(camera_, T_S_C, max_dist, &depth_frame);
+
+  // Check all the pixels.
+  for (int lin_idx = 0; lin_idx < depth_frame.numel(); lin_idx++) {
+    EXPECT_NEAR(depth_frame(lin_idx), 2.0f, kFloatEpsilon);
+  }
+}
+
+int main(int argc, char** argv) {
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_sphere_tracing.cpp b/nvblox/tests/test_sphere_tracing.cpp
new file mode 100644
index 00000000..13f14921
--- /dev/null
+++ b/nvblox/tests/test_sphere_tracing.cpp
@@ -0,0 +1,483 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/camera.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/voxels.h"
+#include "nvblox/integrators/projective_tsdf_integrator.h"
+#include "nvblox/primitives/primitives.h"
+#include "nvblox/primitives/scene.h"
+#include "nvblox/ray_tracing/sphere_tracer.h"
+#include "nvblox/utils/timing.h"
+
+#include "nvblox/tests/gpu_image_routines.h"
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+class SphereTracingTest : public ::testing::Test {
+ protected:
+  SphereTracingTest()
+      : layer_(std::make_shared<TsdfLayer>(voxel_size_m_,
+                                           MemoryType::kUnified)) {
+    constexpr float fu = 300;
+    constexpr float fv = 300;
+    constexpr int width = 640;
+    constexpr int height = 480;
+    constexpr float cu = static_cast<float>(width) / 2.0f;
+    constexpr float cv = static_cast<float>(height) / 2.0f;
+    camera_ptr_ = std::make_unique<Camera>(fu, fv, cu, cv, width, height);
+  }
+
+  // Layers
+  constexpr static float voxel_size_m_ = 0.05f;
+  std::shared_ptr<TsdfLayer> layer_;
+
+  // Integration/Ground-truth distance computation
+  constexpr static float truncation_distance_vox_ = 4.0f;
+  constexpr static float truncation_distance_m_ =
+      truncation_distance_vox_ * voxel_size_m_;
+
+  // Test camera
+  std::unique_ptr<Camera> camera_ptr_;
+
+  // Scenes
+  constexpr static float scene_sphere_radius_ = 2.0f;
+  const Vector3f scene_sphere_center_ = Vector3f(0.0f, 0.0f, 2.0f);
+  primitives::Scene getSphereInBoxScene() {
+    // Scene is bounded to -6, -6, -1 to 6, 6, 6.
+    // NOTE(alexmillane): I increased the size to 6 here because if the scene
+    // has walls on the very edge, the distance just through the wall is not
+    // allocated, making the ray marching go straight through the wall. Note
+    // that in reality, we will always have some thickness to the wall (the
+    // integrator ensures that).
+    primitives::Scene scene;
+    scene.aabb() = AxisAlignedBoundingBox(Vector3f(-6.0f, -6.0f, -1.0f),
+                                          Vector3f(6.0f, 6.0f, 6.0f));
+    scene.addGroundLevel(0.0f);
+    scene.addCeiling(5.0f);
+    scene.addPrimitive(std::make_unique<primitives::Sphere>(
+        scene_sphere_center_, scene_sphere_radius_));
+    scene.addPlaneBoundaries(-5.0f, 5.0f, -5.0f, 5.0f);
+    return scene;
+  }
+  Transform getRandomViewpointInSphereInBoxScene() {
+    // - position
+    //   - inside the (-5, -5, 0) to (5, 5, 5) bounds,
+    //   - and not inside the sphere in the center
+    // - orientation
+    //   - random
+    auto is_inside_sphere = [this](const Vector3f& p) -> bool {
+      return (p - scene_sphere_center_).norm() <= scene_sphere_radius_;
+    };
+    Vector3f p_L = Vector3f::Zero();
+    while (is_inside_sphere(p_L)) {
+      p_L = Vector3f(test_utils::randomFloatInRange(-4.75, 4.75),
+                     test_utils::randomFloatInRange(-4.75, 4.75),
+                     test_utils::randomFloatInRange(0.25, 4.75));
+    }
+    Transform T_S_C = Transform::Identity();
+    T_S_C.prerotate(
+        Eigen::AngleAxisf(test_utils::randomFloatInRange(-M_PI, M_PI),
+                          test_utils::getRandomUnitVector3f())
+            .toRotationMatrix());
+    T_S_C.pretranslate(p_L);
+    return T_S_C;
+  }
+  void getSphereSceneReconstruction(const primitives::Scene& scene,
+                                    TsdfLayer* layer_ptr) {
+    // Objects to perform the reconstruction
+    ProjectiveTsdfIntegrator integrator;
+    integrator.truncation_distance_vox(truncation_distance_vox_);
+    DepthImage depth_image(camera_ptr_->height(), camera_ptr_->width(),
+                           MemoryType::kUnified);
+
+    // Fuse in viewpoints
+    constexpr float kTrajectoryRadius = 4.0f;
+    constexpr float kTrajectoryHeight = 2.0f;
+    constexpr int kNumTrajectoryPoints = 80;
+    const float radians_increment = 2 * M_PI / (kNumTrajectoryPoints);
+    for (size_t i = 0; i < kNumTrajectoryPoints; i++) {
+      const float theta = radians_increment * i;
+      // Convert polar to cartesian coordinates.
+      Vector3f cartesian_coordinates(kTrajectoryRadius * std::cos(theta),
+                                     kTrajectoryRadius * std::sin(theta),
+                                     kTrajectoryHeight);
+      // The camera has its z axis pointing towards the origin.
+      Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5);
+      Eigen::Quaternionf rotation_theta(
+          Eigen::AngleAxisf(M_PI + theta, Vector3f::UnitZ()));
+
+      // Construct a transform from camera to scene with this.
+      Transform T_S_C = Transform::Identity();
+      T_S_C.prerotate(rotation_theta * rotation_base);
+      T_S_C.pretranslate(cartesian_coordinates);
+
+      // Generate a depth image of the scene.
+      SphereTracer::Params params;
+      scene.generateDepthImageFromScene(
+          *camera_ptr_, T_S_C, params.maximum_ray_length_m, &depth_image);
+
+      // Integrate this depth image.
+      integrator.integrateFrame(depth_image, T_S_C, *camera_ptr_, layer_ptr);
+    }
+  }
+};
+
+class SphereTracingInScaledDistanceFieldTest
+    : public SphereTracingTest,
+      public ::testing::WithParamInterface<float> {
+ protected:
+  // Yo dawg I heard you like params
+};
+
+TEST_P(SphereTracingInScaledDistanceFieldTest, PlaneTest) {
+  // Get the test param
+  const float distance_scaling = GetParam();
+
+  // Test params
+  constexpr float kAllowableErrorInDistanceAlongRay = voxel_size_m_;
+
+  constexpr float kVolumeHalfSize = 5.0f;
+  constexpr float kMinDistanceFromPlane = 1.0f;
+  constexpr float kMinDistanceFromVolumeEdges = 0.5;
+
+  // Scene
+  primitives::Scene scene;
+  scene.aabb() = AxisAlignedBoundingBox(
+      Vector3f(-kVolumeHalfSize, -kVolumeHalfSize, -kVolumeHalfSize),
+      Vector3f(kVolumeHalfSize, kVolumeHalfSize, kVolumeHalfSize));
+  // Create a scene with a plane in the center.
+  Vector3f plane_center(0.0f, 0.0f, 0.0f);
+  Vector3f plane_normal(-1.0f, 0.0f, 0.0f);
+  scene.addPrimitive(
+      std::make_unique<primitives::Plane>(plane_center, plane_normal));
+
+  // Get the ground truth SDF for it.
+  const float truncation_distance_m = 5.0;
+  scene.generateSdfFromScene(truncation_distance_m, layer_.get());
+
+  // Scale the distance for all voxels in the layer
+  auto scaling_lambda = [distance_scaling](const Index3D&, const Index3D&,
+                                           TsdfVoxel* voxel) {
+    voxel->distance *= distance_scaling;
+  };
+  callFunctionOnAllVoxels(layer_.get(), scaling_lambda);
+
+  // Sphere tracer
+  SphereTracer sphere_tracer_gpu;
+
+  // Test a number of random rays
+  constexpr int kNumTests = 1000;
+  int num_converged = 0;
+  int num_error_too_high = 0;
+  for (int i = 0; i < kNumTests; i++) {
+    // Get a random point on the plane (and in volume)
+    Vector3f p_plane_L(0.0f,  // NOLINT
+                       test_utils::randomFloatInRange(
+                           -(kVolumeHalfSize - kMinDistanceFromVolumeEdges),
+                           (kVolumeHalfSize - kMinDistanceFromVolumeEdges)),
+                       test_utils::randomFloatInRange(
+                           -(kVolumeHalfSize - kMinDistanceFromVolumeEdges),
+                           (kVolumeHalfSize - kMinDistanceFromVolumeEdges)));
+
+    // Get a random point in the volume (positive side of the plane)
+    Vector3f p_volume_L(test_utils::randomFloatInRange(
+                            -(kVolumeHalfSize - kMinDistanceFromVolumeEdges),
+                            -kMinDistanceFromPlane),
+                        test_utils::randomFloatInRange(
+                            -(kVolumeHalfSize - kMinDistanceFromVolumeEdges),
+                            (kVolumeHalfSize - kMinDistanceFromVolumeEdges)),
+                        test_utils::randomFloatInRange(
+                            -(kVolumeHalfSize - kMinDistanceFromVolumeEdges),
+                            (kVolumeHalfSize - kMinDistanceFromVolumeEdges)));
+
+    // Ray between these points
+    const Ray ray{(p_plane_L - p_volume_L).normalized(), p_volume_L};
+
+    // March
+    float t;
+    const bool marcher_converged =
+        sphere_tracer_gpu.castOnGPU(ray, *layer_, truncation_distance_m_, &t);
+    EXPECT_TRUE(marcher_converged);
+    if (!marcher_converged) {
+      continue;
+    }
+
+    // GT
+    float distance_gt;
+    Vector3f ray_intersection_gt;
+    constexpr float kMaxDist = 20.0f;
+    EXPECT_TRUE(scene.getRayIntersection(ray.origin, ray.direction, kMaxDist,
+                                         &ray_intersection_gt, &distance_gt));
+
+    // Check
+    const float error = std::abs(t - distance_gt);
+    bool error_too_high = error > kAllowableErrorInDistanceAlongRay;
+
+    // Tally
+    if (marcher_converged) ++num_converged;
+    if (error_too_high) ++num_error_too_high;
+  }
+
+  constexpr float kAllowableErrorTooHighPercentage = 0.5f;
+  float percentage_error_too_high = num_error_too_high * 100.0f / kNumTests;
+  EXPECT_LT(percentage_error_too_high, kAllowableErrorTooHighPercentage);
+
+  LOG(INFO) << "num_converged: " << num_converged << " / " << kNumTests;
+  LOG(INFO) << "num_error_too_high: " << num_error_too_high << " / "
+            << kNumTests;
+}
+
+INSTANTIATE_TEST_CASE_P(PlaneTests, SphereTracingInScaledDistanceFieldTest,
+                        ::testing::Values(0.9f, 1.0f, 1.1f));
+
+enum class DistanceFieldType { kGroundTruth, kReconstruction };
+
+class SphereTracingInSphereSceneTest
+    : public SphereTracingTest,
+      public ::testing::WithParamInterface<DistanceFieldType> {
+ protected:
+  // Yo dawg I heard you like params
+};
+
+TEST_P(SphereTracingInSphereSceneTest, SphereSceneTests) {
+  // Scene
+  primitives::Scene scene = getSphereInBoxScene();
+
+  // Distance Field type
+  const DistanceFieldType field_type = GetParam();
+
+  // Distance field (GT or reconstruction)
+  if (field_type == DistanceFieldType::kGroundTruth) {
+    std::cout << "Testing on Ground Truth distance field." << std::endl;
+    scene.generateSdfFromScene(truncation_distance_m_, layer_.get());
+  } else {
+    std::cout << "Testing on reconstructed distance field." << std::endl;
+    getSphereSceneReconstruction(scene, layer_.get());
+  }
+
+  // Sphere tracer
+  SphereTracer sphere_tracer_gpu;
+
+  // Declare the images here so we have access to them after the tests
+  std::shared_ptr<DepthImage> depth_image_sphere_traced_ptr;
+  DepthImage depth_frame_gt(camera_ptr_->height(), camera_ptr_->width(),
+                            MemoryType::kUnified);
+  DepthImage diff;
+
+  constexpr int kNumImages = 10;
+  for (int i = 0; i < kNumImages; i++) {
+    // Random view points
+    const Transform T_S_C = getRandomViewpointInSphereInBoxScene();
+
+    // Generate a sphere traced image
+    timing::Timer render_timer("render");
+    depth_image_sphere_traced_ptr = sphere_tracer_gpu.renderImageOnGPU(
+        *camera_ptr_, T_S_C, *layer_, truncation_distance_m_,
+        MemoryType::kUnified);
+    render_timer.Stop();
+
+    // Generate a GT image
+    scene.generateDepthImageFromScene(
+        *camera_ptr_, T_S_C, sphere_tracer_gpu.params().maximum_ray_length_m,
+        &depth_frame_gt);
+
+    // Error image
+    test_utils::getDifferenceImageOnGPU(*depth_image_sphere_traced_ptr,
+                                        depth_frame_gt, &diff);
+
+    // Count the number of error pixels.
+    // NOTE(alexmillane): We ignore rays that do not converge (this occurs in
+    // small regions of the reconstruction test).
+    constexpr float kErrorPixelThreshold = 4.0f * voxel_size_m_;
+    int num_error_pixels = 0;
+    int num_rays_converged = 0;
+    for (int i = 0; i < diff.numel(); i++) {
+      if ((*depth_image_sphere_traced_ptr)(i) > 0.0f) {
+        ++num_rays_converged;
+        if (diff(i) > kErrorPixelThreshold) {
+          ++num_error_pixels;
+        }
+      }
+    }
+    const float percentage_rays_converged =
+        static_cast<float>(num_rays_converged) * 100.0f / diff.numel();
+    const float percentage_error_pixels =
+        static_cast<float>(num_error_pixels) * 100.0f / diff.numel();
+    std::cout << "percentage of rays converged = "
+              << percentage_rays_converged << "%" << std::endl;
+    std::cout << "number of pixels with error > " << kErrorPixelThreshold
+              << ": " << num_error_pixels << " = " << percentage_error_pixels
+              << "%" << std::endl;
+
+    constexpr float kPercentagePixelsFailThreshold = 2.0f;  // percent!
+    EXPECT_LT(percentage_error_pixels, kPercentagePixelsFailThreshold);
+
+    // In the groundtruth distance field we expect all rays to converge
+    if (field_type == DistanceFieldType::kGroundTruth) {
+      EXPECT_GT(percentage_rays_converged, 99.5f);
+    }
+  }
+
+  std::cout << timing::Timing::Print() << std::endl;
+
+  // Write Images
+  io::writeToCsv("sphere_tracing_image.txt", *depth_image_sphere_traced_ptr);
+  io::writeToCsv("sphere_tracing_gt.txt", depth_frame_gt);
+  io::writeToCsv("sphere_tracing_diff.txt", diff);
+
+  // Write Scene
+  // io::outputVoxelLayerToPly(*layer_, "sphere_tracing_scene.ply");
+}
+
+INSTANTIATE_TEST_CASE_P(SphereSceneTests, SphereTracingInSphereSceneTest,
+                        ::testing::Values(DistanceFieldType::kGroundTruth,
+                                          DistanceFieldType::kReconstruction));
+
+void generateErrorImageFromSubsampledImage(const DepthImage& original,
+                                           const DepthImage& subsampled,
+                                           DepthImage* diff) {
+  CHECK_EQ(original.rows(), diff->rows());
+  CHECK_EQ(original.cols(), diff->cols());
+  CHECK_EQ(original.rows() % subsampled.rows(), 0);
+  CHECK_EQ(original.cols() % subsampled.cols(), 0);
+
+  const int subsampling_factor = original.rows() / subsampled.rows();
+  CHECK_EQ(original.cols() / subsampled.cols(), subsampling_factor);
+
+  for (int row_idx = 0; row_idx < original.rows(); row_idx++) {
+    for (int col_idx = 0; col_idx < original.cols(); col_idx++) {
+      const Vector2f u_px_subsampled(
+          static_cast<float>(col_idx) /
+                  static_cast<float>(subsampling_factor) +
+              0.5,
+          static_cast<float>(row_idx) /
+                  static_cast<float>(subsampling_factor) +
+              0.5);
+
+      const float original_depth = original(row_idx, col_idx);
+      const bool original_success = original_depth > 0.0f;
+
+      float subsampled_depth;
+      const bool subsampled_success = interpolation::interpolate2DLinear(
+          subsampled, u_px_subsampled, &subsampled_depth);
+
+      if (original_success && subsampled_success) {
+        const float depth_error = std::abs(original_depth - subsampled_depth);
+        (*diff)(row_idx, col_idx) = depth_error;
+      } else {
+        (*diff)(row_idx, col_idx) = -1.0f;
+      }
+    }
+  }
+}
+
+float getPercentageErrorPixels(const DepthImage& diff,
+                               const float error_threshold) {
+  int num_error_pixels = 0;
+  for (int i = 0; i < diff.numel(); i++) {
+    if (diff(i) > 0.0f) {
+      if (diff(i) > error_threshold) {
+        ++num_error_pixels;
+      }
+    }
+  }
+  return static_cast<float>(num_error_pixels) /
+         static_cast<float>(diff.numel()) * 100.0f;
+}
+
+TEST_F(SphereTracingTest, SubsamplingTest) {
+  // Scene
+  primitives::Scene scene = getSphereInBoxScene();
+
+  // Distance field (GT or reconstruction)
+  getSphereSceneReconstruction(scene, layer_.get());
+
+  // Sphere tracer
+  SphereTracer sphere_tracer_gpu;
+
+  // Declare the images here so we have access to them after the tests
+  DepthImage depth_image_full;
+  DepthImage depth_image_half;
+  DepthImage depth_image_quarter;
+  DepthImage diff(camera_ptr_->rows(), camera_ptr_->cols(),
+                  MemoryType::kUnified);
+
+  constexpr int kNumImages = 10;
+  for (int i = 0; i < kNumImages; i++) {
+    // Random view points
+    const Transform T_S_C = getRandomViewpointInSphereInBoxScene();
+
+    // Generate a sphere traced image
+    timing::Timer render_timer_full("render/full");
+    depth_image_full = *sphere_tracer_gpu.renderImageOnGPU(
+        *camera_ptr_, T_S_C, *layer_, truncation_distance_m_,
+        MemoryType::kUnified);
+    render_timer_full.Stop();
+
+    timing::Timer render_timer_half("render/half");
+    depth_image_half = *sphere_tracer_gpu.renderImageOnGPU(
+        *camera_ptr_, T_S_C, *layer_, truncation_distance_m_,
+        MemoryType::kUnified, 2);
+    render_timer_half.Stop();
+
+    timing::Timer render_timer_quarter("render/quarter");
+    depth_image_quarter = *sphere_tracer_gpu.renderImageOnGPU(
+        *camera_ptr_, T_S_C, *layer_, truncation_distance_m_,
+        MemoryType::kUnified, 4);
+    render_timer_quarter.Stop();
+
+    // Errors
+    constexpr float kErrorPixelThreshold = 0.5;
+    generateErrorImageFromSubsampledImage(depth_image_full, depth_image_half,
+                                          &diff);
+    const float percentage_error_pixels_half =
+        getPercentageErrorPixels(diff, kErrorPixelThreshold);
+
+    generateErrorImageFromSubsampledImage(depth_image_full,
+                                          depth_image_quarter, &diff);
+    const float percentage_error_pixels_quarter =
+        getPercentageErrorPixels(diff, kErrorPixelThreshold);
+
+    std::cout << "percentage_error_pixels_half: "
+              << percentage_error_pixels_half << std::endl;
+    std::cout << "percentage_error_pixels_quarter: "
+              << percentage_error_pixels_quarter << std::endl;
+
+    EXPECT_LT(percentage_error_pixels_half, 2.0f);
+    EXPECT_LT(percentage_error_pixels_quarter, 4.0f);
+  }
+
+  std::cout << timing::Timing::Print() << std::endl;
+
+  // Write Images
+  io::writeToCsv("sphere_tracing_image_full.csv", depth_image_full);
+  io::writeToCsv("sphere_tracing_image_half.csv", depth_image_half);
+  io::writeToCsv("sphere_tracing_image_quarter.csv", depth_image_quarter);
+  io::writeToCsv("sphere_tracing_image_subsampling_diff.csv", diff);
+}
+
+int main(int argc, char** argv) {
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  warmupCuda();
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_traits.cpp b/nvblox/tests/test_traits.cpp
new file mode 100644
index 00000000..88111c25
--- /dev/null
+++ b/nvblox/tests/test_traits.cpp
@@ -0,0 +1,62 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/blox.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/voxels.h"
+
+using namespace nvblox;
+
+struct NewVoxel {
+  float voxel_data = 0.0f;
+};
+
+struct NewBlock {
+  typedef unified_ptr<NewBlock> Ptr;
+  typedef unified_ptr<const NewBlock> ConstPtr;
+
+  static Ptr allocate(MemoryType memory_type) {
+    return make_unified<NewBlock>(memory_type);
+  };
+
+  float block_data = 0.0f;
+};
+
+using NewVoxelLayer = VoxelBlockLayer<NewVoxel>;
+using NewBlockLayer = BlockLayer<NewBlock>;
+
+TEST(TraitsTest, LayerTraits) {
+  // Existing layer types
+  EXPECT_TRUE(traits::is_voxel_layer<TsdfLayer>());
+  EXPECT_FALSE(traits::is_voxel_layer<MeshLayer>());
+  EXPECT_TRUE(traits::is_voxel_layer<ColorLayer>());
+  EXPECT_FALSE(traits::is_voxel_layer<MeshLayer>());
+  // New layer types
+  static_assert(traits::is_voxel_layer<NewVoxelLayer>());
+  static_assert(!traits::is_voxel_layer<NewBlockLayer>());
+  static_assert(traits::is_voxel_layer<VoxelBlockLayer<NewVoxel>>());
+  static_assert(!traits::is_voxel_layer<BlockLayer<NewBlock>>());
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_tsdf_integrator.cpp b/nvblox/tests/test_tsdf_integrator.cpp
new file mode 100644
index 00000000..4453bad2
--- /dev/null
+++ b/nvblox/tests/test_tsdf_integrator.cpp
@@ -0,0 +1,391 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+#include <cmath>
+
+#include "nvblox/core/camera.h"
+#include "nvblox/core/image.h"
+#include "nvblox/core/interpolation_3d.h"
+#include "nvblox/core/layer.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/voxels.h"
+#include "nvblox/integrators/projective_tsdf_integrator.h"
+#include "nvblox/io/csv.h"
+#include "nvblox/io/ply_writer.h"
+#include "nvblox/io/pointcloud_io.h"
+#include "nvblox/primitives/scene.h"
+
+#include "nvblox/tests/projective_tsdf_integrator_cpu.h"
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+DECLARE_bool(alsologtostderr);
+
+constexpr float kInterpolatedSurfaceDistanceEpsilon = 1e-6;
+
+// Parameters which define a passing test which measures the difference between
+// CPU and GPU integration. We accept a test if fewer than
+// (kAcceptablePercentageOverThreshold)% of voxels differ by more than
+// (kGPUvsCPUDifferenceThresholdM) meters.
+// NOTE(alexmillane): Obviously there are some slight differences in the voxel
+// distance values in a small number of voxels.
+constexpr float kGPUvsCPUDifferenceThresholdM = 1e-3;      // 1mm
+constexpr float kAcceptablePercentageOverThreshold = 0.4;  // 0.4%
+
+class TsdfIntegratorTest : public ::testing::Test {
+ protected:
+  TsdfIntegratorTest()
+      : layer_(voxel_size_m_, MemoryType::kUnified),
+        camera_(Camera(fu_, fv_, cu_, cv_, width_, height_)) {}
+
+  // Test layer
+  constexpr static float voxel_size_m_ = 0.2;
+  TsdfLayer layer_;
+
+  // How much error we expect on the surface
+  constexpr static float surface_reconstruction_allowable_distance_error_vox_ =
+      2.0f;
+  constexpr static float surface_reconstruction_allowable_distance_error_m_ =
+      surface_reconstruction_allowable_distance_error_vox_ * voxel_size_m_;
+
+  // Test camera
+  constexpr static float fu_ = 300;
+  constexpr static float fv_ = 300;
+  constexpr static int width_ = 640;
+  constexpr static int height_ = 480;
+  constexpr static float cu_ = static_cast<float>(width_) / 2.0f;
+  constexpr static float cv_ = static_cast<float>(height_) / 2.0f;
+  Camera camera_;
+};
+
+class TsdfIntegratorTestParameterized
+    : public TsdfIntegratorTest,
+      public ::testing::WithParamInterface<DeviceType> {
+  // Add a param
+};
+
+// ProjectiveTsdfIntegrator child that gives the tests access to the internal
+// functions.
+class TestProjectiveTsdfIntegrator : public ProjectiveTsdfIntegrator {
+ public:
+  TestProjectiveTsdfIntegrator() : ProjectiveTsdfIntegrator() {}
+  FRIEND_TEST(TsdfIntegratorTest, BlocksInView);
+};
+
+struct Plane {
+  Plane(const Vector3f& _p, const Vector3f& _n) : p(_p), n(_n){};
+
+  static Plane RandomAtPoint(const Vector3f& p) {
+    return Plane(p, Vector3f::Random().normalized());
+  }
+
+  const Vector3f p;
+  const Vector3f n;
+};
+
+DepthImage matrixToDepthImage(const Eigen::MatrixXf& mat) {
+  DepthImage depth_frame(mat.rows(), mat.cols(), MemoryType::kUnified);
+  for (int col_idx = 0; col_idx < mat.cols(); col_idx++) {
+    for (int row_idx = 0; row_idx < mat.rows(); row_idx++) {
+      depth_frame(row_idx, col_idx) = mat(row_idx, col_idx);
+    }
+  }
+  return depth_frame;
+}
+
+Eigen::MatrixX3f backProjectToPlaneVectorized(
+    const Eigen::MatrixX2f& uv_coordinates, const Plane& plane,
+    const Camera& camera) {
+  CHECK((uv_coordinates.col(0).array() >= 0.0f).all() &&
+        (uv_coordinates.col(0).array() < camera.width()).all());
+  CHECK((uv_coordinates.col(1).array() >= 0.0f).all() &&
+        (uv_coordinates.col(1).array() < camera.height()).all());
+  // Plane-ray intersection
+  Eigen::ArrayX3f rays_matrix(uv_coordinates.rows(), 3);
+  rays_matrix.col(0) =
+      (uv_coordinates.col(0).array() - camera.cu()) / camera.fu();
+  rays_matrix.col(1) =
+      (uv_coordinates.col(1).array() - camera.cv()) / camera.fv();
+  rays_matrix.col(2) = 1.0f;
+  const Eigen::ArrayXf t_matrix =
+      plane.p.dot(plane.n) *
+      (rays_matrix.col(0) * plane.n.x() + rays_matrix.col(1) * plane.n.y() +
+       rays_matrix.col(2) * plane.n.z())
+          .inverse();
+  // Each pixel's 3D point
+  return rays_matrix.colwise() * t_matrix;
+}
+
+DepthImage getDepthImage(const Plane& plane, const Camera& camera) {
+  CHECK(plane.p.z() > 0.0f);
+  // Enumerate all pixel locations.
+  Eigen::MatrixX2f uv_coordinates(camera.height() * camera.width(), 2);
+  int linear_idx = 0;
+  for (int u = 0; u < camera.width(); u++) {
+    for (int v = 0; v < camera.height(); v++) {
+      uv_coordinates(linear_idx, 0) = u;
+      uv_coordinates(linear_idx, 1) = v;
+      ++linear_idx;
+    }
+  }
+  // Back project and get depth frame
+  const Eigen::MatrixX3f points_C =
+      backProjectToPlaneVectorized(uv_coordinates, plane, camera);
+  Eigen::MatrixXf depths = (points_C.col(2).array());
+  depths.resize(camera.height(), camera.width());
+  return matrixToDepthImage(depths);
+}
+
+Eigen::MatrixX2f getRandomPixelLocations(const int num_samples,
+                                         const Camera& camera) {
+  // Note: Eigen's Random() generates numbers between -1.0 and 1.0 -> hence the
+  // abs().
+  Eigen::MatrixX2f uv_coordinates =
+      Eigen::MatrixX2f::Random(num_samples, 2).array().abs();
+  constexpr int border_px = 20;
+  uv_coordinates.col(0) =
+      (uv_coordinates.col(0) *
+       static_cast<float>(camera.width() - 1 - 2 * border_px))
+          .array() +
+      border_px;
+  uv_coordinates.col(1) =
+      (uv_coordinates.col(1) *
+       static_cast<float>(camera.height() - 1 - 2 * border_px))
+          .array() +
+      border_px;
+  return uv_coordinates;
+}
+
+TEST_P(TsdfIntegratorTestParameterized, ReconstructPlane) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  // Get the params
+  const DeviceType device_type = GetParam();
+
+  // Plane centered at (0,0,depth) with random (slight) slant
+  const Plane plane =
+      Plane(Vector3f(0.0f, 0.0f, 5.0f),
+            Vector3f(test_utils::randomFloatInRange(-0.25, 0.25),
+                     test_utils::randomFloatInRange(-0.25, 0.25), -1.0f));
+
+  // Get a depth map of our view of the plane.
+  const DepthImage depth_frame = getDepthImage(plane, camera_);
+
+  std::string filepath = "./depth_frame.csv";
+  io::writeToCsv(filepath, depth_frame);
+
+  // Integrate into a layer
+  std::unique_ptr<ProjectiveTsdfIntegrator> integrator_ptr;
+  if (device_type == DeviceType::kCPU) {
+    integrator_ptr = std::make_unique<ProjectiveTsdfIntegratorCPU>();
+  } else {
+    integrator_ptr = std::make_unique<ProjectiveTsdfIntegrator>();
+  }
+  const Transform T_L_C = Transform::Identity();
+
+  integrator_ptr->truncation_distance_vox(10.0f);
+  integrator_ptr->integrateFrame(depth_frame, T_L_C, camera_, &layer_);
+
+  // Sample some points on the plane, within the camera view.
+  constexpr int kNumberOfPointsToCheck = 1000;
+  const Eigen::MatrixX2f u_random_C =
+      getRandomPixelLocations(kNumberOfPointsToCheck, camera_);
+  const Eigen::MatrixX3f p_check_L =
+      backProjectToPlaneVectorized(u_random_C, plane, camera_);
+
+  // Get the distance of these surface points
+  std::vector<Vector3f> points_L;
+  points_L.reserve(p_check_L.rows());
+  for (int i = 0; i < p_check_L.rows(); i++) {
+    points_L.push_back(p_check_L.row(i));
+  }
+  std::vector<float> distances;
+  std::vector<bool> success_flags;
+  interpolation::interpolateOnCPU(points_L, layer_, &distances,
+                                  &success_flags);
+  EXPECT_EQ(success_flags.size(), kNumberOfPointsToCheck);
+  EXPECT_EQ(distances.size(), success_flags.size());
+
+  // Check that something actually got integrated
+  float max_distance = std::numeric_limits<float>::min();
+  auto lambda = [&max_distance](const Index3D& block_index,
+                                const Index3D& voxel_index,
+                                const TsdfVoxel* voxel) {
+    if (voxel->distance > max_distance) max_distance = voxel->distance;
+  };
+  callFunctionOnAllVoxels(layer_, lambda);
+  const float nothing_integrator_indicator = voxel_size_m_;
+  EXPECT_GT(max_distance, nothing_integrator_indicator);
+
+  // Check that all interpolations worked and that the distance is close to
+  // zero
+  int num_failures = 0;
+  int num_bad_flags = 0;
+  for (int i = 0; i < distances.size(); i++) {
+    EXPECT_TRUE(success_flags[i]);
+    EXPECT_NEAR(distances[i], 0.0f,
+                surface_reconstruction_allowable_distance_error_m_);
+    if (!success_flags[i]) {
+      num_bad_flags++;
+    }
+    if (std::abs(distances[i]) >
+        surface_reconstruction_allowable_distance_error_m_) {
+      num_failures++;
+    }
+  }
+  LOG(INFO) << "Num points greater than allowable distance: " << num_failures;
+  LOG(INFO) << "num_bad_flags: " << num_bad_flags << " / " << distances.size();
+}
+
+TEST_F(TsdfIntegratorTest, SphereSceneTest) {
+  constexpr float kSphereRadius = 2.0f;
+  constexpr float kTrajectoryRadius = 4.0f;
+  constexpr float kTrajectoryHeight = 2.0f;
+  constexpr int kNumTrajectoryPoints = 80;
+  constexpr float kTruncationDistanceVox = 2;
+  constexpr float kTruncationDistanceMeters =
+      kTruncationDistanceVox * voxel_size_m_;
+  // Maximum distance to consider for scene generation.
+  constexpr float kMaxDist = 10.0;
+  constexpr float kMinWeight = 1.0;
+
+  // Tolerance for error.
+  constexpr float kDistanceErrorTolerance = kTruncationDistanceMeters;
+
+  // Scene is bounded to -5, -5, 0 to 5, 5, 5.
+  primitives::Scene scene;
+  scene.aabb() = AxisAlignedBoundingBox(Vector3f(-5.0f, -5.0f, 0.0f),
+                                        Vector3f(5.0f, 5.0f, 5.0f));
+  // Create a scene with a ground plane and a sphere.
+  scene.addGroundLevel(0.0f);
+  scene.addCeiling(5.0f);
+  scene.addPrimitive(
+      std::make_unique<primitives::Sphere>(Vector3f(0.0f, 0.0f, 2.0f), 2.0f));
+  // Add bounding planes at 5 meters. Basically makes it sphere in a box.
+  scene.addPlaneBoundaries(-5.0f, 5.0f, -5.0f, 5.0f);
+
+  // Get the ground truth SDF for it.
+  TsdfLayer gt_layer(voxel_size_m_, MemoryType::kUnified);
+  scene.generateSdfFromScene(kTruncationDistanceMeters, &gt_layer);
+
+  // Create an integrator.
+  ProjectiveTsdfIntegratorCPU integrator_cpu;
+  ProjectiveTsdfIntegrator integrator_gpu;
+  integrator_cpu.truncation_distance_vox(kTruncationDistanceVox);
+  integrator_gpu.truncation_distance_vox(kTruncationDistanceVox);
+
+  // Simulate a trajectory of the requisite number of points, on the circle
+  // around the sphere.
+  const float radians_increment = 2 * M_PI / (kNumTrajectoryPoints);
+
+  // Create a depth frame. We share this memory buffer for the entire
+  // trajectory.
+  DepthImage depth_frame(camera_.height(), camera_.width(),
+                         MemoryType::kUnified);
+
+  // Two layers, one for CPU integration and one for GPU integration
+  TsdfLayer layer_cpu(layer_.voxel_size(), MemoryType::kUnified);
+  TsdfLayer layer_gpu(layer_.voxel_size(), MemoryType::kUnified);
+
+  for (size_t i = 0; i < kNumTrajectoryPoints; i++) {
+    const float theta = radians_increment * i;
+    // Convert polar to cartesian coordinates.
+    Vector3f cartesian_coordinates(kTrajectoryRadius * std::cos(theta),
+                                   kTrajectoryRadius * std::sin(theta),
+                                   kTrajectoryHeight);
+    // The camera has its z axis pointing towards the origin.
+    Eigen::Quaternionf rotation_base(0.5, 0.5, 0.5, 0.5);
+    Eigen::Quaternionf rotation_theta(
+        Eigen::AngleAxisf(M_PI + theta, Vector3f::UnitZ()));
+
+    // Construct a transform from camera to scene with this.
+    Transform T_S_C = Transform::Identity();
+    T_S_C.prerotate(rotation_theta * rotation_base);
+    T_S_C.pretranslate(cartesian_coordinates);
+
+    // Generate a depth image of the scene.
+    scene.generateDepthImageFromScene(camera_, T_S_C, kMaxDist, &depth_frame);
+
+    // Integrate this depth image.
+    integrator_cpu.integrateFrame(depth_frame, T_S_C, camera_, &layer_cpu);
+    integrator_gpu.integrateFrame(depth_frame, T_S_C, camera_, &layer_gpu);
+  }
+
+  // Now do some checks...
+  // Check every voxel in the map.
+  auto lambda = [&](const Index3D& block_index, const Index3D& voxel_index,
+                    const TsdfVoxel* voxel) {
+    if (voxel->weight >= kMinWeight) {
+      // Get the corresponding point from the GT layer.
+      const TsdfVoxel* gt_voxel =
+          getVoxelAtBlockAndVoxelIndex(gt_layer, block_index, voxel_index);
+      if (gt_voxel != nullptr) {
+        EXPECT_NEAR(voxel->distance, gt_voxel->distance,
+                    kDistanceErrorTolerance);
+      }
+    }
+  };
+  callFunctionOnAllVoxels(layer_cpu, lambda);
+  callFunctionOnAllVoxels(layer_gpu, lambda);
+
+  io::outputVoxelLayerToPly(layer_gpu, "test_tsdf_projective_gpu.ply");
+  io::outputVoxelLayerToPly(layer_cpu, "test_tsdf_projective_cpu.ply");
+  io::outputVoxelLayerToPly(gt_layer, "test_tsdf_projective_gt.ply");
+
+  // Compare the layers
+  ASSERT_GE(layer_cpu.numAllocatedBlocks(), layer_gpu.numAllocatedBlocks());
+  size_t total_num_voxels_observed = 0;
+  size_t num_voxels_over_threshold = 0;
+  for (const Index3D& block_index : layer_gpu.getAllBlockIndices()) {
+    const auto block_cpu = layer_cpu.getBlockAtIndex(block_index);
+    const auto block_gpu = layer_gpu.getBlockAtIndex(block_index);
+    for (int x = 0; x < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; x++) {
+      for (int y = 0; y < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; y++) {
+        for (int z = 0; z < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; z++) {
+          const float diff = block_gpu->voxels[x][y][z].distance -
+                             block_cpu->voxels[x][y][z].distance;
+          // EXPECT_NEAR(diff, 0.0f, kGPUvsCPUDifferenceThresholdM);
+          if (block_gpu->voxels[x][y][z].weight > 0.0f) {
+            ++total_num_voxels_observed;
+            if (std::abs(diff) > kGPUvsCPUDifferenceThresholdM) {
+              ++num_voxels_over_threshold;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  const float percentage_over_threshold =
+      100.0f * (static_cast<float>(num_voxels_over_threshold) /
+                static_cast<float>(total_num_voxels_observed));
+  EXPECT_LT(percentage_over_threshold, kAcceptablePercentageOverThreshold);
+  LOG(INFO) << "Percentage of voxels with a difference greater than "
+            << kGPUvsCPUDifferenceThresholdM << ": "
+            << percentage_over_threshold << "%";
+  LOG(INFO) << "total_num_voxels_observed: " << total_num_voxels_observed
+            << std::endl;
+  LOG(INFO) << "num_voxels_over_threshold: " << num_voxels_over_threshold
+            << std::endl;
+}
+
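+// NOTE(editor): The acceptance criterion used above (and in the mesh-coloring
+// test earlier) is "percentage of observed voxels/vertices over a threshold".
+// A minimal sketch of that computation as a reusable helper; the name
+// percentageOverThresholdSketch is hypothetical and shown for illustration
+// only:
+static float percentageOverThresholdSketch(size_t num_over_threshold,
+                                           size_t num_observed) {
+  // Guard against division by zero when nothing was observed.
+  if (num_observed == 0) {
+    return 0.0f;
+  }
+  return 100.0f * static_cast<float>(num_over_threshold) /
+         static_cast<float>(num_observed);
+}
+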
+INSTANTIATE_TEST_CASE_P(DeviceTests, TsdfIntegratorTestParameterized,
+                        ::testing::Values(DeviceType::kCPU, DeviceType::kGPU));
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/nvblox/tests/test_tsdf_integrator_cuda_components.cpp b/nvblox/tests/test_tsdf_integrator_cuda_components.cpp
new file mode 100644
index 00000000..9df24438
--- /dev/null
+++ b/nvblox/tests/test_tsdf_integrator_cuda_components.cpp
@@ -0,0 +1,210 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/types.h"
+#include "nvblox/primitives/primitives.h"
+#include "nvblox/primitives/scene.h"
+
+#include "nvblox/tests/projective_tsdf_integrator_cuda_components.h"
+#include "nvblox/tests/utils.h"
+
+using namespace nvblox;
+
+// TODO: Decide where to put test epsilons
+constexpr float kFloatEpsilon = 1e-4;
+constexpr float kImagePlaneEpsilon = 1e-2;
+
+class CudaTsdfIntegratorTest : public ::testing::Test {
+ protected:
+  CudaTsdfIntegratorTest()
+      : layer_(block_size_m_, MemoryType::kUnified),
+        camera_(Camera(fu_, fv_, cu_, cv_, width_, height_)) {}
+
+  // Test layer
+  constexpr static float voxel_size_m_ = 0.2;
+  constexpr static float block_size_m_ =
+      VoxelBlock<TsdfVoxel>::kVoxelsPerSide * voxel_size_m_;
+  TsdfLayer layer_;
+
+  // Test camera
+  constexpr static float fu_ = 300;
+  constexpr static float fv_ = 300;
+  constexpr static int width_ = 640;
+  constexpr static int height_ = 480;
+  constexpr static float cu_ = static_cast<float>(width_) / 2.0f;
+  constexpr static float cv_ = static_cast<float>(height_) / 2.0f;
+  Camera camera_;
+};
+
+Transform getRandomTransform() {
+  return Transform(
+      Eigen::Translation3f(Vector3f::Random()) *
+      Eigen::AngleAxisf(
+          test_utils::randomFloatInRange(-M_PI, M_PI),
+          test_utils::getRandomVector3fInRange(-1.0f, 1.0f).normalized()));
+}
+
+TEST_F(CudaTsdfIntegratorTest, GPUVectorizedPointsTransform) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  const Transform T_B_A = getRandomTransform();
+
+  // Transform vectors on the CPU
+  constexpr int kNumVecs = 1000;
+  const Eigen::Matrix<float, 3, kNumVecs> vecs_A =
+      Eigen::Matrix<float, 3, kNumVecs>::Random();
+  const Eigen::Matrix<float, 3, kNumVecs> vecs_B =
+      (T_B_A.rotation() * vecs_A).colwise() + T_B_A.translation();
+
+  // Transform vectors on the GPU
+  const Eigen::Matrix3Xf vecs_B_from_GPU =
+      test_utils::transformPointsOnGPU(T_B_A, vecs_A);
+
+  // Check we get the same result.
+  EXPECT_NEAR((vecs_B - vecs_B_from_GPU).maxCoeff(), 0.0f, kFloatEpsilon);
+}
+
+TEST_F(CudaTsdfIntegratorTest, GpuBlockCenterProjection) {
+  std::vector<Index3D> block_indices;
+  block_indices.push_back(Index3D(0, 0, 0));
+
+  // Camera Pose
+  Transform T_C_L = Transform::Identity();
+
+  // Project on CPU
+  std::vector<test_utils::BlockProjectionResult> cpu_block_projections;
+  cpu_block_projections.reserve(block_indices.size());
+  for (const Index3D& block_index : block_indices) {
+    test_utils::BlockProjectionResult results_block;
+    int lin_voxel_idx = 0;
+    for (int x = 0; x < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; x++) {
+      for (int y = 0; y < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; y++) {
+        for (int z = 0; z < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; z++) {
+          const Vector3f p_L = getCenterPostionFromBlockIndexAndVoxelIndex(
+              layer_.block_size(), block_index, Index3D(x, y, z));
+          const Vector3f p_C = T_C_L * p_L;
+          Eigen::Vector2f u_px;
+          if (!camera_.project(p_C, &u_px)) {
+            u_px = Eigen::Vector2f(0.0f, 0.0f);
+          }
+          results_block.row(lin_voxel_idx) = u_px;
+          ++lin_voxel_idx;
+        }
+      }
+    }
+    cpu_block_projections.push_back(results_block);
+  }
+
+  // Project on GPU
+  std::vector<test_utils::BlockProjectionResult> gpu_block_projections =
+      test_utils::projectBlocksOnGPU(block_indices, camera_, T_C_L, &layer_);
+
+  // Check
+  for (int i = 0; i < gpu_block_projections.size(); i++) {
+    EXPECT_NEAR(
+        (cpu_block_projections[i] - gpu_block_projections[i]).maxCoeff(), 0.0f,
+        kFloatEpsilon);
+  }
+}
+
+TEST_F(CudaTsdfIntegratorTest, GpuDepthImageInterpolation) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  // The frames {depth_frame_col_indices, depth_frame_row_indices} are set up
+  // such that if you interpolate, you should get the interpolated position
+  // back. Note that here (in contrast to the 3D voxel grid) I consider that
+  // pixel indices correspond to centers (rather than low-side-corners).
+
+  DepthImage depth_frame_col_indices(height_, width_, MemoryType::kUnified);
+  DepthImage depth_frame_row_indices(height_, width_, MemoryType::kUnified);
+  for (int col_idx = 0; col_idx < width_; col_idx++) {
+    for (int row_idx = 0; row_idx < height_; row_idx++) {
+      depth_frame_col_indices(row_idx, col_idx) = col_idx;
+      depth_frame_row_indices(row_idx, col_idx) = row_idx;
+    }
+  }
+
+  // Generate random locations on the image plane
+  constexpr int kNumTests = 1000;
+  Eigen::MatrixX2f u_px_vec(kNumTests, 2);
+  for (int i = 0; i < kNumTests; i++) {
+    u_px_vec.row(i) = Vector2f(
+        test_utils::randomFloatInRange(0.0f, static_cast<float>(width_ - 1)),
+        test_utils::randomFloatInRange(0.0f, static_cast<float>(height_ - 1)));
+  }
+
+  // CPU interpolation
+  Eigen::VectorXf results_col_cpu(kNumTests);
+  Eigen::VectorXf results_row_cpu(kNumTests);
+  for (int i = 0; i < kNumTests; i++) {
+    // Interpolate x and y grids
+    EXPECT_TRUE(interpolation::interpolate2DLinear(
+        depth_frame_col_indices, u_px_vec.row(i), &results_col_cpu(i)));
+    EXPECT_TRUE(interpolation::interpolate2DLinear(
+        depth_frame_row_indices, u_px_vec.row(i), &results_row_cpu(i)));
+  }
+
+  // GPU interpolation
+  const Eigen::VectorXf results_col_gpu =
+      test_utils::interpolatePointsOnGPU(depth_frame_col_indices, u_px_vec);
+  const Eigen::VectorXf results_row_gpu =
+      test_utils::interpolatePointsOnGPU(depth_frame_row_indices, u_px_vec);
+
+  EXPECT_NEAR((results_col_cpu - results_col_gpu).maxCoeff(), 0.0f,
+              kImagePlaneEpsilon);
+  EXPECT_NEAR((results_row_cpu - results_row_gpu).maxCoeff(), 0.0f,
+              kImagePlaneEpsilon);
+}
+
+TEST_F(CudaTsdfIntegratorTest, SetBlocksOnGPU) {
+  // Make sure this is deterministic.
+  std::srand(0);
+
+  // Allocating blocks for setting
+  constexpr int kNumBlocks = 10;
+  for (int i = 0; i < kNumBlocks; i++) {
+    layer_.allocateBlockAtIndex(test_utils::getRandomIndex3dInRange(-10, 10));
+  }
+
+  // Set the voxels to their linear indices
+  test_utils::setVoxelBlockOnGPU(&layer_);
+
+  // Check
+  for (const Index3D& block_idx : layer_.getAllBlockIndices()) {
+    const auto block_ptr = layer_.getBlockAtIndex(block_idx);
+    float lin_idx = 0.0f;
+    for (int x = 0; x < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; x++) {
+      for (int y = 0; y < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; y++) {
+        for (int z = 0; z < VoxelBlock<TsdfVoxel>::kVoxelsPerSide; z++) {
+          EXPECT_NEAR(block_ptr->voxels[x][y][z].distance, lin_idx,
+                      kFloatEpsilon);
+          ++lin_idx;
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  FLAGS_alsologtostderr = true;
+  google::InitGoogleLogging(argv[0]);
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/nvblox/tests/test_unified_ptr.cpp b/nvblox/tests/test_unified_ptr.cpp
new file mode 100644
index 00000000..87d7267d
--- /dev/null
+++ b/nvblox/tests/test_unified_ptr.cpp
@@ -0,0 +1,210 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include "nvblox/core/accessors.h"
+#include "nvblox/core/common_names.h"
+#include "nvblox/core/types.h"
+#include "nvblox/core/unified_ptr.h"
+
+#include "nvblox/tests/increment_on_gpu.h"
+#include "nvblox/tests/test_utils_cuda.h"
+
+using namespace nvblox;
+
+void setToConstantOnCPU(const int value, const int num_elements, int ints[]) {
+  for (int i = 0; i < num_elements; i++) {
+    ints[i] = value;
+  }
+}
+
+void incrementOnCPU(const int num_elements, int ints[]) {
+  for (int i = 0; i < num_elements; i++) {
+    ints[i]++;
+  }
+}
+
+template <typename T>
+void expect_cuda_freed(T* ptr) {
+  // Note(alexmillane): Whether or not this call returns an error seems to vary
+  // between machines... Hopefully the checks below indicate that the memory
+  // has been freed.
+  cudaPointerAttributes attributes;
+  checkCudaErrors(cudaPointerGetAttributes(&attributes, ptr));
+  EXPECT_EQ(attributes.type, cudaMemoryType::cudaMemoryTypeUnregistered);
+  EXPECT_EQ(attributes.devicePointer, nullptr);
+  EXPECT_EQ(attributes.hostPointer, nullptr);
+}
+
+TEST(UnifiedPointerTest, IntTest) {
+  unified_ptr<int> hello = make_unified<int>(3);
+
+  EXPECT_TRUE(hello);
+  EXPECT_EQ(*hello, 3);
+  (*hello)++;
+  EXPECT_EQ(*hello, 4);
+
+  test_utils::incrementOnGPU(hello.get());
+
+  EXPECT_EQ(*hello, 5);
+
+  hello.reset();
+  EXPECT_FALSE(hello);
+  EXPECT_EQ(hello.get(), nullptr);
+}
+
+TEST(UnifiedPointerTest, ObjectTest) {
+  unified_ptr<std::array<int, 100>> array_of_ints =
+      make_unified<std::array<int, 100>>();
+
+  EXPECT_TRUE(array_of_ints);
+  (*array_of_ints)[3] = 4;
+  EXPECT_EQ((*array_of_ints)[3], 4);
+}
+
+TEST(UnifiedPointerTest, EmptyTest) {
+  unified_ptr<int> vec;
+  EXPECT_FALSE(vec);
+}
+
+TEST(UnifiedPointerTest, MemoryTest) {
+  int* raw_ptr;
+  {
+    // Put this in a disappearing scope.
+    unified_ptr<int> dummy_ptr(make_unified<int>(100));
+    raw_ptr = dummy_ptr.get();
+
+    // Check that the memory is allocated.
+    cudaPointerAttributes attributes;
+    cudaError_t error = cudaPointerGetAttributes(&attributes, raw_ptr);
+    EXPECT_EQ(error, cudaSuccess);
+    EXPECT_EQ(attributes.type, cudaMemoryType::cudaMemoryTypeManaged);
+  }
+
+  // Make sure the memory no longer exists now it's out of scope.
+  cudaPointerAttributes attributes;
+  cudaError_t error = cudaPointerGetAttributes(&attributes, raw_ptr);
+  // Note(alexmillane): Whether or not this call returns an error seems to vary
+  // between machines... Hopefully the checks below indicate that the memory
+  // has been freed.
+  // EXPECT_EQ(error, cudaErrorInvalidValue);
+  EXPECT_EQ(attributes.type, cudaMemoryType::cudaMemoryTypeUnregistered);
+  EXPECT_EQ(attributes.devicePointer, nullptr);
+  EXPECT_EQ(attributes.hostPointer, nullptr);
+}
+
+TEST(UnifiedPointerTest, ArrayTest) {
+  constexpr int kNumElements = 1000;
+  unified_ptr<int[]> goodbye = make_unified<int[]>(kNumElements);
+
+  constexpr int kValue = 100;
+  setToConstantOnCPU(kValue, kNumElements, goodbye.get());
+
+  for (int i = 0; i < kNumElements; i++) {
+    EXPECT_EQ(goodbye[i], kValue);
+  }
+
+  incrementOnCPU(kNumElements, goodbye.get());
+
+  for (int i = 0; i < kNumElements; i++) {
+    EXPECT_EQ(goodbye[i], kValue + 1);
+  }
+
+  test_utils::incrementOnGPU(kNumElements, goodbye.get());
+
+  for (int i = 0; i < kNumElements; i++) {
+    EXPECT_EQ(goodbye[i], kValue + 2);
+  }
+
+  int* raw_ptr = goodbye.get();
+  goodbye.reset();
+  EXPECT_FALSE(goodbye);
+  EXPECT_EQ(goodbye.get(), nullptr);
+  // Make sure the memory no longer exists now it's out of scope.
+  expect_cuda_freed(raw_ptr);
+}
+
+TEST(UnifiedPointerTest, HostTest) {
+  auto int_host_ptr = make_unified<int>(MemoryType::kHost, 1);
+  EXPECT_EQ(*int_host_ptr, 1);
+  EXPECT_NE(*int_host_ptr, 2);
+
+  cudaPointerAttributes attributes;
+  checkCudaErrors(cudaPointerGetAttributes(&attributes, int_host_ptr.get()));
+  EXPECT_EQ(attributes.type, cudaMemoryType::cudaMemoryTypeHost);
+
+  int* raw_ptr = int_host_ptr.get();
+  int_host_ptr.reset();
+  EXPECT_FALSE(int_host_ptr);
+  EXPECT_EQ(int_host_ptr.get(), nullptr);
+  expect_cuda_freed(raw_ptr);
+}
+
+TEST(UnifiedPointerTest, CloneTest) {
+  // Host-only cloning
+  auto int_ptr_1 = make_unified<int>(MemoryType::kHost, 1);
+  EXPECT_EQ(*int_ptr_1, 1);
+  auto int_ptr_2 = int_ptr_1.clone();
+  EXPECT_EQ(*int_ptr_2, 1);
+  *int_ptr_2 += 1;
+  EXPECT_EQ(*int_ptr_2, 2);
+  EXPECT_EQ(*int_ptr_1, 1);
+
+  // Host-to-device-to-unified cloning
+  auto int_ptr_3 = int_ptr_1.clone(MemoryType::kDevice);
+  test_utils::incrementOnGPU(int_ptr_3.get());
+  auto int_ptr_4 = int_ptr_3.clone(MemoryType::kUnified);
+  EXPECT_EQ(*int_ptr_4, 2);
+
+  // Array cloning
+  constexpr int kNumElems = 10;
+  auto int_ptr_5 = make_unified<int[]>(kNumElems, MemoryType::kDevice);
+  test_utils::fillWithConstant(1, kNumElems, int_ptr_5.get());
+  EXPECT_TRUE(test_utils::checkAllConstant(int_ptr_5.get(), 1, kNumElems));
+  test_utils::incrementOnGPU(kNumElems, int_ptr_5.get());
+  EXPECT_TRUE(test_utils::checkAllConstant(int_ptr_5.get(), 2, kNumElems));
+  test_utils::incrementOnGPU(kNumElems, int_ptr_5.get());
+  auto int_ptr_6 = int_ptr_5.clone(MemoryType::kHost);
+
+  for (int i = 0; i < kNumElems; i++) {
+    EXPECT_EQ(int_ptr_6[i], 3);
+  }
+}
+
+TEST(UnifiedPointerTest, CloneConstBlock) {
+  constexpr float kBlockSize = 1.0f;
+  TsdfLayer layer(kBlockSize, MemoryType::kDevice);
+  auto block_ptr = layer.allocateBlockAtIndex({0, 0, 0});
+
+  const TsdfLayer& layer_const_ref = layer;
+
+  TsdfBlock::Ptr cloned_block =
+      layer_const_ref.getBlockAtIndex({0, 0, 0}).clone(MemoryType::kHost);
+
+  auto expect_voxel_zero_lambda = [](const Index3D& voxel_index,
+                                     const TsdfVoxel* voxel) -> void {
+    EXPECT_EQ(voxel->distance, 0.0f);
+    EXPECT_EQ(voxel->weight, 0.0f);
+  };
+  callFunctionOnAllVoxels(*cloned_block, expect_voxel_zero_lambda);
+}
+
+int main(int argc, char** argv) {
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/nvblox/tests/test_unified_vector.cpp b/nvblox/tests/test_unified_vector.cpp
new file mode 100644
index 00000000..1855cd24
--- /dev/null
+++ b/nvblox/tests/test_unified_vector.cpp
@@ -0,0 +1,221 @@
+/*
+Copyright 2022 NVIDIA CORPORATION
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include "nvblox/core/cuda/warmup.h"
+#include "nvblox/core/unified_vector.h"
+
+#include "nvblox/tests/increment_on_gpu.h"
+#include "nvblox/tests/test_utils_cuda.h"
+
+using namespace nvblox;
+
+cudaMemoryType getPointerMemoryType(void* pointer) {
+  // Look up which memory space (host/device/managed) the pointer lives in.
+  cudaPointerAttributes attributes;
+  cudaError_t error = cudaPointerGetAttributes(&attributes, pointer);
+  EXPECT_EQ(error, cudaSuccess);
+  return attributes.type;
+}
+
+TEST(UnifiedVectorTest, EmptyTest) {
+  unified_vector<int> vec;
+  EXPECT_TRUE(vec.empty());
+  EXPECT_EQ(vec.data(), nullptr);
+  EXPECT_EQ(vec.size(), 0);
+}
+
+TEST(UnifiedVectorTest, ClearTest) {
+  unified_vector<int> vec(100, 10);
+  EXPECT_FALSE(vec.empty());
+  EXPECT_NE(vec.data(), nullptr);
+  EXPECT_EQ(vec.size(), 100);
+  EXPECT_EQ(vec[0], 10);
+  EXPECT_EQ(vec[99], 10);
+
+  vec.clear();
+  EXPECT_TRUE(vec.empty());
+  EXPECT_EQ(vec.data(), nullptr);
+  EXPECT_EQ(vec.size(), 0);
+}
+
+TEST(UnifiedVectorTest, PushBackTest) {
+  unified_vector<int> vec;
+  constexpr size_t kMaxSize = 999999;
+  for (size_t i = 0; i < kMaxSize; i++) {
+    vec.push_back(i);
+  }
+  EXPECT_FALSE(vec.empty());
+  EXPECT_NE(vec.data(), nullptr);
+  EXPECT_EQ(vec.size(), kMaxSize);
+
+  for (size_t i = 0; i < kMaxSize; i++) {
+    EXPECT_EQ(vec[i], i);
+  }
+  EXPECT_GE(vec.capacity(), vec.size());
+}
+
+TEST(UnifiedVectorTest, AssignmentTest) {
+  unified_vector<int> vec;
+  constexpr size_t kMaxSize = 10;
+  for (size_t i = 0; i < kMaxSize; i++) {
+    vec.push_back(i);
+  }
+
+  // Copy the vector over.
+  unified_vector<int> vec2 = vec;
+
+  EXPECT_FALSE(vec2.empty());
+  EXPECT_NE(vec2.data(), nullptr);
+  EXPECT_EQ(vec2.size(), kMaxSize);
+
+  for (size_t i = 0; i < kMaxSize; i++) {
+    EXPECT_EQ(vec2[i], i);
+  }
+  EXPECT_GE(vec2.capacity(), vec.size());
+}
+
+TEST(UnifiedVectorTest, IteratorTest) {
+  unified_vector<int> vec;
+  constexpr size_t kMaxSize = 10;
+  for (size_t i = 0; i < kMaxSize; i++) {
+    vec.push_back(i);
+  }
+
+  size_t i = 0;
+  for (size_t num : vec) {
+    EXPECT_EQ(num, i++);
+  }
+  EXPECT_EQ(i, kMaxSize);
+}
+
+TEST(UnifiedVectorTest, CpuGpuReadWrite) {
+  const float value = 12.3f;
+
+  const size_t kVectorSize = 100;
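+  // A default-constructed unified_vector is backed by managed memory, so
+  // the same buffer can be verified from the host (via operator[]) and
+  // from the device below.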
+  unified_vector<float> vec;
+  vec.resize(kVectorSize);
+  test_utils::fillVectorWithConstant(value, &vec);
+
+  // Use operator[] to check on the HOST.
+  for (size_t i = 0; i < kVectorSize; i++) {
+    EXPECT_EQ(vec[i], value);
+  }
+
+  // Check on the DEVICE.
+  EXPECT_TRUE(test_utils::checkVectorAllConstant(vec, value));
+}
+
+TEST(UnifiedVectorTest, SetZeroTest) {
+  constexpr size_t kSize = 100;
+  unified_vector<int> vec(kSize, 1, MemoryType::kHost);
+  for (size_t i = 0; i < kSize; i++) {
+    EXPECT_EQ(vec[i], 1);
+  }
+  vec.setZero();
+  for (size_t i = 0; i < kSize; i++) {
+    EXPECT_EQ(vec[i], 0);
+  }
+}
+
+TEST(UnifiedVectorTest, HostToDeviceToHostCopy) {
+  constexpr size_t kSize = 100;
+  unified_vector<int> vec(kSize, MemoryType::kHost);
+  EXPECT_EQ(getPointerMemoryType(vec.data()),
+            cudaMemoryType::cudaMemoryTypeHost);
+  for (size_t i = 0; i < kSize; i++) {
+    vec[i] = i;
+  }
+
+  // Copy over to unified memory.
+  unified_vector<int> vec_unified(MemoryType::kUnified);
+  vec_unified = vec;
+  EXPECT_EQ(getPointerMemoryType(vec_unified.data()),
+            cudaMemoryType::cudaMemoryTypeManaged);
+  for (size_t i = 0; i < kSize; i++) {
+    EXPECT_EQ(vec_unified[i], i);
+    // Modify it for later.
+    vec_unified[i] = i - 1;
+  }
+
+  // Copy over to device memory.
+  unified_vector<int> vec_device(MemoryType::kDevice);
+  vec_device = vec_unified;
+  EXPECT_EQ(getPointerMemoryType(vec_device.data()),
+            cudaMemoryType::cudaMemoryTypeDevice);
+
+  unified_vector<int> vec_host(vec_device, MemoryType::kHost);
+  EXPECT_EQ(getPointerMemoryType(vec_host.data()),
+            cudaMemoryType::cudaMemoryTypeHost);
+  for (size_t i = 0; i < kSize; i++) {
+    EXPECT_EQ(vec_host[i], i - 1);
+  }
+}
+
+template <typename VectorType>
+void checkAllConstantCPU(const VectorType& vec, const int value) {
+  for (auto i : vec) {
+    EXPECT_EQ(i, value);
+  }
+}
+
+TEST(UnifiedVectorTest, HostAndDeviceVectors) {
+  // Host vector.
+  constexpr int kNumElems = 100;
+  host_vector<int> vec_host_1(kNumElems, 1);
+  checkAllConstantCPU(vec_host_1, 1);
+
+  // To device.
+  device_vector<int> vec_device = vec_host_1;
+  EXPECT_TRUE(test_utils::checkAllConstant(vec_device.data(), 1, kNumElems));
+  EXPECT_FALSE(test_utils::checkAllConstant(vec_device.data(), 2, kNumElems));
+  test_utils::incrementOnGPU(kNumElems, vec_device.data());
+  EXPECT_TRUE(test_utils::checkAllConstant(vec_device.data(), 2, kNumElems));
+
+  // Back to host.
+  host_vector<int> vec_host_2 = vec_device;
+  checkAllConstantCPU(vec_host_2, 2);
+
+  // Conversion to std::vector.
+  std::vector<int> std_vec_host_1 = vec_device.toVector();
+  checkAllConstantCPU(std_vec_host_1, 2);
+
+  // Conversion from std::vector.
+  std::vector<int> std_vec_host_2(kNumElems, 3);
+  checkAllConstantCPU(std_vec_host_2, 3);
+  device_vector<int> vec_device_2(std_vec_host_2);
+  EXPECT_TRUE(test_utils::checkAllConstant(vec_device_2.data(), 3, kNumElems));
+
+  // Checking conversion from the base class.
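+  // host_vector and device_vector both derive from unified_vector, so a
+  // device_vector can be constructed directly from a reference to the
+  // common base class.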
+  host_vector<int> std_vec_host_3(kNumElems, 4);
+  const unified_vector<int>& base_class_reference = std_vec_host_3;
+  checkAllConstantCPU(base_class_reference, 4);
+  device_vector<int> vec_device_3(base_class_reference);
+  EXPECT_TRUE(test_utils::checkAllConstant(vec_device_3.data(), 4, kNumElems));
+}
+
+int main(int argc, char** argv) {
+  warmupCuda();
+  google::InitGoogleLogging(argv[0]);
+  FLAGS_alsologtostderr = true;
+  google::InstallFailureSignalHandler();
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/nvblox/thirdparty/eigen/eigen.cmake b/nvblox/thirdparty/eigen/eigen.cmake
new file mode 100644
index 00000000..9cec9191
--- /dev/null
+++ b/nvblox/thirdparty/eigen/eigen.cmake
@@ -0,0 +1,29 @@
+include(ExternalProject)
+
+# If the caller wants Eigen installed somewhere, do so; otherwise neither
+# configure nor install (Eigen is header-only).
+if(EIGEN_INCLUDE_DESTINATION)
+  set(EIGEN_INCLUDE_DIRS "${EIGEN_INCLUDE_DESTINATION}/include/eigen3")
+  file(MAKE_DIRECTORY ${EIGEN_INCLUDE_DIRS})
+  set(EIGEN_CONFIGURE_CMD)
+  set(EIGEN_INSTALL_CMD $(MAKE) install)
+else()
+  set(EIGEN_CONFIGURE_CMD echo 'not installing eigen')
+  set(EIGEN_INSTALL_CMD echo 'not installing eigen')
+endif()
+
+ExternalProject_Add(
+  ext_eigen
+  PREFIX eigen
+  URL https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.tar.gz
+  URL_HASH SHA256=8586084f71f9bde545ee7fa6d00288b264a2b7ac3607b974e54d13e7162c1c72
+  UPDATE_COMMAND ""
+  CONFIGURE_COMMAND ${EIGEN_CONFIGURE_CMD}
+  CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${EIGEN_INCLUDE_DESTINATION}
+  BUILD_COMMAND ""
+  INSTALL_COMMAND ${EIGEN_INSTALL_CMD}
+)
+
+if(NOT EIGEN_INCLUDE_DESTINATION)
+  ExternalProject_Get_Property(ext_eigen SOURCE_DIR)
+  set(EIGEN_INCLUDE_DIRS ${SOURCE_DIR})
+endif()
diff --git a/nvblox/thirdparty/stdgpu/stdgpu.cmake b/nvblox/thirdparty/stdgpu/stdgpu.cmake
new file mode 100644
index 00000000..c87c941d
--- /dev/null
+++ b/nvblox/thirdparty/stdgpu/stdgpu.cmake
@@ -0,0 +1,52 @@
+include(FetchContent)
+FetchContent_Declare(
+  ext_stdgpu
+  PREFIX stdgpu
+  GIT_REPOSITORY https://github.com/stotko/stdgpu.git
+  GIT_TAG e10f6f3ccc9902d693af4380c3bcd188ec34a2e6
+  UPDATE_COMMAND ""
+)
+
+# stdgpu build options.
+set(STDGPU_BUILD_SHARED_LIBS ON)
+set(STDGPU_BUILD_EXAMPLES OFF)
+set(STDGPU_BUILD_TESTS OFF)
+set(STDGPU_ENABLE_CONTRACT_CHECKS OFF)
+
+set(STDGPU_BACKEND_DIRECTORY "cuda")  # Do we need this?
+
+# Download the files.
+FetchContent_MakeAvailable(ext_stdgpu)
+
+# Variables we need in the parent scope.
+set(STDGPU_INCLUDE_DIRS ${STDGPU_INSTALL_DESTINATION}/include/)  # The trailing "/" is critical.
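+# stdgpu_SOURCE_DIR is populated by FetchContent_MakeAvailable() above;
+# FetchContent defines <lowercase-name>_SOURCE_DIR for each dependency.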
+set(STDGPU_SRC_DIR ${stdgpu_SOURCE_DIR})
+
+# Grab the compute capability through functions defined by stdgpu:
+# https://github.com/stotko/stdgpu/tree/master/cmake/cuda
+
+set(CMAKE_MODULE_PATH_OLD ${CMAKE_MODULE_PATH})
+# Temporarily replace CMAKE_MODULE_PATH.
+set(CMAKE_MODULE_PATH "${STDGPU_SRC_DIR}/cmake/cuda")
+
+include("${STDGPU_SRC_DIR}/cmake/cuda/set_device_flags.cmake")
+
+stdgpu_set_device_flags(STDGPU_DEVICE_FLAGS)
+stdgpu_cuda_set_architecture_flags(STDGPU_CUDA_ARCHITECTURE_FLAGS)
+if(STDGPU_CUDA_ARCHITECTURE_FLAGS)
+  if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18)
+    set(CMAKE_CUDA_ARCHITECTURES ${STDGPU_CUDA_ARCHITECTURE_FLAGS})
+  else()
+    string(APPEND CMAKE_CUDA_FLAGS "${STDGPU_CUDA_ARCHITECTURE_FLAGS}")
+    message(STATUS "Building with modified CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
+  endif()
+else()
+  message(WARNING "Falling back to the default compute capabilities: ${CMAKE_CUDA_ARCHITECTURES}")
+endif()
+
+# Restore CMAKE_MODULE_PATH.
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH_OLD})
+
diff --git a/nvblox/visualization/visualize_mesh.py b/nvblox/visualization/visualize_mesh.py
new file mode 100755
index 00000000..fd09d9ff
--- /dev/null
+++ b/nvblox/visualization/visualize_mesh.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python3
+
+import argparse
+
+import open3d as o3d
+
+
+def visualize_ply(ply_path: str, do_normal_coloring: bool):
+    # Load the mesh and compute normals for shading.
+    mesh = o3d.io.read_triangle_mesh(ply_path)
+    print(mesh)
+    mesh.compute_vertex_normals()
+
+    # Create a window and show the mesh.
+    vis = o3d.visualization.Visualizer()
+    vis.create_window()
+    vis.add_geometry(mesh)
+    # Optionally color the mesh by its normals.
+    if do_normal_coloring:
+        vis.get_render_option().mesh_color_option = \
+            o3d.visualization.MeshColorOption.Normal
+    vis.run()
+    vis.destroy_window()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Visualize a PLY mesh.")
+    parser.add_argument("path", metavar="path", type=str,
+                        help="Path to the .ply mesh file to visualize.")
+    parser.add_argument("--normal_coloring", dest="do_normal_coloring",
+                        action="store_true",
+                        help="Color the mesh by vertex normals instead of "
+                             "vertex colors.")
+
+    args = parser.parse_args()
+    if args.path:
+        visualize_ply(args.path, args.do_normal_coloring)
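+
+# Example usage (the mesh filename is hypothetical; any .ply written by an
+# nvblox reconstruction will do):
+#   ./nvblox/visualization/visualize_mesh.py mesh.ply --normal_coloring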