diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
index d03d4529..ef304a7c 100644
--- a/docs/benchmarks.rst
+++ b/docs/benchmarks.rst
@@ -89,6 +89,7 @@ We compare our results against existing data loading platforms:
 - `Pytorch DataLoader <https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader>`_: This is the default option that comes with the Pytorch library and uses individual JPEG files as the source.
 - `Webdataset <https://github.com/webdataset/webdataset>`_: This loader requires pre-processed files aggregated in multiple big `.tar` archives.
 - `DALI <https://docs.nvidia.com/deeplearning/dali/user-guide/docs/>`_: Data loading pipeline developed by Nvidia. In this experiment we used the default file format which is the same as that of the Pytorch DataLoader.
+
 The specific instantiation of DALI that we apply is the PyTorch ImageNet example DALI code found in the `NVIDIA DeepLearningExamples repository <https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/Classification/ConvNets/resnet50v1.5>`_.
 We use the DGX-1 configuration and remove all the model optimization, benchmarking only the dataloader.
 
diff --git a/docs/ffcv_examples/custom_transforms.rst b/docs/ffcv_examples/custom_transforms.rst
index 7c4b4195..54acaa49 100644
--- a/docs/ffcv_examples/custom_transforms.rst
+++ b/docs/ffcv_examples/custom_transforms.rst
@@ -31,11 +31,11 @@ Doing so requires providing implementation for two functions:
         # Return the code to run this operation
         @abstractmethod
         def generate_code(self) -> Callable:
-            raise NotImplementedError
+            raise NotImplementedError()
 
         @abstractmethod
         def declare_state_and_memory(self, previous_state: State) -> Tuple[State, Optional[AllocationQuery]]:
-            raise NotImplementedError
+            raise NotImplementedError()
 
 Advancing state and pre-allocating memory
 ------------------------------------------
diff --git a/ffcv/benchmarks/benchmark.py b/ffcv/benchmarks/benchmark.py
index 859f731d..a5489cbe 100644
--- a/ffcv/benchmarks/benchmark.py
+++ b/ffcv/benchmarks/benchmark.py
@@ -8,4 +8,4 @@ def __init__(self, **kwargs):
 
     @abstractmethod
     def run(self):
-        raise NotImplemented()
\ No newline at end of file
+        raise NotImplementedError()
\ No newline at end of file
diff --git a/ffcv/fields/base.py b/ffcv/fields/base.py
index 329275d1..9f8fe171 100644
--- a/ffcv/fields/base.py
+++ b/ffcv/fields/base.py
@@ -25,21 +25,21 @@ class Field(ABC):
     @property
     @abstractmethod
     def metadata_type(self) -> np.dtype:
-        raise NotImplemented
+        raise NotImplementedError()
 
     @staticmethod
     @abstractmethod
     def from_binary(binary: ARG_TYPE) -> Field:
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @abstractmethod
     def to_binary(self) -> ARG_TYPE:
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @abstractmethod
     def encode(field, metadata_destination, malloc):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @abstractmethod
     def get_decoder_class(self) -> Type[Operation]:
-        raise NotImplementedError
+        raise NotImplementedError()
diff --git a/ffcv/fields/rgb_image.py b/ffcv/fields/rgb_image.py
index 829ec8a3..87dd7707 100644
--- a/ffcv/fields/rgb_image.py
+++ b/ffcv/fields/rgb_image.py
@@ -238,7 +238,7 @@ def decode(batch_indices, my_storage, metadata, storage_state):
     @property
     @abstractmethod
     def get_crop_generator():
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class RandomResizedCropRGBImageDecoder(ResizedCropRGBImageDecoder):
diff --git a/ffcv/memory_managers/base.py b/ffcv/memory_managers/base.py
index 525833a0..2c450369 100644
--- a/ffcv/memory_managers/base.py
+++ b/ffcv/memory_managers/base.py
@@ -72,7 +72,7 @@ def schedule_epoch(self, batches: Sequence[Sequence[int]]) -> MemoryContext:
 
     @abstractmethod
     def compile_reader(self, address, size) -> Callable:
-        raise NotImplemented()
+        raise NotImplementedError()
 
     @property
     @abstractmethod
diff --git a/ffcv/pipeline/graph.py b/ffcv/pipeline/graph.py
index 05da7cee..9ff26fb0 100644
--- a/ffcv/pipeline/graph.py
+++ b/ffcv/pipeline/graph.py
@@ -1,6 +1,6 @@
-from distutils.log import warn
 import warnings
 import ast
+import sys
 
 try:
     # Useful for debugging
@@ -23,11 +23,18 @@
 import torch as ch
 import numpy as np
 
-# This is the starting state of the pipeline
-INITIAL_STATE = State(jit_mode=True,
-                       device=ch.device('cpu'),
-                       dtype=np.dtype('u1'),
-                       shape=None)
+if "sphinx" in sys.modules:
+    # Doc builds: Sphinx mis-initializes the device, tripping State's JIT-on-GPU check, so disable JIT
+    INITIAL_STATE = State(jit_mode=False,
+                          device=ch.device('cpu'),
+                          dtype=np.dtype('u1'),
+                          shape=None)
+else:
+    # This is the starting state of the pipeline
+    INITIAL_STATE = State(jit_mode=True,
+                          device=ch.device('cpu'),
+                          dtype=np.dtype('u1'),
+                          shape=None)
 
 
 class Node(ABC):
@@ -40,34 +47,34 @@ def __init__(self):
     @property
     @abstractmethod
     def is_jitted(self):
-        raise NotImplemented()
+        raise NotImplementedError()
 
     @property
     @abstractmethod
     def parent(self):
-        raise NotImplemented()
+        raise NotImplementedError()
     
     @property
     @abstractmethod
     def arg_id(self):
-        raise NotImplemented()
+        raise NotImplementedError()
     
     @property
     @abstractmethod
     def result_id(self):
-        raise NotImplemented()
+        raise NotImplementedError()
     
     @property
     @abstractmethod
     def result_id(self):
-        raise NotImplemented()
+        raise NotImplementedError()
     
     def get_shared_code_ast(self, done_ops):
         return ast.Pass()
     
     @abstractmethod
     def generate_code(self):
-        raise NotImplemented()
+        raise NotImplementedError()
 
     def recompile(self):
         self._code = self.generate_code()
diff --git a/ffcv/pipeline/operation.py b/ffcv/pipeline/operation.py
index 8ad947e8..b46257fc 100644
--- a/ffcv/pipeline/operation.py
+++ b/ffcv/pipeline/operation.py
@@ -28,7 +28,7 @@ def accept_globals(self, metadata, memory_read):
     # Return the code to run this operation
     @abstractmethod
     def generate_code(self) -> Callable:
-        raise NotImplementedError
+        raise NotImplementedError()
 
     def declare_shared_memory(self, previous_state: State) -> Optional[AllocationQuery]:
         return None
@@ -38,4 +38,4 @@ def generate_code_for_shared_state(self) -> Optional[Callable]:
 
     @abstractmethod
     def declare_state_and_memory(self, previous_state: State) -> Tuple[State, Optional[AllocationQuery]]:
-        raise NotImplementedError
+        raise NotImplementedError()
diff --git a/ffcv/pipeline/state.py b/ffcv/pipeline/state.py
index a2e31dcc..0b553c5c 100644
--- a/ffcv/pipeline/state.py
+++ b/ffcv/pipeline/state.py
@@ -14,7 +14,7 @@ class State:
     
     # Assess the validity of a pipeline stage
     def __post_init__(self):
-        if self.jit_mode and self.device != ch.device('cpu'):
+        if self.jit_mode and self.device.type != 'cpu':
             raise AssertionError("Can't be in JIT mode and on the GPU")
         if self.jit_mode and isinstance(self.dtype, ch.dtype):
             raise AssertionError("Can't allocate a torch tensor in JIT mode")
\ No newline at end of file
diff --git a/ffcv/transforms/mixup.py b/ffcv/transforms/mixup.py
index 53239b6f..724994d5 100644
--- a/ffcv/transforms/mixup.py
+++ b/ffcv/transforms/mixup.py
@@ -58,7 +58,7 @@ def declare_state_and_memory(self, previous_state: State) -> Tuple[State, Option
 
 class LabelMixup(Operation):
     """Mixup for labels. Should be initialized in exactly the same way as
-    :cla:`ffcv.transforms.ImageMixup`.
+    :class:`ffcv.transforms.ImageMixup`.
     """
     def __init__(self, alpha: float, same_lambda: bool):
         super().__init__()
diff --git a/ffcv/transforms/random_resized_crop.py b/ffcv/transforms/random_resized_crop.py
index 5a7405c5..7f311b42 100644
--- a/ffcv/transforms/random_resized_crop.py
+++ b/ffcv/transforms/random_resized_crop.py
@@ -14,7 +14,7 @@ class RandomResizedCrop(Operation):
     """Crop a random portion of image with random aspect ratio and resize it to
     a given size. Chances are you do not want to use this augmentation and
     instead want to include RRC as part of the decoder, by using the 
-    :cla:`~ffcv.fields.rgb_image.ResizedCropRGBImageDecoder` class.
+    :class:`~ffcv.fields.rgb_image.ResizedCropRGBImageDecoder` class.
 
     Parameters
     ----------
diff --git a/ffcv/traversal_order/base.py b/ffcv/traversal_order/base.py
index 74f1a70b..fcbb7f5e 100644
--- a/ffcv/traversal_order/base.py
+++ b/ffcv/traversal_order/base.py
@@ -17,4 +17,4 @@ def __init__(self, loader: 'Loader'):
 
     @abstractmethod
     def sample_order(self, epoch:int) -> Sequence[int]:
-        raise NotImplemented()
+        raise NotImplementedError()