diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bfcbadb..caf757d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/pycqa/isort - rev: 5.13.2 + rev: 6.0.1 hooks: - id: isort args: [--profile, black, --force-single-line-imports] - repo: https://github.com/psf/black-pre-commit-mirror - rev: 24.4.2 + rev: 25.1.0 hooks: - id: black language_version: python3.10 \ No newline at end of file diff --git a/benchmark/benchmark_backend.py b/benchmark/benchmark_backend.py index 6f57c03..c841c7e 100644 --- a/benchmark/benchmark_backend.py +++ b/benchmark/benchmark_backend.py @@ -4,31 +4,40 @@ Example usage: pytest benchmark/benchmark_backend.py --benchmark-only --benchmark-group-by=func -v """ -import torch -import pytest import asyncio -from hfppl.llms import CachedCausalLM + +import pytest +import torch + from examples.haiku import run_example as run_haiku from examples.hard_constraints import run_example as run_hard_constraints +from hfppl.llms import CachedCausalLM backends = [ - 'hf', + "hf", pytest.param( - 'vllm', + "vllm", marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason="vLLM backend requires CUDA" - ) - ) + not torch.cuda.is_available(), reason="vLLM backend requires CUDA" + ), + ), ] + @pytest.fixture def LLM(backend): # Set lower gpu_memory_utilization in vllm so that we can fit both models on the GPU - kwargs = {'engine_opts' : {'gpu_memory_utilization' : 0.45}, 'cache_size' : 100} if backend == 'vllm' else {} - return CachedCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", backend=backend, **kwargs) + kwargs = ( + {"engine_opts": {"gpu_memory_utilization": 0.45}, "cache_size": 100} + if backend == "vllm" + else {} + ) + return CachedCausalLM.from_pretrained( + "meta-llama/Meta-Llama-3-8B", backend=backend, **kwargs + ) -@pytest.mark.parametrize('backend', backends) + +@pytest.mark.parametrize("backend", backends) def test_hard_constraints_benchmark(LLM, benchmark, n_particles=20, max_tokens=50): def run_with_clear_cache(): LLM.clear_cache() @@ -38,24 +47,25 @@ def run_with_clear_cache(): # warmup run_with_clear_cache() - + benchmark.pedantic( run_with_clear_cache, iterations=1, rounds=3, ) -@pytest.mark.parametrize('backend', backends) + +@pytest.mark.parametrize("backend", backends) def test_haiku_benchmark(LLM, benchmark, n_particles=20): def run_with_clear_cache(): LLM.clear_cache() return asyncio.run( - run_haiku(LLM, poem_title='The beauty of testing', n_particles=n_particles) + run_haiku(LLM, poem_title="The beauty of testing", n_particles=n_particles) ) # warmup run_with_clear_cache() - + benchmark.pedantic( run_with_clear_cache, iterations=1, diff --git a/examples/grammar_constraint.py b/examples/grammar_constraint.py index 73bf11b..0b0bd71 100644 --- a/examples/grammar_constraint.py +++ b/examples/grammar_constraint.py @@ -126,7 +126,7 @@ async def run_generation( verbose: bool = False, ): LLM = CachedCausalLM.from_pretrained(args.model) - if LLM.backend == 'hf': + if LLM.backend == "hf": LLM.batch_size = args.batch_size model = GrammarConstrainedSMC( lm=LLM, diff --git a/examples/haiku.py b/examples/haiku.py index 4d4bfd0..7ffb573 100644 --- a/examples/haiku.py +++ b/examples/haiku.py @@ -34,6 +34,7 @@ def count_syllables(word, unknown_word_syllables=100): return syllable_count + # Example poems for the prompt. 
# Authors: # - Amy Lowell @@ -65,6 +66,7 @@ def count_syllables(word, unknown_word_syllables=100): this deep in fall, still not a butterfly.""" + # LLaMPPL model class Haiku(Model): @@ -116,13 +118,16 @@ def string_for_serialization(self): ) return s.replace("\n", "/") -async def run_example(LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=20, ess_threshold=0.5): + +async def run_example( + LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=20, ess_threshold=0.5 +): # Construct prompt prompt = f"""{EXAMPLE_POEMS} 5. "{poem_title}" """ - + # Cache the key value vectors for the prompt LLM.cache_kv(LLM.tokenizer.encode(prompt)) @@ -136,6 +141,7 @@ async def run_example(LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=2 return particles + def main(): # Load the language model. # Mistral is an open model; to use a model with restricted access, like LLaMA 3, @@ -144,22 +150,24 @@ def main(): # LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") # Set batch size if using HuggingFace backend - if LLM.backend == 'hf': + if LLM.backend == "hf": LLM.batch_size = 40 # Get poem title from user poem_title = input("Enter a title for your Haiku: ") - syllables_per_line = [5, 7, 5] # [5, 3, 5] for a Lune - + syllables_per_line = [5, 7, 5] # [5, 3, 5] for a Lune + # Run the example - particles = asyncio.run(run_example(LLM, poem_title, syllable_pattern=syllables_per_line)) + particles = asyncio.run( + run_example(LLM, poem_title, syllable_pattern=syllables_per_line) + ) print("--------") for i, particle in enumerate(particles): print(f"\nPoem {i} (weight {particle.weight}):") print(f"{particle.context}") + if __name__ == "__main__": main() - diff --git a/examples/hard_constraints.py b/examples/hard_constraints.py index 17bbb89..23b13e5 100644 --- a/examples/hard_constraints.py +++ b/examples/hard_constraints.py @@ -7,6 +7,7 @@ from hfppl import Model from hfppl import smc_standard + def make_masks(LLM): return { i: set( @@ -63,7 +64,7 @@ def string_for_serialization(self): return f"{self.context}" def immutable_properties(self): - return ['masks'] + return ["masks"] # From Politico.com @@ -75,6 +76,7 @@ def immutable_properties(self): 3.""" + async def run_example(LLM, max_tokens=50, n_particles=20, ess_threshold=0.5): # Cache the key value vectors for the prompt. LLM.cache_kv(LLM.tokenizer.encode(prompt)) @@ -91,6 +93,7 @@ async def run_example(LLM, max_tokens=50, n_particles=20, ess_threshold=0.5): return particles + def main(): # Load the language model. # Mistral and Vicuna are open models; to use a model with restricted access, like LLaMA 3, @@ -100,11 +103,12 @@ def main(): # LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") # Set batch size if provided. This operation is only valid for the HuggingFace backend. - if LLM.backend == 'hf': + if LLM.backend == "hf": LLM.batch_size = 40 - + # Run the example. asyncio.run(run_example(LLM)) + if __name__ == "__main__": main() diff --git a/hfppl/__init__.py b/hfppl/__init__.py index cb938f9..febaca4 100644 --- a/hfppl/__init__.py +++ b/hfppl/__init__.py @@ -1,5 +1,4 @@ -"""Probabilistic programming with HuggingFace Transformer models. 
-""" +"""Probabilistic programming with HuggingFace Transformer models.""" from .chunks import * from .distributions import * diff --git a/hfppl/chunks.py b/hfppl/chunks.py index 4976040..cc5e378 100644 --- a/hfppl/chunks.py +++ b/hfppl/chunks.py @@ -7,7 +7,9 @@ @submodel async def sample_word(self, context, max_tokens=5, allow_punctuation=True): """Sample a word from the `LMContext` object `context`.""" - last_token = context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" + last_token = ( + context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" + ) last_character = last_token[-1] if len(last_token) > 0 else "" needs_space = last_character not in string.whitespace and last_character not in [ "-", @@ -81,7 +83,9 @@ async def sample_word_2( if max_chars is not None: assert max_chars > 1 - last_token = context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" + last_token = ( + context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" + ) last_character = last_token[-1] if len(last_token) > 0 else "" needs_space = last_character not in string.whitespace and last_character not in [ "-", diff --git a/hfppl/distributions/lmcontext.py b/hfppl/distributions/lmcontext.py index 6f2db00..9e963fa 100644 --- a/hfppl/distributions/lmcontext.py +++ b/hfppl/distributions/lmcontext.py @@ -1,5 +1,6 @@ import copy import warnings + import numpy as np from ..llms import Token @@ -82,7 +83,9 @@ async def log_prob(self, v): # If there are no good tokens, the log probability of v under the mask is -inf # However, since this method updates the model_mask as a side-effect, # this will put the context in an invalid state, so we instead raise an exception. - raise NullMask("Unable to compute log probability of mask that rules out all tokens.") + raise NullMask( + "Unable to compute log probability of mask that rules out all tokens." + ) else: logprob_good = logsumexp(self.ctx.next_token_logprobs[list(good_tokens)]) diff --git a/hfppl/llms.py b/hfppl/llms.py index 82d65a0..0e1472f 100644 --- a/hfppl/llms.py +++ b/hfppl/llms.py @@ -1,11 +1,14 @@ """Utilities for working with language models.""" -import torch -import string import asyncio +import string import warnings from collections import defaultdict -from genlm_backend.llm import AsyncVirtualLM, AsyncTransformer, MockAsyncLM + +import torch +from genlm_backend.llm import AsyncTransformer +from genlm_backend.llm import AsyncVirtualLM +from genlm_backend.llm import MockAsyncLM VLLM_AVAILABLE = True try: @@ -13,8 +16,9 @@ except ImportError: VLLM_AVAILABLE = False -warnings.filterwarnings('once', category=DeprecationWarning) -warnings.filterwarnings('once', category=RuntimeWarning) +warnings.filterwarnings("once", category=DeprecationWarning) +warnings.filterwarnings("once", category=RuntimeWarning) + class Masks: def __init__(self, lm): @@ -207,55 +211,59 @@ def from_pretrained(cls, model_id, backend=None, **kwargs): Returns: CachedCausalLM: The hfppl-compatible interface to the `AsyncLM` model. """ - backend = backend or ('vllm' if (torch.cuda.is_available() and VLLM_AVAILABLE) else 'hf') + backend = backend or ( + "vllm" if (torch.cuda.is_available() and VLLM_AVAILABLE) else "hf" + ) - if backend == 'vllm': + if backend == "vllm": if not VLLM_AVAILABLE: raise ValueError( "vLLM backend requested but vLLM is not installed. " "Please install vLLM with `pip install vllm`." 
) model_cls = AsyncVirtualLM - elif backend == 'hf': + elif backend == "hf": model_cls = AsyncTransformer - elif backend == 'mock': + elif backend == "mock": model_cls = MockAsyncLM else: - raise ValueError(f"Unknown backend: {backend}. Must be one of ['vllm', 'hf', 'mock']") + raise ValueError( + f"Unknown backend: {backend}. Must be one of ['vllm', 'hf', 'mock']" + ) # Handle legacy auth_token parameter. The ability to pass in the auth_token should # be removed in a future version since it is not supported by the vllm backend. # Users should authenticate with the HuggingFace CLI. - auth_token = kwargs.pop('auth_token', None) + auth_token = kwargs.pop("auth_token", None) if auth_token: - if backend == 'vllm': + if backend == "vllm": raise ValueError( "Explicitly passing auth_token is not compatible with the vLLM AsyncLM backend. " "Authenticate using `huggingface-cli login` instead." ) - if 'hf_opts' not in kwargs: - kwargs['hf_opts'] = {} - kwargs['hf_opts']['token'] = auth_token + if "hf_opts" not in kwargs: + kwargs["hf_opts"] = {} + kwargs["hf_opts"]["token"] = auth_token warnings.warn( "Passing auth_token directly is deprecated and will be removed in a future version. " "Please authenticate using `huggingface-cli login` instead.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) - load_in_8bit = kwargs.pop('load_in_8bit', False) + load_in_8bit = kwargs.pop("load_in_8bit", False) if load_in_8bit: - if 'bitsandbytes_opts' not in kwargs: - kwargs['bitsandbytes_opts'] = {} - kwargs['bitsandbytes_opts']['load_in_8bit'] = True + if "bitsandbytes_opts" not in kwargs: + kwargs["bitsandbytes_opts"] = {} + kwargs["bitsandbytes_opts"]["load_in_8bit"] = True warnings.warn( "load_in_8bit is deprecated and will be removed in a future version. " "Please pass `bitsandbytes_opts` instead.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) model = model_cls.from_name(model_id, **kwargs) @@ -270,13 +278,15 @@ def __init__(self, model): model (genlm_backend.llm.AsyncLM): an `AsyncLM` instance. """ if isinstance(model, AsyncVirtualLM): - self.backend = 'vllm' + self.backend = "vllm" elif isinstance(model, AsyncTransformer): - self.backend = 'hf' + self.backend = "hf" elif isinstance(model, MockAsyncLM): - self.backend = 'mock' + self.backend = "mock" else: - raise ValueError(f"Unknown model type: {type(model)}. Must be one of [AsyncVirtualLM, AsyncTransformer, MockAsyncLM]") + raise ValueError( + f"Unknown model type: {type(model)}. Must be one of [AsyncVirtualLM, AsyncTransformer, MockAsyncLM]" + ) self.model = model self.tokenizer = model.tokenizer @@ -290,7 +300,7 @@ def vocab(self): warnings.warn( "Accessing .vocab directly is deprecated and will be removed in a future version. Use .str_vocab or .byte_vocab instead.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) return self.model.str_vocab @@ -331,33 +341,37 @@ def clear_cache(self): def clear_kv_cache(self): """Clear any key and value vectors from the cache.""" - if self.backend == 'hf': + if self.backend == "hf": self.model.clear_kv_cache() - elif self.backend == 'vllm': + elif self.backend == "vllm": warnings.warn( "clear_kv_cache() is only supported for the HuggingFace backend. The KV cache for the vLLM backend is handled internally by vLLM. 
No operation performed.", RuntimeWarning, - stacklevel=2 + stacklevel=2, ) - elif self.backend == 'mock': + elif self.backend == "mock": pass else: - raise RuntimeError(f"clear_kv_cache() is not implemented for backend type {type(self.model)}") + raise RuntimeError( + f"clear_kv_cache() is not implemented for backend type {type(self.model)}" + ) def reset_async_queries(self): """Clear any pending language model queries from the queue.""" - if self.backend == 'hf': + if self.backend == "hf": self.model.reset_async_queries() - elif self.backend == 'vllm': + elif self.backend == "vllm": warnings.warn( "reset_async_queries() is only supported for the HuggingFace backend. No operation performed.", RuntimeWarning, - stacklevel=2 + stacklevel=2, ) - elif self.backend == 'mock': + elif self.backend == "mock": pass else: - raise RuntimeError(f"reset_async_queries() is not implemented for backend type {type(self.model)}") + raise RuntimeError( + f"reset_async_queries() is not implemented for backend type {type(self.model)}" + ) def cache_kv(self, prompt_tokens): """Cache the key and value vectors for a prompt. @@ -365,15 +379,17 @@ def cache_kv(self, prompt_tokens): Args: prompt_tokens (list[int]): token ids for the prompt to cache. """ - if self.backend == 'hf': + if self.backend == "hf": self.model.cache_kv(prompt_tokens) - elif self.backend == 'vllm': + elif self.backend == "vllm": warnings.warn( "cache_kv() is only supported for the HuggingFace backend. The KV cache for the vLLM backend is handled internally by vLLM. No operation performed.", RuntimeWarning, - stacklevel=2 + stacklevel=2, ) - elif self.backend == 'mock': + elif self.backend == "mock": pass else: - raise RuntimeError(f"cache_kv() is not implemented for backend type {type(self.model)}") \ No newline at end of file + raise RuntimeError( + f"cache_kv() is not implemented for backend type {type(self.model)}" + ) diff --git a/tests/test_examples.py b/tests/test_examples.py index a11e085..1aaeb4c 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -1,38 +1,44 @@ -import torch -import pytest import asyncio -from hfppl.llms import CachedCausalLM + +import pytest +import torch + from examples.haiku import run_example as run_haiku from examples.hard_constraints import run_example as run_hard_constraints +from hfppl.llms import CachedCausalLM backends = [ - 'mock', - 'hf', + "mock", + "hf", pytest.param( - 'vllm', + "vllm", marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason="vLLM backend requires CUDA" - ) - ) + not torch.cuda.is_available(), reason="vLLM backend requires CUDA" + ), + ), ] + @pytest.fixture def LLM(backend): # Set lower gpu_memory_utilization in vllm so that we can fit both models on the GPU - kwargs = {'engine_opts' : {'gpu_memory_utilization' : 0.45}} if backend == 'vllm' else {} - return CachedCausalLM.from_pretrained('gpt2', backend=backend, **kwargs) + kwargs = ( + {"engine_opts": {"gpu_memory_utilization": 0.45}} if backend == "vllm" else {} + ) + return CachedCausalLM.from_pretrained("gpt2", backend=backend, **kwargs) + -@pytest.mark.parametrize('backend', backends) +@pytest.mark.parametrize("backend", backends) def test_hard_constraints(LLM, n_particles=20, max_tokens=25): particles = asyncio.run( run_hard_constraints(LLM, max_tokens=max_tokens, n_particles=n_particles) ) assert len(particles) == n_particles -@pytest.mark.parametrize('backend', backends) + +@pytest.mark.parametrize("backend", backends) def test_haiku(LLM, n_particles=20): particles = asyncio.run( - run_haiku(LLM, 
poem_title='The beauty of testing', n_particles=n_particles) + run_haiku(LLM, poem_title="The beauty of testing", n_particles=n_particles) ) assert len(particles) == n_particles diff --git a/tests/test_lmcontext.py b/tests/test_lmcontext.py index a528c63..75fc33f 100644 --- a/tests/test_lmcontext.py +++ b/tests/test_lmcontext.py @@ -1,36 +1,40 @@ -import torch -import pytest import asyncio + import numpy as np -from hfppl.llms import CachedCausalLM +import pytest +import torch + from hfppl.distributions.lmcontext import LMContext +from hfppl.llms import CachedCausalLM backends = [ - 'mock', - 'hf', + "mock", + "hf", pytest.param( - 'vllm', + "vllm", marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason="vLLM backend requires CUDA" - ) - ) + not torch.cuda.is_available(), reason="vLLM backend requires CUDA" + ), + ), ] @pytest.fixture def lm(backend): - return CachedCausalLM.from_pretrained('gpt2', backend=backend) + return CachedCausalLM.from_pretrained("gpt2", backend=backend) -@pytest.mark.parametrize('backend', backends) +@pytest.mark.parametrize("backend", backends) def test_init(lm): - prompt = 'Hello, world!' + prompt = "Hello, world!" lmcontext = LMContext(lm, prompt) assert lmcontext.tokens == lm.tokenizer.encode(prompt) logprobs = lm.next_token_logprobs_unbatched(lmcontext.tokens) np.testing.assert_allclose( - lmcontext.next_token_logprobs, logprobs, rtol=1e-5, err_msg='Sync context __init__' + lmcontext.next_token_logprobs, + logprobs, + rtol=1e-5, + err_msg="Sync context __init__", ) async def async_context(): @@ -38,7 +42,10 @@ async def async_context(): lmcontext = asyncio.run(async_context()) np.testing.assert_allclose( - lmcontext.next_token_logprobs, logprobs, rtol=1e-5, err_msg='Async context __init__' + lmcontext.next_token_logprobs, + logprobs, + rtol=1e-5, + err_msg="Async context __init__", ) async def async_context_create(): @@ -46,5 +53,8 @@ async def async_context_create(): lmcontext = asyncio.run(async_context_create()) np.testing.assert_allclose( - lmcontext.next_token_logprobs, logprobs, rtol=1e-5, err_msg='Async context create' + lmcontext.next_token_logprobs, + logprobs, + rtol=1e-5, + err_msg="Async context create", )
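
Reviewer note (not part of the patch): besides the isort/black reformatting, the hunks above pass through the backend-selection, KV-cache, and warning paths of hfppl/llms.py. The sketch below exercises those paths using only calls that already appear elsewhere in this diff; "gpt2" and the "mock" backend mirror the test fixtures, and the prompt string is illustrative.

# Minimal sketch (not part of the patch): backend selection and KV caching as
# reformatted in hfppl/llms.py, using only APIs that appear elsewhere in this diff.
from hfppl.distributions.lmcontext import LMContext
from hfppl.llms import CachedCausalLM

# backend may be "mock", "hf", or "vllm"; omitting it falls back to "vllm" when CUDA
# and vLLM are available, otherwise "hf". "mock" avoids loading real weights.
LLM = CachedCausalLM.from_pretrained("gpt2", backend="mock")

# batch_size and cache_kv() only take effect on the HuggingFace backend; on vLLM they
# emit the RuntimeWarnings shown above and do nothing, and on "mock" they are skipped.
if LLM.backend == "hf":
    LLM.batch_size = 40
LLM.cache_kv(LLM.tokenizer.encode("Hello, world!"))

# Next-token log-probabilities for a context, as asserted in tests/test_lmcontext.py.
context = LMContext(LLM, "Hello, world!")
logprobs = LLM.next_token_logprobs_unbatched(context.tokens)
print(logprobs[:5])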
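
A second sketch, also not part of the patch, mirrors what tests/test_examples.py runs end to end: the hard-constraints example with the small "gpt2" model on the HuggingFace backend. The reduced particle count is illustrative; weight and context are the particle attributes printed by examples/haiku.py above.

# End-to-end sketch (not part of the patch): run the hard-constraints example the way
# tests/test_examples.py does, with a small model and fewer particles for speed.
import asyncio

from examples.hard_constraints import run_example as run_hard_constraints
from hfppl.llms import CachedCausalLM

LLM = CachedCausalLM.from_pretrained("gpt2", backend="hf")
LLM.batch_size = 40  # only meaningful on the HuggingFace backend

particles = asyncio.run(run_hard_constraints(LLM, max_tokens=25, n_particles=5))
for i, particle in enumerate(particles):
    print(f"Particle {i} (weight {particle.weight}): {particle.context}")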