diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index bfcbadb..caf757d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,11 +1,11 @@ repos: - repo: https://github.com/pycqa/isort - rev: 5.13.2 + rev: 6.0.1 hooks: - id: isort args: [--profile, black, --force-single-line-imports] - repo: https://github.com/psf/black-pre-commit-mirror - rev: 24.4.2 + rev: 25.1.0 hooks: - id: black language_version: python3.10 \ No newline at end of file diff --git a/benchmark/benchmark_backend.py b/benchmark/benchmark_backend.py index 6f57c03..c841c7e 100644 --- a/benchmark/benchmark_backend.py +++ b/benchmark/benchmark_backend.py @@ -4,31 +4,40 @@ Example usage: pytest benchmark/benchmark_backend.py --benchmark-only --benchmark-group-by=func -v """ -import torch -import pytest import asyncio -from hfppl.llms import CachedCausalLM + +import pytest +import torch + from examples.haiku import run_example as run_haiku from examples.hard_constraints import run_example as run_hard_constraints +from hfppl.llms import CachedCausalLM backends = [ - 'hf', + "hf", pytest.param( - 'vllm', + "vllm", marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason="vLLM backend requires CUDA" - ) - ) + not torch.cuda.is_available(), reason="vLLM backend requires CUDA" + ), + ), ] + @pytest.fixture def LLM(backend): # Set lower gpu_memory_utilization in vllm so that we can fit both models on the GPU - kwargs = {'engine_opts' : {'gpu_memory_utilization' : 0.45}, 'cache_size' : 100} if backend == 'vllm' else {} - return CachedCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", backend=backend, **kwargs) + kwargs = ( + {"engine_opts": {"gpu_memory_utilization": 0.45}, "cache_size": 100} + if backend == "vllm" + else {} + ) + return CachedCausalLM.from_pretrained( + "meta-llama/Meta-Llama-3-8B", backend=backend, **kwargs + ) -@pytest.mark.parametrize('backend', backends) + +@pytest.mark.parametrize("backend", backends) def test_hard_constraints_benchmark(LLM, benchmark, n_particles=20, max_tokens=50): def run_with_clear_cache(): LLM.clear_cache() @@ -38,24 +47,25 @@ def run_with_clear_cache(): # warmup run_with_clear_cache() - + benchmark.pedantic( run_with_clear_cache, iterations=1, rounds=3, ) -@pytest.mark.parametrize('backend', backends) + +@pytest.mark.parametrize("backend", backends) def test_haiku_benchmark(LLM, benchmark, n_particles=20): def run_with_clear_cache(): LLM.clear_cache() return asyncio.run( - run_haiku(LLM, poem_title='The beauty of testing', n_particles=n_particles) + run_haiku(LLM, poem_title="The beauty of testing", n_particles=n_particles) ) # warmup run_with_clear_cache() - + benchmark.pedantic( run_with_clear_cache, iterations=1, diff --git a/examples/grammar_constraint.py b/examples/grammar_constraint.py index 73bf11b..0b0bd71 100644 --- a/examples/grammar_constraint.py +++ b/examples/grammar_constraint.py @@ -126,7 +126,7 @@ async def run_generation( verbose: bool = False, ): LLM = CachedCausalLM.from_pretrained(args.model) - if LLM.backend == 'hf': + if LLM.backend == "hf": LLM.batch_size = args.batch_size model = GrammarConstrainedSMC( lm=LLM, diff --git a/examples/haiku.py b/examples/haiku.py index 4d4bfd0..7ffb573 100644 --- a/examples/haiku.py +++ b/examples/haiku.py @@ -34,6 +34,7 @@ def count_syllables(word, unknown_word_syllables=100): return syllable_count + # Example poems for the prompt. 
# Authors: # - Amy Lowell @@ -65,6 +66,7 @@ def count_syllables(word, unknown_word_syllables=100): this deep in fall, still not a butterfly.""" + # LLaMPPL model class Haiku(Model): @@ -116,13 +118,16 @@ def string_for_serialization(self): ) return s.replace("\n", "/") -async def run_example(LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=20, ess_threshold=0.5): + +async def run_example( + LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=20, ess_threshold=0.5 +): # Construct prompt prompt = f"""{EXAMPLE_POEMS} 5. "{poem_title}" """ - + # Cache the key value vectors for the prompt LLM.cache_kv(LLM.tokenizer.encode(prompt)) @@ -136,6 +141,7 @@ async def run_example(LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=2 return particles + def main(): # Load the language model. # Mistral is an open model; to use a model with restricted access, like LLaMA 3, @@ -144,22 +150,24 @@ def main(): # LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") # Set batch size if using HuggingFace backend - if LLM.backend == 'hf': + if LLM.backend == "hf": LLM.batch_size = 40 # Get poem title from user poem_title = input("Enter a title for your Haiku: ") - syllables_per_line = [5, 7, 5] # [5, 3, 5] for a Lune - + syllables_per_line = [5, 7, 5] # [5, 3, 5] for a Lune + # Run the example - particles = asyncio.run(run_example(LLM, poem_title, syllable_pattern=syllables_per_line)) + particles = asyncio.run( + run_example(LLM, poem_title, syllable_pattern=syllables_per_line) + ) print("--------") for i, particle in enumerate(particles): print(f"\nPoem {i} (weight {particle.weight}):") print(f"{particle.context}") + if __name__ == "__main__": main() - diff --git a/examples/hard_constraints.py b/examples/hard_constraints.py index 17bbb89..23b13e5 100644 --- a/examples/hard_constraints.py +++ b/examples/hard_constraints.py @@ -7,6 +7,7 @@ from hfppl import Model from hfppl import smc_standard + def make_masks(LLM): return { i: set( @@ -63,7 +64,7 @@ def string_for_serialization(self): return f"{self.context}" def immutable_properties(self): - return ['masks'] + return ["masks"] # From Politico.com @@ -75,6 +76,7 @@ def immutable_properties(self): 3.""" + async def run_example(LLM, max_tokens=50, n_particles=20, ess_threshold=0.5): # Cache the key value vectors for the prompt. LLM.cache_kv(LLM.tokenizer.encode(prompt)) @@ -91,6 +93,7 @@ async def run_example(LLM, max_tokens=50, n_particles=20, ess_threshold=0.5): return particles + def main(): # Load the language model. # Mistral and Vicuna are open models; to use a model with restricted access, like LLaMA 3, @@ -100,11 +103,12 @@ def main(): # LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1") # Set batch size if provided. This operation is only valid for the HuggingFace backend. - if LLM.backend == 'hf': + if LLM.backend == "hf": LLM.batch_size = 40 - + # Run the example. asyncio.run(run_example(LLM)) + if __name__ == "__main__": main() diff --git a/hfppl/__init__.py b/hfppl/__init__.py index cb938f9..febaca4 100644 --- a/hfppl/__init__.py +++ b/hfppl/__init__.py @@ -1,5 +1,4 @@ -"""Probabilistic programming with HuggingFace Transformer models. 
-""" +"""Probabilistic programming with HuggingFace Transformer models.""" from .chunks import * from .distributions import * diff --git a/hfppl/chunks.py b/hfppl/chunks.py index 4976040..cc5e378 100644 --- a/hfppl/chunks.py +++ b/hfppl/chunks.py @@ -7,7 +7,9 @@ @submodel async def sample_word(self, context, max_tokens=5, allow_punctuation=True): """Sample a word from the `LMContext` object `context`.""" - last_token = context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" + last_token = ( + context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" + ) last_character = last_token[-1] if len(last_token) > 0 else "" needs_space = last_character not in string.whitespace and last_character not in [ "-", @@ -81,7 +83,9 @@ async def sample_word_2( if max_chars is not None: assert max_chars > 1 - last_token = context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" + last_token = ( + context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else "" + ) last_character = last_token[-1] if len(last_token) > 0 else "" needs_space = last_character not in string.whitespace and last_character not in [ "-", diff --git a/hfppl/distributions/lmcontext.py b/hfppl/distributions/lmcontext.py index 6f2db00..9e963fa 100644 --- a/hfppl/distributions/lmcontext.py +++ b/hfppl/distributions/lmcontext.py @@ -1,5 +1,6 @@ import copy import warnings + import numpy as np from ..llms import Token @@ -82,7 +83,9 @@ async def log_prob(self, v): # If there are no good tokens, the log probability of v under the mask is -inf # However, since this method updates the model_mask as a side-effect, # this will put the context in an invalid state, so we instead raise an exception. - raise NullMask("Unable to compute log probability of mask that rules out all tokens.") + raise NullMask( + "Unable to compute log probability of mask that rules out all tokens." + ) else: logprob_good = logsumexp(self.ctx.next_token_logprobs[list(good_tokens)]) diff --git a/hfppl/llms.py b/hfppl/llms.py index 82d65a0..0e1472f 100644 --- a/hfppl/llms.py +++ b/hfppl/llms.py @@ -1,11 +1,14 @@ """Utilities for working with language models.""" -import torch -import string import asyncio +import string import warnings from collections import defaultdict -from genlm_backend.llm import AsyncVirtualLM, AsyncTransformer, MockAsyncLM + +import torch +from genlm_backend.llm import AsyncTransformer +from genlm_backend.llm import AsyncVirtualLM +from genlm_backend.llm import MockAsyncLM VLLM_AVAILABLE = True try: @@ -13,8 +16,9 @@ except ImportError: VLLM_AVAILABLE = False -warnings.filterwarnings('once', category=DeprecationWarning) -warnings.filterwarnings('once', category=RuntimeWarning) +warnings.filterwarnings("once", category=DeprecationWarning) +warnings.filterwarnings("once", category=RuntimeWarning) + class Masks: def __init__(self, lm): @@ -207,55 +211,59 @@ def from_pretrained(cls, model_id, backend=None, **kwargs): Returns: CachedCausalLM: The hfppl-compatible interface to the `AsyncLM` model. """ - backend = backend or ('vllm' if (torch.cuda.is_available() and VLLM_AVAILABLE) else 'hf') + backend = backend or ( + "vllm" if (torch.cuda.is_available() and VLLM_AVAILABLE) else "hf" + ) - if backend == 'vllm': + if backend == "vllm": if not VLLM_AVAILABLE: raise ValueError( "vLLM backend requested but vLLM is not installed. " "Please install vLLM with `pip install vllm`." 
) model_cls = AsyncVirtualLM - elif backend == 'hf': + elif backend == "hf": model_cls = AsyncTransformer - elif backend == 'mock': + elif backend == "mock": model_cls = MockAsyncLM else: - raise ValueError(f"Unknown backend: {backend}. Must be one of ['vllm', 'hf', 'mock']") + raise ValueError( + f"Unknown backend: {backend}. Must be one of ['vllm', 'hf', 'mock']" + ) # Handle legacy auth_token parameter. The ability to pass in the auth_token should # be removed in a future version since it is not supported by the vllm backend. # Users should authenticate with the HuggingFace CLI. - auth_token = kwargs.pop('auth_token', None) + auth_token = kwargs.pop("auth_token", None) if auth_token: - if backend == 'vllm': + if backend == "vllm": raise ValueError( "Explicitly passing auth_token is not compatible with the vLLM AsyncLM backend. " "Authenticate using `huggingface-cli login` instead." ) - if 'hf_opts' not in kwargs: - kwargs['hf_opts'] = {} - kwargs['hf_opts']['token'] = auth_token + if "hf_opts" not in kwargs: + kwargs["hf_opts"] = {} + kwargs["hf_opts"]["token"] = auth_token warnings.warn( "Passing auth_token directly is deprecated and will be removed in a future version. " "Please authenticate using `huggingface-cli login` instead.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) - load_in_8bit = kwargs.pop('load_in_8bit', False) + load_in_8bit = kwargs.pop("load_in_8bit", False) if load_in_8bit: - if 'bitsandbytes_opts' not in kwargs: - kwargs['bitsandbytes_opts'] = {} - kwargs['bitsandbytes_opts']['load_in_8bit'] = True + if "bitsandbytes_opts" not in kwargs: + kwargs["bitsandbytes_opts"] = {} + kwargs["bitsandbytes_opts"]["load_in_8bit"] = True warnings.warn( "load_in_8bit is deprecated and will be removed in a future version. " "Please pass `bitsandbytes_opts` instead.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) model = model_cls.from_name(model_id, **kwargs) @@ -270,13 +278,15 @@ def __init__(self, model): model (genlm_backend.llm.AsyncLM): an `AsyncLM` instance. """ if isinstance(model, AsyncVirtualLM): - self.backend = 'vllm' + self.backend = "vllm" elif isinstance(model, AsyncTransformer): - self.backend = 'hf' + self.backend = "hf" elif isinstance(model, MockAsyncLM): - self.backend = 'mock' + self.backend = "mock" else: - raise ValueError(f"Unknown model type: {type(model)}. Must be one of [AsyncVirtualLM, AsyncTransformer, MockAsyncLM]") + raise ValueError( + f"Unknown model type: {type(model)}. Must be one of [AsyncVirtualLM, AsyncTransformer, MockAsyncLM]" + ) self.model = model self.tokenizer = model.tokenizer @@ -290,7 +300,7 @@ def vocab(self): warnings.warn( "Accessing .vocab directly is deprecated and will be removed in a future version. Use .str_vocab or .byte_vocab instead.", DeprecationWarning, - stacklevel=2 + stacklevel=2, ) return self.model.str_vocab @@ -331,33 +341,37 @@ def clear_cache(self): def clear_kv_cache(self): """Clear any key and value vectors from the cache.""" - if self.backend == 'hf': + if self.backend == "hf": self.model.clear_kv_cache() - elif self.backend == 'vllm': + elif self.backend == "vllm": warnings.warn( "clear_kv_cache() is only supported for the HuggingFace backend. The KV cache for the vLLM backend is handled internally by vLLM. 
No operation performed.", RuntimeWarning, - stacklevel=2 + stacklevel=2, ) - elif self.backend == 'mock': + elif self.backend == "mock": pass else: - raise RuntimeError(f"clear_kv_cache() is not implemented for backend type {type(self.model)}") + raise RuntimeError( + f"clear_kv_cache() is not implemented for backend type {type(self.model)}" + ) def reset_async_queries(self): """Clear any pending language model queries from the queue.""" - if self.backend == 'hf': + if self.backend == "hf": self.model.reset_async_queries() - elif self.backend == 'vllm': + elif self.backend == "vllm": warnings.warn( "reset_async_queries() is only supported for the HuggingFace backend. No operation performed.", RuntimeWarning, - stacklevel=2 + stacklevel=2, ) - elif self.backend == 'mock': + elif self.backend == "mock": pass else: - raise RuntimeError(f"reset_async_queries() is not implemented for backend type {type(self.model)}") + raise RuntimeError( + f"reset_async_queries() is not implemented for backend type {type(self.model)}" + ) def cache_kv(self, prompt_tokens): """Cache the key and value vectors for a prompt. @@ -365,15 +379,17 @@ def cache_kv(self, prompt_tokens): Args: prompt_tokens (list[int]): token ids for the prompt to cache. """ - if self.backend == 'hf': + if self.backend == "hf": self.model.cache_kv(prompt_tokens) - elif self.backend == 'vllm': + elif self.backend == "vllm": warnings.warn( "cache_kv() is only supported for the HuggingFace backend. The KV cache for the vLLM backend is handled internally by vLLM. No operation performed.", RuntimeWarning, - stacklevel=2 + stacklevel=2, ) - elif self.backend == 'mock': + elif self.backend == "mock": pass else: - raise RuntimeError(f"cache_kv() is not implemented for backend type {type(self.model)}") \ No newline at end of file + raise RuntimeError( + f"cache_kv() is not implemented for backend type {type(self.model)}" + ) diff --git a/tests/test_examples.py b/tests/test_examples.py index a11e085..1aaeb4c 100644 --- a/tests/test_examples.py +++ b/tests/test_examples.py @@ -1,38 +1,44 @@ -import torch -import pytest import asyncio -from hfppl.llms import CachedCausalLM + +import pytest +import torch + from examples.haiku import run_example as run_haiku from examples.hard_constraints import run_example as run_hard_constraints +from hfppl.llms import CachedCausalLM backends = [ - 'mock', - 'hf', + "mock", + "hf", pytest.param( - 'vllm', + "vllm", marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason="vLLM backend requires CUDA" - ) - ) + not torch.cuda.is_available(), reason="vLLM backend requires CUDA" + ), + ), ] + @pytest.fixture def LLM(backend): # Set lower gpu_memory_utilization in vllm so that we can fit both models on the GPU - kwargs = {'engine_opts' : {'gpu_memory_utilization' : 0.45}} if backend == 'vllm' else {} - return CachedCausalLM.from_pretrained('gpt2', backend=backend, **kwargs) + kwargs = ( + {"engine_opts": {"gpu_memory_utilization": 0.45}} if backend == "vllm" else {} + ) + return CachedCausalLM.from_pretrained("gpt2", backend=backend, **kwargs) + -@pytest.mark.parametrize('backend', backends) +@pytest.mark.parametrize("backend", backends) def test_hard_constraints(LLM, n_particles=20, max_tokens=25): particles = asyncio.run( run_hard_constraints(LLM, max_tokens=max_tokens, n_particles=n_particles) ) assert len(particles) == n_particles -@pytest.mark.parametrize('backend', backends) + +@pytest.mark.parametrize("backend", backends) def test_haiku(LLM, n_particles=20): particles = asyncio.run( - run_haiku(LLM, 
poem_title='The beauty of testing', n_particles=n_particles) + run_haiku(LLM, poem_title="The beauty of testing", n_particles=n_particles) ) assert len(particles) == n_particles diff --git a/tests/test_lmcontext.py b/tests/test_lmcontext.py index a528c63..75fc33f 100644 --- a/tests/test_lmcontext.py +++ b/tests/test_lmcontext.py @@ -1,36 +1,40 @@ -import torch -import pytest import asyncio + import numpy as np -from hfppl.llms import CachedCausalLM +import pytest +import torch + from hfppl.distributions.lmcontext import LMContext +from hfppl.llms import CachedCausalLM backends = [ - 'mock', - 'hf', + "mock", + "hf", pytest.param( - 'vllm', + "vllm", marks=pytest.mark.skipif( - not torch.cuda.is_available(), - reason="vLLM backend requires CUDA" - ) - ) + not torch.cuda.is_available(), reason="vLLM backend requires CUDA" + ), + ), ] @pytest.fixture def lm(backend): - return CachedCausalLM.from_pretrained('gpt2', backend=backend) + return CachedCausalLM.from_pretrained("gpt2", backend=backend) -@pytest.mark.parametrize('backend', backends) +@pytest.mark.parametrize("backend", backends) def test_init(lm): - prompt = 'Hello, world!' + prompt = "Hello, world!" lmcontext = LMContext(lm, prompt) assert lmcontext.tokens == lm.tokenizer.encode(prompt) logprobs = lm.next_token_logprobs_unbatched(lmcontext.tokens) np.testing.assert_allclose( - lmcontext.next_token_logprobs, logprobs, rtol=1e-5, err_msg='Sync context __init__' + lmcontext.next_token_logprobs, + logprobs, + rtol=1e-5, + err_msg="Sync context __init__", ) async def async_context(): @@ -38,7 +42,10 @@ async def async_context(): lmcontext = asyncio.run(async_context()) np.testing.assert_allclose( - lmcontext.next_token_logprobs, logprobs, rtol=1e-5, err_msg='Async context __init__' + lmcontext.next_token_logprobs, + logprobs, + rtol=1e-5, + err_msg="Async context __init__", ) async def async_context_create(): @@ -46,5 +53,8 @@ async def async_context_create(): lmcontext = asyncio.run(async_context_create()) np.testing.assert_allclose( - lmcontext.next_token_logprobs, logprobs, rtol=1e-5, err_msg='Async context create' + lmcontext.next_token_logprobs, + logprobs, + rtol=1e-5, + err_msg="Async context create", )
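
Reviewer note (not part of the patch): besides the isort/black reformatting, the hunks above pass through the backend-selection, KV-cache, and warning paths of hfppl/llms.py. The sketch below exercises those paths using only calls that already appear elsewhere in this diff; "gpt2" and the "mock" backend mirror the test fixtures, and the prompt string is illustrative.

# Minimal sketch (not part of the patch): backend selection and KV caching as
# reformatted in hfppl/llms.py, using only APIs that appear elsewhere in this diff.
from hfppl.distributions.lmcontext import LMContext
from hfppl.llms import CachedCausalLM

# backend may be "mock", "hf", or "vllm"; omitting it falls back to "vllm" when CUDA
# and vLLM are available, otherwise "hf". "mock" avoids loading real weights.
LLM = CachedCausalLM.from_pretrained("gpt2", backend="mock")

# batch_size and cache_kv() only take effect on the HuggingFace backend; on vLLM they
# emit the RuntimeWarnings shown above and do nothing, and on "mock" they are skipped.
if LLM.backend == "hf":
    LLM.batch_size = 40
LLM.cache_kv(LLM.tokenizer.encode("Hello, world!"))

# Next-token log-probabilities for a context, as asserted in tests/test_lmcontext.py.
context = LMContext(LLM, "Hello, world!")
logprobs = LLM.next_token_logprobs_unbatched(context.tokens)
print(logprobs[:5])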
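
A second sketch, also not part of the patch, mirrors what tests/test_examples.py runs end to end: the hard-constraints example with the small "gpt2" model on the HuggingFace backend. The reduced particle count is illustrative; weight and context are the particle attributes printed by examples/haiku.py above.

# End-to-end sketch (not part of the patch): run the hard-constraints example the way
# tests/test_examples.py does, with a small model and fewer particles for speed.
import asyncio

from examples.hard_constraints import run_example as run_hard_constraints
from hfppl.llms import CachedCausalLM

LLM = CachedCausalLM.from_pretrained("gpt2", backend="hf")
LLM.batch_size = 40  # only meaningful on the HuggingFace backend

particles = asyncio.run(run_hard_constraints(LLM, max_tokens=25, n_particles=5))
for i, particle in enumerate(particles):
    print(f"Particle {i} (weight {particle.weight}): {particle.context}")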