[pre-commit.ci] pre-commit autoupdate #31

Closed
wants to merge 2 commits
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/pycqa/isort
rev: 5.13.2
rev: 6.0.1
hooks:
- id: isort
args: [--profile, black, --force-single-line-imports]
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 24.4.2
rev: 25.1.0
hooks:
- id: black
language_version: python3.10
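
This hunk bumps isort from 5.13.2 to 6.0.1 and the black mirror from 24.4.2 to 25.1.0; the remaining diffs in this PR appear to be formatting-only changes (quote normalization, blank lines, line wrapping) applied by the newer hooks. A minimal sketch of reproducing the update locally with the pre-commit CLI; the subprocess wrapper is only an illustration, not part of this repo:

import subprocess

# Bump every hook's rev in .pre-commit-config.yaml to its latest tag.
subprocess.run(["pre-commit", "autoupdate"], check=True)

# Re-run all hooks on the whole repository; a nonzero exit code here just
# means that isort or black modified files.
subprocess.run(["pre-commit", "run", "--all-files"])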
42 changes: 26 additions & 16 deletions benchmark/benchmark_backend.py
@@ -4,31 +4,40 @@
Example usage: pytest benchmark/benchmark_backend.py --benchmark-only --benchmark-group-by=func -v
"""

import torch
import pytest
import asyncio
from hfppl.llms import CachedCausalLM

import pytest
import torch

from examples.haiku import run_example as run_haiku
from examples.hard_constraints import run_example as run_hard_constraints
from hfppl.llms import CachedCausalLM

backends = [
'hf',
"hf",
pytest.param(
'vllm',
"vllm",
marks=pytest.mark.skipif(
not torch.cuda.is_available(),
reason="vLLM backend requires CUDA"
)
)
not torch.cuda.is_available(), reason="vLLM backend requires CUDA"
),
),
]


@pytest.fixture
def LLM(backend):
# Set lower gpu_memory_utilization in vllm so that we can fit both models on the GPU
kwargs = {'engine_opts' : {'gpu_memory_utilization' : 0.45}, 'cache_size' : 100} if backend == 'vllm' else {}
return CachedCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B", backend=backend, **kwargs)
kwargs = (
{"engine_opts": {"gpu_memory_utilization": 0.45}, "cache_size": 100}
if backend == "vllm"
else {}
)
return CachedCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3-8B", backend=backend, **kwargs
)

@pytest.mark.parametrize('backend', backends)

@pytest.mark.parametrize("backend", backends)
def test_hard_constraints_benchmark(LLM, benchmark, n_particles=20, max_tokens=50):
def run_with_clear_cache():
LLM.clear_cache()
@@ -38,24 +47,25 @@ def run_with_clear_cache():

# warmup
run_with_clear_cache()

benchmark.pedantic(
run_with_clear_cache,
iterations=1,
rounds=3,
)

@pytest.mark.parametrize('backend', backends)

@pytest.mark.parametrize("backend", backends)
def test_haiku_benchmark(LLM, benchmark, n_particles=20):
def run_with_clear_cache():
LLM.clear_cache()
return asyncio.run(
run_haiku(LLM, poem_title='The beauty of testing', n_particles=n_particles)
run_haiku(LLM, poem_title="The beauty of testing", n_particles=n_particles)
)

# warmup
run_with_clear_cache()

benchmark.pedantic(
run_with_clear_cache,
iterations=1,
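Two pytest patterns in the file above are worth spelling out: the LLM fixture requests the backend argument supplied by @pytest.mark.parametrize on each test, so a single fixture definition covers both backends, and benchmark.pedantic from pytest-benchmark times a fixed number of rounds instead of auto-calibrating. A stripped-down sketch of the same structure, where make_llm, expensive_call, and test_workload are hypothetical stand-ins for CachedCausalLM.from_pretrained and the SMC examples:

import pytest

backends = ["hf", "vllm"]


def make_llm(backend):
    # Hypothetical stand-in for CachedCausalLM.from_pretrained(..., backend=backend).
    return {"backend": backend}


def expensive_call(llm):
    # Hypothetical stand-in for the haiku / hard-constraints workloads above.
    return sum(range(100_000))


@pytest.fixture
def LLM(backend):
    # `backend` is injected by the parametrize mark on the requesting test,
    # so the fixture is rebuilt once per backend value.
    return make_llm(backend)


@pytest.mark.parametrize("backend", backends)
def test_workload(LLM, benchmark):
    expensive_call(LLM)  # warmup call, not timed
    # Exactly 3 timed rounds of 1 iteration each, as in the tests above.
    benchmark.pedantic(expensive_call, args=(LLM,), iterations=1, rounds=3)

Run it the same way the module docstring above suggests, with pytest --benchmark-only.
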
2 changes: 1 addition & 1 deletion examples/grammar_constraint.py
@@ -126,7 +126,7 @@ async def run_generation(
verbose: bool = False,
):
LLM = CachedCausalLM.from_pretrained(args.model)
if LLM.backend == 'hf':
if LLM.backend == "hf":
LLM.batch_size = args.batch_size
model = GrammarConstrainedSMC(
lm=LLM,
22 changes: 15 additions & 7 deletions examples/haiku.py
@@ -34,6 +34,7 @@ def count_syllables(word, unknown_word_syllables=100):

return syllable_count


# Example poems for the prompt.
# Authors:
# - Amy Lowell
@@ -65,6 +66,7 @@ def count_syllables(word, unknown_word_syllables=100):
this deep in fall,
still not a butterfly."""


# LLaMPPL model
class Haiku(Model):

@@ -116,13 +118,16 @@ def string_for_serialization(self):
)
return s.replace("\n", "/")

async def run_example(LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=20, ess_threshold=0.5):

async def run_example(
LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=20, ess_threshold=0.5
):
# Construct prompt
prompt = f"""{EXAMPLE_POEMS}

5. "{poem_title}"
"""

# Cache the key value vectors for the prompt
LLM.cache_kv(LLM.tokenizer.encode(prompt))

@@ -136,6 +141,7 @@ async def run_example(LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=2

return particles


def main():
# Load the language model.
# Mistral is an open model; to use a model with restricted access, like LLaMA 3,
@@ -144,22 +150,24 @@ def main():
# LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# Set batch size if using HuggingFace backend
if LLM.backend == 'hf':
if LLM.backend == "hf":
LLM.batch_size = 40

# Get poem title from user
poem_title = input("Enter a title for your Haiku: ")

syllables_per_line = [5, 7, 5] # [5, 3, 5] for a Lune
syllables_per_line = [5, 7, 5] # [5, 3, 5] for a Lune

# Run the example
particles = asyncio.run(run_example(LLM, poem_title, syllable_pattern=syllables_per_line))
particles = asyncio.run(
run_example(LLM, poem_title, syllable_pattern=syllables_per_line)
)

print("--------")
for i, particle in enumerate(particles):
print(f"\nPoem {i} (weight {particle.weight}):")
print(f"{particle.context}")


if __name__ == "__main__":
main()
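
The reformatted run_example keeps its signature (LLM, poem_title, syllable_pattern=[5, 7, 5], n_particles=20, ess_threshold=0.5), so it can be driven outside of main() as well. A minimal sketch mirroring main(); the Meta-Llama-3-8B model name is copied from benchmark_backend.py and access to that model is an assumption here:

import asyncio

from examples.haiku import run_example
from hfppl.llms import CachedCausalLM

LLM = CachedCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B")
if LLM.backend == "hf":
    LLM.batch_size = 40  # batch size only applies to the HuggingFace backend

# [5, 3, 5] yields a Lune rather than a Haiku, as the comment in main() notes.
particles = asyncio.run(
    run_example(LLM, poem_title="The beauty of testing", syllable_pattern=[5, 3, 5])
)

for particle in particles:
    print(particle.context)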

10 changes: 7 additions & 3 deletions examples/hard_constraints.py
@@ -7,6 +7,7 @@
from hfppl import Model
from hfppl import smc_standard


def make_masks(LLM):
return {
i: set(
@@ -63,7 +64,7 @@ def string_for_serialization(self):
return f"{self.context}"

def immutable_properties(self):
return ['masks']
return ["masks"]


# From Politico.com
@@ -75,6 +76,7 @@ def immutable_properties(self):

3."""


async def run_example(LLM, max_tokens=50, n_particles=20, ess_threshold=0.5):
# Cache the key value vectors for the prompt.
LLM.cache_kv(LLM.tokenizer.encode(prompt))
@@ -91,6 +93,7 @@ async def run_example(LLM, max_tokens=50, n_particles=20, ess_threshold=0.5):

return particles


def main():
# Load the language model.
# Mistral and Vicuna are open models; to use a model with restricted access, like LLaMA 3,
@@ -100,11 +103,12 @@ def main():
# LLM = CachedCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")

# Set batch size if provided. This operation is only valid for the HuggingFace backend.
if LLM.backend == 'hf':
if LLM.backend == "hf":
LLM.batch_size = 40

# Run the example.
asyncio.run(run_example(LLM))


if __name__ == "__main__":
main()
3 changes: 1 addition & 2 deletions hfppl/__init__.py
@@ -1,5 +1,4 @@
"""Probabilistic programming with HuggingFace Transformer models.
"""
"""Probabilistic programming with HuggingFace Transformer models."""

from .chunks import *
from .distributions import *
8 changes: 6 additions & 2 deletions hfppl/chunks.py
@@ -7,7 +7,9 @@
@submodel
async def sample_word(self, context, max_tokens=5, allow_punctuation=True):
"""Sample a word from the `LMContext` object `context`."""
last_token = context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else ""
last_token = (
context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else ""
)
last_character = last_token[-1] if len(last_token) > 0 else ""
needs_space = last_character not in string.whitespace and last_character not in [
"-",
@@ -81,7 +83,9 @@ async def sample_word_2(
if max_chars is not None:
assert max_chars > 1

last_token = context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else ""
last_token = (
context.lm.str_vocab[context.tokens[-1]] if len(context.tokens) > 0 else ""
)
last_character = last_token[-1] if len(last_token) > 0 else ""
needs_space = last_character not in string.whitespace and last_character not in [
"-",
5 changes: 4 additions & 1 deletion hfppl/distributions/lmcontext.py
@@ -1,5 +1,6 @@
import copy
import warnings

import numpy as np

from ..llms import Token
@@ -82,7 +83,9 @@ async def log_prob(self, v):
# If there are no good tokens, the log probability of v under the mask is -inf
# However, since this method updates the model_mask as a side-effect,
# this will put the context in an invalid state, so we instead raise an exception.
raise NullMask("Unable to compute log probability of mask that rules out all tokens.")
raise NullMask(
"Unable to compute log probability of mask that rules out all tokens."
)
else:
logprob_good = logsumexp(self.ctx.next_token_logprobs[list(good_tokens)])
