1 | | -# Copyright 2024- the Outlines developers |
2 | | -# This file is adapted from |
3 | | -# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py |
4 | | -# |
5 | | -# Licensed under the Apache License, Version 2.0 (the "License"); |
6 | | -# you may not use this file except in compliance with the License. |
7 | | -# You may obtain a copy of the License at |
8 | | - |
9 | | -# http://www.apache.org/licenses/LICENSE-2.0 |
10 | | - |
| 1 | +# Copyright (c) OpenMMLab. All rights reserved. |
11 | 2 | import copy |
12 | | -import math |
13 | | -# Unless required by applicable law or agreed to in writing, software |
14 | | -# distributed under the License is distributed on an "AS IS" BASIS, |
15 | | -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
16 | | -# See the License for the specific language governing permissions and |
17 | | -# limitations under the License. |
18 | | -from collections import defaultdict |
| 3 | +import json |
| 4 | +import logging |
19 | 5 | from functools import lru_cache |
20 | | -from typing import DefaultDict, Dict, List, Union |
| 6 | +from typing import Optional |
21 | 7 |
|
22 | 8 | import torch |
23 | | -from outlines.fsm.guide import CFGGuide, Generate, RegexGuide, Write |
24 | | -from outlines.fsm.json_schema import build_regex_from_schema |
25 | | -from pydantic import BaseModel |
| 9 | +import xgrammar as xgr |
26 | 10 | from transformers import PreTrainedTokenizerBase |
27 | 11 |
|
| 12 | +logger = logging.getLogger('guided_process') |
28 | 13 |
|
29 | | -class BaseLogitsProcessor: |
30 | | - |
31 | | - def init_state(self): |
32 | | - """Initialize the FSM states.""" |
33 | | - self.fsm_state: DefaultDict[int, int] = defaultdict(int) |
34 | | - |
35 | | - def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor: |
36 | | - """Use the FSM to bias the logits before sampling the next token.""" |
37 | | - |
38 | | - seq_id = hash(tuple(input_ids)) |
39 | | - |
40 | | - if len(input_ids) == 0: |
41 | | - self.init_state() |
42 | | - else: |
43 | | - last_token = input_ids[-1] |
44 | | - last_seq_id = hash(tuple(input_ids[:-1])) |
45 | | - self.fsm_state[seq_id] = self.fsm.get_next_state(state=self.fsm_state[last_seq_id], token_id=last_token) |
46 | | - |
47 | | - instruction = self.fsm.get_next_instruction(self.fsm_state[seq_id]) |
48 | 14 |
|
49 | | - if type(instruction) == Generate: |
50 | | - allowed_tokens = instruction.tokens |
51 | | - elif type(instruction) == Write: |
52 | | - # TODO: support fast forward tokens |
53 | | - allowed_tokens = [instruction.tokens[0]] |
54 | | - else: |
55 | | - raise TypeError(f'Unsupported instruction type {type(instruction)}') |
| 15 | +class BaseLogitsProcessor: |
| 16 | +    """Base logits processor that uses an xgrammar matcher for guided decoding."""
56 | 17 |
|
57 | | - mask = torch.full((scores.shape[-1], ), -math.inf, device=scores.device) |
58 | | - mask[allowed_tokens] = 0 |
59 | | - scores.add_(mask) |
| 18 | + def __init__(self, compiled_grammar: xgr.CompiledGrammar, tokenizer_info: xgr.TokenizerInfo): |
| 19 | + self.matcher = xgr.GrammarMatcher(compiled_grammar, terminate_without_stop_token=True) |
| 20 | + self.token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size) |
60 | 21 |
|
| 22 | + def process(self, scores: torch.Tensor) -> torch.Tensor: |
| 23 | + """Apply grammar constraints to logits before sampling the next |
| 24 | + token.""" |
| 25 | + self.matcher.fill_next_token_bitmask(self.token_bitmask) |
| 26 | + xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device)) |
61 | 27 | return scores |
62 | 28 |
|
63 | | - def adapt_tokenizer(self, tokenizer): |
64 | | - """Adapt tokenizer to use to compile the FSM. |
| 29 | + def accept(self, token_id: int) -> bool: |
| 30 | + """Update matcher state after a token is generated.""" |
| 31 | + return self.matcher.accept_token(token_id) |
65 | 32 |
|
66 | | - The API of Outlines tokenizers is slightly different to that of `transformers`. In addition we need to handle |
67 | | - the missing spaces to Llama's tokenizer to be able to compile FSMs for this model. |
68 | | - """ |
69 | | - from outlines.integrations.utils import adapt_tokenizer |
70 | | - tokenizer = adapt_tokenizer(tokenizer) |
71 | | - # vocab size greater than logits shape because of '[UNUSED_TOKEN_...]' |
72 | | - if hasattr(tokenizer, '_tokenizer'): |
73 | | - tokenizer.vocabulary = tokenizer._tokenizer.get_vocab(with_added_tokens=False) |
74 | | - return tokenizer |
| 33 | + def reset(self): |
| 34 | + """Reset matcher state for next generation.""" |
| 35 | + self.matcher.reset() |
75 | 36 |
|
76 | 37 |
|
77 | 38 | class RegexLogitsProcessor(BaseLogitsProcessor): |
| 39 | + """Regex-guided logits processor using xgrammar.""" |
78 | 40 |
|
79 | | - def __init__(self, regex_string: str, tokenizer): |
80 | | - """Compile the FSM that drives the regex-structured generation. |
81 | | -
|
82 | | - Args: |
83 | | - regex_string: A string that represents a regular expression |
84 | | - tokenizer: The model's tokenizer |
85 | | - """ |
86 | | - tokenizer = self.adapt_tokenizer(copy.deepcopy(tokenizer)) |
87 | | - fsm = RegexGuide(regex_string, tokenizer) |
88 | | - self.fsm = fsm |
89 | | - |
90 | | - |
91 | | -class JSONLogitsProcessor(RegexLogitsProcessor): |
92 | | - |
93 | | - def __init__(self, schema: Union[str, Dict, BaseModel], tokenizer): |
94 | | - """Compile the FSM that drives the JSON-guided generation. |
95 | | -
|
96 | | - Args: |
97 | | - schema: A str schema that encodes the structure we want the model |
98 | | - to generate |
99 | | - tokenizer: The model's tokenizer |
100 | | - """ |
101 | | - regex_string = build_regex_from_schema(schema) |
102 | | - super().__init__(regex_string, tokenizer) |
103 | | - |
| 41 | + def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase, vocab_size_padded: Optional[int] = None): |
| 42 | + tokenizer = copy.deepcopy(tokenizer) |
| 43 | + if vocab_size_padded is None: |
| 44 | + vocab_size_padded = tokenizer.vocab_size |
104 | 45 |
|
105 | | -class CFGLogitsProcessor(BaseLogitsProcessor): |
| 46 | + tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size_padded) |
106 | 47 |
|
107 | | - def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase): |
108 | | - """Compile the FSM that drives the context free grammar generation. |
| 48 | + compiler = xgr.GrammarCompiler(tokenizer_info) |
| 49 | + compiled = compiler.compile_regex_grammar(regex_string) |
109 | 50 |
|
110 | | - Parameters |
111 | | - ---------- |
112 | | - cfg |
113 | | - A string that represents a context-free grammar |
114 | | - tokenizer |
115 | | - The model's tokenizer |
116 | | - """ |
117 | | - tokenizer = self.adapt_tokenizer(tokenizer) |
118 | | - fsm = CFGGuide(cfg, tokenizer) |
119 | | - self.fsm = fsm |
| 51 | + super().__init__(compiled, tokenizer_info) |
120 | 52 |
|
121 | 53 |
|
122 | | -# copied from https://github.com/vllm-project/vllm/blob/a7f65c2be93f491771aca31106f790bf381c0bad/vllm/model_executor/guided_decoding/outlines_decoding.py#L31 # noqa |
123 | | -JSON_GRAMMAR = r""" |
124 | | -?start: object | array |
| 54 | +class JSONLogitsProcessor(BaseLogitsProcessor): |
| 55 | + """JSON-schema guided logits processor using xgrammar.""" |
125 | 56 |
|
126 | | -?value: object |
127 | | -| array |
128 | | -| UNESCAPED_STRING |
129 | | -| SIGNED_NUMBER -> number |
130 | | -| "true" -> true |
131 | | -| "false" -> false |
132 | | -| "null" -> null |
| 57 | + def __init__(self, schema: str, tokenizer: PreTrainedTokenizerBase, vocab_size_padded: Optional[int] = None): |
| 58 | +        tokenizer = copy.deepcopy(tokenizer)
| 59 | +        if vocab_size_padded is None:
| 60 | +            vocab_size_padded = tokenizer.vocab_size
| 61 | +        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size_padded)
133 | 62 |
|
134 | | -array : "[" [value ("," value)*] "]" |
135 | | -object : "{" [pair ("," pair)*] "}" |
136 | | -pair : UNESCAPED_STRING ":" value |
| 63 | + compiler = xgr.GrammarCompiler(tokenizer_info) |
| 64 | + if isinstance(schema, str): |
| 65 | + schema = json.loads(schema) |
137 | 66 |
|
138 | | -%import common.UNESCAPED_STRING |
139 | | -%import common.SIGNED_NUMBER |
140 | | -%import common.WS |
| 67 | + assert isinstance(schema, dict) |
| 68 | + compiled = compiler.compile_json_schema(schema) |
141 | 69 |
|
142 | | -%ignore WS |
143 | | -""" |
| 70 | + super().__init__(compiled, tokenizer_info) |
144 | 71 |
|
145 | 72 |
|
146 | 73 | @lru_cache(maxsize=32) |
147 | | -def _get_guided_logits_processor(guide: str, tokenizer: PreTrainedTokenizerBase, type: str): |
| 74 | +def _get_guided_logits_processor(guide: str, |
| 75 | + tokenizer: PreTrainedTokenizerBase, |
| 76 | + type: str, |
| 77 | + vocab_size_padded: Optional[int] = None): |
148 | 78 | try: |
149 | | - if type == 'json_object': |
150 | | - return CFGLogitsProcessor(guide, tokenizer) |
151 | | - elif type == 'json_schema': |
152 | | - return JSONLogitsProcessor(guide, tokenizer) |
| 79 | + if type == 'json_schema': |
| 80 | + return JSONLogitsProcessor(guide, tokenizer, vocab_size_padded) |
153 | 81 | elif type == 'regex_schema': |
154 | | - return RegexLogitsProcessor(guide, tokenizer) |
| 82 | + return RegexLogitsProcessor(guide, tokenizer, vocab_size_padded) |
155 | 83 | else: |
156 | 84 | return None |
157 | 85 | except Exception as e: |
158 | | - from lmdeploy.utils import get_logger |
159 | | - logger = get_logger('lmdeploy') |
160 | 86 | logger.error(e) |
161 | | - return None |
| 87 | + raise |
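
A minimal usage sketch of the new processors (not part of the diff): the model id and the random logits are placeholders standing in for a real forward pass, and is_terminated() is assumed from xgrammar's GrammarMatcher API.

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder model id
proc = _get_guided_logits_processor('{"type": "object"}', tokenizer, 'json_schema')

generated = []
for _ in range(64):
    logits = torch.randn(1, tokenizer.vocab_size)  # stand-in for real model logits
    logits = proc.process(logits)                  # mask grammar-forbidden tokens to -inf
    next_id = int(torch.argmax(logits, dim=-1))    # greedy pick for illustration
    if not proc.accept(next_id):                   # advance the matcher; False means the token was rejected
        break
    generated.append(next_id)
    if proc.matcher.is_terminated():               # grammar fully matched, stop decoding
        break
print(tokenizer.decode(generated))
proc.reset()                                       # processors are shared via lru_cache, so reset before reuse

Because _get_guided_logits_processor is wrapped in lru_cache, the same processor instance is returned for identical (guide, tokenizer, type) arguments; that is why the base class exposes reset().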