Commit de59d58

feat: replace outlines with xgrammar in pytorch engine
1 parent c45deea commit de59d58

13 files changed: +83 -179 lines

.github/workflows/unit-test.yml

Lines changed: 0 additions & 1 deletion
@@ -64,7 +64,6 @@ jobs:
           python3 -m pip install /root/packages/cu118/flash_attn-*.whl
           python3 -m pip install -r requirements_cuda.txt -r requirements/test.txt
           python3 -m pip install -e .
-          python3 -m pip install -U 'numpy<2.0'
       - name: Check env
         run: |
           python3 -m pip list

docker/prepare_wheel.sh

Lines changed: 0 additions & 1 deletion
@@ -17,7 +17,6 @@ if [[ ${PYTHON_VERSION} = "3.13" ]]; then
 
     pip install setuptools_rust
     pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/google/[email protected]#subdirectory=python"
-    pip wheel -v --no-build-isolation --no-deps -w /wheels --use-deprecated=legacy-resolver outlines_core==0.1.26
 fi
 
 if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
lmdeploy/pytorch/engine/guided_process.py

Lines changed: 52 additions & 126 deletions
@@ -1,161 +1,87 @@
-# Copyright 2024- the Outlines developers
-# This file is adapted from
-# https://github.com/outlines-dev/outlines/blob/main/outlines/serve/vllm.py
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-
-#     http://www.apache.org/licenses/LICENSE-2.0
-
+# Copyright (c) OpenMMLab. All rights reserved.
 import copy
-import math
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from collections import defaultdict
+import json
+import logging
 from functools import lru_cache
-from typing import DefaultDict, Dict, List, Union
+from typing import Optional
 
 import torch
-from outlines.fsm.guide import CFGGuide, Generate, RegexGuide, Write
-from outlines.fsm.json_schema import build_regex_from_schema
-from pydantic import BaseModel
+import xgrammar as xgr
 from transformers import PreTrainedTokenizerBase
 
+logger = logging.getLogger('guided_process')
 
-class BaseLogitsProcessor:
-
-    def init_state(self):
-        """Initialize the FSM states."""
-        self.fsm_state: DefaultDict[int, int] = defaultdict(int)
-
-    def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor:
-        """Use the FSM to bias the logits before sampling the next token."""
-
-        seq_id = hash(tuple(input_ids))
-
-        if len(input_ids) == 0:
-            self.init_state()
-        else:
-            last_token = input_ids[-1]
-            last_seq_id = hash(tuple(input_ids[:-1]))
-            self.fsm_state[seq_id] = self.fsm.get_next_state(state=self.fsm_state[last_seq_id], token_id=last_token)
-
-        instruction = self.fsm.get_next_instruction(self.fsm_state[seq_id])
 
-        if type(instruction) == Generate:
-            allowed_tokens = instruction.tokens
-        elif type(instruction) == Write:
-            # TODO: support fast forward tokens
-            allowed_tokens = [instruction.tokens[0]]
-        else:
-            raise TypeError(f'Unsupported instruction type {type(instruction)}')
+class BaseLogitsProcessor:
+    """Base logits processor that uses an xgrammar matcher for guided decoding."""
 
-        mask = torch.full((scores.shape[-1], ), -math.inf, device=scores.device)
-        mask[allowed_tokens] = 0
-        scores.add_(mask)
+    def __init__(self, compiled_grammar: xgr.CompiledGrammar, tokenizer_info: xgr.TokenizerInfo):
+        self.matcher = xgr.GrammarMatcher(compiled_grammar, terminate_without_stop_token=True)
+        self.token_bitmask = xgr.allocate_token_bitmask(1, tokenizer_info.vocab_size)
 
+    def process(self, scores: torch.Tensor) -> torch.Tensor:
+        """Apply grammar constraints to the logits before sampling the next token."""
+        self.matcher.fill_next_token_bitmask(self.token_bitmask)
+        xgr.apply_token_bitmask_inplace(scores, self.token_bitmask.to(scores.device))
         return scores
 
-    def adapt_tokenizer(self, tokenizer):
-        """Adapt tokenizer to use to compile the FSM.
+    def accept(self, token_id: int) -> bool:
+        """Update the matcher state after a token is generated."""
+        return self.matcher.accept_token(token_id)
 
-        The API of Outlines tokenizers is slightly different to that of `transformers`. In addition we need to handle
-        the missing spaces to Llama's tokenizer to be able to compile FSMs for this model.
-        """
-        from outlines.integrations.utils import adapt_tokenizer
-        tokenizer = adapt_tokenizer(tokenizer)
-        # vocab size greater than logits shape because of '[UNUSED_TOKEN_...]'
-        if hasattr(tokenizer, '_tokenizer'):
-            tokenizer.vocabulary = tokenizer._tokenizer.get_vocab(with_added_tokens=False)
-        return tokenizer
+    def reset(self):
+        """Reset the matcher state for the next generation."""
+        self.matcher.reset()
 
 
 class RegexLogitsProcessor(BaseLogitsProcessor):
+    """Regex-guided logits processor using xgrammar."""
 
-    def __init__(self, regex_string: str, tokenizer):
-        """Compile the FSM that drives the regex-structured generation.
-
-        Args:
-            regex_string: A string that represents a regular expression
-            tokenizer: The model's tokenizer
-        """
-        tokenizer = self.adapt_tokenizer(copy.deepcopy(tokenizer))
-        fsm = RegexGuide(regex_string, tokenizer)
-        self.fsm = fsm
-
-
-class JSONLogitsProcessor(RegexLogitsProcessor):
-
-    def __init__(self, schema: Union[str, Dict, BaseModel], tokenizer):
-        """Compile the FSM that drives the JSON-guided generation.
-
-        Args:
-            schema: A str schema that encodes the structure we want the model
-                to generate
-            tokenizer: The model's tokenizer
-        """
-        regex_string = build_regex_from_schema(schema)
-        super().__init__(regex_string, tokenizer)
-
+    def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase, vocab_size_padded: Optional[int] = None):
+        tokenizer = copy.deepcopy(tokenizer)
+        if vocab_size_padded is None:
+            vocab_size_padded = tokenizer.vocab_size
 
-class CFGLogitsProcessor(BaseLogitsProcessor):
+        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size_padded)
 
-    def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
-        """Compile the FSM that drives the context free grammar generation.
+        compiler = xgr.GrammarCompiler(tokenizer_info)
+        compiled = compiler.compile_regex_grammar(regex_string)
 
-        Parameters
-        ----------
-        cfg
-            A string that represents a context-free grammar
-        tokenizer
-            The model's tokenizer
-        """
-        tokenizer = self.adapt_tokenizer(tokenizer)
-        fsm = CFGGuide(cfg, tokenizer)
-        self.fsm = fsm
+        super().__init__(compiled, tokenizer_info)
 
 
-# copied from https://github.com/vllm-project/vllm/blob/a7f65c2be93f491771aca31106f790bf381c0bad/vllm/model_executor/guided_decoding/outlines_decoding.py#L31  # noqa
-JSON_GRAMMAR = r"""
-?start: object | array
+class JSONLogitsProcessor(BaseLogitsProcessor):
+    """JSON-schema guided logits processor using xgrammar."""
 
-?value: object
-      | array
-      | UNESCAPED_STRING
-      | SIGNED_NUMBER      -> number
-      | "true"             -> true
-      | "false"            -> false
-      | "null"             -> null
+    def __init__(self, schema: str, tokenizer: PreTrainedTokenizerBase, vocab_size_padded: Optional[int] = None):
+        tokenizer = copy.deepcopy(tokenizer)
+        if vocab_size_padded is None:
+            vocab_size_padded = tokenizer.vocab_size
+        tokenizer_info = xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size_padded)
 
-array  : "[" [value ("," value)*] "]"
-object : "{" [pair ("," pair)*] "}"
-pair   : UNESCAPED_STRING ":" value
+        compiler = xgr.GrammarCompiler(tokenizer_info)
+        if isinstance(schema, str):
+            schema = json.loads(schema)
 
-%import common.UNESCAPED_STRING
-%import common.SIGNED_NUMBER
-%import common.WS
+        assert isinstance(schema, dict)
+        compiled = compiler.compile_json_schema(schema)
 
-%ignore WS
-"""
+        super().__init__(compiled, tokenizer_info)
 
 
 @lru_cache(maxsize=32)
-def _get_guided_logits_processor(guide: str, tokenizer: PreTrainedTokenizerBase, type: str):
+def _get_guided_logits_processor(guide: str,
+                                 tokenizer: PreTrainedTokenizerBase,
+                                 type: str,
+                                 vocab_size_padded: Optional[int] = None):
     try:
-        if type == 'json_object':
-            return CFGLogitsProcessor(guide, tokenizer)
-        elif type == 'json_schema':
-            return JSONLogitsProcessor(guide, tokenizer)
+        if type == 'json_schema':
+            return JSONLogitsProcessor(guide, tokenizer, vocab_size_padded)
         elif type == 'regex_schema':
-            return RegexLogitsProcessor(guide, tokenizer)
+            return RegexLogitsProcessor(guide, tokenizer, vocab_size_padded)
         else:
             return None
     except Exception as e:
-        from lmdeploy.utils import get_logger
-        logger = get_logger('lmdeploy')
         logger.error(e)
-        return None
+        raise
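
For reference, a minimal usage sketch of the new xgrammar-backed processor (not part of the commit). The module path is inferred from the `from .guided_process import ...` statements in lmdeploy/pytorch/engine/logits_process.py, and the tokenizer name is a placeholder for any HuggingFace tokenizer:

    import torch
    from transformers import AutoTokenizer

    from lmdeploy.pytorch.engine.guided_process import JSONLogitsProcessor

    tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-0.5B-Instruct')  # placeholder model
    schema = '{"type": "object", "properties": {"name": {"type": "string"}}}'
    # Pass the padded vocab size explicitly, as the engine does with sampling_vocab_size.
    processor = JSONLogitsProcessor(schema, tokenizer, vocab_size_padded=len(tokenizer))

    # One decode step: mask disallowed tokens, pick one, advance the matcher.
    logits = torch.randn(1, len(tokenizer))  # stand-in for model output
    logits = processor.process(logits)
    token_id = int(logits[0].argmax())  # greedy pick from the allowed set
    processor.accept(token_id)  # the matcher consumes the token
    processor.reset()  # ready for the next request

Unlike the removed outlines path, there is no per-sequence FSM state dictionary: the matcher itself carries the decoding state, which is why the engine changes below drop guided_input_ids entirely and call accept() right after sampling instead.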

lmdeploy/pytorch/engine/logits_process.py

Lines changed: 24 additions & 22 deletions
@@ -78,35 +78,30 @@ def _multinomial_sampling(scores: torch.Tensor,
     return multinomial_sampling(scores, seeds, offsets, indices)
 
 
-def _guided_sampling(response_formats: Tuple[Dict], scores: torch.Tensor, guided_input_ids: Optional[torch.Tensor],
-                     tokenizer: object):
-    if guided_input_ids is None:
-        return scores
-    for i in range(len(response_formats)):
-        _format = response_formats[i]
+def _get_guided_processors(response_formats: Tuple[Dict], tokenizer: object, vocab_size_padded: int):
+    processors = {}
+    for i, _format in enumerate(response_formats):
         if isinstance(_format, Dict) and _format.get('type', 'text') != 'text':
             if _format['type'] == 'json_schema':
                 schema = _format['json_schema']
                 if isinstance(schema, Dict):
                     for key in ['json_schema', 'schema']:
                         if key in schema:
                             schema = json.dumps(schema[key], ensure_ascii=False)
-                elif schema is None:
-                    from .guided_process import JSON_GRAMMAR
-                    schema = JSON_GRAMMAR
-                elif isinstance(schema, str):
+
+                if not isinstance(schema, str):
                     raise ValueError(f'Cannot parse schema {schema}. The schema must be '
                                      'either a dictionary or a string that contains the'
                                      ' JSON Schema specification')
             elif _format['type'] == 'regex_schema':
                 schema = _format.get('regex_schema', '')
             else:
                 raise ValueError(f"unsupported format type: {_format['type']}")
+
             from .guided_process import _get_guided_logits_processor
-            processor = _get_guided_logits_processor(schema, tokenizer, _format['type'])
-            if processor:
-                scores[i] = processor(guided_input_ids[i].tolist(), scores[i])
-    return scores
+            processors[i] = _get_guided_logits_processor(schema, tokenizer, _format['type'], vocab_size_padded)
+
+    return processors

@@ -131,7 +126,6 @@ class SamplingInputs:
     logits_processors: List[List[LogitsProcessor]] = None
     max_num_logprobs: Optional[int] = None
     all_ids: Optional[torch.Tensor] = None
-    guided_input_ids: Optional[torch.Tensor] = None
     num_ignore_eos: torch.Tensor = None
     batch_size: int = 0

@@ -169,6 +163,8 @@ def __init__(self,
         self.tokenizer = tokenizer
         self.sampling_vocab_size = sampling_vocab_size
         self.logprobs_mode = logprobs_mode
+        self.guided_processors = _get_guided_processors(sampling_inputs.response_formats, tokenizer,
+                                                        sampling_vocab_size)
 
     async def _wait_stream_once(self):
         """Wait stream once."""

@@ -205,9 +201,12 @@ async def __call__(self, scores: torch.FloatTensor) -> torch.FloatTensor:
         sampling_inputs = self.sampling_inputs
         all_ids = sampling_inputs.all_ids
-        guided_input_ids = sampling_inputs.guided_input_ids
-
         custom_logits_processors = self.sampling_inputs.logits_processors
+        if self.guided_processors:
+            await self._wait_stream_once()
+            for i, processor in self.guided_processors.items():
+                scores[i] = processor.process(scores[i])
+
         if any(custom_logits_processors):
             await self._wait_stream_once()
             scores = _apply_custom_logits_processors(custom_logits_processors, all_ids, scores)

@@ -232,9 +231,6 @@ async def __call__(self, scores: torch.FloatTensor) -> torch.FloatTensor:
             stop_mask = torch.where(ignore_eos[:, None], stop_mask, False)
             scores = _process_bad_words_(scores, stop_words, stop_mask)
 
-        if guided_input_ids is not None:
-            await self._wait_stream_once()
-            scores = _guided_sampling(sampling_inputs.response_formats, scores, guided_input_ids, self.tokenizer)
         return scores, logprobs

@@ -272,15 +268,21 @@ def __random_sampling(scores: torch.Tensor, indices: torch.LongTensor):
         logits = logits[..., :self.sampling_vocab_size]
 
         if sampling_inputs.max_top_k == 1:
-            return logits.argmax(-1)
+            result = logits.argmax(-1)
         else:
             # sort logits is too slow. and we only need topk logits
             max_topk = sampling_inputs.max_top_k
             if max_topk <= 0:
                 scores, indices = logits.sort(1, descending=True)
             else:
                 scores, indices = logits.topk(max_topk, dim=1)
-            return __random_sampling(scores, indices)
+            result = __random_sampling(scores, indices)
+
+        if self.guided_processors:
+            for i, processor in self.guided_processors.items():
+                processor.accept(result[i])
+
+        return result
 
     @torch.inference_mode()
     def compute_logprobs(self, raw_logprobs: torch.Tensor, token_ids: torch.LongTensor):
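
Net effect on the sampling path: the processors are built once in __init__, process() masks the logits before sampling, and accept() advances each matcher after a token is chosen. A condensed sketch of that per-step flow (a plain guided_processors dict and greedy sampling stand in for the engine's real state and sampler):

    import torch

    def guided_step(scores: torch.Tensor, guided_processors: dict) -> torch.Tensor:
        # Mask the logits of each guided sequence before sampling.
        for i, processor in guided_processors.items():
            scores[i] = processor.process(scores[i])
        # Sample; the engine uses top-k/multinomial, greedy keeps the sketch short.
        result = scores.argmax(-1)
        # Feed the sampled tokens back so each matcher tracks generation state.
        for i, processor in guided_processors.items():
            processor.accept(int(result[i]))
        return result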

lmdeploy/pytorch/strategies/ar/model_agent.py

Lines changed: 0 additions & 4 deletions
@@ -72,10 +72,6 @@ def _step_sampling_inputs(self, sampling_inputs: SamplingInputs, next_token_ids:
         if all_ids is not None:
             sampling_inputs.all_ids = torch.cat([all_ids, next_token_ids[:, None]], 1)
 
-        guided_input_ids = sampling_inputs.guided_input_ids
-        if guided_input_ids is not None:
-            sampling_inputs.guided_input_ids = torch.cat([guided_input_ids, next_token_ids[:, None]], 1)
-
         return sampling_inputs
 
     def make_stopping_criteria(self, seqs: SeqList) -> ARStoppingCriteria:

lmdeploy/pytorch/strategies/ar/sampling.py

Lines changed: 0 additions & 17 deletions
@@ -27,22 +27,6 @@ def _gather_all_ids(pad_id: int, seqs: SeqList, sampling_inputs: SamplingInputs)
     return output
 
 
-def _gather_guided_input_ids(pad_id: int, seqs: SeqList, sampling_inputs: 'SamplingInputs'):
-    """Gather input ids for guided decode."""
-    if not any(sampling_inputs.response_formats or ()):
-        return None
-    batch = len(seqs)
-    max_len = max(seq.num_new_tokens for seq in seqs)
-    output = torch.full((batch, max_len), pad_id, dtype=torch.int64)
-    for idx, seq in enumerate(seqs):
-        h_len = seq.num_new_tokens
-        if h_len == 0:
-            continue
-        h_ids = torch.from_numpy(seq.generated_ids)
-        output[idx, -h_len:] = h_ids
-    return output
-
-
 def _get_num_ignore_eos(seqs: SeqList):
     """Get num ignore eos."""
     ret = [seq.sampling_param.min_new_tokens - seq.num_new_tokens for seq in seqs]

@@ -186,6 +170,5 @@ def __get_bad_words(bad_words):
 
         pad_token_id = self.pad_token_id
         sampling_input.all_ids = _gather_all_ids(pad_token_id, seqs, sampling_input)
-        sampling_input.guided_input_ids = _gather_guided_input_ids(pad_token_id, seqs, sampling_input)
         sampling_input.num_ignore_eos = _get_num_ignore_eos(seqs)
         return sampling_input

lmdeploy/pytorch/strategies/dllm/sampling.py

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@ def make_sampling_inputs(self, seqs: SeqList) -> SamplingInputs:
             'random_seeds',
             'random_offsets',
             'all_ids',
-            'guided_input_ids',
             'num_ignore_eos',
         ]
         for name in update_attr_names:
