@@ -31,9 +31,11 @@ using namespace std;
 #define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U)
 #endif
 
+#if __CUDA_ARCH__ >= 800
 #ifndef CUDART_INF_BF16
 #define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
 #endif
+#endif
 
 constexpr int32_t BITS_PER_BLOCK = 32;
 constexpr int32_t THREADS_PER_THREAD_BLOCK = 256;
@@ -50,11 +52,13 @@ __device__ __half NegativeInfinity<__half>()
     return -CUDART_INF_FP16;
 }
 
+#if __CUDA_ARCH__ >= 800
 template <>
 __device__ __nv_bfloat16 NegativeInfinity<__nv_bfloat16>()
 {
     return -CUDART_INF_BF16;
 }
+#endif
 
 template <typename T, typename PackedT>
 __device__ PackedT PackedNegativeInfinity()
@@ -217,13 +221,15 @@ void ApplyTokenBitmaskInplace(Tensor logits, Tensor bitmask, std::optional<Tenso
             logits.data<half_t>(), bitmask.data<int32_t>(), indices_ptr, vocab_size, 0, 0, num_rows);
         break;
     }
+#if __CUDA_ARCH__ >= 800
     case kBfloat16: {
         ApplyTokenBitmaskInplaceDispatchToPackedT(
             logits.data<bfloat16_t>(), bitmask.data<int32_t>(), indices_ptr, vocab_size, 0, 0, num_rows);
         break;
     }
+#endif
     default:
-        TM_CHECK(false) << "logits dtype must be float, half or bfloat16.";
+        TM_CHECK(false) << "logits dtype must be float, float16 or bfloat16.";
         break;
     }
 }
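
The recurring guard in these hunks is the standard way to keep __nv_bfloat16 code out of builds for pre-Ampere GPUs: __CUDA_ARCH__ is defined only during nvcc's device compilation passes (e.g. 800 when targeting sm_80), so everything inside #if __CUDA_ARCH__ >= 800 is compiled out for older architectures, where the bfloat16 intrinsics are unavailable. Below is a minimal self-contained sketch of the same pattern; the NegativeInfinity template mirrors the diff, while the includes and the explicit -inf bit patterns are illustrative scaffolding, not the commit's code.

#include <cuda_fp16.h>
#include <cuda_bf16.h>

template <typename T>
__device__ T NegativeInfinity();

// fp16 is supported on every architecture this file targets, so the
// half specialization needs no guard.
template <>
__device__ __half NegativeInfinity<__half>()
{
    // 0xFC00 is the IEEE fp16 bit pattern for -inf.
    return __ushort_as_half((unsigned short)0xFC00U);
}

#if __CUDA_ARCH__ >= 800
// Only device passes targeting sm_80 or newer compile this; pre-Ampere
// device passes never see any __nv_bfloat16 math.
template <>
__device__ __nv_bfloat16 NegativeInfinity<__nv_bfloat16>()
{
    // 0xFF80 is the bfloat16 bit pattern for -inf.
    return __ushort_as_bfloat16((unsigned short)0xFF80U);
}
#endif

Building with nvcc -gencode arch=compute_80,code=sm_80 compiles the guarded specialization in; a pre-Ampere target such as sm_75 simply omits it. One caveat when reading the last hunk: __CUDA_ARCH__ is never defined during the host compilation pass, so the same guard always evaluates false in host code regardless of the target architecture.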