
Commit d8bb401

fix: fix test case

Parent: 3018f91

8 files changed (+32, -18 lines)

lmdeploy/turbomind/tokenizer_info.py (2 additions, 2 deletions)

@@ -188,7 +188,7 @@ def from_huggingface(
         try:
             vocab_dict = tokenizer.get_vocab()
         except AttributeError as e:
-            msg = (f"Cannot get the vocabulary of the tokenizer {type(tokenizer)}. The tokenizer "
+            msg = (f'Cannot get the vocabulary of the tokenizer {type(tokenizer)}. The tokenizer '
                    'should have a get_vocab method.')
             raise ValueError(msg) from e

@@ -284,7 +284,7 @@ def from_huggingface(

         else:
             # TODO(yixin): unsupported tokenizer
-            raise ValueError(f"Unsupported tokenizer type: {type(tokenizer)}")
+            raise ValueError(f'Unsupported tokenizer type: {type(tokenizer)}')

     @property
     def vocab_type(self) -> VocabType:

requirements/test.txt (1 addition, 0 deletions)

@@ -1,5 +1,6 @@
 allure-pytest
 coverage
+jsonschema
 nvidia-ml-py
 pytest
 pytest-assume

src/turbomind/engine/request.h (4 additions, 2 deletions)

@@ -37,7 +37,8 @@ struct GenerationConfig {

     int output_logprobs = 0;

-    enum OutType {
+    enum OutType
+    {
         kNone = 0,
         kAll = 1,
         kGeneration = 2

@@ -139,7 +140,8 @@ struct Request {

     int ec;  // set when disabling conflicting requests

-    enum {
+    enum
+    {
         kOk = 0,
         kInvalid = 1,  // Sequence not exist or both `start` & `stop` (instead of `end`) is set
         kConflict = 2,  // Concurrent requests to the same sequence

src/turbomind/kernels/apply_token_bitmask_inplace_cuda.cu (7 additions, 1 deletion)

@@ -31,9 +31,11 @@ using namespace std;
 #define CUDART_INF_FP16 __ushort_as_half((unsigned short)0x7C00U)
 #endif

+#if __CUDA_ARCH__ >= 800
 #ifndef CUDART_INF_BF16
 #define CUDART_INF_BF16 __ushort_as_bfloat16((unsigned short)0x7F80U)
 #endif
+#endif

 constexpr int32_t BITS_PER_BLOCK = 32;
 constexpr int32_t THREADS_PER_THREAD_BLOCK = 256;

@@ -50,11 +52,13 @@ __device__ __half NegativeInfinity<__half>()
     return -CUDART_INF_FP16;
 }

+#if __CUDA_ARCH__ >= 800
 template<>
 __device__ __nv_bfloat16 NegativeInfinity<__nv_bfloat16>()
 {
     return -CUDART_INF_BF16;
 }
+#endif

 template<typename T, typename PackedT>
 __device__ PackedT PackedNegativeInfinity()

@@ -217,13 +221,15 @@ void ApplyTokenBitmaskInplace(Tensor logits, Tensor bitmask, std::optional<Tenso
             logits.data<half_t>(), bitmask.data<int32_t>(), indices_ptr, vocab_size, 0, 0, num_rows);
             break;
         }
+#if __CUDA_ARCH__ >= 800
         case kBfloat16: {
             ApplyTokenBitmaskInplaceDispatchToPackedT(
                 logits.data<bfloat16_t>(), bitmask.data<int32_t>(), indices_ptr, vocab_size, 0, 0, num_rows);
             break;
         }
+#endif
         default:
-            TM_CHECK(false) << "logits dtype must be float, half or bfloat16.";
+            TM_CHECK(false) << "logits dtype must be float, float16 or bfloat16.";
             break;
     }
 }
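
The guards added above follow the usual CUDA pattern for bfloat16: native __nv_bfloat16 device intrinsics require compute capability 8.0 (Ampere) or newer, and because __CUDA_ARCH__ is defined only during device compilation passes, the fenced code also drops out of host-side passes. A minimal standalone sketch of the pattern (names here are illustrative, not from the repository):

#include <cuda_bf16.h>
#include <math_constants.h>

// Per-dtype device helper; the bf16 specialization exists only when the
// compiler targets compute capability >= 8.0.
template<typename T>
__device__ T NegativeInfinitySketch();

template<>
__device__ float NegativeInfinitySketch<float>()
{
    return -CUDART_INF_F;  // from math_constants.h
}

#if __CUDA_ARCH__ >= 800
template<>
__device__ __nv_bfloat16 NegativeInfinitySketch<__nv_bfloat16>()
{
    // 0xFF80 is the bfloat16 bit pattern of -inf (sign = 1, exponent all ones).
    return __ushort_as_bfloat16((unsigned short)0xFF80U);
}
#endif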

src/turbomind/layers/sampling_layers/GuidedDecodeMaskLayer.cc (8 additions, 8 deletions; the first hunk is a whitespace-only realignment)

@@ -39,8 +39,8 @@ void GuidedDecodeMaskLayer<T>::Forward(TensorMap& args)
 {
     TM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);

-    Tensor_<float> logits = args.at("logits");
-    const ssize_t bsz = logits.shape(0);
+    Tensor_<float> logits = args.at("logits");
+    const ssize_t  bsz    = logits.shape(0);

     FT_CHECK(bsz == matchers_.size());

@@ -56,19 +56,19 @@ void GuidedDecodeMaskLayer<T>::Forward(TensorMap& args)
         bitmap_shape.data(),
         nullptr,
         0};
-
+    bool need_apply = false;
     for (size_t i = 0; i < bsz; ++i) {
         const auto& matcher = matchers_[i];
         if (matcher) {
             matcher->FillNextTokenBitmask(&bitmask_dltensor, i);
+            need_apply = true;
         }
     }

-    Copy(bitmask, bitmask_device);
-    ApplyTokenBitmaskInplace(logits, bitmask_device);
-
-    //xgrammar::ApplyTokenBitmaskInplaceCPU(&logits_dltensor, bitmask_dltensor, vocab_size_, std::nullopt);
-
+    if (need_apply) {
+        Copy(bitmask, bitmask_device);
+        ApplyTokenBitmaskInplace(logits, bitmask_device);
+    }
 }

 template class GuidedDecodeMaskLayer<float>;
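
With the need_apply flag, a batch that contains no guided request (all matchers null) now skips both the host-to-device copy and the masking kernel launch. For context, applying a token bitmask boils down to the following; this is a simplified sketch of the idea, not the repository's actual kernel, which packs its loads into wider types (see BITS_PER_BLOCK = 32 above):

#include <cstdint>
#include <math_constants.h>

// Simplified sketch: bit (v % 32) of word (v / 32) says whether vocabulary
// token v is currently allowed by the grammar; banned tokens get -inf logits
// so sampling can never select them.
__global__ void ApplyBitmaskSketch(float* logits, const int32_t* bitmask, int vocab_size)
{
    const int v = blockIdx.x * blockDim.x + threadIdx.x;
    if (v < vocab_size && ((bitmask[v / 32] >> (v % 32)) & 1) == 0) {
        logits[v] = -CUDART_INF_F;
    }
}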

src/turbomind/layers/sampling_layers/GuidedDecodeUpdateLayer.cc (3 additions, 1 deletion)

@@ -48,7 +48,9 @@ void GuidedDecodeUpdateLayer<T>::Forward(TensorMap& args)

     for (size_t i = 0; i < bsz; ++i) {
         const auto& matcher = matchers_[i];
-        matcher->AcceptToken(output_ids_buf.data()[i]);
+        if (matcher) {
+            matcher->AcceptToken(output_ids_buf.data()[i]);
+        }
     }
 }

src/turbomind/python/bind.cpp (2 additions, 2 deletions; whitespace-only realignment)

@@ -275,10 +275,10 @@ static void safe_memcpy(void* dst, const void* src, size_t size)
 namespace {

 struct ScopedGIL {
-    ScopedGIL(const ScopedGIL&) = delete;
+    ScopedGIL(const ScopedGIL&)            = delete;
     ScopedGIL& operator=(const ScopedGIL&) = delete;
     ScopedGIL(ScopedGIL&&) = delete;
-    ScopedGIL& operator=(ScopedGIL&&) = delete;
+    ScopedGIL& operator=(ScopedGIL&&)      = delete;
     ScopedGIL()
     {
         state = PyGILState_Ensure();
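
ScopedGIL, only partially visible in this hunk, is a standard RAII guard for the CPython GIL: acquire in the constructor, release in the destructor, with copies and moves deleted so the saved state can never be released twice. A minimal sketch of the same pattern, assuming only the standard PyGILState API:

#include <Python.h>

// Minimal RAII GIL guard: any scope that constructs one holds the GIL until
// the guard is destroyed, even on early return or exception.
struct ScopedGILSketch {
    ScopedGILSketch(const ScopedGILSketch&) = delete;
    ScopedGILSketch& operator=(const ScopedGILSketch&) = delete;
    ScopedGILSketch(ScopedGILSketch&&) = delete;
    ScopedGILSketch& operator=(ScopedGILSketch&&) = delete;
    ScopedGILSketch()
    {
        state = PyGILState_Ensure();  // acquire (or re-enter) the GIL
    }
    ~ScopedGILSketch()
    {
        PyGILState_Release(state);  // restore the previous GIL state
    }
    PyGILState_STATE state;
};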

tests/test_lmdeploy/test_grammar.py (5 additions, 2 deletions)

@@ -1,12 +1,15 @@
+import json
+
 import pytest
+from jsonschema import validate

 from lmdeploy import pipeline
 from lmdeploy.messages import GenerationConfig, TurbomindEngineConfig


 @pytest.fixture(scope='module')
 def tiny_model_id():
-    return 'Qwen/Qwen2.5-0.5B'
+    return 'internlm/internlm2_5-1_8b'

@@ -54,4 +57,4 @@ def test_tm_guided_pipeline(tiny_model_id):
                  log_level='INFO')
     gen_config = GenerationConfig(response_format=dict(type='json_schema', json_schema=dict(name='test', schema=guide)))
     response = pipe(['Make a self introduction please.'], gen_config=gen_config)
-    assert False, response
+    validate(instance=json.loads(response[0].text), schema=guide)
