# inference-stream-openvino-only-stateless.py
# Run an LLM chat model with OpenVINO only (supports only STATELESS LLM models)
# - Works without 'optimum-intel', 'PyTorch', or the HuggingFace tokenizers.
# This program uses a sampling method to generate the output text.
import os
import json
import numpy as np
import openvino as ov
from simple_tokenizer import SimpleTokenizer
from misc import sampling
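# The 'sampling' helper imported above comes from the local 'misc' module, which is not shown in
# this file. The function below is a hypothetical sketch (not the actual 'misc.sampling'
# implementation) of what a temperature / top-k / top-p sampler over a 1-D logits vector
# typically looks like; it is given a different name so it does not shadow the imported one.
def _sampling_sketch(logits, temperature=1.0, top_k=0, top_p=1.0, do_sample=True):
    if not do_sample:
        return int(np.argmax(logits))                     # greedy decoding
    logits = logits.astype(np.float64) / max(temperature, 1e-8)
    if top_k > 0:                                         # keep only the top-k largest logits
        kth_largest = np.sort(logits)[-top_k]
        logits = np.where(logits < kth_largest, -np.inf, logits)
    probs = np.exp(logits - np.max(logits))               # softmax
    probs /= probs.sum()
    if top_p < 1.0:                                       # nucleus (top-p) filtering
        order = np.argsort(probs)[::-1]                   # token IDs sorted by descending probability
        cum = np.cumsum(probs[order])
        drop = order[cum > top_p][1:]                     # keep the token that crosses top_p, drop the rest
        probs[drop] = 0.0
        probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))     # draw one token ID from the filtered distribution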
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
#model_id = 'Intel/neural-chat-7b-v3'
model_vendor, model_name = model_id.split('/')
model_precision = ['FP16', 'INT8', 'INT4', 'INT4_stateless'][3]
print(f'LLM model: {model_id}, {model_precision}')
#from transformers import AutoTokenizer
#tokenizer = AutoTokenizer.from_pretrained(model_id) # Fast and reliable :-)
tokenizer = SimpleTokenizer(model_vendor, model_name)    # a (somewhat) HuggingFace-compatible tokenizer (simple, slow, and dumb)
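# The local 'SimpleTokenizer' (not shown in this file) only has to cover the small subset of the
# HuggingFace tokenizer interface used below: being callable with (text=..., return_tensors='np')
# and returning an object with '.input_ids' and '.attention_mask' numpy arrays, plus a
# '.decode(ids)' method and an '.eos_token_id' attribute.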
with open(os.path.join(model_name, model_precision, 'config.json')) as f:
    config_file = json.load(f)
num_attention_heads = config_file['num_attention_heads']
num_hidden_layers = config_file['num_hidden_layers']
num_key_value_heads = config_file['num_key_value_heads']
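# Only 'num_hidden_layers' is used further below (to loop over the per-layer KV-cache tensors);
# the attention / key-value head counts are read for reference, as they are part of what
# determines the KV-cache tensor shapes in the IR.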
device = 'CPU'
ov_core = ov.Core()
ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": "./cache"}
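# NOTE: This script expects an already-converted OpenVINO IR at
# '<model_name>/<model_precision>/openvino_model.xml' (plus its 'config.json'), produced
# beforehand outside of this script, e.g. with optimum-intel's OpenVINO export tooling.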
ov_model = ov_core.read_model(model=f'{model_name}/{model_precision}/openvino_model.xml')
print(f'Compiling the model to {device}')
compiled_model = ov.compile_model(ov_model, device, ov_config)
infer_request = compiled_model.create_infer_request()
input_names = [ layer.any_name for layer in ov_model.inputs ] # Obtain names of the model inputs
key_value_input_names = [key for key in input_names if 'key_values' in key]
# Generate dummy KV-cache tensors for the 1st iteration
inputs = {}
for input_name in key_value_input_names:
    model_input = ov_model.input(input_name)
    shape = model_input.get_partial_shape()
    shape[0] = 1  # batch size
    if shape[2].is_dynamic:
        shape[2] = 0
    else:
        shape[1] = 0
    inputs[input_name] = ov.Tensor(model_input.get_element_type(), shape.get_shape())
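# Each 'past_key_values.<n>.key' / '.value' input of a stateless LLM IR typically has a dynamic
# shape like [batch, num_key_value_heads, past_sequence_length, head_size]; creating the tensors
# with a sequence length of 0 signals that no context has been cached yet on the 1st iteration.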
question = 'Explain the plot of Cinderella in a sentence.'
question = 'What is the best food in Tokyo?'
prompt_text_tinyllama = f"""\
<|system|>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. If a question does not make any sense or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.</s>
<|user|>
{question}</s>
<|assistant|>
"""
prompt_text_neuralchat = f"""\
### System:
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. If a question does not make any sense or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
### User:
{question}
### Assistant:
"""
input_text = prompt_text_tinyllama     # the templates above are f-strings, so {question} is already substituted
#input_text = prompt_text_neuralchat
print(f'Question: {question}')
# Tokenize the input text (text -> token IDs)
# - The model input for the 1st iteration
input_tokens = tokenizer(text=input_text, return_tensors='np')
inputs['input_ids'] = input_tokens.input_ids
inputs['attention_mask'] = input_tokens.attention_mask
position = inputs['input_ids'].shape[-1]
inputs['position_ids'] = np.arange(position, dtype=np.int64).reshape(1, position)
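# 'position_ids' enumerate the absolute positions (0 .. prompt_length-1) of the prompt tokens;
# from the 2nd iteration onward only the single new token's position is supplied.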
# Sampling parameters for generated word
temperature = 1.0
top_p = 0.85
top_k = 10
print(f'Sampling parameters - Temperature: {temperature}, Top-p: {top_p}, Top-k: {top_k}')
prev_output = ''
generated_text_ids = np.array([], dtype=np.int32)
eos_token_id = tokenizer.eos_token_id
num_max_token_for_generation = 300
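# Generation loop: the 1st iteration processes the whole prompt at once ("prefill"); every later
# iteration feeds back only the newly sampled token together with the KV-cache returned by the
# previous inference ("decode"), until an EOS token appears or the token limit is reached.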
for i in range(num_max_token_for_generation):
    # Run inference (to generate the logits for the next token prediction)
    response = infer_request.infer(inputs)
    logits = response['logits'][0, -1, :]
    sampled_id = sampling(logits, temperature=temperature, top_k=top_k, top_p=top_p, do_sample=True)
    if sampled_id == eos_token_id:
        print('\n*** EOS token detected.')
        break
    generated_text_ids = np.append(generated_text_ids, sampled_id)    # Append the sampled token ID to the generated token ID array
    output_text = tokenizer.decode(generated_text_ids)                # Decode the accumulated token IDs back into text
    print(output_text[len(prev_output):], end='', flush=True)         # Print only the newly generated part
    prev_output = output_text

    # Update the KV-cache values with the inference results
    for n in range(num_hidden_layers):
        inputs[f'past_key_values.{n}.key'] = response[f'present.{n}.key']
        inputs[f'past_key_values.{n}.value'] = response[f'present.{n}.value']

    # From the 2nd iteration onward, supply only the last sampled token ID as the model input
    inputs['input_ids'] = np.array([[sampled_id]], dtype=np.int64)
    inputs['attention_mask'] = np.array([[1]], dtype=np.int64)
    inputs['position_ids'] = np.array([[position]], dtype=np.int64)
    position += 1
print(f'\n\n*** Completed.')
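# A typical run prints the question, streams the generated answer to the console token by token,
# and ends with '*** Completed.' (or '*** EOS token detected.' if the model emits an
# end-of-sequence token before the 300-token limit is reached).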