inference.py
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, AutoConfig
model_id = 'TinyLlama/TinyLlama-1.1B-Chat-v1.0'
#model_id = 'Intel/neural-chat-7b-v3'
model_vendor, model_name = model_id.split('/')
model_precision = ['FP16', 'INT8', 'INT4', 'INT4_stateless'][2]   # index 2 selects 'INT4'
print(f'LLM model: {model_id}, {model_precision}')
tokenizer = AutoTokenizer.from_pretrained(model_id)
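# A minimal sketch (not part of the original script): if the local OpenVINO model
# directory has not been prepared yet, it could be exported from the Hub model with
# optimum-intel and written out with save_pretrained(), which produces
# 'openvino_model.xml/.bin' and 'config.json'. Note that export=True converts the
# original weights as-is; producing the INT8/INT4 variants selected above would
# normally be done separately (e.g. with the optimum-cli export tooling).
import os
if not os.path.isdir(f'{model_name}/{model_precision}'):
    exported_model = OVModelForCausalLM.from_pretrained(model_id, export=True)
    exported_model.save_pretrained(f'{model_name}/{model_precision}')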
ov_model = OVModelForCausalLM.from_pretrained(
    model_id=f'{model_name}/{model_precision}',   # <- OpenVINO model directory. Must contain 'openvino_model[.xml|.bin]' and 'config.json'.
    config=AutoConfig.from_pretrained(model_id),  # the model configuration is still taken from the original Hub model ID
)
# The simplest, most naive generation method (greedy decoding, the generate() default)
prompt_text = 'Explain the plot of Cinderella in a sentence.\n\n'
input_tokens = tokenizer.encode(prompt_text, return_tensors='pt')
response = ov_model.generate(input_tokens, max_new_tokens=300)    # Other options: temperature=1.0, do_sample=True, top_k=5, top_p=0.85, repetition_penalty=1.2
response_text = tokenizer.decode(response[0], skip_special_tokens=True)
print(response_text)
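
# A minimal sketch of sampling-based generation using the options noted in the
# comment above (the values are illustrative, not tuned for this model):
sampled = ov_model.generate(
    input_tokens,
    max_new_tokens=300,
    do_sample=True,          # sample from the distribution instead of greedy decoding
    temperature=1.0,         # soften/sharpen the next-token distribution
    top_k=5,                 # consider only the 5 most likely tokens at each step
    top_p=0.85,              # nucleus sampling: keep tokens covering 85% of probability mass
    repetition_penalty=1.2,  # penalize tokens that have already appeared
)
print(tokenizer.decode(sampled[0], skip_special_tokens=True))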