#!/usr/bin/env python3
"""
STIP end-to-end inference demo: init, client encrypt, server forward, decode.
Requires: mlx, transformers. Uses GPU (Metal) by default; pass --cpu for CPU.
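
Example usage (assumes the sharded weights already exist in ./stip_model, the default --model-dir):
    python main.py --prompt "Hello, how are you?" --max-new-tokens 32
    python main.py --cpu --quiet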
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
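
# Make this file's directory importable so the local `src` package (src.inference) resolves.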
sys.path.insert(0, str(Path(__file__).resolve().parent))
# Set device before any MLX model load: default GPU (Metal), opt-in CPU with --cpu
if "--cpu" in sys.argv:
import mlx.core as _mx
_mx.set_default_device(_mx.cpu)
else:
import mlx.core as _mx
if hasattr(_mx, "gpu"):
_mx.set_default_device(_mx.gpu)
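
# Imported after the device is configured, per the "set device before any MLX model load" note above.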
from src.inference import load_model, run_inference


def main() -> None:
    p = argparse.ArgumentParser(description="STIP end-to-end inference demo")
    p.add_argument(
        "--model-dir",
        type=str,
        default="stip_model",
        help="STIP sharded weight dir (manifest.json + non_layer*.safetensors + layer_XX.safetensors)",
    )
    p.add_argument(
        "--manifest",
        type=str,
        default=None,
        help="Path to manifest.json (default <model-dir>/manifest.json)",
    )
    p.add_argument(
        "--tokenizer",
        type=str,
        default="Qwen/Qwen2.5-3B",
        help="HuggingFace model name or path for tokenizer",
    )
    p.add_argument(
        "--prompt",
        type=str,
        default="Hello, how are you?",
        help="Input text",
    )
    p.add_argument(
        "--max-new-tokens",
        type=int,
        default=32,
        help="Max new tokens to generate",
    )
    p.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress per-step timing and progress",
    )
    p.add_argument(
        "--cpu",
        action="store_true",
        help="Use CPU (slower; default is GPU/Metal)",
    )
    p.add_argument(
        "--compile-decode",
        action="store_true",
        help="JIT-compile decode (may recompile each step with growing cache; often slower)",
    )
    p.add_argument(
        "--profile",
        action="store_true",
        help="Print per-step breakdown: forward vs logits (first 2 steps only)",
    )
    args = p.parse_args()

    # Load from sharded safetensors dir: manifest.json + non_layer*.safetensors + layer_XX.safetensors
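    # `chain` below presumably carries the client-side STIP transform state
    # ("client encrypt ... decode" in the module docstring); run_inference takes it
    # alongside the model and tokenizer.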
    if not args.quiet:
        import time as _time
        t_load_start = _time.perf_counter()
    try:
        model, tokenizer, chain = load_model(
            args.model_dir,
            manifest_path=args.manifest,
            tokenizer_name=args.tokenizer,
        )
    except FileNotFoundError as e:
        print(f"Error: {e}")
        sys.exit(1)
    except Exception as e:
        if "transformers" in str(e) or "AutoTokenizer" in str(e):
            print("Error: the transformers package is required: pip install transformers")
        else:
            print(f"Error: {e}")
        sys.exit(1)
    if not args.quiet:
        t_load_sec = _time.perf_counter() - t_load_start
        print(f"Load: {t_load_sec:.2f}s")
        print(f"Model: layers={model.num_layers}, hidden_size={model.hidden_size}")
    if not args.quiet:
        import mlx.core as _mx
        dev = _mx.default_device()
        print(f"Device: {dev}")
        print("Inference timing:")
    output_text, total_time = run_inference(
        model, tokenizer, chain,
        prompt=args.prompt,
        max_new_tokens=args.max_new_tokens,
        compile_decode=args.compile_decode,
        verbose=not args.quiet,
        profile=args.profile,
    )
    if args.quiet:
        print(f"Total time: {total_time:.4f} s")
    print(f"Output:\n {output_text}")
    print("Done.")


if __name__ == "__main__":
    main()