Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
get-pip.py
kvshare-env/
__pycache__/
.DS_Store
.env
.env
18 changes: 18 additions & 0 deletions benchmarks/default_bench.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
==========================
Average Inference Time: 4.7325 sec
Best Inference Time: 4.8443 sec
Median Inference Time: 4.7036 sec
Worst Inference Time: 4.5913 sec
First Token Generation Time: 4.5913 sec
Second Token to Last Token Generation Time: 19.0710 sec
==========================
Average VRAM Usage: 55.4200 % VRAM Usage
Best VRAM Usage: 55.3000 % VRAM Usage
Median VRAM Usage: 55.4000 % VRAM Usage
Worst VRAM Usage: 55.6000 % VRAM Usage
==========================
Average CPU Usage: 29.4200 % CPU Usage
Best CPU Usage: 28.3286 % CPU Usage
Median CPU Usage: 29.5786 % CPU Usage
Worst CPU Usage: 30.3214 % CPU Usage
Tokens per second: 10.57
71 changes: 69 additions & 2 deletions gpt_benchmarks.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
import torch
import time
import math
import psutil

# Name of the model under test. The trailing comment records a larger
# alternative that can be swapped in.
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # or "meta-llama/Llama-2-7b-chat-hf"
Expand All @@ -15,35 +17,100 @@
# Run on the GPU when CUDA is available, otherwise fall back to the CPU,
# then move the model onto the selected device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


def _print_usage_stats(label, samples):
    """Print average, best (lowest), median and worst (highest) usage.

    ``label`` is the resource name inserted into each printed line (for
    example ``"RAM"``); ``samples`` is a non-empty list of percentages.
    """
    import statistics

    average = sum(samples) / len(samples)
    ordered = sorted(samples)
    print(f"Average {label} Usage: {average:.4f} % {label} Usage")
    print(f"Best {label} Usage: {ordered[0]:.4f} % {label} Usage")
    print(f"Median {label} Usage: {statistics.median(ordered):.4f} % {label} Usage")
    print(f"Worst {label} Usage: {ordered[-1]:.4f} % {label} Usage")


def bench(prompt, max_length=50, runs=5, isCached=False, isDistributed=False):
    """Time repeated generations for ``prompt`` and print a benchmark report.

    The prompt is tokenized once and moved to the module-level ``device``.
    After one untimed warm-up generation, ``model.generate`` is called
    ``runs`` times; wall-clock latency, system RAM usage and this process's
    CPU usage are sampled for each run. Latency, RAM and CPU statistics,
    throughput, GPU memory (when CUDA is available) and the decoded text of
    the last run are printed to stdout.

    Args:
        prompt: Text passed to the tokenizer.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        runs: Number of timed generations; must be at least 1.
        isCached: Reserved for cache-specific metrics (not implemented).
        isDistributed: Reserved for distributed metrics (not implemented).

    Raises:
        ValueError: If ``runs`` is less than 1.
    """
    import statistics

    if runs < 1:
        # Without at least one sample every statistic below is undefined.
        raise ValueError("runs must be at least 1")

    # Move the tokenized prompt tensors onto the same device as the model.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Warm up so one-time setup cost is not attributed to the first timed run.
    with torch.no_grad():
        model.generate(**inputs, max_length=max_length)

    times = []
    ram_usages = []
    cpu_usages = []
    thisProc = psutil.Process()
    # Prime the CPU counter: later calls report usage since the previous call.
    thisProc.cpu_percent()

    for _ in range(runs):
        # perf_counter is monotonic and high resolution, unlike time.time().
        start_time = time.perf_counter()
        with torch.no_grad():  # No gradient
            output_ids = model.generate(**inputs, max_length=max_length)
        times.append(time.perf_counter() - start_time)
        ram_usages.append(psutil.virtual_memory().percent)
        # Divide by the CPU count to normalize the per-process percentage.
        cpu_usages.append(thisProc.cpu_percent() / psutil.cpu_count())

    # decode (the last timed run's output)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    # core latency metrics
    avg_time = sum(times) / runs
    # NOTE(review): these two values are the latency of the first timed run
    # and the summed latency of the remaining runs, not per-token timings as
    # the printed labels suggest. Labels are kept for output compatibility;
    # confirm the intended measurement.
    first_token_time = times[0]
    rest_tokens_time = sum(times) - first_token_time
    ordered_times = sorted(times)
    # Ascending order: the shortest latency is the best, the longest the worst.
    best_time = ordered_times[0]
    worst_time = ordered_times[-1]
    median_time = statistics.median(ordered_times)
    print("==========================")
    print(f"Average Inference Time: {avg_time:.4f} sec")
    print(f"Best Inference Time: {best_time:.4f} sec")
    print(f"Median Inference Time: {median_time:.4f} sec")
    print(f"Worst Inference Time: {worst_time:.4f} sec")
    print(f"First Token Generation Time: {first_token_time:.4f} sec")
    print(f"Second Token to Last Token Generation Time: {rest_tokens_time:.4f} sec")

    # ram
    print("==========================")
    _print_usage_stats("RAM", ram_usages)

    # cpu usage
    print("==========================")
    _print_usage_stats("CPU", cpu_usages)

    # tokens per second: count only newly generated tokens. max_length also
    # includes the prompt, and generation may stop before reaching it.
    num_tokens = output_ids.shape[-1] - inputs["input_ids"].shape[-1]
    throughput = num_tokens / avg_time
    print(f"Tokens per second: {throughput:.2f}")

    # Quality Metrics (ie correctness)

    if isDistributed:
        # Requests per Second
        # Disk I/O
        # Network I/O
        pass

    if isCached:
        # Cache-Only Metrics
        # Hit Rate
        # Miss Rate
        # Eviction Rate
        # Cache Lookup Time
        pass

    # Resource Metrics (left to implement)
    # VRAM
    # Energy Consumption (?)

    # memory or vram usage
    if torch.cuda.is_available():
        print(f"GPU Memory Usage: {torch.cuda.memory_allocated() / 1e6:.2f} MB")

    print(response)

Expand Down