SamuelShao1 · byoseph3 · Apr 5, 2025 · Apr 5, 2025 · Apr 5, 2025 · Apr 5, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
+get-pip.py
 kvshare-env/
 __pycache__/
 .DS_Store
-.env
+.env
diff --git a/benchmarks/default_bench.txt b/benchmarks/default_bench.txt
@@ -0,0 +1,18 @@
+==========================
+Average Inference Time: 4.7325 sec
+Best Inference Time: 4.8443 sec
+Median Inference Time: 4.7036 sec
+Worst Inference Time: 4.5913 sec
+First Token Generation Time: 4.5913 sec
+Second Token to Last Token Generation Time: 19.0710 sec
+==========================
+Average VRAM Usage: 55.4200 % VRAM Usage
+Best VRAM Usage: 55.3000 % VRAM Usage
+Median VRAM Usage: 55.4000 % VRAM Usage
+Worst VRAM Usage: 55.6000 % VRAM Usage
+==========================
+Average CPU Usage: 29.4200 % CPU Usage
+Best CPU Usage: 28.3286 % CPU Usage
+Median CPU Usage: 29.5786 % CPU Usage
+Worst CPU Usage: 30.3214 % CPU Usage
+Tokens per second: 10.57
diff --git a/gpt_benchmarks.py b/gpt_benchmarks.py
@@ -1,6 +1,8 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel
 import torch
 import time
+import math
+import psutil
 
 # Model Name
 model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # or "meta-llama/Llama-2-7b-chat-hf"
@@ -15,35 +17,100 @@
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
-
-def bench(prompt, max_length=50, runs=5):
+def bench(prompt, max_length=50, runs=5, isCached=False, isDistributed=False):
+    """Time repeated ``model.generate`` calls on ``prompt`` and print the stats.
+
+    One untimed warm-up generation is performed first. Each of the ``runs``
+    timed generations then records wall-clock latency, system RAM usage
+    (percent) and this process's CPU usage divided by the CPU count.
+    Average / best / median / worst values, a tokens-per-second estimate,
+    allocated GPU memory (when CUDA is available) and the decoded text of
+    the last generation are printed; the statistics are not returned.
+
+    Args:
+        prompt: Text handed to the module-level ``tokenizer``.
+        max_length: Forwarded to ``model.generate`` as ``max_length`` and
+            also used as the token count in the throughput estimate.
+        runs: Number of timed generations (must be at least 1).
+        isCached: Placeholder for cache-only metrics; not implemented yet.
+        isDistributed: Placeholder for distributed metrics; not implemented
+            yet.
+    """
     # '.to' to the device tensors (n-arrays) to put on.
     inputs = tokenizer(prompt, return_tensors="pt").to(device)
 
     # Warm up
     _ = model.generate(**inputs, max_length=max_length)
 
     times = []
+    ram_usages = []
+    cpu_usages = []
+    thisProc = psutil.Process()
+    # Priming call: psutil measures CPU usage relative to the previous call,
+    # so this only establishes the reference point for the first timed run.
+    thisProc.cpu_percent()
 
     for _ in range(runs):
         start_time = time.time()
         with torch.no_grad(): # No gradient
             output_ids = model.generate(**inputs, max_length=max_length)
         times.append(time.time() - start_time)
+        ram_usages.append(psutil.virtual_memory().percent)
+        cpu_usages.append(thisProc.cpu_percent() / psutil.cpu_count())
 
         # decode
         response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+    # core latency metrics
+    # Middle indices of a sorted sample list: they coincide when runs is odd
+    # and are averaged when runs is even, giving the true median either way.
+    lo_mid = (runs - 1) // 2
+    hi_mid = runs // 2
     avg_time = sum(times) / runs
+    # NOTE(review): these two values are per-run totals, not per-token
+    # timings: the duration of the first timed run and the combined duration
+    # of the remaining runs. The printed labels are kept unchanged.
+    first_token_time = times[0]
+    rest_tokens_time = sum(times) - first_token_time
+    # Ascending sort: the lowest latency is the best run, the highest the worst.
+    times.sort()
+    best_time = times[0]
+    worst_time = times[runs-1]
+    median_time = (times[lo_mid] + times[hi_mid]) / 2
+    print("==========================")
     print(f"Average Inference Time: {avg_time:.4f} sec")
+    print(f"Best Inference Time: {best_time:.4f} sec")
+    print(f"Median Inference Time: {median_time:.4f} sec")
+    print(f"Worst Inference Time: {worst_time:.4f} sec")
+    print(f"First Token Generation Time: {first_token_time:.4f} sec")
+    print(f"Second Token to Last Token Generation Time: {rest_tokens_time:.4f} sec")
+
+    # ram (lower usage is reported as best)
+    print("==========================")
+    avg_ram_usage = sum(ram_usages) / runs
+    ram_usages.sort()
+    best_ram_usage = ram_usages[0]
+    median_ram_usage = (ram_usages[lo_mid] + ram_usages[hi_mid]) / 2
+    worst_ram_usage = ram_usages[runs-1]
+    print(f"Average RAM Usage: {avg_ram_usage:.4f} % RAM Usage")
+    print(f"Best RAM Usage: {best_ram_usage:.4f} % RAM Usage")
+    print(f"Median RAM Usage: {median_ram_usage:.4f} % RAM Usage")
+    print(f"Worst RAM Usage: {worst_ram_usage:.4f} % RAM Usage")
+
+    # cpu usage (lower usage is reported as best)
+    print("==========================")
+    avg_cpu_usage = sum(cpu_usages) / runs
+    cpu_usages.sort()
+    best_cpu_usage = cpu_usages[0]
+    median_cpu_usage = (cpu_usages[lo_mid] + cpu_usages[hi_mid]) / 2
+    worst_cpu_usage = cpu_usages[runs-1]
+    print(f"Average CPU Usage: {avg_cpu_usage:.4f} % CPU Usage")
+    print(f"Best CPU Usage: {best_cpu_usage:.4f} % CPU Usage")
+    print(f"Median CPU Usage: {median_cpu_usage:.4f} % CPU Usage")
+    print(f"Worst CPU Usage: {worst_cpu_usage:.4f} % CPU Usage")
 
     # tokens per second
     num_tokens = max_length
     throughput = num_tokens/avg_time
     print(f"Tokens per second: {throughput:.2f}")
 
+    # Quality Metrics (ie correctness)
+
+    if isDistributed:
+        # Requests per Second
+        # Disk I/O
+        # Network I/O
+        pass
+
+    if isCached:
+        # Cache-Only Metrics
+        # Hit Rate
+        # Miss Rate
+        # Eviction Rate
+        # Cache Lookup Time
+        pass
+
+    # Resource Metrics (left to implement)
+    # VRAM
+    # Energy Consumption (?)
+
     # memory or vram usage
     if torch.cuda.is_available():
         print(f"GPU Memory Usage: {torch.cuda.memory_allocated() / 1e6:.2f} MB")
+
 
     print(response)