diff --git a/.gitignore b/.gitignore index cf19b1b..5bff448 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ +get-pip.py kvshare-env/ __pycache__/ .DS_Store -.env \ No newline at end of file +.env diff --git a/benchmarks/default_bench.txt b/benchmarks/default_bench.txt new file mode 100644 index 0000000..0d49202 --- /dev/null +++ b/benchmarks/default_bench.txt @@ -0,0 +1,18 @@ +========================== +Average Inference Time: 4.7325 sec +Best Inference Time: 4.8443 sec +Median Inference Time: 4.7036 sec +Worst Inference Time: 4.5913 sec +First Token Generation Time: 4.5913 sec +Second Token to Last Token Generation Time: 19.0710 sec +========================== +Average VRAM Usage: 55.4200 % VRAM Usage +Best VRAM Usage: 55.3000 % VRAM Usage +Median VRAM Usage: 55.4000 % VRAM Usage +Worst VRAM Usage: 55.6000 % VRAM Usage +========================== +Average CPU Usage: 29.4200 % CPU Usage +Best CPU Usage: 28.3286 % CPU Usage +Median CPU Usage: 29.5786 % CPU Usage +Worst CPU Usage: 30.3214 % CPU Usage +Tokens per second: 10.57 \ No newline at end of file diff --git a/gpt_benchmarks.py b/gpt_benchmarks.py index f6c1d4e..b6521fd 100644 --- a/gpt_benchmarks.py +++ b/gpt_benchmarks.py @@ -1,6 +1,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel import torch import time +import math +import psutil # Model Name model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" # or "meta-llama/Llama-2-7b-chat-hf" @@ -15,8 +17,7 @@ device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model.to(device) - -def bench(prompt, max_length=50, runs=5): +def bench(prompt, max_length=50, runs=5, isCached=False, isDistributed=False): # '.to' to the device tensors (n-arrays) to put on. 
inputs = tokenizer(prompt, return_tensors="pt").to(device) @@ -24,26 +25,93 @@ def bench(prompt, max_length=50, runs=5): _ = model.generate(**inputs, max_length=max_length) times = [] + ram_usages = [] + cpu_usages = [] + thisProc = psutil.Process() + thisProc.cpu_percent() for _ in range(runs): start_time = time.time() with torch.no_grad(): # No gradient output_ids = model.generate(**inputs, max_length=max_length) times.append(time.time() - start_time) + ram_usages.append(psutil.virtual_memory().percent) + cpu_usages.append(thisProc.cpu_percent() / psutil.cpu_count()) # decode response = tokenizer.decode(output_ids[0], skip_special_tokens=True) + + # core latency metrics + mid = math.floor(runs / 2) avg_time = sum(times) / runs + first_token_time = times[0] + rest_tokens_time = sum(times) - first_token_time + times.sort() + # ascending order: the shortest run is the best, the longest the worst + best_time = times[0] + worst_time = times[runs-1] + median_time = times[mid] + print("==========================") print(f"Average Inference Time: {avg_time:.4f} sec") + print(f"Best Inference Time: {best_time:.4f} sec") + print(f"Median Inference Time: {median_time:.4f} sec") + print(f"Worst Inference Time: {worst_time:.4f} sec") + print(f"First Token Generation Time: {first_token_time:.4f} sec") + print(f"Second Token to Last Token Generation Time: {rest_tokens_time:.4f} sec") + + # ram + print("==========================") + avg_ram_usage = sum(ram_usages) / runs + ram_usages.sort() + best_ram_usage = ram_usages[0] + median_ram_usage = ram_usages[mid] + worst_ram_usage = ram_usages[runs-1] + print(f"Average RAM Usage: {avg_ram_usage:.4f} % RAM Usage") + print(f"Best RAM Usage: {best_ram_usage:.4f} % RAM Usage") + print(f"Median RAM Usage: {median_ram_usage:.4f} % RAM Usage") + print(f"Worst RAM Usage: {worst_ram_usage:.4f} % RAM Usage") + + # cpu usage + print("==========================") + avg_cpu_usage = sum(cpu_usages) / runs + cpu_usages.sort() + best_cpu_usage = cpu_usages[0] + median_cpu_usage = cpu_usages[mid] + worst_cpu_usage = 
cpu_usages[runs-1] + print(f"Average CPU Usage: {avg_cpu_usage:.4f} % CPU Usage") + print(f"Best CPU Usage: {best_cpu_usage:.4f} % CPU Usage") + print(f"Median CPU Usage: {median_cpu_usage:.4f} % CPU Usage") + print(f"Worst CPU Usage: {worst_cpu_usage:.4f} % CPU Usage") # tokens per second num_tokens = max_length throughput = num_tokens/avg_time print(f"Tokens per second: {throughput:.2f}") + # Quality Metrics (ie correctness) + + if isDistributed: + # Requests per Second + # Disk I/O + # Network I/O + pass + + if isCached: + # Cache-Only Metrics + # Hit Rate + # Miss Rate + # Eviction Rate + # Cache Lookup Time + pass + + # Resource Metrics (left to implement) + # VRAM + # Energy Consumption (?) + # memory or vram usage if torch.cuda.is_available(): print(f"GPU Memory Usage: {torch.cuda.memory_allocated() / 1e6:.2f} MB") + print(response)