
Commit 490bdf6

Merge remote-tracking branch 'pytorch/main' into parq

2 parents: 9e70d6d + e3db2b2

40 files changed: +986 −452 lines

.pre-commit-config.yaml (+1 −1)

```diff
@@ -2,7 +2,7 @@
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v4.4.0
+  rev: v5.0.0
   hooks:
   - id: trailing-whitespace
   - id: end-of-file-fixer
```
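Not part of the diff, but for context: a `rev` bump like this is normally produced and verified with pre-commit's own CLI. A minimal sketch, wrapped in Python's `subprocess` purely for illustration (running the two commands directly in a shell is equivalent):

```python
# Sketch: update hook revisions and re-run all hooks, as one would to
# produce and verify a bump like v4.4.0 -> v5.0.0. Assumes pre-commit is
# installed and the repo has a .pre-commit-config.yaml.
import subprocess

# Rewrites each hook's `rev` to the latest tag of its repo.
subprocess.run(["pre-commit", "autoupdate"], check=True)

# Runs every configured hook against the entire repository.
subprocess.run(["pre-commit", "run", "--all-files"], check=True)
```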

benchmarks/microbenchmarks/README.md (+13 −2)

````diff
@@ -30,11 +30,12 @@ python -m benchmarks.microbenchmarks.benchmark_runner --config path/to/config.ym
 
 ```yaml
 # Sample configuration for inference benchmarks
+benchmark_mode: "inference"
 quantization_config_recipe_names:
   - "baseline"
   - "int8wo"
-  - "int4wo-128"
-  - "int4wo-128-hqq"
+  - "float8wo"
+  - "float8dq-tensor"
 
 output_dir: "benchmarks/microbenchmarks/results"
 
@@ -50,10 +51,20 @@ model_params:
   compile: "max-autotune" # Options: "default", "max-autotune", "false"
   device: "cuda" # Options: "cuda", "mps", "xpu", "cpu"
   model_type: "linear" # Options: "linear", "ln_linear_sigmoid"
+  enable_profiler: true # Enable standard profiling
+  enable_memory_profiler: true # Enable CUDA memory profiling
 ```
 
 ## Configuration Options
 
+### Profiling Options
+- `enable_profiler`: Enable standard PyTorch profiling (default: false)
+- `enable_memory_profiler`: Enable CUDA memory profiling (default: false)
+  - Only works when device is set to "cuda"
+  - Generates memory snapshots before and after inference
+  - Creates visualizations of memory usage
+  - Outputs are saved in the memory_profiler subdirectory
+
 ### Quantization Methods
 Currently, quantization string is in same format as the one being passed in llama/generate.py.
 - `baseline`: No quantization
````
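The snapshot-and-visualization behavior the new Profiling Options bullets describe lines up with PyTorch's CUDA memory snapshot hooks. Below is a minimal sketch of that underlying workflow, assuming the benchmark's memory profiler builds on these (underscore-prefixed, semi-private) APIs; the model, input, and output filename are illustrative, not taken from the benchmark code:

```python
import torch

# Illustrative stand-ins; the real benchmark builds these from the YAML
# config (model_type, shapes, device).
model = torch.nn.Linear(4096, 4096, device="cuda")
input_data = torch.randn(16, 4096, device="cuda")

# Begin recording a history of CUDA allocation/free events.
torch.cuda.memory._record_memory_history(max_entries=100_000)

with torch.no_grad():
    model(input_data)
torch.cuda.synchronize()

# Write the recorded history to a pickle; the benchmark stores these under
# the memory_profiler subdirectory mentioned above.
torch.cuda.memory._dump_snapshot("linear_memory_profile.pickle")

# Stop recording.
torch.cuda.memory._record_memory_history(enabled=None)
```

The resulting pickle can be loaded into the interactive viewer at https://pytorch.org/memory_viz, which is presumably what the "visualizations of memory usage" bullet automates.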

benchmarks/microbenchmarks/benchmark_inference.py (+44 −3)

```diff
@@ -10,13 +10,16 @@
 - run() function is the main entry point for running inference benchmarks.
 """
 
+import os
 from copy import deepcopy
 from pathlib import Path
 
 import torch
 
 from benchmarks.microbenchmarks.profiler import (
+    generate_memory_profile,
     generate_model_profile,
+    visualize_memory_profile,
 )
 from benchmarks.microbenchmarks.utils import (
     BenchmarkConfig,
@@ -98,11 +101,49 @@ def run(config: BenchmarkConfig) -> BenchmarkResult:
         if config.enable_profiler:
             print("Running profiler...")
             try:
-                result.profiler_json_path = generate_model_profile(
-                    m_copy, input_data, config.profiler_file_name
+                profiler_json_path = generate_model_profile(
+                    model=m_copy,
+                    input_data=input_data,
+                    profile_file_path=os.path.join(
+                        config.output_dir,
+                        "profiler",
+                        f"{config._file_name}_profile.json",
+                    ),
                 )
+                result.profiler_json_path = profiler_json_path
             except Exception as e:
-                print(f"Error running profiler for {config.name} with error: {e}")
+                print(f"Error running profiler: {e}")
+
+        # Run memory profiler if enabled
+        if config.enable_memory_profiler:
+            print("Running memory profiler...")
+            try:
+                result.memory_profile_path, result.memory_stats = (
+                    generate_memory_profile(
+                        model=m_copy,
+                        input_data=input_data,
+                        profile_file_path=os.path.join(
+                            config.output_dir,
+                            "memory_profiler/pickle",
+                            f"{config._file_name}_memory_profile.pickle",
+                        ),
+                    )
+                )
+
+                if result.memory_profile_path:
+                    result.memory_visualization_path = visualize_memory_profile(
+                        result.memory_profile_path
+                    )
+            except ValueError as e:
+                if "not enough values to unpack" in str(e):
+                    print(
+                        "Failed due to existing bugs, re-run the code to generate memory profile. Please raise an issue if it persists."
+                    )
+            except Exception as e:
+                print(f"Error running memory profiler: {e}")
+                import traceback
+
+                traceback.print_exc()
 
         return result
     except Exception as e:
```
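`generate_model_profile` itself lives in `benchmarks/microbenchmarks/profiler.py`, which this excerpt does not show. For orientation, here is a hedged sketch of what such a helper plausibly looks like, assuming it wraps `torch.profiler` and exports the Chrome-trace JSON whose path the diff constructs; the body below is an assumption, not the repo's implementation:

```python
import os

import torch
from torch.profiler import ProfilerActivity, profile


def generate_model_profile_sketch(model, input_data, profile_file_path):
    """Hypothetical stand-in for generate_model_profile (assumption)."""
    # Ensure the target directory (e.g. <output_dir>/profiler/) exists.
    os.makedirs(os.path.dirname(profile_file_path) or ".", exist_ok=True)

    activities = [ProfilerActivity.CPU]
    if input_data.is_cuda:
        activities.append(ProfilerActivity.CUDA)

    # Profile a single forward pass and record operator input shapes.
    with profile(activities=activities, record_shapes=True) as prof:
        with torch.no_grad():
            model(input_data)

    # Chrome-trace JSON; open in chrome://tracing or Perfetto.
    prof.export_chrome_trace(profile_file_path)
    return profile_file_path
```

The switch to keyword arguments in the diff (`model=`, `input_data=`, `profile_file_path=`) keeps these call sites readable as the helper grows parameters.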
