@@ -269,6 +269,50 @@ def noop(*args, **kwargs):

     return model

+def simple_quantize_weights(weights, bits=4):
+    """Simple quantization function that rounds weights to the specified number of bits."""
+    w_min = weights.min()
+    w_max = weights.max()
+
+    # Calculate scale and zero point for the specified bit width
+    max_val = (2 ** bits) - 1
+    scale = (w_max - w_min) / max_val
+    zero_point = w_min
+
+    # Quantize
+    quantized = torch.round((weights - zero_point) / scale)
+    quantized = torch.clamp(quantized, 0, max_val)
+
+    return quantized.int(), scale, zero_point
+
+def simple_quantize_model(model, bits=4):
+    """Apply simple quantization to all linear layers in the model."""
+    print(f"Applying simple {bits}-bit quantization...")
+
+    for name, module in model.named_modules():
+        if isinstance(module, nn.Linear):
+            print(f"Quantizing {name}...")
+
+            # Quantize weights
+            quantized_weights, scale, zero_point = simple_quantize_weights(module.weight.data, bits)
+
+            # Store quantization parameters
+            module.register_buffer('weight_scale', torch.tensor(scale))
+            module.register_buffer('weight_zero_point', torch.tensor(zero_point))
+            module.register_buffer('weight_quantized', quantized_weights)
+
+            # Override forward method
+            def make_forward(module, scale, zero_point, quantized_weights):
+                def forward(x):
+                    # Dequantize weights on-the-fly
+                    dequantized_weights = quantized_weights.float() * scale + zero_point
+                    return nn.functional.linear(x, dequantized_weights, module.bias)
+                return forward
+
+            module.forward = make_forward(module, scale, zero_point, quantized_weights)
+
+    print("Simple quantization completed!")
+
 def opt_multigpu(model, gpus):
     model.model.decoder.embed_tokens = model.model.decoder.embed_tokens.to(gpus[0])
     model.model.decoder.embed_positions = model.model.decoder.embed_positions.to(gpus[0])
@@ -307,7 +351,8 @@ def forward(self, *inp, **kwargs):

 def benchmark(model, input_ids, check=False):
     input_ids = input_ids.to(model.gpus[0] if hasattr(model, 'gpus') else DEV)
-    torch.cuda.synchronize()
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()

     cache = {'past': None}
     def clear_past(i):
@@ -327,9 +372,11 @@ def tmp(layer, inp, out):
     def sync():
         if hasattr(model, 'gpus'):
             for gpu in model.gpus:
-                torch.cuda.synchronize(gpu)
+                if torch.cuda.is_available():
+                    torch.cuda.synchronize(gpu)
         else:
-            torch.cuda.synchronize()
+            if torch.cuda.is_available():
+                torch.cuda.synchronize()
     with torch.no_grad():
         attention_mask = torch.ones((1, input_ids.numel()), device=DEV)
         times = []
@@ -432,6 +479,10 @@ def sync():
         '--static-groups', action='store_true',
         help='Whether to use static groups; recommended when using `--actorder` for more efficient inference.'
     )
+    parser.add_argument(
+        '--quantization-type', choices=['gptq', 'simple'], default='gptq',
+        help='Type of quantization to use: gptq (sophisticated) or simple (basic rounding)'
+    )

     args = parser.parse_args()

@@ -446,16 +497,28 @@ def sync():
     )

     if args.wbits < 16 and not args.nearest:
-        tick = time.time()
-        quantizers = opt_sequential(model, dataloader, DEV)
-        print(time.time() - tick)
+        if args.quantization_type == 'gptq':
+            print("Using GPTQ quantization...")
+            tick = time.time()
+            quantizers = opt_sequential(model, dataloader, DEV)
+            print(time.time() - tick)
+        elif args.quantization_type == 'simple':
+            print("Using simple quantization...")
+            simple_quantize_model(model, args.wbits)
+            quantizers = {}  # Empty dict for simple quantization

     if args.benchmark:
-        gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
-        if len(gpus) > 1:
-            opt_multigpu(model, gpus)
-        else:
-            model = model.to(DEV)
+        try:
+            gpus = [torch.device('cuda:%d' % i) for i in range(torch.cuda.device_count())]
+            if len(gpus) > 1:
+                opt_multigpu(model, gpus)
+            else:
+                model = model.to(DEV)
+        except (AssertionError, RuntimeError):
+            print("CUDA not available, using CPU for benchmarking...")
+            model = model.to('cpu')
+            DEV = torch.device('cpu')
+
         if args.benchmark:
             input_ids = next(iter(dataloader))[0][:, :args.benchmark]
             benchmark(model, input_ids, check=args.check)
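
Not part of the diff above, but a quick standalone sketch of what the new `simple_quantize_weights` scheme does: each weight tensor is mapped onto the 2**bits evenly spaced levels between its min and max, and the patched `forward()` reconstructs weights as `q * scale + zero_point`. The snippet below re-implements that mapping outside `opt.py`; the function name, the random test tensor, and the `scale/2` error bound are illustrative and not taken from the PR.

```python
import torch

def minmax_quantize(weights, bits=4):
    # Same affine scheme as simple_quantize_weights in the patch:
    # integer q in [0, 2**bits - 1], with w ~= q * scale + zero_point.
    w_min, w_max = weights.min(), weights.max()
    max_val = (2 ** bits) - 1
    scale = (w_max - w_min) / max_val
    zero_point = w_min
    q = torch.clamp(torch.round((weights - zero_point) / scale), 0, max_val)
    return q.int(), scale, zero_point

w = torch.randn(4, 8)
q, scale, zero_point = minmax_quantize(w, bits=4)
w_hat = q.float() * scale + zero_point  # dequantize, as the overridden forward() does
print("max abs error:", (w - w_hat).abs().max().item(),
      "<= scale/2 =", (scale / 2).item())
```

Because the scale and zero point are computed once per tensor, a single outlier weight widens the quantization step for the whole layer, which is one reason accuracy from the two `--quantization-type` modes should not be expected to match.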