Changes from all commits
134 commits
3bee7af
Fix C4 dataset loading by removing specific data file references
amitsrivastava78 Jul 4, 2025
a13c6c3
Update CUDA extension for latest PyTorch compatibility
amitsrivastava78 Jul 4, 2025
e12ab1b
Removed the c4 dataset
amitsrivastava78 Jul 4, 2025
c0a5f76
Removed the exit from load cmd
amitsrivastava78 Jul 5, 2025
8980e47
added back exit
amitsrivastava78 Jul 5, 2025
a873340
Added simple Quant and GPTQ separately via cmdline
amitsrivastava78 Jul 7, 2025
376c402
Added simple Quant and GPTQ separately via cmdline part 1
amitsrivastava78 Jul 7, 2025
0826e61
Added simple Quant and GPTQ separately via cmdline part 2
amitsrivastava78 Jul 7, 2025
138a8c7
Ported OPT quantization and evaluation to tf-keras, added calibration…
amitsrivastava78 Jul 8, 2025
df79797
added debug prints
amitsrivastava78 Jul 9, 2025
1b71dee
Ported files to tf directly
amitsrivastava78 Jul 9, 2025
7ad31a9
Fixed TF issue
amitsrivastava78 Jul 9, 2025
74f7c6d
Debug statements for perplexity score of 65
amitsrivastava78 Jul 9, 2025
5d796f7
Fixed perplexity issue
amitsrivastava78 Jul 9, 2025
5bda33d
Fixed perplexity issue part 1
amitsrivastava78 Jul 9, 2025
8e25846
Added original TF model opt
amitsrivastava78 Jul 9, 2025
e99eb52
Added same log in pytorch and tf impl
amitsrivastava78 Jul 9, 2025
273bca3
Fix bug while logging in opt.py
amitsrivastava78 Jul 9, 2025
d5cb7a5
Fix bug while logging in opt.py part 2
amitsrivastava78 Jul 9, 2025
fa969da
Fix error in TF as model is different
amitsrivastava78 Jul 9, 2025
b02323c
Fix error in TF as model is different Part 2
amitsrivastava78 Jul 9, 2025
47b4f01
Fix error in identifying the Dense Layer
amitsrivastava78 Jul 9, 2025
5ef6e61
Fix error in identifying the Dense Layer Part 2
amitsrivastava78 Jul 9, 2025
f6068e2
Fix error in identifying the Dense Layer Part 3
amitsrivastava78 Jul 9, 2025
0b141ae
Fix error in identifying the Dense Layer Part 4
amitsrivastava78 Jul 9, 2025
25df8c6
Fix error in identifying the Dense Layer Part 5
amitsrivastava78 Jul 9, 2025
06cf6e4
Fix error in identifying the Dense Layer Part 6
amitsrivastava78 Jul 9, 2025
ceea748
Fix error in identifying the Dense Layer Part 7
amitsrivastava78 Jul 9, 2025
958c04d
Fix error in identifying the Dense Layer Part 8
amitsrivastava78 Jul 9, 2025
4553d33
Fix input collection
amitsrivastava78 Jul 9, 2025
06dfb72
Fix input collection part 1
amitsrivastava78 Jul 9, 2025
634658a
Fix input collection part 2
amitsrivastava78 Jul 9, 2025
1a48dc9
Fix input collection part 3
amitsrivastava78 Jul 9, 2025
61e63ea
Fix no quantization weights
amitsrivastava78 Jul 9, 2025
e85ba1f
Fix no quantization weights Part 2
amitsrivastava78 Jul 9, 2025
ac9c36d
Fix no quantization weights Part 3
amitsrivastava78 Jul 9, 2025
1da9127
Fix no quantization weights Part 4
amitsrivastava78 Jul 9, 2025
cfce07c
Fix no quantization weights Part 5
amitsrivastava78 Jul 9, 2025
3f6d0b8
Fix only fc1 and fc2
amitsrivastava78 Jul 9, 2025
f7c0293
Fix only fc1 and fc2 Part 2
amitsrivastava78 Jul 9, 2025
c5379fe
Fix only fc1 and fc2 Part 3
amitsrivastava78 Jul 9, 2025
6ef8646
Fix only fc1 and fc2 Part 4
amitsrivastava78 Jul 9, 2025
436da1b
Fix only fc1 and fc2 Part 5
amitsrivastava78 Jul 9, 2025
6b70cde
Fix gptqkeras logic
amitsrivastava78 Jul 9, 2025
dfb314a
Fix gptqkeras logic Part 2
amitsrivastava78 Jul 9, 2025
922b22a
Fix gptqkeras logic Part 3
amitsrivastava78 Jul 9, 2025
6cdb8b1
Fix gptqkeras logic Part 4
amitsrivastava78 Jul 9, 2025
332d068
Fix gptqkeras logic Part 5
amitsrivastava78 Jul 9, 2025
3751dcb
Fix gptqkeras logic Part 6
amitsrivastava78 Jul 9, 2025
a3ee146
Fix gptqkeras logic Part 7
amitsrivastava78 Jul 9, 2025
dcdf904
Fix gptqkeras logic Part 8
amitsrivastava78 Jul 9, 2025
ceaff41
Fix gptqkeras logic Part 9
amitsrivastava78 Jul 9, 2025
53bbe2a
Fix gptqkeras logic Part 10
amitsrivastava78 Jul 9, 2025
93e35bb
Fix gptqkeras logic Part 11
amitsrivastava78 Jul 9, 2025
2168a18
Fix Quantization update error
amitsrivastava78 Jul 9, 2025
3b5d557
Fix Quantization update error Part 2
amitsrivastava78 Jul 9, 2025
ba9408c
Fix Quantization update error Part 3
amitsrivastava78 Jul 9, 2025
f3ebbb5
Fix Quantization update error Part 4
amitsrivastava78 Jul 9, 2025
8d6704b
Fix Quantization update error Part 5
amitsrivastava78 Jul 9, 2025
f409520
Fix Quantization update error Part 5
amitsrivastava78 Jul 9, 2025
2094b44
Fix Quantization update error Part 6
amitsrivastava78 Jul 9, 2025
5b458e5
Fix Quantization update error Part 7
amitsrivastava78 Jul 9, 2025
06c594f
Fix Quantization update error Part 8
amitsrivastava78 Jul 9, 2025
4d77675
Fix Quantization update error Part 9
amitsrivastava78 Jul 9, 2025
56c8e80
Fix matrix shape warning
amitsrivastava78 Jul 9, 2025
569382e
Fix matrix shape warning Part 1
amitsrivastava78 Jul 9, 2025
9e79dd3
Fix matrix shape warning Part 2
amitsrivastava78 Jul 9, 2025
ace94b6
Fix matrix shape warning Part 3
amitsrivastava78 Jul 9, 2025
6e3d160
Fix matrix shape warning Part 4
amitsrivastava78 Jul 9, 2025
7453efb
Fix matrix shape warning Part 5
amitsrivastava78 Jul 9, 2025
054bf91
Quantize all Dense layers
amitsrivastava78 Jul 9, 2025
bd3364b
Quantize all Dense layers Part 1
amitsrivastava78 Jul 9, 2025
b1c7023
Quantize all Dense layers Part 2
amitsrivastava78 Jul 9, 2025
077db0b
Quantize all Dense layers Part 3
amitsrivastava78 Jul 9, 2025
9e278ca
Trying to fix the shape issue
amitsrivastava78 Jul 9, 2025
a93a074
Trying to fix the shape issue Part 1
amitsrivastava78 Jul 9, 2025
c83597e
Trying to fix the shape issue Part 2
amitsrivastava78 Jul 9, 2025
c3691c4
Trying to fix the shape issue Part 3
amitsrivastava78 Jul 9, 2025
d9ba0f6
Trying to fix the shape issue Part 4
amitsrivastava78 Jul 9, 2025
0b76f46
Trying to fix the shape issue Part 5
amitsrivastava78 Jul 9, 2025
5006e3b
Trying to fix the shape issue Part 6
amitsrivastava78 Jul 9, 2025
f97f6fc
Trying to fix the shape issue Part 7
amitsrivastava78 Jul 9, 2025
1ccf972
Trying to fix the shape issue Part 8
amitsrivastava78 Jul 9, 2025
8f94e9b
Trying to fix the shape issue Part 9
amitsrivastava78 Jul 9, 2025
a376ad0
Trying to fix the shape issue Part 10
amitsrivastava78 Jul 9, 2025
5041aca
Trying to fix the shape issue Part 11
amitsrivastava78 Jul 9, 2025
d009335
Fix No calibration data issue
amitsrivastava78 Jul 9, 2025
405b2ef
Fix No calibration data issue Part 1
amitsrivastava78 Jul 9, 2025
3cfc69b
Fix No calibration data issue Part 2
amitsrivastava78 Jul 9, 2025
4f01d92
Added new impl for TF model load and dataloader
amitsrivastava78 Jul 10, 2025
4c0bfed
Trying to fix tf add_batch in gptqkeras.py
amitsrivastava78 Jul 10, 2025
2665637
Trying to fix tf add_batch in gptqkeras.py part 2
amitsrivastava78 Jul 10, 2025
004e9e1
Trying to fix tf add_batch in gptqkeras.py part 3
amitsrivastava78 Jul 10, 2025
7bc5fcf
Trying to fix tf add_batch in gptqkeras.py part 4
amitsrivastava78 Jul 10, 2025
1fb2325
Trying to fix tf add_batch in gptqkeras.py part 5
amitsrivastava78 Jul 10, 2025
6a23d50
Trying to fix tf add_batch in gptqkeras.py part 6
amitsrivastava78 Jul 10, 2025
59c540f
Trying to fix tf add_batch in gptqkeras.py part 7
amitsrivastava78 Jul 10, 2025
3757377
Trying to fix tf add_batch in gptqkeras.py part 8
amitsrivastava78 Jul 10, 2025
e85620d
Trying to fix tf add_batch in gptqkeras.py part 9
amitsrivastava78 Jul 10, 2025
bee8baa
Fixing no sample issue
amitsrivastava78 Jul 10, 2025
c2bf289
Hessian matrix shape print
amitsrivastava78 Jul 10, 2025
01f99e1
Hessian matrix shape print part 1
amitsrivastava78 Jul 10, 2025
04b1e68
Hessian matrix shape print part 2
amitsrivastava78 Jul 10, 2025
79bebdc
Fixed Hessian matrix
amitsrivastava78 Jul 10, 2025
3a8bc61
Fixed Hessian matrix
amitsrivastava78 Jul 10, 2025
d73c7ca
No Quant error
amitsrivastava78 Jul 10, 2025
4b27722
No Quant error part 1
amitsrivastava78 Jul 10, 2025
11a9171
Refactor the code
amitsrivastava78 Jul 10, 2025
63723dd
Added Entry and Exit prints
amitsrivastava78 Jul 11, 2025
4f6f36e
Fix No Quant weights found issue
amitsrivastava78 Jul 11, 2025
b9c1522
Fix No Quant weights found issue Part 1
amitsrivastava78 Jul 11, 2025
27778cb
Fix No Quant weights found issue Part 2
amitsrivastava78 Jul 11, 2025
b4c8ce5
Added exit after All Quant
amitsrivastava78 Jul 11, 2025
6f5a6bc
Added exit after All Quant Part 1
amitsrivastava78 Jul 11, 2025
4debaa0
Added exit after All Quant Part 2
amitsrivastava78 Jul 11, 2025
699ca69
Align with pytorch prints
amitsrivastava78 Jul 11, 2025
d8bcdc6
Align with pytorch prints Part 1
amitsrivastava78 Jul 11, 2025
a7cb137
Align with pytorch prints Part 2
amitsrivastava78 Jul 11, 2025
0dd2f90
Align with pytorch prints Part 3
amitsrivastava78 Jul 11, 2025
712c9cd
Align Quantizer count
amitsrivastava78 Jul 11, 2025
ef5c9de
Continue flow to final model perplexity score
amitsrivastava78 Jul 11, 2025
dda4e58
Fix last tester code
amitsrivastava78 Jul 11, 2025
bfd5656
Fix last tester code Part 1
amitsrivastava78 Jul 11, 2025
fdd618c
Fix last tester code Part 2
amitsrivastava78 Jul 11, 2025
85a96e0
Fix last tester code Part 3
amitsrivastava78 Jul 11, 2025
be5de00
Fix last tester code Part 4
amitsrivastava78 Jul 11, 2025
a18e609
Fix last tester code Part 5
amitsrivastava78 Jul 11, 2025
97a9496
Fix last tester code Part 6
amitsrivastava78 Jul 11, 2025
6aec53f
Fix last tester code Part 7
amitsrivastava78 Jul 11, 2025
2f6557b
Fix last tester code Part 8
amitsrivastava78 Jul 11, 2025
ab5464c
All working ppl score high
amitsrivastava78 Jul 11, 2025
5085a94
Fix error issue
amitsrivastava78 Jul 11, 2025
1a7c0d2
reverting gptq fix done by mistake
amitsrivastava78 Jul 11, 2025
003f746
fix datautils.py
amitsrivastava78 Jul 11, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -5,3 +5,4 @@ opt175b
 *.txt
 *.pt
 *egg-info*
+.DS_Store
126 changes: 126 additions & 0 deletions Amitopt.py
@@ -0,0 +1,126 @@
# main.py
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFOPTForCausalLM

def get_wikitext2(tokenizer, sequence_length=128, batch_size=8):
    """
    Loads and processes the wikitext-2-raw-v1 dataset.

    Args:
        tokenizer: The tokenizer to use for encoding the text.
        sequence_length (int): The fixed length of sequences.
        batch_size (int): The batch size for the DataLoader.

    Returns:
        A tf.data.Dataset object ready for training.
    """
    print("Loading wikitext-2 dataset...")
    # Load the training split
    train_dataset = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

    # Filter out empty lines
    train_dataset = train_dataset.filter(lambda example: example['text'] != '')
    print(f"Number of examples after filtering: {len(train_dataset)}")

    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], return_tensors="tf", padding='max_length', truncation=True, max_length=sequence_length)

    tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Convert to a TensorFlow DataLoader (tf.data.Dataset)
    # For language modeling, the input_ids are used as both input and label.
    tf_dataset = tokenized_dataset.to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=['input_ids'],  # Use input_ids as the label
        shuffle=True,
        batch_size=batch_size,
        collate_fn=None  # Use default collation
    )

    print("Wikitext-2 dataset converted to TensorFlow DataLoader.")
    return tf_dataset

def get_ptb(tokenizer, sequence_length=128, batch_size=8):
    """
    Loads and processes the Penn Treebank (PTB) dataset directly from its source URL.

    Args:
        tokenizer: The tokenizer to use for encoding the text.
        sequence_length (int): The fixed length of sequences.
        batch_size (int): The batch size for the DataLoader.

    Returns:
        A tf.data.Dataset object ready for training.
    """
    print("\nLoading PTB dataset...")
    # We load the data directly from its source URL using the generic 'text' loader.
    data_files = {"train": "https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt"}
    train_dataset = load_dataset("text", data_files=data_files, split="train")

    # Filter out empty lines (the 'text' loader creates a 'text' column)
    train_dataset = train_dataset.filter(lambda example: example['text'] != '')
    print(f"Number of examples after filtering: {len(train_dataset)}")

    # Tokenize the dataset
    def tokenize_function(examples):
        return tokenizer(examples["text"], return_tensors="tf", padding='max_length', truncation=True, max_length=sequence_length)

    tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

    # Convert to a TensorFlow DataLoader (tf.data.Dataset)
    tf_dataset = tokenized_dataset.to_tf_dataset(
        columns=['input_ids', 'attention_mask'],
        label_cols=['input_ids'],  # Use input_ids as the label
        shuffle=True,
        batch_size=batch_size,
        collate_fn=None  # Use default collation
    )

    print("PTB dataset converted to TensorFlow DataLoader.")
    return tf_dataset

def get_opt_125m_tf():
    """
    Loads the facebook/opt-125m model and tokenizer for TensorFlow.

    Returns:
        A tuple containing the loaded model and tokenizer.
    """
    print("\nLoading facebook/opt-125m for TensorFlow...")
    model_name = "facebook/opt-125m"
    # Note the use of TFOPTForCausalLM for TensorFlow
    model = TFOPTForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Model and tokenizer loaded.")
    return model, tokenizer

if __name__ == "__main__":
    # Define a batch size
    BATCH_SIZE = 4

    # 1. Load the TensorFlow model and tokenizer
    opt_model, opt_tokenizer = get_opt_125m_tf()

    # 2. Load and process the datasets into TensorFlow DataLoaders
    wikitext_dataloader = get_wikitext2(opt_tokenizer, batch_size=BATCH_SIZE)
    ptb_dataloader = get_ptb(opt_tokenizer, batch_size=BATCH_SIZE)

    # 3. Print some information to verify
    print("\n--- Verification ---")
    print(f"Model Class: {opt_model.__class__.__name__}")
    print(f"Tokenizer Class: {opt_tokenizer.__class__.__name__}")

    # Take one batch from each dataloader to show the structure
    print("\nSample batch from Wikitext-2 DataLoader:")
    for inputs, labels in wikitext_dataloader.take(1):
        print("Inputs (input_ids) shape:", inputs['input_ids'].shape)
        print("Inputs (attention_mask) shape:", inputs['attention_mask'].shape)
        print("Labels shape:", labels.shape)

    print("\nSample batch from PTB DataLoader:")
    for inputs, labels in ptb_dataloader.take(1):
        print("Inputs (input_ids) shape:", inputs['input_ids'].shape)
        print("Inputs (attention_mask) shape:", inputs['attention_mask'].shape)
        print("Labels shape:", labels.shape)
64 changes: 47 additions & 17 deletions datautils.py
@@ -31,13 +31,30 @@ def get_wikitext2(nsamples, seed, seqlen, model):

 def get_ptb(nsamples, seed, seqlen, model):
     from datasets import load_dataset
-    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
-    valdata = load_dataset('ptb_text_only', 'penn_treebank', split='validation')
-
     from transformers import AutoTokenizer

+    try:
+        # Try the new way first
+        traindata = load_dataset('ptb-text-only/ptb_text_only', split='train')
+        valdata = load_dataset('ptb-text-only/ptb_text_only', split='validation')
+        text_field = 'sentence'
+    except Exception as e1:
+        try:
+            # Try alternative dataset
+            traindata = load_dataset('ptb_text_only', split='train')
+            valdata = load_dataset('ptb_text_only', split='validation')
+            text_field = 'sentence'
+        except Exception as e2:
+            print(f"PTB dataset not available. Using WikiText-2 as fallback.")
+            print(f"Original errors: {e1}, {e2}")
+            # Fallback to WikiText-2
+            traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
+            valdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
+            text_field = 'text'
+
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    trainenc = tokenizer("\n\n".join(traindata['sentence']), return_tensors='pt')
-    testenc = tokenizer("\n\n".join(valdata['sentence']), return_tensors='pt')
+    trainenc = tokenizer("\n\n".join(traindata[text_field]), return_tensors='pt')
+    testenc = tokenizer("\n\n".join(valdata[text_field]), return_tensors='pt')

     import random
     random.seed(seed)
@@ -53,12 +70,8 @@ def get_ptb(nsamples, seed, seqlen, model):

 def get_c4(nsamples, seed, seqlen, model):
     from datasets import load_dataset
-    traindata = load_dataset(
-        'allenai/c4', 'allenai--c4', data_files={'train': 'en/c4-train.00000-of-01024.json.gz'}, split='train'
-    )
-    valdata = load_dataset(
-        'allenai/c4', 'allenai--c4', data_files={'validation': 'en/c4-validation.00000-of-00008.json.gz'}, split='validation'
-    )
+    traindata = load_dataset('allenai/c4', 'en', split='train')
+    valdata = load_dataset('allenai/c4', 'en', split='validation')

     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
@@ -97,17 +110,34 @@ def __init__(self, input_ids):
             self.input_ids = input_ids
     valenc = TokenizerWrapper(valenc)

-    return trainloader, valenc
+    return trainloader, valenc

 def get_ptb_new(nsamples, seed, seqlen, model):
     from datasets import load_dataset
-    traindata = load_dataset('ptb_text_only', 'penn_treebank', split='train')
-    testdata = load_dataset('ptb_text_only', 'penn_treebank', split='test')
-
     from transformers import AutoTokenizer

+    try:
+        # Try the new way first
+        traindata = load_dataset('ptb-text-only/ptb_text_only', split='train')
+        testdata = load_dataset('ptb-text-only/ptb_text_only', split='test')
+        text_field = 'sentence'
+    except Exception as e1:
+        try:
+            # Try alternative dataset
+            traindata = load_dataset('ptb_text_only', split='train')
+            testdata = load_dataset('ptb_text_only', split='test')
+            text_field = 'sentence'
+        except Exception as e2:
+            print(f"PTB dataset not available. Using WikiText-2 as fallback.")
+            print(f"Original errors: {e1}, {e2}")
+            # Fallback to WikiText-2
+            traindata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
+            testdata = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
+            text_field = 'text'
+
     tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)
-    trainenc = tokenizer(" ".join(traindata['sentence']), return_tensors='pt')
-    testenc = tokenizer(" ".join(testdata['sentence']), return_tensors='pt')
+    trainenc = tokenizer(" ".join(traindata[text_field]), return_tensors='pt')
+    testenc = tokenizer(" ".join(testdata[text_field]), return_tensors='pt')

     import random
     random.seed(seed)
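For context on what these loaders feed: GPTQ-style calibration draws nsamples random seqlen-token windows from one long tokenized stream. The collapsed portion of this diff is not shown, so the following is only a sketch of the common pattern (the helper name sample_calibration is ours):

import random
import torch

def sample_calibration(trainenc, nsamples, seqlen, seed):
    # Draw random fixed-length windows from the tokenized training text.
    random.seed(seed)
    loader = []
    for _ in range(nsamples):
        i = random.randint(0, trainenc.input_ids.shape[1] - seqlen - 1)
        inp = trainenc.input_ids[:, i:i + seqlen]
        tar = inp.clone()
        tar[:, :-1] = -100  # mask all but the last position in the target
        loader.append((inp, tar))
    return loader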
8 changes: 6 additions & 2 deletions gptq.py
@@ -148,7 +148,9 @@ def fasterquant(
                 print(torch.sum((self.layer(self.inp1) - self.out1) ** 2))
                 print(torch.sum(Losses))

-        torch.cuda.synchronize()
+        # Synchronize only if CUDA is available
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
         print('time %.2f' % (time.time() - tick))
         print('error', torch.sum(Losses).item())

@@ -168,4 +170,6 @@ def free(self):
         self.H = None
         self.Losses = None
         self.Trace = None
-        torch.cuda.empty_cache()
+        # Clear cache only if CUDA is available
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
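
The same is-CUDA-available guard now appears in both hunks; a small helper would keep it in one place. A sketch under our own naming, not part of this PR:

import torch

def cuda_cleanup(synchronize=True, empty_cache=False):
    # Both calls are skipped on CPU-only hosts, mirroring the guards above.
    if not torch.cuda.is_available():
        return
    if synchronize:
        torch.cuda.synchronize()
    if empty_cache:
        torch.cuda.empty_cache()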