conceptofmind · Mistobaan · Mar 10, 2023
diff --git a/.github/workflows/code_quality.yml b/.github/workflows/code_quality.yml
@@ -0,0 +1,13 @@
+name: Code Quality
+
+on: [pull_request]
+
+jobs:
+  code-quality:
+    runs-on: ubuntu-20.04
+    steps:
+      - uses: actions/checkout@v2
+      - uses: actions/setup-python@v2
+        with:
+          python-version: 3.8
+      - uses: pre-commit/action@v2.0.3
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,40 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+        - id: check-case-conflict
+        - id: check-json
+        - id: check-symlinks
+        - id: check-yaml
+        - id: destroyed-symlinks
+        - id: end-of-file-fixer
+          exclude: docs/CNAME
+        - id: fix-byte-order-marker
+        - id: fix-encoding-pragma
+          args: [--remove]
+        - id: mixed-line-ending
+          args: [--fix=lf]
+        - id: requirements-txt-fixer
+        - id: trailing-whitespace
+  - repo: https://github.com/psf/black
+    rev: 23.1.0
+    hooks:
+      - id: black
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        name: isort (python)
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.0.0
+    hooks:
+      - id: flake8
+  - repo: https://github.com/codespell-project/codespell
+    rev: v2.2.2
+    hooks:
+      - id: codespell
+        args: [--ignore-words, dictionary.txt, --skip, customToolformer/merges.txt, --skip, customToolformer/vocab.json]
+        additional_dependencies:
+          - tomli
diff --git a/configs/ds_configs/ds_config_gpt_j_z3.json b/configs/ds_configs/ds_config_gpt_j_z3.json
@@ -1,44 +1,44 @@
 {
 	"train_batch_size": "auto",
 	"fp16": {
-	  "enabled": "auto",
-	  "min_loss_scale": 1,
-	  "loss_scale_window": 1000,
-	  "hysteresis": 2,
-	  "initial_scale_power": 32
+		"enabled": "auto",
+		"min_loss_scale": 1,
+		"loss_scale_window": 1000,
+		"hysteresis": 2,
+		"initial_scale_power": 32
 	},
 	"bf16": {
 		"enabled": "auto"
 	},
 	"zero_optimization": {
-	  "stage": 3,
-	  "offload_param": {
-		"device": "none"
-	  },
-	  "offload_optimizer": {
-		"device": "none"
-	  },
-	  "allgather_partitions": true,
-	  "allgather_bucket_size": 5e8,
-	  "contiguous_gradients": true
+		"stage": 3,
+		"offload_param": {
+			"device": "none"
+		},
+		"offload_optimizer": {
+			"device": "none"
+		},
+		"allgather_partitions": true,
+		"allgather_bucket_size": 5e8,
+		"contiguous_gradients": true
 	},
 	"optimizer": {
-	  "type": "AdamW",
-	  "params": {
-		"lr": "auto",
-		"betas": [
-		  0.9,
-		  0.999
-		],
-		"eps": 1e-08
-	  }
+		"type": "AdamW",
+		"params": {
+			"lr": "auto",
+			"betas": [
+				0.9,
+				0.999
+			],
+			"eps": 1e-08
+		}
 	},
 	"scheduler": {
-	  "type": "WarmupLR",
-	  "params": {
-		"warmup_min_lr": 0,
-		"warmup_max_lr": "auto",
-		"warmup_num_steps": 100
-	  }
+		"type": "WarmupLR",
+		"params": {
+			"warmup_min_lr": 0,
+			"warmup_max_lr": "auto",
+			"warmup_num_steps": 100
+		}
 	}
-  }
+}
diff --git a/configs/trlx_configs/ds_config_trlx_gpt_j_z3.json b/configs/trlx_configs/ds_config_trlx_gpt_j_z3.json
@@ -0,0 +1 @@
+{}
diff --git a/data_generation/api_checker.py b/data_generation/api_checker.py
@@ -1,8 +1,9 @@
-from dataclasses import dataclass
-from transformers import PreTrainedTokenizerBase
-import dateutil.parser as dparser
 import random
 import re
+from dataclasses import dataclass
+
+import dateutil.parser as dparser
+from transformers import PreTrainedTokenizerBase
 
 
 @dataclass
@@ -32,11 +33,11 @@ def check_apis_available(
     available = AvailableAPIs()
     # In case we need a different version, found this here:
     # https://stackoverflow.com/questions/28198370/regex-for-validating-correct-input-for-calculator
-    calc_pattern = re.compile("^(\d+[\+\-\*\/]{1})+\d+$")
+    calc_pattern = re.compile(r"^(\d+[\+\-\*\/]{1})+\d+$")
     if len(tokenized_data) < 4096:
         available.retrieval = False
     try:
-        date = dparser.parse(data["url"], fuzzy=True)
+        dparser.parse(data["url"], fuzzy=True)
     except (ValueError, OverflowError):
         available.calendar = False
     available.calculator = False

diff --git a/data_generation/base_api.py b/data_generation/base_api.py
@@ -1,13 +1,13 @@
-import json
 from typing import List
+
 import torch
+from torch import nn
 from transformers import (
-    PreTrainedTokenizerBase,
-    pipeline,
     PreTrainedModel,
+    PreTrainedTokenizerBase,
     TextGenerationPipeline,
+    pipeline,
 )
-from torch import nn
 
 MAX_BATCH_SIZE = 1  # My 3090 is weak 😔
 N = 64  # SEQ Len
@@ -22,7 +22,7 @@ def __init__(
         minimum_percentage: float = 0.1,
     ):
         """
-        Base API Postprocesing class
+        Base API Postprocessing class
 
         :param start_tokens: token representation for [ or other tokens
         :param end_tokens:  token representation for ] or other tokens

diff --git a/data_generation/calculator.py b/data_generation/calculator.py
@@ -1,14 +1,11 @@
-import torch
-from transformers import (
-    PreTrainedTokenizerBase,
-    PreTrainedModel,
-)
-from tools import Calculator
-from prompts import calculator_prompt
 from typing import List
-from data_generation.base_api import APICallPostprocessing
-import dateutil.parser as dparser
 
+import torch
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
+
+from data_generation.base_api import APICallPostprocessing
+from prompts import calculator_prompt
+from tools import Calculator
 
 # TODO: Per API?
 MAX_BATCH_SIZE = 1  # My 3090 is weak 😔
@@ -68,7 +65,10 @@ def add_api_calls(
                     continue
                 if outputs[j]["Calculator"] is None:
                     continue
-                outputs[j]["Calculator_output"] = [outputs[j]["Calculator_text"][1:], str(outputs[j]["Calculator"])]
+                outputs[j]["Calculator_output"] = [
+                    outputs[j]["Calculator_text"][1:],
+                    str(outputs[j]["Calculator"]),
+                ]
                 outputs[j]["Calculator_text"] = (
                     outputs[j]["Calculator_text"]
                     + "->"
@@ -113,7 +113,7 @@ def parse_article(
     ):
         outputs = list()
         tokens = tokenizer(data["text"], return_tensors="pt")["input_ids"]
-        for i in range((tokens.shape[1]-1)//N):
+        for i in range((tokens.shape[1] - 1) // N):
             if (N * (i + 1)) > tokens.shape[1]:
                 continue
             input_tokens = tokens[:, (-N * (i + 1) - 1) : (-N * (i) - 1)]
@@ -145,5 +145,7 @@ def parse_article(
                 output["index"] += int(tokens.shape[1] + (-N * (i + 1)))
                 # filter by score
                 if output["Score"] > 0.0:
-                    outputs.append([output["Score"], output["index"]] + output["Calculator_output"])
+                    outputs.append(
+                        [output["Score"], output["index"]] + output["Calculator_output"]
+                    )
         return outputs
diff --git a/data_generation/calendar.py b/data_generation/calendar.py
@@ -1,14 +1,12 @@
-import torch
-from transformers import (
-    PreTrainedTokenizerBase,
-    PreTrainedModel,
-)
-from tools import Calendar
-from prompts import calendar_prompt
 from typing import List
-from data_generation.base_api import APICallPostprocessing
+
 import dateutil.parser as dparser
+import torch
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
 
+from data_generation.base_api import APICallPostprocessing
+from prompts import calendar_prompt
+from tools import Calendar
 
 # TODO: Per API?
 MAX_BATCH_SIZE = 1  # My 3090 is weak 😔
@@ -62,7 +60,10 @@ def add_api_calls(
                     return_tensors="pt",
                 )["input_ids"].cuda()
                 outputs[j]["Calendar"] = self.calendar(calendar_string)
-                outputs[j]["Calendar_output"] = [outputs[j]["Calendar_text"][1:], outputs[j]["Calendar"]]
+                outputs[j]["Calendar_output"] = [
+                    outputs[j]["Calendar_text"][1:],
+                    outputs[j]["Calendar"],
+                ]
                 outputs[j]["Calendar_text"] = (
                     outputs[j]["Calendar_text"] + "->" + outputs[j]["Calendar"] + "]"
                 )
@@ -104,15 +105,15 @@ def parse_article(
     ):
         outputs = list()
         tokens = tokenizer(data["text"], return_tensors="pt")["input_ids"]
-        for i in range((tokens.shape[1]-1)//N):
+        for i in range((tokens.shape[1] - 1) // N):
             if (N * (i + 1)) > tokens.shape[1]:
                 continue
             input_tokens = tokens[:, (-N * (i + 1) - 1) : (-N * (i) - 1)]
             labels = tokens[
                 :,
                 int(tokens.shape[1] + (-N * (i + 1))) : int(tokens.shape[1] + (-N * i)),
             ]
-            ret_tokens = tokens[:, : (-N * (i + 1) - 1)]
+            # ret_tokens = tokens[:, : (-N * (i + 1) - 1)]
             print(tokens.shape)
             string = tokenizer.decode(input_tokens[0])
             # print(ret_strings)
@@ -138,5 +139,7 @@ def parse_article(
                 output["index"] += int(tokens.shape[1] + (-N * (i + 1)))
                 # filter by score
                 if output["Score"] > 0.0:
-                    outputs.append([output["Score"], output["index"]] + output["Calendar_output"])
+                    outputs.append(
+                        [output["Score"], output["index"]] + output["Calendar_output"]
+                    )
         return outputs
diff --git a/data_generation/llmchain.py b/data_generation/llmchain.py
@@ -1,14 +1,11 @@
-import torch
-from transformers import (
-    PreTrainedTokenizerBase,
-    PreTrainedModel,
-)
-from tools import langchain_llmchain
-from prompts import llmchain_prompt
 from typing import List
-from data_generation.base_api import APICallPostprocessing
 
+import torch
+from transformers import PreTrainedModel, PreTrainedTokenizerBase
 
+from data_generation.base_api import APICallPostprocessing
+from prompts import llmchain_prompt
+from tools import langchain_llmchain
 
 # TODO: Per API?
 MAX_BATCH_SIZE = 1  # My 3090 is weak 😔
@@ -55,9 +52,9 @@ def add_api_calls(
                 )
                 if ")" in outputs[j]["LLMChain"]:
                     outputs[j]["LLMChain"] = outputs[j]["LLMChain"].split(")")[0]
-                if outputs[j]["LLMChain"][0] == "\"":
+                if outputs[j]["LLMChain"][0] == '"':
                     outputs[j]["LLMChain"] = outputs[j]["LLMChain"][1:]
-                if outputs[j]["LLMChain"][-1] == "\"":
+                if outputs[j]["LLMChain"][-1] == '"':
                     outputs[j]["LLMChain"] = outputs[j]["LLMChain"][:-1]
                 outputs[j]["LLMChain_text"] = (
                     "[LLMChain(" + outputs[j]["LLMChain"] + ")"
@@ -67,12 +64,12 @@ def add_api_calls(
                     return_tensors="pt",
                 )["input_ids"].cuda()
                 outputs[j]["LLMChain"] = str(self.llmchain(outputs[j]["LLMChain"]))
-                outputs[j]["LLMChain_output"] = [outputs[j]["LLMChain_text"][1:], outputs[j]["LLMChain"]]
+                outputs[j]["LLMChain_output"] = [
+                    outputs[j]["LLMChain_text"][1:],
+                    outputs[j]["LLMChain"],
+                ]
                 outputs[j]["LLMChain_text"] = (
-                    outputs[j]["LLMChain_text"]
-                    + "->"
-                    + outputs[j]["LLMChain"]
-                    + "]"
+                    outputs[j]["LLMChain_text"] + "->" + outputs[j]["LLMChain"] + "]"
                 )
                 test_inputs = tokenizer(
                     outputs[j]["LLMChain_text"] + "\n",
@@ -113,7 +110,7 @@ def parse_article(
         outputs = list()
         tokens = tokenizer(data["text"], return_tensors="pt")["input_ids"]
         start_step = 0
-        total_steps = tokens.shape[1]//N
+        total_steps = tokens.shape[1] // N
         for i in range(start_step, total_steps):
             input_tokens = tokens[:, (-N * (i + 1) - 1) : (-N * (i) - 1)]
             labels = tokens[
@@ -145,5 +142,7 @@ def parse_article(
                 output["index"] += int(tokens.shape[1] + (-N * (i + 1)))
                 # filter by score
                 if output["Score"] > 1.0:
-                    outputs.append([output["Score"], output["index"]] + output["LLMChain_output"])
+                    outputs.append(
+                        [output["Score"], output["index"]] + output["LLMChain_output"]
+                    )
         return outputs