Commit

add dpo mix dataset

hiyouga committed Apr 19, 2024
1 parent ba559a6 commit 6339ede
Showing 5 changed files with 81 additions and 105 deletions.
31 changes: 10 additions & 21 deletions data/belle_multiturn/belle_multiturn.py
@@ -1,5 +1,6 @@
-import os
 import json
+import os
 
 import datasets
 
+
@@ -22,31 +23,19 @@
 
 
 class BelleMultiturn(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str):
         with open(filepath, "r", encoding="utf-8") as f:
@@ -58,7 +47,7 @@ def _generate_examples(self, filepath: str):
 
                 assist_idx = prompt.rfind("Assistant:")
                 human_idx = prompt.rfind("Human:")
-                query = prompt[human_idx+6:assist_idx].strip()
+                query = prompt[human_idx + 6 : assist_idx].strip()
                 prompt = prompt[:human_idx].strip()
                 conversations.insert(0, {"from": "gpt", "value": response})
                 conversations.insert(0, {"from": "human", "value": query})
@@ -67,8 +56,8 @@ def _generate_examples(self, filepath: str):
                     assist_idx = prompt.rfind("Assistant:")
                     human_idx = prompt.rfind("Human:")
                     if human_idx != -1:
-                        old_query = prompt[human_idx+6:assist_idx].strip()
-                        old_resp = prompt[assist_idx+10:].strip()
+                        old_query = prompt[human_idx + 6 : assist_idx].strip()
+                        old_resp = prompt[assist_idx + 10 :].strip()
                         conversations.insert(0, {"from": "gpt", "value": old_resp})
                         conversations.insert(0, {"from": "human", "value": old_query})
                     else:
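
The refactor above is formatting-only; the turn-splitting logic is unchanged. It walks the raw prompt backwards with rfind, and the offsets 6 and 10 are len("Human:") and len("Assistant:"). A standalone sketch of the same loop on a made-up BELLE-style transcript (illustrative, not taken from the dataset):

def parse_turns(prompt: str, response: str) -> list:
    # Offsets 6 and 10 are len("Human:") and len("Assistant:").
    assist_idx = prompt.rfind("Assistant:")
    human_idx = prompt.rfind("Human:")
    conversations = [
        {"from": "human", "value": prompt[human_idx + 6 : assist_idx].strip()},
        {"from": "gpt", "value": response},
    ]
    prompt = prompt[:human_idx].strip()
    while prompt.rfind("Assistant:") != -1:  # peel off earlier turns, newest first
        assist_idx = prompt.rfind("Assistant:")
        human_idx = prompt.rfind("Human:")
        if human_idx == -1:
            break
        turn = [
            {"from": "human", "value": prompt[human_idx + 6 : assist_idx].strip()},
            {"from": "gpt", "value": prompt[assist_idx + 10 :].strip()},
        ]
        conversations = turn + conversations
        prompt = prompt[:human_idx].strip()
    return conversations

# parse_turns("Human: hi Assistant: hello Human: and you? Assistant:", "fine")
# -> human/gpt turns in chronological order, ending with ("and you?", "fine")
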
22 changes: 22 additions & 0 deletions data/dataset_info.json
@@ -318,6 +318,28 @@
     "ms_hub_url": "AI-ModelScope/RLAIF-Nectar",
     "ranking": true
   },
+  "dpo_mix_en": {
+    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
+    "subset": "en",
+    "ranking": true,
+    "columns": {
+      "prompt": "prompt",
+      "response": "answer",
+      "system": "system",
+      "history": "history"
+    }
+  },
+  "dpo_mix_zh": {
+    "hf_hub_url": "hiyouga/DPO-En-Zh-20k",
+    "subset": "zh",
+    "ranking": true,
+    "columns": {
+      "prompt": "prompt",
+      "response": "answer",
+      "system": "system",
+      "history": "history"
+    }
+  },
   "orca_dpo_de": {
     "hf_hub_url": "mayflowergmbh/intel_orca_dpo_pairs_de",
     "ranking": true
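
The two new entries register the en and zh subsets of hiyouga/DPO-En-Zh-20k as preference data ("ranking": true) and map the repo's "answer" column onto the framework's "response" field. As a sanity check, the underlying dataset can be pulled directly with the datasets library; a hedged sketch, assuming the Hub repo exposes configs named after the "subset" values:

from datasets import load_dataset

# Assumption: the Hub repo provides "en"/"zh" configs matching the "subset" field above.
ds = load_dataset("hiyouga/DPO-En-Zh-20k", "en", split="train")
print(ds.column_names)  # expected per the column mapping: prompt, answer, system, history
print(ds[0]["prompt"])
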
33 changes: 12 additions & 21 deletions data/example_dataset/example_dataset.py
@@ -1,7 +1,8 @@
 import json
-import datasets
 from typing import Any, Dict, Generator, List, Tuple
 
+import datasets
+
 
 _DESCRIPTION = "An example of dataset."
 _CITATION = ""
@@ -11,34 +12,24 @@
 
 
 class ExampleDataset(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features({
-            "instruction": datasets.Value("string"),
-            "input": datasets.Value("string"),
-            "output": datasets.Value("string"),
-            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
-        })
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "input": datasets.Value("string"),
+                "output": datasets.Value("string"),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager) -> List[datasets.SplitGenerator]:
         file_path = dl_manager.download(_URL)
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepath": file_path
-                }
-            )
-        ]
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": file_path})]
 
     def _generate_examples(self, filepath: str) -> Generator[Tuple[int, Dict[str, Any]], None, None]:
         example_dataset = json.load(open(filepath, "r", encoding="utf-8"))
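
Given the features declared in _info, the JSON file this loader reads must be a list of records carrying instruction/input/output strings plus a history of earlier [query, response] pairs. An illustrative record (values made up, not from the repo), which the builder would then expose via datasets.load_dataset pointed at the script path:

# Illustrative record matching the declared features (values are made up):
record = {
    "instruction": "Translate the input to English.",
    "input": "Bonjour",
    "output": "Hello",
    "history": [["Who are you?", "A helpful assistant."]],  # earlier [query, response] turns
}
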
61 changes: 23 additions & 38 deletions data/hh_rlhf_en/hh_rlhf_en.py
@@ -1,8 +1,10 @@
-import os
 import json
-import datasets
+import os
 from typing import List
 
+import datasets
+
+
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 _DESCRIPTION = "Human preference data about helpfulness and harmlessness."
 _CITATION = ""
@@ -14,50 +16,37 @@
         _URL + "harmless-base/train.jsonl.gz",
         _URL + "helpful-base/train.jsonl.gz",
         _URL + "helpful-online/train.jsonl.gz",
-        _URL + "helpful-rejection-sampled/train.jsonl.gz"
+        _URL + "helpful-rejection-sampled/train.jsonl.gz",
     ],
     "test": [
         _URL + "harmless-base/test.jsonl.gz",
         _URL + "helpful-base/test.jsonl.gz",
         _URL + "helpful-online/test.jsonl.gz",
-        _URL + "helpful-rejection-sampled/test.jsonl.gz"
-    ]
+        _URL + "helpful-rejection-sampled/test.jsonl.gz",
+    ],
 }
 
 
 class HhRlhfEn(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self) -> datasets.DatasetInfo:
-        features = datasets.Features({
-            "instruction": datasets.Value("string"),
-            "output": datasets.Sequence(datasets.Value("string")),
-            "history": datasets.Sequence(datasets.Sequence(datasets.Value("string")))
-        })
+        features = datasets.Features(
+            {
+                "instruction": datasets.Value("string"),
+                "output": datasets.Sequence(datasets.Value("string")),
+                "history": datasets.Sequence(datasets.Sequence(datasets.Value("string"))),
+            }
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
         file_path = dl_manager.download_and_extract(_URLS)
         return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepaths": file_path["train"]
-                }
-            ),
-            datasets.SplitGenerator(
-                name=datasets.Split.TEST,
-                gen_kwargs={
-                    "filepaths": file_path["test"]
-                }
-            )
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_path["train"]}),
+            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepaths": file_path["test"]}),
         ]
 
     def _generate_examples(self, filepaths: List[str]):
@@ -70,29 +59,25 @@ def _generate_examples(self, filepaths: List[str]):
                 rejected = data["rejected"]
 
                 assist_idx = rejected.rfind("\n\nAssistant: ")
-                r_reject = rejected[assist_idx+13:].strip()
+                r_reject = rejected[assist_idx + 13 :].strip()
                 assist_idx = chosen.rfind("\n\nAssistant: ")
-                r_accept = chosen[assist_idx+13:].strip()
+                r_accept = chosen[assist_idx + 13 :].strip()
 
                 human_idx = chosen.rfind("\n\nHuman: ")
-                query = chosen[human_idx+9:assist_idx].strip()
+                query = chosen[human_idx + 9 : assist_idx].strip()
                 prompt = chosen[:human_idx]
                 history = []
 
                 while prompt.rfind("\n\nAssistant: ") != -1:
                     assist_idx = prompt.rfind("\n\nAssistant: ")
                     human_idx = prompt.rfind("\n\nHuman: ")
                     if human_idx != -1:
-                        old_query = prompt[human_idx+9:assist_idx].strip()
-                        old_resp = prompt[assist_idx+13:].strip()
+                        old_query = prompt[human_idx + 9 : assist_idx].strip()
+                        old_resp = prompt[assist_idx + 13 :].strip()
                         history.insert(0, (old_query, old_resp))
                     else:
                         break
                     prompt = prompt[:human_idx]
 
-                yield key, {
-                    "instruction": query,
-                    "output": [r_accept, r_reject],
-                    "history": history
-                }
+                yield key, {"instruction": query, "output": [r_accept, r_reject], "history": history}
                 key += 1
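
Both the chosen and rejected fields hold the full transcript, so the loader splits each at the last "\n\nAssistant: " marker; the offsets 9 and 13 are len("\n\nHuman: ") and len("\n\nAssistant: "), and earlier turns are peeled into history by the while loop. A minimal sketch of the split on a made-up single-turn transcript (not from the dataset):

# Offsets: len("\n\nHuman: ") == 9, len("\n\nAssistant: ") == 13.
chosen = "\n\nHuman: How do I boil an egg?\n\nAssistant: Simmer it for about ten minutes."
rejected = "\n\nHuman: How do I boil an egg?\n\nAssistant: Figure it out yourself."

assist_idx = chosen.rfind("\n\nAssistant: ")
r_accept = chosen[assist_idx + 13 :].strip()  # preferred answer
r_reject = rejected[rejected.rfind("\n\nAssistant: ") + 13 :].strip()  # dispreferred answer

human_idx = chosen.rfind("\n\nHuman: ")
query = chosen[human_idx + 9 : assist_idx].strip()
print({"instruction": query, "output": [r_accept, r_reject], "history": []})
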
39 changes: 14 additions & 25 deletions data/ultra_chat/ultra_chat.py
@@ -1,8 +1,10 @@
-import os
 import json
-import datasets
+import os
 from typing import List
 
+import datasets
+
+
 _HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
 
 _DESCRIPTION = "UltraChat: Large-scale, Informative, and Diverse Multi-round Dialogue Data."
@@ -24,48 +26,35 @@
 
 
 class UltraChat(datasets.GeneratorBasedBuilder):
-
     VERSION = datasets.Version("0.0.0")
 
     def _info(self):
-        features = datasets.Features({
-            "conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]
-        })
+        features = datasets.Features(
+            {"conversations": [{"from": datasets.Value("string"), "value": datasets.Value("string")}]}
+        )
         return datasets.DatasetInfo(
-            description=_DESCRIPTION,
-            features=features,
-            homepage=_HOMEPAGE,
-            license=_LICENSE,
-            citation=_CITATION
+            description=_DESCRIPTION, features=features, homepage=_HOMEPAGE, license=_LICENSE, citation=_CITATION
         )
 
     def _split_generators(self, dl_manager: datasets.DownloadManager):
-        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)] # multiple shards
-        return [
-            datasets.SplitGenerator(
-                name=datasets.Split.TRAIN,
-                gen_kwargs={
-                    "filepaths": file_paths
-                }
-            )
-        ]
+        file_paths = [dl_manager.download(_BASE_DATA_URL.format(idx=idx)) for idx in range(10)]  # multiple shards
+        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": file_paths})]
 
     def _generate_examples(self, filepaths: List[str]):
         for filepath in filepaths:
             with open(filepath, "r", encoding="utf-8") as f:
                 for row in f:
                     try:
                         data = json.loads(row)
-                    except:
+                    except Exception:
                         continue
                     key: int = data["id"]
                     content: List[str] = data["data"]
                     if len(content) % 2 == 1:
                         content.pop(-1)
                     if len(content) < 2:
                         continue
-                    conversations = [{
-                        "from": "human" if i % 2 == 0 else "gpt",
-                        "value": content[i]
-                    } for i in range(len(content))]
+                    conversations = [
+                        {"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))
+                    ]
                     yield key, {"conversations": conversations}
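
Each UltraChat row stores a dialogue as a flat list of alternating utterances under "data"; the loop above drops a trailing human turn with no reply (odd length) and tags even indices "human", odd indices "gpt". The same conversion on a made-up row:

row = {"id": 0, "data": ["Hi!", "Hello, how can I help?", "Tell me a joke."]}
content = row["data"]
if len(content) % 2 == 1:  # drop the trailing human turn that has no reply
    content.pop(-1)
conversations = [{"from": "human" if i % 2 == 0 else "gpt", "value": content[i]} for i in range(len(content))]
print(conversations)
# [{'from': 'human', 'value': 'Hi!'}, {'from': 'gpt', 'value': 'Hello, how can I help?'}]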
