RLHFlow
diff --git a/‎.DS_Store
8 KB b/‎.DS_Store
8 KB
diff --git a/‎annotate_data/.DS_Store
6 KB b/‎annotate_data/.DS_Store
6 KB
diff --git a/‎annotate_data/get_rewards.py
+166 b/‎annotate_data/get_rewards.py
+166
diff --git a/‎configs/.DS_Store
6 KB b/‎configs/.DS_Store
6 KB
diff --git a/‎configs/deepspeed_stage1.json
+23 b/‎configs/deepspeed_stage1.json
+23
diff --git a/‎configs/deepspeed_stage2.json
+27 b/‎configs/deepspeed_stage2.json
+27
diff --git a/‎configs/deepspeed_stage3.json
+31 b/‎configs/deepspeed_stage3.json
+31
diff --git a/‎generation/.DS_Store
6 KB b/‎generation/.DS_Store
6 KB
diff --git a/‎generation/gen_hf.py
+142 b/‎generation/gen_hf.py
+142
@@ -0,0 +1,166 @@
+import json
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+import numpy as np
+import torch
+from datasets import load_dataset
+from tqdm import tqdm
+from transformers import AutoTokenizer, HfArgumentParser, pipeline
+from accelerate import Accelerator
+
+tqdm.pandas()
+
+#####
+# This script takes a dataset as input, where each sample is {"prompt": "the prompt", "responses": ["response1", "response2", "response3", ...]}.
+# It computes the reward for each prompt-response pair and writes a new dataset, where each sample is {"prompt": "the prompt", "responses": ["response1", "response2", "response3", ...], "rewards": [reward1, reward2, ...]}.
+#####
+
+
+@dataclass
+class ScriptArguments:
+    """
+    Command-line arguments for the reward annotation script.
+
+    The class is handed to HfArgumentParser, so each field is exposed as a
+    command-line option and its metadata "help" string is the option's help.
+    """
+
+    # Passed as `data_files` to load_dataset("json", ...).
+    dataset_name_or_path: Optional[str] = field(
+        default="uf_split0_responses_K8.jsonl",
+        metadata={"help": "the location of the dataset name or path"},
+    )
+    # Despite the name this is opened as a single file and written as
+    # one JSON object per line.
+    output_dir: Optional[str] = field(
+        default="uf_split0_responses_K8_reward.json",
+        metadata={"help": "the location of the output file"},
+    )
+    # When set, one "mean<TAB>top1" line is appended to this file per run.
+    record_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "the location of the recording file"},
+    )
+    # Used for both the tokenizer and the scoring pipeline.
+    reward_name_or_path: Optional[str] = field(
+        default="sfairXC/FsfairX-LLaMA3-RM-v0.1",
+        metadata={"help": "the name of the reward model"},
+    )
+    # NOTE(review): not referenced anywhere in the visible script.
+    input_output_delimiter: Optional[str] = field(
+        default="",
+        metadata={"help": "the delimiter between input and output"},
+    )
+    # Samples with fewer than K responses are skipped by the scoring loop.
+    K: Optional[int] = field(
+        default=8,
+        metadata={"help": "the number of responses per prompt"},
+    )
+
+
+accelerator = Accelerator()
+
+parser = HfArgumentParser(ScriptArguments)
+script_args = parser.parse_args_into_dataclasses()[0]
+
+device = accelerator.device
+pipe_kwargs = {
+    "return_all_scores": True,
+    "function_to_apply": "none",
+    "batch_size": 1,
+}
+reward_model = script_args.reward_name_or_path
+rm_tokenizer = AutoTokenizer.from_pretrained(reward_model)
+rm_pipe = pipeline(
+    "sentiment-analysis",
+    model=reward_model,
+    device=device,
+    tokenizer=rm_tokenizer,
+    model_kwargs={"torch_dtype": torch.bfloat16},
+    truncation=True,
+)
+
+
+ds_dir = script_args.dataset_name_or_path
+world_size = int(os.getenv("WORLD_SIZE", "1"))
+ds = load_dataset("json", data_files=ds_dir, split="train")
+
+local_rank = Accelerator().local_process_index
+
+data_size = len(ds["prompt"])
+
+share = int(data_size / world_size) + 1
+ds = ds.select(np.arange(local_rank * share, min((local_rank + 1) * share, len(ds))))
+
+"""
+We process the data format here and query the reward model to get the rewards.
+"""
+
+
+def get_reward(test_texts):
+    pipe_outputs = rm_pipe(test_texts, **pipe_kwargs)
+    rewards = [output[0]["score"] for output in pipe_outputs]
+    return rewards
+
+
+def change_of_format(prom, resp):
+    # To be modified according to the reward model and the LLM you use
+    # Be careful about multi-turn conversions
+    """
+    prom = prom.replace("<s>GPT4 Correct User: ", "").replace("<|end_of_turn|>GPT4 Correct Assistant:", "")
+
+    final_resp = resp.split("GPT4 Correct User")[0]
+    """
+    message = prom + [{"role": "assistant", "content": resp}]
+    return rm_tokenizer.apply_chat_template(message, tokenize=False).replace(rm_tokenizer.bos_token, "")
+
+
+data = []
+
+# tqdm is used to show the progress bar
+with torch.no_grad():
+    for sample in tqdm(ds):
+        # The VLLM may not generate responses for some prompts because it is too long, we skip them
+        if len(sample["responses"]) < script_args.K:
+            continue
+        test_texts = [change_of_format(sample['prompt'], tmp_output) for tmp_output in sample['responses']]
+        
+        rewards = get_reward(test_texts)
+        data.append({"prompt": sample["prompt"], "responses": sample["responses"], "rewards": rewards})
+
+
+# Send the data to other GPUs
+world_size = int(os.getenv("WORLD_SIZE", "1"))
+all_process_list = [{}] * world_size
+
+data_to_send = {
+    "data": [[data[i]] for i in range(len(data))],
+}
+
+import torch.distributed as dist
+
+dist.all_gather_object(all_process_list, data_to_send)
+gathered_data = []
+
+
+for i in range(world_size):
+    tmp_data = [tmp[0] for tmp in all_process_list[i]["data"]]
+    gathered_data.extend(tmp_data)
+
+all_rewards = [sample["rewards"] for sample in gathered_data]
+top1_scores = np.mean(np.max(all_rewards, axis=1))
+mean_scores = np.mean(all_rewards)
+
+
+if local_rank == 0:
+    print(
+        "Collect {} data from {} inputs. mean score {} top1 score: {}".format(
+            len(gathered_data), data_size, mean_scores, top1_scores
+        )
+    )
+    if len(gathered_data) < data_size:
+        print(
+            "Some of the prompts are with responses < {}. This can happen because the prompt is too long and is ignored by VLLM".format(
+                script_args.K
+            )
+        )
+
+    with open(script_args.output_dir, "w", encoding="utf8") as f:
+        for i in range(len(gathered_data)):
+            json.dump(gathered_data[i], f, ensure_ascii=False)
+            f.write('\n')
+            
+    if script_args.record_dir is not None:
+        with open(script_args.record_dir, "a") as f:
+            f.write(str(mean_scores) + "\t" + str(top1_scores) + "\n")
@@ -0,0 +1,23 @@
+{
+  "zero_optimization": {
+    "stage": 1,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
@@ -0,0 +1,27 @@
+{
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu"
+    },
+    "contiguous_gradients": true,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
@@ -0,0 +1,31 @@
+{
+  "zero_optimization": {
+    "stage": 3,
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "gradient_accumulation_steps": "auto",
+  "gradient_clipping": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
@@ -0,0 +1,142 @@
+import json
+from dataclasses import dataclass, field
+from typing import List, Optional
+from datasets import load_dataset
+from tqdm import tqdm
+from transformers import AutoTokenizer, HfArgumentParser
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import requests
+
+tqdm.pandas()
+
+
+@dataclass
+class ScriptArguments:
+    """
+    Command-line arguments for the response generation script.
+
+    The class is handed to HfArgumentParser, so each field is exposed as a
+    command-line option and its metadata "help" string is the option's help.
+    """
+
+    # Base URL of the generation server(s); requests go to
+    # "<url>:<port>/generate".
+    url: Optional[str] = field(
+        default="http://localhost",
+        metadata={"help": "url of the model response"},
+    )
+    # Loaded with AutoTokenizer; supplies the chat template and EOS token id.
+    tokenizer: Optional[str] = field(
+        default="HuggingFaceH4/mistral-7b-sft-beta",
+        metadata={"help": "the tokenizer to use"},
+    )
+    # Prompts are distributed over these ports in round-robin order.
+    ports: List[str] = field(default_factory=lambda: ["8000"], metadata={"help": "ports of the model response"})
+    # Extra stop token ids, appended after the tokenizer's EOS token id.
+    eos_ids: List[int] = field(default_factory=lambda: [], metadata={"help": "the ids of the end of sentence tokens"})
+    # Passed directly to load_dataset(..., split="train").
+    dataset_name_or_path: Optional[str] = field(
+        default="cornfieldrm/iterative-prompt-v1-iter1-2K",
+        metadata={"help": "the location of the dataset name or path"},
+    )
+    # Despite the name this is opened as a single file and written as
+    # one JSON object per line.
+    output_dir: Optional[str] = field(
+        default="uf_split0_responses_K8.jsonl",
+        metadata={"help": "the location of the output file"},
+    )
+    # NOTE(review): not referenced anywhere in the visible script.
+    bos_format: Optional[str] = field(
+        default="",
+        metadata={"help": "the format of the beginning of the sentence"},
+    )
+    # Sent to the server as the "n" request parameter.
+    K: Optional[int] = field(
+        default=8,
+        metadata={"help": "the number of generations per prompt"},
+    )
+    # NOTE(review): not referenced anywhere in the visible script.
+    max_input_length: Optional[int] = field(
+        default=10000,
+        metadata={"help": "the maximum length of the input tokens"},
+    )
+    # Sent to the server as the "max_tokens" request parameter.
+    max_new_tokens: Optional[int] = field(
+        default=2048,
+        metadata={"help": "the maximum length of the new tokens"},
+    )
+    seed: Optional[int] = field(
+        default=42,
+        metadata={"help": "the random seed"},
+    )
+    temperature: Optional[float] = field(
+        default=0.7,
+        metadata={"help": "the temperature"},
+    )
+    use_beam_search: Optional[bool] = field(
+        default=False,
+        metadata={"help": "the beam search"},
+    )
+    # Dataset column whose value is passed to the tokenizer's chat template
+    # and stored as "prompt" in the output.
+    dataset_key: Optional[str] = field(
+        default="context_messages",
+        metadata={"help": "the key of the dataset"},
+    )
+    # Size of the thread pool that issues the HTTP requests.
+    max_workers: Optional[int] = field(
+        default=1024,
+        metadata={"help": "the number of workers"},
+    )
+
+parser = HfArgumentParser(ScriptArguments)
+script_args = parser.parse_args_into_dataclasses()[0]
+ds_dir = script_args.dataset_name_or_path
+output_dir = script_args.output_dir
+K = script_args.K
+ports = script_args.ports
+
+tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer)
+
+
+def query_model(prompt, args, port):
+    json = {
+        **args,
+        "prompt": prompt,
+    }
+    response = requests.post(url=script_args.url + ":" + str(port) + "/generate", json=json)
+    response_json = response.json()
+    return [response_json["text"][i][len(prompt) :] for i in range(len(response_json["text"]))]
+
+
+default_args = {
+    "use_beam_search": script_args.use_beam_search,
+    "n": script_args.K,
+    "temperature": script_args.temperature,
+    "max_tokens": script_args.max_new_tokens,
+    "seed": script_args.seed,
+    "top_p": 1.0,
+    "top_k": -1,
+    "stop_token_ids": [tokenizer.eos_token_id] + script_args.eos_ids,
+}
+
+print(default_args)
+
+ds = load_dataset(ds_dir, split="train")
+# load_dataset("json", data_files=ds_dir, split="train", field="instances")
+print(ds)
+
+# use tokenizer.apply_template to apply the template to the prompt
+ds = ds.map(
+    lambda x: {
+        "prompt": tokenizer.apply_chat_template(x[script_args.dataset_key], tokenize=False, add_generation_prompt=True)
+    }
+)
+
+
+with ThreadPoolExecutor(max_workers=script_args.max_workers) as executor:
+    result = [
+        executor.submit(query_model, ds[i]["prompt"], default_args, ports[i % len(ports)]) for i in range(len(ds))
+    ]
+    # use tqdm to show progress
+    for _ in tqdm(as_completed(result), total=len(result)):
+        pass
+
+    responses = [r.result() for r in result]
+
+
+gathered_data = []
+for i in range(len(ds)):
+    tmp_data = {"prompt": ds[i][script_args.dataset_key], "responses": responses[i]}
+    gathered_data.append(tmp_data)
+
+print("I collect ", len(gathered_data), "samples")
+
+
+with open(output_dir, 'w', encoding='utf8') as f:
+    for i in range(len(gathered_data)):
+        json.dump(gathered_data[i], f, ensure_ascii=False)
+        f.write('\n')