import torch
import torch.nn.functional as F
+from torch.nn.utils.rnn import pad_sequence
import transformers
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
@@ -297,21 +298,24 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
                )

                rollout_score_time = time()
-                all_scores = torch.tensor(
-                    self.reward_fn(
-                        samples=all_str_samples, prompts=all_str_prompts, outputs=all_str_outputs, **metadata
-                    ),
-                    dtype=torch.float,
-                    device=device,
-                )
+                # reward_fn should return a list of per-token rewards for each sample
+                # NOTE: all_scores[0][i] is the reward attributed to token (action) i of prompt + response (because of how the KL is computed)
+                all_scores = self.reward_fn(samples=all_str_samples, prompts=all_str_prompts, outputs=all_str_outputs, model_tok=self.tokenizer, **metadata)
+                all_scores = [torch.tensor(score, dtype=torch.float, device=device).view(-1) for score in all_scores]
+                # Right-pad the per-sample score sequences with -1 (used below as a sentinel for positions with no reward)
+                all_scores = pad_sequence(all_scores, batch_first=True, padding_value=-1)
+                max_len = torch.tensor(all_scores.shape[1], dtype=torch.long, device=device)
+
                stats["time/rollout_score"] = time() - rollout_score_time

-                all_scores = list(all_scores.reshape(self.accelerator.num_processes, -1).unbind())
+                all_scores = list(all_scores.reshape(self.accelerator.num_processes, -1, max_len).unbind())
            else:
                all_scores = None
+                max_len = torch.tensor(0, dtype=torch.long, device=device)

            if torch.distributed.is_initialized():
-                scores = torch.empty(len(samples), device=device)
+                torch.distributed.broadcast(max_len, 0)
+                scores = torch.empty((len(samples), max_len), device=device)
                torch.distributed.scatter(scores, all_scores)
            else:
                scores = all_scores[0].clone().detach()
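
Not part of the diff: a minimal sketch of the shape contract the new code assumes for reward_fn, i.e. one list of per-token rewards per sample, right-padded with the -1 sentinel before being scattered. The toy_reward_fn below and its whitespace tokenization are hypothetical.

import torch
from torch.nn.utils.rnn import pad_sequence

def toy_reward_fn(samples, prompts, outputs, model_tok=None, **metadata):
    # hypothetical dense reward: one scalar per (whitespace) token of each sample
    return [[0.1] * len(s.split()) for s in samples]

all_scores = toy_reward_fn(samples=["a b c", "d e"], prompts=["a", "d"], outputs=["b c", "e"])
all_scores = [torch.tensor(s, dtype=torch.float).view(-1) for s in all_scores]
all_scores = pad_sequence(all_scores, batch_first=True, padding_value=-1)
print(all_scores.shape)  # torch.Size([2, 3]); the shorter row ends with the -1 pad sentinel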
@@ -342,7 +346,7 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq

            # store statistics of the initial rollout as reference
            if self.ref_mean is None:
-                self.ref_mean, self.ref_std = scores.mean(), scores.std()
+                self.ref_mean, self.ref_std = scores.sum(dim=1).mean(), scores.sum(dim=1).std()
            all_scores_mean, all_scores_std = self.running_moments.update(scores)
            stats["rollout_scores/mean"] = all_scores_mean.item()
            stats["rollout_scores/std"] = all_scores_std.item()
@@ -415,6 +419,7 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
                logprobs = logprobs_of_labels(logits[:, :-1, :], sample_outputs[:, 1:])
                ref_logprobs = logprobs_of_labels(ref_logits[:, :-1, :], sample_outputs[:, 1:])
            else:
+                # NOTE: logprobs[i] is the log-probability with which all_tokens[i+1] was sampled
                logprobs = logprobs_of_labels(logits[:, :-1, :], all_tokens[:, 1:])
                ref_logprobs = logprobs_of_labels(ref_logits[:, :-1, :], all_tokens[:, 1:])

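Not part of the diff: a small sketch of the shift-by-one alignment the NOTE above describes, under the assumption that logprobs_of_labels gathers log-softmax(logits) at the given label ids (as in trlx's modeling utils).

import torch
import torch.nn.functional as F

def logprobs_of_labels(logits, labels):
    # gather the log-probability assigned to each label token
    logprobs = F.log_softmax(logits, dim=-1)
    return torch.gather(logprobs, dim=-1, index=labels.unsqueeze(-1)).squeeze(-1)

logits = torch.randn(1, 5, 10)            # 5 positions, vocabulary of 10
all_tokens = torch.randint(0, 10, (1, 5))
# logits[:, i] predicts all_tokens[:, i + 1], hence the off-by-one slices:
logprobs = logprobs_of_labels(logits[:, :-1, :], all_tokens[:, 1:])
print(logprobs.shape)  # torch.Size([1, 4])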
@@ -425,6 +430,7 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
                attention_mask = sample_outputs != self.tokenizer.pad_token_id
                start = 0
            else:
+                # NOTE: -1 because kl[prompt_tensors.shape[1]] is the KL of the second token of the response
                start = prompt_tensors.shape[1] - 1

            log_ratio = (logprobs - ref_logprobs) * attention_mask[:, :-1]
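
Not part of the diff: a worked example of the index bookkeeping behind start = prompt_tensors.shape[1] - 1, under the assumption that all_tokens is the prompt followed by the response. With a prompt of length P and a response of length R, all_tokens has P + R tokens while logprobs/kl have P + R - 1 entries (one per predicted next token).

import torch

P, R = 4, 3
all_tokens = torch.arange(P + R)    # stand-in token ids: prompt = 0..3, response = 4..6
logprob_targets = all_tokens[1:]    # logprobs[i] scores all_tokens[i + 1]
start = P - 1
print(logprob_targets[start:])      # tensor([4, 5, 6]) -> exactly the response tokens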
@@ -436,12 +442,16 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
            ref_logprobs = ref_logprobs.cpu()
            prompt_tensors = prompt_tensors.cpu()
            sample_outputs = sample_outputs.cpu()
+            # TODO(dahoas): Why [:, :-1]? Redundant with the clipping via start : ends[ix]?
+            # Actually I think it's just wrong?
            values = values.cpu()[:, :-1]

            # Get the logprobs and values, for tokens that are not padding,
-            # from the start of the prompt up to the <eos> token, while also including the latter
+            # from the end of the prompt up to the <eos> token, while also including the latter
            # (these are taken from the student model and not the reference model)
            ends = start + attention_mask[:, start:].sum(1) + 1
+            # NOTE: values[i] is the value of the state after response token i
+            # TODO(dahoas): Does it actually make sense to get the rewards one step early?
            all_values = [values[ix, start : ends[ix]] for ix in range(n_samples)]
            all_logprobs = [logprobs[ix, start : ends[ix]] for ix in range(n_samples)]

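Not part of the diff: a toy example of how ends clips each sample at its <eos>; the attention_mask values below are made up. attention_mask is 1 for real tokens and 0 for padding, so summing it from start counts the non-padding positions, and the + 1 keeps the <eos> position as described in the comment above.

import torch

start = 3
attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0],    # short response, then padding
                               [1, 1, 1, 1, 1, 1, 1, 0]])   # longer response
ends = start + attention_mask[:, start:].sum(1) + 1
print(ends)  # tensor([7, 8])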
@@ -451,8 +461,20 @@ def make_experience(self, num_rollouts: int = 1024, iter_count: int = 0): # noq
            rollout_count = 0

            for sample_idx in range(n_samples):
+                # To compute the per-token reward, first add in the KL penalties over the trajectory
+                # NOTE: kl_penalty[i] is the KL difference at token i+1 of the output (excluding EOS)
                rewards = kl_penalty[sample_idx]
-                rewards[-1] += scores[sample_idx].cpu()
+                # Then add in the rewards from the reward function
+                if scores.shape[1] == 1:
+                    # NOTE: the final reward is given at the EOS token, following HHH practice
+                    rewards[-1] += scores[sample_idx][0].cpu()
+                else:
+                    score = scores[sample_idx]
+                    score_right_padding = torch.sum(score != -1)
+                    score = score[:score_right_padding].cpu()
+                    p_score = torch.zeros_like(rewards)
+                    p_score[:score.shape[0]] += score
+                    rewards += p_score

                ppo_rl_elements.append(
                    PPORLElement(
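
Not part of the diff: a sketch of how the else-branch above folds a right-padded per-token score row into the KL penalties, instead of adding a single scalar at the EOS position; the numbers are made up.

import torch

rewards = torch.tensor([-0.01, -0.02, -0.03, -0.01])    # kl_penalty for one sample
score = torch.tensor([0.5, 0.2, -1.0, -1.0])            # per-token scores, right-padded with -1
score_right_padding = torch.sum(score != -1)            # 2 real scores
score = score[:score_right_padding]
p_score = torch.zeros_like(rewards)
p_score[:score.shape[0]] += score
rewards += p_score
print(rewards)  # ~ [0.49, 0.18, -0.03, -0.01]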