add input_output_map and multi-vote #199

Merged 366 commits on Aug 11, 2023

Changes from all commits
4c907bc
modify prompt templete
zhaochen20 Jun 28, 2023
91643e5
modify prompt generation
zhaochen20 Jun 28, 2023
4c00db8
add new prompt for generator
zhaochen20 Jun 29, 2023
03c5d5c
add new prompt for generator
zhaochen20 Jun 29, 2023
bd16aa8
add new prompt for generator to solve unexpected Json of Json
zhaochen20 Jun 29, 2023
3bb00c9
delete duplication
zhaochen20 Jun 29, 2023
58943ef
new mock function
zhaochen20 Jun 30, 2023
39d2e1c
Merge branch 'dataset_test_modification' into self-instruct
zhaochen20 Jun 30, 2023
f3d0dac
new prompt template
zhaochen20 Jul 2, 2023
21d83ac
new prompt template for dataset generator
zhaochen20 Jul 2, 2023
92a23aa
add random generted few-shot examples
zhaochen20 Jul 2, 2023
eb608a7
split into two prompt template
zhaochen20 Jul 2, 2023
b49bf3b
Merge branch 'self-instruct' into dataset_test_modification
zhaochen20 Jul 2, 2023
51254ab
new middle template
zhaochen20 Jul 3, 2023
9eb9f10
add left padding
zhaochen20 Jul 4, 2023
7c97880
add max_new_tokens
zhaochen20 Jul 4, 2023
f5a6245
add max_new_tokens
zhaochen20 Jul 4, 2023
cb923b9
add max_new_tokens and control for GPT2
zhaochen20 Jul 5, 2023
96491e8
debug single generation for executor
zhaochen20 Jul 5, 2023
cf12a3c
add new unit test for model executor
zhaochen20 Jul 5, 2023
e5ba60d
add <eos> for processor
zhaochen20 Jul 6, 2023
6d507be
add new test for trainer
zhaochen20 Jul 6, 2023
87fad8c
Refactor model executor
zhaochen20 Jul 6, 2023
fe0cbc8
[use max length]
zhaochen20 Jul 6, 2023
d129d50
[use max length] strict beam search
zhaochen20 Jul 6, 2023
a9946a5
add more constraint on sequence length
zhaochen20 Jul 7, 2023
944fb8c
release an if control in trainer
zhaochen20 Jul 7, 2023
f85c790
Merge branch 'main' of github.com:viswavi/prompt2model
zhaochen20 Jul 7, 2023
d5e038a
Merge branch 'main' of github.com:viswavi/prompt2model
zhaochen20 Jul 7, 2023
cd9806e
fix conflict in gtp2
zhaochen20 Jul 7, 2023
eea35f8
fix conflict in batch enocde plus
zhaochen20 Jul 7, 2023
dd0de57
Merge branch 'use_batch_encode_plus' into rename_evaluator
zhaochen20 Jul 7, 2023
76a9a21
Merge branch 'rename_evaluator' into debug_label
zhaochen20 Jul 7, 2023
07a3cf8
Merge branch 'debug_label' into add_metric_for_autoregressive
zhaochen20 Jul 7, 2023
b81fb90
fix conflict in debug label
zhaochen20 Jul 7, 2023
9eb8b55
Merge branch 'main' into add_metric_for_autoregressive
zhaochen20 Jul 7, 2023
6a7813b
Add more device for model executor (#146)
zhaochenyang20 Jul 7, 2023
1bd29ba
Merge branch 'add_metric_for_autoregressive' of github.com:viswavi/pr…
zhaochen20 Jul 7, 2023
014793a
use small models at all
zhaochen20 Jul 7, 2023
a1cc790
solve conflict from add_metric_for_autoregressive
zhaochen20 Jul 7, 2023
ed14e85
Merge branch 'add_label_in_processor' into openai_key-can-not-found
zhaochen20 Jul 7, 2023
fe00acf
Merge branch 'openai_key-can-not-found' into new_meta_prompt
zhaochen20 Jul 7, 2023
aa0a1fb
Merge branch 'new_meta_prompt' into dataset_test_modification
zhaochen20 Jul 7, 2023
d2a725a
use small model for testing
zhaochen20 Jul 7, 2023
2a80ccf
use small model for testing, fix format
zhaochen20 Jul 7, 2023
55deb9c
inference on cuda
zhaochen20 Jul 7, 2023
cbb1336
split test helper of model creator
zhaochen20 Jul 8, 2023
5d75cf1
revert generator
zhaochen20 Jul 8, 2023
57b5b39
revert generator
zhaochen20 Jul 8, 2023
f65eb83
add warning for none constraints of sequence_max_length
zhaochen20 Jul 8, 2023
651ef89
fix lint
zhaochen20 Jul 8, 2023
4cbc236
fix lint
zhaochen20 Jul 8, 2023
09dc70a
add test for trunction in trainer
zhaochen20 Jul 8, 2023
413aa6a
fix review
zhaochen20 Jul 9, 2023
ecd5d2b
add label to processor (#149)
zhaochenyang20 Jul 9, 2023
d4459f6
Merge branch 'add_metric_for_autoregressive' of github.com:viswavi/pr…
zhaochen20 Jul 9, 2023
41d3485
Merge branch 'add_metric_for_autoregressive' into openai_key-can-not-…
zhaochen20 Jul 9, 2023
3e06625
fix review
zhaochen20 Jul 9, 2023
dd73685
Merge branch 'openai_key-can-not-found' into new_meta_prompt
zhaochen20 Jul 9, 2023
5e69b42
fix review. change name
zhaochen20 Jul 9, 2023
027ebf1
fix review. change name
zhaochen20 Jul 9, 2023
d35c931
fix review. change name
zhaochen20 Jul 9, 2023
ee7bfca
fix review. change comment
zhaochen20 Jul 9, 2023
6a81a5f
Merge branch 'debug_autoregressive' into split_helper_of_model_create
zhaochen20 Jul 9, 2023
7778e6a
Merge branch 'split_helper_of_model_create' into one_executor_fits_all
zhaochen20 Jul 9, 2023
341030d
fix lint
zhaochen20 Jul 10, 2023
114d168
add label unit test
zhaochen20 Jul 10, 2023
bc1d69c
add comments for unit tests
zhaochen20 Jul 10, 2023
4719877
add comments for the magic number -100
zhaochen20 Jul 10, 2023
5e4341b
add delimiter for processor, add new test for tokenization of trainer
zhaochen20 Jul 10, 2023
ad35626
fix typecheck for 3.9 and 3.10
zhaochen20 Jul 10, 2023
bbbaf26
add , add new unit test for tokenize dataset and add comments.
zhaochen20 Jul 10, 2023
0122580
greedy decode
zhaochen20 Jul 11, 2023
34a1cc1
greedy decode
zhaochen20 Jul 11, 2023
061a7e0
fix bug for eos_token_id
zhaochen20 Jul 11, 2023
d3a7d56
add repetition_penalty
zhaochen20 Jul 11, 2023
144e2c6
add print to prediction
zhaochen20 Jul 11, 2023
d310119
new tests for autoregressive model
zhaochen20 Jul 11, 2023
badeaef
new tests for autoregressive model
zhaochen20 Jul 11, 2023
10936bf
fix comment
zhaochen20 Jul 12, 2023
f1a0b91
fix typo
zhaochen20 Jul 12, 2023
45dfce6
Merge branch 'add_metric_for_autoregressive' into openai_key-can-not-…
zhaochen20 Jul 12, 2023
97945eb
fix assert, add new unit test
zhaochen20 Jul 12, 2023
bfbac91
Merge branch 'main' of github.com:viswavi/prompt2model
zhaochen20 Jul 12, 2023
1e62722
fix conflict
zhaochen20 Jul 12, 2023
47bcd84
Merge branch 'openai_key-can-not-found' into new_meta_prompt
zhaochen20 Jul 12, 2023
8c55a31
Merge branch 'new_meta_prompt' into dataset_test_modification
zhaochen20 Jul 12, 2023
84f5cfd
fix conflict
zhaochen20 Jul 12, 2023
3f428a3
Merge branch 'debug_autoregressive' into split_helper_of_model_create
zhaochen20 Jul 12, 2023
cb0ad1f
Merge branch 'split_helper_of_model_create' into one_executor_fits_all
zhaochen20 Jul 12, 2023
951d854
Merge branch 'one_executor_fits_all' into fix_label_for_decoder_only
zhaochen20 Jul 12, 2023
42db8ac
add new test cases and real test for trainer
zhaochen20 Jul 12, 2023
5ca3dfe
delete confidence
zhaochen20 Jul 13, 2023
a010399
refactor unit tests
zhaochen20 Jul 13, 2023
8351ed3
add 5 generation strategy
zhaochen20 Jul 13, 2023
6e0411f
add 5 generation strategy
zhaochen20 Jul 13, 2023
20928b0
fix OOM problem of trainer test
zhaochen20 Jul 13, 2023
4f662f2
fix OOM problem of trainer test
zhaochen20 Jul 13, 2023
946fe87
fix OOM problem of trainer test
zhaochen20 Jul 13, 2023
1ea151e
delete unecessary eos
zhaochen20 Jul 13, 2023
0b21140
delete unecessary eos
zhaochen20 Jul 13, 2023
8f89d0c
Merge branch 'main' of github.com:viswavi/prompt2model
zhaochen20 Jul 13, 2023
4876937
fix conflict
zhaochen20 Jul 13, 2023
27a3aaa
Merge branch 'dataset_test_modification' into debug_autoregressive
zhaochen20 Jul 13, 2023
394c105
Merge branch 'debug_autoregressive' into split_helper_of_model_create
zhaochen20 Jul 13, 2023
d0b8c92
Merge branch 'split_helper_of_model_create' into one_executor_fits_all
zhaochen20 Jul 13, 2023
29dd565
Merge branch 'one_executor_fits_all' into fix_label_for_decoder_only
zhaochen20 Jul 13, 2023
cde3df8
MMerge branch 'fix_label_for_decoder_only' into split_model_trainer_test
zhaochen20 Jul 13, 2023
7d735c3
fix conflict
zhaochen20 Jul 13, 2023
3734cbb
Merge branch 'calculate_confidence' into delete_unecessary_eos
zhaochen20 Jul 13, 2023
3466554
refactor evaluator
zhaochen20 Jul 14, 2023
cf15354
use sub set
zhaochen20 Jul 14, 2023
0b28f23
fix oom
zhaochen20 Jul 14, 2023
50a3b11
fix oom
zhaochen20 Jul 14, 2023
33513f8
add comments for evaluator
zhaochen20 Jul 14, 2023
1056c51
add new comments for the processor
zhaochen20 Jul 15, 2023
3a42674
Merge branch 'delete_unecessary_eos' into refactor_evaluator
zhaochen20 Jul 15, 2023
33173f2
refactor model executor
zhaochen20 Jul 15, 2023
e4f59a2
refactor training callback
zhaochen20 Jul 16, 2023
58cf094
gc.collect()
zhaochen20 Jul 16, 2023
d0f7a09
use gc.collect() to fix oom
zhaochen20 Jul 16, 2023
a0a8e9f
concatenate tests for trainer
zhaochen20 Jul 16, 2023
edb18fa
fix multi in unit tests
zhaochen20 Jul 17, 2023
c98b436
Component document (#132)
zhaochenyang20 Jul 18, 2023
028d77c
Merge branch 'refactor_callback' of github.com:viswavi/prompt2model i…
zhaochen20 Jul 18, 2023
c4c224d
Merge branch 'dataset_test_modification' of github.com:viswavi/prompt…
zhaochen20 Jul 18, 2023
4a5dac9
refactor generator
zhaochen20 Jul 18, 2023
fc268d4
Merge branch 'dataset_test_modification' into refactor_callback
zhaochen20 Jul 18, 2023
fd45d4f
refactor generator and add comments
zhaochen20 Jul 18, 2023
a1d262d
Merge branch 'dataset_test_modification' into refactor_callback
zhaochen20 Jul 18, 2023
fec7199
fix lint
zhaochen20 Jul 18, 2023
ff91e93
Merge branch 'dataset_test_modification' into refactor_callback
zhaochen20 Jul 18, 2023
6d4f1e8
fix lint
zhaochen20 Jul 18, 2023
ba64551
add new meta_prompt
zhaochen20 Jul 18, 2023
7f3d89f
Merge branch 'dataset_test_modification' into refactor_callback
zhaochen20 Jul 18, 2023
d71addc
add new meta examples
zhaochen20 Jul 19, 2023
2da9157
Merge branch 'dataset_test_modification' into refactor_callback
zhaochen20 Jul 19, 2023
e22b46e
change temperature
zhaochen20 Jul 19, 2023
faafe34
add new paramerters for generator
zhaochen20 Jul 19, 2023
cc70ed9
fix lint
zhaochen20 Jul 19, 2023
57f3d76
fix lint
zhaochen20 Jul 19, 2023
ed05b8e
add constraints to prompt too long
zhaochen20 Jul 19, 2023
28d1d11
Merge branch 'main' of github.com:viswavi/prompt2model
zhaochen20 Jul 20, 2023
4291046
fix confilcts
zhaochen20 Jul 20, 2023
4f442c5
use zeno in a batch
zhaochen20 Jul 20, 2023
78afc34
Merge branch 'use_zeno' into debug_autoregressive
zhaochen20 Jul 20, 2023
43c6079
fix confilits
zhaochen20 Jul 20, 2023
094dc70
Merge branch 'split_helper_of_model_create' into one_executor_fits_all
zhaochen20 Jul 20, 2023
3d4d450
Merge branch 'one_executor_fits_all' into fix_label_for_decoder_only
zhaochen20 Jul 20, 2023
44cc6f1
Merge branch 'fix_label_for_decoder_only' into split_model_trainer_test
zhaochen20 Jul 20, 2023
e6a12d9
fix lint
zhaochen20 Jul 20, 2023
d4d68cd
add gc.collect()
zhaochen20 Jul 20, 2023
fe6f20f
Merge branch 'split_model_trainer_test' into calculate_confidence
zhaochen20 Jul 20, 2023
a08b38f
add assertion for examples
zhaochen20 Jul 20, 2023
234ce93
fix conflicts
zhaochen20 Jul 20, 2023
f62a609
Merge branch 'refactor_evaluator' into refactor_executor
zhaochen20 Jul 20, 2023
c6a1741
fix conflicts
zhaochen20 Jul 20, 2023
72d712f
add error control
zhaochen20 Jul 21, 2023
f5c9ba7
Merge branch 'use_zeno' into debug_autoregressive
zhaochen20 Jul 21, 2023
3a78d11
Merge branch 'debug_autoregressive' into split_helper_of_model_create
zhaochen20 Jul 21, 2023
230de92
Merge branch 'split_helper_of_model_create' into split_model_trainer_…
zhaochen20 Jul 21, 2023
36b365f
Merge branch 'split_model_trainer_test' into calculate_confidence
zhaochen20 Jul 21, 2023
ed9adf1
Merge branch 'calculate_confidence' into refactor_evaluator
zhaochen20 Jul 21, 2023
096b053
Merge branch 'refactor_evaluator' into refactor_executor
zhaochen20 Jul 21, 2023
55420f2
Merge branch 'refactor_executor' into refactor_callback
zhaochen20 Jul 21, 2023
6c5a883
Merge branch 'refactor_callback' of github.com:neulab/prompt2model in…
zhaochen20 Jul 21, 2023
12b2335
Merge branch 'refactor_executor' of github.com:neulab/prompt2model in…
zhaochen20 Jul 21, 2023
b4dfce5
Merge branch 'refactor_executor' into refactor_callback
zhaochen20 Jul 21, 2023
5906386
fetech remote branch of `split_model_trainer_test`
zhaochen20 Jul 21, 2023
61a97a4
Merge to reduce abnormal diff
zhaochen20 Jul 21, 2023
af1a37f
Fetech error_handler
zhaochen20 Jul 21, 2023
53d9368
Fetch remote split_model_trainer_test
zhaochen20 Jul 21, 2023
981ceaa
Merge to fix abnormal diff
zhaochen20 Jul 21, 2023
eb387f2
Merge branch 'debug_autoregressive' of github.com:neulab/prompt2model…
zhaochen20 Jul 21, 2023
6ecc23a
Merge branch 'split_helper_of_model_create' of github.com:neulab/prom…
zhaochen20 Jul 21, 2023
9067cd0
Merge branch 'fix_label' of github.com:neulab/prompt2model into fix_l…
zhaochen20 Jul 21, 2023
be68175
Merge branch 'calculate_confidence' of github.com:neulab/prompt2model…
zhaochen20 Jul 21, 2023
35cabfd
fix error handle changes
zhaochen20 Jul 21, 2023
33f44c5
Merge branch 'refactor_evaluator' of github.com:neulab/prompt2model i…
zhaochen20 Jul 21, 2023
54d5efe
merge test files for trainer
zhaochen20 Jul 21, 2023
21fcbae
Merge branch 'refactor_executor' of github.com:neulab/prompt2model in…
zhaochen20 Jul 21, 2023
02b708e
Merge branch 'refactor_executor' of github.com:neulab/prompt2model in…
zhaochen20 Jul 21, 2023
f20f01e
fix conflicts
zhaochen20 Jul 21, 2023
397e75c
add tests for t5
zhaochen20 Jul 21, 2023
fa9b86d
add zeno's new release
zhaochen20 Jul 22, 2023
6ecb857
add multi-generation and debug
zhaochen20 Jul 22, 2023
26e1fc8
[set base parameter]
zhaochen20 Jul 23, 2023
87e9a15
add requests_per_minute and responses_per_requests
zhaochen20 Jul 23, 2023
e52c68e
set max_length to 500
zhaochen20 Jul 23, 2023
633e414
delecte wrong max_new_tokens in executor
zhaochen20 Jul 23, 2023
fd26123
distinguish high-equality and low-equality examples for generator
zhaochen20 Jul 23, 2023
63bd1fc
merge main. Fix readme
zhaochen20 Jul 26, 2023
533cf21
merge main. Fix readme
zhaochen20 Jul 26, 2023
f56e581
fetch remote branch of stress examples
zhaochen20 Jul 26, 2023
0d62481
add new member variables
zhaochen20 Jul 26, 2023
682045b
add input_output_map and multi-vote
zhaochen20 Jul 26, 2023
9b01717
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 5, 2023
ab195a0
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 8, 2023
aa66358
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 8, 2023
b8c9cf5
fix conflicts
zhaochen20 Aug 8, 2023
4c77c71
change the labeling logics
zhaochen20 Aug 8, 2023
eec2ad2
Merge branch 'fix_eos_label_adding' into fix_label
zhaochen20 Aug 8, 2023
b9cb3d5
add truncation warning for executor
zhaochen20 Aug 8, 2023
ed0362d
erge branch 'warning_executor' into fix_label
zhaochen20 Aug 8, 2023
57488a0
use execption in dataset generator
zhaochen20 Aug 8, 2023
3d6f0c6
use execption in dataset generator
zhaochen20 Aug 8, 2023
e43a794
use execption in dataset generator
zhaochen20 Aug 8, 2023
8cc6e60
merge main
zhaochen20 Aug 8, 2023
cbc48de
Merge branch 'calculate_confidence' into delete_unecessary_eos
zhaochen20 Aug 8, 2023
7ab073d
Merge branch 'delete_unecessary_eos' into refactor_evaluator
zhaochen20 Aug 8, 2023
ff8e73f
Merge branch 'refactor_evaluator' into refactor_executor
zhaochen20 Aug 8, 2023
a889528
merge main
zhaochen20 Aug 8, 2023
5859b73
merge main
zhaochen20 Aug 8, 2023
ee14c60
Merge branch 'refactor_callback' into add_multi_generation
zhaochen20 Aug 8, 2023
8893c5c
merge main
zhaochen20 Aug 8, 2023
b41b899
merge main
zhaochen20 Aug 8, 2023
5a4fdb0
Merge branch 'stress_original_examples' into add_new_member_variables
zhaochen20 Aug 8, 2023
f46ab8f
update docstring
Aug 8, 2023
f240ce4
fix review from graham
zhaochen20 Aug 8, 2023
aad558a
fix review from graham
zhaochen20 Aug 8, 2023
b35a3d7
Update tests/dataset_generator_with_filter_test.py
Aug 8, 2023
439a617
Merge branch 'add_new_member_variables' into input_output_map
zhaochen20 Aug 8, 2023
e18c042
use none stateful function
zhaochen20 Aug 8, 2023
a8b7d20
Merge branch 'input_output_map' of github.com:neulab/prompt2model int…
zhaochen20 Aug 8, 2023
2857b73
add new member variables (#198)
Aug 8, 2023
f25c905
use none stateful function for extract responses
zhaochen20 Aug 8, 2023
ecf8c4d
add new comments
zhaochen20 Aug 8, 2023
7e1d8d7
fix grammar
zhaochen20 Aug 8, 2023
98c6580
Update test_helpers/dataset_tools.py
Aug 8, 2023
8ad759c
Update test_helpers/dataset_tools.py
Aug 8, 2023
4980938
Merge branch 'input_output_map' of github.com:neulab/prompt2model int…
zhaochen20 Aug 8, 2023
97a693d
fix lint
zhaochen20 Aug 8, 2023
1c023eb
fix wrong comments
zhaochen20 Aug 8, 2023
d4612fb
Update prompt2model/dataset_generator/openai_gpt.py
Aug 8, 2023
df1e8cd
Merge branch 'input_output_map' of github.com:neulab/prompt2model int…
zhaochen20 Aug 8, 2023
315d198
fix lint
zhaochen20 Aug 8, 2023
bc5c993
[fetch]
zhaochen20 Aug 10, 2023
ced20c6
fix grammar error
zhaochen20 Aug 10, 2023
89901c9
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 11, 2023
cf44387
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 11, 2023
2a7328e
[fetch]
zhaochen20 Aug 11, 2023
7dea621
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 11, 2023
4c0594b
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 11, 2023
2b924b4
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 11, 2023
3aef8ac
fix Graham's review
zhaochen20 Aug 11, 2023
2226b97
fix conflict
zhaochen20 Aug 11, 2023
118b0d9
fix conflict
zhaochen20 Aug 11, 2023
86175a3
fix conflict
zhaochen20 Aug 11, 2023
6172ded
Merge branch 'main' of github.com:neulab/prompt2model
zhaochen20 Aug 11, 2023
a1462ce
fix conflict
zhaochen20 Aug 11, 2023
225 changes: 170 additions & 55 deletions prompt2model/dataset_generator/openai_gpt.py
@@ -95,33 +95,12 @@ def __init__(
self.requests_per_minute = requests_per_minute
self.filter_duplicated_examples = filter_duplicated_examples
self.cache_root = Path(cache_root)
# This list stores all generated examples. These will later be
# converted into `generated_dataset` and `input_output_map`
# if `filter_duplicated_examples` is True.
self.generated_examples: list[Example] = []

# `generated_examples` will be transformed into `generated_dataset`.
# If `filter_duplicated_examples` is True, `generated_examples` will
# be filtered based on multi-votes before being used to construct
# `generated_dataset`. If it's False, `generated_examples` will be
# used directly to construct `generated_dataset`.
self.generated_dataset: Dataset = Dataset.from_dict({})

# If `filter_duplicated_examples` is True, `self.generated_examples`
# will first be converted into `input_output_map`, and then into
# `generated_dataset`. If it's False, `input_output_map` will remain
# empty.
self.input_output_map: dict[str, Counter] = defaultdict(Counter)

# `generating_split` refers to the DatasetSplit currently being
# generated. After each loop, `generated_examples` will be
# stored as a Dataset at the path `{cache_root}/{generating_split}`.
self.generating_split: DatasetSplit | None = None

def generate_prompt(
self,
instruction: str,
few_shot_example_string: str = None,
few_shot_example_string: str,
generated_examples: list[Example],
) -> str:
"""Generates a prompt string.

@@ -130,43 +109,44 @@ def generate_prompt(
few_shot_example_string: A string representing the few-shot examples
parsed from the user's prompt, whose quality is higher than the
generated examples.
generated_examples: A list of currently generated examples.

Returns:
The generated prompt string.
"""
# The random_example_string is a string, which contains several random
# few-shot examples as demonstrations for the DatasetGenerator. If
# self.generated_examples is empty, then the random_example_string
# generated_examples is empty, then the random_example_string
# is the few-shot examples parsed from the user's prompt.
while True:
if len(self.generated_examples) == 0:
if len(generated_examples) == 0:
low_quality_example_string = "N/A\n"
# Create default low_quality_example_string if self.generated_examples
# Create default low_quality_example_string if generated_examples
# is empty. few_shot_example_string is the high-quality few-shot
examples parsed from the user's prompt. But if the user does not
provide any examples in the input prompt, the few_shot_example_string
# will be "N/A"/""/None.
random_selected_generated_example_num = 0
# random_selected_generated_example_num is the number of selected
# random examples from self.generated_examples that will be used to
# random examples from generated_examples that will be used to
# create the low_quality_example_string. If generated_examples
# is empty, then random_selected_generated_example_num is 0.
else:
# If self.generated_examples is not empty, low_quality_example_string
# is sveral random generated examples from self.generated_examples.
# If generated_examples is not empty, low_quality_example_string
# is several random generated examples from generated_examples.

low_quality_example_string = ""
random_selected_generated_example_num = random.randint(
1, min(len(self.generated_examples), 10)
1, min(len(generated_examples), 10)
)
# random_selected_generated_example_num is the number of selected
# random examples from self.generated_examples that will
# random examples from generated_examples that will
# be concatenated to create low_quality_example_string.
random_examples = random.sample(
self.generated_examples, random_selected_generated_example_num
generated_examples, random_selected_generated_example_num
)
# If generated_examples is not empty, then choose several
# random examples from self.generated_examples to construct
# random examples from generated_examples to construct
# new low_quality_example_string.
for example in random_examples:
low_quality_example_string += (
@@ -194,21 +174,161 @@
else:
continue

def extract_responses(self, completions: list[openai.Completion]) -> None:
def construct_input_output_map(
self,
generated_examples: list[Example],
) -> dict[str, Counter]:
"""Constructs a dictionary mapping inputs to `Counter` objects of outputs.

Args:
generated_examples: A list of currently generated examples.

Ideally, each input should have a unique output (one-to-one mapping).
However, language models may occasionally generate different outputs
for identical inputs. For instance, given the input “What is the biggest
city in China?”, it might produce different but correct outputs such as
“Shanghai” and “The biggest city in China is Shanghai”. At other times,
it may produce incorrect variations. For the input “What is the chemical
symbol of gold?”, the outputs might be “Au”, “Au”, and “AU”, where the
last one is wrong due to capitalization.

To address this, OpenAIDataSetGenerator uses a two-step multi-vote
filtering mechanism. This function represents the first step, creating a
dictionary to map inputs to a `Counter` of their outputs.

The function iterates over all the examples, building a dictionary where
inputs serve as keys and `Counter` objects as values. The `Counter`
tracks the frequency of each output for a specific input.

For example:
input: ["apple", "banana", "apple", "orange", "apple"]
output: ["A", "B", "A", "O", "D"]

Then the input_output_map is:
{
"apple": Counter({"A": 2, "D": 1}),
"banana": Counter({"B": 1}),
"orange": Counter({"O": 1})
}
"""
input_output_map: dict[str, Counter] = defaultdict(Counter)

# Iterate through the examples and construct the mapping.
for example in generated_examples:
input_str = example.input_col
output_str = example.output_col

# Increment the count of the output for the specific input.
input_output_map[input_str][output_str] += 1

# Ensure that the generated_examples list is not empty
# and the map is constructed correctly.
if len(generated_examples) != 0:
assert input_output_map

return input_output_map
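
Pulled out of the class, the mapping step above boils down to a few lines. The following is a minimal standalone sketch, not the PR's exact code: the `Example` namedtuple here is a stand-in for prompt2model's own `Example` type.

```python
from collections import Counter, defaultdict, namedtuple

# Hypothetical stand-in for prompt2model's Example type.
Example = namedtuple("Example", ["input_col", "output_col"])

def construct_input_output_map(generated_examples):
    """Map each input string to a Counter of its generated outputs."""
    input_output_map = defaultdict(Counter)
    for example in generated_examples:
        # Increment the count of this output for this input.
        input_output_map[example.input_col][example.output_col] += 1
    return input_output_map

examples = [
    Example("apple", "A"),
    Example("banana", "B"),
    Example("apple", "A"),
    Example("orange", "O"),
    Example("apple", "D"),
]
mapping = construct_input_output_map(examples)
# mapping["apple"] == Counter({"A": 2, "D": 1})
```

Because `defaultdict(Counter)` creates a fresh `Counter` on first access, no key-existence checks are needed in the loop.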

def apply_multi_vote_to_construct_generated_dataset(
self, input_output_map: dict[str, Counter]
) -> Dataset:
"""Multi-vote to construct generated_dataset from input_output_map.

Args:
input_output_map: A dictionary mapping inputs to `Counter` objects
of their outputs.

This method uses multi-vote filtering to create a unique mapping from inputs
to outputs. The input_col of generated_dataset contains unique inputs,
while the output_col holds the shortest, most frequent output for the
corresponding input.

The function asserts that self.filter_duplicated_examples is True and that
input_output_map is not None when generated_examples is not
empty. It then iterates over input_output_map, finding the most frequent
output for each input. If there are multiple outputs with the highest frequency,
it selects the shortest one. If there are multiple shortest outputs with the
highest frequency, it selects the one that comes first in lexicographical
(alphabetical) order.

Example:
Suppose input_output_map is:
{
"apple": Counter({"A": 2, "D": 2}),
"banana": Counter({"B": 2, "C": 1}),
"orange": Counter({"O": 1})
}

The function will produce generated_dataset:
{
"input_col": ["apple", "banana", "orange"],
"output_col": ["A", "B", "O"]
}

Note: When generated_examples is empty, both input_output_map
and generated_dataset will be empty.

Returns:
Currently generated dataset with multi-vote filtering applied.
"""
# Ensure that multi-vote filtering is enabled.
assert self.filter_duplicated_examples

filtered_inputs = []
filtered_outputs = []

for input_str, output_counter in input_output_map.items():
# Find the most frequent output count.
most_common_count = output_counter.most_common(1)[0][1]

# Get all the outputs that have the most common count.
most_frequent_outputs = [
output
for output, count in output_counter.items()
if count == most_common_count
]

# Sort the outputs based on their lengths and select
# the shortest ones. When several outputs have the
# same length with the highest frequency, they will
# be sorted in their lexicographical (alphabetical) order.
most_frequent_outputs.sort(key=len)
final_output = most_frequent_outputs[0]

filtered_inputs.append(input_str)
filtered_outputs.append(final_output)

# Note that when `generated_examples` is empty,
# `input_output_map` is None, and `generated_dataset`
# will also be empty.
generated_dataset = Dataset.from_dict(
{"input_col": filtered_inputs, "output_col": filtered_outputs}
)

# `generated_examples` will be transformed into `generated_dataset`.
# If `filter_duplicated_examples` is True, `generated_examples` will
# be filtered based on multi-votes before being used to construct
# `generated_dataset`. If it's False, `generated_examples` will be
# used directly to construct `generated_dataset`.
return generated_dataset
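
The tie-breaking rule described above (most frequent output, then shortest, then first in alphabetical order) can be exercised independently of the `Dataset` machinery. A hedged sketch — the function name `multi_vote` is chosen here for illustration and is not from the PR:

```python
from collections import Counter

def multi_vote(input_output_map):
    """Pick, per input, the most frequent output; break ties by
    shortest length, then by lexicographical order."""
    filtered = {}
    for input_str, counter in input_output_map.items():
        top_count = counter.most_common(1)[0][1]
        candidates = [o for o, c in counter.items() if c == top_count]
        # Python's sort is stable: sorting lexicographically first,
        # then taking the min by length, yields the shortest output,
        # with alphabetical order breaking length ties.
        filtered[input_str] = min(sorted(candidates), key=len)
    return filtered

votes = {
    "apple": Counter({"A": 2, "D": 2}),
    "banana": Counter({"B": 2, "C": 1}),
}
# multi_vote(votes) == {"apple": "A", "banana": "B"}
```

For the Shanghai example from the docstring, `Counter({"Shanghai": 1, "The biggest city in China is Shanghai": 1})` resolves to `"Shanghai"`: both outputs tie on frequency, so the shorter one wins.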

def extract_responses(
self, completions: list[openai.Completion], generated_examples: list[Example]
) -> list[Example]:
"""Extracts the generated sample and annotation from an OpenAI API response.

Args:
completions: The generated completion objects returned by OpenAI API.
generated_examples: Currently generated examples of DatasetGenerator.

Returns:
A list of `Example` objects.
Each API call will return `responses_per_request` completion objects.
If the response is a valid JSON object, create a namedtuple called
`example` and append it to self.generated_examples. `example` consists
`example` and append it to generated_examples. `example` consists
of `input_col` and`output_col`, where:
- input_col is the generated example string extracted from the response.
- output_col is the generated label string extracted from the response.
If the response is not a valid JSON object, discard it.
There is 5 * len(completions) responses at a time.
There are responses_per_request * len(completions) responses at a time.
"""
for completion in completions:
try:
@@ -230,14 +350,15 @@ def extract_responses(self, completions: list[openai.Completion]) -> None:
continue
input = str(response_json["input"]).strip()
output = str(response_json["output"]).strip()
self.generated_examples.append(Example(input, output))
generated_examples.append(Example(input, output))
logging.info(f"input: \n\n{input}\n\n")
logging.info(f"output: \n\n{output}\n\n")
except Exception:
logging.warning(
f"Error happened when parsing API completion: {completion}"
)
continue
return generated_examples
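
The parse-or-discard logic above is easy to exercise without real `openai.Completion` objects. A sketch over raw content strings — the helper name and sample inputs are illustrative, not from the PR:

```python
import json

def extract_valid_examples(raw_contents):
    """Keep only contents that parse as a JSON object with 'input'
    and 'output' fields; silently discard everything else."""
    examples = []
    for content in raw_contents:
        try:
            response_json = json.loads(content)
            examples.append(
                (str(response_json["input"]).strip(),
                 str(response_json["output"]).strip())
            )
        except (json.JSONDecodeError, KeyError, TypeError):
            # Invalid JSON, a non-object, or missing fields: discard.
            continue
    return examples

contents = [
    '{"input": "What is 2+2?", "output": "4"}',
    'not json at all',
    '{"input": "capital of France", "output": "Paris"}',
]
# Only the first and third entries survive.
```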

async def generate_responses(
self, chat_api: ChatGPTAgent, prompts: list[str]
@@ -280,9 +401,9 @@ def generate_dataset_split(
"""
_ = split # suppress unused variable warnings
chat_api = ChatGPTAgent(self.api_key)
self.generated_examples = []
generated_examples: list[Example] = []
pbar = tqdm(total=expected_num_examples, desc="Generating examples")
while len(self.generated_examples) < expected_num_examples:
while len(generated_examples) < expected_num_examples:
try:
if self.max_api_calls and self.api_call_counter >= self.max_api_calls:
logging.warning("Maximum number of API calls reached.")
self.batch_size,
math.ceil(
(
(
expected_num_examples
- len(self.generated_examples)
)
(expected_num_examples - len(generated_examples))
/ self.responses_per_request
)
),
self.batch_size,
math.ceil(
(
(
expected_num_examples
- len(self.generated_examples)
)
(expected_num_examples - len(generated_examples))
/ self.responses_per_request
)
),
self.generate_prompt(
instruction=prompt_spec.instruction,
few_shot_example_string=prompt_spec.examples,
generated_examples=generated_examples,
)
for _ in range(batch_size)
]
responses = loop.run_until_complete(
self.generate_responses(chat_api, prompts)
)
self.extract_responses(responses)
pbar.update(len(self.generated_examples) - pbar.n)
generated_examples = self.extract_responses(
responses, generated_examples=generated_examples
)
pbar.update(len(generated_examples) - pbar.n)
except OPENAI_ERRORS as e:
self.api_call_counter = handle_openai_error(e, self.api_call_counter)
# Each API call will return `responses_per_request` completion
# objects. The upper bound of the length of generated dataset
# is expected_num_examples + responses_per_request.
assert (
len(self.generated_examples)
< expected_num_examples + self.responses_per_request
len(generated_examples) < expected_num_examples + self.responses_per_request
)
return Dataset.from_dict(
{
"input_col": [example.input_col for example in self.generated_examples],
"output_col": [
example.output_col for example in self.generated_examples
],
"input_col": [example.input_col for example in generated_examples],
"output_col": [example.output_col for example in generated_examples],
}
)
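
The batch-size arithmetic in `generate_dataset_split` deserves a worked example: each API request yields `responses_per_request` completions, so each loop iteration asks for just enough requests to cover the remaining examples, capped at the configured batch size. A standalone sketch (the function name is illustrative):

```python
import math

def next_batch_size(max_batch_size, expected_num_examples,
                    num_generated, responses_per_request):
    """Number of prompts for the next API round: enough requests to
    cover the remaining examples, capped at the batch size."""
    remaining = expected_num_examples - num_generated
    return min(max_batch_size,
               math.ceil(remaining / responses_per_request))

# With 100 examples expected, 37 already generated, and 5 responses
# per request: ceil(63 / 5) = 13 requests, under a cap of 20 -> 13.
```

This is also why the post-loop assertion bounds the dataset at `expected_num_examples + responses_per_request`: the final round can overshoot by at most one request's worth of completions.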
10 changes: 8 additions & 2 deletions test_helpers/__init__.py
@@ -1,7 +1,11 @@
"""Import mock classes used in unit tests."""
from test_helpers.dataset_tools import (
are_dataset_dicts_identical,
are_datasets_identical,
)
from test_helpers.mock_openai import (
MockCompletion,
mock_batch_openai_response,
mock_batch_openai_response_with_identical_completions,
mock_one_openai_response,
)
from test_helpers.model_and_tokenizer import (
"create_gpt2_model_and_tokenizer",
"create_t5_model_and_tokenizer",
"mock_one_openai_response",
"mock_batch_openai_response",
"mock_batch_openai_response_with_identical_completions",
"are_dataset_dicts_identical",
"are_datasets_identical",
)