[deploy] Merge pull request #99 from microsoft/dev

v0.1.6: multi-table data formulation support
microsoft · Feb 20, 2025 · 2c2ca4d
2 parents 0e1f215 + 92c2262
commit 2c2ca4d
Show file tree

Hide file tree

Showing 16 changed files with 1,122 additions and 615 deletions.
diff --git a/py-src/data_formulator/agents/agent_data_rec.py b/py-src/data_formulator/agents/agent_data_rec.py
@@ -52,7 +52,7 @@
     (4) "visualization_fields" should be no more than 3 (for x,y,legend).
     (5) "chart_type" must be one of "point", "bar", "line", or "boxplot"
 
-    2. Then, write a python function based on the inferred goal, the function input is a dataframe "df" and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal.
+    2. Then, write a python function based on the inferred goal, the function input is a dataframe "df" (or multiple dataframes based on tables presented in the [CONTEXT] section) and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal.
 The python function must follow the template provided in [TEMPLATE], do not import any other libraries or modify function name. The function should be as simple as possible and easily readable. 
 If there is no data transformation needed based on "output_fields", the transformation function can simply "return df".
 
@@ -63,11 +63,15 @@
 import collections
 import numpy as np
 
-def transform_data(df):
+def transform_data(df1, df2, ...): 
     # complete the template here
     return transformed_df
 ```
 
+note: 
+- if the user provided one table, then it should be def transform_data(df1), if the user provided multiple tables, then it should be def transform_data(df1, df2, ...) and you should consider the join between tables to derive the output.
+- try to use table names to refer to the input dataframes, for example, if the user provided two tables city and weather, you can use `transform_data(df_city, df_weather)` to refer to the two dataframes.
+
     3. The [OUTPUT] must only contain a json object representing the refined goal and a python code block representing the transformation code, do not add any extra text explanation.
 '''
 

diff --git a/py-src/data_formulator/agents/agent_data_transform_v2.py b/py-src/data_formulator/agents/agent_data_transform_v2.py
@@ -45,7 +45,7 @@
 }
 ```
 
-    2. Then, write a python function based on the refined goal, the function input is a dataframe "df" and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal.
+    2. Then, write a python function based on the refined goal, the function input is a dataframe "df" (or multiple dataframes based on tables presented in the [CONTEXT] section) and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal.
 The python function must follow the template provided in [TEMPLATE], do not import any other libraries or modify function name. The function should be as simple as possible and easily readable.
 If there is no data transformation needed based on "output_fields", the transformation function can simply "return df".
 
@@ -56,11 +56,15 @@
 import collections
 import numpy as np
 
-def transform_data(df):
+def transform_data(df1, df2, ...): 
     # complete the template here
     return transformed_df
 ```
 
+note: 
+- if the user provided one table, then it should be def transform_data(df1), if the user provided multiple tables, then it should be def transform_data(df1, df2, ...) and you should consider the join between tables to derive the output.
+- try to use table names to refer to the input dataframes, for example, if the user provided two tables city and weather, you can use `transform_data(df_city, df_weather)` to refer to the two dataframes.
+
     3. The [OUTPUT] must only contain a json object representing the refined goal (including "detailed_instruction", "output_fields", "visualization_fields" and "reason") and a python code block representing the transformation code, do not add any extra text explanation.
 '''
 
@@ -226,6 +230,10 @@ def process_gpt_response(self, input_tables, messages, response):
             if len(code_blocks) > 0:
                 code_str = code_blocks[-1]
 
+                for table in input_tables:
+                    logger.info(f"Table: {table['name']}")
+                    logger.info(table['rows'])
+
                 try:
                     result = py_sandbox.run_transform_in_sandbox2020(code_str, [t['rows'] for t in input_tables])
                     result['code'] = code_str
@@ -254,7 +262,16 @@ def process_gpt_response(self, input_tables, messages, response):
         return candidates
 
 
-    def run(self, input_tables, description, expected_fields: list[str], n=1):
+    def run(self, input_tables, description, expected_fields: list[str], prev_messages: list[dict] = [], n=1):
+
+        if len(prev_messages) > 0:
+            logger.info("=== Previous messages ===>")
+            formatted_prev_messages = ""
+            for m in prev_messages:
+                if m['role'] != 'system':
+                    formatted_prev_messages += f"{m['role']}: \n\n\t{m['content']}\n\n"
+            logger.info(formatted_prev_messages)
+            prev_messages = [{"role": "user", "content": '[Previous Messages] Here are the previous messages for your reference:\n\n' + formatted_prev_messages}]
 
         data_summary = generate_data_summary(input_tables, include_data_samples=True)
 
@@ -268,6 +285,7 @@ def run(self, input_tables, description, expected_fields: list[str], n=1):
         logger.info(user_query)
 
         messages = [{"role":"system", "content": self.system_prompt},
+                    *prev_messages,
                     {"role":"user","content": user_query}]
 
         response = completion_response_wrapper(self.client, messages, n)

diff --git a/py-src/data_formulator/agents/client_utils.py b/py-src/data_formulator/agents/client_utils.py
@@ -1,8 +1,8 @@
 import os
-from litellm import completion
+import litellm
+import openai
 from azure.identity import DefaultAzureCredential, get_bearer_token_provider
 
-
 class Client(object):
     """
     Returns a LiteLLM client configured for the specified endpoint and model.
@@ -15,11 +15,17 @@ def __init__(self, endpoint, model, api_key=None,  api_base=None, api_version=No
 
         # other params, including temperature, max_completion_tokens, api_base, api_version
         self.params = {
-            "api_key": api_key,
             "temperature": 0.7,
             "max_completion_tokens": 1200,
         }
 
+        if api_key is not None and api_key != "":
+            self.params["api_key"] = api_key
+        if api_base is not None and api_base != "":
+            self.params["api_base"] = api_base
+        if api_version is not None and api_version != "":
+            self.params["api_version"] = api_version
+
         if self.endpoint == "gemini":
             if model.startswith("gemini/"):
                 self.model = model
@@ -53,9 +59,24 @@ def get_completion(self, messages):
         Supports OpenAI, Azure, Ollama, and other providers via LiteLLM.
         """
         # Configure LiteLLM 
-        return completion(
-            model=self.model,
-            messages=messages,
-            drop_params=True,
-            **self.params
-        )
+
+        if self.endpoint == "openai":
+            client = openai.OpenAI(
+                api_key=self.params["api_key"], 
+                base_url=self.params["api_base"] if "api_base" in self.params else None,
+                timeout=120
+            )
+
+            return client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                temperature=self.params["temperature"],
+                max_tokens=self.params["max_completion_tokens"],
+            )
+        else:
+            return litellm.completion(
+                model=self.model,
+                messages=messages,
+                drop_params=True,
+                **self.params
+            )
diff --git a/py-src/data_formulator/app.py b/py-src/data_formulator/app.py
@@ -425,6 +425,11 @@ def derive_data():
         new_fields = content["new_fields"]
         instruction = content["extra_prompt"]
 
+        if "additional_messages" in content:
+            prev_messages = content["additional_messages"]
+        else:
+            prev_messages = []
+
         print("spec------------------------------")
         print(new_fields)
         print(instruction)
@@ -439,7 +444,7 @@ def derive_data():
             results = agent.run(input_tables, instruction)
         else:
             agent = DataTransformationAgentV2(client=client)
-            results = agent.run(input_tables, instruction, [field['name'] for field in new_fields])
+            results = agent.run(input_tables, instruction, [field['name'] for field in new_fields], prev_messages)
 
         repair_attempts = 0
         while results[0]['status'] == 'error' and repair_attempts == 0: # only try once

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "data_formulator"
-version = "0.1.5.1"
+version = "0.1.6"
 
 requires-python = ">=3.9"
 authors = [