Skip to content

Commit

Permalink
[deploy] Merge pull request #99 from microsoft/dev
Browse files Browse the repository at this point in the history
v0.1.6: multi-table data formulation support
  • Loading branch information
Chenglong-MS authored Feb 20, 2025
2 parents 0e1f215 + 92c2262 commit 2c2ca4d
Show file tree
Hide file tree
Showing 16 changed files with 1,122 additions and 615 deletions.
8 changes: 6 additions & 2 deletions py-src/data_formulator/agents/agent_data_rec.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
(4) "visualization_fields" should be no more than 3 (for x,y,legend).
(5) "chart_type" must be one of "point", "bar", "line", or "boxplot"
2. Then, write a python function based on the inferred goal, the function input is a dataframe "df" and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal.
2. Then, write a python function based on the inferred goal, the function input is a dataframe "df" (or multiple dataframes based on tables presented in the [CONTEXT] section) and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal.
The python function must follow the template provided in [TEMPLATE], do not import any other libraries or modify function name. The function should be as simple as possible and easily readable.
If there is no data transformation needed based on "output_fields", the transformation function can simply "return df".
Expand All @@ -63,11 +63,15 @@
import collections
import numpy as np
def transform_data(df):
def transform_data(df1, df2, ...):
# complete the template here
return transformed_df
```
note:
- if the user provided one table, then it should be def transform_data(df1), if the user provided multiple tables, then it should be def transform_data(df1, df2, ...) and you should consider the join between tables to derive the output.
- try to use table names to refer to the input dataframes, for example, if the user provided two tables city and weather, you can use `transform_data(df_city, df_weather)` to refer to the two dataframes.
3. The [OUTPUT] must only contain a json object representing the refined goal and a python code block representing the transformation code, do not add any extra text explanation.
'''

Expand Down
24 changes: 21 additions & 3 deletions py-src/data_formulator/agents/agent_data_transform_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
}
```
2. Then, write a python function based on the refined goal, the function input is a dataframe "df" and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal.
2. Then, write a python function based on the refined goal, the function input is a dataframe "df" (or multiple dataframes based on tables presented in the [CONTEXT] section) and the output is the transformed dataframe "transformed_df". "transformed_df" should contain all "output_fields" from the refined goal.
The python function must follow the template provided in [TEMPLATE], do not import any other libraries or modify function name. The function should be as simple as possible and easily readable.
If there is no data transformation needed based on "output_fields", the transformation function can simply "return df".
Expand All @@ -56,11 +56,15 @@
import collections
import numpy as np
def transform_data(df):
def transform_data(df1, df2, ...):
# complete the template here
return transformed_df
```
note:
- if the user provided one table, then it should be def transform_data(df1), if the user provided multiple tables, then it should be def transform_data(df1, df2, ...) and you should consider the join between tables to derive the output.
- try to use table names to refer to the input dataframes, for example, if the user provided two tables city and weather, you can use `transform_data(df_city, df_weather)` to refer to the two dataframes.
3. The [OUTPUT] must only contain a json object representing the refined goal (including "detailed_instruction", "output_fields", "visualization_fields" and "reason") and a python code block representing the transformation code, do not add any extra text explanation.
'''

Expand Down Expand Up @@ -226,6 +230,10 @@ def process_gpt_response(self, input_tables, messages, response):
if len(code_blocks) > 0:
code_str = code_blocks[-1]

for table in input_tables:
logger.info(f"Table: {table['name']}")
logger.info(table['rows'])

try:
result = py_sandbox.run_transform_in_sandbox2020(code_str, [t['rows'] for t in input_tables])
result['code'] = code_str
Expand Down Expand Up @@ -254,7 +262,16 @@ def process_gpt_response(self, input_tables, messages, response):
return candidates


def run(self, input_tables, description, expected_fields: list[str], n=1):
def run(self, input_tables, description, expected_fields: list[str], prev_messages: list[dict] = [], n=1):

if len(prev_messages) > 0:
logger.info("=== Previous messages ===>")
formatted_prev_messages = ""
for m in prev_messages:
if m['role'] != 'system':
formatted_prev_messages += f"{m['role']}: \n\n\t{m['content']}\n\n"
logger.info(formatted_prev_messages)
prev_messages = [{"role": "user", "content": '[Previous Messages] Here are the previous messages for your reference:\n\n' + formatted_prev_messages}]

data_summary = generate_data_summary(input_tables, include_data_samples=True)

Expand All @@ -268,6 +285,7 @@ def run(self, input_tables, description, expected_fields: list[str], n=1):
logger.info(user_query)

messages = [{"role":"system", "content": self.system_prompt},
*prev_messages,
{"role":"user","content": user_query}]

response = completion_response_wrapper(self.client, messages, n)
Expand Down
39 changes: 30 additions & 9 deletions py-src/data_formulator/agents/client_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import os
from litellm import completion
import litellm
import openai
from azure.identity import DefaultAzureCredential, get_bearer_token_provider


class Client(object):
"""
Returns a LiteLLM client configured for the specified endpoint and model.
Expand All @@ -15,11 +15,17 @@ def __init__(self, endpoint, model, api_key=None, api_base=None, api_version=No

# other params, including temperature, max_completion_tokens, api_base, api_version
self.params = {
"api_key": api_key,
"temperature": 0.7,
"max_completion_tokens": 1200,
}

if api_key is not None and api_key != "":
self.params["api_key"] = api_key
if api_base is not None and api_base != "":
self.params["api_base"] = api_base
if api_version is not None and api_version != "":
self.params["api_version"] = api_version

if self.endpoint == "gemini":
if model.startswith("gemini/"):
self.model = model
Expand Down Expand Up @@ -53,9 +59,24 @@ def get_completion(self, messages):
Supports OpenAI, Azure, Ollama, and other providers via LiteLLM.
"""
# Configure LiteLLM
return completion(
model=self.model,
messages=messages,
drop_params=True,
**self.params
)

if self.endpoint == "openai":
client = openai.OpenAI(
api_key=self.params["api_key"],
base_url=self.params["api_base"] if "api_base" in self.params else None,
timeout=120
)

return client.chat.completions.create(
model=self.model,
messages=messages,
temperature=self.params["temperature"],
max_tokens=self.params["max_completion_tokens"],
)
else:
return litellm.completion(
model=self.model,
messages=messages,
drop_params=True,
**self.params
)
7 changes: 6 additions & 1 deletion py-src/data_formulator/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,11 @@ def derive_data():
new_fields = content["new_fields"]
instruction = content["extra_prompt"]

if "additional_messages" in content:
prev_messages = content["additional_messages"]
else:
prev_messages = []

print("spec------------------------------")
print(new_fields)
print(instruction)
Expand All @@ -439,7 +444,7 @@ def derive_data():
results = agent.run(input_tables, instruction)
else:
agent = DataTransformationAgentV2(client=client)
results = agent.run(input_tables, instruction, [field['name'] for field in new_fields])
results = agent.run(input_tables, instruction, [field['name'] for field in new_fields], prev_messages)

repair_attempts = 0
while results[0]['status'] == 'error' and repair_attempts == 0: # only try once
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "data_formulator"
version = "0.1.5.1"
version = "0.1.6"

requires-python = ">=3.9"
authors = [
Expand Down
Loading

0 comments on commit 2c2ca4d

Please sign in to comment.