83 changes: 83 additions & 0 deletions codemeticulous/ai_convert.py
@@ -0,0 +1,83 @@
import json
import os

from litellm import completion

from codemeticulous.codemeta.models import CodeMeta
from codemeticulous.datacite.models import DataCite
from codemeticulous.cff.models import CitationFileFormat

STANDARDS = {
    "codemeta": {
        "model": CodeMeta,
        "format": "json",
    },
    "datacite": {
        "model": DataCite,
        "format": "json",
    },
    "cff": {
        "model": CitationFileFormat,
        "format": "yaml",
    },
}

def convert_ai(model: str, key: str, source_format: str, target_format: str, source_data):
    """
    Automate metadata standard conversion using an LLM and the canonical representation.

    Args:
        model: LLM model string (e.g., "openrouter/openai/gpt-4o")
        key: API key for the LLM provider
        source_format: string name of the source metadata standard
        target_format: string name of the target metadata standard
        source_data: dict or pydantic.BaseModel instance representing the source metadata
    """

    # Build prompt messages using the pydantic schemas and the source data
    source_model = STANDARDS[source_format]["model"]
    target_model = STANDARDS[target_format]["model"]

    # Coerce the source data into a validated pydantic model instance
    if isinstance(source_data, dict):
        source_instance = source_model(**source_data)
    elif isinstance(source_data, source_model):
        source_instance = source_data
    else:
        raise TypeError(
            f"source_data must be a dict or {source_model.__name__} instance, "
            f"got {type(source_data).__name__}"
        )

    messages = prompt_generator(source_instance, source_model, target_model)

    # FIXME: adjust configuration for LiteLLM's standard env var lookup
    os.environ["OPENROUTER_API_KEY"] = key

    # Call the LLM via litellm's completion function
    # TODO: add guardrails to validate the model string before attempting a completion call
    response = completion(
        model=model,
        messages=messages,
    )

    # Extract the assistant's text from the LLM response
    assistant_text = response.choices[0].message.content

    # TODO: parse and validate the response so a target model instance can be returned
    print("LLM response:", assistant_text)
    return assistant_text


def prompt_generator(source_instance, source_model, target_model) -> list:
    # System message: pin the model to strict, JSON-only conversion behaviour
    system = (
        "You are a metadata conversion assistant using strictly the source and target schema models. "
        "ALWAYS return JSON only. Do not include any explanatory text outside the JSON.\n"
        "If a source property cannot be mapped, add it to 'unmapped_properties' and explain in 'unmapped_explanations'.\n"
        "RESPONSE FORMAT (JSON only):\n"
        '{"converted": {...}, "unmapped_properties": ["property"], "unmapped_explanations": {"property": "reason"}}\n'
    )

    user = (  # User query carrying the data to convert plus both schemas
        "Convert the SOURCE_DATA to match TARGET_MODEL_SCHEMA.\n"
        "SOURCE_DATA:\n" + source_instance.model_dump_json() + "\n\n"
        "SOURCE_MODEL_SCHEMA:\n" + json.dumps(source_model.model_json_schema()) + "\n\n"
        "TARGET_MODEL_SCHEMA:\n" + json.dumps(target_model.model_json_schema()) + "\n\n"
    )

    return [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
A collaborator commented on this hunk:
I bet context/docs (definitions of terms, etc.) for the schemas might help with results

we could put a description for all the fields in the pydantic models so that it would end up in the jsonschema from schema(), but this may be a lot of work, even with an initial pass of having AI fill them out from the docs below

alternatively, just throw in the full text of the documentation

https://codemeta.github.io/terms/
https://datacite-metadata-schema.readthedocs.io/en/4.6/
https://github.com/citation-file-format/citation-file-format/blob/main/schema-guide.md
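
A minimal sketch of the field-description idea, assuming pydantic v2 (which the project already pins): descriptions attached via Field() are carried into the generated JSON schema, so the LLM would see them alongside the property names. The model and fields below are illustrative, not the real codemeticulous models.

from pydantic import BaseModel, Field

class ExampleSource(BaseModel):
    # Illustrative only -- the real models live in codemeticulous.*.models
    name: str = Field(description="The name of the software")
    version: str | None = Field(default=None, description="The release version identifier")

# The description lands under the matching property in the JSON schema:
print(ExampleSource.model_json_schema()["properties"]["name"]["description"])
# -> The name of the software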

82 changes: 81 additions & 1 deletion codemeticulous/cli.py
@@ -5,7 +5,7 @@
import yaml

from codemeticulous.convert import STANDARDS, convert as _convert
from codemeticulous.ai_convert import convert_ai as _convert_ai

@click.group()
def cli():
@@ -137,3 +137,83 @@ def load_file_autodetect(file_path):
raise ValueError(f"Unsupported file extension: {ext}.")
except Exception as e:
raise ValueError(f"Failed to load file: {file_path}. {str(e)}")

@cli.command()
@click.option(
    "-m",
    "--model",
    "llm_model",
    type=str,
    required=True,
    help="LLM model to use for conversion (e.g., 'openrouter/openai/gpt-4o')",
)
@click.option(
    "-k",
    "--key",
    "api_key",
    type=str,
    required=True,
    help="API key for LLM authorization",
)
@click.option(
    "-f",
    "--from",
    "source_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Source format",
)
@click.option(
    "-t",
    "--to",
    "target_format",
    type=click.Choice(STANDARDS.keys()),
    required=True,
    help="Target format",
)
@click.option(
    "-o",
    "--output",
    "output_file",
    type=click.File("w"),
    default=None,
    help="Output file name (by default prints to stdout)",
)
@click.option(
    "-v",
    "--verbose",
    is_flag=True,
    default=False,
    help="Print verbose output",
)
@click.argument("input_file", type=click.Path(exists=True))
# NOTE: parameter names must match the click option destinations ("llm_model", "api_key")
def ai_convert(llm_model: str, api_key: str, source_format: str, target_format: str, input_file, output_file, verbose):
    try:
        input_data = load_file_autodetect(input_file)
    except Exception as e:
        click.echo(f"Failed to load file: {input_file}. {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    try:
        converted_data = _convert_ai(llm_model, api_key, source_format, target_format, input_data)
    except Exception as e:
        click.echo(f"Error during AI-assisted conversion: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    output_format = STANDARDS[target_format]["format"]

    try:
        output_data = dump_data(converted_data, output_format)
    except Exception as e:
        click.echo(f"Error during serialization: {str(e)}", err=True)
        if verbose:
            traceback.print_exc()
        return

    if output_file:
        output_file.write(output_data)
        click.echo(f"Data written to {output_file.name}")
    else:
        click.echo(output_data)
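
For reference, an invocation of the new command might look like the following; this is illustrative and assumes the installed console script is named codemeticulous and that click 8 derives the subcommand name ai-convert from the function name:

codemeticulous ai-convert \
    -m openrouter/openai/gpt-4o \
    -k "$OPENROUTER_API_KEY" \
    -f codemeta -t cff \
    -o CITATION.cff codemeta.json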
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -6,6 +6,7 @@ readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"click>=8.1.7",
"litellm>=1.77.1",
"pydantic2-schemaorg==0.2.0",
"pydantic>=2.9.2",
"pyyaml>=6.0.2",
@@ -30,4 +31,4 @@ include-package-data = false

[tool.setuptools.packages.find]
include = ["codemeticulous", "codemeticulous.*"]
exclude = ["tests*", "schema"]
exclude = ["tests*", "schema"]