Skip to content

Commit

Permalink
Key value extraction improvements (#1573)
Browse files Browse the repository at this point in the history
* Added example for image key value extraction evaluation

Signed-off-by: Yoav Katz <[email protected]>

* Removed unneeded comments

Signed-off-by: Yoav Katz <[email protected]>

* Model key value extraction as a first class task in catalog

Signed-off-by: Yoav Katz <[email protected]>

* Added documentation link

Signed-off-by: Yoav Katz <[email protected]>

* Added default template to extraction task

Signed-off-by: Yoav Katz <[email protected]>

* Changed order of results printout for clarity

Signed-off-by: Yoav Katz <[email protected]>

* Update docs/docs/examples.rst

Co-authored-by: Elron Bandel <[email protected]>

* Ensure key values are strings as expected by the metric

Signed-off-by: Yoav Katz <[email protected]>

* Moved to more standard font across Linux and Mac

Signed-off-by: Yoav Katz <[email protected]>

* Moved to default font

Signed-off-by: Yoav Katz <[email protected]>

* Doc improvements

Signed-off-by: Yoav Katz <[email protected]>

* Improved parsing and testing of processing.

Signed-off-by: Yoav Katz <[email protected]>

* Changed key value extraction to use json serializer

Signed-off-by: Yoav Katz <[email protected]>

---------

Signed-off-by: Yoav Katz <[email protected]>
Co-authored-by: Elron Bandel <[email protected]>
  • Loading branch information
2 people authored and dafnapension committed Feb 6, 2025
1 parent d9e01f8 commit d21a884
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 6 deletions.
13 changes: 12 additions & 1 deletion prepare/templates/key_value_extraction/templates.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
from unitxt import add_to_catalog
from unitxt.processors import PostProcess
from unitxt.serializers import (
DictAsJsonSerializer,
ImageSerializer,
ListSerializer,
MultiTypeSerializer,
)
from unitxt.struct_data_operators import JsonStrToListOfKeyValuePairs
from unitxt.templates import (
InputOutputTemplate,
Expand All @@ -10,7 +16,12 @@
instruction="Extract the key value pairs from the input. Return a valid json object with the following keys: {keys}. Return only the json representation, no additional text or explanations.",
input_format="{input}",
output_format="{key_value_pairs_answer}",
postprocessors=[PostProcess(JsonStrToListOfKeyValuePairs())],
postprocessors=[
PostProcess(JsonStrToListOfKeyValuePairs()),
],
serializer=MultiTypeSerializer(
serializers=[ImageSerializer(), DictAsJsonSerializer(), ListSerializer()]
),
),
"templates.key_value_extraction.extract_in_json_format",
overwrite=True,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,19 @@
"__type__": "json_str_to_list_of_key_value_pairs"
}
}
]
],
"serializer": {
"__type__": "multi_type_serializer",
"serializers": [
{
"__type__": "image_serializer"
},
{
"__type__": "dict_as_json_serializer"
},
{
"__type__": "list_serializer"
}
]
}
}
8 changes: 8 additions & 0 deletions src/unitxt/serializers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
import io
import json
from abc import abstractmethod
from typing import Any, Dict, List, Union

Expand Down Expand Up @@ -61,6 +62,13 @@ def serialize(self, value: Any, instance: Dict[str, Any]) -> str:
return ", ".join(str(item) for item in value)


class DictAsJsonSerializer(SingleTypeSerializer):
    """Serializer for ``dict`` values that renders them as JSON text."""

    serialized_type = dict

    def serialize(self, value: Any, instance: Dict[str, Any]) -> str:
        """Return ``value`` encoded as a JSON string.

        Args:
            value: The object to encode; handed to ``json.dumps`` unchanged.
            instance: Part of the serializer signature; not used by this
                implementation.

        Returns:
            The JSON text produced by ``json.dumps(value)``.
        """
        json_text = json.dumps(value)
        return json_text


class DialogSerializer(SingleTypeSerializer):
serialized_type = Dialog

Expand Down
13 changes: 9 additions & 4 deletions src/unitxt/struct_data_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
{"key1": "value1", "key2": value2, "key3": "value3"}
"""

import ast
import json
import random
from abc import ABC, abstractmethod
Expand All @@ -43,6 +42,7 @@
from .operators import FieldOperator, InstanceOperator
from .random_utils import new_random_generator
from .serializers import ImageSerializer, TableSerializer
from .type_utils import isoftype
from .types import Table
from .utils import recursive_copy

Expand Down Expand Up @@ -1025,16 +1025,21 @@ def process_value(self, table: Any) -> Any:


class JsonStrToListOfKeyValuePairs(FieldOperator):
def process_value(self, text: str) -> List[Tuple[str, str]]:
text = text.replace("null", "None")
"""Convert a Json string of representing key value as dictionary to list of key value pairs."""

def process_value(self, text: str) -> List[Tuple[str, str]]:
try:
dict_value = ast.literal_eval(text)
dict_value = json.loads(text)
except Exception as e:
UnitxtWarning(
f"Unable to convert input text to json format in JsonStrToListOfKeyValuePairs due to {e}. Text: {text}"
)
dict_value = {}
if not isoftype(dict_value, Dict[str, Any]):
UnitxtWarning(
f"Unable to convert input text to dictionary in JsonStrToListOfKeyValuePairs. Text: {text}"
)
dict_value = {}
return [
(str(key), str(value))
for key, value in dict_value.items()
Expand Down
33 changes: 33 additions & 0 deletions tests/library/test_struct_data_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
DuplicateTableColumns,
DuplicateTableRows,
InsertEmptyTableRows,
JsonStrToListOfKeyValuePairs,
ListToKeyValPairs,
LoadJson,
MapHTMLTableToJSON,
Expand Down Expand Up @@ -709,3 +710,35 @@ def test_insert_empty_table_rows(self):
targets=targets,
tester=self,
)

def test_json_str_to_list_of_key_value_pairs(self):
    """Check JsonStrToListOfKeyValuePairs on valid, invalid and non-dict input.

    Each case pairs a raw ``prediction`` string with the list of
    ``(key, value)`` tuples the operator is expected to write back:

    * a JSON object: the key with a ``null`` value is absent from the
      expected pairs and the numeric value appears as a string;
    * text that is not JSON: expected result is an empty list;
    * valid JSON that is not an object: expected result is an empty list.
    """
    cases = [
        (
            """
{ "a": null , "b" : 3, "c" : "word" }
""",
            [("b", "3"), ("c", "word")],
        ),
        ("bad input", []),
        ("3", []),
    ]

    # A fresh operator is built for every case, one check_operator call each.
    for raw_prediction, expected_pairs in cases:
        check_operator(
            operator=JsonStrToListOfKeyValuePairs(field="prediction"),
            inputs=[{"prediction": raw_prediction}],
            targets=[{"prediction": expected_pairs}],
            tester=self,
        )

0 comments on commit d21a884

Please sign in to comment.