Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 67 additions & 2 deletions examples/mellea/agent/base_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@
ListItem,
SectionHeaderItem,
TableItem,
TableData,
TextItem,
TitleItem,
RefItem,
PictureItem,
ListGroup,
)
from docling_core.types.io import DocumentStream

Expand Down Expand Up @@ -139,7 +141,7 @@ def create_document_outline(doc: DoclingDocument) -> str:
return outline


def find_outline(text: str) -> DoclingDocument | None:
def find_outline_v1(text: str) -> DoclingDocument | None:
starts = ["paragraph", "list", "table", "figure", "picture"]

md = find_markdown_code_block(text)
Expand Down Expand Up @@ -175,9 +177,72 @@ def find_outline(text: str) -> DoclingDocument | None:
return None


def find_outline_v2(text: str) -> DoclingDocument | None:
    """Build an outline document from the markdown code block found in *text*.

    The markdown block is converted into a DoclingDocument. Title and section
    header items are left as they are. Every other text item must have the
    form ``"<label>: <summary>."`` where ``<label>`` is one of ``paragraph``,
    ``list``, ``table``, ``figure`` or ``picture``:

    * ``paragraph`` items keep their place and get ``summary`` set on them;
    * ``table`` items are replaced by an empty table carrying the summary;
    * ``figure`` / ``picture`` items are replaced by a picture carrying the
      summary;
    * ``list`` items are replaced by a group carrying the summary.

    Args:
        text: Raw text expected to contain a markdown code block.

    Returns:
        The converted document with placeholders substituted, or ``None`` when
        no markdown code block is found or when at least one text item does
        not follow the expected ``"<label>: <summary>."`` format (the
        offending lines are logged as an error).
    """
    starts = ["paragraph", "list", "table", "figure", "picture"]

    md = find_markdown_code_block(text)
    if not md:
        return None

    converter = DocumentConverter(allowed_formats=[InputFormat.MD])
    doc_stream = DocumentStream(name="tmp.md", stream=BytesIO(md.encode("utf-8")))
    conv: ConversionResult = converter.convert(doc_stream)
    doc = conv.document

    # Compile once instead of rebuilding the pattern for every item.
    alternatives = "|".join(starts)
    pattern = re.compile(rf"^({alternatives}):\s(.*)\.$", re.DOTALL)

    # Snapshot the items first: the loop below adds and replaces nodes in the
    # document, which must not happen while its iterator is still running.
    items = [item for item, _level in doc.iterate_items(with_groups=True)]

    lines = []  # text of items that do not follow the expected format
    for item in items:
        # Headings are checked before TextItem so they are skipped untouched.
        if isinstance(item, (TitleItem, SectionHeaderItem)):
            continue
        if not isinstance(item, TextItem):
            continue

        match = pattern.match(item.text)
        if match is None:
            lines.append(item.text)
            continue

        label, summary = match[1], match[2]

        if label == "paragraph":
            item.summary = summary
            continue

        if label == "table":
            data = TableData(table_cells=[], num_rows=0, num_cols=0)
            new_item = doc.add_table(label=DocItemLabel.TABLE, data=data)
        elif label in ("figure", "picture"):
            new_item = doc.add_picture()
        elif label == "list":
            new_item = doc.add_group()
        else:
            # Defensive only: the pattern restricts labels to `starts`.
            logger.warning(f"NOT SUPPORTED: {label}")
            continue

        new_item.summary = summary
        doc.replace_item(old_item=item, new_item=new_item)

    if lines:
        # Join outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError on Python < 3.12.
        offending = "\n".join(lines)
        message = f"Every content line should start with one out of the following choices: {starts}. The following lines need to be updated: {offending}"
        logger.error(message)
        return None

    return doc


def validate_outline_format(text: str) -> bool:
    """Return whether *text* contains an outline that ``find_outline_v2`` accepts.

    Args:
        text: Raw text expected to contain a markdown outline.

    Returns:
        ``True`` when ``find_outline_v2`` returns a document, ``False`` when it
        returns ``None``.
    """
    # Only the first 64 characters are logged to keep the log line short.
    logger.info(f"testing validate_outline_format for {text[0:64]}")
    return find_outline_v2(text) is not None


def serialize_item_to_markdown(item: TextItem, doc: DoclingDocument) -> str:
Expand Down
7 changes: 5 additions & 2 deletions examples/mellea/agent/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@
validate_markdown_to_docling_document,
insert_document,
create_document_outline,
find_outline,
find_outline_v1,
find_outline_v2,
validate_outline_format,
validate_html_to_docling_document,
convert_html_to_docling_document,
Expand Down Expand Up @@ -173,7 +174,9 @@ def _make_outline_for_writing(
strategy=RejectionSamplingStrategy(loop_budget=loop_budget),
)

outline = find_outline(text=answer.value)
outline = find_outline_v2(text=answer.value)

exit(-1)

return outline

Expand Down
2 changes: 1 addition & 1 deletion examples/mellea/example_01_write_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def main():

# Save the document
os.makedirs("./scratch", exist_ok=True)
fname = datetime.now().strftime("%Y%m%d_%H%M%S")
fname = datetime.now().strftime("%Y_%m_%d_%H:%M:%S")

document.save_as_markdown(filename=f"./scratch/{fname}.md", text_width=72)
document.save_as_html(filename=f"./scratch/{fname}.html")
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ classifiers = [
]
requires-python = ">=3.10"
dependencies = [
"docling-core @ git+https://github.com/docling-project/docling-core.git@refs/pull/389/head",
"docling~=2.25",
"httpx>=0.28.1",
"mcp[cli]>=1.9.4",
Expand All @@ -51,6 +52,9 @@ dependencies = [
"python-dotenv>=1.1.0",
]

[tool.hatch.metadata]
allow-direct-references = true

[project.optional-dependencies]
llama-index-rag = [
"llama-index>=0.12.33",
Expand Down
12 changes: 5 additions & 7 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.