def find_outline_v2(text: str) -> DoclingDocument | None:
    """Parse an outline from ``text`` and turn its content lines into typed items.

    The first markdown code block found in ``text`` is converted to a
    document with the markdown backend. Every text item that is not a title
    or a section header must have the form ``"<label>: <summary>."`` where
    ``<label>`` is one of ``paragraph``, ``list``, ``table``, ``figure`` or
    ``picture``:

    * ``paragraph`` items keep their place and get ``summary`` set on them;
    * ``table`` items are replaced by a new, empty table carrying the summary;
    * ``figure`` / ``picture`` items are replaced by a new picture item;
    * ``list`` items are replaced by a new group.

    Args:
        text: Raw text expected to contain a markdown code block holding
            the outline.

    Returns:
        The converted document, or ``None`` when no markdown code block is
        found or when at least one content line does not follow the
        expected format (the offending lines are logged as an error).
    """
    starts = ["paragraph", "list", "table", "figure", "picture"]

    md = find_markdown_code_block(text)
    if not md:
        return None

    converter = DocumentConverter(allowed_formats=[InputFormat.MD])
    doc_stream = DocumentStream(name="tmp.md", stream=BytesIO(md.encode("utf-8")))
    conv: ConversionResult = converter.convert(doc_stream)
    doc = conv.document

    # Compile once instead of rebuilding the pattern for every item.
    labels = "|".join(starts)
    pattern = re.compile(rf"^({labels}):\s(.*)\.$", re.DOTALL)

    bad_lines = []

    # Iterate over a snapshot: the loop adds and replaces items in the same
    # tree, and mutating it while the iterate_items generator is still live
    # may skip or revisit nodes.
    for item, _level in list(doc.iterate_items(with_groups=True)):
        # Titles and headers are checked first and left untouched.
        if isinstance(item, (TitleItem, SectionHeaderItem)):
            continue
        if not isinstance(item, TextItem):
            continue

        match = pattern.match(item.text)
        if match is None:
            bad_lines.append(item.text)
            continue

        label, summary = match[1], match[2]

        if label == "paragraph":
            item.summary = summary
            continue

        if label == "table":
            data = TableData(table_cells=[], num_rows=0, num_cols=0)
            new_item = doc.add_table(label=DocItemLabel.TABLE, data=data)
        elif label in ("figure", "picture"):
            new_item = doc.add_picture()
        elif label == "list":
            new_item = doc.add_group()
        else:
            # Only reachable if `starts` gains a label without a branch above.
            logger.warning("NOT SUPPORTED: %s", label)
            continue

        new_item.summary = summary
        doc.replace_item(old_item=item, new_item=new_item)

    if bad_lines:
        # Join outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12.
        joined = "\n".join(bad_lines)
        message = f"Every content line should start with one out of the following choices: {starts}. The following lines need to be updated: {joined}"
        logger.error(message)
        return None

    return doc


def validate_outline_format(text: str) -> bool:
    """Return True when ``find_outline_v2`` can parse an outline from ``text``."""
    logger.info(f"testing validate_outline_format for {text[0:64]}")
    return find_outline_v2(text) is not None
document.save_as_markdown(filename=f"./scratch/{fname}.md", text_width=72) document.save_as_html(filename=f"./scratch/{fname}.html") diff --git a/pyproject.toml b/pyproject.toml index c904692..abad88d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,7 @@ classifiers = [ ] requires-python = ">=3.10" dependencies = [ + "docling-core @ git+https://github.com/docling-project/docling-core.git@refs/pull/389/head", "docling~=2.25", "httpx>=0.28.1", "mcp[cli]>=1.9.4", @@ -51,6 +52,9 @@ dependencies = [ "python-dotenv>=1.1.0", ] +[tool.hatch.metadata] +allow-direct-references = true + [project.optional-dependencies] llama-index-rag = [ "llama-index>=0.12.33", diff --git a/uv.lock b/uv.lock index 1ed62f7..e853db9 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12' and sys_platform == 'darwin'", @@ -733,8 +733,8 @@ wheels = [ [[package]] name = "docling-core" -version = "2.45.0" -source = { registry = "https://pypi.org/simple" } +version = "2.48.2" +source = { git = "https://github.com/docling-project/docling-core.git?rev=refs%2Fpull%2F389%2Fhead#0c21580c109ded1dafa7df0334fa115b9cb58cde" } dependencies = [ { name = "jsonref" }, { name = "jsonschema" }, @@ -747,10 +747,6 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/57/e8/957a263fa00e8e0185589e36812f904618a597d77656187202f4b6bc78f1/docling_core-2.45.0.tar.gz", hash = "sha256:21ae79263f234ec5ed660b0b508019480adf16ddfa81cea60cd3a9f986f9abf1", size = 159058, upload-time = "2025-08-20T12:36:50.649Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/69/ea4ffb14ea5cd43c243be20a03b7b1f12a99f11ac3108f62ef45edf74c18/docling_core-2.45.0-py3-none-any.whl", hash = "sha256:043b2eb75ef48208fe43139aeee828b3d48941ed7083ed714851eee9f6f5c4f3", size = 163251, upload-time = "2025-08-20T12:36:48.858Z" }, -] 
[package.optional-dependencies] chunking = [ @@ -790,6 +786,7 @@ version = "1.3.0" source = { editable = "." } dependencies = [ { name = "docling" }, + { name = "docling-core" }, { name = "httpx" }, { name = "mcp", extra = ["cli"] }, { name = "mellea" }, @@ -845,6 +842,7 @@ examples = [ requires-dist = [ { name = "accelerate", marker = "extra == 'smolagents'", specifier = ">=0.20.0" }, { name = "docling", specifier = "~=2.25" }, + { name = "docling-core", git = "https://github.com/docling-project/docling-core.git?rev=refs%2Fpull%2F389%2Fhead" }, { name = "httpx", specifier = ">=0.28.1" }, { name = "llama-index", marker = "extra == 'llama-index-rag'", specifier = ">=0.12.33" }, { name = "llama-index-core", marker = "extra == 'llama-index-rag'", specifier = ">=0.12.28" },