Skip to content

Commit

Permalink
Merge branch 'main' into rtdl/export_latex_docx
Browse files Browse the repository at this point in the history
Signed-off-by: Rafael Teixeira de Lima <[email protected]>
  • Loading branch information
rateixei authored Jan 27, 2025
2 parents c32c3b6 + 8a4ec77 commit 30d0afe
Show file tree
Hide file tree
Showing 65 changed files with 3,380 additions and 301 deletions.
22 changes: 22 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
## [v2.16.0](https://github.com/DS4SD/docling/releases/tag/v2.16.0) - 2025-01-24

### Feature

* New document picture classifier ([#805](https://github.com/DS4SD/docling/issues/805)) ([`16a218d`](https://github.com/DS4SD/docling/commit/16a218d871c48fd9cc636b77f7b597dc40cbeeec))
* Add Docling JSON ingestion ([#783](https://github.com/DS4SD/docling/issues/783)) ([`88a0e66`](https://github.com/DS4SD/docling/commit/88a0e66adc19238f57a942b0504926cdaeacd8cc))
* Code and equation model for PDF and code blocks in markdown ([#752](https://github.com/DS4SD/docling/issues/752)) ([`3213b24`](https://github.com/DS4SD/docling/commit/3213b247ad6870ff984271f09f7720be68d9479b))
* Add "auto" language for TesseractOcr ([#759](https://github.com/DS4SD/docling/issues/759)) ([`8543c22`](https://github.com/DS4SD/docling/commit/8543c22687fee40459d393bf4adcfc059712de02))

### Fix

* Added extraction of byte-images in excel ([#804](https://github.com/DS4SD/docling/issues/804)) ([`a458e29`](https://github.com/DS4SD/docling/commit/a458e298ca64da2c6df29d953e95645525817bed))
* Update docling-parse-v2 backend version with new parsing fixes ([#769](https://github.com/DS4SD/docling/issues/769)) ([`670a08b`](https://github.com/DS4SD/docling/commit/670a08bdedda847ff3b6942bcaa1a2adef79afe2))

### Documentation

* Fix minor typos ([#801](https://github.com/DS4SD/docling/issues/801)) ([`c58f75d`](https://github.com/DS4SD/docling/commit/c58f75d0f75040e32820cc2915ec00755211c02f))
* Add Azure RAG example ([#675](https://github.com/DS4SD/docling/issues/675)) ([`9020a93`](https://github.com/DS4SD/docling/commit/9020a934be35b0798c972eb77a22fb62ce654ca5))
* Fix links between docs pages ([#697](https://github.com/DS4SD/docling/issues/697)) ([`c49b352`](https://github.com/DS4SD/docling/commit/c49b3526fb7b72e8007f785b1fcfdf58c2457756))
* Fix correct Accelerator pipeline options in docs/examples/custom_convert.py ([#733](https://github.com/DS4SD/docling/issues/733)) ([`7686083`](https://github.com/DS4SD/docling/commit/768608351d40376c3504546f52e967195536b3d5))
* Example to translate documents ([#739](https://github.com/DS4SD/docling/issues/739)) ([`f7e1cbf`](https://github.com/DS4SD/docling/commit/f7e1cbf629ae5f3e279296e72f656b7a453ab7a3))

## [v2.15.1](https://github.com/DS4SD/docling/releases/tag/v2.15.1) - 2025-01-10

### Fix
Expand Down
1 change: 0 additions & 1 deletion docling/backend/abstract_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ def is_valid(self) -> bool:
def supports_pagination(cls) -> bool:
pass

@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
Expand Down
1 change: 0 additions & 1 deletion docling/backend/asciidoc_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@


class AsciiDocBackend(DeclarativeDocumentBackend):

def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

Expand Down
2 changes: 1 addition & 1 deletion docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def handle_code(self, element, idx, doc):
label = DocItemLabel.CODE
if len(text) == 0:
return
doc.add_text(parent=self.parents[self.level], label=label, text=text)
doc.add_code(parent=self.parents[self.level], label=label, text=text)

def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
Expand Down
Empty file.
58 changes: 58 additions & 0 deletions docling/backend/json/docling_json_backend.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from io import BytesIO
from pathlib import Path
from typing import Union

from docling_core.types.doc import DoclingDocument
from typing_extensions import override

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument


class DoclingJSONBackend(DeclarativeDocumentBackend):
@override
def __init__(
self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]
) -> None:
super().__init__(in_doc, path_or_stream)

# given we need to store any actual conversion exception for raising it from
# convert(), this captures the successful result or the actual error in a
# mutually exclusive way:
self._doc_or_err = self._get_doc_or_err()

@override
def is_valid(self) -> bool:
return isinstance(self._doc_or_err, DoclingDocument)

@classmethod
@override
def supports_pagination(cls) -> bool:
return False

@classmethod
@override
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.JSON_DOCLING}

def _get_doc_or_err(self) -> Union[DoclingDocument, Exception]:
try:
json_data: Union[str, bytes]
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, encoding="utf-8") as f:
json_data = f.read()
elif isinstance(self.path_or_stream, BytesIO):
json_data = self.path_or_stream.getvalue()
else:
raise RuntimeError(f"Unexpected: {type(self.path_or_stream)=}")
return DoclingDocument.model_validate_json(json_data=json_data)
except Exception as e:
return e

@override
def convert(self) -> DoclingDocument:
if isinstance(self._doc_or_err, DoclingDocument):
return self._doc_or_err
else:
raise self._doc_or_err
71 changes: 44 additions & 27 deletions docling/backend/md_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,22 @@
import warnings
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from typing import List, Optional, Set, Union

import marko
import marko.ext
import marko.ext.gfm
import marko.inline
from docling_core.types.doc import (
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
from marko import Markdown

Expand All @@ -27,8 +30,7 @@


class MarkdownDocumentBackend(DeclarativeDocumentBackend):

def shorten_underscore_sequences(self, markdown_text, max_length=10):
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"

Expand Down Expand Up @@ -90,13 +92,13 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
) from e
return

def close_table(self, doc=None):
def close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
_log.debug(md_table_row)
_log.debug("=== TABLE END ===")
tcells = []
tcells: List[TableCell] = []
result_table = []
for n, md_table_row in enumerate(self.md_table_buffer):
data = []
Expand Down Expand Up @@ -137,15 +139,19 @@ def close_table(self, doc=None):
self.in_table = False
self.md_table_buffer = [] # clean table markdown buffer
# Initialize Docling TableData
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
table_data = TableData(
num_rows=num_rows, num_cols=num_cols, table_cells=tcells
)
# Populate
for tcell in tcells:
data.table_cells.append(tcell)
table_data.table_cells.append(tcell)
if len(tcells) > 0:
doc.add_table(data=data)
doc.add_table(data=table_data)
return

def process_inline_text(self, parent_element, doc=None):
def process_inline_text(
self, parent_element: Optional[NodeItem], doc: DoclingDocument
):
# self.inline_text_buffer += str(text_in)
txt = self.inline_text_buffer.strip()
if len(txt) > 0:
Expand All @@ -156,14 +162,20 @@ def process_inline_text(self, parent_element, doc=None):
)
self.inline_text_buffer = ""

def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
def iterate_elements(
self,
element: marko.block.Element,
depth: int,
doc: DoclingDocument,
parent_element: Optional[NodeItem] = None,
):
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}"
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
if element.level == 1:
doc_label = DocItemLabel.TITLE
Expand All @@ -172,10 +184,10 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):

# Header could have arbitrary inclusion of bold, italic or emphasis,
# hence we need to traverse the tree to get full text of a header
strings = []
strings: List[str] = []

# Define a recursive function to traverse the tree
def traverse(node):
def traverse(node: marko.block.BlockElement):
# Check if the node has a "children" attribute
if hasattr(node, "children"):
# If "children" is a list, continue traversal
Expand Down Expand Up @@ -209,9 +221,13 @@ def traverse(node):
self.process_inline_text(parent_element, doc)
_log.debug(" - List item")

snippet_text = str(element.children[0].children[0].children)
snippet_text = str(element.children[0].children[0].children) # type: ignore
is_numbered = False
if parent_element.label == GroupLabel.ORDERED_LIST:
if (
parent_element is not None
and isinstance(parent_element, DocItem)
and parent_element.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text
Expand All @@ -221,7 +237,14 @@ def traverse(node):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
doc.add_picture(parent=parent_element, caption=element.title)

fig_caption: Optional[TextItem] = None
if element.title is not None and element.title != "":
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=element.title
)

doc.add_picture(parent=parent_element, caption=fig_caption)

elif isinstance(element, marko.block.Paragraph):
self.process_inline_text(parent_element, doc)
Expand Down Expand Up @@ -252,27 +275,21 @@ def traverse(node):
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.block.CodeBlock):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)

elif isinstance(element, marko.inline.LineBreak):
self.process_inline_text(parent_element, doc)
Expand Down
88 changes: 50 additions & 38 deletions docling/backend/msexcel_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from typing import Any, List

from PIL import Image as PILImage
from pydantic import BaseModel


Expand All @@ -44,7 +45,6 @@ class ExcelTable(BaseModel):


class MsExcelDocumentBackend(DeclarativeDocumentBackend):

def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)

Expand Down Expand Up @@ -326,49 +326,61 @@ def _find_images_in_sheet(
self, doc: DoclingDocument, sheet: Worksheet
) -> DoclingDocument:

# FIXME: mypy does not agree with _images ...
"""
# Iterate over images in the sheet
for idx, image in enumerate(sheet._images): # Access embedded images
# Iterate over byte images in the sheet
for idx, image in enumerate(sheet._images): # type: ignore

image_bytes = BytesIO(image.ref.blob)
pil_image = Image.open(image_bytes)
try:
pil_image = PILImage.open(image.ref)

doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
"""
doc.add_picture(
parent=self.parents[0],
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except:
_log.error("could not extract the image from excel sheets")

# FIXME: mypy does not agree with _charts ...
"""
for idx, chart in enumerate(sheet._charts): # Access embedded charts
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)
for idx, chart in enumerate(sheet._charts): # type: ignore
try:
chart_path = f"chart_{idx + 1}.png"
_log.info(
f"Chart found, but dynamic rendering is required for: {chart_path}"
)
_log.info(f"Chart {idx + 1}:")
# Chart type
_log.info(f"Type: {type(chart).__name__}")
# Title
if chart.title:
_log.info(f"Title: {chart.title}")
else:
_log.info("No title")
# Data series
for series in chart.series:
_log.info(" => series ...")
_log.info(f"Data Series: {series.title}")
_log.info(f"Values: {series.values}")
_log.info(f"Categories: {series.categories}")
_log.info(f"Chart {idx + 1}:")
# Position
# _log.info(f"Anchor Cell: {chart.anchor}")
# Chart type
# _log.info(f"Type: {type(chart).__name__}")
print(f"Type: {type(chart).__name__}")
# Extract series data
for series_idx, series in enumerate(chart.series):
#_log.info(f"Series {series_idx + 1}:")
print(f"Series {series_idx + 1} type: {type(series).__name__}")
#print(f"x-values: {series.xVal}")
#print(f"y-values: {series.yVal}")
print(f"xval type: {type(series.xVal).__name__}")
xvals = []
for _ in series.xVal.numLit.pt:
print(f"xval type: {type(_).__name__}")
if hasattr(_, 'v'):
xvals.append(_.v)
print(f"x-values: {xvals}")
yvals = []
for _ in series.yVal:
if hasattr(_, 'v'):
yvals.append(_.v)
print(f"y-values: {yvals}")
except Exception as exc:
print(exc)
continue
"""

return doc
Loading

0 comments on commit 30d0afe

Please sign in to comment.