Skip to content

Commit

Permalink
docs: description of supported formats and backends (#788)
Browse files Browse the repository at this point in the history
* chore: remove type-ignore marks for attaching text to non-GroupItems

As of commit b74208 of docling-core, text items can be attached to any NodeItem,
so the `# type: ignore[arg-type]` comments on those calls can be removed.

Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* test: remove unnecessary imports

Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* docs: add documentation on supported formats and backends

Signed-off-by: Cesar Berrospi Ramis <[email protected]>

* docs: add notebook example with XML backends

Signed-off-by: Cesar Berrospi Ramis <[email protected]>

---------

Signed-off-by: Cesar Berrospi Ramis <[email protected]>
  • Loading branch information
ceberam authored Jan 26, 2025
1 parent 3be2fb5 commit c2ae1cc
Show file tree
Hide file tree
Showing 7 changed files with 1,147 additions and 41 deletions.
50 changes: 25 additions & 25 deletions docling/backend/xml/uspto_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ def _add_property(self, name: str, text: str) -> None:
if name == self.Element.TITLE.value:
if text:
self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
text=text,
)
self.level += 1
Expand All @@ -406,7 +406,7 @@ def _add_property(self, name: str, text: str) -> None:
abstract_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
Expand Down Expand Up @@ -434,7 +434,7 @@ def _add_property(self, name: str, text: str) -> None:
claims_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
for text in self.claims:
self.doc.add_text(
Expand All @@ -452,15 +452,15 @@ def _add_property(self, name: str, text: str) -> None:
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=text,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.text = ""

elif name == self.Element.HEADING.value and text:
self.parents[self.level + 1] = self.doc.add_heading(
text=text,
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1
self.text = ""
Expand All @@ -470,7 +470,7 @@ def _add_property(self, name: str, text: str) -> None:
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table(
data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)

def _apply_style(self, text: str, style_tag: str) -> str:
Expand Down Expand Up @@ -721,7 +721,7 @@ def _add_property(self, name: str, text: str) -> None:
if self.Element.TITLE.value in self.property and text.strip():
title = text.strip()
self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
text=title,
)
self.level += 1
Expand Down Expand Up @@ -749,7 +749,7 @@ def _add_property(self, name: str, text: str) -> None:
self.parents[self.level + 1] = self.doc.add_heading(
text=text.strip(),
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1

Expand All @@ -769,7 +769,7 @@ def _add_property(self, name: str, text: str) -> None:
claims_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
for text in self.claims:
self.doc.add_text(
Expand All @@ -787,7 +787,7 @@ def _add_property(self, name: str, text: str) -> None:
abstract_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
self.doc.add_text(
label=DocItemLabel.PARAGRAPH, text=abstract, parent=abstract_item
Expand All @@ -799,7 +799,7 @@ def _add_property(self, name: str, text: str) -> None:
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=paragraph,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
elif self.Element.CLAIM.value in self.property:
# we may need a space after a paragraph in claim text
Expand All @@ -811,7 +811,7 @@ def _add_property(self, name: str, text: str) -> None:
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table(
data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)

def _apply_style(self, text: str, style_tag: str) -> str:
Expand Down Expand Up @@ -938,7 +938,7 @@ def store_section(self, section: str) -> None:
self.parents[self.level + 1] = self.doc.add_heading(
heading.value,
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1

Expand All @@ -959,7 +959,7 @@ def store_content(self, section: str, field: str, value: str) -> None:

if field == self.Field.TITLE.value:
self.parents[self.level + 1] = self.doc.add_title(
parent=self.parents[self.level], text=value # type: ignore[arg-type]
parent=self.parents[self.level], text=value
)
self.level += 1

Expand All @@ -971,14 +971,14 @@ def store_content(self, section: str, field: str, value: str) -> None:
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=value,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)

elif field == self.Field.NUMBER.value and section == self.Section.CLAIMS.value:
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text="",
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)

elif (
Expand All @@ -996,7 +996,7 @@ def store_content(self, section: str, field: str, value: str) -> None:
last_claim = self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text="",
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)

last_claim.text += f" {value}" if last_claim.text else value
Expand All @@ -1012,7 +1012,7 @@ def store_content(self, section: str, field: str, value: str) -> None:
self.parents[self.level + 1] = self.doc.add_heading(
value,
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1

Expand All @@ -1029,7 +1029,7 @@ def store_content(self, section: str, field: str, value: str) -> None:
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=value,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)

def parse(self, patent_content: str) -> Optional[DoclingDocument]:
Expand Down Expand Up @@ -1283,7 +1283,7 @@ def _add_property(self, name: str, text: str) -> None:
title = text.strip()
if title:
self.parents[self.level + 1] = self.doc.add_text(
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
label=DocItemLabel.TITLE,
text=title,
)
Expand All @@ -1301,7 +1301,7 @@ def _add_property(self, name: str, text: str) -> None:
abstract_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
Expand Down Expand Up @@ -1331,7 +1331,7 @@ def _add_property(self, name: str, text: str) -> None:
claims_item = self.doc.add_heading(
heading_text,
level=heading_level,
parent=self.parents[heading_level], # type: ignore[arg-type]
parent=self.parents[heading_level],
)
for text in self.claims:
self.doc.add_text(
Expand All @@ -1350,14 +1350,14 @@ def _add_property(self, name: str, text: str) -> None:
self.parents[self.level + 1] = self.doc.add_heading(
text=text,
level=self.level,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.level += 1
else:
self.doc.add_text(
label=DocItemLabel.PARAGRAPH,
text=text,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)
self.text = ""

Expand All @@ -1366,7 +1366,7 @@ def _add_property(self, name: str, text: str) -> None:
empty_table = TableData(num_rows=0, num_cols=0, table_cells=[])
self.doc.add_table(
data=empty_table,
parent=self.parents[self.level], # type: ignore[arg-type]
parent=self.parents[self.level],
)

def _apply_style(self, text: str, style_tag: str) -> str:
Expand Down
Loading

0 comments on commit c2ae1cc

Please sign in to comment.