Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add CodeItem as pydantic type, update export methods and APIs #129

Merged
merged 17 commits into from
Jan 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docling_core/types/doc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
from .document import (
CodeItem,
DocItem,
DoclingDocument,
DocumentOrigin,
Expand Down
73 changes: 68 additions & 5 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
from docling_core.types.base import _JSON_POINTER_REGEX
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.base import ImageRefMode
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling_core.types.doc.utils import relative_path

Expand Down Expand Up @@ -597,7 +597,6 @@ class TextItem(DocItem):
DocItemLabel.CAPTION,
DocItemLabel.CHECKBOX_SELECTED,
DocItemLabel.CHECKBOX_UNSELECTED,
DocItemLabel.CODE,
DocItemLabel.FOOTNOTE,
DocItemLabel.FORMULA,
DocItemLabel.PAGE_FOOTER,
Expand Down Expand Up @@ -656,6 +655,15 @@ def export_to_document_tokens(
return body


class CodeItem(TextItem):
    """Text item that represents a code snippet.

    Extends TextItem with a ``code_language`` field and restricts ``label``
    to the single value ``DocItemLabel.CODE``.
    """

    # The label is pinned to CODE through a single-value Literal.
    label: typing.Literal[DocItemLabel.CODE] = DocItemLabel.CODE  # type: ignore[assignment]
    # Language of the snippet; UNKNOWN unless set explicitly.
    code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN


class SectionHeaderItem(TextItem):
"""SectionItem."""

Expand Down Expand Up @@ -1302,6 +1310,7 @@ class KeyValueItem(DocItem):
TextItem,
SectionHeaderItem,
ListItem,
CodeItem,
PictureItem,
TableItem,
KeyValueItem,
Expand Down Expand Up @@ -1397,7 +1406,7 @@ class DoclingDocument(BaseModel):
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []

groups: List[GroupItem] = []
texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
pictures: List[PictureItem] = []
tables: List[TableItem] = []
key_value_items: List[KeyValueItem] = []
Expand Down Expand Up @@ -1643,6 +1652,46 @@ def add_title(

return text_item

def add_code(
    self,
    text: str,
    code_language: Optional[CodeLanguageLabel] = None,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[NodeItem] = None,
) -> CodeItem:
    """Create a CodeItem, store it in ``self.texts`` and link it to a parent.

    :param text: str: Content of the code snippet.
    :param code_language: Optional[CodeLanguageLabel]: Language of the
        snippet; when omitted the CodeItem field default applies.
        (Default value = None)
    :param orig: Optional[str]: Original text; falls back to ``text`` when
        missing or empty. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        new item's ``prov`` list. (Default value = None)
    :param parent: Optional[NodeItem]: Node that receives the child
        reference; ``self.body`` when omitted. (Default value = None)
    :returns: The newly created CodeItem.
    """
    if not parent:
        parent = self.body

    if not orig:
        orig = text

    # The item is appended to self.texts below, so the current length is the
    # index it will occupy and therefore the tail of its self reference.
    cref = f"#/texts/{len(self.texts)}"

    # Forward the language only when one was given, so the field default is
    # used otherwise. Passing it to the constructor (rather than assigning
    # afterwards) lets it go through model validation like the other fields;
    # NOTE(review): plain attribute assignment is only validated when the
    # model enables validate_assignment -- confirm against the model config.
    language_kwargs = {"code_language": code_language} if code_language else {}
    code_item = CodeItem(
        text=text,
        orig=orig,
        self_ref=cref,
        parent=parent.get_ref(),
        **language_kwargs,
    )
    if prov:
        code_item.prov.append(prov)

    self.texts.append(code_item)
    parent.children.append(RefItem(cref=cref))

    return code_item

def add_heading(
self,
text: str,
Expand Down Expand Up @@ -2086,7 +2135,7 @@ def export_to_markdown( # noqa: C901
text = f"{marker} {item.text}\n"
mdtexts.append(text.strip() + "\n")

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
elif isinstance(item, CodeItem) and item.label in labels:
in_list = False
text = f"```\n{item.text}\n```\n"
mdtexts.append(text)
Expand Down Expand Up @@ -2392,11 +2441,14 @@ def close_lists(
text = f"<li>{item.text}</li>"
html_texts.append(text)

elif isinstance(item, CodeItem) and item.label in labels:
text = f"<pre><code>{item.text}</code></pre>"
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in labels:

text = f"<p>{item.text}</p>"
html_texts.append(text.strip())

elif isinstance(item, TableItem):

text = item.export_to_html(doc=self, add_caption=True)
Expand Down Expand Up @@ -2594,6 +2646,17 @@ def close_lists(
add_content=add_content,
add_page_index=add_page_index,
)
elif isinstance(item, CodeItem) and (item.label in labels):

result += item.export_to_document_tokens(
doc=self,
new_line=delim,
xsize=xsize,
ysize=ysize,
add_location=add_location,
add_content=add_content,
add_page_index=add_page_index,
)

elif isinstance(item, TextItem) and (item.label in labels):

Expand Down
66 changes: 66 additions & 0 deletions docling_core/types/doc/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
def __str__(self):
"""Get string value."""
return str(self.value)


class CodeLanguageLabel(str, Enum):
    """Closed set of programming-language labels for code snippets.

    Every member is a ``str`` whose value is the display name of the
    language; ``UNKNOWN`` stands for an unidentified language.
    """

    # Member order is kept as-is: docs/DoclingDocument.json enumerates the
    # values in this same sequence.
    ADA = "Ada"
    AWK = "Awk"
    BASH = "Bash"
    BC = "bc"
    C = "C"
    C_SHARP = "C#"
    C_PLUS_PLUS = "C++"
    CMAKE = "CMake"
    COBOL = "COBOL"
    CSS = "CSS"
    CEYLON = "Ceylon"
    CLOJURE = "Clojure"
    CRYSTAL = "Crystal"
    CUDA = "Cuda"
    CYTHON = "Cython"
    D = "D"
    DART = "Dart"
    DC = "dc"
    DOCKERFILE = "Dockerfile"
    ELIXIR = "Elixir"
    ERLANG = "Erlang"
    FORTRAN = "FORTRAN"
    FORTH = "Forth"
    GO = "Go"
    HTML = "HTML"
    HASKELL = "Haskell"
    HAXE = "Haxe"
    JAVA = "Java"
    JAVASCRIPT = "JavaScript"
    JULIA = "Julia"
    KOTLIN = "Kotlin"
    LISP = "Lisp"
    LUA = "Lua"
    MATLAB = "Matlab"
    MOONSCRIPT = "MoonScript"
    NIM = "Nim"
    OCAML = "OCaml"
    OBJECTIVEC = "ObjectiveC"
    OCTAVE = "Octave"
    PHP = "PHP"
    PASCAL = "Pascal"
    PERL = "Perl"
    PROLOG = "Prolog"
    PYTHON = "Python"
    RACKET = "Racket"
    RUBY = "Ruby"
    RUST = "Rust"
    SML = "SML"
    SQL = "SQL"
    SCALA = "Scala"
    SCHEME = "Scheme"
    SWIFT = "Swift"
    TYPESCRIPT = "TypeScript"
    UNKNOWN = "unknown"
    VISUALBASIC = "VisualBasic"
    XML = "XML"
    YAML = "YAML"

    def __str__(self):
        """Return the member's plain string value."""
        # Values are already plain ``str`` objects, so no conversion is needed.
        return self.value
131 changes: 130 additions & 1 deletion docs/DoclingDocument.json
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,133 @@
"title": "ChartStackedBar",
"type": "object"
},
"CodeItem": {
"additionalProperties": false,
"description": "CodeItem.",
"properties": {
"self_ref": {
"pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$",
"title": "Self Ref",
"type": "string"
},
"parent": {
"anyOf": [
{
"$ref": "#/$defs/RefItem"
},
{
"type": "null"
}
],
"default": null
},
"children": {
"default": [],
"items": {
"$ref": "#/$defs/RefItem"
},
"title": "Children",
"type": "array"
},
"label": {
"const": "code",
"default": "code",
"title": "Label",
"type": "string"
},
"prov": {
"default": [],
"items": {
"$ref": "#/$defs/ProvenanceItem"
},
"title": "Prov",
"type": "array"
},
"orig": {
"title": "Orig",
"type": "string"
},
"text": {
"title": "Text",
"type": "string"
},
"code_language": {
"$ref": "#/$defs/CodeLanguageLabel",
"default": "unknown"
}
},
"required": [
"self_ref",
"orig",
"text"
],
"title": "CodeItem",
"type": "object"
},
"CodeLanguageLabel": {
"description": "CodeLanguageLabel.",
"enum": [
"Ada",
"Awk",
"Bash",
"bc",
"C",
"C#",
"C++",
"CMake",
"COBOL",
"CSS",
"Ceylon",
"Clojure",
"Crystal",
"Cuda",
"Cython",
"D",
"Dart",
"dc",
"Dockerfile",
"Elixir",
"Erlang",
"FORTRAN",
"Forth",
"Go",
"HTML",
"Haskell",
"Haxe",
"Java",
"JavaScript",
"Julia",
"Kotlin",
"Lisp",
"Lua",
"Matlab",
"MoonScript",
"Nim",
"OCaml",
"ObjectiveC",
"Octave",
"PHP",
"Pascal",
"Perl",
"Prolog",
"Python",
"Racket",
"Ruby",
"Rust",
"SML",
"SQL",
"Scala",
"Scheme",
"Swift",
"TypeScript",
"unknown",
"VisualBasic",
"XML",
"YAML"
],
"title": "CodeLanguageLabel",
"type": "string"
},
"CoordOrigin": {
"description": "CoordOrigin.",
"enum": [
Expand Down Expand Up @@ -1266,7 +1393,6 @@
"caption",
"checkbox_selected",
"checkbox_unselected",
"code",
"footnote",
"formula",
"page_footer",
Expand Down Expand Up @@ -1375,6 +1501,9 @@
},
{
"$ref": "#/$defs/TextItem"
},
{
"$ref": "#/$defs/CodeItem"
}
]
},
Expand Down
Loading
Loading