Skip to content

Commit c940aa5

Browse files
Matteo-Omenetti, dolfim-ibm, cau-git
authored
feat: Add CodeItem as pydantic type, update export methods and APIs (#129)
* added code item * added code item * added code item Signed-off-by: Matteo-Omenetti <[email protected]> * added code item Signed-off-by: Matteo-Omenetti <[email protected]> * added code item Signed-off-by: Matteo-Omenetti <[email protected]> * added code item Signed-off-by: Matteo-Omenetti <[email protected]> * added code item Signed-off-by: Matteo-Omenetti <[email protected]> * add constraints to allow numpy > 2.1.0 on python3.13 and others Signed-off-by: Michele Dolfi <[email protected]> * Add CodeItem to ContentItem Signed-off-by: Christoph Auer <[email protected]> * added CodeItem in ContentItem tagged union. * added enum for programming languages * removed double CodeItem in ContentItem Union * fixed type of code_language in CodeItem class * fixed sorting of programming languages, not sorted anymore by value of string but variable name --------- Signed-off-by: Matteo-Omenetti <[email protected]> Signed-off-by: Michele Dolfi <[email protected]> Signed-off-by: Christoph Auer <[email protected]> Co-authored-by: Matteo-Omenetti <[email protected]> Co-authored-by: Michele Dolfi <[email protected]> Co-authored-by: Christoph Auer <[email protected]>
1 parent 618df13 commit c940aa5

File tree

8 files changed

+734
-585
lines changed

8 files changed

+734
-585
lines changed

docling_core/types/doc/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
from .base import BoundingBox, CoordOrigin, ImageRefMode, Size
99
from .document import (
10+
CodeItem,
1011
DocItem,
1112
DoclingDocument,
1213
DocumentOrigin,

docling_core/types/doc/document.py

Lines changed: 68 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from docling_core.types.base import _JSON_POINTER_REGEX
3737
from docling_core.types.doc import BoundingBox, Size
3838
from docling_core.types.doc.base import ImageRefMode
39-
from docling_core.types.doc.labels import DocItemLabel, GroupLabel
39+
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel, GroupLabel
4040
from docling_core.types.doc.tokens import DocumentToken, TableToken
4141
from docling_core.types.doc.utils import relative_path
4242

@@ -597,7 +597,6 @@ class TextItem(DocItem):
597597
DocItemLabel.CAPTION,
598598
DocItemLabel.CHECKBOX_SELECTED,
599599
DocItemLabel.CHECKBOX_UNSELECTED,
600-
DocItemLabel.CODE,
601600
DocItemLabel.FOOTNOTE,
602601
DocItemLabel.FORMULA,
603602
DocItemLabel.PAGE_FOOTER,
@@ -656,6 +655,15 @@ def export_to_document_tokens(
656655
return body
657656

658657

658+
class CodeItem(TextItem):
    """Text item that carries a code snippet and its programming language."""

    # The label is pinned to the single CODE value; the ignore is needed
    # because the parent class declares a wider label type.
    label: typing.Literal[DocItemLabel.CODE] = DocItemLabel.CODE  # type: ignore[assignment]

    # Language of the snippet; UNKNOWN until a caller sets something else.
    code_language: CodeLanguageLabel = CodeLanguageLabel.UNKNOWN
665+
666+
659667
class SectionHeaderItem(TextItem):
660668
"""SectionItem."""
661669

@@ -1302,6 +1310,7 @@ class KeyValueItem(DocItem):
13021310
TextItem,
13031311
SectionHeaderItem,
13041312
ListItem,
1313+
CodeItem,
13051314
PictureItem,
13061315
TableItem,
13071316
KeyValueItem,
@@ -1397,7 +1406,7 @@ class DoclingDocument(BaseModel):
13971406
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []
13981407

13991408
groups: List[GroupItem] = []
1400-
texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
1409+
texts: List[Union[SectionHeaderItem, ListItem, TextItem, CodeItem]] = []
14011410
pictures: List[PictureItem] = []
14021411
tables: List[TableItem] = []
14031412
key_value_items: List[KeyValueItem] = []
@@ -1643,6 +1652,46 @@ def add_title(
16431652

16441653
return text_item
16451654

1655+
def add_code(
    self,
    text: str,
    code_language: Optional[CodeLanguageLabel] = None,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[NodeItem] = None,
) -> CodeItem:
    """Append a code snippet to the document and return the new item.

    :param text: str: The code content.
    :param code_language: Optional[CodeLanguageLabel]: Programming language
        of the snippet; when omitted the CodeItem default is kept.
        (Default value = None)
    :param orig: Optional[str]: Original text; falls back to ``text`` when
        empty or omitted. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance entry appended to the
        item's ``prov`` list. (Default value = None)
    :param parent: Optional[NodeItem]: Node the item is attached to; falls
        back to ``self.body``. (Default value = None)
    :returns: The CodeItem that was appended to ``self.texts``.
    """
    if not parent:
        parent = self.body

    if not orig:
        orig = text

    # The new item's reference is its future position in ``self.texts``.
    text_index = len(self.texts)
    cref = f"#/texts/{text_index}"
    code_item = CodeItem(
        text=text,
        orig=orig,
        self_ref=cref,
        parent=parent.get_ref(),
    )
    # Explicit None check: only "not provided" should keep the default.
    if code_language is not None:
        code_item.code_language = code_language
    if prov:
        code_item.prov.append(prov)

    # Register the item and link it under its parent node.
    self.texts.append(code_item)
    parent.children.append(RefItem(cref=cref))

    return code_item
1694+
16461695
def add_heading(
16471696
self,
16481697
text: str,
@@ -2086,7 +2135,7 @@ def export_to_markdown( # noqa: C901
20862135
text = f"{marker} {item.text}\n"
20872136
mdtexts.append(text.strip() + "\n")
20882137

2089-
elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
2138+
elif isinstance(item, CodeItem) and item.label in labels:
20902139
in_list = False
20912140
text = f"```\n{item.text}\n```\n"
20922141
mdtexts.append(text)
@@ -2392,11 +2441,14 @@ def close_lists(
23922441
text = f"<li>{item.text}</li>"
23932442
html_texts.append(text)
23942443

2444+
elif isinstance(item, CodeItem) and item.label in labels:
2445+
text = f"<pre><code>{item.text}</code></pre>"
2446+
html_texts.append(text.strip())
2447+
23952448
elif isinstance(item, TextItem) and item.label in labels:
23962449

23972450
text = f"<p>{item.text}</p>"
23982451
html_texts.append(text.strip())
2399-
24002452
elif isinstance(item, TableItem):
24012453

24022454
text = item.export_to_html(doc=self, add_caption=True)
@@ -2594,6 +2646,17 @@ def close_lists(
25942646
add_content=add_content,
25952647
add_page_index=add_page_index,
25962648
)
2649+
elif isinstance(item, CodeItem) and (item.label in labels):
2650+
2651+
result += item.export_to_document_tokens(
2652+
doc=self,
2653+
new_line=delim,
2654+
xsize=xsize,
2655+
ysize=ysize,
2656+
add_location=add_location,
2657+
add_content=add_content,
2658+
add_page_index=add_page_index,
2659+
)
25972660

25982661
elif isinstance(item, TextItem) and (item.label in labels):
25992662

docling_core/types/doc/labels.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,3 +138,69 @@ class TableCellLabel(str, Enum):
138138
def __str__(self):
139139
"""Get string value."""
140140
return str(self.value)
141+
142+
143+
class CodeLanguageLabel(str, Enum):
    """Programming languages that can be attached to a code snippet.

    Each member's value is the display name of the language. The member
    order is mirrored in the exported JSON schema, so it is kept as-is.
    """

    ADA = "Ada"
    AWK = "Awk"
    BASH = "Bash"
    BC = "bc"
    C = "C"
    C_SHARP = "C#"
    C_PLUS_PLUS = "C++"
    CMAKE = "CMake"
    COBOL = "COBOL"
    CSS = "CSS"
    CEYLON = "Ceylon"
    CLOJURE = "Clojure"
    CRYSTAL = "Crystal"
    CUDA = "Cuda"
    CYTHON = "Cython"
    D = "D"
    DART = "Dart"
    DC = "dc"
    DOCKERFILE = "Dockerfile"
    ELIXIR = "Elixir"
    ERLANG = "Erlang"
    FORTRAN = "FORTRAN"
    FORTH = "Forth"
    GO = "Go"
    HTML = "HTML"
    HASKELL = "Haskell"
    HAXE = "Haxe"
    JAVA = "Java"
    JAVASCRIPT = "JavaScript"
    JULIA = "Julia"
    KOTLIN = "Kotlin"
    LISP = "Lisp"
    LUA = "Lua"
    MATLAB = "Matlab"
    MOONSCRIPT = "MoonScript"
    NIM = "Nim"
    OCAML = "OCaml"
    OBJECTIVEC = "ObjectiveC"
    OCTAVE = "Octave"
    PHP = "PHP"
    PASCAL = "Pascal"
    PERL = "Perl"
    PROLOG = "Prolog"
    PYTHON = "Python"
    RACKET = "Racket"
    RUBY = "Ruby"
    RUST = "Rust"
    SML = "SML"
    SQL = "SQL"
    SCALA = "Scala"
    SCHEME = "Scheme"
    SWIFT = "Swift"
    TYPESCRIPT = "TypeScript"
    UNKNOWN = "unknown"
    VISUALBASIC = "VisualBasic"
    XML = "XML"
    YAML = "YAML"

    def __str__(self):
        """Return the plain language name instead of ``Class.MEMBER``."""
        # Every member value is already a plain ``str``.
        return self.value

docs/DoclingDocument.json

Lines changed: 130 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,133 @@
162162
"title": "ChartStackedBar",
163163
"type": "object"
164164
},
165+
"CodeItem": {
166+
"additionalProperties": false,
167+
"description": "CodeItem.",
168+
"properties": {
169+
"self_ref": {
170+
"pattern": "^#(?:/([\\w-]+)(?:/(\\d+))?)?$",
171+
"title": "Self Ref",
172+
"type": "string"
173+
},
174+
"parent": {
175+
"anyOf": [
176+
{
177+
"$ref": "#/$defs/RefItem"
178+
},
179+
{
180+
"type": "null"
181+
}
182+
],
183+
"default": null
184+
},
185+
"children": {
186+
"default": [],
187+
"items": {
188+
"$ref": "#/$defs/RefItem"
189+
},
190+
"title": "Children",
191+
"type": "array"
192+
},
193+
"label": {
194+
"const": "code",
195+
"default": "code",
196+
"title": "Label",
197+
"type": "string"
198+
},
199+
"prov": {
200+
"default": [],
201+
"items": {
202+
"$ref": "#/$defs/ProvenanceItem"
203+
},
204+
"title": "Prov",
205+
"type": "array"
206+
},
207+
"orig": {
208+
"title": "Orig",
209+
"type": "string"
210+
},
211+
"text": {
212+
"title": "Text",
213+
"type": "string"
214+
},
215+
"code_language": {
216+
"$ref": "#/$defs/CodeLanguageLabel",
217+
"default": "unknown"
218+
}
219+
},
220+
"required": [
221+
"self_ref",
222+
"orig",
223+
"text"
224+
],
225+
"title": "CodeItem",
226+
"type": "object"
227+
},
228+
"CodeLanguageLabel": {
229+
"description": "CodeLanguageLabel.",
230+
"enum": [
231+
"Ada",
232+
"Awk",
233+
"Bash",
234+
"bc",
235+
"C",
236+
"C#",
237+
"C++",
238+
"CMake",
239+
"COBOL",
240+
"CSS",
241+
"Ceylon",
242+
"Clojure",
243+
"Crystal",
244+
"Cuda",
245+
"Cython",
246+
"D",
247+
"Dart",
248+
"dc",
249+
"Dockerfile",
250+
"Elixir",
251+
"Erlang",
252+
"FORTRAN",
253+
"Forth",
254+
"Go",
255+
"HTML",
256+
"Haskell",
257+
"Haxe",
258+
"Java",
259+
"JavaScript",
260+
"Julia",
261+
"Kotlin",
262+
"Lisp",
263+
"Lua",
264+
"Matlab",
265+
"MoonScript",
266+
"Nim",
267+
"OCaml",
268+
"ObjectiveC",
269+
"Octave",
270+
"PHP",
271+
"Pascal",
272+
"Perl",
273+
"Prolog",
274+
"Python",
275+
"Racket",
276+
"Ruby",
277+
"Rust",
278+
"SML",
279+
"SQL",
280+
"Scala",
281+
"Scheme",
282+
"Swift",
283+
"TypeScript",
284+
"unknown",
285+
"VisualBasic",
286+
"XML",
287+
"YAML"
288+
],
289+
"title": "CodeLanguageLabel",
290+
"type": "string"
291+
},
165292
"CoordOrigin": {
166293
"description": "CoordOrigin.",
167294
"enum": [
@@ -1266,7 +1393,6 @@
12661393
"caption",
12671394
"checkbox_selected",
12681395
"checkbox_unselected",
1269-
"code",
12701396
"footnote",
12711397
"formula",
12721398
"page_footer",
@@ -1375,6 +1501,9 @@
13751501
},
13761502
{
13771503
"$ref": "#/$defs/TextItem"
1504+
},
1505+
{
1506+
"$ref": "#/$defs/CodeItem"
13781507
}
13791508
]
13801509
},

0 commit comments

Comments
 (0)