From 1f240c776329782b5ff6f69a84c0a6dbd6eed59e Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Tue, 28 Jan 2025 09:32:00 +0100 Subject: [PATCH 1/8] Recommit with fixed history Signed-off-by: Rafael Teixeira de Lima --- docling/backend/docx_latex/__init__.py | 0 docling/backend/docx_latex/latex_dict.py | 271 +++++++++++++ docling/backend/docx_latex/omml.py | 460 +++++++++++++++++++++++ docling/backend/msword_backend.py | 21 +- poetry.lock | 22 +- pyproject.toml | 1 + 6 files changed, 767 insertions(+), 8 deletions(-) create mode 100644 docling/backend/docx_latex/__init__.py create mode 100644 docling/backend/docx_latex/latex_dict.py create mode 100644 docling/backend/docx_latex/omml.py diff --git a/docling/backend/docx_latex/__init__.py b/docling/backend/docx_latex/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/docling/backend/docx_latex/latex_dict.py b/docling/backend/docx_latex/latex_dict.py new file mode 100644 index 000000000..eb8255bbd --- /dev/null +++ b/docling/backend/docx_latex/latex_dict.py @@ -0,0 +1,271 @@ +# -*- coding: utf-8 -*- + +""" +Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py +On 23/01/2025 +""" + +from __future__ import unicode_literals + +CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~") + +BLANK = "" +BACKSLASH = "\\" +ALN = "&" + +CHR = { + # Unicode : Latex Math Symbols + # Top accents + "\u0300": "\\grave{{{0}}}", + "\u0301": "\\acute{{{0}}}", + "\u0302": "\\hat{{{0}}}", + "\u0303": "\\tilde{{{0}}}", + "\u0304": "\\bar{{{0}}}", + "\u0305": "\\overbar{{{0}}}", + "\u0306": "\\breve{{{0}}}", + "\u0307": "\\dot{{{0}}}", + "\u0308": "\\ddot{{{0}}}", + "\u0309": "\\ovhook{{{0}}}", + "\u030a": "\\ocirc{{{0}}}}", + "\u030c": "\\check{{{0}}}}", + "\u0310": "\\candra{{{0}}}", + "\u0312": "\\oturnedcomma{{{0}}}", + "\u0315": "\\ocommatopright{{{0}}}", + "\u031a": "\\droang{{{0}}}", + "\u0338": "\\not{{{0}}}", + "\u20d0": "\\leftharpoonaccent{{{0}}}", + "\u20d1": "\\rightharpoonaccent{{{0}}}", + "\u20d2": "\\vertoverlay{{{0}}}", + "\u20d6": "\\overleftarrow{{{0}}}", + "\u20d7": "\\vec{{{0}}}", + "\u20db": "\\dddot{{{0}}}", + "\u20dc": "\\ddddot{{{0}}}", + "\u20e1": "\\overleftrightarrow{{{0}}}", + "\u20e7": "\\annuity{{{0}}}", + "\u20e9": "\\widebridgeabove{{{0}}}", + "\u20f0": "\\asteraccent{{{0}}}", + # Bottom accents + "\u0330": "\\wideutilde{{{0}}}", + "\u0331": "\\underbar{{{0}}}", + "\u20e8": "\\threeunderdot{{{0}}}", + "\u20ec": "\\underrightharpoondown{{{0}}}", + "\u20ed": "\\underleftharpoondown{{{0}}}", + "\u20ee": "\\underledtarrow{{{0}}}", + "\u20ef": "\\underrightarrow{{{0}}}", + # Over | group + "\u23b4": "\\overbracket{{{0}}}", + "\u23dc": "\\overparen{{{0}}}", + "\u23de": "\\overbrace{{{0}}}", + # Under| group + "\u23b5": "\\underbracket{{{0}}}", + "\u23dd": "\\underparen{{{0}}}", + "\u23df": "\\underbrace{{{0}}}", +} + +CHR_BO = { + # Big operators, + "\u2140": "\\Bbbsum", + "\u220f": "\\prod", + "\u2210": "\\coprod", + "\u2211": "\\sum", + "\u222b": "\\int", + "\u22c0": "\\bigwedge", + "\u22c1": "\\bigvee", + "\u22c2": "\\bigcap", + "\u22c3": "\\bigcup", + "\u2a00": "\\bigodot", + "\u2a01": "\\bigoplus", + "\u2a02": "\\bigotimes", +} + +T = { + "\u2192": "\\rightarrow ", + # Greek letters + "\U0001d6fc": "\\alpha ", + "\U0001d6fd": "\\beta ", + "\U0001d6fe": "\\gamma ", + "\U0001d6ff": "\\theta ", + "\U0001d700": "\\epsilon ", + "\U0001d701": "\\zeta ", + "\U0001d702": "\\eta ", + "\U0001d703": "\\theta ", + "\U0001d704": "\\iota ", + "\U0001d705": "\\kappa ", + "\U0001d706": "\\lambda ", + "\U0001d707": "\\m ", + "\U0001d708": "\\n ", + "\U0001d709": "\\xi ", + "\U0001d70a": "\\omicron ", + "\U0001d70b": "\\pi ", + "\U0001d70c": "\\rho ", + "\U0001d70d": "\\varsigma ", + "\U0001d70e": "\\sigma ", + "\U0001d70f": "\\ta ", + "\U0001d710": "\\upsilon ", + "\U0001d711": "\\phi ", + "\U0001d712": "\\chi ", + "\U0001d713": "\\psi ", + "\U0001d714": "\\omega ", + "\U0001d715": "\\partial ", + "\U0001d716": "\\varepsilon ", + "\U0001d717": "\\vartheta ", + "\U0001d718": "\\varkappa ", + "\U0001d719": "\\varphi ", + "\U0001d71a": "\\varrho ", + "\U0001d71b": "\\varpi ", + # Relation symbols + "\u2190": "\\leftarrow ", + "\u2191": "\\uparrow ", + "\u2192": "\\rightarrow ", + "\u2193": "\\downright ", + "\u2194": "\\leftrightarrow ", + "\u2195": "\\updownarrow ", + "\u2196": "\\nwarrow ", + "\u2197": "\\nearrow ", + "\u2198": "\\searrow ", + "\u2199": "\\swarrow ", + "\u22ee": "\\vdots ", + "\u22ef": "\\cdots ", + "\u22f0": "\\adots ", + "\u22f1": "\\ddots ", + "\u2260": "\\ne ", + "\u2264": "\\leq ", + "\u2265": "\\geq ", + "\u2266": "\\leqq ", + "\u2267": "\\geqq ", + "\u2268": "\\lneqq ", + "\u2269": "\\gneqq ", + "\u226a": "\\ll ", + "\u226b": "\\gg ", + "\u2208": "\\in ", + "\u2209": "\\notin ", + "\u220b": "\\ni ", + "\u220c": "\\nni ", + # Ordinary symbols + "\u221e": "\\infty ", + # Binary relations + "\u00b1": "\\pm ", + "\u2213": "\\mp ", + # Italic, Latin, uppercase + "\U0001d434": "A", + "\U0001d435": "B", + "\U0001d436": "C", + "\U0001d437": "D", + "\U0001d438": "E", + "\U0001d439": "F", + "\U0001d43a": "G", + "\U0001d43b": "H", + "\U0001d43c": "I", + "\U0001d43d": "J", + "\U0001d43e": "K", + "\U0001d43f": "L", + "\U0001d440": "M", + "\U0001d441": "N", + "\U0001d442": "O", + "\U0001d443": "P", + "\U0001d444": "Q", + "\U0001d445": "R", + "\U0001d446": "S", + "\U0001d447": "T", + "\U0001d448": "U", + "\U0001d449": "V", + "\U0001d44a": "W", + "\U0001d44b": "X", + "\U0001d44c": "Y", + "\U0001d44d": "Z", + # Italic, Latin, lowercase + "\U0001d44e": "a", + "\U0001d44f": "b", + "\U0001d450": "c", + "\U0001d451": "d", + "\U0001d452": "e", + "\U0001d453": "f", + "\U0001d454": "g", + "\U0001d456": "i", + "\U0001d457": "j", + "\U0001d458": "k", + "\U0001d459": "l", + "\U0001d45a": "m", + "\U0001d45b": "n", + "\U0001d45c": "o", + "\U0001d45d": "p", + "\U0001d45e": "q", + "\U0001d45f": "r", + "\U0001d460": "s", + "\U0001d461": "t", + "\U0001d462": "u", + "\U0001d463": "v", + "\U0001d464": "w", + "\U0001d465": "x", + "\U0001d466": "y", + "\U0001d467": "z", +} + +FUNC = { + "sin": "\\sin({fe})", + "cos": "\\cos({fe})", + "tan": "\\tan({fe})", + "arcsin": "\\arcsin({fe})", + "arccos": "\\arccos({fe})", + "arctan": "\\arctan({fe})", + "arccot": "\\arccot({fe})", + "sinh": "\\sinh({fe})", + "cosh": "\\cosh({fe})", + "tanh": "\\tanh({fe})", + "coth": "\\coth({fe})", + "sec": "\\sec({fe})", + "csc": "\\csc({fe})", +} + +FUNC_PLACE = "{fe}" + +BRK = "\\\\" + +CHR_DEFAULT = { + "ACC_VAL": "\\hat{{{0}}}", +} + +POS = { + "top": "\\overline{{{0}}}", # not sure + "bot": "\\underline{{{0}}}", +} + +POS_DEFAULT = { + "BAR_VAL": "\\overline{{{0}}}", +} + +SUB = "_{{{0}}}" + +SUP = "^{{{0}}}" + +F = { + "bar": "\\frac{{{num}}}{{{den}}}", + "skw": r"^{{{num}}}/_{{{den}}}", + "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}", + "lin": "{{{num}}}/{{{den}}}", +} +F_DEFAULT = "\\frac{{{num}}}{{{den}}}" + +D = "\\left{left}{text}\\right{right}" + +D_DEFAULT = { + "left": "(", + "right": ")", + "null": ".", +} + +RAD = "\\sqrt[{deg}]{{{text}}}" +RAD_DEFAULT = "\\sqrt{{{text}}}" +ARR = "{text}" + +LIM_FUNC = { + "lim": "\\lim_{{{lim}}}", + "max": "\\max_{{{lim}}}", + "min": "\\min_{{{lim}}}", +} + +LIM_TO = ("\\rightarrow", "\\to") + +LIM_UPP = "\\overset{{{lim}}}{{{text}}}" + +M = "\\begin{{matrix}}{text}\end{{matrix}}" diff --git a/docling/backend/docx_latex/omml.py b/docling/backend/docx_latex/omml.py new file mode 100644 index 000000000..5a5e24c10 --- /dev/null +++ b/docling/backend/docx_latex/omml.py @@ -0,0 +1,460 @@ +""" +Office Math Markup Language (OMML) + +Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py +On 23/01/2025 +""" + +import lxml.etree as ET +from pylatexenc.latexencode import UnicodeToLatexEncoder + +from docling.backend.docx_latex.latex_dict import ( + ALN, + ARR, + BACKSLASH, + BLANK, + BRK, + CHARS, + CHR, + CHR_BO, + CHR_DEFAULT, + D_DEFAULT, + F_DEFAULT, + FUNC, + FUNC_PLACE, + LIM_FUNC, + LIM_TO, + LIM_UPP, + POS, + POS_DEFAULT, + RAD, + RAD_DEFAULT, + SUB, + SUP, + D, + F, + M, + T, +) + +OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}" + + +def load(stream): + tree = ET.parse(stream) + for omath in tree.findall(OMML_NS + "oMath"): + yield oMath2Latex(omath) + + +def load_string(string): + root = ET.fromstring(string) + for omath in root.findall(OMML_NS + "oMath"): + yield oMath2Latex(omath) + + +def escape_latex(strs): + last = None + new_chr = [] + strs = strs.replace(r"\\", "\\") + for c in strs: + if (c in CHARS) and (last != BACKSLASH): + new_chr.append(BACKSLASH + c) + else: + new_chr.append(c) + last = c + return BLANK.join(new_chr) + + +def get_val(key, default=None, store=CHR): + if key is not None: + return key if not store else store.get(key, key) + else: + return default + + +class Tag2Method(object): + + def call_method(self, elm, stag=None): + getmethod = self.tag2meth.get + if stag is None: + stag = elm.tag.replace(OMML_NS, "") + method = getmethod(stag) + if method: + return method(self, elm) + else: + return None + + def process_children_list(self, elm, include=None): + """ + process children of the elm,return iterable + """ + for _e in list(elm): + if OMML_NS not in _e.tag: + continue + stag = _e.tag.replace(OMML_NS, "") + if include and (stag not in include): + continue + t = self.call_method(_e, stag=stag) + if t is None: + t = self.process_unknow(_e, stag) + if t is None: + continue + yield (stag, t, _e) + + def process_children_dict(self, elm, include=None): + """ + process children of the elm,return dict + """ + latex_chars = dict() + for stag, t, e in self.process_children_list(elm, include): + latex_chars[stag] = t + return latex_chars + + def process_children(self, elm, include=None): + """ + process children of the elm,return string + """ + return BLANK.join( + ( + t if not isinstance(t, Tag2Method) else str(t) + for stag, t, e in self.process_children_list(elm, include) + ) + ) + + def process_unknow(self, elm, stag): + return None + + +class Pr(Tag2Method): + + text = "" + + __val_tags = ("chr", "pos", "begChr", "endChr", "type") + + __innerdict = None # can't use the __dict__ + + """ common properties of element""" + + def __init__(self, elm): + self.__innerdict = {} + self.text = self.process_children(elm) + + def __str__(self): + return self.text + + def __unicode__(self): + return self.__str__(self) + + def __getattr__(self, name): + return self.__innerdict.get(name, None) + + def do_brk(self, elm): + self.__innerdict["brk"] = BRK + return BRK + + def do_common(self, elm): + stag = elm.tag.replace(OMML_NS, "") + if stag in self.__val_tags: + t = elm.get("{0}val".format(OMML_NS)) + self.__innerdict[stag] = t + return None + + tag2meth = { + "brk": do_brk, + "chr": do_common, + "pos": do_common, + "begChr": do_common, + "endChr": do_common, + "type": do_common, + } + + +class oMath2Latex(Tag2Method): + """ + Convert oMath element of omml to latex + """ + + _t_dict = T + + __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e") + u = UnicodeToLatexEncoder( + replacement_latex_protection="braces-all", + unknown_char_policy="keep", + unknown_char_warning=False, + ) + + def __init__(self, element): + self._latex = self.process_children(element) + + def __str__(self): + return self.latex + + def __unicode__(self): + return self.__str__(self) + + def process_unknow(self, elm, stag): + if stag in self.__direct_tags: + return self.process_children(elm) + elif stag[-2:] == "Pr": + return Pr(elm) + else: + return None + + @property + def latex(self): + return self._latex + + def do_acc(self, elm): + """ + the accent function + """ + c_dict = self.process_children_dict(elm) + latex_s = get_val( + c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR + ) + return latex_s.format(c_dict["e"]) + + def do_bar(self, elm): + """ + the bar function + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["barPr"] + latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS) + return pr.text + latex_s.format(c_dict["e"]) + + def do_d(self, elm): + """ + the delimiter object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["dPr"] + null = D_DEFAULT.get("null") + + print(pr.text) + s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T) + print(pr.begChr, D_DEFAULT.get("left"), s_val) + + e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T) + print(pr.endChr, D_DEFAULT.get("right"), s_val) + + delim = pr.text + D.format( + left=null if not s_val else escape_latex(s_val), + text=c_dict["e"], + right=null if not e_val else escape_latex(e_val), + ) + print(delim) + print() + return delim + + def do_spre(self, elm): + """ + the Pre-Sub-Superscript object -- Not support yet + """ + pass + + def do_sub(self, elm): + text = self.process_children(elm) + return SUB.format(text) + + def do_sup(self, elm): + text = self.process_children(elm) + return SUP.format(text) + + def do_f(self, elm): + """ + the fraction object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["fPr"] + latex_s = get_val(pr.type, default=F_DEFAULT, store=F) + return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den")) + + def do_func(self, elm): + """ + the Function-Apply object (Examples:sin cos) + """ + c_dict = self.process_children_dict(elm) + func_name = c_dict.get("fName") + return func_name.replace(FUNC_PLACE, c_dict.get("e")) + + def do_fname(self, elm): + """ + the func name + """ + latex_chars = [] + for stag, t, e in self.process_children_list(elm): + if stag == "r": + if FUNC.get(t): + latex_chars.append(FUNC[t]) + else: + raise NotSupport("Not support func %s" % t) + else: + latex_chars.append(t) + t = BLANK.join(latex_chars) + return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this + + def do_groupchr(self, elm): + """ + the Group-Character object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["groupChrPr"] + latex_s = get_val(pr.chr) + return pr.text + latex_s.format(c_dict["e"]) + + def do_rad(self, elm): + """ + the radical object + """ + c_dict = self.process_children_dict(elm) + text = c_dict.get("e") + deg_text = c_dict.get("deg") + if deg_text: + return RAD.format(deg=deg_text, text=text) + else: + return RAD_DEFAULT.format(text=text) + + def do_eqarr(self, elm): + """ + the Array object + """ + return ARR.format( + text=BRK.join( + [t for stag, t, e in self.process_children_list(elm, include=("e",))] + ) + ) + + def do_limlow(self, elm): + """ + the Lower-Limit object + """ + t_dict = self.process_children_dict(elm, include=("e", "lim")) + latex_s = LIM_FUNC.get(t_dict["e"]) + if not latex_s: + raise NotSupport("Not support lim %s" % t_dict["e"]) + else: + return latex_s.format(lim=t_dict.get("lim")) + + def do_limupp(self, elm): + """ + the Upper-Limit object + """ + t_dict = self.process_children_dict(elm, include=("e", "lim")) + return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e")) + + def do_lim(self, elm): + """ + the lower limit of the limLow object and the upper limit of the limUpp function + """ + return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1]) + + def do_m(self, elm): + """ + the Matrix object + """ + rows = [] + for stag, t, e in self.process_children_list(elm): + if stag is "mPr": + pass + elif stag == "mr": + rows.append(t) + return M.format(text=BRK.join(rows)) + + def do_mr(self, elm): + """ + a single row of the matrix m + """ + return ALN.join( + [t for stag, t, e in self.process_children_list(elm, include=("e",))] + ) + + def do_nary(self, elm): + """ + the n-ary object + """ + res = [] + bo = "" + for stag, t, e in self.process_children_list(elm): + if stag == "naryPr": + bo = get_val(t.chr, store=CHR_BO) + else: + res.append(t) + return bo + BLANK.join(res) + + def process_unicode(self, s): + # s = s if isinstance(s,unicode) else unicode(s,'utf-8') + # print(s, self._t_dict.get(s, s), unicode_to_latex(s)) + # _str.append( self._t_dict.get(s, s) ) + + out_latex_str = self.u.unicode_to_latex(s) + + # print(s, out_latex_str) + + if ( + s.startswith("{") is False + and out_latex_str.startswith("{") + and s.endswith("}") is False + and out_latex_str.endswith("}") + ): + out_latex_str = f" {out_latex_str[1:-1]} " + + # print(s, out_latex_str) + + if "ensuremath" in out_latex_str: + out_latex_str = out_latex_str.replace("\\ensuremath{", " ") + out_latex_str = out_latex_str.replace("}", " ") + + # print(s, out_latex_str) + + if out_latex_str.strip().startswith("\\text"): + out_latex_str = f" \\text{{{out_latex_str}}} " + + # print(s, out_latex_str) + + return out_latex_str + + def do_r(self, elm): + """ + Get text from 'r' element,And try convert them to latex symbols + @todo text style support , (sty) + @todo \text (latex pure text support) + """ + _str = [] + _base_str = [] + for s in elm.findtext("./{0}t".format(OMML_NS)): + out_latex_str = self.process_unicode(s) + _str.append(out_latex_str) + _base_str.append(s) + + proc_str = escape_latex(BLANK.join(_str)) + base_proc_str = BLANK.join(_base_str) + + if "{" not in base_proc_str and "\\{" in proc_str: + proc_str = proc_str.replace("\\{", "{") + + if "}" not in base_proc_str and "\\}" in proc_str: + proc_str = proc_str.replace("\\}", "}") + + return proc_str + + tag2meth = { + "acc": do_acc, + "r": do_r, + "bar": do_bar, + "sub": do_sub, + "sup": do_sup, + "f": do_f, + "func": do_func, + "fName": do_fname, + "groupChr": do_groupchr, + "d": do_d, + "rad": do_rad, + "eqArr": do_eqarr, + "limLow": do_limlow, + "limUpp": do_limupp, + "lim": do_lim, + "m": do_m, + "mr": do_mr, + "nary": do_nary, + } diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index f8148d525..609dd66bc 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -19,6 +19,7 @@ from PIL import Image, UnidentifiedImageError from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.backend.docx_latex.omml import oMath2Latex from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument @@ -133,6 +134,7 @@ def get_level(self) -> int: def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: for element in body: tag_name = etree.QName(element).localname + # Check for Inline Images (blip elements) namespaces = { "a": "http://schemas.openxmlformats.org/drawingml/2006/main", @@ -221,12 +223,28 @@ def get_label_and_level(self, paragraph): else: return label, None + def handle_equations_in_text(self, element, text): + only_texts = [] + texts_and_equations = [] + for subt in element.iter(): + if subt.tag.endswith("t") and "math" not in subt.tag: + only_texts.append(subt.text) + texts_and_equations.append(subt.text) + if "oMath" in subt.tag and "oMathPara" not in subt.tag: + texts_and_equations.append(f"${str(oMath2Latex(subt))}$") + + assert "".join(only_texts) == text + return "".join(texts_and_equations) + def handle_text_elements(self, element, docx_obj, doc): paragraph = docx.text.paragraph.Paragraph(element, docx_obj) + text = paragraph.text + text = self.handle_equations_in_text(element=element, text=text) + if paragraph.text is None: return - text = paragraph.text.strip() + text = text.strip() # Common styles for bullet and numbered lists. # "List Bullet", "List Number", "List Paragraph" @@ -291,7 +309,6 @@ def handle_text_elements(self, element, docx_obj, doc): doc.add_text( label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text ) - self.update_history(p_style_id, p_level, numid, ilevel) return diff --git a/poetry.lock b/poetry.lock index 0d685fe39..1cbcf1c1b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3823,10 +3823,10 @@ files = [ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -3849,10 +3849,10 @@ files = [ numpy = [ {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -4037,8 +4037,8 @@ files = [ [package.dependencies] numpy = [ {version = ">=1.22.4", markers = "python_version < \"3.11\""}, - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4787,6 +4787,16 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pylatexenc" +version = "2.10" +description = "Simple LaTeX parser providing latex-to-unicode and unicode-to-latex conversion" +optional = false +python-versions = "*" +files = [ + {file = "pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3"}, +] + [[package]] name = "pylint" version = "2.17.7" @@ -7751,4 +7761,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "08d30cee8d77f9beee32d5dbec1643367ecae2b4c4b47b57fcb337711471eb5c" +content-hash = "3727fb425795e596dda2c7b5b726eb58fd28ff3c3c3c08e96b6458204ef9f7dc" diff --git a/pyproject.toml b/pyproject.toml index f45f1b61f..e2d2c236d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,6 +57,7 @@ onnxruntime = [ { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } ] pillow = "^10.0.0" +pylatexenc = "^2.10" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"} From 31e30a2cb7b874fff9e95b0868e4b2932c98f9de Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Tue, 28 Jan 2025 09:41:56 +0100 Subject: [PATCH 2/8] Add parsing configuration Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 15 ++++++++++++--- pyproject.toml | 3 ++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 609dd66bc..4ad59f1c3 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -27,7 +27,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + def __init__( + self, + in_doc: "InputDocument", + path_or_stream: Union[BytesIO, Path], + get_latex=False, + ): super().__init__(in_doc, path_or_stream) self.XML_KEY = ( "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" @@ -49,6 +54,9 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self.level = 0 self.listIter = 0 + # Transform MSWord equations to latex + self.get_latex = get_latex + self.history = { "names": [None], "levels": [None], @@ -240,9 +248,10 @@ def handle_text_elements(self, element, docx_obj, doc): paragraph = docx.text.paragraph.Paragraph(element, docx_obj) text = paragraph.text - text = self.handle_equations_in_text(element=element, text=text) + if self.get_latex: + text = self.handle_equations_in_text(element=element, text=text) - if paragraph.text is None: + if text is None: return text = text.strip() diff --git a/pyproject.toml b/pyproject.toml index e2d2c236d..7d89056eb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,7 +157,8 @@ module = [ "deepsearch_glm.*", "lxml.*", "bs4.*", - "huggingface_hub.*" + "huggingface_hub.*", + "pylatexenc.*" ] ignore_missing_imports = true From b3b7c387ca96f211dd711216e9fb6ab56045a32a Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Tue, 28 Jan 2025 13:39:54 +0100 Subject: [PATCH 3/8] Remove prints and backend flag Signed-off-by: Rafael Teixeira de Lima --- docling/backend/docx_latex/omml.py | 11 ++--------- docling/backend/msword_backend.py | 13 ++----------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/docling/backend/docx_latex/omml.py b/docling/backend/docx_latex/omml.py index 5a5e24c10..68f2519e6 100644 --- a/docling/backend/docx_latex/omml.py +++ b/docling/backend/docx_latex/omml.py @@ -187,7 +187,7 @@ def __init__(self, element): self._latex = self.process_children(element) def __str__(self): - return self.latex + return self.latex.replace(" ", " ") def __unicode__(self): return self.__str__(self) @@ -231,20 +231,13 @@ def do_d(self, elm): pr = c_dict["dPr"] null = D_DEFAULT.get("null") - print(pr.text) s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T) - print(pr.begChr, D_DEFAULT.get("left"), s_val) - e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T) - print(pr.endChr, D_DEFAULT.get("right"), s_val) - delim = pr.text + D.format( left=null if not s_val else escape_latex(s_val), text=c_dict["e"], right=null if not e_val else escape_latex(e_val), ) - print(delim) - print() return delim def do_spre(self, elm): @@ -355,7 +348,7 @@ def do_m(self, elm): """ rows = [] for stag, t, e in self.process_children_list(elm): - if stag is "mPr": + if stag == "mPr": pass elif stag == "mr": rows.append(t) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 4ad59f1c3..156255838 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -27,12 +27,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): - def __init__( - self, - in_doc: "InputDocument", - path_or_stream: Union[BytesIO, Path], - get_latex=False, - ): + def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) self.XML_KEY = ( "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" @@ -54,9 +49,6 @@ def __init__( self.level = 0 self.listIter = 0 - # Transform MSWord equations to latex - self.get_latex = get_latex - self.history = { "names": [None], "levels": [None], @@ -248,8 +240,7 @@ def handle_text_elements(self, element, docx_obj, doc): paragraph = docx.text.paragraph.Paragraph(element, docx_obj) text = paragraph.text - if self.get_latex: - text = self.handle_equations_in_text(element=element, text=text) + text = self.handle_equations_in_text(element=element, text=text) if text is None: return From 7996dcbf3e2b69888cc6fa04ced4a12d6f6195e6 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Tue, 28 Jan 2025 13:59:17 +0100 Subject: [PATCH 4/8] Fix test_backend_msword Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 156255838..6d7f3debf 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -227,13 +227,16 @@ def handle_equations_in_text(self, element, text): only_texts = [] texts_and_equations = [] for subt in element.iter(): - if subt.tag.endswith("t") and "math" not in subt.tag: + tag_name = etree.QName(subt).localname + if tag_name == "t" and "math" not in subt.tag: only_texts.append(subt.text) texts_and_equations.append(subt.text) - if "oMath" in subt.tag and "oMathPara" not in subt.tag: + elif "oMath" in subt.tag and "oMathPara" not in subt.tag: texts_and_equations.append(f"${str(oMath2Latex(subt))}$") - assert "".join(only_texts) == text + if "".join(only_texts) != text: + return text + return "".join(texts_and_equations) def handle_text_elements(self, element, docx_obj, doc): From f5034944b83d66fcf2127313fe5a3cc8bbd31c80 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Tue, 28 Jan 2025 15:14:02 +0100 Subject: [PATCH 5/8] Adding test files Signed-off-by: Rafael Teixeira de Lima --- tests/data/docx/equations.docx | Bin 0 -> 15017 bytes .../docling_v2/equations.docx.itxt | 31 ++ .../docling_v2/equations.docx.json | 450 ++++++++++++++++++ .../groundtruth/docling_v2/equations.docx.md | 29 ++ 4 files changed, 510 insertions(+) create mode 100644 tests/data/docx/equations.docx create mode 100644 tests/data/groundtruth/docling_v2/equations.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/equations.docx.json create mode 100644 tests/data/groundtruth/docling_v2/equations.docx.md diff --git a/tests/data/docx/equations.docx b/tests/data/docx/equations.docx new file mode 100644 index 0000000000000000000000000000000000000000..8ab71b96dd1ad5257f8ab3c925376baa0259b8dd GIT binary patch literal 15017 zcmeHuV|XRo*6xbYvF)T|t2?%B+qRvKZM);7W81dvbkMPRv-dgY+c@Xm``r8Y`|5dW zu3Br<`_`(eG3Oj}Kvn_-6a@edfCK;lgn-S08B28_0Du4t06+#n0&57^SUVb7JL)L9 z*%~=$(Yjh$66Apblji_{KhFQ3?Z5F8s7n~N?x9Bzx=(z9k8M=a|Cv`x2^`LyL?eF$ zf#CtH{2Djd_SS(6ET;q%3vEe6%5=ZMq&yHXy^>}Lg;?)GdVzkqD+idCUK8s!V7#sW7@iJ z-vCM8Aho5CsfWOZ)6PNq6koH*;$f|*hZ1`1c{G=u;F&BNW4{{U zfO`>zg~5{|s}coq>WJ^iEJrmoKvd|WR}%iiQ&H2cxvGF#dKG8uM(I0g-O~zwVI)u+r_k;yE<8o-78ZcxP&yD*lIMr;~H!GzrnkXimN&%nkH%ZdrJEdayOtdCZ@=z_^4#I02hFYeI6>a*1Y z2mtW@4g!$|NeCA<#FBSz4S1= z=K-$))9rFA-5B}ObOuwa7|Rfl8e)>jt4rpKZ?Bw7OTb!3dLpAUv++~z4jIBuTM1eh zSV>B-VeL>0kJ`N&&uy-N#BXii8C(}^cVN;F?u=hYh{Y&I0>af$!zQu7V;@3N#=DUB zDFyHLi$0kVQB2Mmkksa9*(ojFr+72vq%zDen2Tz7K$LO`bp+wIOkjFL8}n2rGttCE z_iWT?^(h|5H!((5MPF>tc_n74kL zuuiqc?ej&EI&AvdK=OOF-7U%AK+}>wKV;hWO#~XOUjvND#RRX}e5u`}y-rWE?pO!C zV%>b*g1zG^c0`aMmveZCv4aJ8`MeTS6H}EE6GicT)ZJ-sL-LlOUM>NeOd!yY%DNnH zj+_s_;@wDLi`o4;VAs1ucDs?oAb8e2@1vu`V{ug-;2)vB5x%(AK~`MqCXsB16dG6r zaboCrM4^J&(Ti19bma#O_f;dbLlUJMV|Epo=%J{RN7;E`-QeqS4sDkkF)?%af6`0^ zT-C7z$oVH(z)WEY<`}83fa~7_dT?rVvBNB2NoN8}X$ZXO2qfpx;;QiFVI6k7MNe60 zYGCOlb^pBPZ0qSFl+1t_bc*{NF&qL0fxtOo9Z6r%-X%nT_&XMNWQ ztoJNbe=wp0PXOWFUKUGB<`W+%zNRE1s}8gVLE;_Yu`f&#?ubIYa9_~pDn3*~`t*yC z=FS$l3@|?Mb1060X8`^;djH0yAoi$j)CpN|GYUL@WO>S;r36D*(GnuAJ!bD7oVpmw z`Y5q#s7I#2ht~XOA=LEEdMo0t=&=F`_+wo%`kEl!>~i`L-aR{AW3F%qqeG+M%x-hv z-cWcRAgeow-j5qSQx7U66< zc?#xk9v)qAQeH<4X6RhVN=Tcx!VM$^R7eDD2<-8OaY{Jq7CzY3KkP@{*tU*5oYUc( z*$Z>e_B?pwp$(ogP=&6!D}ANpx@jSGuXS_ewi6^(n{r#VBa69?=~5OG)tCgcXJvFI zcSvl|)EEu2rxp=7-AQZ7)J+dX2K|L!tYHpxIf9ukr^hqbi!1~awpMG~j9W0kN~wl= zsajHz+(ttWrlumkH{ zuqSQl#Yud@f6@wNy#D-oIUY~*SUN*iI|vGL1MloeZ6v!EtEd8bHHYh(B=~uuJ(MT< zoD0^i@1$#leSY71(_xN1^CIeNX-3o-{9tx?n}&zm&tLoFtD{Yg$@#-G$G`R~R!5_h z#&zDmt5%KxM!0L$$lKFW&`g#+>%En^6)_d|RqbTc5}AnL*~PXXA+*%hsHGcR6=tR& zB(J6h57NfP^!KDIDXfBIQG7Oao5^g0NbCz`(ae9@N2-8|mBL2tyGYZ`x#KMnPQn`H z_JG(7VU?Ba@2f+&vS;d-_jr59jLFtT1+h$l2qUtiYRC)014e;Dxl{wfhlB;rMDcaD_bx0pe=$KohHBWHo*>HZA#?@TB z?W{(i$_uJ^G0ZCg_wZL7F1Snb2ucJex~ju~T}LGasnCQoTsR*oZS-4nPlBg#vmrsx zMMaoqE~LX2uVVkKa~Q)umEvFR_MWc*`ich*yG$RA% zp63J?sh&yQTgRKIQlHV?e?d@2d@Wx^M~JF87QHv9&MqTa?AKSg_2!|A&rm;bDv8g$ z^U@HP$NlNC&Yo24Li+6JqS@xo$DyokSZ_6}pB}`d7`g9ML+xc&U3$qIm)<$8?cIh< zrJ5sJGeQpQ>SeOZJm}>VvIr`S7}1CZ*k)1K#w-jiX=A%p`?h-T+8xr{)0>rd)lLAT zOVSUzrC)DUOjVNuy+>0kWfl9yZ+`!()ucYEK05c7p$=|fsw;T?oD^4&(TR|uZa4l5 z_`Kw(OH8|y{{}Z-!va*q&u7X+Y(BqAgyoan>2?{Oom1kTvI>>a4Wp@3&}Tgs`(H~x zU-@VFMP4~P5Z_)`li!twEfLv~U>(TeX2Di8)c0yRKV!FunVJtPXK$_^-goT3X!C)W zS^|zRz~9vu#%n1j!^BBF-fLWS{c?d%J?R!MlQO6E@Cql-$A2DOdpy5v92Sk35bfgb z-9vL4l)~IEG&Baq5pKtRzj}ikEW?_-^71~nhSu4%$qeGuuP%Nj^sgWn#^2(J6Lx0E zSsA>4e(+w|ZdW-6>C9w$8HIeg3);l?Hi7jy3E!scxoDPHzI@Epe_0l}>znR){Q0xs zJcKf#7g`JPdcN_8&)ck&w`N>)nhlOseC9Z$Z(==}PLI#!@^bl@Xps6VDg+{Vd5)NR z=wh$S26GfD%r#6K?Q!B2?@gxM?t$ByRD)*-@KIX{vux*U%bsTo8ZlLKc$$313`_ev z$fjll1Yl^-Wx86X>yYSiKA`T^_URKzh#10-eb!`E#u2re4mYS6lIw{Dx)kf#1VSC6 zwaZY9R{b;yWL)Iv5vLekYWWu`+fdCb+Ru{({m#0tII-EixUseenenPA;?j~-(_ouN zW_QF*j_RcA(@l>&A9mWmTXGAAwQ@Hf5{8d;5Dowm=yyx*AIqIT+jIX~`T%`+AwDd^ z|K3}9+~|kt|FH`H_7;%hMZ@7G%fDwSPPn@83J}8MDW--RzuE5mAu4tC3u0*&XH*G3QZ~Gh~nOGvbHGBBg5y-j6dr1Pc|56I@D+SX~WmZOg1Fb7_gWTR*`( zcKIoW;Q9o^uQ<#`+&L0-k zX&YMsY&IZYxfKX`Y~Lfsj@ZRjaWxFV;^`#>V-AIsCBmlg!wndVWT{KL^7~7=^s>r2 zwQ90vY2O;~2hOWf9GcW-axbH(+8dFhvwxBk%5b|semJH6U29ByU+N~n0f2D=008wv z8GhFqM^hs!Bf8&PhTqo!YUAP9Y$%=RPdp*b9C*>4$*uuBwNlRUq$dS?tU2vT+J@-}u$tDQ*@7C}5eVrX=1|z6G+r5>uL4P7W@o$0ij*ByBRJDseZ7eBXxlO8751-osh!4QiCc8K?E*DmwbLB zOZpPSFIWEPv@+p7^aUNN^6Nk#F2(#%%+%6wc|pg6zY7$|=a@R2FOz{;aMK{$0N~~r z5SqbxA8EULb2}PA!6-{%>zwx_dkMecv03{u{0gx3j>Y1Hg(nu@5G&2jTy6_5EWn^a z^Gu)wnnvluT2N!3G4F7`kV~yLW%@(J+f~DxamB6@OdB|9bmA`-xU^U8`7R6DE7kR$ zMy(&S)Yz`D;&>Js58|C#_fO#30l@zG%d9QRHq7P{H&~3{4JV;(7R;^N8v~1q^~qY0 z9!2;1)ghxPutj~HX;IjL^%3(aa3nN2&=Lah4jBACLG;n*X*#!Rmym$`*l|CJZeU;I zqbJUEjC@p1-T`??*;cK~z?LtI_iOXK+&R6xt{v>me&to&yVKn>+1Gx5 zxZT|J0eXA74PKP#tlo7)dp|y;(|LUx>?}bCudvB(cYizGchPx$oK^VLHoisv34aH{ z&h}HLKL>Ndr#eYr;nQCvK|p~(@=_eh8f(o;v8XISUk^3POdP1K;X=w-I=Ha4tz{1~o>fa3vGIbB z3lM3+d7%&j&0%B`8X~qvSL-c;a2hAfmH_St+v7&@i`<*`kAw|yAmsW*%+KfJ&)zt^ zX8bAOOp5$EY5o_4A!3>+*~M7UT5y7KEZ5f)aQcXf3_BFV>g>x@7&%C|lB&3z12{RW z4+q|U%QK#EQ0wI8s*4d;4MYf5D${Bb@7!ASC&q6>g>B>f#QhDJ4ZDW^dO7l?&$@OuR*3?n`Ymm#td$Ty)5aE%1wZ3wY zYFP&wOiYGP#AIywG}^I!Y>ClA zJNo8$92Z`tnQehJYP);etW@c=M#wLh=&W4q(QvVx64XEHLX3z`<>3g($saBhJidbWlVHE z!W^4)6+hsFZ;|^}XA~BgTs+FxPx47NQ)fFiDm7veTwnM~C*8Z2PRzKZ%LBsdHY+Nh zjY`PkHq^P=A+^dDX>@cNDj~_28wq1m$4KNFVv2u!og~avaVDfrLqmi8&Yk@2P2F7x zPa05aacNGfN75@5bR+mk5v${N-@}z@BI&T4W#lH+f2+@F6*Cl=E=GhLhUh60^g1ZjR|#9 zkazMJ5)-X^3AM*bZ2JHjpM)XUCONf9>lFci7K(WValCD)Ez?8T-OJS7J7akAD_3StWkm z(M>6~p59#9JmTb5)c1@JiD`ah2+q>K#82~!r9?}YFDBI*H_gqYPbeEaU9Jo+L`kbe zLxZmI=%=c%w~TU`15r~UrB6Nml2&_?B%|M%mQgYU@EE4`7x;Da)|uPVb^q1kyHAPs z{WQ#_zwC3hDv~5#&4+(EwTaA-6G2MM8=m>SRLh`^y4<(;oVlI9ha_{qHcuvCjMiVoVm4LI#=an!6QJsam5r4@5`280M#%1 zM)?yq=z94^jp8{?q0VXQIjXG1%DL4@@9(dsW){s?^Yvo>+~ZSt#Av9j4=M|ks|d4))r6d@$sCRK0AL#fnCTvgb1%+6c_D)SDt zEK1;7_vx%gMs^;p7^!^v#(q7<$%Hu>F&-REm*JEZI-?k7hM;{L0EgJzr)XK!t&?u# zg5y}n2TV)`mI@A$7ZAPJ`F5q3ZLSYfd;}*8nR(@jbs9l2t1WZL(G%MdM~VnQleaV6 zQnK>EDJd#FJ%~#lnu=gFhifn1#0Q6s%m7bBG835IXI7CP?NNrt!9j z*GN^#fZXN0L(xMBKyeq5__h2HT?5o~O)|Op9AYxjw!wYcx$ST-zn$N&(NhclJ?_gi z7HbF-J^^aQ?gQ%d^gZf>HTSffpWBj4+3ZmKC}f@rXm3WF<^Y2=ySuL2h`^;0XD>0l z6*4xQ4|NJMu%?zEBCIFQo!+=i6Emx;3ufkKepbB@_^%jlWy5jRs!@zxbH=8)v!RxC^PJmAMdU6a~kr(+SLzv(FVV&_*ylelushPR^Q+cj&Ck+-q@d z?9Jm=wd5&UyvINE3b+(%&bvVPcs=64xTs!;NFnsZ6ye6<1Q^d20d*6|#>IS4V0kv9C{9nd0frs}_91BPFBdWg#|DK~PST)fs!? zplb-pUuTxiknEyJ92+{qdgDzHbn}xhylUL*q6Q?L90?plx0AmL z%I-JJ9(zzhEjnEjW^k0Ad&xE7gq;e|tPoSkLv03s#M&-ITZ13Qn672RPOkLT8gg=n zzxaUWvrF&%#9@pM5@Y5GhsiUn<_z!KcC}xaB#Ox6tJT;fXv`OLT`-11gxMkcSTVVo z(^2|RcGgI(Y}fbWTuRE#sV$dDPMRAY9|b{2n}8lLd=XkuI_S1;1ZNhBpvl$b+(H^r zkaTjBfNnIVDrlgw$GsE8gXz6_aDZ(t|~bR;%M%!-Ko8iAB;fZJ>V>#GSvYie}Qqm>Kdc-)yOqD)J? z9q0Lm@Mym+V_5tn^;6>!$a~}d(|;a*P9H=Zp%4H7iy{C3{$KXDgQJ_J(eH!Lsn)9f zsxXq*X6Y*gO13jTPAHMka49*3Si?ePJuYb++oXUlIT0`rEkIqq!}~A@8yh4K!b;uF zN)t5}@I00ylRa>=#mMvaQWw@n$2s0Z!i^v4!;a&|c^6L*aXu-MeO{@P8P!H@V%L>N za&VY$k`?Lwc69FZe0fPdnoXSL223z9q5Sq}ntFIr8xSz2bA>wUExN#o#(^%@qTl?G zwt-)+`c$RXc@;mOw!h68(ev`S_4C4(m66^IKf$mX((8@vl8p-QaO90(KbBW=#}c!$ z7Wh{+{E9&w_zCl+TT*C%wh)$0e@{~&rVP*&<;^UK1ll9~j;>@hN&-DTeyL1ZF);f8 z5Dt)U>SVSpMiKZfuj+zb4vDD+)+aF4wRH`#ZRYIu8u$HNEzGex=xu$bWY&%mP!U03h^ z_>Ur9dot#&j^U40A$t(p_1S@r0ewEb=$6R)sV`|81qWr1g>Ral?XujWO{lMXFQji& zm|LKQp3iA_sBuEMGK?OD&}tSkB_4B~qOB-Un4e2Fs^03S^jiC9G-~M)iymG_b^{znXdZs z`Z=9hQa%=bQsjqDdSs&~YR9Jl-@$tPfl>OcPCB=jFS~s8uZtF__QnC(mim`&_KXUi zB5D_R!-p2Bg#8;SV&^__;IS{aWaU^Y2k$gp0|VVwcM=nOWx+f1W^S-${Z`s zW8(|eWjeW7X{TkmM;%ySH&g^_y;vM;egsM1&&T*bron!*BVg=!;<5zKIqRt|MvOE- z#Ks!TBV~>fqBO+`N19?!;uYCd-NA@7fKmXJ&U$DdF9&<>L;AJe)H;c1-P9@r_0dcM zJ{sf>?6b3%v!GUv%+~F*iH(n+;#u%i;OpD&`Hk(kRgJ<)hIdI0vv9KDgbk{@dSherHzf-{i54QG`j7h2Xw9H$a0 zLQ;if)6~ZurdNZL)1F^qV$F8lrc?)Al5A?DRyPf@vTbhgcmY`kYKvQlnF_O_2SxM{ zk24|um4Z=cesmkRtSV^KoP#RXd6F0DMqV51yIAje)*>gW(Q>bD!3jxNoCw|&;l_bk zo@7kRr34CM+l1p6Ok~C z!R*HD;gnvyjb|T1Nq}J0r6-(lh}cU9x{fMaaQ!5hN*W4}CVEGR2twKLQNvco4loqZQBdq z03_m6rGhwGGvy97j~RVz`-$^-7gzfQvyDW2rk^a&K^I8$pc^=5?3-AfT0iDKA{A#3 zh_dV4pacdZbhNIx^!6(iEC?eb3I-}NbVL|9m~g)zFfpm_1MXLrs_n+3xvrqivi$Cl zAOH(}DLXOOE3(#piJx45DsX-n6PPfn33LSI1epzIhiV8P{W?VDCh45?Tg7sEFGB1NPM3SGD^nVsJh8zL6C0QDJs9D?sl0$75Z*xQLE*r zsNV2xOH2iYrC>+vp`%S5uzXa6+3Lk}x%7*HvL;`5&rsy3f{H^hxyyT3hu_V3kVI-; z+wC57=-C`$oyy^@MeL@{Yu8`lJ-B5f+)p9H?gy&L%zKX#@#xa)636-yq5B~&hu}G= zVerqy(DH-ggb%*V^izcP_ykOQkXcQs;0r}TC zPB^O?;WP#XU|czG`)6@KRoM8}<5@Tq#-PUB)w_6`XbxOzCbkc>GS?V-&~fS^`mseO!lRSUYbese^_ia+AvL*YXBp=@MD9#OyUyBKBYqldvSK4R=^~y;? zc2BC6sNEV$ICHl0RHG5JsM}0sMyZ9`S{$*6Sa&Va*nGes+)bsSGF1TGW&dXy;f<|| zaU~`{gv6YNR@;CJSNz{$|U!ovLX+;nWmr}w6rVr9c zb6L{-k|4-`kcMmtviRB+Vexg7Jj%Ks&l|NKEUEd3YZ?^ja(K>jZL{cqgJr38Zh@NLnmX1@Qqfqoxl zG@1eH^JV*xeP0d9Yuiu6qZnUS#O-29^-WyAObDd^++myP5&ULlSKQAVq#Q_*#}hwc zR8h=e8F{k&+K9CbE`7AVMaZZrzXWMblL`;y!Q75&z9%Cz|4GOlgc)xXow&g=8A#5O zF0EO;M)ACYzy2W-(ZGp1+QbYEy#K7XcX|53U4Ldk{m1lO*W**Xsg6~=4k!<&!Y1P@ zpn^Vo)FliGUefq@nW|yP&7J!zOR3b2Mmo1==tvTbQ(@%9{?-yc?r8ULrp-8Ob7wV? zW^au;qO^4fwMyXi_mNR=Eg2bOO4hk(C2)3phVSp@wQUqa04;5cKy#qx9eoEcpfBwe z4Rz@f7dUDGg9QX>TVIY5ITNXG&$04eMz&ebFtvq@A0~P%^oJ^glQ&rD+sGC!7#lsN z0vGoa)s6prT&NzeSrs^`K221uonb$^sn0V@9;KM~4mcExxnx;(qM0B+-HCDQElT zM#v5>RjJTp{%$(-Wij}W4aJ~iQ8_E}-ptkewa1NC^N|HraaZ^}Yp5Pv@jPVYFt$+L(-nWz*PYp7 z=_CtWMp7X3RfRK*oALF;(CXc+4_zKj|NKF=UBtoSU6T$> z6N}Pv{N#HP$8^GRsu=cn5y6DC$NCPF?DOnxLZ&FupxeEi$ZaUZUSsa~tZf^mrp09} z#-m_EiOI?3r8J~!qe2uNIF08lSAm+m!y(IZ9OP>K!rKmf*x2iw!xBr*u@F~!1q&Z2 zr>RLv0*Yy55ms^O5OM}MuiTl31CpPqIM`4{f+Q)mTRQycI?WrKQlVLn)pA#9AxOKA z=%wLaQbkAT8kW!MHdYP{&UjLUN`du~BPsr{EF#KM_BL7=_xo38e;pUsqiu5JA&+aCMMYp4 z70(MvyS=%xjm?gNd?DHIsf;eBbkm3^Zmea+XT)P~4+2q#F`N9BqLE`FIyqUs(_TUz z{9wqL3xlxpsH)WL)UnV#zw-3h=J>Nf{5xAFJeb`aW{ZFf1Z?bS$P#+d^8}7syT
)*4+*a~|8N+5_yqnvYDc#5Bx>^!y`}kZT%!ETYv5v}uka_o zWyb8fbq@gwVC$LGBRI(8`&SkT0~R&32{ z1)I)Uy?96W*HE(0`Fi3=^f=#`(=$B1D#4>k?6NSsQLu)FfpFAl&(05=lzX2*+TeX! zNXQ3_(hdu5I!4Hy+CazG5UA+&YIw_1;k8WbjjAj7=-q&1&5dCg)Grpfr;o*a1luN-;eZl0E(6 zSnS@tk9~zudYci0R%QYG{RF2gL~T0dZF5%;%BI=W3vkqty^<8=o15DF!P=Zx{dE8F z1sUcZ_b`2=9J78D1|a?G1sQ)-11Re0TN?cyQ;i?BUi~ff4E6*Md|yreh&UL6%S@LO z?krb;&#RCiBuD}VXSM9JNl3u~9g(9vWkl13wLXnArWs{@epf2(+~d#eE>?p`8alOx zB5#wlcRzYPpFy&fLaS<3ry3Ukuy1^N+HcTG5atlA((dKIBrVCXKr@4GLkjC2V&*rf zVOC}cj&_cxG}dVxM`bJ3ldd$PStlH)$c%+Epvz6ENzmlP>QLfumCaU{Bc3)bHUtq- z_kgNkX)rC{1_qcYJMB@B-9p7nP4J4_TXZE@2lhJRf}tdtFbkG?gGdGgT`_e6nrE*}o*W0*Ma4CM_zYBF*0R^tW^4Gwa@j3?i#$~NU;1#x7_9b2RtpavXsRhI`N ze&*8GtXjEPqs1p>B|rJgieC~vofgKG<<#Ig8Zn>C6@vD{2X?;jUkP|?fd z4n=Kt{Mlw1?_icLg_DzVMp*d!$%QOM<`mU`kZ)sA+T$NQJii9Sy{a#)3FeW zm4~y>Od{5AXQF$WXQE3Hdvg2=wY~4Kgz%CT)wO#N4)YMs5A`7@+lz)NxY-);Y4?i; z{&4&8@mDs#egdn${>RK+WGVk{?+*#sN4^K*N502Ll83CljjaQnfsOs|8upRj{y#|` zAAK((PEO{d)M9W`>IF_>Q=~N)G?@X3g_BR;QFvFw3ck2pYZeWBbHh4dn~03etLfzR zZ1bd3s>nkPl!IP5!Q4L&h!2Qd**;ep`6g|$hXJ}gKWsKnk)98t$}~+&=ey+Ra^?#| zao7Q7h26Fi2ocxf2nN!KU%%mkx?vK-l7nlna ztV>H`^?Pj`(0F&Z&7(P`!(ZPPmoR&TwH??&OfiT}oZ#is6py%hS? zX_z1MbLPg>n}h74D?O&g7(Yc_c=E7;D9~qt}(_{_BrKr8pfy%jGuZZ=&&}VfPy~k00uI+fx zJmvs0XQ~!sd1q4jM;LF z@#mQP-{F6clKlZ^!uuEezhY*82mjqz{sU}A`%m!SOy|Ed{N0-Sg8`cUpA3Jo?f#Dc z>s0**8UV0j1OWbR;{H4QuY=~F;Y2Kdg8y>}m6ZVdP> Date: Wed, 29 Jan 2025 16:05:47 +0100 Subject: [PATCH 6/8] Add standalone equations as DocItem formula Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 6d7f3debf..52af584f5 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -242,8 +242,8 @@ def handle_equations_in_text(self, element, text): def handle_text_elements(self, element, docx_obj, doc): paragraph = docx.text.paragraph.Paragraph(element, docx_obj) - text = paragraph.text - text = self.handle_equations_in_text(element=element, text=text) + raw_text = paragraph.text + text = self.handle_equations_in_text(element=element, text=raw_text) if text is None: return @@ -281,21 +281,20 @@ def handle_text_elements(self, element, docx_obj, doc): self.parents[key] = None self.level = self.level_at_new_list - 1 self.level_at_new_list = None + if p_style_id in ["Title"]: for key, val in self.parents.items(): self.parents[key] = None self.parents[0] = doc.add_text( parent=None, label=DocItemLabel.TITLE, text=text ) + elif "Heading" in p_style_id: self.add_header(element, docx_obj, doc, p_style_id, p_level, text) elif p_style_id in [ - "Paragraph", - "Normal", "Subtitle", "Author", - "DefaultText", "ListParagraph", "ListBullet", "Quote", @@ -305,12 +304,32 @@ def handle_text_elements(self, element, docx_obj, doc): label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text ) + elif (raw_text is None or len(raw_text) == 0) and len(text) > 0: + # Standalone equation + # Entities in which all text comes from equations + level = self.get_level() + if text.strip().startswith("$") and text.strip().endswith("$"): + text = text.strip()[1:-1] + doc.add_text( + label=DocItemLabel.FORMULA, parent=self.parents[level - 1], text=text + ) + + elif p_style_id in [ + "Paragraph", + "Normal", + "DefaultText", + ]: + level = self.get_level() + doc.add_text( + label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text + ) + else: # Text style names can, and will have, not only default values but user values too # hence we treat all other labels as pure text level = self.get_level() doc.add_text( - label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text + label=DocItemLabel.TEXT, parent=self.parents[level - 1], text=text ) self.update_history(p_style_id, p_level, numid, ilevel) return From 97f444b11c2724d292c9f7444188c41a34b9e1ee Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Wed, 29 Jan 2025 16:06:33 +0100 Subject: [PATCH 7/8] Update test files Signed-off-by: Rafael Teixeira de Lima --- .../docling_v2/equations.docx.itxt | 10 +++---- .../docling_v2/equations.docx.json | 30 +++++++++---------- .../groundtruth/docling_v2/equations.docx.md | 10 +++---- .../docling_v2/word_sample.docx.itxt | 2 +- .../docling_v2/word_sample.docx.json | 2 +- 5 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tests/data/groundtruth/docling_v2/equations.docx.itxt b/tests/data/groundtruth/docling_v2/equations.docx.itxt index c28443a92..b6fc5d14f 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.itxt +++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt @@ -1,31 +1,31 @@ item-0 at level 0: unspecified: group _root_ item-1 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: item-2 at level 1: paragraph: - item-3 at level 1: paragraph: $a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$ + item-3 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23 item-4 at level 1: paragraph: And that is an equation by itself. Cheers! item-5 at level 1: paragraph: item-6 at level 1: paragraph: This is another equation: - item-7 at level 1: paragraph: $f\left(x\right)=a_{0}+\sum_{n=1 ... )+b_{n}\sin(\frac{n \pi x}{L})\right)$ + item-7 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right) item-8 at level 1: paragraph: item-9 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. item-10 at level 1: paragraph: item-11 at level 1: paragraph: item-12 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: item-13 at level 1: paragraph: - item-14 at level 1: paragraph: $\left(x+a\right)^{n}=\sum_{k=0} ... c{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$ + item-14 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k} item-15 at level 1: paragraph: item-16 at level 1: paragraph: And that is an equation by itself. Cheers! item-17 at level 1: paragraph: item-18 at level 1: paragraph: This is another equation: item-19 at level 1: paragraph: - item-20 at level 1: paragraph: $\left(1+x\right)^{n}=1+\frac{nx ... t)x^{2}}{2!}+ \text{ \textellipsis } $ + item-20 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ht)x^{2}}{2!}+ \text{ \textellipsis } item-21 at level 1: paragraph: item-22 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. item-23 at level 1: paragraph: item-24 at level 1: paragraph: item-25 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: item-26 at level 1: paragraph: - item-27 at level 1: paragraph: $e^{x}=1+\frac{x}{1!}+\frac{x^{2 ... ellipsis } , - \infty < x < \infty $ + item-27 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... tellipsis } , - \infty < x < \infty item-28 at level 1: paragraph: item-29 at level 1: paragraph: And that is an equation by itself. Cheers! item-30 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json index 2f6cb7ca9..1905f9cab 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.json +++ b/tests/data/groundtruth/docling_v2/equations.docx.json @@ -140,10 +140,10 @@ "$ref": "#/body" }, "children": [], - "label": "paragraph", + "label": "formula", "prov": [], - "orig": "$a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23$", - "text": "$a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23$" + "orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23", + "text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23" }, { "self_ref": "#/texts/3", @@ -184,10 +184,10 @@ "$ref": "#/body" }, "children": [], - "label": "paragraph", + "label": "formula", "prov": [], - "orig": "$f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)$", - "text": "$f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)$" + "orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)", + "text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)" }, { "self_ref": "#/texts/7", @@ -261,10 +261,10 @@ "$ref": "#/body" }, "children": [], - "label": "paragraph", + "label": "formula", "prov": [], - "orig": "$\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}$", - "text": "$\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}$" + "orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}", + "text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}" }, { "self_ref": "#/texts/14", @@ -327,10 +327,10 @@ "$ref": "#/body" }, "children": [], - "label": "paragraph", + "label": "formula", "prov": [], - "orig": "$\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } $", - "text": "$\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } $" + "orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } ", + "text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } " }, { "self_ref": "#/texts/20", @@ -404,10 +404,10 @@ "$ref": "#/body" }, "children": [], - "label": "paragraph", + "label": "formula", "prov": [], - "orig": "$e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty $", - "text": "$e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty $" + "orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty ", + "text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty " }, { "self_ref": "#/texts/27", diff --git a/tests/data/groundtruth/docling_v2/equations.docx.md b/tests/data/groundtruth/docling_v2/equations.docx.md index 7364d1297..bb023bbda 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.md +++ b/tests/data/groundtruth/docling_v2/equations.docx.md @@ -1,29 +1,29 @@ This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this: -$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$ +$$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$ And that is an equation by itself. Cheers! This is another equation: -$f\left(x\right)=a\_{0}+\sum\_{n=1}^{ \infty }\left(a\_{n}\cos(\frac{n \pi x}{L})+b\_{n}\sin(\frac{n \pi x}{L})\right)$ +$$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+b_{n}\sin(\frac{n \pi x}{L})\right)$$ This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this: -$\left(x+a\right)^{n}=\sum\_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$ +$$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$ And that is an equation by itself. Cheers! This is another equation: -$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis } $ +$$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis } $$ This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this: -$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty $ +$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty $$ And that is an equation by itself. Cheers! \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt index ce60ad261..b03255109 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt @@ -3,7 +3,7 @@ item-0 at level 0: unspecified: group _root_ item-2 at level 1: title: Swimming in the lake item-3 at level 2: paragraph: Duck item-4 at level 2: picture - item-5 at level 2: paragraph: Figure 1: This is a cute duckling + item-5 at level 2: text: Figure 1: This is a cute duckling item-6 at level 2: section_header: Let’s swim! item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: item-8 at level 3: list: group list diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json index 8c6e62989..44b4bd61c 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.json +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -138,7 +138,7 @@ "$ref": "#/texts/1" }, "children": [], - "label": "paragraph", + "label": "text", "prov": [], "orig": "Figure 1: This is a cute duckling", "text": "Figure 1: This is a cute duckling" From c82ec123ffafde1dfddd94674639786fd0ea1072 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Fri, 31 Jan 2025 14:35:39 +0100 Subject: [PATCH 8/8] Fix poetry.lock Signed-off-by: Rafael Teixeira de Lima --- poetry.lock | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/poetry.lock b/poetry.lock index ef06fe91a..69d414824 100644 --- a/poetry.lock +++ b/poetry.lock @@ -182,8 +182,8 @@ files = [ lazy-object-proxy = ">=1.4.0" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} wrapt = [ - {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, + {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -2817,8 +2817,8 @@ files = [ [package.dependencies] multiprocess = [ - {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""}, {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""}, + {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""}, ] pygments = ">=2.0" pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""} @@ -3833,10 +3833,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, ] @@ -3859,10 +3859,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, + {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""}, {version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""}, ] @@ -4048,9 +4048,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, - {version = ">=1.23.2", markers = "python_version == \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, + {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4824,8 +4824,8 @@ files = [ astroid = ">=2.15.8,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} dill = [ - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, {version = ">=0.2", markers = "python_version < \"3.11\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -7835,4 +7835,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "336970505f4bae6b21f4cf358ebf6b5ef4fa42a4980358297e63bfea381b350a" +content-hash = "3e5f23bc034f4eb241532773cfa9ccc4f79780a1b59618a6ded18be5f927ce1d"