diff --git a/tests/data/docx/equations.docx b/tests/data/docx/equations.docx new file mode 100644 index 00000000..8ab71b96 Binary files /dev/null and b/tests/data/docx/equations.docx differ diff --git a/tests/data/groundtruth/docling_v2/equations.docx.itxt b/tests/data/groundtruth/docling_v2/equations.docx.itxt new file mode 100644 index 00000000..c28443a9 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt @@ -0,0 +1,31 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: + item-2 at level 1: paragraph: + item-3 at level 1: paragraph: $a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$ + item-4 at level 1: paragraph: And that is an equation by itself. Cheers! + item-5 at level 1: paragraph: + item-6 at level 1: paragraph: This is another equation: + item-7 at level 1: paragraph: $f\left(x\right)=a_{0}+\sum_{n=1 ... )+b_{n}\sin(\frac{n \pi x}{L})\right)$ + item-8 at level 1: paragraph: + item-9 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. + item-10 at level 1: paragraph: + item-11 at level 1: paragraph: + item-12 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: + item-13 at level 1: paragraph: + item-14 at level 1: paragraph: $\left(x+a\right)^{n}=\sum_{k=0} ... c{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$ + item-15 at level 1: paragraph: + item-16 at level 1: paragraph: And that is an equation by itself. Cheers! + item-17 at level 1: paragraph: + item-18 at level 1: paragraph: This is another equation: + item-19 at level 1: paragraph: + item-20 at level 1: paragraph: $\left(1+x\right)^{n}=1+\frac{nx ... t)x^{2}}{2!}+ \text{ \textellipsis } $ + item-21 at level 1: paragraph: + item-22 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. + item-23 at level 1: paragraph: + item-24 at level 1: paragraph: + item-25 at level 1: paragraph: This is a word document and this ... nt an equation by line, I can do this: + item-26 at level 1: paragraph: + item-27 at level 1: paragraph: $e^{x}=1+\frac{x}{1!}+\frac{x^{2 ... ellipsis } , - \infty < x < \infty $ + item-28 at level 1: paragraph: + item-29 at level 1: paragraph: And that is an equation by itself. Cheers! + item-30 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json new file mode 100644 index 00000000..2f6cb7ca --- /dev/null +++ b/tests/data/groundtruth/docling_v2/equations.docx.json @@ -0,0 +1,450 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "equations", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 11121138535595486899, + "filename": "equations.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/texts/29" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:", + "text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "$a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23$", + "text": "$a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23$" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "And that is an equation by itself. Cheers!", + "text": "And that is an equation by itself. Cheers!" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is another equation:", + "text": "This is another equation:" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "$f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)$", + "text": "$f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)$" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", + "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text." + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:", + "text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "$\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}$", + "text": "$\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}$" + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "And that is an equation by itself. Cheers!", + "text": "And that is an equation by itself. Cheers!" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is another equation:", + "text": "This is another equation:" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "$\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } $", + "text": "$\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis } $" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", + "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text." + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:", + "text": "This is a word document and this is an inline equation: $A= \\pi r^{2} $. If instead, I want an equation by line, I can do this:" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "$e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty $", + "text": "$e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty $" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "And that is an equation by itself. Cheers!", + "text": "And that is an equation by itself. Cheers!" + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/body" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/equations.docx.md b/tests/data/groundtruth/docling_v2/equations.docx.md new file mode 100644 index 00000000..7364d129 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/equations.docx.md @@ -0,0 +1,29 @@ +This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this: + +$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$ + +And that is an equation by itself. Cheers! + +This is another equation: + +$f\left(x\right)=a\_{0}+\sum\_{n=1}^{ \infty }\left(a\_{n}\cos(\frac{n \pi x}{L})+b\_{n}\sin(\frac{n \pi x}{L})\right)$ + +This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. + +This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this: + +$\left(x+a\right)^{n}=\sum\_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$ + +And that is an equation by itself. Cheers! + +This is another equation: + +$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis } $ + +This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. + +This is a word document and this is an inline equation: $A= \pi r^{2} $. If instead, I want an equation by line, I can do this: + +$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty $ + +And that is an equation by itself. Cheers! \ No newline at end of file