Skip to content

Commit

Permalink
Add parsing configuration
Browse files: browse the repository at this point in the history
  • Loading branch information
rateixei committed Jan 28, 2025
1 parent 6fe4201 commit e3fd60d
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 4 deletions.
15 changes: 12 additions & 3 deletions docling/backend/msword_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,12 @@


class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
get_latex=False,
):
super().__init__(in_doc, path_or_stream)
self.XML_KEY = (
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
Expand All @@ -49,6 +54,9 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.level = 0
self.listIter = 0

# Convert MS Word equations to LaTeX
self.get_latex = get_latex

self.history = {
"names": [None],
"levels": [None],
Expand Down Expand Up @@ -240,9 +248,10 @@ def handle_text_elements(self, element, docx_obj, doc):
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)

text = paragraph.text
text = self.handle_equations_in_text(element=element, text=text)
if self.get_latex:
text = self.handle_equations_in_text(element=element, text=text)

if paragraph.text is None:
if text is None:
return
text = text.strip()

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ module = [
"deepsearch_glm.*",
"lxml.*",
"bs4.*",
"huggingface_hub.*"
"huggingface_hub.*",
"pylatexenc.*"
]
ignore_missing_imports = true

Expand Down

0 comments on commit e3fd60d

Please sign in to comment.