Skip to content

Commit

Permalink
feat: added HTTP header support for document converter and CLI (#642)
Browse files: browse the repository at this point in the history
* added HTTP header support for document converter and CLI

Signed-off-by: Luke Harrison <[email protected]>

* fixed formatting and typing issues

Signed-off-by: Luke Harrison <[email protected]>

* use pydantic to parse dict

suggested by @dolfim-ibm

Co-authored-by: Michele Dolfi <[email protected]>
Signed-off-by: Luke Harrison <[email protected]>

---------

Signed-off-by: Luke Harrison <[email protected]>
Signed-off-by: Luke Harrison <[email protected]>
Co-authored-by: Michele Dolfi <[email protected]>
  • Loading branch information
lharrison13 and dolfim-ibm authored Jan 7, 2025
1 parent 569038d commit 0ee849e
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 5 deletions.
16 changes: 14 additions & 2 deletions docling/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,11 @@ def convert(
to_formats: List[OutputFormat] = typer.Option(
None, "--to", help="Specify output formats. Defaults to Markdown."
),
headers: str = typer.Option(
None,
"--headers",
help="Specify http request headers used when fetching url input sources in the form of a JSON string",
),
image_export_mode: Annotated[
ImageRefMode,
typer.Option(
Expand Down Expand Up @@ -279,12 +284,19 @@ def convert(
if from_formats is None:
from_formats = [e for e in InputFormat]

parsed_headers: Optional[Dict[str, str]] = None
if headers is not None:
headers_t = TypeAdapter(Dict[str, str])
parsed_headers = headers_t.validate_json(headers)

with tempfile.TemporaryDirectory() as tempdir:
input_doc_paths: List[Path] = []
for src in input_sources:
try:
# check if we can fetch some remote url
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
source = resolve_source_to_path(
source=src, headers=parsed_headers, workdir=Path(tempdir)
)
input_doc_paths.append(source)
except FileNotFoundError:
err_console.print(
Expand Down Expand Up @@ -390,7 +402,7 @@ def convert(
start_time = time.time()

conv_results = doc_converter.convert_all(
input_doc_paths, raises_on_error=abort_on_error
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
)

output.mkdir(parents=True, exist_ok=True)
Expand Down
7 changes: 6 additions & 1 deletion docling/datamodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,13 +227,18 @@ def unload(self):
class _DocumentConversionInput(BaseModel):

path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
headers: Optional[Dict[str, str]] = None
limits: Optional[DocumentLimits] = DocumentLimits()

def docs(
self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator:
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
obj = (
resolve_source_to_stream(item, self.headers)
if isinstance(item, str)
else item
)
format = self._guess_format(obj)
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys():
Expand Down
6 changes: 4 additions & 2 deletions docling/document_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ def initialize_pipeline(self, format: InputFormat):
def convert(
self,
source: Union[Path, str, DocumentStream], # TODO review naming
headers: Optional[Dict[str, str]] = None,
raises_on_error: bool = True,
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
Expand All @@ -185,13 +186,15 @@ def convert(
raises_on_error=raises_on_error,
max_num_pages=max_num_pages,
max_file_size=max_file_size,
headers=headers,
)
return next(all_res)

@validate_call(config=ConfigDict(strict=True))
def convert_all(
self,
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
headers: Optional[Dict[str, str]] = None,
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
max_num_pages: int = sys.maxsize,
max_file_size: int = sys.maxsize,
Expand All @@ -201,8 +204,7 @@ def convert_all(
max_file_size=max_file_size,
)
conv_input = _DocumentConversionInput(
path_or_stream_iterator=source,
limits=limits,
path_or_stream_iterator=source, limits=limits, headers=headers
)
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)

Expand Down

0 comments on commit 0ee849e

Please sign in to comment.