diff --git a/docling/cli/main.py b/docling/cli/main.py
index a83aecbf..e1ce289e 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -164,6 +164,11 @@ def convert(
     to_formats: List[OutputFormat] = typer.Option(
         None, "--to", help="Specify output formats. Defaults to Markdown."
     ),
+    headers: str = typer.Option(
+        None,
+        "--headers",
+        help="Specify http request headers used when fetching url input sources in the form of a JSON string",
+    ),
     image_export_mode: Annotated[
         ImageRefMode,
         typer.Option(
@@ -279,12 +284,19 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]

+    parsed_headers: Optional[Dict[str, str]] = None
+    if headers is not None:
+        headers_t = TypeAdapter(Dict[str, str])
+        parsed_headers = headers_t.validate_json(headers)
+
     with tempfile.TemporaryDirectory() as tempdir:
         input_doc_paths: List[Path] = []
         for src in input_sources:
             try:
                 # check if we can fetch some remote url
-                source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+                source = resolve_source_to_path(
+                    source=src, headers=parsed_headers, workdir=Path(tempdir)
+                )
                 input_doc_paths.append(source)
             except FileNotFoundError:
                 err_console.print(
@@ -390,7 +402,7 @@ def convert(
         start_time = time.time()

         conv_results = doc_converter.convert_all(
-            input_doc_paths, raises_on_error=abort_on_error
+            input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
         )

         output.mkdir(parents=True, exist_ok=True)
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 4ed7d577..136428e8 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -227,13 +227,18 @@ def unload(self):

 class _DocumentConversionInput(BaseModel):
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
+    headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()

     def docs(
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
+            obj = (
+                resolve_source_to_stream(item, self.headers)
+                if isinstance(item, str)
+                else item
+            )
             format = self._guess_format(obj)
             backend: Type[AbstractDocumentBackend]
             if format not in format_options.keys():
diff --git a/docling/document_converter.py b/docling/document_converter.py
index c9cbedd2..cb073949 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -176,6 +176,7 @@ def initialize_pipeline(self, format: InputFormat):
     def convert(
         self,
         source: Union[Path, str, DocumentStream],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -185,6 +186,7 @@ def convert(
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
+            headers=headers,
         )
         return next(all_res)

@@ -192,6 +194,7 @@ def convert(
     def convert_all(
         self,
         source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
         max_file_size: int = sys.maxsize,
@@ -201,8 +204,7 @@ def convert_all(
             max_file_size=max_file_size,
         )
         conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source,
-            limits=limits,
+            path_or_stream_iterator=source, limits=limits, headers=headers
         )
         conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
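
Usage sketch for the new parameter (the URL and token below are placeholders, not part of this change). On the CLI, headers are passed as a JSON string:

    docling https://example.com/report.pdf --headers '{"Authorization": "Bearer <token>"}'

From Python, they are passed as a plain dict to convert() or convert_all():

    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    result = converter.convert(
        "https://example.com/report.pdf",  # placeholder URL
        headers={"Authorization": "Bearer <token>"},  # placeholder credentials
    )
    print(result.document.export_to_markdown())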