diff --git a/.github/workflows/pytest_ords.yml b/.github/workflows/pytest_ords.yml index 0b69bef3..00271206 100644 --- a/.github/workflows/pytest_ords.yml +++ b/.github/workflows/pytest_ords.yml @@ -34,6 +34,7 @@ jobs: shell: bash - name: Install dependencies run: | + conda install -y pip conda install -c conda-forge poppler python -m pip install --upgrade pip python -m pip install pdftotext diff --git a/.github/workflows/pytest_postgres.yml b/.github/workflows/pytest_postgres.yml index 84054c66..dedebae9 100644 --- a/.github/workflows/pytest_postgres.yml +++ b/.github/workflows/pytest_postgres.yml @@ -26,6 +26,7 @@ jobs: - name: Install dependencies shell: bash -l {0} run: | + conda install -y pip python -m pip install --upgrade pip python -m pip install psycopg2-binary python -m pip install boto3 diff --git a/docs/source/conf.py b/docs/source/conf.py index f305ea82..82213916 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -141,7 +141,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'rexdoc' +htmlhelp_basename = 'elmdoc' # -- Options for LaTeX output ------------------------------------------------ diff --git a/docs/source/dev/ords_architecture.rst b/docs/source/dev/ords_architecture.rst index d97af80a..a6e9244a 100644 --- a/docs/source/dev/ords_architecture.rst +++ b/docs/source/dev/ords_architecture.rst @@ -363,7 +363,7 @@ for multiprocessing tasks. -------------------------------------------------------------------------------------------------------------------------------------------------- -**4.2.2** :class:`~elm.web.file_loader.AsyncFileLoader` +**4.2.2** :class:`~elm.web.file_loader.AsyncWebFileLoader` ------------------------------------------------------- .. literalinclude:: ../../../elm/web/file_loader.py @@ -376,10 +376,10 @@ for multiprocessing tasks. .. code-block:: python import asyncio - from elm.web.file_loader import AsyncFileLoader + from elm.web.file_loader import AsyncWebFileLoader async def main(): - loader = AsyncFileLoader() + loader = AsyncWebFileLoader() doc = await loader.fetch( url="https://en.wikipedia.org/wiki/National_Renewable_Energy_Laboratory" ) @@ -781,7 +781,7 @@ We give a rough breakdown of the following call: 1. :func:`~elm.web.search.run.web_search_links_as_docs()` is invoked with 3 queries and ``num_urls=4``. 2. Each of the three queries are processed asynchronously, creating a :class:`~elm.web.google_search.PlaywrightGoogleLinkSearch` instance and retrieving the top URL results. 3. Internal code reduces the URL lists returned from each of the queries into the top 4 URLs. -4. :class:`~elm.web.file_loader.AsyncFileLoader` asynchronously downloads the content for reach of the top 4 URLs, determines the document type the content should be stored +4. :class:`~elm.web.file_loader.AsyncWebFileLoader` asynchronously downloads the content for each of the top 4 URLs, determines the document type the content should be stored in (:class:`~elm.web.document.HTMLDocument` or :class:`~elm.web.document.PDFDocument`), creates and populates the document instances, and returns the document to the caller.
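For reference, a minimal sketch of the call being traced in this breakdown (the query strings are illustrative placeholders; ``num_urls=4`` matches the example above):

.. code-block:: python

    import asyncio

    from elm.web.search.run import web_search_links_as_docs

    async def main():
        # three queries; the result URL lists are reduced to the top 4 overall
        docs = await web_search_links_as_docs(
            ["query one", "query two", "query three"],
            num_urls=4,
        )
        for doc in docs:
            # each entry is an HTMLDocument or PDFDocument instance
            print(type(doc).__name__, doc.attrs.get("source"))

    asyncio.run(main())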
**Sequence Diagram:** @@ -791,7 +791,7 @@ We give a rough breakdown of the following call: sequenceDiagram participant A as web_search_links_as_docs() participant B as PlaywrightGoogleLinkSearch - participant D as AsyncFileLoader + participant D as AsyncWebFileLoader participant E as HTMLDocument participant F as PDFDocument diff --git a/elm/ords/download.py b/elm/ords/download.py index 0b0f6470..38e45700 100644 --- a/elm/ords/download.py +++ b/elm/ords/download.py @@ -48,9 +48,10 @@ async def download_county_ordinance( ordinance document. By default, ``5``. file_loader_kwargs : dict, optional Dictionary of keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader` with. If found, the - "pw_launch_kwargs" key in these will also be used to initialize - the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` + :class:`elm.web.file_loader.AsyncWebFileLoader` with. If found, + the "pw_launch_kwargs" key in these will also be used to + initialize the + :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` used for the google URL search. By default, ``None``. browser_semaphore : :class:`asyncio.Semaphore`, optional Semaphore instance that can be used to limit the number of diff --git a/elm/ords/process.py b/elm/ords/process.py index 91bb0aa5..69697bcb 100644 --- a/elm/ords/process.py +++ b/elm/ords/process.py @@ -425,7 +425,7 @@ async def process_county_with_logging( ordinance document. By default, ``5``. file_loader_kwargs : dict, optional Dictionary of keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader` with. The + :class:`elm.web.file_loader.AsyncWebFileLoader` with. The "pw_launch_kwargs" key in these will also be used to initialize the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` used for the google URL search. By default, ``None``. @@ -498,7 +498,7 @@ async def process_county( ordinance document. By default, ``5``. file_loader_kwargs : dict, optional Dictionary of keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader` with. The + :class:`elm.web.file_loader.AsyncWebFileLoader` with. The "pw_launch_kwargs" key in these will also be used to initialize the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` used for the google URL search. By default, ``None``. diff --git a/elm/version.py b/elm/version.py index 12fae424..0887c580 100644 --- a/elm/version.py +++ b/elm/version.py @@ -2,4 +2,4 @@ ELM version number """ -__version__ = "0.0.38" +__version__ = "0.0.39" diff --git a/elm/web/document.py b/elm/web/document.py index 8b625bc2..35493918 100644 --- a/elm/web/document.py +++ b/elm/web/document.py @@ -38,7 +38,7 @@ class BaseDocument(ABC): and formats tables. 3. Track pages and other document metadata. Key Relationships: - Created by :class:`~elm.web.file_loader.AsyncFileLoader` and + Created by :class:`~elm.web.file_loader.AsyncWebFileLoader` and used all over ordinance code. .. end desc @@ -339,6 +339,63 @@ def _raw_pages(self): return self.text_splitter.split_text("\n\n".join(self.pages)) +class MDDocument(BaseDocument): + """ELM Markdown document""" + + MARKDOWN_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL) + """Regex pattern to remove HTML comments from markdown text""" + WRITE_KWARGS = {"mode": "w", "encoding": "utf-8"} + FILE_EXTENSION = "md" + + def __init__(self, pages, attrs=None, remove_comments=True, + text_splitter=None): + """ + + Parameters + ---------- + pages : iterable + Iterable of strings, where each string is a page of a + document.
+ attrs : dict, optional + Optional dict containing metadata for the document. + By default, ``None``. + remove_comments : bool, optional + Option to remove HTML comments in Markdown text during + cleaning. By default, ``True``. + text_splitter : obj, optional + Instance of an object that implements a `split_text` method. + The method should take text as input (str) and return a list + of text chunks. The raw pages will be passed through this + splitter to create raw pages for this document. Langchain's + text splitters should work for this input. + By default, ``None``, which means the original pages input + becomes the raw pages attribute. + """ + super().__init__(pages, attrs=attrs) + self.remove_comments = remove_comments + self.text_splitter = text_splitter + + def _cleaned_text(self): + """Compute cleaned text from document""" + text = combine_pages(self.pages) + if self.remove_comments: + text = self.MARKDOWN_COMMENT_RE.sub("", text) + return text + + def _raw_pages(self): + """Get raw pages from document""" + if self.text_splitter is None: + return self.pages + return self.text_splitter.split_text("\n\n".join(self.pages)) + + @property + def empty(self): + """bool: ``True`` if the document contains no pages.""" + # Always strip comments when checking if doc is empty + return not any(_non_empty_pages(( + self.MARKDOWN_COMMENT_RE.sub("", p) for p in self.pages))) + + def _non_empty_pages(pages): """Return all pages with more than 10 chars""" return filter( diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 57b808bd..8f639b63 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -51,39 +51,14 @@ async def _read_html_file(html_fp, **kwargs): class BaseAsyncFileLoader(ABC): """Base class for async file loading""" - def __init__( - self, - pdf_read_coroutine, - html_read_coroutine, - pdf_read_kwargs=None, - html_read_kwargs=None, - pdf_ocr_read_coroutine=None, - file_cache_coroutine=None, - **__, # consume any extra kwargs - ): + def __init__(self, + file_cache_coroutine=None, + **__, # consume any extra kwargs + ): """ Parameters ---------- - pdf_read_coroutine : callable - PDF file read coroutine. Must by an async function. - Must return a :obj:`elm.web.document.PDFDocument`. - html_read_coroutine : callable, optional - HTML file read coroutine. Must by an async function. - Must return a :obj:`elm.web.document.HTMLDocument`. - pdf_read_kwargs : dict, optional - Keyword-value argument pairs to pass to the - `pdf_read_coroutine`. By default, ``None``. - html_read_kwargs : dict, optional - Keyword-value argument pairs to pass to the - `html_read_coroutine`. By default, ``None``. - pdf_ocr_read_coroutine : callable, optional - PDF OCR file read coroutine. Must by an async function. - Should accept PDF bytes as the first argument and kwargs as - the rest. Must return a :obj:`elm.web.document.PDFDocument`. - If ``None``, PDF OCR parsing is not attempted, and any - scanned PDF URL's will return a blank document. - By default, ``None``. file_cache_coroutine : callable, optional File caching coroutine. Can be used to cache files downloaded by this class. Must accept an @@ -92,11 +67,6 @@ def __init__( argument. If this method is not provided, no document caching is performed. By default, ``None``.
""" - self.pdf_read_coroutine = pdf_read_coroutine - self.html_read_coroutine = html_read_coroutine - self.pdf_read_kwargs = pdf_read_kwargs or {} - self.html_read_kwargs = html_read_kwargs or {} - self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine self.file_cache_coroutine = file_cache_coroutine async def fetch_all(self, *sources): @@ -157,6 +127,10 @@ async def _fetch_doc_with_url_in_metadata(self, source): async def _cache_doc(self, doc, raw_content): """Cache doc if user provided a coroutine""" if doc.empty or not raw_content: + if self.file_cache_coroutine: + logger.debug("Not caching document for source %r because the " + "document is empty or there is no raw content", + doc.attrs.get("source", "Unknown")) return doc if not self.file_cache_coroutine: @@ -190,9 +164,6 @@ class AsyncWebFileLoader(BaseAsyncFileLoader): .. end desc """ - PAGE_LOAD_TIMEOUT = 60_000 - """Default page load timeout value in milliseconds""" - def __init__( self, header_template=None, @@ -276,65 +247,39 @@ def __init__( of attempts will always be 2, even if the user provides a value smaller than this. By default, ``3``. """ - super().__init__( - pdf_read_coroutine=pdf_read_coroutine or _read_pdf_doc, - html_read_coroutine=html_read_coroutine or _read_html_doc, - pdf_read_kwargs=pdf_read_kwargs, - html_read_kwargs=html_read_kwargs, - pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, - file_cache_coroutine=file_cache_coroutine - ) - self.pw_launch_kwargs = pw_launch_kwargs or {} - self.get_kwargs = { - "headers": self._header_from_template(header_template), - "ssl": None if verify_ssl else False, - **(aget_kwargs or {}), - } - self.browser_semaphore = browser_semaphore - self.uss = use_scrapling_stealth - self.num_pw_html_retries = num_pw_html_retries - - def _header_from_template(self, header_template): - """Compile header from user or default template""" - headers = header_template or DEFAULT_HEADERS - headers = dict(headers) - if not headers.get("User-Agent"): - headers["User-Agent"] = UserAgent().random - return headers + super().__init__(file_cache_coroutine=file_cache_coroutine) + self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc + self.pdf_read_kwargs = pdf_read_kwargs or {} + self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine + self.content_fetcher = AsyncFetchWithRetry( + header_template=header_template, verify_ssl=verify_ssl, + aget_kwargs=aget_kwargs) + self.html_loader = AsyncHTMLLoader( + pw_launch_kwargs=pw_launch_kwargs, + html_read_kwargs=html_read_kwargs,html_read_coroutine=html_read_coroutine, + browser_semaphore=browser_semaphore, use_scrapling_stealth=use_scrapling_stealth, + num_pw_html_retries=num_pw_html_retries) async def _fetch_doc(self, url): """Fetch a doc by trying pdf read, then HTML read, then PDF OCR""" - async with aiohttp.ClientSession() as session: - try: - logger.debug("Fetching content from %r", url) - out = await self._fetch_content_with_retry(url, session) - except ELMRuntimeError: - logger.exception("Could not fetch content from %r", url) - return PDFDocument(pages=[]), None + out = await self.content_fetcher.fetch(url) + if out is None: + return PDFDocument(pages=[]), None - raw_content, ct, charset = out + raw_content, ct, charset, __ = out logger.debug("Got content from %r", url) doc = await self.pdf_read_coroutine(raw_content, **self.pdf_read_kwargs) if not doc.empty: return doc, raw_content - logger.debug("PDF read failed; fetching HTML content from %r", url) - doc = await self._fetch_html_using_pw_with_retry(url) + logger.debug("PDF read failed") + doc = 
await self.html_loader.fetch(url, raw_content, ct, charset) if not doc.empty: return doc, doc.text - if "text" in ct: - logger.debug("HTML read with playwright failed; fetching HTML " - "content from response with content type %r and " - "charset %r for %r", ct, charset, url) - doc = await self._try_load_doc_from_response_text(raw_content, - charset) - if not doc.empty: - return doc, doc.text - - elif self.pdf_ocr_read_coroutine: + if self.pdf_ocr_read_coroutine: logger.debug("HTML read failed; fetching OCR content from %r", url) doc = await self.pdf_ocr_read_coroutine( raw_content, **self.pdf_read_kwargs @@ -342,6 +287,174 @@ async def _fetch_doc(self, url): return doc, raw_content + +class AsyncFetchWithRetry: + """Loader for fetching content from the web with retry attempts""" + + def __init__(self, header_template=None, verify_ssl=True, + aget_kwargs=None, client_kwargs=None): + """ + + Parameters + ---------- + header_template : dict, optional + Optional GET header template. If not specified, uses + :obj:`~elm.web.utilities.DEFAULT_HEADERS`. + By default, ``None``. + verify_ssl : bool, optional + Option to use aiohttp's default SSL check. If ``False``, + SSL certificate validation is skipped. By default, ``True``. + aget_kwargs : dict, optional + Other kwargs to pass to :meth:`aiohttp.ClientSession.get`. + By default, ``None``. + client_kwargs : dict, optional + Other kwargs to pass to :class:`aiohttp.ClientSession`. + By default, ``None``. + """ + self.get_kwargs = { + "headers": _header_from_template(header_template), + "ssl": None if verify_ssl else False, + **(aget_kwargs or {}), + } + self.client_kwargs = client_kwargs or {} + + async def fetch(self, url): + """Fetch content from the web + + Parameters + ---------- + url : str + URL to fetch content from. + + Returns + ------- + tuple or None + Tuple of (content bytes, content type, charset, headers) if + the fetch was successful, else ``None``. + """ + async with aiohttp.ClientSession(**self.client_kwargs) as session: + try: + logger.debug("Fetching content from %r", url) + return await self._fetch_content_with_retry(url, session) + except ELMRuntimeError: + logger.exception("Could not fetch content from %r", url) + return None + + @async_retry_with_exponential_backoff( + base_delay=2, + exponential_base=1.5, + jitter=False, + max_retries=3, + errors=( + aiohttp.ClientConnectionError, + aiohttp.client_exceptions.ClientError, + ), + ) + async def _fetch_content_with_retry(self, url, session): + """Fetch content from URL with several retry attempts""" + async with session.get(url, **self.get_kwargs) as response: + body = await response.read() + headers = response.headers + ct = response.content_type.casefold() + charset = response.charset or 'utf-8' + return body, ct, charset, headers + + +class AsyncHTMLLoader: + """Loader specifically designed to load HTML documents from the web.""" + + PAGE_LOAD_TIMEOUT = 60_000 + """Default page load timeout value in milliseconds""" + + def __init__(self, pw_launch_kwargs=None, html_read_kwargs=None, + html_read_coroutine=None, browser_semaphore=None, + use_scrapling_stealth=False, num_pw_html_retries=3): + """ + + Parameters + ---------- + pw_launch_kwargs : dict, optional + Keyword-value argument pairs to pass to + :meth:`async_playwright.chromium.launch` (only used when + reading HTML). By default, ``None``. + html_read_kwargs : dict, optional + Keyword-value argument pairs to pass to the + `html_read_coroutine`. By default, ``None``. + html_read_coroutine : callable, optional + HTML file read coroutine. Must be an async function. Should + accept HTML text as the first argument and kwargs as the + rest.
Must return a :obj:`elm.web.document.HTMLDocument`. + If ``None``, a default function that runs in the main thread + is used. By default, ``None``. + browser_semaphore : asyncio.Semaphore, optional + Semaphore instance that can be used to limit the number of + playwright browsers open concurrently. If ``None``, no + limits are applied. By default, ``None``. + use_scrapling_stealth : bool, default=False + Option to use scrapling stealth scripts instead of + tf-playwright-stealth. By default, ``False``. + num_pw_html_retries : int, default=3 + Number of attempts to load HTML content. This is useful + because the playwright parameters are stochastic, and + sometimes a combination of them can fail to load HTML. The + default value is likely a good balance between processing + attempts and retrieval success. Note that the minimum number + of attempts will always be 2, even if the user provides a + value smaller than this. By default, ``3``. + """ + self.pw_launch_kwargs = pw_launch_kwargs or {} + self.html_read_coroutine = html_read_coroutine or _read_html_doc + self.html_read_kwargs = html_read_kwargs or {} + self.uss = use_scrapling_stealth + self.browser_semaphore = browser_semaphore + self.num_pw_html_retries = num_pw_html_retries + + async def fetch(self, url, raw_content=None, ct=None, charset=None): + """Load an HTML doc from a URL + + Parameters + ---------- + url : str + URL to load HTML content from. + raw_content : bytes, optional + Raw content bytes from the URL response. This is used in + case the playwright HTML load fails and we need to try + loading HTML from the response content. If not provided, + this step is skipped. By default, ``None``. + ct : str, optional + Content type from the URL response. This is used to help + determine if the response content can be processed as text + in the case where the playwright HTML load fails. If not + provided, this step is skipped. By default, ``None``. + charset : str, optional + Charset from the URL response. This is used to decode the + response content in the case where the playwright HTML load + fails and we need to try loading HTML from the response + content. If not provided, this step is skipped. + By default, ``None``. + + Returns + ------- + HTMLDocument + Document instance containing text, if the load was + successful, else an empty document. 
+ """ + logger.debug("Fetching HTML content from %r", url) + doc = await self._fetch_html_using_pw_with_retry(url) + if not doc.empty: + return doc + + can_process_response = (raw_content is not None + and ct is not None + and charset is not None + and "text" in ct) + if not can_process_response: + return HTMLDocument(pages=[]) + + logger.debug("HTML read with playwright failed; fetching HTML " + "content from response with content type %r and " + "charset %r for %r", ct, charset, url) + doc = await self._try_load_doc_from_response_text(raw_content, + charset) + return doc + async def _fetch_html_using_pw_with_retry(self, url): """Fetch HTML content with several retry attempts""" num_attempts = max(1, int(self.num_pw_html_retries) - 1) @@ -367,24 +480,6 @@ async def _fetch_html_using_pw_with_retry(self, url): **self.pw_launch_kwargs) return await self.html_read_coroutine(text, **self.html_read_kwargs) - @async_retry_with_exponential_backoff( - base_delay=2, - exponential_base=1.5, - jitter=False, - max_retries=3, - errors=( - aiohttp.ClientConnectionError, - aiohttp.client_exceptions.ClientError, - ), - ) - async def _fetch_content_with_retry(self, url, session): - """Fetch content from URL with several retry attempts""" - async with session.get(url, **self.get_kwargs) as response: - body = await response.read() - ct = response.content_type.casefold() - charset = response.charset or 'utf-8' - return body, ct, charset - async def _try_load_doc_from_response_text(self, raw_content, charset): """Try to load document by decoding response text""" try: @@ -453,14 +548,12 @@ def __init__( Additional document attributes to add to each loaded document. By default, ``None``. """ - super().__init__( - pdf_read_coroutine=pdf_read_coroutine or _read_pdf_file, - html_read_coroutine=html_read_coroutine or _read_html_file, - pdf_read_kwargs=pdf_read_kwargs, - html_read_kwargs=html_read_kwargs, - pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, - file_cache_coroutine=file_cache_coroutine - ) + super().__init__(file_cache_coroutine=file_cache_coroutine) + self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_file + self.html_read_coroutine = html_read_coroutine or _read_html_file + self.pdf_read_kwargs = pdf_read_kwargs or {} + self.html_read_kwargs = html_read_kwargs or {} + self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine self.doc_attrs = doc_attrs or {} async def _fetch_doc(self, source): @@ -509,3 +602,12 @@ async def _fetch_doc_with_url_in_metadata(self, source): class AsyncFileLoader(AsyncWebFileLoader): """Alias for AsyncWebFileLoader (for backward compatibility)""" + + +def _header_from_template(header_template): + """Compile header from user or default template""" + headers = header_template or DEFAULT_HEADERS + headers = dict(headers) + if not headers.get("User-Agent"): + headers["User-Agent"] = UserAgent().random + return headers diff --git a/elm/web/search/run.py b/elm/web/search/run.py index a997dac7..6a4e4449 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -8,7 +8,7 @@ from itertools import zip_longest, chain from contextlib import AsyncExitStack -from elm.web.file_loader import AsyncFileLoader +from elm.web.file_loader import AsyncWebFileLoader from elm.web.search.bing import PlaywrightBingLinkSearch from elm.web.search.duckduckgo import (APIDuckDuckGoSearch, PlaywrightDuckDuckGoLinkSearch) @@ -125,7 +125,7 @@ async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE, search failed). By default, ``None``. 
**kwargs Keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader`. This input can + :class:`elm.web.file_loader.AsyncWebFileLoader`. This input can also include and any/all of the following keywords: - ddg_api_kwargs @@ -145,7 +145,7 @@ keyword-argument pairs that you can use to initialize the search engines in the `search_engines` input. If ``pw_launch_kwargs`` is detected, it will be added to the kwargs for all of the - PLaywright-based search engines so that you do not have to + Playwright-based search engines so that you do not have to repeatedly specify the launch parameters. For example, you may specify ``pw_launch_kwargs={"headless": False}`` to have all Playwright-based searches show the browser and _also_ @@ -173,7 +173,11 @@ await on_search_complete_hook(urls) logger.debug("Downloading documents for URLS: \n\t-%s", "\n\t-".join(urls)) - docs = await load_docs(urls, browser_semaphore, **kwargs) + logger.trace("kwargs for AsyncWebFileLoader:\n%s", + pprint.PrettyPrinter().pformat(kwargs)) + file_loader = AsyncWebFileLoader(browser_semaphore=browser_semaphore, + **kwargs) + docs = await load_docs(urls, file_loader) return docs @@ -288,20 +292,15 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, return set() -async def load_docs(urls, browser_semaphore=None, **kwargs): +async def load_docs(sources, file_loader): """Load a document for each input URL Parameters ---------- - urls : iterable of str - Iterable of URL's (as strings) to fetch. - browser_semaphore : :class:`asyncio.Semaphore`, optional - Semaphore instance that can be used to limit the number of - playwright browsers open concurrently for document retrieval. If - ``None``, no limits are applied. By default, ``None``. - kwargs - Keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader`. + sources : iterable of str + Iterable of URL's or filepaths (as strings) to fetch. + file_loader : :class:`elm.web.file_loader.AsyncWebFileLoader` + File loader instance used to fetch content from URL's. Returns ------- @@ -310,18 +309,20 @@ async def load_docs(urls, browser_semaphore=None, **kwargs): the URL's. If a URL could not be fetched (i.e. document instance is empty), it will not be included in the output list.
""" - logger.trace("Downloading docs for the following URL's:\n%r", urls) - logger.trace("kwargs for AsyncFileLoader:\n%s", - pprint.PrettyPrinter().pformat(kwargs)) - file_loader = AsyncFileLoader(browser_semaphore=browser_semaphore, - **kwargs) - docs = await file_loader.fetch_all(*urls) - - page_lens = {doc.attrs.get("source", "Unknown"): len(doc.pages) - for doc in docs} + logger.trace("Downloading docs for the following sources:\n%r", sources) + docs = await file_loader.fetch_all(*sources) + logger.debug("Loaded %d docs from %d sources", len(docs), len(sources)) + docs = [doc for doc in docs if not doc.empty] + logger.debug("%d docs are not empty", len(docs)) + + page_lens = {} + for doc in docs: + source = doc.attrs.get("source", "Unknown") + page_lens.setdefault(source, []).append(len(doc.pages)) + page_lens = {k: v if len(v) > 1 else v[0] for k, v in page_lens.items()} logger.debug("Loaded the following number of pages for docs:\n%s", pprint.PrettyPrinter().pformat(page_lens)) - return [doc for doc in docs if not doc.empty] + return docs async def _single_se_search(se_name, queries, num_urls, ignore_url_parts, diff --git a/elm/web/website_crawl.py b/elm/web/website_crawl.py index 09cccfcb..64dded48 100644 --- a/elm/web/website_crawl.py +++ b/elm/web/website_crawl.py @@ -16,7 +16,6 @@ from crawl4ai.deep_crawling.filters import (FilterChain, URLPatternFilter, ContentTypeFilter, URLFilter) -from elm.web.file_loader import AsyncFileLoader from elm.web.document import HTMLDocument @@ -375,7 +374,7 @@ async def _arun_best_first(self, start_url, crawler, config): class ELMWebsiteCrawler: """Crawl a website for documents of interest""" - def __init__(self, validator, file_loader_kwargs=None, + def __init__(self, validator, async_file_loader, browser_config_kwargs=None, crawl_strategy_kwargs=None, crawler_config_kwargs=None, cte_kwargs=None, extra_url_filters=None, include_external=False, @@ -390,10 +389,9 @@ def __init__(self, validator, file_loader_kwargs=None, indicating whether the text passes the validation check. This is used to determine whether or not to keep (i.e. return) the document. - file_loader_kwargs : dict, optional - Additional keyword-value argument pairs to pass to the - :class:`~elm.web.file_loader.AsyncFileLoader` class. - By default, ``None``. + async_file_loader : object + :class:`~elm.web.file_loader.BaseAsyncFileLoader` instance + used to fetch the content of the crawled pages. browser_config_kwargs : dict, optional Additional keyword-value argument pairs to pass to the :class:`crawl4ai.async_configs.BrowserConfig` class. 
@@ -440,10 +438,7 @@ def __init__(self, validator, file_loader_kwargs=None, """ self.validator = validator self.page_limit = page_limit or 2 * max_pages - - flk = {"verify_ssl": False} - flk.update(file_loader_kwargs or {}) - self.afl = AsyncFileLoader(**flk) + self.afl = async_file_loader bck = {"headless": True, "verbose": False} bck.update(browser_config_kwargs or {}) diff --git a/examples/web_information_retrieval/example_website_retrieval_atb.ipynb b/examples/web_information_retrieval/example_website_retrieval_atb.ipynb index b84f11c4..2e26c58a 100644 --- a/examples/web_information_retrieval/example_website_retrieval_atb.ipynb +++ b/examples/web_information_retrieval/example_website_retrieval_atb.ipynb @@ -43,6 +43,7 @@ "source": [ "from crawl4ai.deep_crawling.filters import URLFilter\n", "from elm.web.website_crawl import ELMWebsiteCrawler\n", + "from elm.web.file_loader import AsyncWebFileLoader\n", "from rex import init_logger" ] }, @@ -194,9 +195,10 @@ "metadata": {}, "outputs": [], "source": [ + "afl = AsyncWebFileLoader(verify_ssl=False)\n", "crawler = ELMWebsiteCrawler(empty_validator,\n", + " async_file_loader=afl,\n", " url_scorer=empty_link_scorer,\n", - " file_loader_kwargs={\"verify_ssl\": False},\n", " include_external=False,\n", " extra_url_filters=[ATBFilter()],\n", " crawl_strategy_kwargs={\"max_depth\": 3},\n", @@ -417,7 +419,7 @@ "outputs": [], "source": [ "crawler = ELMWebsiteCrawler(llm_geothermal_validator,\n", - " file_loader_kwargs={\"verify_ssl\": False},\n", + " async_file_loader=afl,\n", " extra_url_filters=[ATBFilter()],\n", " url_scorer=geothermal_link_scorer,\n", " crawl_strategy_kwargs={\"max_depth\": 3},\n", diff --git a/requirements.txt b/requirements.txt index 6c34ff6e..f0534f1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,6 @@ langchain-text-splitters lxml matplotlib networkx -nlr-rex nltk numpy pandas diff --git a/setup.py b/setup.py index d9eb11e2..c2e99f81 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ with open("requirements.txt") as f: install_requires = f.readlines() - +examples_require = ["nlr-rex>=0.5.0"] test_requires = ["pytest>=5.2", "pytest-mock", "pytest-asyncio", "pytest-cov", "flaky>=3.8.1"] description = "Energy Language Model" @@ -49,7 +49,7 @@ ], install_requires=install_requires, extras_require={ - "dev": install_requires + test_requires, + "dev": install_requires + examples_require + test_requires, }, entry_points={"console_scripts": ["elm=elm.cli:main"]} ) diff --git a/tests/ords/test_integrated.py b/tests/ords/test_integrated.py index c8b03523..89acb820 100644 --- a/tests/ords/test_integrated.py +++ b/tests/ords/test_integrated.py @@ -22,7 +22,7 @@ from elm.ords.services.provider import RunningAsyncServices from elm.ords.utilities.queued_logging import LocationFileLog, LogListener from elm.web.search.yahoo import PlaywrightYahooLinkSearch -from elm.web.file_loader import AsyncFileLoader +from elm.web.file_loader import AsyncWebFileLoader from elm.web.document import HTMLDocument @@ -34,6 +34,7 @@ def __init__(self, read_return): self.read_return = read_return self.content_type = "application/pdf" self.charset = "utf-8" + self.headers = {} async def read(self): return self.read_return @@ -202,7 +203,7 @@ async def search_location_with_logs( @pytest.mark.asyncio async def test_async_file_loader_with_temp_cache(monkeypatch): - """Test `AsyncFileLoader` with a `TempFileCache` service""" + """Test `AsyncWebFileLoader` with a `TempFileCache` service""" monkeypatch.setattr( aiohttp.ClientSession, @@ -223,7 
+224,7 @@ async def test_async_file_loader_with_temp_cache(monkeypatch): truth = HTMLDocument([content]) async with RunningAsyncServices([TempFileCache()]): - loader = AsyncFileLoader(file_cache_coroutine=TempFileCache.call) + loader = AsyncWebFileLoader(file_cache_coroutine=TempFileCache.call) doc = await loader.fetch("Whatcom") assert doc.text == truth.text assert doc.attrs["source"] == "Whatcom" diff --git a/tests/web/search/test_web_search_run.py b/tests/web/search/test_web_search_run.py index b5a4d9b4..01fd1aa2 100644 --- a/tests/web/search/test_web_search_run.py +++ b/tests/web/search/test_web_search_run.py @@ -8,6 +8,7 @@ _init_se, load_docs) from elm.web.search.google import (APIGoogleCSESearch, PlaywrightGoogleLinkSearch) +from elm.web.file_loader import AsyncWebFileLoader from elm.exceptions import ELMKeyError @@ -63,7 +64,7 @@ def test_init_se_does_not_pop_kwargs(): @pytest.mark.asyncio async def test_load_docs_empty(): """Test loading docs for no URLs""" - assert await load_docs(set()) == [] + assert await load_docs(set(), AsyncWebFileLoader()) == [] @pytest.mark.asyncio diff --git a/tests/web/test_web_crawling.py b/tests/web/test_web_crawling.py index 1ce692e0..151f9cf8 100644 --- a/tests/web/test_web_crawling.py +++ b/tests/web/test_web_crawling.py @@ -7,6 +7,7 @@ from elm.ords.validation.content import possibly_mentions_wind from elm.web.website_crawl import ELMLinkScorer, ELMWebsiteCrawler +from elm.web.file_loader import AsyncWebFileLoader @pytest.mark.asyncio @@ -24,7 +25,9 @@ async def validation(doc): async def found_enough_test_docs(out_docs): return len(out_docs) >= 1 + afl = AsyncWebFileLoader(verify_ssl=False) crawler = ELMWebsiteCrawler(validator=validation, + async_file_loader=afl, url_scorer=ELMLinkScorer(kw).score) out_docs = await crawler.run("https://www.elpasoco.com", diff --git a/tests/web/test_web_document.py b/tests/web/test_web_document.py index 5faa7593..e3c53643 100644 --- a/tests/web/test_web_document.py +++ b/tests/web/test_web_document.py @@ -8,7 +8,7 @@ import pandas as pd from elm import TEST_DATA_DIR -from elm.web.document import PDFDocument, HTMLDocument +from elm.web.document import PDFDocument, HTMLDocument, MDDocument class TestSplitter: @@ -19,7 +19,9 @@ def split_text(self, text): return text.split("\n") -@pytest.mark.parametrize("doc_type", [PDFDocument, HTMLDocument]) +@pytest.mark.parametrize( + "doc_type", [PDFDocument, HTMLDocument, MDDocument] +) def test_basic_document(doc_type): """Test basic properties of the `Document` class""" @@ -100,6 +102,54 @@ def test_html_doc_with_splitter(): assert len(doc.raw_pages) == og_text.count("\n") + 1 +def test_markdown_doc_removes_comments(): + """Test markdown comment stripping during cleaning""" + + pages = [ + "# Heading\nVisible text\n<!-- hidden comment -->", + "More text\n<!-- another\ncomment -->\nFinal line", + ] + + doc = MDDocument(pages) + + assert "hidden comment" not in doc.text + assert "another\ncomment" not in doc.text + assert "Visible text" in doc.text + assert "Final line" in doc.text + assert doc.raw_pages == pages + + +def test_markdown_doc_keeps_comments_when_disabled(): + """Test markdown comments remain when comment removal is disabled""" + + page = "Visible text\n<!-- kept comment -->\nFinal line" + + doc = MDDocument([page], remove_comments=False) + + assert doc.text == page + + +@pytest.mark.parametrize("remove_comments", [True, False]) +def test_markdown_doc_empty_ignores_comments(remove_comments): + """Test empty check ignores markdown comments for all settings""" + + page = "<!-- hidden comment -->" + + doc = MDDocument([page],
remove_comments=remove_comments) + + assert doc.empty + + +def test_markdown_doc_with_splitter(): + """Test markdown raw pages with a text splitter""" + + pages = ["# Heading\n\nBody line", "Tail section"] + + doc = MDDocument(pages, text_splitter=TestSplitter()) + + assert doc.raw_pages == ["# Heading", "", "Body line", "", "Tail section"] + + def test_doc_repr(): """Test document repr method""" @@ -143,6 +193,7 @@ def test_doc_is_empty(pages): assert PDFDocument(pages).empty assert HTMLDocument(pages).empty + assert MDDocument(pages).empty def test_html_string_is_empty_doc(): diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py index cc2f2451..778154cc 100644 --- a/tests/web/test_web_file_loader.py +++ b/tests/web/test_web_file_loader.py @@ -29,6 +29,7 @@ def __init__(self, read_return): self.read_return = read_return self.content_type = "application/pdf" self.charset = "utf-8" + self.headers = {} async def read(self): """Return what class was initialized with.""" @@ -58,7 +59,7 @@ async def patched_get_html(url, *args, **kwargs): @pytest.mark.asyncio async def test_async_file_loader_basic_pdf(monkeypatch): - """Test `AsyncFileLoader` for a basic PDF doc""" + """Test `AsyncWebFileLoader` for a basic PDF doc""" monkeypatch.setattr( aiohttp.ClientSession, @@ -82,7 +83,7 @@ async def test_async_file_loader_basic_pdf(monkeypatch): @pytest.mark.asyncio async def test_async_file_loader_basic_html(monkeypatch): - """Test `AsyncFileLoader` for a basic HTML doc""" + """Test `AsyncWebFileLoader` for a basic HTML doc""" monkeypatch.setattr( aiohttp.ClientSession,
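As a companion to the tests above, a short sketch of the new ``MDDocument`` comment handling (the page text here is made up for illustration):

.. code-block:: python

    from elm.web.document import MDDocument

    pages = ["# Title\n<!-- internal note -->\nBody text goes here"]

    # HTML comments are stripped from the cleaned text by default...
    doc = MDDocument(pages)
    assert "internal note" not in doc.text

    # ...but are preserved when comment removal is disabled
    doc = MDDocument(pages, remove_comments=False)
    assert "internal note" in doc.text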