diff --git a/.github/workflows/pytest_ords.yml b/.github/workflows/pytest_ords.yml index 0b69bef3..00271206 100644 --- a/.github/workflows/pytest_ords.yml +++ b/.github/workflows/pytest_ords.yml @@ -34,6 +34,7 @@ jobs: shell: bash - name: Install dependencies run: | + conda install -y pip conda install -c conda-forge poppler python -m pip install --upgrade pip python -m pip install pdftotext diff --git a/.github/workflows/pytest_postgres.yml b/.github/workflows/pytest_postgres.yml index 84054c66..dedebae9 100644 --- a/.github/workflows/pytest_postgres.yml +++ b/.github/workflows/pytest_postgres.yml @@ -26,6 +26,7 @@ jobs: - name: Install dependencies shell: bash -l {0} run: | + conda install -y pip python -m pip install --upgrade pip python -m pip install psycopg2-binary python -m pip install boto3 diff --git a/docs/source/conf.py b/docs/source/conf.py index f305ea82..82213916 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -141,7 +141,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'rexdoc' +htmlhelp_basename = 'elmdoc' # -- Options for LaTeX output ------------------------------------------------ diff --git a/docs/source/dev/ords_architecture.rst b/docs/source/dev/ords_architecture.rst index d97af80a..a6e9244a 100644 --- a/docs/source/dev/ords_architecture.rst +++ b/docs/source/dev/ords_architecture.rst @@ -363,7 +363,7 @@ for multiprocessing tasks. -------------------------------------------------------------------------------------------------------------------------------------------------- -**4.2.2** :class:`~elm.web.file_loader.AsyncFileLoader` +**4.2.2** :class:`~elm.web.file_loader.AsyncWebFileLoader` ------------------------------------------------------- .. literalinclude:: ../../../elm/web/file_loader.py @@ -376,10 +376,10 @@ for multiprocessing tasks. .. code-block:: python import asyncio - from elm.web.file_loader import AsyncFileLoader + from elm.web.file_loader import AsyncWebFileLoader async def main(): - loader = AsyncFileLoader() + loader = AsyncWebFileLoader() doc = await loader.fetch( url="https://en.wikipedia.org/wiki/National_Renewable_Energy_Laboratory" ) @@ -781,7 +781,7 @@ We give a rough breakdown of the following call: 1. :func:`~elm.web.search.run.web_search_links_as_docs()` is invoked with 3 queries and ``num_urls=4``. 2. Each of the three queries are processed asynchronously, creating a :class:`~elm.web.google_search.PlaywrightGoogleLinkSearch` instance and retrieving the top URL results. 3. Internal code reduces the URL lists returned from each of the queries into the top 4 URLs. -4. :class:`~elm.web.file_loader.AsyncFileLoader` asynchronously downloads the content for reach of the top 4 URLs, determines the document type the content should be stored +4. :class:`~elm.web.file_loader.AsyncWebFileLoader` asynchronously downloads the content for each of the top 4 URLs, determines the document type the content should be stored in (:class:`~elm.web.document.HTMLDocument` or :class:`~elm.web.document.PDFDocument`), creates and populates the document instances, and returns the document to the caller.
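For reference, a minimal sketch of the call being traced in this breakdown (the query strings are illustrative placeholders; ``num_urls=4`` matches the example above):

.. code-block:: python

    import asyncio

    from elm.web.search.run import web_search_links_as_docs

    async def main():
        # three queries; the result URL lists are reduced to the top 4 overall
        docs = await web_search_links_as_docs(
            ["query one", "query two", "query three"],
            num_urls=4,
        )
        for doc in docs:
            # each entry is an HTMLDocument or PDFDocument instance
            print(type(doc).__name__, doc.attrs.get("source"))

    asyncio.run(main())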
**Sequence Diagram:** @@ -791,7 +791,7 @@ We give a rough breakdown of the following call: sequenceDiagram participant A as web_search_links_as_docs() participant B as PlaywrightGoogleLinkSearch - participant D as AsyncFileLoader + participant D as AsyncWebFileLoader participant E as HTMLDocument participant F as PDFDocument diff --git a/elm/ords/download.py b/elm/ords/download.py index 0b0f6470..38e45700 100644 --- a/elm/ords/download.py +++ b/elm/ords/download.py @@ -48,9 +48,10 @@ async def download_county_ordinance( ordinance document. By default, ``5``. file_loader_kwargs : dict, optional Dictionary of keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader` with. If found, the - "pw_launch_kwargs" key in these will also be used to initialize - the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` + :class:`elm.web.file_loader.AsyncWebFileLoader` with. If found, + the "pw_launch_kwargs" key in these will also be used to + initialize the + :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` used for the google URL search. By default, ``None``. browser_semaphore : :class:`asyncio.Semaphore`, optional Semaphore instance that can be used to limit the number of diff --git a/elm/ords/process.py b/elm/ords/process.py index 91bb0aa5..69697bcb 100644 --- a/elm/ords/process.py +++ b/elm/ords/process.py @@ -425,7 +425,7 @@ async def process_county_with_logging( ordinance document. By default, ``5``. file_loader_kwargs : dict, optional Dictionary of keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader` with. The + :class:`elm.web.file_loader.AsyncWebFileLoader` with. The "pw_launch_kwargs" key in these will also be used to initialize the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` used for the google URL search. By default, ``None``. @@ -498,7 +498,7 @@ async def process_county( ordinance document. By default, ``5``. file_loader_kwargs : dict, optional Dictionary of keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader` with. The + :class:`elm.web.file_loader.AsyncWebFileLoader` with. The "pw_launch_kwargs" key in these will also be used to initialize the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch` used for the google URL search. By default, ``None``. diff --git a/elm/version.py b/elm/version.py index 12fae424..0887c580 100644 --- a/elm/version.py +++ b/elm/version.py @@ -2,4 +2,4 @@ ELM version number """ -__version__ = "0.0.38" +__version__ = "0.0.39" diff --git a/elm/web/document.py b/elm/web/document.py index 8b625bc2..35493918 100644 --- a/elm/web/document.py +++ b/elm/web/document.py @@ -38,7 +38,7 @@ class BaseDocument(ABC): and formats tables. 3. Track pages and other document metadata. Key Relationships: - Created by :class:`~elm.web.file_loader.AsyncFileLoader` and + Created by :class:`~elm.web.file_loader.AsyncWebFileLoader` and used all over ordinance code. .. end desc @@ -339,6 +339,63 @@ def _raw_pages(self): return self.text_splitter.split_text("\n\n".join(self.pages)) +class MDDocument(BaseDocument): + """ELM Markdown document""" + + MARKDOWN_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL) + """Regex pattern to remove HTML comments from markdown text""" + WRITE_KWARGS = {"mode": "w", "encoding": "utf-8"} + FILE_EXTENSION = "md" + + def __init__(self, pages, attrs=None, remove_comments=True, + text_splitter=None): + """ + + Parameters + ---------- + pages : iterable + Iterable of strings, where each string is a page of a + document.
+ attrs : dict, optional + Optional dict containing metadata for the document. + By default, ``None``. + remove_comments : bool, optional + Option to remove HTML comments in Markdown text during + cleaning. By default, ``True``. + text_splitter : obj, optional + Instance of an object that implements a `split_text` method. + The method should take text as input (str) and return a list + of text chunks. The raw pages will be passed through this + splitter to create raw pages for this document. Langchain's + text splitters should work for this input. + By default, ``None``, which means the original pages input + becomes the raw pages attribute. + """ + super().__init__(pages, attrs=attrs) + self.remove_comments = remove_comments + self.text_splitter = text_splitter + + def _cleaned_text(self): + """Compute cleaned text from document""" + text = combine_pages(self.pages) + if self.remove_comments: + text = self.MARKDOWN_COMMENT_RE.sub("", text) + return text + + def _raw_pages(self): + """Get raw pages from document""" + if self.text_splitter is None: + return self.pages + return self.text_splitter.split_text("\n\n".join(self.pages)) + + @property + def empty(self): + """bool: ``True`` if the document contains no pages.""" + # Always strip comments when checking if doc is empty + return not any(_non_empty_pages(( + self.MARKDOWN_COMMENT_RE.sub("", p) for p in self.pages))) + + def _non_empty_pages(pages): """Return all pages with more than 10 chars""" return filter( diff --git a/elm/web/file_loader.py b/elm/web/file_loader.py index 57b808bd..8f639b63 100644 --- a/elm/web/file_loader.py +++ b/elm/web/file_loader.py @@ -51,39 +51,14 @@ async def _read_html_file(html_fp, **kwargs): class BaseAsyncFileLoader(ABC): """Base class for async file loading""" - def __init__( - self, - pdf_read_coroutine, - html_read_coroutine, - pdf_read_kwargs=None, - html_read_kwargs=None, - pdf_ocr_read_coroutine=None, - file_cache_coroutine=None, - **__, # consume any extra kwargs - ): + def __init__(self, + file_cache_coroutine=None, + **__, # consume any extra kwargs + ): """ Parameters ---------- - pdf_read_coroutine : callable - PDF file read coroutine. Must by an async function. - Must return a :obj:`elm.web.document.PDFDocument`. - html_read_coroutine : callable, optional - HTML file read coroutine. Must by an async function. - Must return a :obj:`elm.web.document.HTMLDocument`. - pdf_read_kwargs : dict, optional - Keyword-value argument pairs to pass to the - `pdf_read_coroutine`. By default, ``None``. - html_read_kwargs : dict, optional - Keyword-value argument pairs to pass to the - `html_read_coroutine`. By default, ``None``. - pdf_ocr_read_coroutine : callable, optional - PDF OCR file read coroutine. Must by an async function. - Should accept PDF bytes as the first argument and kwargs as - the rest. Must return a :obj:`elm.web.document.PDFDocument`. - If ``None``, PDF OCR parsing is not attempted, and any - scanned PDF URL's will return a blank document. - By default, ``None``. file_cache_coroutine : callable, optional File caching coroutine. Can be used to cache files downloaded by this class. Must accept an @@ -92,11 +67,6 @@ def __init__( argument. If this method is not provided, no document caching is performed. By default, ``None``.
""" - self.pdf_read_coroutine = pdf_read_coroutine - self.html_read_coroutine = html_read_coroutine - self.pdf_read_kwargs = pdf_read_kwargs or {} - self.html_read_kwargs = html_read_kwargs or {} - self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine self.file_cache_coroutine = file_cache_coroutine async def fetch_all(self, *sources): @@ -157,6 +127,10 @@ async def _fetch_doc_with_url_in_metadata(self, source): async def _cache_doc(self, doc, raw_content): """Cache doc if user provided a coroutine""" if doc.empty or not raw_content: + if self.file_cache_coroutine: + logger.debug("Not caching document for source %r because the " + "document is empty or there is no raw content", + doc.attrs.get("source", "Unknown")) return doc if not self.file_cache_coroutine: @@ -190,9 +164,6 @@ class AsyncWebFileLoader(BaseAsyncFileLoader): .. end desc """ - PAGE_LOAD_TIMEOUT = 60_000 - """Default page load timeout value in milliseconds""" - def __init__( self, header_template=None, @@ -276,65 +247,39 @@ def __init__( of attempts will always be 2, even if the user provides a value smaller than this. By default, ``3``. """ - super().__init__( - pdf_read_coroutine=pdf_read_coroutine or _read_pdf_doc, - html_read_coroutine=html_read_coroutine or _read_html_doc, - pdf_read_kwargs=pdf_read_kwargs, - html_read_kwargs=html_read_kwargs, - pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, - file_cache_coroutine=file_cache_coroutine - ) - self.pw_launch_kwargs = pw_launch_kwargs or {} - self.get_kwargs = { - "headers": self._header_from_template(header_template), - "ssl": None if verify_ssl else False, - **(aget_kwargs or {}), - } - self.browser_semaphore = browser_semaphore - self.uss = use_scrapling_stealth - self.num_pw_html_retries = num_pw_html_retries - - def _header_from_template(self, header_template): - """Compile header from user or default template""" - headers = header_template or DEFAULT_HEADERS - headers = dict(headers) - if not headers.get("User-Agent"): - headers["User-Agent"] = UserAgent().random - return headers + super().__init__(file_cache_coroutine=file_cache_coroutine) + self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_doc + self.pdf_read_kwargs = pdf_read_kwargs or {} + self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine + self.content_fetcher = AsyncFetchWithRetry( + header_template=header_template, verify_ssl=verify_ssl, + aget_kwargs=aget_kwargs) + self.html_loader = AsyncHTMLLoader( + pw_launch_kwargs=pw_launch_kwargs, + html_read_kwargs=html_read_kwargs,html_read_coroutine=html_read_coroutine, + browser_semaphore=browser_semaphore, use_scrapling_stealth=use_scrapling_stealth, + num_pw_html_retries=num_pw_html_retries) async def _fetch_doc(self, url): """Fetch a doc by trying pdf read, then HTML read, then PDF OCR""" - async with aiohttp.ClientSession() as session: - try: - logger.debug("Fetching content from %r", url) - out = await self._fetch_content_with_retry(url, session) - except ELMRuntimeError: - logger.exception("Could not fetch content from %r", url) - return PDFDocument(pages=[]), None + out = await self.content_fetcher.fetch(url) + if out is None: + return PDFDocument(pages=[]), None - raw_content, ct, charset = out + raw_content, ct, charset, __ = out logger.debug("Got content from %r", url) doc = await self.pdf_read_coroutine(raw_content, **self.pdf_read_kwargs) if not doc.empty: return doc, raw_content - logger.debug("PDF read failed; fetching HTML content from %r", url) - doc = await self._fetch_html_using_pw_with_retry(url) + logger.debug("PDF read failed") + doc = 
await self.html_loader.fetch(url, raw_content, ct, charset) if not doc.empty: return doc, doc.text - if "text" in ct: - logger.debug("HTML read with playwright failed; fetching HTML " - "content from response with content type %r and " - "charset %r for %r", ct, charset, url) - doc = await self._try_load_doc_from_response_text(raw_content, - charset) - if not doc.empty: - return doc, doc.text - - elif self.pdf_ocr_read_coroutine: + if self.pdf_ocr_read_coroutine: logger.debug("HTML read failed; fetching OCR content from %r", url) doc = await self.pdf_ocr_read_coroutine( raw_content, **self.pdf_read_kwargs @@ -342,6 +287,174 @@ async def _fetch_doc(self, url): return doc, raw_content + +class AsyncFetchWithRetry: + """Loader for fetching content from the web with retry attempts""" + + def __init__(self, header_template=None, verify_ssl=True, + aget_kwargs=None, client_kwargs=None): + """ + + Parameters + ---------- + header_template : dict, optional + Optional GET header template. If not specified, uses + :obj:`~elm.web.utilities.DEFAULT_HEADERS`. + By default, ``None``. + verify_ssl : bool, optional + Option to use aiohttp's default SSL check. If ``False``, + SSL certificate validation is skipped. By default, ``True``. + aget_kwargs : dict, optional + Other kwargs to pass to :meth:`aiohttp.ClientSession.get`. + By default, ``None``. + client_kwargs : dict, optional + Other kwargs to pass to :class:`aiohttp.ClientSession`. + By default, ``None``. + """ + self.get_kwargs = { + "headers": _header_from_template(header_template), + "ssl": None if verify_ssl else False, + **(aget_kwargs or {}), + } + self.client_kwargs = client_kwargs or {} + + async def fetch(self, url): + """Fetch content from the web + + Parameters + ---------- + url : str + URL to fetch content from. + + Returns + ------- + tuple or None + Tuple of (content bytes, content type, charset, headers) if + the fetch was successful, else ``None``. + """ + async with aiohttp.ClientSession(**self.client_kwargs) as session: + try: + logger.debug("Fetching content from %r", url) + return await self._fetch_content_with_retry(url, session) + except ELMRuntimeError: + logger.exception("Could not fetch content from %r", url) + return None + + @async_retry_with_exponential_backoff( + base_delay=2, + exponential_base=1.5, + jitter=False, + max_retries=3, + errors=( + aiohttp.ClientConnectionError, + aiohttp.client_exceptions.ClientError, + ), + ) + async def _fetch_content_with_retry(self, url, session): + """Fetch content from URL with several retry attempts""" + async with session.get(url, **self.get_kwargs) as response: + body = await response.read() + headers = response.headers + ct = response.content_type.casefold() + charset = response.charset or 'utf-8' + return body, ct, charset, headers + + +class AsyncHTMLLoader: + """Loader specifically designed to load HTML documents from the web.""" + + PAGE_LOAD_TIMEOUT = 60_000 + """Default page load timeout value in milliseconds""" + + def __init__(self, pw_launch_kwargs=None, html_read_kwargs=None, + html_read_coroutine=None, browser_semaphore=None, + use_scrapling_stealth=False, num_pw_html_retries=3): + """ + + Parameters + ---------- + pw_launch_kwargs : dict, optional + Keyword-value argument pairs to pass to + :meth:`async_playwright.chromium.launch` (only used when + reading HTML). By default, ``None``. + html_read_kwargs : dict, optional + Keyword-value argument pairs to pass to the + `html_read_coroutine`. By default, ``None``. + html_read_coroutine : callable, optional + HTML file read coroutine. Must be an async function. Should + accept HTML text as the first argument and kwargs as the + rest.
Must return a :obj:`elm.web.document.HTMLDocument`. + If ``None``, a default function that runs in the main thread + is used. By default, ``None``. + browser_semaphore : asyncio.Semaphore, optional + Semaphore instance that can be used to limit the number of + playwright browsers open concurrently. If ``None``, no + limits are applied. By default, ``None``. + use_scrapling_stealth : bool, default=False + Option to use scrapling stealth scripts instead of + tf-playwright-stealth. By default, ``False``. + num_pw_html_retries : int, default=3 + Number of attempts to load HTML content. This is useful + because the playwright parameters are stochastic, and + sometimes a combination of them can fail to load HTML. The + default value is likely a good balance between processing + attempts and retrieval success. Note that the minimum number + of attempts will always be 2, even if the user provides a + value smaller than this. By default, ``3``. + """ + self.pw_launch_kwargs = pw_launch_kwargs or {} + self.html_read_coroutine = html_read_coroutine or _read_html_doc + self.html_read_kwargs = html_read_kwargs or {} + self.uss = use_scrapling_stealth + self.browser_semaphore = browser_semaphore + self.num_pw_html_retries = num_pw_html_retries + + async def fetch(self, url, raw_content=None, ct=None, charset=None): + """Load an HTML doc from a URL + + Parameters + ---------- + url : str + URL to load HTML content from. + raw_content : bytes, optional + Raw content bytes from the URL response. This is used in + case the playwright HTML load fails and we need to try + loading HTML from the response content. If not provided, + this step is skipped. By default, ``None``. + ct : str, optional + Content type from the URL response. This is used to help + determine if the response content can be processed as text + in the case where the playwright HTML load fails. If not + provided, this step is skipped. By default, ``None``. + charset : str, optional + Charset from the URL response. This is used to decode the + response content in the case where the playwright HTML load + fails and we need to try loading HTML from the response + content. If not provided, this step is skipped. + By default, ``None``. + + Returns + ------- + HTMLDocument + Document instance containing text, if the load was + successful, else an empty document. 
+ """ + logger.debug("Fetching HTML content from %r", url) + doc = await self._fetch_html_using_pw_with_retry(url) + if not doc.empty: + return doc + + can_process_response = (raw_content is not None + and ct is not None + and charset is not None + and "text" in ct) + if not can_process_response: + return HTMLDocument(pages=[]) + + logger.debug("HTML read with playwright failed; fetching HTML " + "content from response with content type %r and " + "charset %r for %r", ct, charset, url) + doc = await self._try_load_doc_from_response_text(raw_content, + charset) + return doc + async def _fetch_html_using_pw_with_retry(self, url): """Fetch HTML content with several retry attempts""" num_attempts = max(1, int(self.num_pw_html_retries) - 1) @@ -367,24 +480,6 @@ async def _fetch_html_using_pw_with_retry(self, url): **self.pw_launch_kwargs) return await self.html_read_coroutine(text, **self.html_read_kwargs) - @async_retry_with_exponential_backoff( - base_delay=2, - exponential_base=1.5, - jitter=False, - max_retries=3, - errors=( - aiohttp.ClientConnectionError, - aiohttp.client_exceptions.ClientError, - ), - ) - async def _fetch_content_with_retry(self, url, session): - """Fetch content from URL with several retry attempts""" - async with session.get(url, **self.get_kwargs) as response: - body = await response.read() - ct = response.content_type.casefold() - charset = response.charset or 'utf-8' - return body, ct, charset - async def _try_load_doc_from_response_text(self, raw_content, charset): """Try to load document by decoding response text""" try: @@ -453,14 +548,12 @@ def __init__( Additional document attributes to add to each loaded document. By default, ``None``. """ - super().__init__( - pdf_read_coroutine=pdf_read_coroutine or _read_pdf_file, - html_read_coroutine=html_read_coroutine or _read_html_file, - pdf_read_kwargs=pdf_read_kwargs, - html_read_kwargs=html_read_kwargs, - pdf_ocr_read_coroutine=pdf_ocr_read_coroutine, - file_cache_coroutine=file_cache_coroutine - ) + super().__init__(file_cache_coroutine=file_cache_coroutine) + self.pdf_read_coroutine = pdf_read_coroutine or _read_pdf_file + self.html_read_coroutine = html_read_coroutine or _read_html_file + self.pdf_read_kwargs = pdf_read_kwargs or {} + self.html_read_kwargs = html_read_kwargs or {} + self.pdf_ocr_read_coroutine = pdf_ocr_read_coroutine self.doc_attrs = doc_attrs or {} async def _fetch_doc(self, source): @@ -509,3 +602,12 @@ async def _fetch_doc_with_url_in_metadata(self, source): class AsyncFileLoader(AsyncWebFileLoader): """Alias for AsyncWebFileLoader (for backward compatibility)""" + + +def _header_from_template(header_template): + """Compile header from user or default template""" + headers = header_template or DEFAULT_HEADERS + headers = dict(headers) + if not headers.get("User-Agent"): + headers["User-Agent"] = UserAgent().random + return headers diff --git a/elm/web/search/run.py b/elm/web/search/run.py index a997dac7..6a4e4449 100644 --- a/elm/web/search/run.py +++ b/elm/web/search/run.py @@ -8,7 +8,7 @@ from itertools import zip_longest, chain from contextlib import AsyncExitStack -from elm.web.file_loader import AsyncFileLoader +from elm.web.file_loader import AsyncWebFileLoader from elm.web.search.bing import PlaywrightBingLinkSearch from elm.web.search.duckduckgo import (APIDuckDuckGoSearch, PlaywrightDuckDuckGoLinkSearch) @@ -125,7 +125,7 @@ async def web_search_links_as_docs(queries, search_engines=_DEFAULT_SE, search failed). By default, ``None``. 
**kwargs Keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader`. This input can + :class:`elm.web.file_loader.AsyncWebFileLoader`. This input can also include and any/all of the following keywords: - ddg_api_kwargs @@ -145,7 +145,7 @@ keyword-argument pairs that you can use to initialize the search engines in the `search_engines` input. If ``pw_launch_kwargs`` is detected, it will be added to the kwargs for all of the - PLaywright-based search engines so that you do not have to + Playwright-based search engines so that you do not have to repeatedly specify the launch parameters. For example, you may specify ``pw_launch_kwargs={"headless": False}`` to have all Playwright-based searches show the browser and _also_ @@ -173,7 +173,11 @@ await on_search_complete_hook(urls) logger.debug("Downloading documents for URLS: \n\t-%s", "\n\t-".join(urls)) - docs = await load_docs(urls, browser_semaphore, **kwargs) + logger.trace("kwargs for AsyncWebFileLoader:\n%s", + pprint.PrettyPrinter().pformat(kwargs)) + file_loader = AsyncWebFileLoader(browser_semaphore=browser_semaphore, + **kwargs) + docs = await load_docs(urls, file_loader) return docs @@ -288,20 +292,15 @@ async def search_with_fallback(queries, search_engines=_DEFAULT_SE, return set() -async def load_docs(urls, browser_semaphore=None, **kwargs): +async def load_docs(sources, file_loader): """Load a document for each input URL Parameters ---------- - urls : iterable of str - Iterable of URL's (as strings) to fetch. - browser_semaphore : :class:`asyncio.Semaphore`, optional - Semaphore instance that can be used to limit the number of - playwright browsers open concurrently for document retrieval. If - ``None``, no limits are applied. By default, ``None``. - kwargs - Keyword-argument pairs to initialize - :class:`elm.web.file_loader.AsyncFileLoader`. + sources : iterable of str + Iterable of URL's or filepaths (as strings) to fetch. + file_loader : :class:`elm.web.file_loader.AsyncWebFileLoader` + File loader instance used to fetch content from URL's. Returns ------- @@ -310,18 +309,20 @@ async def load_docs(urls, browser_semaphore=None, **kwargs): the URL's. If a URL could not be fetched (i.e. document instance is empty), it will not be included in the output list.
""" - logger.trace("Downloading docs for the following URL's:\n%r", urls) - logger.trace("kwargs for AsyncFileLoader:\n%s", - pprint.PrettyPrinter().pformat(kwargs)) - file_loader = AsyncFileLoader(browser_semaphore=browser_semaphore, - **kwargs) - docs = await file_loader.fetch_all(*urls) - - page_lens = {doc.attrs.get("source", "Unknown"): len(doc.pages) - for doc in docs} + logger.trace("Downloading docs for the following sources:\n%r", sources) + docs = await file_loader.fetch_all(*sources) + logger.debug("Loaded %d docs from %d sources", len(docs), len(sources)) + docs = [doc for doc in docs if not doc.empty] + logger.debug("%d docs are not empty", len(docs)) + + page_lens = {} + for doc in docs: + source = doc.attrs.get("source", "Unknown") + page_lens.setdefault(source, []).append(len(doc.pages)) + page_lens = {k: v if len(v) > 1 else v[0] for k, v in page_lens.items()} logger.debug("Loaded the following number of pages for docs:\n%s", pprint.PrettyPrinter().pformat(page_lens)) - return [doc for doc in docs if not doc.empty] + return docs async def _single_se_search(se_name, queries, num_urls, ignore_url_parts, diff --git a/elm/web/website_crawl.py b/elm/web/website_crawl.py index 09cccfcb..64dded48 100644 --- a/elm/web/website_crawl.py +++ b/elm/web/website_crawl.py @@ -16,7 +16,6 @@ from crawl4ai.deep_crawling.filters import (FilterChain, URLPatternFilter, ContentTypeFilter, URLFilter) -from elm.web.file_loader import AsyncFileLoader from elm.web.document import HTMLDocument @@ -375,7 +374,7 @@ async def _arun_best_first(self, start_url, crawler, config): class ELMWebsiteCrawler: """Crawl a website for documents of interest""" - def __init__(self, validator, file_loader_kwargs=None, + def __init__(self, validator, async_file_loader, browser_config_kwargs=None, crawl_strategy_kwargs=None, crawler_config_kwargs=None, cte_kwargs=None, extra_url_filters=None, include_external=False, @@ -390,10 +389,9 @@ def __init__(self, validator, file_loader_kwargs=None, indicating whether the text passes the validation check. This is used to determine whether or not to keep (i.e. return) the document. - file_loader_kwargs : dict, optional - Additional keyword-value argument pairs to pass to the - :class:`~elm.web.file_loader.AsyncFileLoader` class. - By default, ``None``. + async_file_loader : object + :class:`~elm.web.file_loader.BaseAsyncFileLoader` instance + used to fetch the content of the crawled pages. browser_config_kwargs : dict, optional Additional keyword-value argument pairs to pass to the :class:`crawl4ai.async_configs.BrowserConfig` class. 
@@ -440,10 +438,7 @@ def __init__(self, validator, file_loader_kwargs=None, """ self.validator = validator self.page_limit = page_limit or 2 * max_pages - - flk = {"verify_ssl": False} - flk.update(file_loader_kwargs or {}) - self.afl = AsyncFileLoader(**flk) + self.afl = async_file_loader bck = {"headless": True, "verbose": False} bck.update(browser_config_kwargs or {}) diff --git a/examples/web_information_retrieval/example_website_retrieval_atb.ipynb b/examples/web_information_retrieval/example_website_retrieval_atb.ipynb index b84f11c4..2e26c58a 100644 --- a/examples/web_information_retrieval/example_website_retrieval_atb.ipynb +++ b/examples/web_information_retrieval/example_website_retrieval_atb.ipynb @@ -43,6 +43,7 @@ "source": [ "from crawl4ai.deep_crawling.filters import URLFilter\n", "from elm.web.website_crawl import ELMWebsiteCrawler\n", + "from elm.web.file_loader import AsyncWebFileLoader\n", "from rex import init_logger" ] }, @@ -194,9 +195,10 @@ "metadata": {}, "outputs": [], "source": [ + "afl = AsyncWebFileLoader(verify_ssl=False)\n", "crawler = ELMWebsiteCrawler(empty_validator,\n", + " async_file_loader=afl,\n", " url_scorer=empty_link_scorer,\n", - " file_loader_kwargs={\"verify_ssl\": False},\n", " include_external=False,\n", " extra_url_filters=[ATBFilter()],\n", " crawl_strategy_kwargs={\"max_depth\": 3},\n", @@ -417,7 +419,7 @@ "outputs": [], "source": [ "crawler = ELMWebsiteCrawler(llm_geothermal_validator,\n", - " file_loader_kwargs={\"verify_ssl\": False},\n", + " async_file_loader=afl,\n", " extra_url_filters=[ATBFilter()],\n", " url_scorer=geothermal_link_scorer,\n", " crawl_strategy_kwargs={\"max_depth\": 3},\n", diff --git a/requirements.txt b/requirements.txt index 6c34ff6e..f0534f1c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,6 @@ langchain-text-splitters lxml matplotlib networkx -nlr-rex nltk numpy pandas diff --git a/setup.py b/setup.py index d9eb11e2..c2e99f81 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ with open("requirements.txt") as f: install_requires = f.readlines() - +examples_require = ["nlr-rex>=0.5.0"] test_requires = ["pytest>=5.2", "pytest-mock", "pytest-asyncio", "pytest-cov", "flaky>=3.8.1"] description = "Energy Language Model" @@ -49,7 +49,7 @@ ], install_requires=install_requires, extras_require={ - "dev": install_requires + test_requires, + "dev": install_requires + examples_require + test_requires, }, entry_points={"console_scripts": ["elm=elm.cli:main"]} ) diff --git a/tests/ords/test_integrated.py b/tests/ords/test_integrated.py index c8b03523..89acb820 100644 --- a/tests/ords/test_integrated.py +++ b/tests/ords/test_integrated.py @@ -22,7 +22,7 @@ from elm.ords.services.provider import RunningAsyncServices from elm.ords.utilities.queued_logging import LocationFileLog, LogListener from elm.web.search.yahoo import PlaywrightYahooLinkSearch -from elm.web.file_loader import AsyncFileLoader +from elm.web.file_loader import AsyncWebFileLoader from elm.web.document import HTMLDocument @@ -34,6 +34,7 @@ def __init__(self, read_return): self.read_return = read_return self.content_type = "application/pdf" self.charset = "utf-8" + self.headers = {} async def read(self): return self.read_return @@ -202,7 +203,7 @@ async def search_location_with_logs( @pytest.mark.asyncio async def test_async_file_loader_with_temp_cache(monkeypatch): - """Test `AsyncFileLoader` with a `TempFileCache` service""" + """Test `AsyncWebFileLoader` with a `TempFileCache` service""" monkeypatch.setattr( aiohttp.ClientSession, @@ -223,7 
+224,7 @@ async def test_async_file_loader_with_temp_cache(monkeypatch): truth = HTMLDocument([content]) async with RunningAsyncServices([TempFileCache()]): - loader = AsyncFileLoader(file_cache_coroutine=TempFileCache.call) + loader = AsyncWebFileLoader(file_cache_coroutine=TempFileCache.call) doc = await loader.fetch("Whatcom") assert doc.text == truth.text assert doc.attrs["source"] == "Whatcom" diff --git a/tests/web/search/test_web_search_run.py b/tests/web/search/test_web_search_run.py index b5a4d9b4..01fd1aa2 100644 --- a/tests/web/search/test_web_search_run.py +++ b/tests/web/search/test_web_search_run.py @@ -8,6 +8,7 @@ _init_se, load_docs) from elm.web.search.google import (APIGoogleCSESearch, PlaywrightGoogleLinkSearch) +from elm.web.file_loader import AsyncWebFileLoader from elm.exceptions import ELMKeyError @@ -63,7 +64,7 @@ def test_init_se_does_not_pop_kwargs(): @pytest.mark.asyncio async def test_load_docs_empty(): """Test loading docs for no URLs""" - assert await load_docs(set()) == [] + assert await load_docs(set(), AsyncWebFileLoader()) == [] @pytest.mark.asyncio diff --git a/tests/web/test_web_crawling.py b/tests/web/test_web_crawling.py index 1ce692e0..151f9cf8 100644 --- a/tests/web/test_web_crawling.py +++ b/tests/web/test_web_crawling.py @@ -7,6 +7,7 @@ from elm.ords.validation.content import possibly_mentions_wind from elm.web.website_crawl import ELMLinkScorer, ELMWebsiteCrawler +from elm.web.file_loader import AsyncWebFileLoader @pytest.mark.asyncio @@ -24,7 +25,9 @@ async def validation(doc): async def found_enough_test_docs(out_docs): return len(out_docs) >= 1 + afl = AsyncWebFileLoader(verify_ssl=False) crawler = ELMWebsiteCrawler(validator=validation, + async_file_loader=afl, url_scorer=ELMLinkScorer(kw).score) out_docs = await crawler.run("https://www.elpasoco.com", diff --git a/tests/web/test_web_document.py b/tests/web/test_web_document.py index 5faa7593..e3c53643 100644 --- a/tests/web/test_web_document.py +++ b/tests/web/test_web_document.py @@ -8,7 +8,7 @@ import pandas as pd from elm import TEST_DATA_DIR -from elm.web.document import PDFDocument, HTMLDocument +from elm.web.document import PDFDocument, HTMLDocument, MDDocument class TestSplitter: @@ -19,7 +19,9 @@ def split_text(self, text): return text.split("\n") -@pytest.mark.parametrize("doc_type", [PDFDocument, HTMLDocument]) +@pytest.mark.parametrize( + "doc_type", [PDFDocument, HTMLDocument, MDDocument] +) def test_basic_document(doc_type): """Test basic properties of the `Document` class""" @@ -100,6 +102,54 @@ def test_html_doc_with_splitter(): assert len(doc.raw_pages) == og_text.count("\n") + 1 +def test_markdown_doc_removes_comments(): + """Test markdown comment stripping during cleaning""" + + pages = [ + "# Heading\nVisible text\n<!-- hidden comment -->", + "More text\n<!-- another\ncomment -->\nFinal line", + ] + + doc = MDDocument(pages) + + assert "hidden comment" not in doc.text + assert "another\ncomment" not in doc.text + assert "Visible text" in doc.text + assert "Final line" in doc.text + assert doc.raw_pages == pages + + +def test_markdown_doc_keeps_comments_when_disabled(): + """Test markdown comments remain when comment removal is disabled""" + + page = "Visible text\n<!-- kept comment -->\nFinal line" + + doc = MDDocument([page], remove_comments=False) + + assert doc.text == page + + +@pytest.mark.parametrize("remove_comments", [True, False]) +def test_markdown_doc_empty_ignores_comments(remove_comments): + """Test empty check ignores markdown comments for all settings""" + + page = "<!-- hidden comment -->" + + doc = MDDocument([page],
remove_comments=remove_comments) + + assert doc.empty + + +def test_markdown_doc_with_splitter(): + """Test markdown raw pages with a text splitter""" + + pages = ["# Heading\n\nBody line", "Tail section"] + + doc = MDDocument(pages, text_splitter=TestSplitter()) + + assert doc.raw_pages == ["# Heading", "", "Body line", "", "Tail section"] + + def test_doc_repr(): """Test document repr method""" @@ -143,6 +193,7 @@ def test_doc_is_empty(pages): assert PDFDocument(pages).empty assert HTMLDocument(pages).empty + assert MDDocument(pages).empty def test_html_string_is_empty_doc(): diff --git a/tests/web/test_web_file_loader.py b/tests/web/test_web_file_loader.py index cc2f2451..778154cc 100644 --- a/tests/web/test_web_file_loader.py +++ b/tests/web/test_web_file_loader.py @@ -29,6 +29,7 @@ def __init__(self, read_return): self.read_return = read_return self.content_type = "application/pdf" self.charset = "utf-8" + self.headers = {} async def read(self): """Return what class was initialized with.""" @@ -58,7 +59,7 @@ async def patched_get_html(url, *args, **kwargs): @pytest.mark.asyncio async def test_async_file_loader_basic_pdf(monkeypatch): - """Test `AsyncFileLoader` for a basic PDF doc""" + """Test `AsyncWebFileLoader` for a basic PDF doc""" monkeypatch.setattr( aiohttp.ClientSession, @@ -82,7 +83,7 @@ async def test_async_file_loader_basic_pdf(monkeypatch): @pytest.mark.asyncio async def test_async_file_loader_basic_html(monkeypatch): - """Test `AsyncFileLoader` for a basic HTML doc""" + """Test `AsyncWebFileLoader` for a basic HTML doc""" monkeypatch.setattr( aiohttp.ClientSession,
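As a companion to the tests above, a short sketch of the new ``MDDocument`` comment handling (the page text here is made up for illustration):

.. code-block:: python

    from elm.web.document import MDDocument

    pages = ["# Title\n<!-- internal note -->\nBody text goes here"]

    # HTML comments are stripped from the cleaned text by default...
    doc = MDDocument(pages)
    assert "internal note" not in doc.text

    # ...but are preserved when comment removal is disabled
    doc = MDDocument(pages, remove_comments=False)
    assert "internal note" in doc.text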