NatLabRockies · ppinchuk · May 3, 2026 · Apr 27, 2026 · Apr 27, 2026 · Apr 27, 2026
diff --git a/.github/workflows/pytest_ords.yml b/.github/workflows/pytest_ords.yml
@@ -34,6 +34,7 @@ jobs:
       shell: bash
     - name: Install dependencies
       run: |
+        conda install -y pip
         conda install -c conda-forge poppler
         python -m pip install --upgrade pip
         python -m pip install pdftotext

diff --git a/.github/workflows/pytest_postgres.yml b/.github/workflows/pytest_postgres.yml
@@ -26,6 +26,7 @@ jobs:
     - name: Install dependencies
       shell: bash -l {0}
       run: |
+        conda install -y pip
         python -m pip install --upgrade pip
         python -m pip install psycopg2-binary
         python -m pip install boto3

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -141,7 +141,7 @@
 # -- Options for HTMLHelp output ---------------------------------------------
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'rexdoc'
+htmlhelp_basename = 'elmdoc'
 
 # -- Options for LaTeX output ------------------------------------------------
 

diff --git a/docs/source/dev/ords_architecture.rst b/docs/source/dev/ords_architecture.rst
@@ -363,7 +363,7 @@ for multiprocessing tasks.
 
 --------------------------------------------------------------------------------------------------------------------------------------------------
 
-**4.2.2** :class:`~elm.web.file_loader.AsyncFileLoader`
+**4.2.2** :class:`~elm.web.file_loader.AsyncWebFileLoader`
 -------------------------------------------------------
 
 .. literalinclude:: ../../../elm/web/file_loader.py
@@ -376,10 +376,10 @@ for multiprocessing tasks.
 .. code-block:: python
 
     import asyncio
-    from elm.web.file_loader import AsyncFileLoader
+    from elm.web.file_loader import AsyncWebFileLoader
 
     async def main():
-        loader = AsyncFileLoader()
+        loader = AsyncWebFileLoader()
         doc = await loader.fetch(
             url="https://en.wikipedia.org/wiki/National_Renewable_Energy_Laboratory"
         )
@@ -781,7 +781,7 @@ We give a rough breakdown of the following call:
 1. :func:`~elm.web.search.run.web_search_links_as_docs()` is invoked with 3 queries and ``num_urls=4``.
 2. Each of the three queries are processed asynchronously, creating a :class:`~elm.web.google_search.PlaywrightGoogleLinkSearch` instance and retrieving the top URL results.
 3. Internal code reduces the URL lists returned from each of the queries into the top 4 URLs.
-4. :class:`~elm.web.file_loader.AsyncFileLoader` asynchronously downloads the content for reach of the top 4 URLs, determines the document type the content should be stored
+4. :class:`~elm.web.file_loader.AsyncWebFileLoader` asynchronously downloads the content for each of the top 4 URLs, determines the document type the content should be stored
    in (:class:`~elm.web.document.HTMLDocument` or :class:`~elm.web.document.PDFDocument`), creates and populates the document instances, and returns the document to the caller.
 
 **Sequence Diagram:**
@@ -791,7 +791,7 @@ We give a rough breakdown of the following call:
     sequenceDiagram
         participant A as web_search_links_as_docs()
         participant B as PlaywrightGoogleLinkSearch
-        participant D as AsyncFileLoader
+        participant D as AsyncWebFileLoader
         participant E as HTMLDocument
         participant F as PDFDocument
 

diff --git a/elm/ords/download.py b/elm/ords/download.py
@@ -48,9 +48,10 @@ async def download_county_ordinance(
         ordinance document. By default, ``5``.
     file_loader_kwargs : dict, optional
         Dictionary of keyword-argument pairs to initialize
-        :class:`elm.web.file_loader.AsyncFileLoader` with. If found, the
-        "pw_launch_kwargs" key in these will also be used to initialize
-        the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch`
+        :class:`elm.web.file_loader.AsyncWebFileLoader` with. If found,
+        the "pw_launch_kwargs" key in these will also be used to
+        initialize the
+        :class:`elm.web.google_search.PlaywrightGoogleLinkSearch`
         used for the google URL search. By default, ``None``.
     browser_semaphore : :class:`asyncio.Semaphore`, optional
         Semaphore instance that can be used to limit the number of

diff --git a/elm/ords/process.py b/elm/ords/process.py
@@ -425,7 +425,7 @@ async def process_county_with_logging(
         ordinance document. By default, ``5``.
     file_loader_kwargs : dict, optional
         Dictionary of keyword-argument pairs to initialize
-        :class:`elm.web.file_loader.AsyncFileLoader` with. The
+        :class:`elm.web.file_loader.AsyncWebFileLoader` with. The
         "pw_launch_kwargs" key in these will also be used to initialize
         the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch`
         used for the google URL search. By default, ``None``.
@@ -498,7 +498,7 @@ async def process_county(
         ordinance document. By default, ``5``.
     file_loader_kwargs : dict, optional
         Dictionary of keyword-argument pairs to initialize
-        :class:`elm.web.file_loader.AsyncFileLoader` with. The
+        :class:`elm.web.file_loader.AsyncWebFileLoader` with. The
         "pw_launch_kwargs" key in these will also be used to initialize
         the :class:`elm.web.google_search.PlaywrightGoogleLinkSearch`
         used for the google URL search. By default, ``None``.

diff --git a/elm/version.py b/elm/version.py
@@ -2,4 +2,4 @@
 ELM version number
 """
 
-__version__ = "0.0.38"
+__version__ = "0.0.39"
diff --git a/elm/web/document.py b/elm/web/document.py
@@ -38,7 +38,7 @@ class BaseDocument(ABC):
            and formats tables.
         3. Track pages and other document metadata.
     Key Relationships:
-        Created by :class:`~elm.web.file_loader.AsyncFileLoader` and
+        Created by :class:`~elm.web.file_loader.AsyncWebFileLoader` and
         used all over ordinance code.
 
     .. end desc
@@ -339,6 +339,63 @@ def _raw_pages(self):
         return self.text_splitter.split_text("\n\n".join(self.pages))
 
 
+class MDDocument(BaseDocument):
+    """ELM Markdown document"""
+
+    MARKDOWN_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
+    """Regex pattern to remove HTML comments from markdown text"""
+    WRITE_KWARGS = {"mode": "w", "encoding": "utf-8"}
+    FILE_EXTENSION = "md"
+
+    def __init__(self, pages, attrs=None, remove_comments=True,
+                 text_splitter=None):
+        """
+
+        Parameters
+        ----------
+        pages : iterable
+            Iterable of strings, where each string is a page of a
+            document.
+        attrs : dict, optional
+            Optional dict containing metadata for the document.
+            By default, ``None``.
+        remove_comments : bool, optional
+            Option to remove HTML comments in Markdown text during
+            cleaning. By default, ``True``.
+        text_splitter : obj, optional
+            Instance of an object that implements a `split_text` method.
+            The method should take text as input (str) and return a list
+            of text chunks. The raw pages will be passed through this
+            splitter to create raw pages for this document. Langchain's
+            text splitters should work for this input.
+            By default, ``None``, which means the original pages input
+            becomes the raw pages attribute.
+        """
+        super().__init__(pages, attrs=attrs)
+        self.remove_comments = remove_comments
+        self.text_splitter = text_splitter
+
+    def _cleaned_text(self):
+        """Compute cleaned text from document"""
+        text = combine_pages(self.pages)
+        if self.remove_comments:
+            text = self.MARKDOWN_COMMENT_RE.sub("", text)
+        return text
+
+    def _raw_pages(self):
+        """Get raw pages from document"""
+        if self.text_splitter is None:
+            return self.pages
+        return self.text_splitter.split_text("\n\n".join(self.pages))
+
+    @property
+    def empty(self):
+        """bool: ``True`` if no page has content after comment removal."""
+        # Always strip comments when checking if doc is empty
+        return not any(_non_empty_pages((
+            self.MARKDOWN_COMMENT_RE.sub("", p) for p in self.pages)))
+
+
 def _non_empty_pages(pages):
     """Return all pages with more than 10 chars"""
     return filter(